Profile token generation with more settings #364

Merged: 7 commits, Sep 18, 2023

8 changes: 6 additions & 2 deletions benchmark/README.md
@@ -23,10 +23,14 @@ python profile_throughput.py \

`profile_generation.py` performs benchmarks with dummy data. It monitors GPU memory via `pynvml`, so install `nvidia-ml-py` first:

```shell
pip install nvidia-ml-py
```

```bash
python profile_generation.py \
-    /path/to/your/model \
-    --concurrency 8 --input_seqlen 0 --output_seqlen 2048
+    --model-path /path/to/your/model \
+    --concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
```
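
Each `--concurrency` value is benchmarked against every `(prompt-tokens, completion-tokens)` pair (the two lists are zipped element-wise), and one result row per run is written to the CSV chosen by `--dst-csv` (default: `profile_generation.csv`), as implemented in `main()` of `profile_generation.py` below. The flags above therefore measure four configurations:

```
batch  prompt_tokens  completion_tokens
1      0              2048
1      512            512
8      0              2048
8      512            512
```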

## profile serving
195 changes: 187 additions & 8 deletions benchmark/profile_generation.py
@@ -1,12 +1,22 @@
# Copyright (c) OpenMMLab. All rights reserved.
# import multiprocessing as mp
import argparse
import csv
import logging
import os
import os.path as osp
import time
from dataclasses import dataclass
from queue import Queue
from threading import Thread
from typing import List

import fire
import numpy as np
from pynvml import (NVMLError, nvmlDeviceGetCount, nvmlDeviceGetHandleByIndex,
                    nvmlDeviceGetMemoryInfo, nvmlDeviceGetName,
                    nvmlDeviceGetPowerState, nvmlDeviceGetTemperature,
                    nvmlInit, nvmlShutdown, nvmlSystemGetDriverVersion)
from tqdm import tqdm

from lmdeploy.turbomind import Tokenizer, TurboMind

@@ -77,12 +87,12 @@ def _infer(model, session_id):
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


-def main(model_path: str,
-         concurrency: int = 1,
-         input_seqlen: int = 0,
-         output_seqlen: int = 512,
-         test_round: int = 10,
-         tp: int = 1):
+def profile_throughput(model_path: str,
+                       concurrency: int = 1,
+                       input_seqlen: int = 0,
+                       output_seqlen: int = 512,
+                       test_round: int = 10,
+                       tp: int = 1):
    tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
    tokenizer = Tokenizer(tokenizer_model_path)
    tm_model = TurboMind(model_path=model_path, tp=tp)
@@ -141,7 +151,176 @@ def main(model_path: str,
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
          f'throughput: {throughput:.2f} token/s\n{"-" * 50}')
    return tm_model.model_name, throughput, tm_model.gpu_count


class MemoryMonitor:
    from multiprocessing import Manager
    max_mem = Manager().Value('f', 0)  # GB
    device_count = Manager().Value('f', 0)

    @staticmethod
    def nvidia_info():
        # pip install nvidia-ml-py
        nvidia_dict = {
            'state': True,
            'nvidia_version': '',
            'nvidia_count': 0,
            'gpus': []
        }
        try:
            nvmlInit()
            nvidia_dict['nvidia_version'] = nvmlSystemGetDriverVersion()
            nvidia_dict['nvidia_count'] = nvmlDeviceGetCount()
            for i in range(nvidia_dict['nvidia_count']):
                handle = nvmlDeviceGetHandleByIndex(i)
                memory_info = nvmlDeviceGetMemoryInfo(handle)
                gpu = {
                    'gpu_name': nvmlDeviceGetName(handle),
                    'total': memory_info.total,
                    'free': memory_info.free,
                    'used': memory_info.used,
                    'temperature': f'{nvmlDeviceGetTemperature(handle, 0)}℃',
                    'powerStatus': nvmlDeviceGetPowerState(handle)
                }
                nvidia_dict['gpus'].append(gpu)
        except NVMLError as _:  # noqa
            nvidia_dict['state'] = False
        except Exception as _:  # noqa
            nvidia_dict['state'] = False
        finally:
            try:
                nvmlShutdown()
            except:  # noqa
                pass
        return nvidia_dict

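    # mem_monitor (below) polls nvidia_info() in a loop, sums the 'used'
    # memory across all GPUs, and publishes the peak increase over the
    # baseline, in GB, through the shared max_mem value.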
    @classmethod
    def mem_monitor(cls):
        info = cls.nvidia_info()
        max_mem = 0
        mem_start = 0
        cls.device_count.value = len(info['gpus'])
        for used_total in info['gpus']:
            mem_start += used_total['used']
        while True:
            info = cls.nvidia_info()
            used = 0
            for used_total in info['gpus']:
                used += used_total['used']
            if used > max_mem:
                max_mem = used
            cls.max_mem.value = (max_mem - mem_start) / (1 << 30)

    @classmethod
    def start(cls):
        cls._running = True
        from multiprocessing import Process
        cls.proc = Process(target=cls.mem_monitor)
        cls.proc.start()

    @classmethod
    def terminate(cls) -> float:
        """Terminate the subprocess and return maximum memory."""
        cls.proc.kill()
        return cls.max_mem.value
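
# Minimal usage sketch for MemoryMonitor (illustrative only; `run_workload`
# is a hypothetical stand-in for the code being measured):
#
#     MemoryMonitor.start()
#     run_workload()
#     peak_gb = MemoryMonitor.terminate()  # peak GPU memory growth, in GB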


@dataclass
class ProfileResult:
    model_name: str
    batch: int
    prompt_tokens: int
    completion_tokens: int
    throughput_per_proc: float
    throughput_per_node: float
    mem_per_proc: float
    mem_per_gpu: float
    mem_per_node: float


def parse_args():
    parser = argparse.ArgumentParser(description='Regression Test')
    parser.add_argument('--model-path',
                        type=str,
                        help='benchmark test model path')
    parser.add_argument('--concurrency',
                        nargs='+',
                        type=int,
                        help='how many requests are launched concurrently',
                        default=[1, 8, 16, 32])
    parser.add_argument(
        '--prompt-tokens',
        nargs='+',
        type=int,
        help='how many prompt tokens to use, in one-to-one '
        'correspondence with completion-tokens',
        default=[64, 512, 512, 1024])
    parser.add_argument('--completion-tokens',
                        nargs='+',
                        type=int,
                        help='how many tokens to be generated, in one-to-one '
                        'correspondence with prompt-tokens',
                        default=[512, 512, 1024, 1024])
    parser.add_argument('--tp', type=int, help='Tensor parallel', default=1)
    parser.add_argument('--dst-csv',
                        type=str,
                        help='Where to save the result.',
                        default='profile_generation.csv')
    parser.add_argument('--log-level',
                        help='set log level',
                        default='INFO',
                        choices=list(logging._nameToLevel.keys()))
    args = parser.parse_args()
    return args


def main():
    args = parse_args()
    os.environ['TM_LOG_LEVEL'] = args.log_level
    results: List[ProfileResult] = []
    for batch in tqdm(args.concurrency):
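        # prompt-tokens and completion-tokens are paired element-wise via
        # zip(); extra entries in the longer list are silently ignored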
        for prompt_tokens, completion_tokens in tqdm(
                zip(args.prompt_tokens, args.completion_tokens)):
            MemoryMonitor.start()
            from functools import partial
            from multiprocessing import Pool
            profile_target = partial(profile_throughput,
                                     concurrency=batch,
                                     input_seqlen=prompt_tokens,
                                     output_seqlen=completion_tokens,
                                     tp=args.tp)
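            # Run each configuration in a fresh single-worker Pool so the
            # worker's GPU memory is released before the next measurement.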
            output = Pool(1).map(profile_target, (args.model_path, ))
            model_name, throughput_per_proc, tp = output[0]
            time.sleep(5)  # wait a while for releasing GPU mem
            memory = MemoryMonitor.terminate()
            device_count = MemoryMonitor.device_count.value
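            # Extrapolate per-node numbers: a node can host device_count / tp
            # such processes, each contributing throughput_per_proc.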
            results.append(
                ProfileResult(model_name=model_name,
                              batch=batch,
                              prompt_tokens=prompt_tokens,
                              completion_tokens=completion_tokens,
                              throughput_per_proc=throughput_per_proc,
                              throughput_per_node=throughput_per_proc / tp *
                              device_count,
                              mem_per_proc=memory,
                              mem_per_gpu=memory / tp,
                              mem_per_node=memory / tp * device_count))
    with open(args.dst_csv, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow([
            'batch', 'prompt_tokens', 'completion_tokens',
            'throughput_per_proc(token/s)', 'throughput_per_node(token/s)',
            'mem_per_proc(GB)', 'mem_per_gpu(GB)', 'mem_per_node(GB)'
        ])
        for re in results:
            writer.writerow([
                re.batch, re.prompt_tokens, re.completion_tokens,
                f'{re.throughput_per_proc:.2f}',
                f'{re.throughput_per_node:.2f}', f'{re.mem_per_proc:.2f}',
                f'{re.mem_per_gpu:.2f}', f'{re.mem_per_node:.2f}'
            ])


if __name__ == '__main__':
-    fire.Fire(main)
+    main()