Unverified Commit 2ba90822 authored by Lyu Han, committed by GitHub

Fix missed arguments when benchmark static inference performance (#787)

* minor fix in the profile scripts and docs

* miss arguments

* typo

* fix lint

* update
parent 12dc3e14
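
Editor's note: the patched `profile_throughput()` entry point now takes the sampling and warmup arguments explicitly instead of relying on defaults. Below is a minimal sketch of a direct call with the new signature; the module name `profile_generation`, the model path, and the concurrency/sequence-length values are assumptions for illustration and are not part of this commit, while `top_k`/`top_p`/`temperature`, `test_round=6`, and `warmup_round=1` mirror values visible in the diff.

```python
# Illustrative sketch only -- module name and model path are assumed, not
# taken from the commit; the return tuple matches how main() unpacks it.
from profile_generation import profile_throughput

model_name, first_token_latency, percentiles, throughput_per_proc, tp = \
    profile_throughput(
        model_path='./internlm-7b',  # assumed: a converted turbomind model dir
        concurrency=1,               # illustrative values
        input_seqlen=64,
        output_seqlen=512,
        tp=1,
        top_k=1,                     # sampling arguments the PR threads through
        top_p=1.0,
        temperature=1.0,
        test_round=6,                # new default of --test-round
        warmup_round=1)              # new --warmup-round argument
```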
@@ -20,7 +20,10 @@ from lmdeploy.turbomind import TurboMind
 def infer(model, session_id: int, input_ids: List, output_seqlen: int,
-          test_round: int, que: Queue):
+          top_k: int, top_p: float, temperature: float, test_round: int,
+          que: Queue):
+    if session_id == 1:
+        pbar = tqdm(total=test_round)
     chatbot = model.create_instance()
     stats = []
     for _ in range(test_round):
@@ -45,13 +48,18 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
                 sequence_start=True,
                 sequence_end=True,
                 ignore_eos=True,
-                stream_output=True):
+                stream_output=True,
+                top_k=top_k,
+                top_p=top_p,
+                temperature=temperature):
             _, n_token = outputs[0]
             now = time.perf_counter()
             if n_pre_token != n_token:
                 token_latency_stats[n_pre_token] = np.round(now - prev, 3)
                 n_pre_token = n_token
             prev = now
+        if session_id == 1:
+            pbar.update(1)
         assert output_seqlen <= n_token <= output_seqlen + 1, \
             f'Error. session_id({session_id}) request {output_seqlen} ' \
@@ -60,11 +68,11 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
     que.put((session_id, stats))


-def warmup(model,
-           concurrency: int,
-           input_ids: List[int],
-           output_seqlen: int,
-           warmup_round: int = 2):
+def warmup(model, concurrency: int, input_ids: List[int], output_seqlen: int,
+           warmup_round: int):
+    if not warmup_round:
+        return
+
     print('start to warmup ...')

     def _infer(model, session_id):
@@ -75,7 +83,10 @@ def warmup(model,
                 request_output_len=output_seqlen,
                 sequence_start=True,
                 sequence_end=True,
-                ignore_eos=True):
+                ignore_eos=True,
+                top_k=1,
+                top_p=1.0,
+                temperature=1.0):
                     continue

     _start = time.perf_counter()
@@ -85,38 +96,33 @@ def warmup(model,
         procs.append(proc)
         proc.start()

-    try:
-        for proc in procs:
-            proc.join()
-    except Exception:
-        for proc in procs:
-            proc.stop()
-        exit(1)
+    for proc in procs:
+        proc.join()
+
     _end = time.perf_counter()
     print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


-def profile_throughput(model_path: str,
-                       concurrency: int = 1,
-                       input_seqlen: int = 1,
-                       output_seqlen: int = 512,
-                       test_round: int = 10,
-                       tp: int = 1,
-                       **kwargs):
-    # avoid turbomind checking chat template name by setting
-    # `model_name='llama'`
+def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
+                       output_seqlen: int, tp: int, top_k: int, top_p: float,
+                       temperature: float, test_round: int, warmup_round: int,
+                       **kwargs):
+
+    print(f'profiling ... concurrency: {concurrency}, '
+          f'n_prompt_token: {input_seqlen}, '
+          f'n_completion_token: {output_seqlen}, '
+          f'test_round: {test_round}, warmup_round: {warmup_round}')
+
+    # avoid turbomind checking chat template name by setting `model_name='llama'` # noqa
     tm_model = TurboMind(model_path=model_path,
                          tp=tp,
                          model_name='llama',
                          **kwargs)
-    tokenizer = tm_model.tokenizer

-    # make up a prompt that can be tokenized into {input_seqlen} tokens
+    # make up a dummy `input_ids` with the length of `input_seqlen` exactly
     assert input_seqlen > 0, 'input_seqlen should > 0'
-    input_ids = tokenizer('hi').input_ids
-    input_ids = input_ids * input_seqlen
-
-    warmup(tm_model, concurrency, input_ids, output_seqlen)
+    input_ids = np.random.randint(low=0, high=101, size=input_seqlen).tolist()
+    warmup(tm_model, concurrency, input_ids, output_seqlen, warmup_round)

     que = Queue()
     procs = []
@@ -124,18 +130,14 @@ def profile_throughput(model_path: str,
     for i in range(concurrency):
         proc = Thread(target=infer,
-                      args=(tm_model, i + 1, input_ids, output_seqlen,
-                            test_round, que))
+                      args=(tm_model, i + 1, input_ids, output_seqlen, top_k,
+                            top_p, temperature, test_round, que))
         procs.append(proc)
         proc.start()

-    try:
-        for proc in procs:
-            proc.join()
-    except Exception:
-        for proc in procs:
-            proc.stop()
-        exit(1)
+    for proc in procs:
+        proc.join()

     _end = time.perf_counter()
     elapsed_time = _end - _start
@@ -323,7 +325,11 @@ def parse_args():
     parser.add_argument('--test-round',
                         type=int,
                         help='number of test rounds',
-                        default=10)
+                        default=6)
+    parser.add_argument('--warmup-round',
+                        type=int,
+                        help='number of warmup rounds',
+                        default=1)
     args = parser.parse_args()
     return args
@@ -336,9 +342,9 @@ def main():
     os.environ['TM_LOG_LEVEL'] = args.log_level
     results: List[ProfileResult] = []
-    for batch in tqdm(args.concurrency):
-        for prompt_tokens, completion_tokens in tqdm(
-                zip(args.prompt_tokens, args.completion_tokens)):
+    for batch in args.concurrency:
+        for prompt_tokens, completion_tokens in zip(args.prompt_tokens,
+                                                    args.completion_tokens):
             MemoryMonitor.start()
             from functools import partial
             from multiprocessing import Pool
@@ -350,7 +356,8 @@ def main():
                                      top_k=args.top_k,
                                      top_p=args.top_p,
                                      temperature=args.temperature,
-                                     test_round=args.test_round)
+                                     test_round=args.test_round,
+                                     warmup_round=args.warmup_round)
             output = Pool(1).map(profile_target, (args.model_path, ))
             model_name, first_token_latency, percentiles, \
                 throughput_per_proc, tp = output[0]
@@ -370,24 +377,25 @@ def main():
                               mem_per_proc=memory,
                               mem_per_gpu=memory / tp,
                               mem_per_node=memory / tp * device_count))
-    with open(args.csv, 'w') as csvfile:
-        writer = csv.writer(csvfile)
-        writer.writerow([
-            'batch', 'prompt_tokens', 'completion_tokens',
-            '1st_token_latency(min)(s)', '1st_token_latency(max)(s)',
-            '1st_token_latency(ave)(s)', 'percentile50(s)', 'percentile75(s)',
-            'percentile95(s)', 'percentile99(s)', 'throughput(token/s)',
-            'mem_per_proc(GB)', 'mem_per_gpu(GB)'
-        ])
-        for re in results:
-            writer.writerow([
-                re.batch, re.prompt_tokens, re.completion_tokens,
-                re.first_token_latency[0], re.first_token_latency[1],
-                re.first_token_latency[2], re.percentiles[0],
-                re.percentiles[1], re.percentiles[2], re.percentiles[3],
-                f'{re.throughput_per_proc:.2f}', f'{re.mem_per_proc:.2f}',
-                f'{re.mem_per_gpu:.2f}'
-            ])
+    if args.csv:
+        with open(args.csv, 'w') as csvfile:
+            writer = csv.writer(csvfile)
+            writer.writerow([
+                'batch', 'prompt_tokens', 'completion_tokens',
+                '1st_token_latency(min)(s)', '1st_token_latency(max)(s)',
+                '1st_token_latency(ave)(s)', 'percentile50(s)',
+                'percentile75(s)', 'percentile95(s)', 'percentile99(s)',
+                'throughput(token/s)', 'mem_per_proc(GB)', 'mem_per_gpu(GB)'
+            ])
+            for re in results:
+                writer.writerow([
+                    re.batch, re.prompt_tokens, re.completion_tokens,
+                    re.first_token_latency[0], re.first_token_latency[1],
+                    re.first_token_latency[2], re.percentiles[0],
+                    re.percentiles[1], re.percentiles[2], re.percentiles[3],
+                    f'{re.throughput_per_proc:.2f}', f'{re.mem_per_proc:.2f}',
+                    f'{re.mem_per_gpu:.2f}'
+                ])


 if __name__ == '__main__':
......
@@ -77,11 +77,13 @@ class Engine:
         stats = []
         for prompt, input_seqlen, output_seqlen in iter(
                 req_queue.get, [None, None, None]):
-            input_ids = self.tokenizer(prompt).input_ids
             offset = 0
             timestamps = []
             tokens = []
             timestamps.append(time.perf_counter())
+
+            input_ids = self.tokenizer(prompt).input_ids
             for outputs in model_inst.stream_infer(
                     session_id,
                     input_ids=input_ids,
......
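Editor's note: the hunk above only moves the `self.tokenizer(prompt)` call relative to the first recorded `time.perf_counter()` timestamp, which determines whether tokenization time is counted in the reported first-token latency. A small, self-contained sketch of that effect follows; the helper functions and sleep durations are invented purely for illustration and are not code from the repository.

```python
# Self-contained illustration only; not code from the repository.
import time


def fake_tokenize(prompt: str) -> list:
    time.sleep(0.010)  # pretend tokenization costs ~10 ms
    return list(prompt.encode())


def fake_first_token() -> int:
    time.sleep(0.050)  # pretend the engine needs ~50 ms for the first token
    return 0


# Tokenize before the first timestamp: ~0.05 s measured (tokenization excluded).
input_ids = fake_tokenize('hi there')
start = time.perf_counter()
fake_first_token()
print(f'excluding tokenization: {time.perf_counter() - start:.3f}s')

# Tokenize after the first timestamp: ~0.06 s measured (tokenization included).
start = time.perf_counter()
input_ids = fake_tokenize('hi there')
fake_first_token()
print(f'including tokenization: {time.perf_counter() - start:.3f}s')
```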
@@ -51,7 +51,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 # get internlm-7b from huggingface and convert it to turbomind format
 lmdeploy convert internlm internlm/internlm-7b --dst-path ./internlm-7b

-python3 profile_throughput.py ./internlm-7b ./ShareGPT_V3_unfiltered_cleaned_split.json
+python3 profile_throughput.py ./ShareGPT_V3_unfiltered_cleaned_split.json ./internlm-7b
 ```

 ## Command details
......
@@ -51,7 +51,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 lmdeploy convert internlm internlm/internlm-7b --dst-path ./internlm-7b

 # 执行测速脚本
-python3 profile_throughput.py ./internlm-7b ./ShareGPT_V3_unfiltered_cleaned_split.json
+python3 profile_throughput.py ./ShareGPT_V3_unfiltered_cleaned_split.json ./internlm-7b
 ```

 ## 测试方法
......