Unverified commit 2ba90822, authored by Lyu Han and committed by GitHub

Fix missing arguments when benchmarking static inference performance (#787)

* minor fix in the profile scripts and docs

* missing arguments

* typo

* fix lint

* update
parent 12dc3e14
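In short: the per-session worker in profile_generation.py used to drop the sampling parameters, so every request ran with `stream_infer`'s built-in defaults regardless of what was passed on the command line. Below is a condensed sketch of the corrected flow, not the repository code verbatim; the progress bar, timing and per-token latency bookkeeping are omitted.

```python
from queue import Queue
from typing import List


def infer(model, session_id: int, input_ids: List[int], output_seqlen: int,
          top_k: int, top_p: float, temperature: float, test_round: int,
          que: Queue):
    chatbot = model.create_instance()
    stats = []
    for _ in range(test_round):
        n_token = 0
        for outputs in chatbot.stream_infer(session_id,
                                            input_ids,
                                            request_output_len=output_seqlen,
                                            sequence_start=True,
                                            sequence_end=True,
                                            ignore_eos=True,
                                            stream_output=True,
                                            # the previously missing arguments
                                            top_k=top_k,
                                            top_p=top_p,
                                            temperature=temperature):
            _, n_token = outputs[0]
        stats.append(n_token)
    que.put((session_id, stats))
```

The warmup pass receives the same three keyword arguments, pinned to `top_k=1`, `top_p=1.0`, `temperature=1.0`.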
@@ -20,7 +20,10 @@ from lmdeploy.turbomind import TurboMind
def infer(model, session_id: int, input_ids: List, output_seqlen: int,
test_round: int, que: Queue):
top_k: int, top_p: float, temperature: float, test_round: int,
que: Queue):
if session_id == 1:
pbar = tqdm(total=test_round)
chatbot = model.create_instance()
stats = []
for _ in range(test_round):
@@ -45,13 +48,18 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
sequence_start=True,
sequence_end=True,
ignore_eos=True,
stream_output=True):
stream_output=True,
top_k=top_k,
top_p=top_p,
temperature=temperature):
_, n_token = outputs[0]
now = time.perf_counter()
if n_pre_token != n_token:
token_latency_stats[n_pre_token] = np.round(now - prev, 3)
n_pre_token = n_token
prev = now
if session_id == 1:
pbar.update(1)
assert output_seqlen <= n_token <= output_seqlen + 1, \
f'Error. session_id({session_id}) request {output_seqlen} ' \
@@ -60,11 +68,11 @@ def infer(model, session_id: int, input_ids: List, output_seqlen: int,
que.put((session_id, stats))
def warmup(model,
concurrency: int,
input_ids: List[int],
output_seqlen: int,
warmup_round: int = 2):
def warmup(model, concurrency: int, input_ids: List[int], output_seqlen: int,
warmup_round: int):
if not warmup_round:
return
print('start to warmup ...')
def _infer(model, session_id):
@@ -75,7 +83,10 @@ def warmup(model,
request_output_len=output_seqlen,
sequence_start=True,
sequence_end=True,
ignore_eos=True):
ignore_eos=True,
top_k=1,
top_p=1.0,
temperature=1.0):
continue
_start = time.perf_counter()
@@ -85,38 +96,33 @@ def warmup(model,
procs.append(proc)
proc.start()
try:
for proc in procs:
proc.join()
except Exception:
for proc in procs:
proc.stop()
exit(1)
_end = time.perf_counter()
print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')
def profile_throughput(model_path: str,
concurrency: int = 1,
input_seqlen: int = 1,
output_seqlen: int = 512,
test_round: int = 10,
tp: int = 1,
def profile_throughput(model_path: str, concurrency: int, input_seqlen: int,
output_seqlen: int, tp: int, top_k: int, top_p: float,
temperature: float, test_round: int, warmup_round: int,
**kwargs):
# avoid turbomind checking chat template name by setting
# `model_name='llama'`
print(f'profiling ... concurrency: {concurrency}, '
f'n_prompt_token: {input_seqlen}, '
f'n_completion_token: {output_seqlen}, '
f'test_round: {test_round}, warmup_round: {warmup_round}')
# avoid turbomind checking chat template name by setting `model_name='llama'` # noqa
tm_model = TurboMind(model_path=model_path,
tp=tp,
model_name='llama',
**kwargs)
tokenizer = tm_model.tokenizer
# make up a prompt that can be tokenized into {input_seqlen} tokens
# make up a dummy `input_ids` with the length of `input_seqlen` exactly
assert input_seqlen > 0, 'input_seqlen should > 0'
input_ids = tokenizer('hi').input_ids
input_ids = input_ids * input_seqlen
warmup(tm_model, concurrency, input_ids, output_seqlen)
input_ids = np.random.randint(low=0, high=101, size=input_seqlen).tolist()
warmup(tm_model, concurrency, input_ids, output_seqlen, warmup_round)
que = Queue()
procs = []
@@ -124,18 +130,14 @@ def profile_throughput(model_path: str,
for i in range(concurrency):
proc = Thread(target=infer,
args=(tm_model, i + 1, input_ids, output_seqlen,
test_round, que))
args=(tm_model, i + 1, input_ids, output_seqlen, top_k,
top_p, temperature, test_round, que))
procs.append(proc)
proc.start()
try:
for proc in procs:
proc.join()
except Exception:
for proc in procs:
proc.stop()
exit(1)
_end = time.perf_counter()
elapsed_time = _end - _start
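A second behavioural change in the hunks above: the dummy prompt is no longer built by repeating `tokenizer('hi').input_ids`, whose length depends on how the tokenizer splits `'hi'` and on whether it prepends a BOS token, but by sampling exactly `input_seqlen` random token ids. A small illustration, assuming a Hugging Face style tokenizer for the commented-out old path:

```python
import numpy as np

input_seqlen = 128

# Old approach: repeat the ids of a short prompt. If `tokenizer('hi')`
# yields more than one id (e.g. a BOS token plus 'hi'), the prompt ends up
# being a multiple of input_seqlen rather than input_seqlen itself.
#   input_ids = tokenizer('hi').input_ids * input_seqlen

# New approach: draw input_seqlen random token ids, so the prompt length is
# exact and independent of the tokenizer or chat template.
input_ids = np.random.randint(low=0, high=101, size=input_seqlen).tolist()
assert len(input_ids) == input_seqlen
```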
@@ -323,7 +325,11 @@ def parse_args():
parser.add_argument('--test-round',
type=int,
help='number of test rounds',
default=10)
default=6)
parser.add_argument('--warmup-round',
type=int,
help='number of warmup rounds',
default=1)
args = parser.parse_args()
return args
@@ -336,9 +342,9 @@ def main():
os.environ['TM_LOG_LEVEL'] = args.log_level
results: List[ProfileResult] = []
for batch in tqdm(args.concurrency):
for prompt_tokens, completion_tokens in tqdm(
zip(args.prompt_tokens, args.completion_tokens)):
for batch in args.concurrency:
for prompt_tokens, completion_tokens in zip(args.prompt_tokens,
args.completion_tokens):
MemoryMonitor.start()
from functools import partial
from multiprocessing import Pool
@@ -350,7 +356,8 @@ def main():
top_k=args.top_k,
top_p=args.top_p,
temperature=args.temperature,
test_round=args.test_round)
test_round=args.test_round,
warmup_round=args.warmup_round)
output = Pool(1).map(profile_target, (args.model_path, ))
model_name, first_token_latency, percentiles, \
throughput_per_proc, tp = output[0]
@@ -370,14 +377,15 @@ def main():
mem_per_proc=memory,
mem_per_gpu=memory / tp,
mem_per_node=memory / tp * device_count))
if args.csv:
with open(args.csv, 'w') as csvfile:
writer = csv.writer(csvfile)
writer.writerow([
'batch', 'prompt_tokens', 'completion_tokens',
'1st_token_latency(min)(s)', '1st_token_latency(max)(s)',
'1st_token_latency(ave)(s)', 'percentile50(s)', 'percentile75(s)',
'percentile95(s)', 'percentile99(s)', 'throughput(token/s)',
'mem_per_proc(GB)', 'mem_per_gpu(GB)'
'1st_token_latency(ave)(s)', 'percentile50(s)',
'percentile75(s)', 'percentile95(s)', 'percentile99(s)',
'throughput(token/s)', 'mem_per_proc(GB)', 'mem_per_gpu(GB)'
])
for re in results:
writer.writerow([
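When `args.csv` is given, the collected results are written with the column names shown in the header row above. A tiny example of reading that file back; `'profile.csv'` is only a placeholder for whatever path was passed as `args.csv`:

```python
import csv

# 'profile.csv' is a placeholder; use the path passed via `args.csv`.
with open('profile.csv') as f:
    for row in csv.DictReader(f):
        print(row['batch'], row['prompt_tokens'], row['completion_tokens'],
              row['1st_token_latency(ave)(s)'], row['throughput(token/s)'])
```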
@@ -77,11 +77,13 @@ class Engine:
stats = []
for prompt, input_seqlen, output_seqlen in iter(
req_queue.get, [None, None, None]):
input_ids = self.tokenizer(prompt).input_ids
offset = 0
timestamps = []
tokens = []
timestamps.append(time.perf_counter())
input_ids = self.tokenizer(prompt).input_ids
for outputs in model_inst.stream_infer(
session_id,
input_ids=input_ids,
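In the Engine request loop above (the request-throughput benchmark), the visible change is that `input_ids = self.tokenizer(prompt).input_ids` moves relative to the first `timestamps.append(time.perf_counter())`. That position decides whether tokenization time is charged to the measured first-token latency. A standalone illustration of the trade-off; `first_token_latency` and `generate` are hypothetical helpers, not the Engine code:

```python
import time


def first_token_latency(tokenizer, generate, prompt, include_tokenization):
    """Seconds from the chosen reference point to the first streamed chunk.

    `generate(input_ids)` stands in for `model_inst.stream_infer(...)` and is
    expected to yield output incrementally.
    """
    if include_tokenization:
        start = time.perf_counter()               # reference taken first ...
        input_ids = tokenizer(prompt).input_ids   # ... tokenization is counted
    else:
        input_ids = tokenizer(prompt).input_ids   # tokenize up front ...
        start = time.perf_counter()               # ... only generation counted
    for _ in generate(input_ids):
        return time.perf_counter() - start
    return None
```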
@@ -51,7 +51,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# get internlm-7b from huggingface and convert it to turbomind format
lmdeploy convert internlm internlm/internlm-7b --dst-path ./internlm-7b
python3 profile_throughput.py ./internlm-7b ./ShareGPT_V3_unfiltered_cleaned_split.json
python3 profile_throughput.py ./ShareGPT_V3_unfiltered_cleaned_split.json ./internlm-7b
```
## Command details
@@ -51,7 +51,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
lmdeploy convert internlm internlm/internlm-7b --dst-path ./internlm-7b
# run the throughput benchmark script
python3 profile_throughput.py ./internlm-7b ./ShareGPT_V3_unfiltered_cleaned_split.json
python3 profile_throughput.py ./ShareGPT_V3_unfiltered_cleaned_split.json ./internlm-7b
```
## Test method