Unverified Commit 2067862d authored by q.yao, committed by GitHub

add profile throughput benchmark (#146)



* add profile throughput benchmark

* add output only throughput

* update req/min

* update benchmark readme

* fix lint

---------
Co-authored-by: grimoire <yaoqian@pjlab.org.cn>
parent b728064e
# Benchmark
We provide several profiling tools to benchmark our models.
## Profile with a dataset
Download the ShareGPT dataset below, or create your own dataset in the same format.
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
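If you create your own dataset, note that `profile_throughput.py` only reads the first two turns of each `conversations` entry (the prompt and its completion; see `sample_requests` below). A minimal sketch of the expected layout, where the file name and texts are placeholders:
```python
import json

# Each record needs a 'conversations' list with at least two turns;
# only turn 0 (prompt) and turn 1 (completion) are used by the benchmark.
records = [{
    'conversations': [
        {'from': 'human', 'value': 'What is the capital of France?'},
        {'from': 'gpt', 'value': 'The capital of France is Paris.'},
    ]
}]

with open('my_dataset.json', 'w') as f:
    json.dump(records, f)
```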
Profile your model with `profile_throughput.py`:
```bash
python profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
/path/to/your/model \
${ModelType} \
--concurrency 64
```
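`${ModelType}` has to be one of the model names registered in `lmdeploy.model.MODELS`, since the benchmark instantiates it with `MODELS.get(model_name)()` (see `profile_throughput.py` below). A quick, non-authoritative way to list the registered names, assuming the registry exposes a `module_dict` mapping:
```python
from lmdeploy.model import MODELS

# Assumption: MODELS is a registry whose `module_dict` maps names to classes.
print(sorted(MODELS.module_dict.keys()))
```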
## Profile without a dataset
`profile_generation.py` benchmarks generation with dummy data:
```bash
python profile_generation.py \
/path/to/your/model \
${ModelType} \
--concurrency 8 --input_seqlen 0 --output_seqlen 2048
```
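With `--input_seqlen 0` the prompts are empty, so the run above measures pure generation (decoding) throughput. Assuming EOS is ignored, as `profile_throughput.py` does via `ignore_eos=True`, the total number of generated tokens is simply the product of the other two knobs; a trivial sketch:
```python
concurrency = 8        # parallel sessions, as in the command above
output_seqlen = 2048   # tokens generated per session

# Total tokens produced by the whole run (assuming EOS is ignored).
total_generated = concurrency * output_seqlen
print(total_generated)  # 16384
```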
## Profile serving
The tools above benchmark models through the Python API. `profile_serving.py` instead benchmarks a deployed Triton inference server:
```bash
python profile_serving.py \
${TritonServerAddress} \
${ModelName} \
/path/to/tokenizer \
/path/to/dataset \
--concurrency 64
```
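For a TurboMind-converted model, `/path/to/tokenizer` normally points inside the model workspace, the same layout that `Engine.__init__` in `profile_throughput.py` relies on. A small sketch of deriving it, with the model path as a placeholder:
```python
import os.path as osp

model_path = '/path/to/your/model'  # placeholder: the converted TurboMind workspace
# Same layout that Engine.__init__ below assumes.
tokenizer_path = osp.join(model_path, 'triton_models', 'tokenizer')
print(tokenizer_path)  # -> /path/to/your/model/triton_models/tokenizer
```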
...@@ -136,7 +136,7 @@ def main(model_path: str,
          f'{first_token_latency_ave:.2f}s\ntoken latency(min, max, ave): '
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
-         f'throughput: {throughput} token/s\n{"-" * 50}')
+         f'throughput per threads: {throughput} token/s\n{"-" * 50}')


if __name__ == '__main__':
......
import json
import os.path as osp
import random
import time
from queue import Queue
from threading import Thread
from typing import List, Tuple
import fire
from lmdeploy.model import MODELS
from lmdeploy.turbomind import Tokenizer, TurboMind

def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: Tokenizer,
) -> List[Tuple[str, int, int]]:
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data['conversations']) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data['conversations'][0]['value'],
                data['conversations'][1]['value']) for data in dataset]

    # Tokenize the prompts and completions.
    prompts = [prompt for prompt, _ in dataset]
    prompt_token_ids = tokenizer(prompts).input_ids
    completions = [completion for _, completion in dataset]
    completion_token_ids = tokenizer(completions).input_ids
    tokenized_dataset = []
    for i in range(len(dataset)):
        output_len = len(completion_token_ids[i])
        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))

    # Filter out too long sequences.
    filtered_dataset: List[Tuple[str, int, int]] = []
    for prompt, prompt_token_ids, output_len in tokenized_dataset:
        prompt_len = len(prompt_token_ids)
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))

    # Sample the requests.
    sampled_requests = random.sample(filtered_dataset, num_requests)
    return sampled_requests


class Engine:

    def __init__(self, model_path: str, model_name: str):
        tokenizer_model_path = osp.join(model_path, 'triton_models',
                                        'tokenizer')
        tokenizer = Tokenizer(tokenizer_model_path)
        model = MODELS.get(model_name)()
        stop_words = model.stop_words
        tm_model = TurboMind(model_path=model_path, stop_words=stop_words)

        self.tm_model = tm_model
        self.tokenizer = tokenizer

    def _inference(self, queue, session_id: int):
        model_inst = self.tm_model.create_instance()

        while True:
            request = queue.get()
            if request is None:
                # stop signal
                queue.put(None)
                return
            else:
                prompt, _, output_seqlen = request
                input_ids = self.tokenizer.encode(prompt)

                for outputs in model_inst.stream_infer(
                        session_id,
                        input_ids=input_ids,
                        request_output_len=output_seqlen,
                        temperature=1.0,
                        top_p=1.0,
                        sequence_start=True,
                        sequence_end=True,
                        ignore_eos=True):
                    res, tokens = outputs[0]

                self.tokenizer.decode(res)

    def process_request(self, requests, concurrency: int = 1):
        q = Queue()
        threads = []

        start = time.time()

        # start threads
        for i in range(concurrency):
            t = Thread(target=self._inference, args=(q, i))
            t.start()
            threads.append(t)

        # feed request to q
        for req in requests:
            q.put(req)
        q.put(None)

        # wait for finish
        for t in threads:
            t.join()

        end = time.time()

        return end - start


def main(dataset: str,
         model_path: str,
         model_name: str,
         concurrency: int = 1,
         num_prompts: int = 1000):
    engine = Engine(model_path, model_name)
    tokenizer = engine.tokenizer

    requests = sample_requests(dataset, num_prompts, tokenizer)

    elapsed_time = engine.process_request(requests, concurrency)
    total_num_tokens = sum(prompt_len + output_len
                           for _, prompt_len, output_len in requests)
    total_num_out_tokens = sum(output_len for _, _, output_len in requests)
    print(f'Throughput requests: {len(requests) / elapsed_time:.2f} req/s')
    print(
        f'Throughput requests: {len(requests) * 60 / elapsed_time:.2f} req/min'
    )
    print(f'Throughput tokens: {total_num_tokens / elapsed_time:.2f} tokens/s')
    print('Throughput tokens(output only):'
          f'{total_num_out_tokens / elapsed_time:.2f} tokens/s')


if __name__ == '__main__':
    fire.Fire(main)
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
-from typing import Sequence
+from typing import Sequence, Union

import torch
...@@ -61,6 +61,21 @@ class SentencePieceTokenizer:
        t = t.tolist()
        return self.model.Decode(t)

+    def __call__(self, s: Union[str, Sequence[str]]):
+        """Tokenize prompts.
+        Args:
+            s (str): prompts
+        Returns:
+            list[int]: token ids
+        """
+        import addict
+        add_bos = False
+        add_eos = False
+        input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
+        return addict.Addict(input_ids=input_ids)


class HuggingFaceTokenizer:
    """Tokenizer of sentencepiece.
...@@ -127,6 +142,17 @@ class HuggingFaceTokenizer:
        skip_special_tokens = True
        return self.model.decode(t, skip_special_tokens=skip_special_tokens)

+    def __call__(self, s: Union[str, Sequence[str]]):
+        """Tokenize prompts.
+        Args:
+            s (str): prompts
+        Returns:
+            list[int]: token ids
+        """
+        add_special_tokens = False
+        return self.model(s, add_special_tokens=add_special_tokens)


class Tokenizer:
    """Tokenize prompts or de-tokenize tokens into texts.
...@@ -186,3 +212,13 @@ class Tokenizer:
            str: text of decoding tokens
        """
        return self.model.decode(t)

+    def __call__(self, s: Union[str, Sequence[str]]):
+        """Tokenize prompts.
+        Args:
+            s (str): prompts
+        Returns:
+            list[int]: token ids
+        """
+        return self.model(s)
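These new `__call__` methods are what let `sample_requests` batch-tokenize prompts via `tokenizer(prompts).input_ids`. A minimal usage sketch, assuming a converted model workspace at a placeholder path:
```python
from lmdeploy.turbomind import Tokenizer

# Placeholder path; the tokenizer lives under the TurboMind workspace.
tokenizer = Tokenizer('/path/to/your/model/triton_models/tokenizer')
prompts = ['Hello, world!', 'How are you today?']

# __call__ returns an object exposing `input_ids`: one token-id list per prompt,
# which is exactly how sample_requests() in profile_throughput.py consumes it.
token_ids = tokenizer(prompts).input_ids
print([len(ids) for ids in token_ids])
```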