Unverified Commit 97dcdff7 authored by AllentDan's avatar AllentDan Committed by GitHub

fix benchmark serving failing to use the Qwen tokenizer (#443)

* fix benchmark serving failing to use the Qwen tokenizer

* update benchmark readme
parent 22cd7d15
benchmark/README.md
@@ -42,7 +42,21 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+# the tokenizer path ends with .model for most models. Otherwise, please pass model_path/triton_models/tokenizer
python profile_serving.py \
    ${TritonServerAddress} \
    /path/to/tokenizer \
    ShareGPT_V3_unfiltered_cleaned_split.json \
    --concurrency 64
```
+
+## profile restful api
+
+`profile_restful_api.py` is used to benchmark the api server.
+
+```bash
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+# the tokenizer path ends with .model for most models. Otherwise, please pass model_path/triton_models/tokenizer
+python profile_restful_api.py \
+    ${ServerAddress} \
+    /path/to/tokenizer \
+    ShareGPT_V3_unfiltered_cleaned_split.json \
+    --concurrency 64
+```
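For context on what both scripts consume: the downloaded ShareGPT file is a JSON list of multi-turn conversations, and each script's `read_dataset` pairs prompts with reference completions before tokenizing them. The helper below is an illustrative sketch of that idea only; the function name, the first-human/first-gpt pairing rule, and the sampling logic are assumptions, not the scripts' actual code.

```python
import json
import random


def sample_sharegpt(path: str, samples: int):
    """Sketch (assumed logic): pair the first human turn (prompt) with the
    first gpt turn (completion) of each conversation, then sample pairs."""
    with open(path) as f:
        dataset = json.load(f)
    pairs = []
    for record in dataset:
        turns = record.get('conversations', [])
        if len(turns) >= 2 and turns[0]['from'] == 'human':
            pairs.append((turns[0]['value'], turns[1]['value']))
    random.shuffle(pairs)
    return pairs[:samples]
```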
benchmark/profile_restful_api.py

import json
import multiprocessing as mp
-import os
import random
import time
from typing import Iterable, List
@@ -8,8 +7,8 @@ from typing import Iterable, List
import fire
import numpy as np
import requests
-from sentencepiece import SentencePieceProcessor
+from lmdeploy.turbomind.tokenizer import Tokenizer
+from lmdeploy.utils import get_logger
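The import swap above is the heart of the fix: the hand-rolled SentencePiece wrapper deleted below could only load a `.model` vocabulary file, which Qwen does not ship, while lmdeploy's unified `Tokenizer` chooses a backend from the tokenizer path. The class below is a sketch of that dispatch idea under assumed names and is not lmdeploy's actual implementation.

```python
from typing import List


class UnifiedTokenizer:
    """Sketch of path-based dispatch (assumed, simplified): a `.model` file
    means a SentencePiece vocabulary (llama-style models); anything else is
    treated as a Hugging Face tokenizer directory, which covers Qwen."""

    def __init__(self, model_path: str):
        if model_path.endswith('.model'):
            from sentencepiece import SentencePieceProcessor
            self._sp = SentencePieceProcessor(model_file=model_path)
            self._hf = None
        else:
            from transformers import AutoTokenizer
            self._sp = None
            self._hf = AutoTokenizer.from_pretrained(model_path,
                                                     trust_remote_code=True)

    def encode(self, prompt: str) -> List[int]:
        # One string in, token ids out -- matching how the benchmark
        # scripts now call len(tokenizer.encode(prompt)).
        if self._sp is not None:
            return self._sp.Encode(prompt, add_bos=False, add_eos=False)
        return self._hf.encode(prompt, add_special_tokens=False)
```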
@@ -45,20 +44,6 @@ def get_streaming_response(prompt: str,
        yield output, tokens


-class Tokenizer:
-
-    def __init__(self, model_path: str):
-        # reload tokenizer
-        assert os.path.isfile(model_path), model_path
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-
-    def encode(self, prompts: List):
-        prompts_token_ids = self.sp_model.Encode(prompts,
-                                                 add_bos=False,
-                                                 add_eos=False)
-        return [len(token_ids) for token_ids in prompts_token_ids]


def infer(server_addr: str, session_id: int, req_queue: mp.Queue,
          res_que: mp.Queue):
    stats = []
@@ -132,8 +117,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
    start = time.perf_counter()
    tokenizer = Tokenizer(tokenizer_path)
-    prompts_token_lens = tokenizer.encode(prompts)
-    completions_token_lens = tokenizer.encode(completions)
+    prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts]
+    completions_token_lens = [
+        len(tokenizer.encode(prompt)) for prompt in completions
+    ]
    print(f'elapsed time for tokenization: '
          f'{round(time.perf_counter() - start, 2)} s')
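The hunk above (and the matching one in profile_serving.py below) also adapts to a changed `encode` contract: the deleted wrapper took a batch of prompts and returned their lengths directly, while lmdeploy's `Tokenizer.encode` takes one string and returns its token ids, hence the per-prompt list comprehensions. A quick sketch of the difference (the path and the example ids are made up):

```python
from lmdeploy.turbomind.tokenizer import Tokenizer

tokenizer = Tokenizer('/path/to/tokenizer')  # hypothetical path

# New interface: one string in, token ids out, so the length is len(...).
ids = tokenizer.encode('Hello, world!')      # e.g. [9906, 11, 1917, 0]
prompt_len = len(ids)

# The removed wrapper instead took a batch and already returned lengths:
#   old_tokenizer.encode(['Hello, world!', 'ok'])  ->  [4, 1]
```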
benchmark/profile_serving.py
import json
import logging
import multiprocessing as mp
-import os
import random
import time
from typing import List

import fire
import numpy as np
-from sentencepiece import SentencePieceProcessor
from lmdeploy.serve.turbomind.chatbot import Chatbot


-class Tokenizer:
-
-    def __init__(self, model_path: str):
-        # reload tokenizer
-        assert os.path.isfile(model_path), model_path
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-
-    def encode(self, prompts: List):
-        prompts_token_ids = self.sp_model.Encode(prompts,
-                                                 add_bos=False,
-                                                 add_eos=False)
-        return [len(token_ids) for token_ids in prompts_token_ids]
+from lmdeploy.turbomind.tokenizer import Tokenizer


def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
@@ -103,8 +87,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
    start = time.perf_counter()
    tokenizer = Tokenizer(tokenizer_path)
-    prompts_token_lens = tokenizer.encode(prompts)
-    completions_token_lens = tokenizer.encode(completions)
+    prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts]
+    completions_token_lens = [
+        len(tokenizer.encode(prompt)) for prompt in completions
+    ]
    print(f'elapsed time for tokenization: '
          f'{round(time.perf_counter() - start, 2)} s')