Unverified Commit 97dcdff7 authored by AllentDan's avatar AllentDan Committed by GitHub

fix benchmark serving failing to use the Qwen tokenizer (#443)

* fix benchmark serving failing to use the Qwen tokenizer

* update benchmark readme
parent 22cd7d15
benchmark/README.md
@@ -42,7 +42,21 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+# the tokenizer path ends with .model for most models. Otherwise, please pass model_path/triton_models/tokenizer
python profile_serving.py \
    ${TritonServerAddress} \
    /path/to/tokenizer \
    ShareGPT_V3_unfiltered_cleaned_split.json \
    --concurrency 64
```
+
+## profile restful api
+
+`profile_restful_api.py` is used to benchmark the api server.
+
+```bash
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+# the tokenizer path ends with .model for most models. Otherwise, please pass model_path/triton_models/tokenizer
+python profile_restful_api.py \
+    ${ServerAddress} \
+    /path/to/tokenizer \
+    ShareGPT_V3_unfiltered_cleaned_split.json \
+    --concurrency 64
+```
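For context on what both scripts consume: the downloaded ShareGPT file is a JSON list of multi-turn conversations, and each script's `read_dataset` pairs prompts with reference completions before tokenizing them. The helper below is an illustrative sketch of that idea only; the function name, the first-human/first-gpt pairing rule, and the sampling logic are assumptions, not the scripts' actual code.

```python
import json
import random


def sample_sharegpt(path: str, samples: int):
    """Sketch (assumed logic): pair the first human turn (prompt) with the
    first gpt turn (completion) of each conversation, then sample pairs."""
    with open(path) as f:
        dataset = json.load(f)
    pairs = []
    for record in dataset:
        turns = record.get('conversations', [])
        if len(turns) >= 2 and turns[0]['from'] == 'human':
            pairs.append((turns[0]['value'], turns[1]['value']))
    random.shuffle(pairs)
    return pairs[:samples]
```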
benchmark/profile_restful_api.py

import json
import multiprocessing as mp
-import os
import random
import time
from typing import Iterable, List
@@ -8,8 +7,8 @@ from typing import Iterable, List
import fire
import numpy as np
import requests
-from sentencepiece import SentencePieceProcessor
+from lmdeploy.turbomind.tokenizer import Tokenizer
+from lmdeploy.utils import get_logger
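The import swap above is the heart of the fix: the hand-rolled SentencePiece wrapper deleted below could only load a `.model` vocabulary file, which Qwen does not ship, while lmdeploy's unified `Tokenizer` chooses a backend from the tokenizer path. The class below is a sketch of that dispatch idea under assumed names and is not lmdeploy's actual implementation.

```python
from typing import List


class UnifiedTokenizer:
    """Sketch of path-based dispatch (assumed, simplified): a `.model` file
    means a SentencePiece vocabulary (llama-style models); anything else is
    treated as a Hugging Face tokenizer directory, which covers Qwen."""

    def __init__(self, model_path: str):
        if model_path.endswith('.model'):
            from sentencepiece import SentencePieceProcessor
            self._sp = SentencePieceProcessor(model_file=model_path)
            self._hf = None
        else:
            from transformers import AutoTokenizer
            self._sp = None
            self._hf = AutoTokenizer.from_pretrained(model_path,
                                                     trust_remote_code=True)

    def encode(self, prompt: str) -> List[int]:
        # One string in, token ids out -- matching how the benchmark
        # scripts now call len(tokenizer.encode(prompt)).
        if self._sp is not None:
            return self._sp.Encode(prompt, add_bos=False, add_eos=False)
        return self._hf.encode(prompt, add_special_tokens=False)
```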
@@ -45,20 +44,6 @@ def get_streaming_response(prompt: str,
        yield output, tokens


-class Tokenizer:
-
-    def __init__(self, model_path: str):
-        # reload tokenizer
-        assert os.path.isfile(model_path), model_path
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-
-    def encode(self, prompts: List):
-        prompts_token_ids = self.sp_model.Encode(prompts,
-                                                 add_bos=False,
-                                                 add_eos=False)
-        return [len(token_ids) for token_ids in prompts_token_ids]


def infer(server_addr: str, session_id: int, req_queue: mp.Queue,
          res_que: mp.Queue):
    stats = []
@@ -132,8 +117,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
    start = time.perf_counter()
    tokenizer = Tokenizer(tokenizer_path)
-    prompts_token_lens = tokenizer.encode(prompts)
-    completions_token_lens = tokenizer.encode(completions)
+    prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts]
+    completions_token_lens = [
+        len(tokenizer.encode(prompt)) for prompt in completions
+    ]
    print(f'elapsed time for tokenization: '
          f'{round(time.perf_counter() - start, 2)} s')
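The hunk above (and the matching one in profile_serving.py below) also adapts to a changed `encode` contract: the deleted wrapper took a batch of prompts and returned their lengths directly, while lmdeploy's `Tokenizer.encode` takes one string and returns its token ids, hence the per-prompt list comprehensions. A quick sketch of the difference (the path and the example ids are made up):

```python
from lmdeploy.turbomind.tokenizer import Tokenizer

tokenizer = Tokenizer('/path/to/tokenizer')  # hypothetical path

# New interface: one string in, token ids out, so the length is len(...).
ids = tokenizer.encode('Hello, world!')      # e.g. [9906, 11, 1917, 0]
prompt_len = len(ids)

# The removed wrapper instead took a batch and already returned lengths:
#   old_tokenizer.encode(['Hello, world!', 'ok'])  ->  [4, 1]
```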
benchmark/profile_serving.py
import json
import logging
import multiprocessing as mp
-import os
import random
import time
from typing import List

import fire
import numpy as np
-from sentencepiece import SentencePieceProcessor
from lmdeploy.serve.turbomind.chatbot import Chatbot


-class Tokenizer:
-
-    def __init__(self, model_path: str):
-        # reload tokenizer
-        assert os.path.isfile(model_path), model_path
-        self.sp_model = SentencePieceProcessor(model_file=model_path)
-
-    def encode(self, prompts: List):
-        prompts_token_ids = self.sp_model.Encode(prompts,
-                                                 add_bos=False,
-                                                 add_eos=False)
-        return [len(token_ids) for token_ids in prompts_token_ids]
+from lmdeploy.turbomind.tokenizer import Tokenizer


def infer(chatbot, session_id: int, req_que: mp.Queue, res_que: mp.Queue):
@@ -103,8 +87,10 @@ def read_dataset(tokenizer_path: str, dataset_path: str, samples: int,
    start = time.perf_counter()
    tokenizer = Tokenizer(tokenizer_path)
-    prompts_token_lens = tokenizer.encode(prompts)
-    completions_token_lens = tokenizer.encode(completions)
+    prompts_token_lens = [len(tokenizer.encode(prompt)) for prompt in prompts]
+    completions_token_lens = [
+        len(tokenizer.encode(prompt)) for prompt in completions
+    ]
    print(f'elapsed time for tokenization: '
          f'{round(time.perf_counter() - start, 2)} s')