Unverified Commit 2067862d authored by q.yao, committed by GitHub

add profile throughput benchmark (#146)



* add profile throughput benchmark

* add output only throughput

* update req/min

* update benchmark readme

* fix lint

---------
Co-authored-by: grimoire <yaoqian@pjlab.org.cn>
parent b728064e
# Benchmark
We provide several profiling tools to benchmark our models.
## Profile with a dataset
Download the ShareGPT dataset below, or create your own dataset in the same format.
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
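If you create your own dataset, note that `profile_throughput.py` only reads the first two turns of each `conversations` entry (the prompt and its completion; see `sample_requests` below). A minimal sketch of the expected layout, where the file name and texts are placeholders:
```python
import json

# Each record needs a 'conversations' list with at least two turns;
# only turn 0 (prompt) and turn 1 (completion) are used by the benchmark.
records = [{
    'conversations': [
        {'from': 'human', 'value': 'What is the capital of France?'},
        {'from': 'gpt', 'value': 'The capital of France is Paris.'},
    ]
}]

with open('my_dataset.json', 'w') as f:
    json.dump(records, f)
```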
Profile your model with `profile_throughput.py`:
```bash
python profile_throughput.py \
ShareGPT_V3_unfiltered_cleaned_split.json \
/path/to/your/model \
${ModelType} \
--concurrency 64
```
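`${ModelType}` has to be one of the model names registered in `lmdeploy.model.MODELS`, since the benchmark instantiates it with `MODELS.get(model_name)()` (see `profile_throughput.py` below). A quick, non-authoritative way to list the registered names, assuming the registry exposes a `module_dict` mapping:
```python
from lmdeploy.model import MODELS

# Assumption: MODELS is a registry whose `module_dict` maps names to classes.
print(sorted(MODELS.module_dict.keys()))
```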
## Profile without a dataset
`profile_generation.py` benchmarks generation with dummy data:
```bash
python profile_generation.py \
/path/to/your/model \
${ModelType} \
--concurrency 8 --input_seqlen 0 --output_seqlen 2048
```
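With `--input_seqlen 0` the prompts are empty, so the run above measures pure generation (decoding) throughput. Assuming EOS is ignored, as `profile_throughput.py` does via `ignore_eos=True`, the total number of generated tokens is simply the product of the other two knobs; a trivial sketch:
```python
concurrency = 8        # parallel sessions, as in the command above
output_seqlen = 2048   # tokens generated per session

# Total tokens produced by the whole run (assuming EOS is ignored).
total_generated = concurrency * output_seqlen
print(total_generated)  # 16384
```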
## Profile serving
The tools above benchmark models through the Python API. `profile_serving.py` instead benchmarks a deployed Triton inference server:
```bash
python profile_serving.py \
${TritonServerAddress} \
${ModelName} \
/path/to/tokenizer \
/path/to/dataset \
--concurrency 64
```
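For a TurboMind-converted model, `/path/to/tokenizer` normally points inside the model workspace, the same layout that `Engine.__init__` in `profile_throughput.py` relies on. A small sketch of deriving it, with the model path as a placeholder:
```python
import os.path as osp

model_path = '/path/to/your/model'  # placeholder: the converted TurboMind workspace
# Same layout that Engine.__init__ below assumes.
tokenizer_path = osp.join(model_path, 'triton_models', 'tokenizer')
print(tokenizer_path)  # -> /path/to/your/model/triton_models/tokenizer
```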
...@@ -136,7 +136,7 @@ def main(model_path: str,
          f'{first_token_latency_ave:.2f}s\ntoken latency(min, max, ave): '
          f'{token_latency_min:.2f}s, {token_latency_max:.2f}s, '
          f'{token_latency_ave:.2f}s\n'
-         f'throughput: {throughput} token/s\n{"-" * 50}')
+         f'throughput per threads: {throughput} token/s\n{"-" * 50}')


if __name__ == '__main__':
......
import json
import os.path as osp
import random
import time
from queue import Queue
from threading import Thread
from typing import List, Tuple
import fire
from lmdeploy.model import MODELS
from lmdeploy.turbomind import Tokenizer, TurboMind

def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: Tokenizer,
) -> List[Tuple[str, int, int]]:
    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data['conversations']) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data['conversations'][0]['value'],
                data['conversations'][1]['value']) for data in dataset]

    # Tokenize the prompts and completions.
    prompts = [prompt for prompt, _ in dataset]
    prompt_token_ids = tokenizer(prompts).input_ids
    completions = [completion for _, completion in dataset]
    completion_token_ids = tokenizer(completions).input_ids
    tokenized_dataset = []
    for i in range(len(dataset)):
        output_len = len(completion_token_ids[i])
        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))

    # Filter out too long sequences.
    filtered_dataset: List[Tuple[str, int, int]] = []
    for prompt, prompt_token_ids, output_len in tokenized_dataset:
        prompt_len = len(prompt_token_ids)
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))

    # Sample the requests.
    sampled_requests = random.sample(filtered_dataset, num_requests)
    return sampled_requests


class Engine:

    def __init__(self, model_path: str, model_name: str):
        tokenizer_model_path = osp.join(model_path, 'triton_models',
                                        'tokenizer')
        tokenizer = Tokenizer(tokenizer_model_path)
        model = MODELS.get(model_name)()
        stop_words = model.stop_words
        tm_model = TurboMind(model_path=model_path, stop_words=stop_words)

        self.tm_model = tm_model
        self.tokenizer = tokenizer

    def _inference(self, queue, session_id: int):
        model_inst = self.tm_model.create_instance()

        while True:
            request = queue.get()
            if request is None:
                # stop signal
                queue.put(None)
                return
            else:
                prompt, _, output_seqlen = request
                input_ids = self.tokenizer.encode(prompt)

                for outputs in model_inst.stream_infer(
                        session_id,
                        input_ids=input_ids,
                        request_output_len=output_seqlen,
                        temperature=1.0,
                        top_p=1.0,
                        sequence_start=True,
                        sequence_end=True,
                        ignore_eos=True):
                    res, tokens = outputs[0]

                self.tokenizer.decode(res)

    def process_request(self, requests, concurrency: int = 1):
        q = Queue()
        threads = []

        start = time.time()

        # start threads
        for i in range(concurrency):
            t = Thread(target=self._inference, args=(q, i))
            t.start()
            threads.append(t)

        # feed request to q
        for req in requests:
            q.put(req)
        q.put(None)

        # wait for finish
        for t in threads:
            t.join()

        end = time.time()

        return end - start


def main(dataset: str,
         model_path: str,
         model_name: str,
         concurrency: int = 1,
         num_prompts: int = 1000):
    engine = Engine(model_path, model_name)
    tokenizer = engine.tokenizer

    requests = sample_requests(dataset, num_prompts, tokenizer)

    elapsed_time = engine.process_request(requests, concurrency)
    total_num_tokens = sum(prompt_len + output_len
                           for _, prompt_len, output_len in requests)
    total_num_out_tokens = sum(output_len for _, _, output_len in requests)
    print(f'Throughput requests: {len(requests) / elapsed_time:.2f} req/s')
    print(
        f'Throughput requests: {len(requests) * 60 / elapsed_time:.2f} req/min'
    )
    print(f'Throughput tokens: {total_num_tokens / elapsed_time:.2f} tokens/s')
    print('Throughput tokens(output only):'
          f'{total_num_out_tokens / elapsed_time:.2f} tokens/s')


if __name__ == '__main__':
    fire.Fire(main)
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
-from typing import Sequence
+from typing import Sequence, Union

import torch
...@@ -61,6 +61,21 @@ class SentencePieceTokenizer:
        t = t.tolist()
        return self.model.Decode(t)

+    def __call__(self, s: Union[str, Sequence[str]]):
+        """Tokenize prompts.
+        Args:
+            s (str): prompts
+        Returns:
+            list[int]: token ids
+        """
+        import addict
+        add_bos = False
+        add_eos = False
+        input_ids = self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
+        return addict.Addict(input_ids=input_ids)


class HuggingFaceTokenizer:
    """Tokenizer of sentencepiece.
...@@ -127,6 +142,17 @@ class HuggingFaceTokenizer:
        skip_special_tokens = True
        return self.model.decode(t, skip_special_tokens=skip_special_tokens)

+    def __call__(self, s: Union[str, Sequence[str]]):
+        """Tokenize prompts.
+        Args:
+            s (str): prompts
+        Returns:
+            list[int]: token ids
+        """
+        add_special_tokens = False
+        return self.model(s, add_special_tokens=add_special_tokens)


class Tokenizer:
    """Tokenize prompts or de-tokenize tokens into texts.
...@@ -186,3 +212,13 @@ class Tokenizer:
            str: text of decoding tokens
        """
        return self.model.decode(t)

+    def __call__(self, s: Union[str, Sequence[str]]):
+        """Tokenize prompts.
+        Args:
+            s (str): prompts
+        Returns:
+            list[int]: token ids
+        """
+        return self.model(s)
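These new `__call__` methods are what let `sample_requests` batch-tokenize prompts via `tokenizer(prompts).input_ids`. A minimal usage sketch, assuming a converted model workspace at a placeholder path:
```python
from lmdeploy.turbomind import Tokenizer

# Placeholder path; the tokenizer lives under the TurboMind workspace.
tokenizer = Tokenizer('/path/to/your/model/triton_models/tokenizer')
prompts = ['Hello, world!', 'How are you today?']

# __call__ returns an object exposing `input_ids`: one token-id list per prompt,
# which is exactly how sample_requests() in profile_throughput.py consumes it.
token_ids = tokenizer(prompts).input_ids
print([len(ids) for ids in token_ids])
```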