Unverified Commit e50387cc authored by WRH, committed by GitHub

[Feature] Profiling tool for huggingface and deepspeed models (#161)

* initial

* add docs

* noqa on docstring

* support no streamer

* typo

* add accel in model name

* fix lint

* fix CSVWRitter typo

* typo
"""Benchmark huggingface models and maybe speedup by deepspeed.
Theoretically, this tool is compatible with all huggingface models.
Example 1: Test huggingface llama2 with simulated 128 input token and 128 generated token
```shell
python profile_hf_generation.py \
--model_path $PATH_TO_HF_LLAMA2 \
--batch_size 16 \
--input_seqlen 128 \
--gen_seqlen 128 \
--out_file profile_hf.csv
```

Example 2: The same test as above, but accelerated with DeepSpeed inference and run for more test rounds
```shell
python profile_hf_generation.py \
--model_path $PATH_TO_HF_LLAMA2 \
--batch_size 16 \
--input_seqlen 128 \
--gen_seqlen 128 \
--test_round 2 \
--accel deepspeed \
--out_file profile_hf.csv
```

Example 3: The same test as above, but without a streamer to measure per-token time.
Only the overall time is measured, which is slightly faster than the streaming mode.
```shell
python profile_hf_generation.py \
--model_path $PATH_TO_HF_LLAMA2 \
--batch_size 16 \
--input_seqlen 128 \
--gen_seqlen 128 \
--test_round 2 \
--accel deepspeed \
--no-streamer \
--out_file profile_hf.csv
```

Results will be saved to `profile_hf.csv`, a comma-separated file with the
following fields:
1. model: name of the model, specified with --model_log_name, otherwise f"{model.__class__.__name__}-{accel}"
2. batch_size: as the name suggests
3. input_seqlen: as the name suggests
4. gen_len: number of tokens to generate
5. total_len: gen_len + input_seqlen
   In total, the model takes random input ids of shape (batch_size, input_seqlen) and
   runs forward `gen_len` times to generate output ids of shape (batch_size, gen_len)
6. first_time: latency to forward the first (batch_size, input_seqlen) ids
   and get the `input_seqlen+1`-th batch of outputs of shape (batch_size, 1)
7. next_time: average latency of the next tokens (averaged over 5 samples);
   this measures latency when the context length is short
8. last_time: average latency of the last tokens (averaged over 5 samples);
   this measures latency when the context length is long
9. total time: total time to generate all tokens, in ms
10. throughput(total): batch_size * total_len / total_time, in tokens/s (same measure as vLLM)
11. throughput(gen): batch_size * gen_len / total_time, in tokens/s
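
For illustration (hypothetical numbers, not measurements): with batch_size=16,
input_seqlen=128 and gen_seqlen=128 (so total_len=256), a measured total_time of
8000 ms gives throughput(total) = 16 * 256 * 1000 / 8000 = 512 tok/s and
throughput(gen) = 16 * 128 * 1000 / 8000 = 256 tok/s.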
""" # noqa: E501
import csv
import logging
import os
import time
from typing import Optional
import fire
import torch
from transformers import AutoModelForCausalLM, GenerationConfig
from lmdeploy.pytorch.accel import LoadNoInit
logger = logging.getLogger(__file__)
logger.setLevel(logging.DEBUG)
info = logger.info
warning = logger.warning
debug = logger.debug
cinfo = lambda x: info('\033[1;32m' + x + '\033[0m') # noqa E731
cprint = lambda x: print('\033[1;32m' + x + '\033[0m') # noqa E731
avg = lambda x: sum(x) / len(x) # noqa E731
class TimingStreamer:
"""Timing helper for HuggingFace models."""
def __init__(self) -> None:
# self.token_cache = []
# self.tokens = None
torch.cuda.synchronize()
self.evts = []
# self.value = 0
def put(self, value):
"""
Notes:
            When `put` is called for the first time, no prompt has been fed to the model yet.
            When `put` is called later, an event is recorded for the previous generation step,
            which means the second event records the time for the first prompt.
            GenerationMixin calls `.cpu()` on the output token, which implies a sync.
""" # noqa: E501
# self.value += 1
# self.token_cache.append(value)
evt = torch.cuda.Event(enable_timing=True)
evt.record()
self.evts.append(evt)
def end(self):
torch.cuda.synchronize()
# self.tokens = torch.hstack([_atleast_2d(v) for v in self.token_cache]) # noqa: E501
def get_times(self):
"""Maybe deprecated.
Returns:
            a tuple of times in ms: (total time, first prompt time, avg next-token time)
"""
first = self.evts[0].elapsed_time(self.evts[1])
rest = [
self.evts[i].elapsed_time(self.evts[i + 1])
for i in range(1,
len(self.evts) - 1)
]
avg = sum(rest) / len(rest)
return first + sum(rest), first, avg
def raw_times(self):
"""
Returns:
            a list of per-step times in ms (elapsed time between consecutive events).
"""
evts = self.evts
r = [evts[i].elapsed_time(evts[i + 1]) for i in range(len(evts) - 1)]
return r
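
# Illustrative usage of TimingStreamer (a sketch; `main()` below does the same):
#   ts = TimingStreamer()
#   model.generate(input_ids, generation_config, streamer=ts)
#   step_times_ms = ts.raw_times()  # [prefill, decode_1, decode_2, ...]
# GenerationMixin calls `put()` before the first forward pass and after every
# decoding step, and `end()` when generation finishes, so consecutive CUDA
# events bracket each step.
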
class CSVWritter:
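    """Append benchmark results as rows of a CSV file; only rank 0 writes."""
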
def __init__(
self,
file='unnamed.csv',
header=[
'model',
'batch_size',
'input_seqlen',
'gen_len',
'total_len',
'first_time',
'next_time',
'last_time',
'total time',
'throughput(total)',
'throughput(gen)',
],
    ):
        if self.on_master:
            self.file = file
            # Use a context manager so the file handle is closed after writing.
            with open(file, 'a') as f:
                csv.writer(f).writerow(header)

    def write(self, line):
        if self.on_master:
            with open(self.file, 'a') as f:
                csv.writer(f).writerow(line)
@property
def on_master(self):
# return not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 # noqa: E501
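        # Launchers such as torchrun and the DeepSpeed launcher export RANK,
        # so this works even before torch.distributed is initialized.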
rank = int(os.environ.get('RANK', 0))
return rank == 0
def init_hf_model(model_path: str):
start = time.monotonic()
with LoadNoInit():
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
trust_remote_code=True)
    print(f'load model in {time.monotonic() - start:.1f} s')
return model
def accel_deepspeed(model, max_out_tokens, tp_size=1):
import deepspeed
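    # Wrap the HF model with DeepSpeed inference kernels. `max_out_tokens`
    # sizes the inference KV cache, so it must cover input + generated tokens
    # (the caller passes total_seqlen plus a small margin).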
ds_model = deepspeed.init_inference(
model=model, # Transformers models
tensor_parallel={'tp_size': tp_size},
dtype=torch.float16, # dtype of the weights (fp16)
replace_with_kernel_inject=True,
max_out_tokens=max_out_tokens,
)
return ds_model
def main(model_path: str,
batch_size: int,
input_seqlen: int,
gen_seqlen: int,
test_round: int = 1,
accel: Optional[str] = None,
out_file: Optional[str] = 'profile_hf.csv',
model_log_name: Optional[str] = None,
no_streamer: bool = False):
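    """Profile latency and throughput of `model.generate`; see the module docstring for CLI examples."""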
total_seqlen = input_seqlen + gen_seqlen
model = init_hf_model(model_path)
vocab_size = model.config.vocab_size
if model_log_name is None:
model_log_name = model.__class__.__name__
if accel is not None:
model_log_name += f'-{accel}'
if accel is None:
model = model.cuda()
elif accel == 'deepspeed':
model = accel_deepspeed(model, total_seqlen + 6)
# longer total seqlen for fault tolerance
else:
raise NotImplementedError(f'accel {accel} not supported.')
# log to file
csvwritter = CSVWritter(out_file)
    cprint(f'Benchmarking {model_log_name} '
           f'with batch_size={batch_size}, input_seqlen={input_seqlen}, '
           f'gen_seqlen={gen_seqlen}, accel={accel}')
for r in range(test_round):
# TODO: now write every round to csv
# Use external tool for analysis
cprint(f'Test round {r}')
        # input_id = 0 sometimes causes CUDA errors, so sample ids from [10, vocab_size)
fake_inputs = torch.randint(10, vocab_size, (batch_size, input_seqlen))
fake_inputs = fake_inputs.cuda()
ts = TimingStreamer() if not no_streamer else None
torch.cuda.synchronize()
start = time.monotonic()
fake_outputs = model.generate(
fake_inputs,
GenerationConfig(max_new_tokens=gen_seqlen,
do_sample=False,
eos_token_id=[-1]),
streamer=ts,
)
torch.cuda.synchronize()
end = time.monotonic()
assert fake_outputs.size() == (batch_size,
total_seqlen), fake_outputs.size()
# total_time, first_time, _ = ts.get_times()
if no_streamer:
total_time = (end - start) * 1000
first_time = next_time = last_time = 0
else:
raw_times = ts.raw_times() # You may further analyze this
total_time = sum(raw_times)
first_time = raw_times[0]
next_time = avg(raw_times[1:6])
last_time = avg(raw_times[-5:])
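        # Times above are in milliseconds, hence the factor of 1000 when
        # converting to tokens per second.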
tt = batch_size * total_seqlen * 1000 / total_time
tg = batch_size * gen_seqlen * 1000 / total_time
cprint(f'First token/ms: {first_time:.1f}, '
f'Next tokens/ms: {next_time:.1f}, '
f'Last tokens/ms: {last_time:.1f}, '
f'Total Time/ms: {total_time:5.3f}, '
f'Throughput Total(tok/s): {tt:5.3f}, '
f'Throughput Gen(tok/s): {tg:5.3f}')
if test_round > 1 and r > 0:
            # The first round is warm-up and is not written to the CSV
csvwritter.write([
model_log_name, batch_size, input_seqlen, gen_seqlen,
total_seqlen, first_time, next_time, last_time, total_time, tt,
tg
])
if __name__ == '__main__':
fire.Fire(main)
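
# A possible way to inspect the resulting CSV afterwards (a sketch, assuming
# pandas is installed; not part of this tool):
#   import pandas as pd
#   df = pd.read_csv('profile_hf.csv')
#   print(df[['model', 'batch_size', 'throughput(gen)']])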