Unverified commit 945aa9be authored by Lianmin Zheng, committed by GitHub

Update readme (#568)

parent 2e6e62e1
# Benchmark Latency and Throughput

## SGLang

### Launch server
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```

Install [FlashInfer](https://github.com/flashinfer-ai/flashinfer) if you want it to be enabled.
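
Before running any benchmark, it can help to confirm the server responds. The snippet below is a minimal sketch, assuming the `/generate` HTTP endpoint exposed by `sglang.launch_server` with a `text` prompt and `sampling_params` payload; adjust host, port, and parameters to your setup.

```
import requests

# Single-request sanity check against the launched server.
# Assumes the /generate endpoint and payload shape of sglang.launch_server.
response = requests.post(
    "http://localhost:30000/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 16},
    },
)
print(response.json())
```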
### Benchmark one batch
```
python3 bench_one.py
python3 bench_one.py --batch-size 64
```
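
`bench_one.py` measures the latency and throughput of a single batch. If you only want a rough client-side approximation of the same number, one sketch is to fire `batch_size` concurrent requests and time the wall clock, reusing the hypothetical `/generate` call from the previous snippet; `bench_one.py` itself may measure prefill and decode differently.

```
import time
import requests
from concurrent.futures import ThreadPoolExecutor

# Rough one-batch approximation: send `batch_size` concurrent requests and
# time the wall clock. Assumes the /generate endpoint shown above.
batch_size, max_new_tokens = 64, 256

def generate(i):
    return requests.post(
        "http://localhost:30000/generate",
        json={
            "text": f"Question {i}: write a short story.",
            "sampling_params": {"temperature": 0, "max_new_tokens": max_new_tokens},
        },
    ).json()

start = time.perf_counter()
with ThreadPoolExecutor(max_workers=batch_size) as pool:
    results = list(pool.map(generate, range(batch_size)))
elapsed = time.perf_counter() - start

print(f"Received {len(results)} responses in {elapsed:.2f} s")
# Upper bound: assumes every request generated max_new_tokens tokens.
print(f"Output throughput (upper bound): {batch_size * max_new_tokens / elapsed:.1f} token/s")
```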
### Benchmark online serving with many requests
```
python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256
```
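
The `--request-rate` flag controls how fast the client injects requests. Serving benchmarks of this kind typically draw inter-arrival gaps from an exponential distribution so that arrivals form a Poisson process, with an infinite rate meaning "send everything at once". The sketch below only illustrates that pacing pattern; the actual flag handling in the benchmark script may differ, and `send_one` is a hypothetical placeholder for issuing one request.

```
import asyncio
import numpy as np

async def send_one(prompt):
    # Hypothetical placeholder: issue one request to the serving backend.
    await asyncio.sleep(0)

async def run(prompts, request_rate):
    # Poisson arrivals: exponential gaps with mean 1 / request_rate.
    # request_rate = inf injects the whole workload at once.
    tasks = []
    for prompt in prompts:
        tasks.append(asyncio.create_task(send_one(prompt)))
        if request_rate != float("inf"):
            await asyncio.sleep(np.random.exponential(1.0 / request_rate))
    await asyncio.gather(*tasks)

# Small demo workload; the README command above uses 1000 prompts at rate 100.
asyncio.run(run([f"prompt {i}" for i in range(100)], request_rate=100))
```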
### Benchmark online serving on the ShareGPT dataset

#### Download data
```
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
#### Run ShareGPT
```
python3 bench_throughput.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
```
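
Each record in `ShareGPT_V3_unfiltered_cleaned_split.json` is a conversation with a list of `conversations` turns, each carrying a `from` role and a `value` string. A common recipe is to keep conversations with at least two turns, use the first turn as the prompt, and use the second turn to set the target output length; the exact filtering inside the benchmark script may differ. A quick inspection sketch:

```
import json

# Inspect the ShareGPT dump and extract (prompt, reference completion) pairs.
# This mirrors a common preprocessing recipe, not necessarily the script's own.
with open("ShareGPT_V3_unfiltered_cleaned_split.json") as f:
    dataset = json.load(f)

pairs = []
for record in dataset:
    turns = record.get("conversations", [])
    if len(turns) >= 2:
        pairs.append((turns[0]["value"], turns[1]["value"]))

print(f"{len(dataset)} conversations, {len(pairs)} usable prompt/completion pairs")
```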
## Other baselines

### vLLM

@@ -30,13 +41,13 @@ python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --t

```
# run synthetic
python3 bench_throughput.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256
```

```
# run ShareGPT
python3 bench_throughput.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
```
@@ -46,5 +57,5 @@ python -m lightllm.server.api_server --model_dir ~/model_weights/Llama-2-7b-chat

```
python3 bench_throughput.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
```
@@ -296,23 +296,27 @@ def main(args: argparse.Namespace):

```
    )
    benchmark_end_time = time.perf_counter()
    benchmark_time = benchmark_end_time - benchmark_start_time

    # Compute the statistics.
    avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
    avg_per_token_latency = np.mean(
        [
            latency / (prompt_len + output_len)
            for prompt_len, output_len, latency in REQUEST_LATENCY
        ]
    )
    avg_per_output_token_latency = np.mean(
        [latency / output_len for _, output_len, latency in REQUEST_LATENCY]
    )
    decoding_throughput = np.sum([
        output_len for _, output_len, _ in REQUEST_LATENCY]) / benchmark_time

    print(f"Total time: {benchmark_time:.2f} s")
    print(f"Request throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
    print(f"Decoding throughput: {decoding_throughput:.2f} token/s")
    print(f"Average latency: {avg_latency:.2f} s")
    print(f"Average latency per token: {avg_per_token_latency:.2f} s")
    print(f"Average latency per output token: {avg_per_output_token_latency:.2f} s")


if __name__ == "__main__":
    ...
```
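
As a quick sanity check on the metric definitions above, here is a tiny worked example with made-up `(prompt_len, output_len, latency)` tuples; the numbers are illustrative only and do not come from a real run.

```
import numpy as np

# Illustrative inputs only: (prompt_len, output_len, latency in seconds).
REQUEST_LATENCY = [(512, 128, 4.0), (256, 256, 8.0)]
benchmark_time = 10.0
num_prompts = len(REQUEST_LATENCY)

request_throughput = num_prompts / benchmark_time                                  # 0.20 requests/s
decoding_throughput = np.sum([o for _, o, _ in REQUEST_LATENCY]) / benchmark_time  # 38.40 token/s
avg_latency = np.mean([l for _, _, l in REQUEST_LATENCY])                          # 6.00 s
avg_per_token = np.mean([l / (p + o) for p, o, l in REQUEST_LATENCY])              # ~0.011 s
avg_per_output_token = np.mean([l / o for _, o, l in REQUEST_LATENCY])             # ~0.031 s

print(request_throughput, decoding_throughput, avg_latency, avg_per_token, avg_per_output_token)
```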