Unverified commit 65c65776 authored by Lianmin Zheng, committed by GitHub

Improve benchmark scripts & fix llava (#613)

parent 66581596
@@ -30,7 +30,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r...
 #### Run ShareGPT
 ```
-python3 bench_throughput.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
+python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
 ```

 ## Other baselines
@@ -42,14 +42,20 @@ python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --t...
 ```
 # run synthetic
-python3 bench_throughput.py --backend vllm --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256
+python3 bench_serving.py --backend vllm --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256
 ```

 ```
 # run ShareGPT
-python3 bench_throughput.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
+python3 bench_serving.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
 ```
+
+```
+# run one batch
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B --tensor 8 --disable-log-requests --max-num-seqs 1024 --quantization fp8
+python3 bench_one.py --input-len 1024 --batch-size 1 1 2 4 8 16 32 64 128 256 512 768 1024 --port 8000 --backend vllm
+```

 ### LightLLM
 ```
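For context on what the `bench_one.py` sweep above measures: it issues one batched request per batch size and times it end to end. The sketch below is a rough, simplified rendering of that loop; the `send_one_batch` callback is a stand-in for the backend-specific request code shown later in this diff, and the throughput formula matches the one used in `bench_one.py`.

```python
import time

def sweep(send_one_batch, batch_sizes, max_new_tokens=256):
    """Time one batched request per batch size.

    `send_one_batch(bs)` is a caller-supplied stand-in that submits a single
    batch of `bs` requests and blocks until all of them finish.
    """
    for bs in batch_sizes:
        tic = time.time()
        send_one_batch(bs)
        latency = time.time() - tic
        # Decode throughput: total generated tokens / wall-clock time,
        # the same formula bench_one.py prints per batch size.
        output_throughput = bs * max_new_tokens / latency
        print(f"bs={bs:4d}  latency: {latency:.2f} s, speed: {output_throughput:.2f} token/s")
```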
@@ -57,5 +63,5 @@ python -m lightllm.server.api_server --model_dir ~/model_weights/Llama-2-7b-chat...
 ```
 ```
-python3 bench_throughput.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
+python3 bench_serving.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
 ```
\ No newline at end of file
@@ -15,19 +15,19 @@ def run_one_batch_size(bs):
     url = f"{args.host}:{args.port}"
     max_new_tokens = args.max_tokens

-    a = 20
-    prompt = f"{a, }"
+    if args.input_len:
+        input_ids = [
+            [int(x) for x in np.random.randint(0, high=16384, size=(args.input_len,))] for _ in range(bs)
+        ]
+    else:
+        text = [f"{i, }" for i in range(bs)]

     tic = time.time()
     if args.backend == "srt":
         if args.input_len:
-            inputs = {"input_ids": [
-                [int(x) for x in np.random.randint(0, high=16384, size=(args.input_len,))] for _ in range(bs)
-            ]}
+            inputs = {"input_ids": input_ids}
         else:
-            inputs = {"text": [
-                f"{i, }" for i in range(bs)
-            ]}
+            inputs = {"text": text}

         response = requests.post(
             url + "/generate",
@@ -44,7 +44,7 @@ def run_one_batch_size(bs):
         response = requests.post(
             url + "/generate",
             json={
-                "inputs": prompt,
+                "inputs": text[0],
                 "parameters": {
                     "temperature": 0,
                     "max_new_tokens": max_new_tokens,
@@ -53,13 +53,19 @@ def run_one_batch_size(bs):
             },
         )
     elif args.backend == "vllm":
+        if args.input_len:
+            inputs = {"prompt": input_ids}
+        else:
+            inputs = {"prompt": text}
+
         response = requests.post(
-            url + "/generate",
+            url + "/v1/completions",
             json={
-                "prompt": prompt,
+                "model": args.vllm_model_name,
                 "temperature": 0,
                 "max_tokens": max_new_tokens,
                 "ignore_eos": True,
+                **inputs,
             },
         )
     elif args.backend == "ginfer":
@@ -71,7 +77,7 @@ def run_one_batch_size(bs):
         tic = time.time()
         sample_request = sampler_pb2.SampleTextRequest(
-            prompt=prompt,
+            prompt=text[0],
             settings=sampler_pb2.SampleSettings(
                 max_len=max_new_tokens,
                 rng_seed=0,
@@ -92,7 +98,7 @@ def run_one_batch_size(bs):
     output_throughput = bs * max_new_tokens / latency
     print(f"latency: {latency:.2f} s, speed: {output_throughput:.2f} token/s")

-    with open("tmp_output.txt", "a") as fout:
+    with open("results.jsonl", "a") as fout:
         res = {
             "input_len": args.input_len,
             "output_len": args.max_tokens,
@@ -111,6 +117,7 @@ if __name__ == "__main__":
     parser.add_argument("--input-len", type=int, default=None)
     parser.add_argument("--batch-size", type=int, nargs='*', default=[1])
     parser.add_argument("--max-tokens", type=int, default=256)
+    parser.add_argument("--vllm-model-name", type=str, default="meta-llama/Meta-Llama-3-70B")
     args = parser.parse_args()

     if args.port is None:
# Code Structure
- `backend`: Various backends for the language interpreter.
- `lang`: The frontend language.
- `srt`: The runtime for running local models.
- `test`: Test utilities.
- `api.py`: Public API.
- `bench_latency.py`: Benchmark utilities.
- `global_config.py`: The global configs and constants.
- `launch_server.py`: The entry point for launching the local server.
- `utils.py`: Common utilities.
@@ -276,17 +276,13 @@ class ModelRunner:
         input_metadata = InputMetadata.create(
             self,
             forward_mode=ForwardMode.EXTEND,
-            tp_size=self.tp_size,
             req_pool_indices=batch.req_pool_indices,
             seq_lens=batch.seq_lens,
             prefix_lens=batch.prefix_lens,
             position_ids_offsets=batch.position_ids_offsets,
             out_cache_loc=batch.out_cache_loc,
-            top_logprobs_nums=batch.top_logprobs_nums,
             return_logprob=batch.return_logprob,
-            flashinfer_prefill_wrapper_ragged=self.flashinfer_prefill_wrapper_ragged,
-            flashinfer_prefill_wrapper_paged=self.flashinfer_prefill_wrapper_paged,
-            flashinfer_decode_wrapper=self.flashinfer_decode_wrapper,
+            top_logprobs_nums=batch.top_logprobs_nums,
         )
         return self.model.forward(
             batch.input_ids,