Unverified commit 70359bf3, authored by Lianmin Zheng, committed by GitHub

Update benchmark scripts (#8)

parent 01ca82d7
@@ -23,7 +23,7 @@ python3 bench_dspy_intro.py --backend sglang
```
docker run --name tgi --rm -ti --gpus all --network host \
  -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
-  ghcr.io/huggingface/text-generation-inference:1.1.0 \
+  ghcr.io/huggingface/text-generation-inference:1.3.0 \
  --model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
  --max-input-length 2048 --max-total-tokens 4096 \
  --port 24000
...
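As a quick sanity check that the TGI container above is serving before the benchmark is launched, a request against TGI's standard `/generate` route on the same port should return a short completion. A minimal sketch (the prompt and token count are arbitrary):

```
import requests

# Assumes the TGI container from the command above is listening on port 24000.
resp = requests.post(
    "http://localhost:24000/generate",
    json={"inputs": "Hello,", "parameters": {"max_new_tokens": 8}},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["generated_text"])
```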
@@ -57,6 +57,8 @@ def main(args):
            out = model + context + select(choices, name="answer")
            return choices.index(out["answer"])
+        call_select("Hello,", ["world", "earth"])
    elif args.backend == "lmql":
        import lmql
        model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
@@ -135,6 +137,6 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-shot", type=int, default=20)
    parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
    args = add_common_other_args_and_parse(parser)
    main(args)
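The `call_select("Hello,", ["world", "earth"])` line added above is a warmup request: firing one tiny request before the timer starts keeps one-time costs (model load, kernel compilation, cache setup) out of the measured latency. A generic sketch of the pattern, with an illustrative helper rather than the benchmark's own code:

```
import time

def timed_batch(call_fn, requests):
    # Warm up once so startup costs are not attributed to the measured batch.
    call_fn(requests[0])
    tic = time.time()
    results = [call_fn(r) for r in requests]
    latency = time.time() - tic
    return results, latency
```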
@@ -91,6 +91,6 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-shot", type=int, default=20)
    parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
    args = add_common_sglang_args_and_parse(parser)
    main(args)
@@ -17,14 +17,13 @@ outlines 0.0.22
### Benchmark sglang
-Run llama-7b
+Run Llama-7B
```
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
-Run mixtral-8x7b
-(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
+Run Mixtral-8x7B
```
python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
@@ -39,7 +38,7 @@ python3 bench_sglang.py --num-questions 10
### Benchmark vllm
-Run llama-7b
+Run Llama-7B
```
python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
@@ -54,8 +53,8 @@ python3 bench_other.py --backend vllm --num-questions 10
### Benchmark guidance
-Run llama-7b and benchmark
+Run Llama-7B and benchmark
```
python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
```
\ No newline at end of file
@@ -105,7 +105,7 @@ def main(args):
    with open(args.result_file, "a") as fout:
        value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
            "backend": args.backend,
            "num_gpus": 1,
            "latency": round(latency, 3),
...
@@ -64,8 +64,6 @@ def main(args):
    # Run requests
    tic = time.time()
    states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel)
-    for state in states:
-        state.sync()
    latency = time.time() - tic
    # Compute accuracy
@@ -80,7 +78,7 @@ def main(args):
    with open(args.result_file, "a") as fout:
        value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
            "backend": args.backend,
            "num_gpus": 1,
            "latency": round(latency, 3),
...
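The `value = {...}` hunks above (together with the `json_regex_decode` → `json_decode_regex` rename) standardize the task name written into the result file. The scripts append one record per run to `args.result_file`; a sketch of that reporting pattern — the helper itself and the `json.dumps` line are illustrative assumptions, only the field names come from the diff:

```
import json

def dump_result(result_file, task, backend, latency):
    # Append one JSON record per benchmark run so repeated runs accumulate.
    value = {
        "task": task,          # e.g. "json_decode_regex"
        "backend": backend,    # e.g. "sglang", "vllm", "guidance"
        "num_gpus": 1,
        "latency": round(latency, 3),
    }
    with open(result_file, "a") as fout:
        fout.write(json.dumps(value) + "\n")
```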
@@ -3,19 +3,6 @@
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
-### Performance
-- Model: Llama-2-7b-chat-hf
-- `--num-prompts 2000 --request-rate 200`
-- On 4 A10 (24G) GPUs
-| Backend     | Throughput      | Latency  |
-| ----------- | --------------- | -------- |
-| srt         | 5.82 requests/s | 343.54 s |
-| vllm==0.2.6 | 3.93 requests/s | 509.08 s |
-| vllm==0.2.7 | 5.02 requests/s | 398.25 s |
### SGLang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000

@@ -28,7 +15,7 @@ python3 bench_throughput.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat
### vLLM
```
-python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16
+python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16 --port 21000
```
```
...
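For reference, the throughput and latency columns of the table removed above are mutually consistent under throughput ≈ num-prompts / latency, given the listed `--num-prompts 2000`:

```
# 2000 / 343.54 ≈ 5.82, 2000 / 509.08 ≈ 3.93, 2000 / 398.25 ≈ 5.02 requests/s
for backend, latency_s in [("srt", 343.54), ("vllm==0.2.6", 509.08), ("vllm==0.2.7", 398.25)]:
    print(f"{backend}: {2000 / latency_s:.2f} requests/s")
```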
(One file's diff is collapsed and not shown.)
@@ -95,6 +95,9 @@ def evaluate(args, subject, dev_df, test_df):
                max_tokens=max_tokens, temperature=0)
            return out["answer"]
+        # warmup
+        call_generate("Hello,", temperature=1.0, max_tokens=8)
    elif args.backend == "lmql":
        import lmql
        model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
...
### Benchmark sglang
-Run llama-7b
+Run Llama-7B
```
python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
-Run mixtral-8x7b
+Run Mixtral-8x7B
(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
```

@@ -27,13 +27,13 @@ python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf --long
### Benchmark vLLM
-Run llama-7b
+Run Llama-7B
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
```
-Run mixtral-8x7b
+Run Mixtral-8x7B
```
python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model mistralai/Mixtral-8x7B-Instruct-v0.1 --disable-log-requests --port 21000 --tensor-parallel-size 8

@@ -53,14 +53,14 @@ python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm
### Benchmark guidance
-Benchmark llama-7b(short output)
+Benchmark Llama-7B (short output)
```
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1
```
-Benchmark llama-7b(long output)
+Benchmark Llama-7B (long output)
```
python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --long
```
\ No newline at end of file
@@ -99,7 +99,7 @@ def main(args):
    with open(args.result_file, "a") as fout:
        value = {
-            "task": "multi_turns",
+            "task": "multi_turn_chat",
            "backend": args.backend,
            "num_gpus": 1,
            "latency": round(latency, 3),
...
@@ -21,8 +21,6 @@ def multi_turns(s, qas):
def main(args):
-    print(args)
    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
    multi_qas = gen_arguments(args, tokenizer)

@@ -33,8 +31,6 @@ def main(args):
    states = multi_turns.run_batch(
        multi_qas, temperature=0, backend=backend, num_threads=args.parallel
    )
-    for state in states:
-        state.sync()
    latency = time.time() - tic
    print(f"Latency: {latency:.3f}")
@@ -43,7 +39,7 @@ def main(args):
    with open(args.result_file, "a") as fout:
        value = {
-            "task": "multi_turns",
+            "task": "multi_turn_chat",
            "backend": args.backend,
            "num_gpus": 1,
            "latency": round(latency, 3),

@@ -74,4 +70,6 @@ if __name__ == "__main__":
    args.min_len_a = 256
    args.max_len_a = 512
    args.num_qa = 20
+    print(args)
    main(args)
## Run benchmark
+NOTE: This is an implementation for replaying a given trace for throughput/latency benchmark purposes. It is not an actual ReAct agent implementation.
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
...
@@ -124,6 +124,9 @@ def main(args):
            ))
            return out["result"]
+        # warmup
+        call_generate("Hello,", 1.0, 8, ".")
    else:
        raise ValueError(f"Invalid backend: {args.backend}")
...
@@ -82,9 +82,10 @@ Action 3: Finish[yes]
""" + question)
    for i in range(1, len(triplets) + 2):
        s += "Thought " + str(i) + ":"
+        # NOTE: This is an implementation for replaying a given trace for benchmark purposes. It is not an actual ReAct agent implementation.
        ss = s.fork(1)
        ss[0] += sgl.gen(name="thought_action", max_tokens=200, stop="Observation")
-        # ss.join()
+        ss.join()
        # to verify the correctness of output, this should be collected
        # print(ss[0]["thought_action"])
        if i > len(triplets):
...
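On the `ss.join()` change above: in sglang's frontend language, `fork` creates parallel branches of the prompt state and `join` waits for them before the parent state continues, so enabling it makes the replay wait for each generated thought before the loop moves on. A minimal sketch of the same pattern, assuming a server launched as in the READMEs above on port 30000:

```
import sglang as sgl

@sgl.function
def replay_step(s, prefix):
    s += prefix
    # Fork one branch, generate into it, then join so the parent state
    # only continues after the generation has finished.
    ss = s.fork(1)
    ss[0] += sgl.gen(name="thought_action", max_tokens=200, stop="Observation")
    ss.join()

if __name__ == "__main__":
    # Assumes: python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
    sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
    replay_step.run(prefix="Thought 1:")
```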
(One file's diff is too large to display and is not shown.)
@@ -5,13 +5,15 @@ wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_sch
## Run benchmark
+NOTE: This is an implementation for throughput/latency benchmark purposes. The prompts are not tuned to achieve good accuracy on the GSM-8K tasks.
### Benchmark sglang
```
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
```
```
-python3 bench_sglang.py --num-questions 32 --parallel 8
+python3 bench_sglang.py --num-questions 32
python3 bench_sglang.py --num-questions 16 --parallel 1
```
...
@@ -141,6 +141,9 @@ def main(args):
            rets.append(out["answer"])
        return rets
+    # warmup
+    call_generate("Hello,", 1.0, 8, ".", 1)
    # Run requests
    states = [None] * len(questions)
    def get_one_answer(i):
...