sglang · commit 70359bf3 (unverified)

Update benchmark scripts (#8)

Authored by Lianmin Zheng on Jan 15, 2024; committed via GitHub on Jan 15, 2024.
Parent commit: 01ca82d7
Showing 20 changed files with 165 additions and 42 deletions.
Changed files:

- benchmark/dspy/README.md (+1, -1)
- benchmark/hellaswag/bench_other.py (+3, -1)
- benchmark/hellaswag/bench_sglang.py (+1, -1)
- benchmark/json_decode_regex/README.md (+5, -6)
- benchmark/json_decode_regex/bench_other.py (+1, -1)
- benchmark/json_decode_regex/bench_sglang.py (+1, -3)
- benchmark/json_decode_regex/build_dataset.py (+0, -0)
- benchmark/latency_throughput/README.md (+1, -14)
- benchmark/llm_judge/articles.jsonl (+25, -0)
- benchmark/mmlu/bench_other.py (+3, -0)
- benchmark/multi_turn_chat/README.md (+7, -7)
- benchmark/multi_turn_chat/bench_other.py (+1, -1)
- benchmark/multi_turn_chat/bench_sglang.py (+3, -5)
- benchmark/multi_turn_chat/data_gen.py (+0, -0)
- benchmark/react/README.md (+2, -0)
- benchmark/react/bench_other.py (+3, -0)
- benchmark/react/bench_sglang.py (+2, -1)
- benchmark/react/hotpotqa_100.jsonl (+100, -0)
- benchmark/tree_of_thought_deep/README.md (+3, -1)
- benchmark/tree_of_thought_deep/bench_other.py (+3, -0)
benchmark/dspy/README.md

```diff
@@ -23,7 +23,7 @@ python3 bench_dspy_intro.py --backend sglang
 docker run --name tgi --rm -ti --gpus all --network host \
   -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
-  ghcr.io/huggingface/text-generation-inference:1.1.0 \
+  ghcr.io/huggingface/text-generation-inference:1.3.0 \
   --model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
   --max-input-length 2048 --max-total-tokens 4096 \
   --port 24000
```
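The only change here is bumping the TGI image from 1.1.0 to 1.3.0; the launch flags are unchanged. If you want to confirm the container is serving before running the benchmark, a minimal check along these lines should work (a sketch, assuming TGI's standard `/generate` REST endpoint on port 24000 from the command above):

```python
import requests

# Sanity check that the TGI container launched above is serving.
# /generate with {"inputs", "parameters"} is TGI's REST API shape.
resp = requests.post(
    "http://localhost:24000/generate",
    json={"inputs": "San Francisco is a", "parameters": {"max_new_tokens": 16}},
)
print(resp.json())  # expect a {"generated_text": ...} payload
```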
benchmark/hellaswag/bench_other.py

```diff
@@ -57,6 +57,8 @@ def main(args):
             out = model + context + select(choices, name="answer")
             return choices.index(out["answer"])
 
+        call_select("Hello,", ["world", "earth"])
+
     elif args.backend == "lmql":
         import lmql
         model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
@@ -135,6 +137,6 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shot", type=int, default=20)
     parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
     args = add_common_other_args_and_parse(parser)
     main(args)
```
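This file and several later ones in the commit add a one-off warmup request before the timed run. A minimal sketch of the pattern, with a hypothetical `call_generate` standing in for the backend-specific call: the first request often pays one-time costs (connection setup, kernel compilation, cache initialization) that would otherwise inflate the measured latency.

```python
import time

def call_generate(prompt, temperature, max_tokens):
    """Stand-in for the backend-specific generation call."""
    ...

prompts = ["Question 1 ...", "Question 2 ..."]  # placeholder workload

# Warmup: absorb one-time costs so they do not skew the timed run below.
call_generate("Hello,", temperature=1.0, max_tokens=8)

tic = time.time()
for prompt in prompts:
    call_generate(prompt, temperature=0, max_tokens=64)
latency = time.time() - tic
print(f"Latency: {latency:.3f}")
```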
benchmark/hellaswag/bench_sglang.py

```diff
@@ -91,6 +91,6 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--num-shot", type=int, default=20)
     parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
-    parser.add_argument("--num-questions", type=int, default=100)
+    parser.add_argument("--num-questions", type=int, default=200)
     args = add_common_sglang_args_and_parse(parser)
     main(args)
```
benchmark/json_regex_decode/README.md → benchmark/json_decode_regex/README.md

````diff
@@ -17,14 +17,13 @@ outlines 0.0.22
 ### Benchmark sglang
-Run llama-7b
+Run Llama-7B
 ```
 python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
-Run mixtral-8x7b
-(When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
+Run Mixtral-8x7B
 ```
 python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
@@ -39,7 +38,7 @@ python3 bench_sglang.py --num-questions 10
 ### Benchmark vllm
-Run llama-7b
+Run Llama-7B
 ```
 python3 -m outlines.serve.serve --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
@@ -54,8 +53,8 @@ python3 bench_other.py --backend vllm --num-questions 10
 ### Benchmark guidance
-Run llama-7b and benchmark
+Run Llama-7B and benchmark
 ```
 python3 bench_other.py --backend guidance --num-questions 10 --parallel 1
-```
\ No newline at end of file
+```
````
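For context on what this benchmark exercises: sglang can constrain decoding with a regular expression through the `regex` argument of `sgl.gen`. A minimal sketch, assuming the server launched above on port 30000; the prompt and regex here are illustrative, not the benchmark's actual ones:

```python
import sglang as sgl

@sgl.function
def character_gen(s, name):
    s += name + " is a well-known character. Describe it in JSON:\n"
    # The regex constrains decoding to this exact JSON shape.
    s += sgl.gen(
        "json_output",
        max_tokens=128,
        regex=r'\{\n  "name": "[\w\d\s]+",\n  "occupation": "[\w\d\s]+"\n\}',
    )

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = character_gen.run(name="Harry Potter")
print(state["json_output"])
```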
benchmark/json_regex_decode/bench_other.py → benchmark/json_decode_regex/bench_other.py

```diff
@@ -105,7 +105,7 @@ def main(args):
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
             "backend": args.backend,
             "num_gpus": 1,
            "latency": round(latency, 3),
```
benchmark/json_regex_decode/bench_sglang.py → benchmark/json_decode_regex/bench_sglang.py

```diff
@@ -64,8 +64,6 @@ def main(args):
     # Run requests
     tic = time.time()
     states = json_decode.run_batch(arguments, temperature=0, num_threads=args.parallel)
-    for state in states:
-        state.sync()
     latency = time.time() - tic
 
     # Compute accuracy
@@ -80,7 +78,7 @@ def main(args):
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "json_regex_decode",
+            "task": "json_decode_regex",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),
```
benchmark/json_regex_decode/build_dataset.py → benchmark/json_decode_regex/build_dataset.py (file moved)
benchmark/latency_throughput/README.md

````diff
@@ -3,19 +3,6 @@
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 ```
 
-### Performance
-
-- Model: Llama-2-7b-chat-hf
-- `--num-prompts 2000 --request-rate 200`
-- On 4 A10 (24G) GPUs
-
-| Backend     | Throughput      | Latency  |
-| ----------- | --------------- | -------- |
-| srt         | 5.82 requests/s | 343.54 s |
-| vllm==0.2.6 | 3.93 requests/s | 509.08 s |
-| vllm==0.2.7 | 5.02 requests/s | 398.25 s |
-
 ### SGLang
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
@@ -28,7 +15,7 @@ python3 bench_throughput.py --backend srt --tokenizer meta-llama/Llama-2-7b-chat
 ### vLLM
 ```
-python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16
+python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --swap-space 16 --port 21000
 ```
````
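The second hunk appends `--port 21000` to the vLLM launch, so the server comes up on the port the benchmark client targets. As a quick sanity check, a sketch assuming vLLM's demo `api_server` and its `/generate` endpoint (not the OpenAI-compatible API) are reachable on localhost:

```python
import requests

# Sanity check against the vLLM demo api_server launched above on port 21000.
resp = requests.post(
    "http://localhost:21000/generate",
    json={"prompt": "San Francisco is a", "max_tokens": 16, "temperature": 0},
)
print(resp.json())  # expect a {"text": [...]} payload
```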
benchmark/llm_judge/articles.jsonl (new file, 25 lines; contents not shown)
benchmark/mmlu/bench_other.py

```diff
@@ -95,6 +95,9 @@ def evaluate(args, subject, dev_df, test_df):
                 max_tokens=max_tokens,
                 temperature=0)
             return out["answer"]
 
+        # warmup
+        call_generate("Hello,", temperature=1.0, max_tokens=8)
+
     elif args.backend == "lmql":
         import lmql
         model = lmql.model("meta-llama/Llama-2-7b-chat-hf",
```
benchmark/multi_turns/README.md → benchmark/multi_turn_chat/README.md

````diff
 ### Benchmark sglang
-Run llama-7b
+Run Llama-7B
 ```
 python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
-Run mixtral-8x7b
+Run Mixtral-8x7B
 (When there is a CUDA out-of-memory error, try to reduce the `--mem-fraction-static`)
 ```
@@ -27,13 +27,13 @@ python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf --long
 ### Benchmark vLLM
-Run llama-7b
+Run Llama-7B
 ```
 python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
 ```
-Run mixtral-8x7b
+Run Mixtral-8x7B
 ```
 python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model mistralai/Mixtral-8x7B-Instruct-v0.1 --disable-log-requests --port 21000 --tensor-parallel-size 8
@@ -53,14 +53,14 @@ python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm
 ### Benchmark guidance
-Benchmark llama-7b (short output)
+Benchmark Llama-7B (short output)
 ```
 python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1
 ```
-Benchmark llama-7b (long output)
+Benchmark Llama-7B (long output)
 ```
 python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --long
-```
\ No newline at end of file
+```
````
benchmark/multi_turns/bench_other.py → benchmark/multi_turn_chat/bench_other.py

```diff
@@ -99,7 +99,7 @@ def main(args):
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "multi_turns",
+            "task": "multi_turn_chat",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),
```
benchmark/multi_turns/bench_sglang.py → benchmark/multi_turn_chat/bench_sglang.py

```diff
@@ -21,8 +21,6 @@ def multi_turns(s, qas):
 def main(args):
-    print(args)
-
     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
 
     multi_qas = gen_arguments(args, tokenizer)
@@ -33,8 +31,6 @@ def main(args):
     states = multi_turns.run_batch(multi_qas, temperature=0, backend=backend, num_threads=args.parallel)
-    for state in states:
-        state.sync()
     latency = time.time() - tic
 
     print(f"Latency: {latency:.3f}")
@@ -43,7 +39,7 @@ def main(args):
     with open(args.result_file, "a") as fout:
         value = {
-            "task": "multi_turns",
+            "task": "multi_turn_chat",
             "backend": args.backend,
             "num_gpus": 1,
             "latency": round(latency, 3),
@@ -74,4 +70,6 @@ if __name__ == "__main__":
         args.min_len_a = 256
         args.max_len_a = 512
         args.num_qa = 20
 
+    print(args)
+
     main(args)
```
benchmark/multi_turns/data_gen.py → benchmark/multi_turn_chat/data_gen.py (file moved)
benchmark/react/README.md

````diff
 ## Run benchmark
 
+NOTE: This is an implementation for replaying a given trace for throughput/latency benchmark purposes. It is not an actual ReAct agent implementation.
+
 ### Benchmark sglang
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
````
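To make the NOTE concrete: "replaying a given trace" means the model still generates each Thought/Action turn (so generation cost is measured), but the prompt is then extended with the recorded turn and observation rather than with the model's output or a real tool call. A rough sketch of the idea, with hypothetical names (`call_generate` and the `thought_action`/`observation` fields):

```python
def replay_trace(question, triplets, call_generate):
    """Replay a recorded ReAct trace for benchmarking: time the model's
    Thought/Action generations, but keep the replay on the original trace."""
    prompt = question
    for i, turn in enumerate(triplets, start=1):
        prompt += f"Thought {i}:"
        # Timed generation step; the output is discarded, not fed back.
        _ = call_generate(prompt, max_tokens=200, stop="Observation")
        # Splice in the recorded turn and observation instead of a tool call.
        prompt += turn["thought_action"] + f"\nObservation {i}: " + turn["observation"] + "\n"
    return prompt
```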
benchmark/react/bench_other.py

```diff
@@ -124,6 +124,9 @@ def main(args):
             ))
             return out["result"]
 
+        # warmup
+        call_generate("Hello,", 1.0, 8, ".")
+
     else:
         raise ValueError(f"Invalid backend: {args.backend}")
```
benchmark/react/bench_sglang.py

```diff
@@ -82,9 +82,10 @@ Action 3: Finish[yes]
 """ + question)
     for i in range(1, len(triplets) + 2):
         s += "Thought " + str(i) + ":"
+        # NOTE: This is an implementation for replaying a given trace for benchmark purposes. It is not an actual ReAct agent implementation.
         ss = s.fork(1)
         ss[0] += sgl.gen(name="thought_action", max_tokens=200, stop="Observation")
-        # ss.join()
+        ss.join()
         # to verify the correctness of output, this should be collected
         # print(ss[0]["thought_action"])
         if i > len(triplets):
```
benchmark/react/hotpotqa_100.jsonl (new file, 100 lines; too large to display)
benchmark/tree_of_thought_deep/README.md

````diff
@@ -5,13 +5,15 @@ wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_sch
 ## Run benchmark
 
+NOTE: This is an implementation for throughput/latency benchmark purposes. The prompts are not tuned to achieve good accuracy on the GSM-8K tasks.
 
 ### Benchmark sglang
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
 ```
 
 ```
-python3 bench_sglang.py --num-questions 32 --parallel 8
+python3 bench_sglang.py --num-questions 32
+python3 bench_sglang.py --num-questions 16 --parallel 1
 ```
````
benchmark/tree_of_thought_deep/bench_other.py

```diff
@@ -141,6 +141,9 @@ def main(args):
             rets.append(out["answer"])
             return rets
 
+        # warmup
+        call_generate("Hello,", 1.0, 8, ".", 1)
+
     # Run requests
     states = [None] * len(questions)
 
     def get_one_answer(i):
```
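The `--parallel` flag used across these scripts maps to a thread pool. A small sketch of the pattern visible in this hunk (a preallocated `states` list filled by a per-question worker); `questions` and `call_generate` here are placeholders:

```python
from concurrent.futures import ThreadPoolExecutor

questions = ["Q1 ...", "Q2 ..."]  # placeholder workload
parallel = 8                      # corresponds to --parallel

def call_generate(q):
    """Stand-in for the backend call."""
    ...

# Preallocate one slot per question, then fill slots from worker threads;
# list-index assignment keeps results ordered without a lock.
states = [None] * len(questions)

def get_one_answer(i):
    states[i] = call_generate(questions[i])

with ThreadPoolExecutor(max_workers=parallel) as executor:
    executor.map(get_one_answer, range(len(questions)))
```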