change / sglang / Commits / 65c65776

Commit 65c65776 (unverified), authored Jul 13, 2024 by Lianmin Zheng, committed via GitHub on Jul 13, 2024
Parent: 66581596

Improve benchmark scripts & fix llava (#613)

Showing 4 changed files with 43 additions and 22 deletions (+43, -22):

- benchmark/latency_throughput/README.md (+10, -4)
- benchmark/latency_throughput/bench_one.py (+20, -13)
- python/sglang/README.md (+12, -0)
- python/sglang/srt/managers/controller/model_runner.py (+1, -5)
benchmark/latency_throughput/README.md (+10, -4)

````diff
@@ -30,7 +30,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
 #### Run ShareGPT
 ```
-python3 bench_throughput.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
+python3 bench_serving.py --backend srt --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
 ```

 ## Other baselines
@@ -42,14 +42,20 @@ python3 -m vllm.entrypoints.api_server --model meta-llama/Llama-2-7b-chat-hf --t
 ```
 # run synthetic
-python3 bench_throughput.py --backend vllm --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256
+python3 bench_serving.py --backend vllm --port 30000 --tokenizer meta-llama/Llama-2-7b-chat-hf --num-prompt 1000 --request-rate 100 --input-len 1024 --output-len 256
 ```

 ```
 # run ShareGPT
-python3 bench_throughput.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
+python3 bench_serving.py --backend vllm --port 21000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
 ```

+```
+# run one batch
+python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B --tensor 8 --disable-log-requests --max-num-seqs 1024 --quantization fp8
+python3 bench_one.py --input-len 1024 --batch-size 1 1 2 4 8 16 32 64 128 256 512 768 1024 --port 8000 --backend vllm
+```
+
 ### LightLLM
 ```
@@ -57,5 +63,5 @@ python -m lightllm.server.api_server --model_dir ~/model_weights/Llama-2-7b-chat
 ```

 ```
-python3 bench_throughput.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
+python3 bench_serving.py --backend lightllm --port 22000 --tokenizer meta-llama/Llama-2-7b-chat-hf --dataset ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 10 --request-rate 10
 ```
\ No newline at end of file
````
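For context on the `--request-rate` flag used in the commands above: serving benchmarks of this kind commonly pace requests as Poisson arrivals rather than sending them back to back. The sketch below is a simplified illustration of that pattern, not the actual `bench_serving.py` implementation; the `send_one` callback and prompt list are placeholders.

```python
# Simplified sketch of Poisson-paced load generation (not the real bench_serving.py).
# `send_one` is a placeholder coroutine that issues a single request to the server.
import asyncio

import numpy as np


async def send_all(prompts, request_rate: float, send_one):
    tasks = []
    for prompt in prompts:
        tasks.append(asyncio.create_task(send_one(prompt)))
        if request_rate != float("inf"):
            # Exponential inter-arrival times give a Poisson arrival process
            # with an average of `request_rate` requests per second.
            await asyncio.sleep(np.random.exponential(1.0 / request_rate))
    await asyncio.gather(*tasks)
```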
benchmark/latency_throughput/bench_one.py (+20, -13)

```diff
@@ -15,19 +15,19 @@ def run_one_batch_size(bs):
     url = f"{args.host}:{args.port}"
     max_new_tokens = args.max_tokens
 
-    a = 20
-    prompt = f"{a,}"
+    if args.input_len:
+        input_ids = [
+            [int(x) for x in np.random.randint(0, high=16384, size=(args.input_len,))]
+            for _ in range(bs)
+        ]
+    else:
+        text = [f"{i,}" for i in range(bs)]
 
     tic = time.time()
     if args.backend == "srt":
         if args.input_len:
-            inputs = {"input_ids": [[int(x) for x in np.random.randint(0, high=16384, size=(args.input_len,))] for _ in range(bs)]}
+            inputs = {"input_ids": input_ids}
         else:
-            inputs = {"text": [f"{i,}" for i in range(bs)]}
+            inputs = {"text": text}
 
         response = requests.post(
             url + "/generate",
@@ -44,7 +44,7 @@ def run_one_batch_size(bs):
         response = requests.post(
             url + "/generate",
             json={
-                "inputs": prompt,
+                "inputs": text[0],
                 "parameters": {
                     "temperature": 0,
                     "max_new_tokens": max_new_tokens,
@@ -53,13 +53,19 @@ def run_one_batch_size(bs):
             },
         )
     elif args.backend == "vllm":
+        if args.input_len:
+            inputs = {"prompt": input_ids}
+        else:
+            inputs = {"prompt": text}
+
         response = requests.post(
-            url + "/generate",
+            url + "/v1/completions",
             json={
-                "prompt": prompt,
+                "model": args.vllm_model_name,
                 "temperature": 0,
                 "max_tokens": max_new_tokens,
                 "ignore_eos": True,
+                **inputs,
             },
         )
     elif args.backend == "ginfer":
@@ -71,7 +77,7 @@ def run_one_batch_size(bs):
         tic = time.time()
         sample_request = sampler_pb2.SampleTextRequest(
-            prompt=prompt,
+            prompt=text[0],
             settings=sampler_pb2.SampleSettings(
                 max_len=max_new_tokens,
                 rng_seed=0,
@@ -92,7 +98,7 @@ def run_one_batch_size(bs):
     output_throughput = bs * max_new_tokens / latency
     print(f"latency: {latency:.2f} s, speed: {output_throughput:.2f} token/s")
 
-    with open("tmp_output.txt", "a") as fout:
+    with open("results.jsonl", "a") as fout:
         res = {
             "input_len": args.input_len,
             "output_len": args.max_tokens,
@@ -111,6 +117,7 @@ if __name__ == "__main__":
     parser.add_argument("--input-len", type=int, default=None)
    parser.add_argument("--batch-size", type=int, nargs='*', default=[1])
     parser.add_argument("--max-tokens", type=int, default=256)
+    parser.add_argument("--vllm-model-name", type=str, default="meta-llama/Meta-Llama-3-70B")
     args = parser.parse_args()
 
     if args.port is None:
```
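To make the new vLLM code path easier to follow outside the diff, here is a condensed, self-contained sketch of what `bench_one.py` now does for `--backend vllm`: build one batch of short prompts, post it to the OpenAI-compatible `/v1/completions` endpoint with the model name from `--vllm-model-name`, then report latency and output throughput. Treat it as an illustration of the change, not a drop-in replacement for the script; the function name and example URL below are made up.

```python
# Condensed illustration of the updated vLLM branch in bench_one.py.
import time

import requests


def bench_one_vllm(url: str, model: str, batch_size: int, max_new_tokens: int) -> None:
    # One short, distinct prompt per request in the batch.
    inputs = {"prompt": [f"{i,}" for i in range(batch_size)]}

    tic = time.time()
    response = requests.post(
        url + "/v1/completions",
        json={
            "model": model,
            "temperature": 0,
            "max_tokens": max_new_tokens,
            "ignore_eos": True,
            **inputs,
        },
    )
    latency = time.time() - tic
    response.raise_for_status()

    output_throughput = batch_size * max_new_tokens / latency
    print(f"latency: {latency:.2f} s, speed: {output_throughput:.2f} token/s")


# Example, assuming a vLLM OpenAI-compatible server is already running on port 8000:
# bench_one_vllm("http://127.0.0.1:8000", "meta-llama/Meta-Llama-3-70B", 16, 256)
```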
python/sglang/README.md (new file, +12, -0)

# Code Structure

- `backend`: Various backends for the language interpreter.
- `lang`: The frontend language.
- `srt`: The runtime for running local models.
- `test`: Test utilities.
- `api.py`: Public API.
- `bench_latency.py`: Benchmark utilities.
- `global_config.py`: The global configs and constants.
- `launch_server.py`: The entry point for launching the local server.
- `utils.py`: Common utilities.
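As a rough illustration of how these pieces fit together (the `lang` frontend driving an `srt` server started via `launch_server.py`, through the public API in `api.py`), a minimal usage sketch follows. It assumes a local server is already running on port 30000 and that this sglang version exposes the frontend API shown; adjust names if your version differs.

```python
# Minimal sketch of the frontend language talking to a local srt server.
# Assumes a server was started separately (e.g. via launch_server.py) on port 30000.
import sglang as sgl


@sgl.function
def qa(s, question):
    # The frontend (`lang`) builds the prompt; `sgl.gen` asks the backend to generate.
    s += "Q: " + question + "\n"
    s += "A: " + sgl.gen("answer", max_tokens=64)


if __name__ == "__main__":
    # `RuntimeEndpoint` is one of the backends under `backend`.
    sgl.set_default_backend(sgl.RuntimeEndpoint("http://127.0.0.1:30000"))
    state = qa.run(question="What is the capital of France?")
    print(state["answer"])
```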
python/sglang/srt/managers/controller/model_runner.py (+1, -5)

```diff
@@ -276,17 +276,13 @@ class ModelRunner:
         input_metadata = InputMetadata.create(
             self,
             forward_mode=ForwardMode.EXTEND,
             tp_size=self.tp_size,
             req_pool_indices=batch.req_pool_indices,
             seq_lens=batch.seq_lens,
             prefix_lens=batch.prefix_lens,
             position_ids_offsets=batch.position_ids_offsets,
             out_cache_loc=batch.out_cache_loc,
-            top_logprobs_nums=batch.top_logprobs_nums,
-            return_logprob=batch.return_logprob,
-            flashinfer_prefill_wrapper_ragged=self.flashinfer_prefill_wrapper_ragged,
-            flashinfer_prefill_wrapper_paged=self.flashinfer_prefill_wrapper_paged,
-            flashinfer_decode_wrapper=self.flashinfer_decode_wrapper,
+            top_logprobs_nums=batch.top_logprobs_nums,
         )
         return self.model.forward(
             batch.input_ids,
```
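Why this is the llava fix, under the assumption suggested by the diff: the multi-modal extend path was still passing keyword arguments (`return_logprob` and the flashinfer wrappers) that `InputMetadata.create` apparently no longer accepts after refactoring, and an unexpected keyword argument fails immediately at call time. The toy example below is not sglang code; `Metadata.create` is a made-up stand-in that only illustrates the general Python behavior.

```python
# Toy illustration only; `Metadata.create` is a made-up stand-in, not sglang's InputMetadata.
class Metadata:
    @classmethod
    def create(cls, runner, *, forward_mode, out_cache_loc, top_logprobs_nums):
        return cls()


# Matches the refactored signature: fine.
Metadata.create(None, forward_mode="EXTEND", out_cache_loc=[], top_logprobs_nums=[0])

# Passing a keyword the factory no longer takes raises a TypeError
# ("unexpected keyword argument"), which is how a stale call site breaks.
try:
    Metadata.create(
        None,
        forward_mode="EXTEND",
        out_cache_loc=[],
        top_logprobs_nums=[0],
        flashinfer_decode_wrapper=None,
    )
except TypeError as e:
    print(e)
```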