jerrrrry / easystart · Commits

Commit 3904b5c4, authored May 30, 2025 by jerrrrry
Delete benchmark_throughput_0.6.2.py
Parent: ef115ee8

Changes: 1 changed file with 0 additions and 731 deletions.

benchmark_throughput_0.6.2.py  deleted  100644 → 0  (+0 −731)
"""Benchmark offline inference throughput."""
import
argparse
import
json
import
random
import
time
from
typing
import
List
,
Optional
,
Tuple
import
numpy
as
np
import
torch
import
uvloop
from
tqdm
import
tqdm
from
transformers
import
(
AutoModelForCausalLM
,
AutoTokenizer
,
PreTrainedTokenizerBase
)
from
vllm.inputs
import
PromptInputs
from
vllm.engine.arg_utils
import
DEVICE_OPTIONS
,
AsyncEngineArgs
,
EngineArgs
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
)
from
vllm.model_executor.layers.quantization
import
QUANTIZATION_METHODS
from
vllm.utils
import
FlexibleArgumentParser
,
merge_async_iterators
def sample_requests(
    dataset_path: str,
    num_requests: int,
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

    # Load the dataset.
    with open(dataset_path) as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Only keep the first two turns of each conversation.
    dataset = [(data["conversations"][0]["value"],
                data["conversations"][1]["value"]) for data in dataset]

    # Shuffle the dataset.
    random.shuffle(dataset)

    # Filter out sequences that are too long or too short
    filtered_dataset: List[Tuple[str, int, int]] = []
    for i in range(len(dataset)):
        if len(filtered_dataset) == num_requests:
            break

        # Tokenize the prompts and completions.
        prompt = dataset[i][0]
        prompt_token_ids = tokenizer(prompt).input_ids
        completion = dataset[i][1]
        completion_token_ids = tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append((prompt, prompt_len, output_len))

    return filtered_dataset
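
# A minimal sketch of driving sample_requests on its own, assuming a
# ShareGPT-style JSON file at "sharegpt.json" and the facebook/opt-125m
# tokenizer (both are assumptions for illustration, not part of this script's
# normal flow, which goes through main() and --dataset):
#
#     tok = AutoTokenizer.from_pretrained("facebook/opt-125m")
#     reqs = sample_requests("sharegpt.json", num_requests=100,
#                            tokenizer=tok, fixed_output_len=None)
#     # Each entry is a (prompt, prompt_len, output_len) tuple.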

def run_vllm(
    warmup_requests: List[Tuple[str, int, int]],
    requests_json: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
    quantization: Optional[str],
    tensor_parallel_size: int,
    seed: int,
    n: int,
    use_beam_search: bool,
    trust_remote_code: bool,
    dtype: str,
    max_model_len: Optional[int],
    enforce_eager: bool,
    kv_cache_dtype: str,
    quantization_param_path: Optional[str],
    device: str,
    enable_prefix_caching: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    num_scheduler_steps: int = 1,
    use_v2_block_manager: bool = False,
    download_dir: Optional[str] = None,
    load_format: str = EngineArgs.load_format,
    disable_async_output_proc: bool = False,
    use_new_beam_search_impl: bool = False,
) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(
        model=model,
        tokenizer=tokenizer,
        quantization=quantization,
        tensor_parallel_size=tensor_parallel_size,
        seed=seed,
        trust_remote_code=trust_remote_code,
        dtype=dtype,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        enforce_eager=enforce_eager,
        kv_cache_dtype=kv_cache_dtype,
        quantization_param_path=quantization_param_path,
        device=device,
        enable_prefix_caching=enable_prefix_caching,
        download_dir=download_dir,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
        distributed_executor_backend=distributed_executor_backend,
        load_format=load_format,
        num_scheduler_steps=num_scheduler_steps,
        use_v2_block_manager=use_v2_block_manager,
        disable_async_output_proc=disable_async_output_proc,
    )
    # warmup
    warmup_prompts = []
    warmup_sampling_params = []
    for prompt, _, output_len in warmup_requests:
        warmup_prompts.append(prompt)
        warmup_sampling_params.append(
            SamplingParams(
                n=n,
                temperature=0.0 if use_beam_search else 1.0,
                top_p=1.0,
                use_beam_search=use_beam_search,
                ignore_eos=True,
                max_tokens=output_len,
            ))
    print("Warming up...")
    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
        llm.generate(warmup_prompts, warmup_sampling_params, use_tqdm=True)

    info_json = {}
    for ELEprompt in args.num_prompts:
        for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
            info = {}
            requests = requests_json["{}_{}_{}".format(ELEprompt, ELEinput,
                                                       ELEoutput)]
            # Add the requests to the engine.
            prompts: List[str] = []
            sampling_params: List[SamplingParams] = []
            for prompt, _, output_len in requests:
                prompts.append(prompt)
                sampling_params.append(
                    SamplingParams(
                        n=n,
                        temperature=0.0 if use_beam_search else 1.0,
                        top_p=1.0,
                        use_beam_search=use_beam_search,
                        ignore_eos=True,
                        max_tokens=output_len,
                    ))

            if not use_new_beam_search_impl:
                start = time.perf_counter()
                real_output = llm.generate(prompts, sampling_params,
                                           use_tqdm=True)
                end = time.perf_counter()
            else:
                assert use_beam_search
                prompts = [prompt for prompt, _, _ in requests]
                # output_len should be the same for all requests.
                output_len = requests[0][2]
                for prompt, input_len, _output_len in requests:
                    assert _output_len == output_len
                start = time.perf_counter()
                llm.beam_search(prompts,
                                beam_width=n,
                                max_tokens=output_len,
                                ignore_eos=True)
                end = time.perf_counter()

            total_ttfts = []
            total_tpops = []
            total_output_token_throughput = []
            total_inout_token_throughput = []
            for output in real_output:
                ttft_ = (output.metrics.first_token_time -
                         output.metrics.arrival_time)
                tpop_ = (output.metrics.finished_time -
                         output.metrics.arrival_time - ttft_) / (ELEoutput - 1)
                output_token_throughput = (ELEoutput) / (
                    output.metrics.finished_time - output.metrics.arrival_time)
                inout_token_throughput = (ELEoutput + ELEinput) / (
                    output.metrics.finished_time - output.metrics.arrival_time)
                total_ttfts.append(ttft_)
                total_tpops.append(tpop_)
                total_output_token_throughput.append(output_token_throughput)
                total_inout_token_throughput.append(inout_token_throughput)
            # total_num_tokens = sum(request.prompt_len + request.expected_output_len
            #                        for request in requests)
            # total_output_tokens = sum(request.expected_output_len
            #                           for request in requests)
            total_num_tokens = sum(prompt_len + output_len
                                   for _, prompt_len, output_len in requests)
            total_output_tokens = sum(output_len
                                      for _, prompt_len, output_len in requests)

            info["elapsed_time"] = np.around(end - start, 2)
            info["Throughput"] = np.around(
                len(requests) / info['elapsed_time'], 2)
            info["total_tokens"] = np.around(
                total_num_tokens / info['elapsed_time'], 2)
            info["output_tokens"] = np.around(
                total_output_tokens / info['elapsed_time'], 2)
            info["ttft_mean"] = np.around(np.mean(total_ttfts), 5)
            info["ttft_median"] = np.around(np.median(total_ttfts or 0), 5)
            info["ttft_p99"] = np.around(np.percentile(total_ttfts or 0, 99), 5)
            info["tpop_mean"] = np.around(np.mean(total_tpops), 4)
            info["tpop_median"] = np.around(np.median(total_tpops or 0), 5)
            info["tpop_p99"] = np.around(np.percentile(total_tpops or 0, 99), 5)
            info["output_token_throughput_mean"] = np.around(
                np.mean(total_output_token_throughput), 2)
            info["output_token_throughput_median"] = np.around(
                np.median(total_output_token_throughput or 0), 2)
            info["output_token_throughput_p99"] = np.around(
                np.percentile(total_output_token_throughput or 0, 99), 2)
            info["inout_token_throughput_mean"] = np.around(
                np.mean(total_inout_token_throughput), 2)
            info["inout_token_throughput_median"] = np.around(
                np.median(total_inout_token_throughput or 0), 2)
            info["inout_token_throughput_p99"] = np.around(
                np.percentile(total_inout_token_throughput or 0, 99), 2)
            info_json["{}_{}_{}".format(ELEprompt, ELEinput, ELEoutput)] = info
            print("prompt: {}, input: {}, output: {}".format(
                ELEprompt, ELEinput, ELEoutput))
            print(f"Latency: {info['elapsed_time']:.2f} s")
            print(f"Throughput: "
                  f"{len(requests) / info['elapsed_time']:.2f} requests/s, "
                  f"{total_num_tokens / info['elapsed_time']:.2f} total tokens/s, "
                  f"{total_output_tokens / info['elapsed_time']:.2f} output tokens/s")
            print("==============================================")
            print(f"total_out_tokens: {total_output_tokens:.2f} tokens")
            print(f"elapsed_time: {info['elapsed_time']:.2f} s")  # total elapsed time
            print(f"TTFT_mean: {info['ttft_mean']:.5f} s")  # time-to-first-token latency
            print(f"ttft_p99: {info['ttft_p99']:.5f} s")
            print(f"ttft_median: {info['ttft_median']:.5f} s")
            print(f"TPOP_mean: {info['tpop_mean']:.5f} s")  # per-token decode time
            print(f"tpop_median: {info['tpop_median']:.5f} s")
            print(f"tpop_p99: {info['tpop_p99']:.5f} s")
            print(f"output_token_throughput_mean: "
                  f"{info['output_token_throughput_mean']:.2f} tokens/s")  # per-request generation throughput
            print(f"output_token_throughput_median: "
                  f"{info['output_token_throughput_median']:.2f} tokens/s")
            print(f"output_token_throughput_p99: "
                  f"{info['output_token_throughput_p99']:.2f} tokens/s")
            print(f"inout_token_throughput_mean: "
                  f"{info['inout_token_throughput_mean']:.2f} tokens/s")  # per-request total (input + output) throughput
            print(f"inout_token_throughput_median: "
                  f"{info['inout_token_throughput_median']:.2f} tokens/s")
            print(f"inout_token_throughput_p99: "
                  f"{info['inout_token_throughput_p99']:.2f} tokens/s")
            print("==============================================")
            print("\n")

    return info_json
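
# The returned info_json maps "{num_prompts}_{input_len}_{output_len}" keys to
# per-configuration metric dicts, and main() flattens it into a CSV-like file.
# A small sketch of loading that file for analysis, assuming pandas is
# installed and the results were written via --output-json results.csv (both
# assumptions):
#
#     import pandas as pd
#     df = pd.read_csv("results.csv")
#     print(df[["bs_in_out", "ttft_mean", "tpop_mean",
#               "output_token_throughput_mean"]])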

async def run_vllm_async(
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: str,
    quantization: Optional[str],
    tensor_parallel_size: int,
    seed: int,
    n: int,
    use_beam_search: bool,
    trust_remote_code: bool,
    dtype: str,
    max_model_len: Optional[int],
    enforce_eager: bool,
    kv_cache_dtype: str,
    quantization_param_path: Optional[str],
    device: str,
    enable_prefix_caching: bool,
    enable_chunked_prefill: bool,
    max_num_batched_tokens: int,
    distributed_executor_backend: Optional[str],
    gpu_memory_utilization: float = 0.9,
    num_scheduler_steps: int = 1,
    use_v2_block_manager: bool = False,
    download_dir: Optional[str] = None,
    load_format: str = EngineArgs.load_format,
    disable_async_output_proc: bool = False,
    disable_frontend_multiprocessing: bool = False,
) -> float:
    from vllm import SamplingParams
    engine_args = AsyncEngineArgs(
        model=model,
        tokenizer=tokenizer,
        quantization=quantization,
        tensor_parallel_size=tensor_parallel_size,
        seed=seed,
        trust_remote_code=trust_remote_code,
        dtype=dtype,
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        enforce_eager=enforce_eager,
        kv_cache_dtype=kv_cache_dtype,
        quantization_param_path=quantization_param_path,
        device=device,
        enable_prefix_caching=enable_prefix_caching,
        download_dir=download_dir,
        enable_chunked_prefill=enable_chunked_prefill,
        max_num_batched_tokens=max_num_batched_tokens,
        distributed_executor_backend=distributed_executor_backend,
        load_format=load_format,
        num_scheduler_steps=num_scheduler_steps,
        use_v2_block_manager=use_v2_block_manager,
        disable_async_output_proc=disable_async_output_proc,
        worker_use_ray=False,
        disable_log_requests=True,
    )

    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:

        # Add the requests to the engine.
        prompts: List[str] = []
        sampling_params: List[SamplingParams] = []
        for prompt, _, output_len in requests:
            prompts.append(prompt)
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=0.0 if use_beam_search else 1.0,
                    top_p=1.0,
                    use_beam_search=use_beam_search,
                    ignore_eos=True,
                    max_tokens=output_len,
                ))

        generators = []
        start = time.perf_counter()
        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
            generator = llm.generate(prompt, sp, request_id=f"test{i}")
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
            pass
        end = time.perf_counter()
        return end - start

def run_hf(
    requests: List[Tuple[str, int, int]],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    use_beam_search: bool,
    max_batch_size: int,
    trust_remote_code: bool,
) -> float:
    assert not use_beam_search
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: List[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt, prompt_len, output_len = requests[i]
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            _, next_prompt_len, next_output_len = requests[i + 1]
            if (max(max_prompt_len, next_prompt_len) +
                    max(max_output_len, next_output_len)) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt",
                              padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=not use_beam_search,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        # Include the decoding time.
        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start

def run_mii(
    requests: List[Tuple[str, int, int]],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
) -> float:
    from mii import client, serve
    llm = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = [prompt for prompt, _, _ in requests]

    start = time.perf_counter()
    llm.generate(prompts, max_new_tokens=output_len)
    end = time.perf_counter()
    client = client(model)
    client.terminate_server()
    return end - start

def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    warmup_prompt = "hi" * 10
    warmup_requests = [(warmup_prompt, 10, 10) for _ in range(1)]
    if args.dataset is None:
        requests_json = {}
        for ELEprompt in args.num_prompts:
            for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
                # Synthesize a prompt with the given input length.
                prompt = "hi" * (ELEinput - 1)
                requests = [(prompt, ELEinput, ELEoutput)
                            for _ in range(ELEprompt)]
                print("type(requests):", type(requests))
                requests_json["{}_{}_{}".format(ELEprompt, ELEinput,
                                                ELEoutput)] = requests
    else:
        requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
                                   args.output_len)

    if args.backend == "vllm":
        if args.async_engine:
            run_args = [
                requests, args.model, args.tokenizer, args.quantization,
                args.tensor_parallel_size, args.seed, args.n,
                args.use_beam_search, args.trust_remote_code, args.dtype,
                args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
                args.quantization_param_path, args.device,
                args.enable_prefix_caching, args.enable_chunked_prefill,
                args.max_num_batched_tokens, args.distributed_executor_backend,
                args.gpu_memory_utilization, args.num_scheduler_steps,
                args.use_v2_block_manager, args.download_dir, args.load_format,
                args.disable_async_output_proc
            ]
        else:
            run_args = [
                warmup_requests, requests_json, args.model, args.tokenizer,
                args.quantization, args.tensor_parallel_size, args.seed,
                args.n, args.use_beam_search, args.trust_remote_code,
                args.dtype, args.max_model_len, args.enforce_eager,
                args.kv_cache_dtype, args.quantization_param_path, args.device,
                args.enable_prefix_caching, args.enable_chunked_prefill,
                args.max_num_batched_tokens, args.distributed_executor_backend,
                args.gpu_memory_utilization, args.num_scheduler_steps,
                args.use_v2_block_manager, args.download_dir, args.load_format,
                args.disable_async_output_proc
            ]

        if args.async_engine:
            run_args.append(args.disable_frontend_multiprocessing)
            elapsed_time = uvloop.run(run_vllm_async(*run_args))
        else:
            info_json = run_vllm(*run_args, args.use_new_beam_search_impl)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                              args.use_beam_search, args.hf_max_batch_size,
                              args.trust_remote_code)
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model,
                               args.tensor_parallel_size, args.output_len)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    # total_num_tokens = sum(prompt_len + output_len
    #                        for _, prompt_len, output_len in requests)
    # if args.dataset is None:
    #     total_out_tokens = args.output_len * args.num_prompts
    # else:
    #     total_out_tokens = sum(output_len for _, _, output_len in requests)
    # print(f"Latency: {elapsed_time:.2f} s")
    # print(f"All Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
    #       f"{total_num_tokens / elapsed_time:.2f} tokens/s")
    # print(f"Generate Throughput: {total_out_tokens / elapsed_time:.2f} tokens/s")

    with open(args.output_json, "w") as f:
        title = "bs_in_out"
        data_keys = info_json[list(info_json.keys())[0]].keys()
        keys_string = ','.join(data_keys)
        title = title + "," + keys_string
        f.write(title)
        f.write("\n")
        for key, value in info_json.items():
            values_as_strings = [
                str(value) for value in info_json[key].values()
            ]
            values_string = ','.join(values_as_strings)
            key = key + "," + values_string
            f.writelines(key)
            f.write("\n")

    # Output JSON results if specified
    # if args.output_json:
    #     results = {
    #         "elapsed_time": elapsed_time,
    #         "num_requests": len(requests),
    #         "total_num_tokens": total_num_tokens,
    #         "requests_per_second": len(requests) / elapsed_time,
    #         "tokens_per_second": total_num_tokens / elapsed_time,
    #     }
    #     with open(args.output_json, "w") as f:
    #         json.dump(results, f, indent=4)

if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf", "mii"],
                        default="vllm")
    parser.add_argument("--dataset",
                        type=str,
                        default=None,
                        help="Path to the dataset.")
    parser.add_argument("--input-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Input prompt length for each request")
    parser.add_argument("--output-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument("--model", type=str, default="facebook/opt-125m")
    parser.add_argument("--tokenizer", type=str, default=None)
    parser.add_argument('--quantization',
                        '-q',
                        choices=[*QUANTIZATION_METHODS, None],
                        default=None)
    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--use-beam-search", action="store_true")
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=1,
                        help='Number of iterations to run for warmup.')
    parser.add_argument("--use-new-beam-search-impl", action="store_true")
    parser.add_argument("--num-prompts",
                        nargs="*",
                        type=int,
                        default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--hf-max-batch-size",
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
    parser.add_argument(
        '--max-model-len',
        type=int,
        default=None,
        help='Maximum length of a sequence (including prompt and output). '
        'If None, will be derived from the model.')
    parser.add_argument(
        '--dtype',
        type=str,
        default='auto',
        choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
        help='data type for model weights and activations. '
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
    parser.add_argument('--gpu-memory-utilization',
                        type=float,
                        default=0.9,
                        help='the fraction of GPU memory to be used for '
                        'the model executor, which can range from 0 to 1. '
                        'If unspecified, will use the default value of 0.9.')
    parser.add_argument("--enforce-eager",
                        action="store_true",
                        help="enforce eager execution")
    parser.add_argument(
        '--kv-cache-dtype',
        type=str,
        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
        default="auto",
        help='Data type for kv cache storage. If "auto", will use model '
        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
        'ROCm (hcu) supports fp8 (=fp8_e4m3)')
    parser.add_argument(
        '--quantization-param-path',
        type=str,
        default=None,
        help='Path to the JSON file containing the KV cache scaling factors. '
        'This should generally be supplied, when KV cache dtype is FP8. '
        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
        'cuda version greater than 11.8. On ROCm (hcu), FP8_E4M3 is '
        'instead supported for common inference criteria.')
    parser.add_argument("--device",
                        type=str,
                        default="auto",
                        choices=DEVICE_OPTIONS,
                        help='device type for vLLM execution')
    parser.add_argument(
        "--num-scheduler-steps",
        type=int,
        default=1,
        help="Maximum number of forward steps per scheduler call.")
    parser.add_argument("--use-v2-block-manager",
                        action='store_true',
                        help="Enable block manager v2.")
    parser.add_argument(
        "--enable-prefix-caching",
        action='store_true',
        help="Enable automatic prefix caching for vLLM backend.")
    parser.add_argument("--enable-chunked-prefill",
                        action='store_true',
                        help="enable chunked prefill for vLLM backend.")
    parser.add_argument('--max-num-batched-tokens',
                        type=int,
                        default=None,
                        help='maximum number of batched tokens per '
                        'iteration')
    parser.add_argument('--download-dir',
                        type=str,
                        default=None,
                        help='directory to download and load the weights, '
                        'default to the default cache dir of huggingface')
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
    parser.add_argument(
        '--distributed-executor-backend',
        choices=['ray', 'mp'],
        default=None,
        help='Backend to use for distributed serving. When more than 1 GPU '
        'is used, will be automatically set to "ray" if installed '
        'or "mp" (multiprocessing) otherwise.')
    parser.add_argument(
        '--load-format',
        type=str,
        default=EngineArgs.load_format,
        choices=[
            'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
            'bitsandbytes'
        ],
        help='The format of the model weights to load.\n\n'
        '* "auto" will try to load the weights in the safetensors format '
        'and fall back to the pytorch bin format if safetensors format '
        'is not available.\n'
        '* "pt" will load the weights in the pytorch bin format.\n'
        '* "safetensors" will load the weights in the safetensors format.\n'
        '* "npcache" will load the weights in pytorch format and store '
        'a numpy cache to speed up the loading.\n'
        '* "dummy" will initialize the weights with random values, '
        'which is mainly for profiling.\n'
        '* "tensorizer" will load the weights using tensorizer from '
        'CoreWeave. See the Tensorize vLLM Model script in the Examples '
        'section for more information.\n'
        '* "bitsandbytes" will load the weights using bitsandbytes '
        'quantization.\n')
    parser.add_argument(
        "--disable-async-output-proc",
        action='store_true',
        default=False,
        help="Disable async output processor for vLLM backend.")
    parser.add_argument("--async-engine",
                        action='store_true',
                        default=False,
                        help="Use vLLM async engine rather than LLM class.")
    parser.add_argument("--disable-frontend-multiprocessing",
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
    if args.dataset is None:
        assert args.input_len is not None
        assert args.output_len is not None
    else:
        assert args.input_len is None

    if args.backend == "vllm":
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
    elif args.backend == "hf":
        if args.hf_max_batch_size is None:
            raise ValueError("HF max batch size is required for HF backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
    elif args.backend == "mii":
        if args.dtype != "auto":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
        if args.use_beam_search:
            raise ValueError("Beam search is not supported for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
        if args.tokenizer != args.model:
            raise ValueError("Tokenizer must be the same as the model for MII "
                             "backend.")
    main(args)
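
# A minimal example invocation, assuming the script is saved under its original
# name and a vLLM installation is available (the model name, prompt counts, and
# lengths below are arbitrary illustrations; --output-json is required because
# main() always writes the per-configuration CSV):
#
#     python benchmark_throughput_0.6.2.py --backend vllm \
#         --model facebook/opt-125m \
#         --num-prompts 1 8 --input-len 128 1024 --output-len 128 128 \
#         --output-json results.csv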