Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
vllm_test_tools
Commits
1decb2f9
Commit
1decb2f9
authored
May 29, 2025
by
jerrrrry
Browse files
Upload New File
parent
691362e0
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
679 additions
and
0 deletions
+679
-0
085-offline/benchmark_throughput_0.8.5.py
085-offline/benchmark_throughput_0.8.5.py
+679
-0
No files found.
085-offline/benchmark_throughput_0.8.5.py
0 → 100644
View file @
1decb2f9
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline inference throughput."""
import argparse
import dataclasses
import json
import random
import time
from functools import cache
from typing import Dict, List, Optional, Tuple

import numpy as np
import torch
import uvloop
from PIL import Image
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          PreTrainedTokenizerBase)

from benchmark_dataset import (AIMODataset, BurstGPTDataset,
                               ConversationDataset, InstructCoderDataset,
                               RandomDataset, SampleRequest, ShareGPTDataset,
                               SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
from vllm.inputs import PromptType, TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
from vllm.utils import FlexibleArgumentParser, merge_async_iterators


# NOTE: this definition rebinds the ``SampleRequest`` name that the import
# block above also pulls in from ``benchmark_dataset``.
@dataclasses.dataclass
class SampleRequest:
    """One benchmark request: a prompt plus the lengths used for accounting.

    Attributes:
        prompt: Text prompt sent to the model.
        prompt_len: Prompt length, in tokens.
        expected_output_len: Number of output tokens expected for the request.
        multi_modal_data: Optional multi-modal payload (e.g. images) that
            accompanies the prompt.
        lora_request: Optional LoRARequest naming the LoRA adapter to use.
    """

    # Required fields (positional order matters to callers).
    prompt: str
    prompt_len: int
    expected_output_len: int
    # Optional extras; both default to "not used".
    multi_modal_data: Optional[MultiModalDataDict] = None
    lora_request: Optional[LoRARequest] = None
def
_get_prompt_for_image_model
(
question
:
str
,
*
,
model
:
str
)
->
str
:
"""Prepend and append special tokens around the question to form a prompt.
Args:
question: The input question text to wrap with special tokens
model: The name of the model being used, to determine which special
tokens to add
Returns:
The formatted prompt string with appropriate special tokens for the
model
Raises:
ValueError: If an unsupported model name is provided
"""
model
=
model
.
lower
()
if
"pixtral"
in
model
:
return
f
"<s>[INST]
{
question
}
\n
[IMG][/INST]"
raise
ValueError
(
f
"Unsupported model
{
model
}
"
)
@cache
def lora_path_on_disk(lora_path: str) -> str:
    """Resolve ``lora_path`` through ``get_adapter_absolute_path``.

    ``functools.cache`` memoizes the result, so each distinct ``lora_path``
    is resolved only once per process.
    """
    resolved_path = get_adapter_absolute_path(lora_path)
    return resolved_path
# Tokenizers already loaded for a LoRA adapter, keyed by the adapter's int id.
lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}


def get_random_lora_request(
        args: argparse.Namespace
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
    """Build a LoRARequest for a randomly chosen adapter id.

    Args:
        args: Parsed CLI namespace; ``max_loras`` bounds the random id and
            ``lora_path`` locates the adapter.

    Returns:
        A ``(lora_request, tokenizer)`` pair. The tokenizer is whatever
        ``get_lora_tokenizer`` returned for that adapter id the first time it
        was requested (cached in ``lora_tokenizer_cache``).
    """
    adapter_id = random.randint(1, args.max_loras)
    request = LoRARequest(lora_name=str(adapter_id),
                          lora_int_id=adapter_id,
                          lora_path=lora_path_on_disk(args.lora_path))
    try:
        adapter_tokenizer = lora_tokenizer_cache[adapter_id]
    except KeyError:
        # First time this adapter id is drawn: load and remember its tokenizer.
        adapter_tokenizer = get_lora_tokenizer(request)
        lora_tokenizer_cache[adapter_id] = adapter_tokenizer
    return request, adapter_tokenizer
def _single_cli_value(value, flag: str):
    """Collapse a possibly list-valued CLI argument to a single scalar.

    ``--num-prompts`` and ``--output-len`` are declared with ``nargs="*"``,
    so they arrive as lists when given on the command line but as a bare
    scalar (or ``None``) when the default is used.

    Raises:
        ValueError: If more than one value was supplied.
    """
    if not isinstance(value, (list, tuple)):
        return value
    if not value:
        return None
    if len(value) > 1:
        raise ValueError(
            f"{flag} accepts a single value when sampling from a dataset, "
            f"got {list(value)}")
    return value[0]


def sample_requests(tokenizer: PreTrainedTokenizerBase,
                    args: argparse.Namespace) -> List[SampleRequest]:
    """Sample benchmark requests from a conversation-style JSON dataset.

    Reads ``args.dataset``, keeps conversations with at least two turns,
    shuffles them, and converts the first two turns of each into a
    ``SampleRequest`` until ``args.num_prompts`` requests were collected.
    Requests that are too short (< 4 prompt or output tokens) or too long
    (> 1024 prompt tokens or > 2048 prompt + output tokens) are skipped, as
    are entries whose image file is missing.

    Args:
        tokenizer: Base tokenizer used to measure prompt/completion lengths
            (replaced per request by the LoRA tokenizer when one exists).
        args: Parsed CLI namespace; reads ``dataset``, ``num_prompts``,
            ``output_len``, ``model`` and ``enable_lora``.

    Returns:
        The list of accepted requests (at most ``num_prompts`` of them).

    Raises:
        ValueError: If the fixed output length is below 4, or if
            ``--num-prompts`` / ``--output-len`` carry several values.
    """
    dataset_path: str = args.dataset
    # These two flags may be lists (nargs="*"); comparing a list with an int
    # below would either raise TypeError or never match.
    num_requests: Optional[int] = _single_cli_value(args.num_prompts,
                                                    "--num-prompts")
    fixed_output_len: Optional[int] = _single_cli_value(
        args.output_len, "--output-len")
    model: str = args.model
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")

    # Load the dataset.
    with open(dataset_path, encoding="utf-8") as f:
        dataset = json.load(f)
    # Filter out the conversations with less than 2 turns.
    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
    # Shuffle the dataset.
    random.shuffle(dataset)

    # Filter out sequences that are too long or too short
    filtered_dataset: List[SampleRequest] = []
    # Let tqdm take the total from ``dataset``; the previous
    # ``total=len(filtered_dataset)`` was always 0 at this point.
    for data in tqdm(dataset, desc="sampling requests"):
        if len(filtered_dataset) == num_requests:
            break

        # Only keep the first two turns of each conversation.
        prompt = data["conversations"][0]["value"]
        completion = data["conversations"][1]["value"]

        multi_modal_data: Optional[MultiModalDataDict] = None
        if "image" in data:
            multi_modal_data = multi_modal_data or {}
            image_path = data["image"]
            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
            assert isinstance(image_path,
                              str), "Only support single image input"
            try:
                multi_modal_data["image"] = Image.open(image_path).convert(
                    "RGB")
            except FileNotFoundError:
                # Ignore datapoint where asset is missing
                continue
            prompt = _get_prompt_for_image_model(question=prompt, model=model)

        request_tokenizer = tokenizer
        lora_request: Optional[LoRARequest] = None
        if args.enable_lora:
            lora_request, lora_tokenizer = get_random_lora_request(args)
            if lora_tokenizer:
                request_tokenizer = lora_tokenizer

        # Tokenize the prompts and completions.
        prompt_token_ids = request_tokenizer(prompt).input_ids
        completion_token_ids = request_tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
        if prompt_len < 4 or output_len < 4:
            # Prune too short sequences.
            continue
        if prompt_len > 1024 or prompt_len + output_len > 2048:
            # Prune too long sequences.
            continue
        filtered_dataset.append(
            SampleRequest(prompt=prompt,
                          prompt_len=prompt_len,
                          expected_output_len=output_len,
                          multi_modal_data=multi_modal_data,
                          lora_request=lora_request))

    return filtered_dataset
def run_vllm(
    requests_json: Dict[str, List[SampleRequest]],
    n: int,
    num_iters_warmup: int,
    engine_args: EngineArgs,
) -> Dict[str, Dict[str, float]]:
    """Benchmark the synchronous vLLM ``LLM`` engine over several workloads.

    Each entry of ``requests_json`` is one workload, keyed by
    ``"<num_prompts>_<input_len>_<output_len>"``. After a short warmup, every
    workload is run through ``llm.generate`` once; wall-clock throughput and
    per-request latency statistics (TTFT, time per output token, per-request
    token throughput) are printed and collected.

    Args:
        requests_json: Mapping from workload key to the requests of that
            workload. Keys must have the ``"<int>_<int>_<int>"`` form, from
            which the input/output lengths used in the metrics are read.
        n: Number of sequences sampled per prompt.
        num_iters_warmup: Number of warmup ``generate`` calls on a dummy
            prompt before measuring.
        engine_args: Engine configuration forwarded to ``LLM``.

    Returns:
        Mapping from workload key to a dict of rounded metrics. The insertion
        order of the metric keys is the column order used by the caller.
    """
    from vllm import LLM, SamplingParams
    llm = LLM(**dataclasses.asdict(engine_args))

    # Warmup: use the ``n`` parameter rather than reaching for the
    # module-global ``args`` namespace.
    warmup_sampling_params = SamplingParams(
        n=n,
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=10,
    )
    dummy_prompt_token_ids = np.random.randint(10000, size=(1, 10))
    dummy_prompts = [{
        "prompt_token_ids": batch
    } for batch in dummy_prompt_token_ids.tolist()]
    print("Warming up...")
    for _ in tqdm(range(num_iters_warmup), desc="Warmup iterations"):
        llm.generate(dummy_prompts,
                     sampling_params=warmup_sampling_params,
                     use_tqdm=False)

    info_json: Dict[str, Dict[str, float]] = {}
    # Iterate the workloads that were actually built instead of re-deriving
    # them from global CLI state; the key encodes the three sizes.
    for workload_key, requests in requests_json.items():
        num_prompts, input_len, output_len = (int(part)
                                              for part in
                                              workload_key.split("_"))
        info: Dict[str, float] = {}

        # Add the requests to the engine.
        prompts: List[TextPrompt] = []
        sampling_params: List[SamplingParams] = []
        for request in requests:
            prompts.append(
                TextPrompt(prompt=request.prompt,
                           multi_modal_data=request.multi_modal_data))
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                ))
        lora_requests: Optional[List[LoRARequest]] = None
        if engine_args.enable_lora:
            lora_requests = [request.lora_request for request in requests]

        # Only plain sampling is benchmarked; the former beam-search branch
        # was unreachable (hard-coded off) and has been removed.
        start = time.perf_counter()
        real_output = llm.generate(prompts,
                                   sampling_params,
                                   lora_request=lora_requests,
                                   use_tqdm=True)
        end = time.perf_counter()

        # The first token is attributed to TTFT, so the decode phase covers
        # output_len - 1 tokens; clamp to 1 to avoid dividing by zero when a
        # single output token is requested.
        decode_steps = max(output_len - 1, 1)

        total_ttfts = []
        total_tpops = []
        total_output_token_throughput = []
        total_inout_token_throughput = []
        # NOTE(review): assumes every output carries a populated ``metrics``
        # object; confirm for the engine version/configuration in use.
        for output in real_output:
            metrics = output.metrics
            latency = metrics.finished_time - metrics.arrival_time
            ttft_ = metrics.first_token_time - metrics.arrival_time
            tpop_ = (latency - ttft_) / decode_steps
            total_ttfts.append(ttft_)
            total_tpops.append(tpop_)
            total_output_token_throughput.append(output_len / latency)
            total_inout_token_throughput.append(
                (output_len + input_len) / latency)

        total_num_tokens = sum(request.prompt_len +
                               request.expected_output_len
                               for request in requests)
        total_output_tokens = sum(request.expected_output_len
                                  for request in requests)

        # Key insertion order below defines the output column order.
        info["elapsed_time"] = np.around(end - start, 2)
        info["Throughput"] = np.around(
            len(requests) / info['elapsed_time'], 2)
        info["total_tokens"] = np.around(
            total_num_tokens / info['elapsed_time'], 2)
        info["output_tokens"] = np.around(
            total_output_tokens / info['elapsed_time'], 2)

        info["ttft_mean"] = np.around(np.mean(total_ttfts), 5)
        info["ttft_median"] = np.around(np.median(total_ttfts or 0), 5)
        info["ttft_p99"] = np.around(np.percentile(total_ttfts or 0, 99), 5)

        info["tpop_mean"] = np.around(np.mean(total_tpops), 4)
        info["tpop_median"] = np.around(np.median(total_tpops or 0), 5)
        info["tpop_p99"] = np.around(np.percentile(total_tpops or 0, 99), 5)

        info["output_token_throughput_mean"] = np.around(
            np.mean(total_output_token_throughput), 2)
        info["output_token_throughput_median"] = np.around(
            np.median(total_output_token_throughput or 0), 2)
        info["output_token_throughput_p99"] = np.around(
            np.percentile(total_output_token_throughput or 0, 99), 2)

        info["inout_token_throughput_mean"] = np.around(
            np.mean(total_inout_token_throughput), 2)
        info["inout_token_throughput_median"] = np.around(
            np.median(total_inout_token_throughput or 0), 2)
        info["inout_token_throughput_p99"] = np.around(
            np.percentile(total_inout_token_throughput or 0, 99), 2)

        info_json[workload_key] = info

        print("promt:{},input:{},output:{}".format(num_prompts, input_len,
                                                   output_len))
        print(f"Latency: {info['elapsed_time']:.2f} s")
        print(f"Throughput: {len(requests) / info['elapsed_time']:.2f} requests/s, "
              f"{total_num_tokens / info['elapsed_time']:.2f} total tokens/s, "
              f"{total_output_tokens / info['elapsed_time']:.2f} output tokens/s")
        print("==============================================")
        print(f"total_out_tokens: {total_output_tokens: .2f} tokens")
        # Total elapsed time.
        print(f"elapsed_time: {info['elapsed_time']: .2f} s")
        # Time to first token.
        print(f"TTFT_mean: {info['ttft_mean']: .5f} s")
        print(f"ttft_p99: {info['ttft_p99']: .5f} s")
        print(f"ttft_median: {info['ttft_median']: .5f} s")
        # Decode time per output token.
        print(f"TPOP_mean: {info['tpop_mean']: .5f} s")
        print(f"tpop_median: {info['tpop_median']: .5f} s")
        print(f"tpop_p99: {info['tpop_p99']: .5f} s")
        # Per-request generation throughput.
        print(f"output_token_throughput_mean: {info['output_token_throughput_mean']:.2f} tokens/s")
        print(f"output_token_throughput_median: {info['output_token_throughput_median']:.2f} tokens/s")
        print(f"output_token_throughput_p99: {info['output_token_throughput_p99']:.2f} tokens/s")
        # Per-request total (input + output) throughput.
        print(f"inout_token_throughput_mean: {info['inout_token_throughput_mean']:.2f} tokens/s")
        print(f"tinout_token_throughput_median: {info['inout_token_throughput_median']:.2f} tokens/s")
        print(f"inout_token_throughput_p99: {info['inout_token_throughput_p99']:.2f} tokens/s")
        print("==============================================")
        print("\n")

    return info_json
async def run_vllm_async(
    requests: List[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
) -> float:
    """Run all requests through the async vLLM engine and time them.

    Args:
        requests: Requests to submit.
        n: Number of sequences sampled per prompt.
        engine_args: Async engine configuration.
        disable_frontend_multiprocessing: Forwarded to the engine client
            builder.

    Returns:
        Seconds elapsed between submitting the first request and draining
        the last result.
    """
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:
        # Prepare the per-request inputs before the clock starts.
        prompts: List[TextPrompt] = [
            TextPrompt(prompt=req.prompt,
                       multi_modal_data=req.multi_modal_data)
            for req in requests
        ]
        sampling_params: List[SamplingParams] = [
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,
                max_tokens=req.expected_output_len,
            ) for req in requests
        ]
        lora_requests: List[Optional[LoRARequest]] = [
            req.lora_request for req in requests
        ]

        start = time.perf_counter()
        generators = [
            llm.generate(prompt,
                         params,
                         lora_request=lora,
                         request_id=f"test{index}")
            for index, (prompt, params, lora) in enumerate(
                zip(prompts, sampling_params, lora_requests))
        ]
        # Drain every stream; the results themselves are not needed.
        async for _index, _result in merge_async_iterators(*generators):
            pass
        elapsed = time.perf_counter() - start
        return elapsed
def run_hf(
    requests: List[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
) -> float:
    """Benchmark generation with the Hugging Face ``transformers`` backend.

    Requests are greedily grouped into batches of at most ``max_batch_size``
    prompts, as long as the longest prompt plus the longest output in the
    batch stays within 2048 tokens. Each batch is generated and decoded on
    the GPU.

    Args:
        requests: Requests to run, in order.
        model: Model name or path passed to ``from_pretrained``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        n: Number of sequences returned per prompt.
        max_batch_size: Upper bound on prompts per batch.
        trust_remote_code: Forwarded to ``from_pretrained``.

    Returns:
        Elapsed wall-clock seconds for generating and decoding all batches.
    """
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: List[str] = []
    max_prompt_len = 0
    max_output_len = 0
    last_index = len(requests) - 1
    for i, request in enumerate(requests):
        # SampleRequest is a dataclass, not a tuple: read fields by name
        # (tuple-unpacking it raises TypeError).
        batch.append(request.prompt)
        max_prompt_len = max(max_prompt_len, request.prompt_len)
        max_output_len = max(max_output_len, request.expected_output_len)
        if len(batch) < max_batch_size and i != last_index:
            # Check if we can add more requests to the batch.
            upcoming = requests[i + 1]
            if (max(max_prompt_len, upcoming.prompt_len) +
                    max(max_output_len,
                        upcoming.expected_output_len)) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences.
        input_ids = tokenizer(batch, return_tensors="pt",
                              padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        # Include the decoding time.
        tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    pbar.close()
    return end - start
def run_mii(
    requests: List[SampleRequest],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
) -> float:
    """Time one batched ``generate`` call against a DeepSpeed-MII server.

    Args:
        requests: Requests whose prompts are sent in a single call.
        model: Model name used both to start and to later address the server.
        tensor_parallel_size: Tensor-parallel degree passed to ``serve``.
        output_len: Value passed as ``max_new_tokens``.

    Returns:
        Elapsed seconds of the ``generate`` call (server shutdown excluded).
    """
    from mii import client as make_client
    from mii import serve

    server = serve(model, tensor_parallel=tensor_parallel_size)
    prompts = []
    for request in requests:
        prompts.append(request.prompt)

    started_at = time.perf_counter()
    server.generate(prompts, max_new_tokens=output_len)
    finished_at = time.perf_counter()

    # Shut the server down only after the measurement has been taken.
    make_client(model).terminate_server()
    return finished_at - started_at
def _synthesize_requests(
    tokenizer: PreTrainedTokenizerBase,
    args: argparse.Namespace,
) -> Tuple[Dict[str, List[SampleRequest]], List[SampleRequest]]:
    """Build random-token workloads for every requested size combination.

    One workload is created per ``(num_prompts, input_len, output_len)``
    combination, where input and output lengths are paired positionally.

    Returns:
        ``(requests_json, last_requests)``: the workloads keyed by
        ``"<num_prompts>_<input_len>_<output_len>"`` and the request list
        built last (empty if no combination was requested).
    """
    vocab_size = tokenizer.vocab_size
    # ``--num-prompts`` falls back to a bare int when it is not given on the
    # command line; iterate it as a one-element list in that case.
    prompt_counts = args.num_prompts
    if isinstance(prompt_counts, int):
        prompt_counts = [prompt_counts]

    requests_json: Dict[str, List[SampleRequest]] = {}
    requests: List[SampleRequest] = []
    for ELEprompt in prompt_counts:
        for ELEinput, ELEoutput in zip(args.input_len, args.output_len):
            requests = []
            for _ in range(ELEprompt):
                request_tokenizer = tokenizer
                lora_request: Optional[LoRARequest] = None
                if args.enable_lora:
                    lora_request, lora_tokenizer = get_random_lora_request(
                        args)
                    if lora_tokenizer:
                        request_tokenizer = lora_tokenizer

                # Synthesize a prompt with the given input length.
                candidate_ids = [
                    random.randint(0, vocab_size - 1)
                    for _ in range(ELEinput)
                ]
                # As tokenizer may add additional tokens like BOS, we need to
                # try different lengths to get the desired input length.
                for _ in range(5):  # Max attempts to correct
                    candidate_prompt = request_tokenizer.decode(candidate_ids)
                    tokenized_len = len(
                        request_tokenizer.encode(candidate_prompt))

                    if tokenized_len == ELEinput:
                        break

                    # Adjust length based on difference
                    diff = ELEinput - tokenized_len
                    if diff > 0:
                        candidate_ids.extend([
                            random.randint(100, vocab_size - 100)
                            for _ in range(diff)
                        ])
                    else:
                        candidate_ids = candidate_ids[:diff]
                requests.append(
                    SampleRequest(prompt=candidate_prompt,
                                  prompt_len=ELEinput,
                                  expected_output_len=ELEoutput,
                                  lora_request=lora_request))
            requests_json["{}_{}_{}".format(ELEprompt, ELEinput,
                                            ELEoutput)] = requests
    return requests_json, requests


def _write_results_table(path: str, info_json: Dict[str, Dict]) -> None:
    """Write the per-workload metrics as comma-separated rows to ``path``.

    The header is ``bs_in_out`` followed by the metric names of the first
    workload; each following row is the workload key and its metric values.
    """
    first_row = next(iter(info_json.values()))
    lines = ["bs_in_out" + "," + ",".join(first_row.keys())]
    for key, row in info_json.items():
        lines.append(key + "," + ",".join(str(value)
                                          for value in row.values()))
    with open(path, "w") as f:
        f.write("\n".join(lines) + "\n")


def main(args: argparse.Namespace):
    """Build the workload, run the selected backend and report the results.

    With no ``--dataset`` the prompts are synthesized from random tokens for
    every requested size combination; otherwise they are sampled from the
    dataset. The synchronous vLLM backend reports per-workload metrics and,
    when ``--output-json`` is given, writes them as comma-separated rows to
    that path. The other backends report overall throughput on stdout.

    Raises:
        ValueError: For an unknown backend, for a dataset run on the
            synchronous vLLM path (which needs fixed-size workloads), or for
            the HF backend with tensor parallelism.
    """
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    requests_json: Optional[Dict[str, List[SampleRequest]]] = None
    if args.dataset is None:
        requests_json, requests = _synthesize_requests(tokenizer, args)
    else:
        requests = sample_requests(tokenizer, args)

    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)

    # Exactly one of these is filled in, depending on the backend.
    info_json: Optional[Dict[str, Dict]] = None
    elapsed_time: Optional[float] = None
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
                run_vllm_async(
                    requests,
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
                ))
        else:
            if requests_json is None:
                # run_vllm needs workloads keyed by fixed sizes, which only
                # the synthetic path builds; previously this was a NameError.
                raise ValueError(
                    "The synchronous vLLM path only supports synthetic "
                    "prompts; omit --dataset and pass --input-len/"
                    "--output-len, or use --async-engine.")
            info_json = run_vllm(requests_json, args.n, args.num_iters_warmup,
                                 EngineArgs.from_cli_args(args))
    elif args.backend == "hf":
        if args.tensor_parallel_size != 1:
            raise ValueError(
                "The HF backend requires tensor_parallel_size == 1.")
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                              args.hf_max_batch_size, args.trust_remote_code)
    elif args.backend == "mii":
        # NOTE(review): ``args.output_len`` is declared with nargs="*" and may
        # be a list here; confirm what the MII backend should receive.
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")

    if is_multi_modal:
        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
              "following metrics are not accurate because image tokens are not"
              " counted. See vllm-project/vllm/issues/9778 for details.")

    if elapsed_time is not None:
        # Backends that only return a wall-clock time: report it on stdout.
        total_num_tokens = sum(request.prompt_len +
                               request.expected_output_len
                               for request in requests)
        print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
              f"{total_num_tokens / elapsed_time:.2f} tokens/s")

    if args.output_json:
        if info_json:
            _write_results_table(args.output_json, info_json)
        else:
            print("No per-workload metrics were collected; "
                  f"{args.output_json} was not written.")
# Script entry point: declare the CLI, validate the argument combination for
# the selected backend, then run the benchmark.
if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
                        choices=["vllm", "hf", "mii"],
                        default="vllm")
    parser.add_argument("--dataset",
                        type=str,
                        default=None,
                        help="Path to the dataset. The dataset is expected to "
                        "be a json in form of List[Dict[..., conversations: "
                        "List[Dict[..., value: <prompt_or_response>]]]]")
    parser.add_argument("--input-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Input prompt length for each request")
    parser.add_argument("--output-len",
                        type=int,
                        nargs="*",
                        default=None,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument('--num-iters-warmup',
                        type=int,
                        default=1,
                        help='Number of iterations to run for warmup.')
    parser.add_argument("--num-prompts",
                        type=int,
                        nargs="*",
                        default=1000,
                        help="Number of prompts to process.")
    parser.add_argument("--hf-max-batch-size",
                        type=int,
                        default=None,
                        help="Maximum batch size for HF backend.")
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
    parser.add_argument("--async-engine",
                        action='store_true',
                        default=False,
                        help="Use vLLM async engine rather than LLM class.")
    parser.add_argument("--disable-frontend-multiprocessing",
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
    # LoRA
    parser.add_argument(
        "--lora-path",
        type=str,
        default=None,
        help="Path to the lora adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.")

    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model

    # Explicit errors instead of ``assert`` so the checks survive ``python -O``.
    if args.dataset is None:
        if args.input_len is None:
            raise ValueError("--input-len is required without --dataset.")
        if args.output_len is None:
            raise ValueError("--output-len is required without --dataset.")
        # With nargs="*", an omitted --num-prompts yields the bare int
        # default; the synthetic path iterates over this value.
        if isinstance(args.num_prompts, int):
            args.num_prompts = [args.num_prompts]
    elif args.input_len is not None:
        raise ValueError("--input-len cannot be combined with --dataset.")
    if args.enable_lora and args.lora_path is None:
        raise ValueError("--lora-path is required with --enable-lora.")

    if args.backend == "vllm":
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
    elif args.backend == "hf":
        if args.hf_max_batch_size is None:
            raise ValueError("HF max batch size is required for HF backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        # ``enable_lora`` is a store_true flag (False when absent), so test
        # its truthiness; ``is not None`` rejected every HF run.
        if args.enable_lora:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    elif args.backend == "mii":
        if args.dtype != "auto":
            raise ValueError("dtype must be auto for MII backend.")
        if args.n != 1:
            raise ValueError("n must be 1 for MII backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
        if args.tokenizer != args.model:
            raise ValueError("Tokenizer must be the same as the model for MII "
                             "backend.")
        # Same store_true reasoning as for the HF backend above.
        if args.enable_lora:
            raise ValueError("LoRA benchmarking is only supported for vLLM"
                             " backend")
    main(args)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment