Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
31436e8b
Unverified
Commit
31436e8b
authored
Aug 19, 2025
by
hustxiayang
Committed by
GitHub
Aug 19, 2025
Browse files
[Misc] Add request_id into benchmark_serve.py (#23065)
Signed-off-by:
yangxia
<
yangxiast@gmail.com
>
parent
4efd43e9
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
243 additions
and
46 deletions
+243
-46
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+21
-2
benchmarks/benchmark_dataset.py
benchmarks/benchmark_dataset.py
+90
-19
benchmarks/benchmark_serving.py
benchmarks/benchmark_serving.py
+21
-2
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+92
-21
vllm/benchmarks/lib/endpoint_request_func.py
vllm/benchmarks/lib/endpoint_request_func.py
+7
-0
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+12
-2
No files found.
benchmarks/backend_request_func.py
View file @
31436e8b
...
...
@@ -34,6 +34,7 @@ class RequestFuncInput:
multi_modal_content
:
Optional
[
dict
|
list
[
dict
]]
=
None
ignore_eos
:
bool
=
False
language
:
Optional
[
str
]
=
None
request_id
:
Optional
[
str
]
=
None
@
dataclass
...
...
@@ -71,6 +72,9 @@ async def async_request_tgi(
"inputs"
:
request_func_input
.
prompt
,
"parameters"
:
params
,
}
headers
=
None
if
request_func_input
.
request_id
:
headers
=
{
"x-request-id"
:
request_func_input
.
request_id
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
if
request_func_input
.
ignore_eos
:
...
...
@@ -82,7 +86,9 @@ async def async_request_tgi(
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
)
as
response
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk_bytes
in
response
.
content
:
chunk_bytes
=
chunk_bytes
.
strip
()
...
...
@@ -145,6 +151,9 @@ async def async_request_trt_llm(
}
if
request_func_input
.
ignore_eos
:
payload
[
"min_length"
]
=
request_func_input
.
output_len
headers
=
None
if
request_func_input
.
request_id
:
headers
=
{
"x-request-id"
:
request_func_input
.
request_id
}
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
...
...
@@ -152,7 +161,9 @@ async def async_request_trt_llm(
st
=
time
.
perf_counter
()
most_recent_timestamp
=
st
try
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
)
as
response
:
async
with
session
.
post
(
url
=
api_url
,
json
=
payload
,
headers
=
headers
)
as
response
:
if
response
.
status
==
200
:
async
for
chunk_bytes
in
response
.
content
:
chunk_bytes
=
chunk_bytes
.
strip
()
...
...
@@ -211,6 +222,8 @@ async def async_request_deepspeed_mii(
"top_p"
:
1.0
,
}
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
...
...
@@ -283,6 +296,8 @@ async def async_request_openai_completions(
if
request_func_input
.
extra_body
:
payload
.
update
(
request_func_input
.
extra_body
)
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
...
...
@@ -395,6 +410,8 @@ async def async_request_openai_chat_completions(
"Content-Type"
:
"application/json"
,
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
...
...
@@ -491,6 +508,8 @@ async def async_request_openai_audio(
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
# Send audio file
def
to_bytes
(
y
,
sr
):
...
...
benchmarks/benchmark_dataset.py
View file @
31436e8b
...
...
@@ -19,6 +19,7 @@ import logging
import
random
from
abc
import
ABC
,
abstractmethod
from
collections.abc
import
Mapping
from
copy
import
deepcopy
from
dataclasses
import
dataclass
from
functools
import
cache
from
io
import
BytesIO
...
...
@@ -54,6 +55,7 @@ class SampleRequest:
expected_output_len
:
int
multi_modal_data
:
Optional
[
Union
[
MultiModalDataDict
,
dict
,
list
[
dict
]]]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
request_id
:
Optional
[
str
]
=
None
# -----------------------------------------------------------------------------
...
...
@@ -155,7 +157,10 @@ class BenchmarkDataset(ABC):
@
abstractmethod
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
)
->
list
[
SampleRequest
]:
"""
Abstract method to generate sample requests from the dataset.
...
...
@@ -167,6 +172,7 @@ class BenchmarkDataset(ABC):
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
for processing the dataset's text.
num_requests (int): The number of sample requests to generate.
request_id_prefix (str) The prefix of request_id.
Returns:
list[SampleRequest]: A list of sample requests generated from the
...
...
@@ -175,7 +181,10 @@ class BenchmarkDataset(ABC):
raise
NotImplementedError
(
"sample must be implemented in subclasses."
)
def
maybe_oversample_requests
(
self
,
requests
:
list
[
SampleRequest
],
num_requests
:
int
self
,
requests
:
list
[
SampleRequest
],
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
)
->
None
:
"""
Oversamples the list of requests if its size is less than the desired
...
...
@@ -183,11 +192,18 @@ class BenchmarkDataset(ABC):
Args:
requests (List[SampleRequest]): The current list of sampled
requests. num_requests (int): The target number of requests.
requests.
num_requests (int): The target number of requests.
request_id_prefix (str) The prefix of the request ids.
"""
if
len
(
requests
)
<
num_requests
:
random
.
seed
(
self
.
random_seed
)
additional
=
random
.
choices
(
requests
,
k
=
num_requests
-
len
(
requests
))
additional
=
deepcopy
(
random
.
choices
(
requests
,
k
=
num_requests
-
len
(
requests
))
)
for
i
in
range
(
len
(
additional
)):
req
=
additional
[
i
]
req
.
request_id
=
request_id_prefix
+
str
(
len
(
requests
)
+
i
)
requests
.
extend
(
additional
)
logger
.
info
(
"Oversampled requests to reach %d total samples."
,
num_requests
)
...
...
@@ -303,6 +319,7 @@ class RandomDataset(BenchmarkDataset):
range_ratio
:
float
=
DEFAULT_RANGE_RATIO
,
input_len
:
int
=
DEFAULT_INPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
# Enforce range_ratio < 1
...
...
@@ -363,8 +380,10 @@ class RandomDataset(BenchmarkDataset):
prompt
=
prompt
,
prompt_len
=
total_input_len
,
expected_output_len
=
int
(
output_lens
[
i
]),
request_id
=
request_id_prefix
+
str
(
i
),
)
)
return
requests
...
...
@@ -406,9 +425,11 @@ class ShareGPTDataset(BenchmarkDataset):
max_loras
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
samples
:
list
=
[]
ind
=
0
for
entry
in
self
.
data
:
if
len
(
samples
)
>=
num_requests
:
break
...
...
@@ -444,9 +465,11 @@ class ShareGPTDataset(BenchmarkDataset):
expected_output_len
=
new_output_len
,
lora_request
=
lora_request
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
ind
),
)
)
self
.
maybe_oversample_requests
(
samples
,
num_requests
)
ind
+=
1
self
.
maybe_oversample_requests
(
samples
,
num_requests
,
request_id_prefix
)
return
samples
...
...
@@ -512,10 +535,11 @@ class CustomDataset(BenchmarkDataset):
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
skip_chat_template
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
prompt
=
item
[
"prompt"
]
...
...
@@ -534,9 +558,12 @@ class CustomDataset(BenchmarkDataset):
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
i
),
)
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
return
sampled_requests
...
...
@@ -578,6 +605,7 @@ class SonnetDataset(BenchmarkDataset):
input_len
:
int
=
DEFAULT_INPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
return_prompt_formatted
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
# Calculate average token length for a poem line.
...
...
@@ -603,6 +631,7 @@ class SonnetDataset(BenchmarkDataset):
prefix_lines
=
self
.
data
[:
num_prefix_lines
]
samples
=
[]
ind
=
0
while
len
(
samples
)
<
num_requests
:
extra_lines
=
random
.
choices
(
self
.
data
,
k
=
num_input_lines
-
num_prefix_lines
...
...
@@ -613,14 +642,17 @@ class SonnetDataset(BenchmarkDataset):
msg
,
add_generation_prompt
=
True
,
tokenize
=
False
)
prompt_len
=
len
(
tokenizer
(
prompt_formatted
).
input_ids
)
if
prompt_len
<=
input_len
:
samples
.
append
(
SampleRequest
(
prompt
=
prompt_formatted
if
return_prompt_formatted
else
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
ind
),
)
)
ind
+=
1
return
samples
...
...
@@ -672,6 +704,7 @@ class BurstGPTDataset(BenchmarkDataset):
num_requests
:
int
,
max_loras
:
Optional
[
int
]
=
None
,
lora_path
:
Optional
[
str
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
samples
=
[]
...
...
@@ -693,6 +726,7 @@ class BurstGPTDataset(BenchmarkDataset):
prompt_len
=
input_len
,
expected_output_len
=
output_len
,
lora_request
=
lora_req
,
request_id
=
request_id_prefix
+
str
(
i
),
)
)
return
samples
...
...
@@ -752,12 +786,14 @@ class ConversationDataset(HuggingFaceDataset):
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
# Filter examples with at least 2 conversations
filtered_data
=
self
.
data
.
filter
(
lambda
x
:
len
(
x
[
"conversations"
])
>=
2
)
sampled_requests
=
[]
dynamic_output
=
output_len
is
None
ind
=
0
for
item
in
filtered_data
:
if
len
(
sampled_requests
)
>=
num_requests
:
...
...
@@ -785,9 +821,13 @@ class ConversationDataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
ind
),
)
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
ind
+=
1
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -814,11 +854,12 @@ class VisionArenaDataset(HuggingFaceDataset):
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
output_len
=
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
parser_fn
=
self
.
SUPPORTED_DATASET_PATHS
.
get
(
self
.
dataset_path
)
...
...
@@ -838,9 +879,12 @@ class VisionArenaDataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
i
),
)
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
return
sampled_requests
...
...
@@ -870,11 +914,12 @@ class InstructCoderDataset(HuggingFaceDataset):
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
output_len
=
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
prompt
=
f
"
{
item
[
'input'
]
}
\n\n
{
item
[
'instruction'
]
}
Just output
\
...
...
@@ -892,9 +937,12 @@ class InstructCoderDataset(HuggingFaceDataset):
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
i
),
)
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
return
sampled_requests
...
...
@@ -924,12 +972,13 @@ class MTBenchDataset(HuggingFaceDataset):
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
output_len
=
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
prompt
=
item
[
"turns"
][
0
]
...
...
@@ -947,9 +996,12 @@ class MTBenchDataset(HuggingFaceDataset):
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
i
),
)
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -974,10 +1026,12 @@ class AIMODataset(HuggingFaceDataset):
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
sampled_requests
=
[]
dynamic_output
=
output_len
is
None
ind
=
0
for
item
in
self
.
data
:
if
len
(
sampled_requests
)
>=
num_requests
:
...
...
@@ -1000,9 +1054,13 @@ class AIMODataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
None
,
request_id
=
request_id_prefix
+
str
(
ind
),
)
)
ind
+=
1
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
return
sampled_requests
...
...
@@ -1072,12 +1130,18 @@ class NextEditPredictionDataset(HuggingFaceDataset):
"zed-industries/zeta"
:
_format_zeta_prompt
,
}
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
**
kwargs
):
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
):
formatting_prompt_func
=
self
.
MAPPING_PROMPT_FUNCS
.
get
(
self
.
dataset_path
)
if
formatting_prompt_func
is
None
:
raise
ValueError
(
f
"Unsupported dataset path:
{
self
.
dataset_path
}
"
)
samples
=
[]
for
sample
in
self
.
data
:
for
i
,
sample
in
enumerate
(
self
.
data
)
:
sample
=
formatting_prompt_func
(
sample
)
samples
.
append
(
SampleRequest
(
...
...
@@ -1086,11 +1150,12 @@ class NextEditPredictionDataset(HuggingFaceDataset):
expected_output_len
=
len
(
tokenizer
(
sample
[
"expected_output"
]).
input_ids
),
request_id
=
request_id_prefix
+
str
(
i
),
)
)
if
len
(
samples
)
>=
num_requests
:
break
self
.
maybe_oversample_requests
(
samples
,
num_requests
)
self
.
maybe_oversample_requests
(
samples
,
num_requests
,
request_id_prefix
)
return
samples
...
...
@@ -1139,6 +1204,7 @@ class ASRDataset(HuggingFaceDataset):
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
import
librosa
...
...
@@ -1148,6 +1214,7 @@ class ASRDataset(HuggingFaceDataset):
prompt_len
=
len
(
tokenizer
(
prompt
).
input_ids
)
sampled_requests
=
[]
skipped
=
0
ind
=
0
for
item
in
self
.
data
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
...
...
@@ -1166,8 +1233,10 @@ class ASRDataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
ind
),
)
)
ind
+=
1
if
skipped
:
logger
.
warning
(
"%d samples discarded from dataset due to"
...
...
@@ -1175,5 +1244,7 @@ class ASRDataset(HuggingFaceDataset):
" what Whisper supports."
,
skipped
,
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
benchmarks/benchmark_serving.py
View file @
31436e8b
...
...
@@ -375,11 +375,12 @@ async def benchmark(
rps_change_events
.
append
({
"rps"
:
rps_val
,
"timestamp"
:
timestamp
})
last_int_rps
=
current_int_rps
prompt
,
prompt_len
,
output_len
,
mm_content
=
(
prompt
,
prompt_len
,
output_len
,
mm_content
,
request_id
=
(
request
.
prompt
,
request
.
prompt_len
,
request
.
expected_output_len
,
request
.
multi_modal_data
,
request
.
request_id
,
)
req_model_id
,
req_model_name
=
model_id
,
model_name
if
lora_modules
:
...
...
@@ -397,6 +398,7 @@ async def benchmark(
multi_modal_content
=
mm_content
,
ignore_eos
=
ignore_eos
,
extra_body
=
extra_body
,
request_id
=
request_id
,
)
task
=
limited_request_func
(
request_func_input
=
request_func_input
,
pbar
=
pbar
)
tasks
.
append
(
asyncio
.
create_task
(
task
))
...
...
@@ -665,6 +667,7 @@ def main(args: argparse.Namespace):
tokenizer
=
tokenizer
,
output_len
=
args
.
custom_output_len
,
skip_chat_template
=
args
.
custom_skip_chat_template
,
request_id_prefix
=
args
.
request_id_prefix
,
)
elif
args
.
dataset_name
==
"sonnet"
:
...
...
@@ -678,6 +681,7 @@ def main(args: argparse.Namespace):
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
return_prompt_formatted
=
False
,
request_id_prefix
=
args
.
request_id_prefix
,
)
else
:
assert
tokenizer
.
chat_template
or
tokenizer
.
default_chat_template
,
(
...
...
@@ -690,6 +694,7 @@ def main(args: argparse.Namespace):
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
return_prompt_formatted
=
True
,
request_id_prefix
=
args
.
request_id_prefix
,
)
elif
args
.
dataset_name
==
"hf"
:
...
...
@@ -751,6 +756,7 @@ def main(args: argparse.Namespace):
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
output_len
=
args
.
hf_output_len
,
request_id_prefix
=
args
.
request_id_prefix
,
)
else
:
...
...
@@ -762,10 +768,15 @@ def main(args: argparse.Namespace):
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
output_len
=
args
.
sharegpt_output_len
,
request_id_prefix
=
args
.
request_id_prefix
,
),
"burstgpt"
:
lambda
:
BurstGPTDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
),
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
request_id_prefix
=
args
.
request_id_prefix
,
),
"random"
:
lambda
:
RandomDataset
(
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
...
...
@@ -773,6 +784,7 @@ def main(args: argparse.Namespace):
input_len
=
args
.
random_input_len
,
output_len
=
args
.
random_output_len
,
range_ratio
=
args
.
random_range_ratio
,
request_id_prefix
=
args
.
request_id_prefix
,
),
}
...
...
@@ -1118,6 +1130,13 @@ def create_argument_parser():
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve"
,
)
parser
.
add_argument
(
"--request-id-prefix"
,
type
=
str
,
required
=
False
,
default
=
"benchmark-serving"
,
help
=
"Specify the prefix of request id."
,
)
# group for dataset specific arguments
custom_group
=
parser
.
add_argument_group
(
"custom dataset options"
)
...
...
vllm/benchmarks/datasets.py
View file @
31436e8b
...
...
@@ -18,6 +18,7 @@ import logging
import
random
from
abc
import
ABC
,
abstractmethod
from
collections.abc
import
Mapping
from
copy
import
deepcopy
from
dataclasses
import
dataclass
from
functools
import
cache
from
io
import
BytesIO
...
...
@@ -76,6 +77,7 @@ class SampleRequest:
Union
[
MultiModalDataDict
,
dict
,
list
[
dict
]]
]
=
None
lora_request
:
Optional
[
LoRARequest
]
=
None
request_id
:
Optional
[
str
]
=
None
# -----------------------------------------------------------------------------
...
...
@@ -183,7 +185,8 @@ class BenchmarkDataset(ABC):
@
abstractmethod
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
)
->
list
[
SampleRequest
]:
num_requests
:
int
,
request_id_prefix
:
str
=
""
)
->
list
[
SampleRequest
]:
"""
Abstract method to generate sample requests from the dataset.
...
...
@@ -194,6 +197,8 @@ class BenchmarkDataset(ABC):
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
for processing the dataset's text.
num_requests (int): The number of sample requests to generate.
request_id_prefix (str) The prefix of request_id.
Returns:
list[SampleRequest]: A list of sample requests generated from the
...
...
@@ -201,8 +206,12 @@ class BenchmarkDataset(ABC):
"""
raise
NotImplementedError
(
"sample must be implemented in subclasses."
)
def
maybe_oversample_requests
(
self
,
requests
:
list
[
SampleRequest
],
num_requests
:
int
)
->
None
:
def
maybe_oversample_requests
(
self
,
requests
:
list
[
SampleRequest
],
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
)
->
None
:
"""
Oversamples the list of requests if its size is less than the desired
number.
...
...
@@ -211,11 +220,17 @@ class BenchmarkDataset(ABC):
requests (List[SampleRequest]): The current list of sampled
requests.
num_requests (int): The target number of requests.
request_id_prefix (str) The prefix of the request ids.
"""
if
len
(
requests
)
<
num_requests
:
random
.
seed
(
self
.
random_seed
)
additional
=
random
.
choices
(
requests
,
k
=
num_requests
-
len
(
requests
))
additional
=
deepcopy
(
random
.
choices
(
requests
,
k
=
num_requests
-
len
(
requests
))
)
for
i
in
range
(
len
(
additional
)):
req
=
additional
[
i
]
req
.
request_id
=
request_id_prefix
+
str
(
len
(
requests
)
+
i
)
requests
.
extend
(
additional
)
logger
.
info
(
"Oversampled requests to reach %d total samples."
,
num_requests
)
...
...
@@ -334,6 +349,7 @@ class RandomDataset(BenchmarkDataset):
range_ratio
:
float
=
DEFAULT_RANGE_RATIO
,
input_len
:
int
=
DEFAULT_INPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
# Enforce range_ratio < 1
...
...
@@ -391,6 +407,7 @@ class RandomDataset(BenchmarkDataset):
prompt
=
prompt
,
prompt_len
=
total_input_len
,
expected_output_len
=
int
(
output_lens
[
i
]),
request_id
=
request_id_prefix
+
str
(
i
),
))
return
requests
...
...
@@ -432,9 +449,11 @@ class ShareGPTDataset(BenchmarkDataset):
max_loras
:
Optional
[
int
]
=
None
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
samples
:
list
=
[]
ind
=
0
for
entry
in
self
.
data
:
if
len
(
samples
)
>=
num_requests
:
break
...
...
@@ -470,8 +489,10 @@ class ShareGPTDataset(BenchmarkDataset):
expected_output_len
=
new_output_len
,
lora_request
=
lora_request
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
self
.
maybe_oversample_requests
(
samples
,
num_requests
)
ind
+=
1
self
.
maybe_oversample_requests
(
samples
,
num_requests
,
request_id_prefix
)
return
samples
...
...
@@ -647,6 +668,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
tokenizer
=
tokenizer
,
output_len
=
args
.
custom_output_len
,
skip_chat_template
=
args
.
custom_skip_chat_template
,
request_id_prefix
=
args
.
request_id_prefix
,
)
elif
args
.
dataset_name
==
"sonnet"
:
...
...
@@ -660,6 +682,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
return_prompt_formatted
=
False
,
request_id_prefix
=
args
.
request_id_prefix
,
)
else
:
assert
tokenizer
.
chat_template
or
tokenizer
.
default_chat_template
,
(
...
...
@@ -671,6 +694,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
prefix_len
=
args
.
sonnet_prefix_len
,
tokenizer
=
tokenizer
,
return_prompt_formatted
=
True
,
request_id_prefix
=
args
.
request_id_prefix
,
)
elif
args
.
dataset_name
==
"hf"
:
...
...
@@ -730,6 +754,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
output_len
=
args
.
hf_output_len
,
request_id_prefix
=
args
.
request_id_prefix
,
)
else
:
...
...
@@ -741,11 +766,13 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
output_len
=
args
.
sharegpt_output_len
,
request_id_prefix
=
args
.
request_id_prefix
,
),
"burstgpt"
:
lambda
:
BurstGPTDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
),
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
request_id_prefix
=
args
.
request_id_prefix
,),
"random"
:
lambda
:
RandomDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
...
...
@@ -755,6 +782,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
input_len
=
args
.
random_input_len
,
output_len
=
args
.
random_output_len
,
range_ratio
=
args
.
random_range_ratio
,
request_id_prefix
=
args
.
request_id_prefix
,
),
"prefix_repetition"
:
lambda
:
PrefixRepetitionRandomDataset
(
...
...
@@ -766,6 +794,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
suffix_len
=
args
.
prefix_repetition_suffix_len
,
num_prefixes
=
args
.
prefix_repetition_num_prefixes
,
output_len
=
args
.
prefix_repetition_output_len
,
request_id_prefix
=
args
.
request_id_prefix
,
),
}
...
...
@@ -839,10 +868,11 @@ class CustomDataset(BenchmarkDataset):
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
skip_chat_template
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
prompt
=
item
[
"prompt"
]
...
...
@@ -864,8 +894,10 @@ class CustomDataset(BenchmarkDataset):
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
i
),
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -909,6 +941,7 @@ class SonnetDataset(BenchmarkDataset):
input_len
:
int
=
DEFAULT_INPUT_LEN
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
return_prompt_formatted
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
# Calculate average token length for a poem line.
...
...
@@ -934,6 +967,7 @@ class SonnetDataset(BenchmarkDataset):
prefix_lines
=
self
.
data
[:
num_prefix_lines
]
samples
=
[]
ind
=
0
while
len
(
samples
)
<
num_requests
:
extra_lines
=
random
.
choices
(
self
.
data
,
k
=
num_input_lines
-
num_prefix_lines
)
...
...
@@ -949,7 +983,9 @@ class SonnetDataset(BenchmarkDataset):
if
return_prompt_formatted
else
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
ind
+=
1
return
samples
...
...
@@ -1000,6 +1036,7 @@ class BurstGPTDataset(BenchmarkDataset):
num_requests
:
int
,
max_loras
:
Optional
[
int
]
=
None
,
lora_path
:
Optional
[
str
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
samples
=
[]
...
...
@@ -1020,6 +1057,7 @@ class BurstGPTDataset(BenchmarkDataset):
prompt_len
=
input_len
,
expected_output_len
=
output_len
,
lora_request
=
lora_req
,
request_id
=
request_id_prefix
+
str
(
i
),
))
return
samples
...
...
@@ -1075,11 +1113,13 @@ class ConversationDataset(HuggingFaceDataset):
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
)
->
list
:
# Filter examples with at least 2 conversations
filtered_data
=
self
.
data
.
filter
(
lambda
x
:
len
(
x
[
"conversations"
])
>=
2
)
sampled_requests
=
[]
ind
=
0
dynamic_output
=
output_len
is
None
for
item
in
filtered_data
:
...
...
@@ -1111,8 +1151,11 @@ class ConversationDataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
ind
+=
1
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -1141,12 +1184,13 @@ class VisionArenaDataset(HuggingFaceDataset):
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
output_len
=
(
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
parser_fn
=
self
.
SUPPORTED_DATASET_PATHS
.
get
(
self
.
dataset_path
)
...
...
@@ -1168,8 +1212,10 @@ class VisionArenaDataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
i
),
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -1198,11 +1244,12 @@ class InstructCoderDataset(HuggingFaceDataset):
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
)
->
list
:
output_len
=
(
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
prompt
=
f
"
{
item
[
'input'
]
}
\n\n
{
item
[
'instruction'
]
}
Just output
\
...
...
@@ -1224,8 +1271,10 @@ class InstructCoderDataset(HuggingFaceDataset):
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
i
),
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -1255,13 +1304,14 @@ class MTBenchDataset(HuggingFaceDataset):
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
enable_multimodal_chat
:
bool
=
False
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
output_len
=
(
output_len
if
output_len
is
not
None
else
self
.
DEFAULT_OUTPUT_LEN
)
sampled_requests
=
[]
for
item
in
self
.
data
:
for
i
,
item
in
enumerate
(
self
.
data
)
:
if
len
(
sampled_requests
)
>=
num_requests
:
break
prompt
=
item
[
"turns"
][
0
]
...
...
@@ -1282,8 +1332,10 @@ class MTBenchDataset(HuggingFaceDataset):
prompt
=
prompt
,
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
request_id
=
request_id_prefix
+
str
(
i
),
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -1305,8 +1357,10 @@ class AIMODataset(HuggingFaceDataset):
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
)
->
list
:
sampled_requests
=
[]
ind
=
0
dynamic_output
=
output_len
is
None
for
item
in
self
.
data
:
...
...
@@ -1331,8 +1385,12 @@ class AIMODataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
None
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
ind
+=
1
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -1403,13 +1461,14 @@ class NextEditPredictionDataset(HuggingFaceDataset):
}
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
**
kwargs
):
formatting_prompt_func
=
self
.
MAPPING_PROMPT_FUNCS
.
get
(
self
.
dataset_path
)
if
formatting_prompt_func
is
None
:
raise
ValueError
(
f
"Unsupported dataset path:
{
self
.
dataset_path
}
"
)
samples
=
[]
for
sample
in
self
.
data
:
for
i
,
sample
in
enumerate
(
self
.
data
)
:
sample
=
formatting_prompt_func
(
sample
)
samples
.
append
(
SampleRequest
(
...
...
@@ -1417,10 +1476,11 @@ class NextEditPredictionDataset(HuggingFaceDataset):
prompt_len
=
len
(
tokenizer
(
sample
[
"prompt"
]).
input_ids
),
expected_output_len
=
len
(
tokenizer
(
sample
[
"expected_output"
]).
input_ids
),
request_id
=
request_id_prefix
+
str
(
i
),
))
if
len
(
samples
)
>=
num_requests
:
break
self
.
maybe_oversample_requests
(
samples
,
num_requests
)
self
.
maybe_oversample_requests
(
samples
,
num_requests
,
request_id_prefix
)
return
samples
...
...
@@ -1470,6 +1530,7 @@ class ASRDataset(HuggingFaceDataset):
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
:
output_len
=
(
output_len
...
...
@@ -1477,6 +1538,7 @@ class ASRDataset(HuggingFaceDataset):
prompt
=
ASRDataset
.
TRANSCRIPTION_PREAMBLE
prompt_len
=
len
(
tokenizer
(
prompt
).
input_ids
)
sampled_requests
=
[]
ind
=
0
skipped
=
0
for
item
in
self
.
data
:
if
len
(
sampled_requests
)
>=
num_requests
:
...
...
@@ -1496,7 +1558,9 @@ class ASRDataset(HuggingFaceDataset):
prompt_len
=
prompt_len
,
expected_output_len
=
output_len
,
multi_modal_data
=
mm_content
,
request_id
=
request_id_prefix
+
str
(
ind
),
))
ind
+=
1
if
skipped
:
logger
.
warning
(
"%d samples discarded from dataset due to"
...
...
@@ -1504,7 +1568,8 @@ class ASRDataset(HuggingFaceDataset):
" what Whisper supports."
,
skipped
,
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -1541,11 +1606,13 @@ class MLPerfDataset(HuggingFaceDataset):
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
output_len
:
Optional
[
int
]
=
None
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
# Force dynamic output length based on reference completion.
dynamic_output
=
output_len
is
None
sampled_requests
:
list
[
SampleRequest
]
=
[]
ind
=
0
for
item
in
self
.
data
:
if
len
(
sampled_requests
)
>=
num_requests
:
...
...
@@ -1580,10 +1647,13 @@ class MLPerfDataset(HuggingFaceDataset):
prompt
=
prompt_formatted
,
prompt_len
=
prompt_len
,
expected_output_len
=
expected_output_len
,
request_id
=
request_id_prefix
+
str
(
ind
),
)
)
ind
+=
1
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
)
self
.
maybe_oversample_requests
(
sampled_requests
,
num_requests
,
request_id_prefix
)
return
sampled_requests
...
...
@@ -1616,6 +1686,7 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
suffix_len
:
int
=
DEFAULT_SUFFIX_LEN
,
num_prefixes
:
int
=
DEFAULT_NUM_PREFIXES
,
output_len
:
int
=
DEFAULT_OUTPUT_LEN
,
request_id_prefix
:
str
=
""
,
**
kwargs
,
)
->
list
[
SampleRequest
]:
vocab_size
=
tokenizer
.
vocab_size
...
...
vllm/benchmarks/lib/endpoint_request_func.py
View file @
31436e8b
...
...
@@ -31,6 +31,7 @@ class RequestFuncInput:
multi_modal_content
:
Optional
[
dict
|
list
[
dict
]]
=
None
ignore_eos
:
bool
=
False
language
:
Optional
[
str
]
=
None
request_id
:
Optional
[
str
]
=
None
@
dataclass
...
...
@@ -87,6 +88,8 @@ async def async_request_openai_completions(
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
...
...
@@ -210,6 +213,8 @@ async def async_request_openai_chat_completions(
"Content-Type"
:
"application/json"
,
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
output
=
RequestFuncOutput
()
output
.
prompt_len
=
request_func_input
.
prompt_len
...
...
@@ -311,6 +316,8 @@ async def async_request_openai_audio(
headers
=
{
"Authorization"
:
f
"Bearer
{
os
.
environ
.
get
(
'OPENAI_API_KEY'
)
}
"
,
}
if
request_func_input
.
request_id
:
headers
[
"x-request-id"
]
=
request_func_input
.
request_id
# Send audio file
def
to_bytes
(
y
,
sr
):
...
...
vllm/benchmarks/serve.py
View file @
31436e8b
...
...
@@ -478,11 +478,12 @@ async def benchmark(
"timestamp"
:
timestamp
})
last_int_rps
=
current_int_rps
prompt
,
prompt_len
,
output_len
,
mm_content
=
(
prompt
,
prompt_len
,
output_len
,
mm_content
,
request_id
=
(
request
.
prompt
,
request
.
prompt_len
,
request
.
expected_output_len
,
request
.
multi_modal_data
,
request
.
request_id
,
)
req_model_id
,
req_model_name
=
model_id
,
model_name
if
lora_modules
:
...
...
@@ -498,7 +499,8 @@ async def benchmark(
logprobs
=
logprobs
,
multi_modal_content
=
mm_content
,
ignore_eos
=
ignore_eos
,
extra_body
=
extra_body
)
extra_body
=
extra_body
,
request_id
=
request_id
,)
tasks
.
append
(
asyncio
.
create_task
(
limited_request_func
(
request_func_input
=
request_func_input
,
...
...
@@ -865,6 +867,14 @@ def add_cli_args(parser: argparse.ArgumentParser):
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve"
,
)
parser
.
add_argument
(
"--request-id-prefix"
,
type
=
str
,
required
=
False
,
default
=
"benchmark-serving"
,
help
=
"Specify the prefix of request id."
,
)
sampling_group
=
parser
.
add_argument_group
(
"sampling parameters"
)
sampling_group
.
add_argument
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment