Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
58abe354
Unverified
Commit
58abe354
authored
Mar 07, 2025
by
Jeremy Arnold
Committed by
GitHub
Mar 07, 2025
Browse files
[Benchmarks] Make detokenization optional in benchmark scripts (#11697)
Signed-off-by:
Jeremy Arnold
<
Jeremy.Arnold@amd.com
>
parent
f7ebad23
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
45 additions
and
7 deletions
+45
-7
benchmarks/benchmark_latency.py
benchmarks/benchmark_latency.py
+7
-0
benchmarks/benchmark_prefix_caching.py
benchmarks/benchmark_prefix_caching.py
+9
-1
benchmarks/benchmark_prioritization.py
benchmarks/benchmark_prioritization.py
+11
-2
benchmarks/benchmark_throughput.py
benchmarks/benchmark_throughput.py
+18
-4
No files found.
benchmarks/benchmark_latency.py
View file @
58abe354
...
...
@@ -52,6 +52,7 @@ def main(args: argparse.Namespace):
top_p
=
1.0
,
ignore_eos
=
True
,
max_tokens
=
args
.
output_len
,
detokenize
=
not
args
.
disable_detokenize
,
)
print
(
sampling_params
)
dummy_prompt_token_ids
=
np
.
random
.
randint
(
10000
,
...
...
@@ -173,6 +174,12 @@ if __name__ == "__main__":
default
=
None
,
help
=
"Path to save the latency results in JSON format."
,
)
parser
.
add_argument
(
"--disable-detokenize"
,
action
=
"store_true"
,
help
=
(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)
parser
=
EngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
...
...
benchmarks/benchmark_prefix_caching.py
View file @
58abe354
...
...
@@ -194,7 +194,9 @@ def main(args):
llm
=
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
args
.
output_len
)
sampling_params
=
SamplingParams
(
temperature
=
0
,
max_tokens
=
args
.
output_len
,
detokenize
=
not
args
.
disable_detokenize
)
print
(
"Testing filtered requests"
)
prompts
=
repeat_and_sort_requests
(
filtered_requests
,
...
...
@@ -243,6 +245,12 @@ if __name__ == "__main__":
"subtract this length when filtering prompts. Only used "
"when dataset-path is not provided."
,
)
parser
.
add_argument
(
'--disable-detokenize'
,
action
=
'store_true'
,
help
=
(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)
parser
=
EngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
...
...
benchmarks/benchmark_prioritization.py
View file @
58abe354
...
...
@@ -23,7 +23,7 @@ def sample_requests(
num_requests
:
int
,
tokenizer
:
PreTrainedTokenizerBase
,
fixed_output_len
:
Optional
[
int
],
)
->
list
[
tuple
[
str
,
int
,
int
]]:
)
->
list
[
tuple
[
str
,
int
,
int
,
int
]]:
if
fixed_output_len
is
not
None
and
fixed_output_len
<
4
:
raise
ValueError
(
"output_len too small"
)
...
...
@@ -71,6 +71,7 @@ def run_vllm(
requests
:
list
[
tuple
[
str
,
int
,
int
]],
n
:
int
,
engine_args
:
EngineArgs
,
disable_detokenize
:
bool
=
False
,
)
->
float
:
from
vllm
import
LLM
,
SamplingParams
llm
=
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
...
...
@@ -95,6 +96,7 @@ def run_vllm(
top_p
=
1.0
,
ignore_eos
=
True
,
max_tokens
=
output_len
,
detokenize
=
not
disable_detokenize
,
))
start
=
time
.
perf_counter
()
...
...
@@ -121,7 +123,8 @@ def main(args: argparse.Namespace):
if
args
.
backend
==
"vllm"
:
elapsed_time
=
run_vllm
(
requests
,
args
.
n
,
EngineArgs
.
from_cli_args
(
args
))
EngineArgs
.
from_cli_args
(
args
),
args
.
disable_detokenize
)
else
:
raise
ValueError
(
f
"Unknown backend:
{
args
.
backend
}
"
)
total_num_tokens
=
sum
(
prompt_len
+
output_len
...
...
@@ -174,6 +177,12 @@ if __name__ == "__main__":
type
=
str
,
default
=
None
,
help
=
'Path to save the throughput results in JSON format.'
)
parser
.
add_argument
(
'--disable-detokenize'
,
action
=
'store_true'
,
help
=
(
"Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"
),
)
parser
=
EngineArgs
.
add_cli_args
(
parser
)
args
=
parser
.
parse_args
()
...
...
benchmarks/benchmark_throughput.py
View file @
58abe354
...
...
@@ -168,6 +168,7 @@ def run_vllm(
requests
:
list
[
SampleRequest
],
n
:
int
,
engine_args
:
EngineArgs
,
disable_detokenize
:
bool
=
False
,
)
->
float
:
from
vllm
import
LLM
,
SamplingParams
llm
=
LLM
(
**
dataclasses
.
asdict
(
engine_args
))
...
...
@@ -194,6 +195,7 @@ def run_vllm(
top_p
=
1.0
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
detokenize
=
not
disable_detokenize
,
))
lora_requests
:
Optional
[
list
[
LoRARequest
]]
=
None
if
engine_args
.
enable_lora
:
...
...
@@ -232,6 +234,7 @@ async def run_vllm_async(
n
:
int
,
engine_args
:
AsyncEngineArgs
,
disable_frontend_multiprocessing
:
bool
=
False
,
disable_detokenize
:
bool
=
False
,
)
->
float
:
from
vllm
import
SamplingParams
...
...
@@ -262,6 +265,7 @@ async def run_vllm_async(
top_p
=
1.0
,
ignore_eos
=
True
,
max_tokens
=
request
.
expected_output_len
,
detokenize
=
not
disable_detokenize
,
))
lora_requests
.
append
(
request
.
lora_request
)
...
...
@@ -288,6 +292,7 @@ def run_hf(
n
:
int
,
max_batch_size
:
int
,
trust_remote_code
:
bool
,
disable_detokenize
:
bool
=
False
,
)
->
float
:
llm
=
AutoModelForCausalLM
.
from_pretrained
(
model
,
torch_dtype
=
torch
.
float16
,
trust_remote_code
=
trust_remote_code
)
...
...
@@ -327,8 +332,9 @@ def run_hf(
use_cache
=
True
,
max_new_tokens
=
max_output_len
,
)
# Include the decoding time.
tokenizer
.
batch_decode
(
llm_outputs
,
skip_special_tokens
=
True
)
if
not
disable_detokenize
:
# Include the decoding time.
tokenizer
.
batch_decode
(
llm_outputs
,
skip_special_tokens
=
True
)
pbar
.
update
(
len
(
batch
))
# Clear the batch.
...
...
@@ -440,14 +446,17 @@ def main(args: argparse.Namespace):
args
.
n
,
AsyncEngineArgs
.
from_cli_args
(
args
),
args
.
disable_frontend_multiprocessing
,
args
.
disable_detokenize
,
))
else
:
elapsed_time
=
run_vllm
(
requests
,
args
.
n
,
EngineArgs
.
from_cli_args
(
args
))
EngineArgs
.
from_cli_args
(
args
),
args
.
disable_detokenize
)
elif
args
.
backend
==
"hf"
:
assert
args
.
tensor_parallel_size
==
1
elapsed_time
=
run_hf
(
requests
,
args
.
model
,
tokenizer
,
args
.
n
,
args
.
hf_max_batch_size
,
args
.
trust_remote_code
)
args
.
hf_max_batch_size
,
args
.
trust_remote_code
,
args
.
disable_detokenize
)
elif
args
.
backend
==
"mii"
:
elapsed_time
=
run_mii
(
requests
,
args
.
model
,
args
.
tensor_parallel_size
,
args
.
output_len
)
...
...
@@ -526,6 +535,11 @@ if __name__ == "__main__":
action
=
'store_true'
,
default
=
False
,
help
=
"Disable decoupled async engine frontend."
)
parser
.
add_argument
(
"--disable-detokenize"
,
action
=
"store_true"
,
help
=
(
"Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"
))
# LoRA
parser
.
add_argument
(
"--lora-path"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment