Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
9502c381
Unverified
Commit
9502c381
authored
Jun 26, 2025
by
Ekagra Ranjan
Committed by
GitHub
Jun 25, 2025
Browse files
[Benchmark][Bug] Fix multiple bugs in bench and add args to spec_decode offline (#20083)
parent
25826835
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
28 additions
and
11 deletions
+28
-11
benchmarks/benchmark_dataset.py
benchmarks/benchmark_dataset.py
+2
-1
examples/offline_inference/spec_decode.py
examples/offline_inference/spec_decode.py
+13
-7
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+7
-3
vllm/benchmarks/serve.py
vllm/benchmarks/serve.py
+6
-0
No files found.
benchmarks/benchmark_dataset.py
View file @
9502c381
...
@@ -349,8 +349,9 @@ class RandomDataset(BenchmarkDataset):
...
@@ -349,8 +349,9 @@ class RandomDataset(BenchmarkDataset):
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# To avoid uncontrolled change of the prompt length,
# To avoid uncontrolled change of the prompt length,
# the encoded sequence is truncated before being decoded again.
# the encoded sequence is truncated before being decoded again.
total_input_len
=
prefix_len
+
int
(
input_lens
[
i
])
re_encoded_sequence
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)[
re_encoded_sequence
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)[
:
input_len
s
[
i
]
:
total_
input_len
]
]
prompt
=
tokenizer
.
decode
(
re_encoded_sequence
)
prompt
=
tokenizer
.
decode
(
re_encoded_sequence
)
total_input_len
=
len
(
re_encoded_sequence
)
total_input_len
=
len
(
re_encoded_sequence
)
...
...
examples/offline_inference/spec_decode.py
View file @
9502c381
...
@@ -39,6 +39,9 @@ def parse_args():
...
@@ -39,6 +39,9 @@ def parse_args():
parser
.
add_argument
(
"--top-k"
,
type
=
int
,
default
=-
1
)
parser
.
add_argument
(
"--top-k"
,
type
=
int
,
default
=-
1
)
parser
.
add_argument
(
"--print-output"
,
action
=
"store_true"
)
parser
.
add_argument
(
"--print-output"
,
action
=
"store_true"
)
parser
.
add_argument
(
"--output-len"
,
type
=
int
,
default
=
256
)
parser
.
add_argument
(
"--output-len"
,
type
=
int
,
default
=
256
)
parser
.
add_argument
(
"--model-dir"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--eagle-dir"
,
type
=
str
,
default
=
None
)
parser
.
add_argument
(
"--max-model-len"
,
type
=
int
,
default
=
2048
)
return
parser
.
parse_args
()
return
parser
.
parse_args
()
...
@@ -46,9 +49,10 @@ def main():
...
@@ -46,9 +49,10 @@ def main():
args
=
parse_args
()
args
=
parse_args
()
args
.
endpoint_type
=
"openai-chat"
args
.
endpoint_type
=
"openai-chat"
model_dir
=
args
.
model_dir
if
args
.
model_dir
is
None
:
model_dir
=
"meta-llama/Llama-3.1-8B-Instruct"
model_dir
=
"meta-llama/Llama-3.1-8B-Instruct"
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_dir
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_dir
)
max_model_len
=
2048
prompts
=
get_samples
(
args
,
tokenizer
)
prompts
=
get_samples
(
args
,
tokenizer
)
# add_special_tokens is False to avoid adding bos twice when using chat templates
# add_special_tokens is False to avoid adding bos twice when using chat templates
...
@@ -57,16 +61,18 @@ def main():
...
@@ -57,16 +61,18 @@ def main():
]
]
if
args
.
method
==
"eagle"
or
args
.
method
==
"eagle3"
:
if
args
.
method
==
"eagle"
or
args
.
method
==
"eagle3"
:
if
args
.
method
==
"eagle"
:
eagle_dir
=
args
.
eagle_dir
if
args
.
method
==
"eagle"
and
eagle_dir
is
None
:
eagle_dir
=
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
eagle_dir
=
"yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
elif
args
.
method
==
"eagle3"
:
elif
args
.
method
==
"eagle3"
and
eagle_dir
is
None
:
eagle_dir
=
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
eagle_dir
=
"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
speculative_config
=
{
speculative_config
=
{
"method"
:
args
.
method
,
"method"
:
args
.
method
,
"model"
:
eagle_dir
,
"model"
:
eagle_dir
,
"num_speculative_tokens"
:
args
.
num_spec_tokens
,
"num_speculative_tokens"
:
args
.
num_spec_tokens
,
"draft_tensor_parallel_size"
:
args
.
draft_tp
,
"draft_tensor_parallel_size"
:
args
.
draft_tp
,
"max_model_len"
:
max_model_len
,
"max_model_len"
:
args
.
max_model_len
,
}
}
elif
args
.
method
==
"ngram"
:
elif
args
.
method
==
"ngram"
:
speculative_config
=
{
speculative_config
=
{
...
@@ -74,7 +80,7 @@ def main():
...
@@ -74,7 +80,7 @@ def main():
"num_speculative_tokens"
:
args
.
num_spec_tokens
,
"num_speculative_tokens"
:
args
.
num_spec_tokens
,
"prompt_lookup_max"
:
args
.
prompt_lookup_max
,
"prompt_lookup_max"
:
args
.
prompt_lookup_max
,
"prompt_lookup_min"
:
args
.
prompt_lookup_min
,
"prompt_lookup_min"
:
args
.
prompt_lookup_min
,
"max_model_len"
:
max_model_len
,
"max_model_len"
:
args
.
max_model_len
,
}
}
else
:
else
:
raise
ValueError
(
f
"unknown method:
{
args
.
method
}
"
)
raise
ValueError
(
f
"unknown method:
{
args
.
method
}
"
)
...
@@ -86,7 +92,7 @@ def main():
...
@@ -86,7 +92,7 @@ def main():
enable_chunked_prefill
=
args
.
enable_chunked_prefill
,
enable_chunked_prefill
=
args
.
enable_chunked_prefill
,
max_num_batched_tokens
=
args
.
max_num_batched_tokens
,
max_num_batched_tokens
=
args
.
max_num_batched_tokens
,
enforce_eager
=
args
.
enforce_eager
,
enforce_eager
=
args
.
enforce_eager
,
max_model_len
=
max_model_len
,
max_model_len
=
args
.
max_model_len
,
max_num_seqs
=
args
.
max_num_seqs
,
max_num_seqs
=
args
.
max_num_seqs
,
gpu_memory_utilization
=
0.8
,
gpu_memory_utilization
=
0.8
,
speculative_config
=
speculative_config
,
speculative_config
=
speculative_config
,
...
...
vllm/benchmarks/datasets.py
View file @
9502c381
...
@@ -320,6 +320,8 @@ class RandomDataset(BenchmarkDataset):
...
@@ -320,6 +320,8 @@ class RandomDataset(BenchmarkDataset):
**
kwargs
,
**
kwargs
,
)
->
None
:
)
->
None
:
super
().
__init__
(
**
kwargs
)
super
().
__init__
(
**
kwargs
)
random
.
seed
(
self
.
random_seed
)
np
.
random
.
seed
(
self
.
random_seed
)
def
sample
(
def
sample
(
self
,
self
,
...
@@ -376,10 +378,11 @@ class RandomDataset(BenchmarkDataset):
...
@@ -376,10 +378,11 @@ class RandomDataset(BenchmarkDataset):
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
# To avoid uncontrolled change of the prompt length,
# To avoid uncontrolled change of the prompt length,
# the encoded sequence is truncated before being decoded again.
# the encoded sequence is truncated before being decoded again.
total_input_len
=
prefix_len
+
int
(
input_lens
[
i
])
re_encoded_sequence
=
tokenizer
.
encode
(
re_encoded_sequence
=
tokenizer
.
encode
(
prompt
,
add_special_tokens
=
False
)[:
input_len
s
[
i
]
]
prompt
,
add_special_tokens
=
False
)[:
total_
input_len
]
prompt
=
tokenizer
.
decode
(
re_encoded_sequence
)
prompt
=
tokenizer
.
decode
(
re_encoded_sequence
)
total_input_len
=
prefix_len
+
int
(
input_lens
[
i
]
)
total_input_len
=
len
(
re_encoded_sequence
)
requests
.
append
(
requests
.
append
(
SampleRequest
(
SampleRequest
(
prompt
=
prompt
,
prompt
=
prompt
,
...
@@ -692,7 +695,8 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
...
@@ -692,7 +695,8 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
dataset_path
=
args
.
dataset_path
).
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
),
sample
(
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
),
"random"
:
"random"
:
lambda
:
RandomDataset
(
dataset_path
=
args
.
dataset_path
).
sample
(
lambda
:
RandomDataset
(
random_seed
=
args
.
seed
,
dataset_path
=
args
.
dataset_path
).
sample
(
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
num_requests
=
args
.
num_prompts
,
num_requests
=
args
.
num_prompts
,
prefix_len
=
args
.
random_prefix_len
,
prefix_len
=
args
.
random_prefix_len
,
...
...
vllm/benchmarks/serve.py
View file @
9502c381
...
@@ -631,6 +631,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
...
@@ -631,6 +631,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
help
=
"The label (prefix) of the benchmark results. If not specified, "
help
=
"The label (prefix) of the benchmark results. If not specified, "
"the endpoint type will be used as the label."
,
"the endpoint type will be used as the label."
,
)
)
parser
.
add_argument
(
"--backend"
,
type
=
str
,
default
=
"vllm"
,
choices
=
list
(
ASYNC_REQUEST_FUNCS
.
keys
()),
)
parser
.
add_argument
(
parser
.
add_argument
(
"--base-url"
,
"--base-url"
,
type
=
str
,
type
=
str
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment