Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c83c4ff8
Unverified
Commit
c83c4ff8
authored
Sep 03, 2025
by
Jiangyun Zhu
Committed by
GitHub
Sep 02, 2025
Browse files
[Benchmark] Add support for local hf dataset path in benchmark (#23999)
Signed-off-by:
zjy0516
<
riverclouds.zhu@qq.com
>
parent
0e1759cd
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
56 additions
and
15 deletions
+56
-15
benchmarks/README.md
benchmarks/README.md
+6
-1
vllm/benchmarks/datasets.py
vllm/benchmarks/datasets.py
+50
-14
No files found.
benchmarks/README.md
View file @
c83c4ff8
...
...
@@ -110,7 +110,12 @@ become available.
🚧: to be supported
**Note**
: HuggingFace dataset's
`dataset-name`
should be set to
`hf`
**Note**: For Hugging Face datasets, `dataset-name` should be set to `hf`.
For a local `dataset-path`, set `hf-name` to the dataset's Hugging Face ID, for example:

```bash
--dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
```
## 🚀 Example - Online Benchmark
...
...
vllm/benchmarks/datasets.py
View file @
c83c4ff8
...
...
@@ -1227,6 +1227,16 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
type
=
str
,
default
=
None
,
help
=
"Split of the HF dataset."
)
hf_group
.
add_argument
(
"--hf-name"
,
type
=
str
,
default
=
None
,
help
=
(
"Name of the dataset on HuggingFace "
"(e.g., 'lmarena-ai/VisionArena-Chat'). "
"Specify this if your dataset-path is a local path."
),
)
hf_group
.
add_argument
(
"--hf-output-len"
,
type
=
int
,
...
...
@@ -1307,28 +1317,53 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
elif
args
.
dataset_name
==
"hf"
:
# all following datasets are implemented from the
# HuggingFaceDataset base class
if
args
.
dataset_path
in
VisionArenaDataset
.
SUPPORTED_DATASET_PATHS
:
if
(
args
.
dataset_path
in
VisionArenaDataset
.
SUPPORTED_DATASET_PATHS
or
args
.
hf_name
in
VisionArenaDataset
.
SUPPORTED_DATASET_PATHS
):
dataset_class
=
VisionArenaDataset
args
.
hf_split
=
"train"
args
.
hf_subset
=
None
elif
args
.
dataset_path
in
InstructCoderDataset
.
SUPPORTED_DATASET_PATHS
:
elif
(
args
.
dataset_path
in
InstructCoderDataset
.
SUPPORTED_DATASET_PATHS
or
args
.
hf_name
in
InstructCoderDataset
.
SUPPORTED_DATASET_PATHS
):
dataset_class
=
InstructCoderDataset
args
.
hf_split
=
"train"
elif
args
.
dataset_path
in
MTBenchDataset
.
SUPPORTED_DATASET_PATHS
:
elif
(
args
.
dataset_path
in
MTBenchDataset
.
SUPPORTED_DATASET_PATHS
or
args
.
hf_name
in
MTBenchDataset
.
SUPPORTED_DATASET_PATHS
):
dataset_class
=
MTBenchDataset
args
.
hf_split
=
"train"
elif
args
.
dataset_path
in
ConversationDataset
.
SUPPORTED_DATASET_PATHS
:
elif
(
args
.
dataset_path
in
ConversationDataset
.
SUPPORTED_DATASET_PATHS
or
args
.
hf_name
in
ConversationDataset
.
SUPPORTED_DATASET_PATHS
):
dataset_class
=
ConversationDataset
elif
args
.
dataset_path
in
AIMODataset
.
SUPPORTED_DATASET_PATHS
:
elif
(
args
.
dataset_path
in
AIMODataset
.
SUPPORTED_DATASET_PATHS
or
args
.
hf_name
in
AIMODataset
.
SUPPORTED_DATASET_PATHS
):
dataset_class
=
AIMODataset
args
.
hf_split
=
"train"
elif
args
.
dataset_path
in
NextEditPredictionDataset
.
SUPPORTED_DATASET_PATHS
:
# noqa: E501
elif
(
args
.
dataset_path
in
NextEditPredictionDataset
.
SUPPORTED_DATASET_PATHS
# noqa: E501
or
args
.
hf_name
in
NextEditPredictionDataset
.
SUPPORTED_DATASET_PATHS
):
dataset_class
=
NextEditPredictionDataset
args
.
hf_split
=
"train"
elif
args
.
dataset_path
in
ASRDataset
.
SUPPORTED_DATASET_PATHS
:
elif
(
args
.
dataset_path
in
ASRDataset
.
SUPPORTED_DATASET_PATHS
or
args
.
hf_name
in
ASRDataset
.
SUPPORTED_DATASET_PATHS
):
dataset_class
=
ASRDataset
args
.
hf_split
=
"train"
elif
args
.
dataset_path
in
MLPerfDataset
.
SUPPORTED_DATASET_PATHS
:
elif
(
args
.
dataset_path
in
MLPerfDataset
.
SUPPORTED_DATASET_PATHS
or
args
.
hf_name
in
MLPerfDataset
.
SUPPORTED_DATASET_PATHS
):
dataset_class
=
MLPerfDataset
args
.
hf_split
=
"train"
else
:
...
...
@@ -1358,6 +1393,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
dataset_split
=
args
.
hf_split
,
random_seed
=
args
.
seed
,
no_stream
=
args
.
no_stream
,
hf_name
=
args
.
hf_name
,
).
sample
(
num_requests
=
args
.
num_prompts
,
tokenizer
=
tokenizer
,
...
...
@@ -1710,6 +1746,7 @@ class HuggingFaceDataset(BenchmarkDataset):
dataset_split
:
str
,
no_stream
:
bool
=
False
,
dataset_subset
:
Optional
[
str
]
=
None
,
hf_name
:
Optional
[
str
]
=
None
,
**
kwargs
,
)
->
None
:
super
().
__init__
(
dataset_path
=
dataset_path
,
**
kwargs
)
...
...
@@ -1717,6 +1754,7 @@ class HuggingFaceDataset(BenchmarkDataset):
self
.
dataset_split
=
dataset_split
self
.
dataset_subset
=
dataset_subset
self
.
load_stream
=
not
no_stream
self
.
hf_name
=
hf_name
or
dataset_path
self
.
load_data
()
def
load_data
(
self
)
->
None
:
...
...
@@ -1827,10 +1865,9 @@ class VisionArenaDataset(HuggingFaceDataset):
for
i
,
item
in
enumerate
(
self
.
data
):
if
len
(
sampled_requests
)
>=
num_requests
:
break
parser_fn
=
self
.
SUPPORTED_DATASET_PATHS
.
get
(
self
.
dataset_path
)
parser_fn
=
self
.
SUPPORTED_DATASET_PATHS
.
get
(
self
.
hf_name
)
if
parser_fn
is
None
:
raise
ValueError
(
f
"Unsupported dataset path:
{
self
.
dataset_path
}
"
)
raise
ValueError
(
f
"Unsupported dataset path:
{
self
.
hf_name
}
"
)
prompt
=
parser_fn
(
item
)
mm_content
=
process_image
(
item
[
"images"
][
0
])
prompt_len
=
len
(
tokenizer
(
prompt
).
input_ids
)
...
...
@@ -2099,10 +2136,9 @@ class NextEditPredictionDataset(HuggingFaceDataset):
def
sample
(
self
,
tokenizer
:
PreTrainedTokenizerBase
,
num_requests
:
int
,
request_id_prefix
:
str
=
""
,
**
kwargs
):
formatting_prompt_func
=
self
.
MAPPING_PROMPT_FUNCS
.
get
(
self
.
dataset_path
)
formatting_prompt_func
=
self
.
MAPPING_PROMPT_FUNCS
.
get
(
self
.
hf_name
)
if
formatting_prompt_func
is
None
:
raise
ValueError
(
f
"Unsupported dataset path:
{
self
.
dataset_path
}
"
)
raise
ValueError
(
f
"Unsupported dataset path:
{
self
.
hf_name
}
"
)
samples
=
[]
for
i
,
sample
in
enumerate
(
self
.
data
):
sample
=
formatting_prompt_func
(
sample
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment