jerrrrry / infinilm · Commit 6ae48322 (unverified)

Authored Mar 02, 2026 by gongchensu; committed via GitHub on Mar 02, 2026.

    Merge pull request #242 from InfiniTensor/issue/241

    issue/241 fix mmlu test, add vllm support

Parents: e76bb324, 0086ff2f
Changes: 1 changed file with 148 additions and 15 deletions (+148, -15).
test/bench/test_benchmark.py (view file @ 6ae48322)
@@ -4,11 +4,6 @@ import time
 import re
 import csv
 import numpy as np
-import infinicore
-from infinilm.modeling_utils import load_model_state_dict_by_file
-from infinilm.distributed import DistConfig
-from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
-from infinilm.infer_engine import GenerationConfig, InferEngine
 from datasets import load_dataset, Dataset
 from abc import ABC, abstractmethod
@@ -57,6 +52,11 @@ class InfiniLMBenchmark(BaseBenchmark):
         enable_paged_attn=False,
     ):
         import transformers
+        import infinicore
+        from infinilm.modeling_utils import load_model_state_dict_by_file
+        from infinilm.distributed import DistConfig
+        from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
+        from infinilm.infer_engine import InferEngine
         self.benchmark = benchmark
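
Taken together with the first hunk, this moves the infinicore/infinilm imports from module level into the methods that use them, so the script can be started with a backend whose dependencies are installed (torch, or the new vllm) without requiring the InfiniLM stack itself. A minimal sketch of the deferred-import pattern, with an illustrative class name and `json` standing in for the heavy dependency:

    class LazyBackend:
        """Sketch only (not from this commit): defer a heavy import to construction time."""

        def __init__(self):
            # The dependency is imported only when this backend is actually
            # built, so other backends run without it being installed.
            import json  # stand-in for infinicore / infinilm

            self._codec = json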
@@ -103,7 +103,9 @@ class InfiniLMBenchmark(BaseBenchmark):
             )
         elif model_type in ["qwen2", "qwen3"]:
             # For qwen2/qwen3 models: no trust_remote_code (matches jiuge line 534-536)
-            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_dir_path, trust_remote_code=True
+            )
         else:
             # Default: use trust_remote_code=True for other models
             self.tokenizer = transformers.AutoTokenizer.from_pretrained(
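
Worth noting: after this hunk both branches pass `trust_remote_code=True` to `AutoTokenizer.from_pretrained`, so the qwen2/qwen3 case now differs from the default branch only in call formatting, and the "no trust_remote_code" comment above it no longer matches the code.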
@@ -179,6 +181,9 @@ class InfiniLMBenchmark(BaseBenchmark):
         which properly handles KV cache through GenerationMixin.
         """
         # Convert tokens to infinicore format
+        import infinicore
+        from infinilm.infer_engine import GenerationConfig
+
         input_ids_list = [tokens]
         input_ids = infinicore.from_list(input_ids_list)
@@ -370,6 +375,124 @@ class TorchBenchmark(BaseBenchmark):
         print("Torch model destroyed")
 
 
+class VLLMBenchmark(BaseBenchmark):
+    """vLLM backend using vllm.LLM"""
+
+    def __init__(
+        self,
+        model_dir_path,
+        device_type_str="nvidia",
+        tensor_parallel_size=1,
+        benchmark="ceval",
+    ):
+        import transformers
+        from vllm import LLM
+
+        if device_type_str == "cpu":
+            raise ValueError("vLLM backend does not support CPU device type.")
+
+        self.benchmark = benchmark
+
+        # ---- tokenizer ----
+        with open(os.path.join(model_dir_path, "config.json"), "r") as f:
+            import json
+
+            self.config_dict = json.load(f)
+        model_type = self.config_dict.get("model_type", "")
+        if model_type in ["qwen2", "qwen3"]:
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_dir_path, trust_remote_code=True
+            )
+        else:
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_dir_path, trust_remote_code=True
+            )
+
+        eos_token_id = self.config_dict.get("eos_token_id")
+        self.eos_token_id = (
+            [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
+        )
+
+        # ---- vLLM engine ----
+        print("Loading model with vLLM backend...")
+        self.llm = LLM(
+            model=model_dir_path,
+            tensor_parallel_size=tensor_parallel_size,
+            trust_remote_code=True,
+        )
+        print("vLLM model loaded successfully")
+
+    def max_context_len(self):
+        return self.config_dict.get("max_position_embeddings", 2048)
+
+    def render_input_content(self, *args, **kwargs):
+        if self.benchmark == "ceval":
+            return render_ceval(self.tokenizer, *args, **kwargs)
+        elif self.benchmark == "mmlu":
+            return render_mmlu(self.tokenizer, *args, **kwargs)
+        else:
+            raise ValueError(f"Unknown benchmark: {self.benchmark}")
+
+    def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0):
+        input_content = self.render_input_content(*args)
+        print(input_content, end="", flush=True)
+        tokens = self.encode_text(input_content)
+        return self._generate_step(tokens, max_steps, topp_, topk_, temperature_)
+
+    def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
+        from vllm import SamplingParams
+
+        prompt = self.tokenizer.decode(tokens)
+        sampling_params = SamplingParams(
+            max_tokens=max_steps,
+            temperature=temperature_,
+            top_p=topp_,
+            top_k=topk_,
+            stop_token_ids=self.eos_token_id,
+        )
+
+        start_time = time.perf_counter()
+        outputs = self.llm.generate(
+            prompts=[prompt],
+            sampling_params=sampling_params,
+        )
+        end_time = time.perf_counter()
+
+        # ---- post process ----
+        output_text = outputs[0].outputs[0].text
+
+        # ---- stats ----
+        input_tokens = len(tokens)
+        new_tokens = len(self.encode_text(output_text))
+        total_tokens = input_tokens + new_tokens
+        total_time = end_time - start_time
+        throughput = total_tokens / total_time if total_time > 0 else 0.0
+
+        print(output_text)
+        print()
+        print(f"Total time: {total_time * 1000:.2f} ms")
+        print(f"Input tokens: {input_tokens}")
+        print(f"New tokens: {new_tokens}")
+        print(f"Total tokens processed: {total_tokens}")
+        print(f"Throughput: {throughput:.2f} tok/s")
+
+        global TOTAL_TOKENS, TOTAL_TIME
+        TOTAL_TOKENS += total_tokens
+        TOTAL_TIME += total_time
+
+        return output_text
+
+    def destroy_model_instance(self):
+        del self.llm
+        print("vLLM model destroyed")
+
+
 def render_ceval(_tokenizer, conversation):
     """Render C-Eval conversation to input content"""
     return (
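The new class drives vLLM's offline-inference API end to end. A minimal standalone sketch of the same flow, assuming vLLM is installed; the model path and prompt are placeholders:

    from vllm import LLM, SamplingParams

    # Load once; tensor_parallel_size mirrors the benchmark's --ndev flag.
    llm = LLM(model="/path/to/model_dir", tensor_parallel_size=1, trust_remote_code=True)

    # top_k=1 matches the benchmark's default topk_=1, i.e. greedy decoding.
    params = SamplingParams(max_tokens=64, temperature=1.0, top_p=1.0, top_k=1)

    outputs = llm.generate(prompts=["The capital of France is"], sampling_params=params)
    print(outputs[0].outputs[0].text)

One detail in `_generate_step`: `new_tokens` is obtained by re-encoding the decoded output text; vLLM also exposes the generated ids directly (`outputs[0].outputs[0].token_ids`), which would avoid re-tokenization drift in the throughput numbers.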
@@ -397,14 +520,17 @@ def render_mmlu(_tokenizer, question, choices):
     if hasattr(_tokenizer, "apply_chat_template"):
         conversation = [
             {"role": "system", "content": instruction},
-            {"role": "user", "content": f"{question}\n{choices_text}\nAnswer: "},
+            {"role": "user", "content": f"{question}\n{choices_text}\n"},
         ]
         try:
-            return _tokenizer.apply_chat_template(
-                conversation=conversation,
-                add_generation_prompt=True,
-                tokenize=False,
-            )
+            return (
+                _tokenizer.apply_chat_template(
+                    conversation=conversation,
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+                + "The answer is: "
+            )
         except Exception:
             return prompt
 
     return prompt
@@ -663,7 +789,7 @@ def test():
     # Parse arguments manually to handle device flags properly
     if len(sys.argv) < 4:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch|vllm] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)
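
With this change the new backend is selected from the command line like the existing ones, e.g. (model path is a placeholder):

    python test_benchmark.py --nvidia /path/to/model_dir --bench mmlu --backend vllm --ndev 1 --split val --num_samples 10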
@@ -750,7 +876,7 @@ def test():
         device_type_str = "ali"
     else:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon | --ali] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch|vllm] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)
@@ -773,7 +899,10 @@ def test():
     # Create model based on backend (create once, reuse for all subjects)
     if backend == "torch":
+        assert ndev == 1, "Torch backend only supports single-device evaluation"
         model = TorchBenchmark(model_path, device_type_str, benchmark)
+    elif backend == "vllm":
+        model = VLLMBenchmark(model_path, device_type_str, ndev, benchmark)
     else:
         model = InfiniLMBenchmark(
             model_path, device_type_str, ndev, backend, benchmark, enable_paged_attn
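
In the new `vllm` branch, `ndev` is passed as the third positional argument, which `VLLMBenchmark.__init__` receives as `tensor_parallel_size`; `--ndev N` therefore controls vLLM tensor parallelism, while the added assert pins the torch backend to a single device.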
@@ -944,7 +1073,9 @@ def test():
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"] if split == "val" else ["validation", "test"]
+            else ["validation"]
+            if split == "val"
+            else ["validation", "test"]
         )
         # Load each subject individually from hardcoded list, excluding "all"
         for subject_name in mmlu_subjects:
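
The reflowed conditional (repeated in the next hunk) is equivalent to a small lookup table; a sketch, not from the commit:

    # "test" -> ["test"], "val" -> ["validation"]; anything else
    # (e.g. "all") falls through to both splits.
    SPLITS = {"test": ["test"], "val": ["validation"]}
    splits_to_load = SPLITS.get(split, ["validation", "test"])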
@@ -966,7 +1097,9 @@ def test():
         splits_to_load = (
             ["test"]
             if split == "test"
-            else ["validation"] if split == "val" else ["validation", "test"]
+            else ["validation"]
+            if split == "val"
+            else ["validation", "test"]
         )
         records = []
         for sp in splits_to_load: