jerrrrry / infinilm / Commits / 9e434492

Commit 9e434492, authored Dec 18, 2025 by PanZezhong

    issue/140: accuracy script supports the torch backend; add total throughput

Parent: 7862a723
Showing 4 changed files with 370 additions and 126 deletions (+370, -126):

- csrc/engine/rank_worker.cpp (+1, -0)
- csrc/models/llama/llama_for_causal_lm.cpp (+0, -7)
- python/infinilm/generation/utils.py (+13, -4)
- test/bench/test_benchmark.py (+356, -115)
csrc/engine/rank_worker.cpp

@@ -251,6 +251,7 @@ void RankWorker::thread_loop() {
     }
     else if (local_cmd == Command::RUN) {
         try {
             auto out = model_->forward(local_args);
+            infinicore::context::syncStream();
             {
                 std::lock_guard<std::mutex> lk(mutex_);
csrc/models/llama/llama_for_causal_lm.cpp

@@ -34,13 +34,6 @@ infinicore::Tensor LlamaForCausalLM::forward(const infinicore::Tensor &input_ids
     // 2. Apply language modeling head to get logits
     auto logits = lm_head_->forward(hidden_states);
-
-    // 3. CRITICAL: Synchronize the C++ backend's context after forward pass
-    // This ensures all C++ backend operations complete before returning to Python
-    if (device_.getType() != infinicore::Device::Type::CPU) {
-        infinicore::context::setDevice(device_, false);
-        infinicore::context::syncStream();
-    }
     return logits;
 }
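Taken together, the two C++ hunks relocate the stream synchronization: instead of LlamaForCausalLM::forward syncing on every call, RankWorker now syncs once after model_->forward() in its RUN path, so wall-clock timings taken around generation cover completed device work. A minimal Python sketch of the same synchronize-before-timing pattern (illustrative only, assuming an HF-style model and a CUDA tensor; not part of the commit):

    import time
    import torch

    def timed_generate(model, input_ids, **gen_kwargs):
        # Kernel launches are asynchronous on CUDA: without a synchronize,
        # perf_counter() would stop before the device actually finishes.
        if input_ids.is_cuda:
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        output = model.generate(input_ids=input_ids, **gen_kwargs)
        if input_ids.is_cuda:
            torch.cuda.synchronize()
        return output, time.perf_counter() - t0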
python/infinilm/generation/utils.py

@@ -251,22 +251,31 @@ class GenerationMixin:
                 output_content += output_str
                 end_time = time.time()
-                time_list.append((end_time - start_time) * 1000)
+                time_list.append((end_time - start_time))
                 print(output_str, end="", flush=True)
                 if stop_on_eos and token_id in eos_token_id_list:
                     break
         print("\n</s>")
-        print(f"\n\n\nGeneration completed in {round(sum(time_list), 2)} ms")
+        print(f"\n\n\nGeneration completed in {round(sum(time_list) * 1000, 2)} ms")
         print(
             f" Batchsize={batch_size} Per_Batch_Input_Len={seq_len} Per_Batch_New_Tokens={len(time_list)}\n"
         )
         print(
-            f" Prefill TTFT: {round(time_list[0], 2)} ms Throughput: {round((1000 * batch_size * seq_len) / time_list[0], 2)} tok/s\n",
+            f" Prefill TTFT: {round(time_list[0], 2)} ms Throughput: {round((batch_size * seq_len) / time_list[0], 2)} tok/s\n",
         )
         if len(time_list) > 1:
             print(
-                f" Decode Avg ITL: {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} ms Throughput: {round((1000 * batch_size * (len(time_list) - 1)) / sum(time_list[1:]), 2)} tok/s\n",
+                f" Decode Avg ITL: {round(sum(time_list[1:]) * 1000 / (len(time_list) - 1), 2)} ms Throughput: {round((batch_size * (len(time_list) - 1)) / sum(time_list[1:]), 2)} tok/s\n",
             )
-        return output_tokens_list, output_content
+        return {
+            "output_token_ids": output_tokens_list,
+            "output_content": output_content,
+            "total_latency": sum(time_list),
+            "prefill_latency": time_list[0],
+            "decode_latency": sum(time_list[1:]),
+            "total_input_tokens": batch_size * seq_len,
+            "total_output_tokens": len(time_list),
+        }
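With this hunk, generate() returns a stats dict instead of the old (output_token_ids, output_content) tuple, and time_list now holds seconds rather than milliseconds, which is why the summary prints multiply by 1000. A self-contained sketch of consuming that dict (key names taken from the diff above; the numbers are made up):

    def summarize(result):
        # Keys as introduced above; latencies are in seconds after this commit.
        tokens = result["total_input_tokens"] + result["total_output_tokens"]
        total = result["total_latency"]
        tput = tokens / total if total > 0 else 0.0
        return f"{tokens} tok in {total * 1000:.2f} ms -> {tput:.2f} tok/s"

    print(summarize({
        "output_token_ids": [[1, 2, 3]],
        "output_content": "...",
        "total_latency": 1.25,
        "prefill_latency": 0.20,
        "decode_latency": 1.05,
        "total_input_tokens": 128,
        "total_output_tokens": 64,
    }))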
test/bench/test_benchmark.py

@@ -13,6 +13,10 @@ from infinilm.distributed import DistConfig
 from abc import ABC, abstractmethod

+TOTAL_TOKENS = 0
+TOTAL_TIME = 0.0
+

 class BaseBenchmark(ABC):
     """Base class for benchmark evaluation with common tokenizer and generation utilities"""
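The two module-level counters added here accumulate tokens and latency across every generation call in the process; updating them from inside a function requires a global declaration, as both backends below do. A self-contained sketch of the pattern (illustrative, not part of the commit):

    TOTAL_TOKENS = 0
    TOTAL_TIME = 0.0

    def record(tokens, seconds):
        global TOTAL_TOKENS, TOTAL_TIME  # rebind the module-level counters
        TOTAL_TOKENS += tokens
        TOTAL_TIME += seconds

    record(192, 1.25)
    record(64, 0.40)
    print(f"{TOTAL_TOKENS / TOTAL_TIME:.2f} tok/s overall")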
@@ -43,7 +47,14 @@ class BaseBenchmark(ABC):
 class InfiniLMBenchmark(BaseBenchmark):
     """Wrapper class for InfiniLM cpp backend for benchmark evaluation"""

-    def __init__(self, model_dir_path, device_type_str="cpu", ndev=1, backend="cpp", benchmark="ceval"):
+    def __init__(
+        self,
+        model_dir_path,
+        device_type_str="cpu",
+        ndev=1,
+        backend="cpp",
+        benchmark="ceval",
+    ):
         import transformers

         self.benchmark = benchmark
@@ -71,6 +82,7 @@ class InfiniLMBenchmark(BaseBenchmark):
         # Load config and tokenizer
         with open(os.path.join(model_dir_path, "config.json"), "r") as f:
             import json
+
             self.config_dict = json.load(f)
         # Align tokenizer initialization with jiuge backend (010)
@@ -78,7 +90,9 @@ class InfiniLMBenchmark(BaseBenchmark):
         model_type = self.config_dict.get("model_type", "")
         if model_type == "llama":
             # For llama models: no trust_remote_code (matches jiuge line 465)
-            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_dir_path, trust_remote_code=True
+            )
         elif model_type in ["fm9g", "minicpm", "fm9g7b"]:
             # For fm9g/minicpm/fm9g7b models: use trust_remote_code=True (matches jiuge lines 493-495, 518-520)
             self.tokenizer = transformers.AutoTokenizer.from_pretrained(
@@ -126,48 +140,12 @@ class InfiniLMBenchmark(BaseBenchmark):
     def render_input_content(self, *args, **kwargs):
         """Render input content based on benchmark type"""
         if self.benchmark == "ceval":
-            return self._render_ceval(*args, **kwargs)
+            return render_ceval(self.tokenizer, *args, **kwargs)
         elif self.benchmark == "mmlu":
-            return self._render_mmlu(*args, **kwargs)
+            return render_mmlu(self.tokenizer, *args, **kwargs)
         else:
             raise ValueError(f"Unknown benchmark: {self.benchmark}")

-    def _render_ceval(self, conversation):
-        """Render C-Eval conversation to input content"""
-        return (
-            self.tokenizer.apply_chat_template(
-                conversation=conversation,
-                add_generation_prompt=True,
-                tokenize=False,
-            )
-            + "正确答案是"
-        )
-
-    def _render_mmlu(self, question, choices):
-        """Render MMLU question and choices to input content"""
-        choices_text = "\n".join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)])
-        instruction = (
-            "You are a multiple-choice question solver. "
-            "Select the correct option and respond with only the letter A, B, C, or D."
-        )
-        prompt = f"{instruction}\n\nQuestion: {question}\n{choices_text}\nAnswer:"
-        # Use chat template if available, otherwise return plain text
-        if hasattr(self.tokenizer, 'apply_chat_template'):
-            conversation = [
-                {"role": "system", "content": instruction},
-                {"role": "user", "content": f"{question}\n{choices_text}\nAnswer:"}
-            ]
-            try:
-                return self.tokenizer.apply_chat_template(
-                    conversation=conversation,
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-            except Exception:
-                return prompt
-        return prompt

     def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0):
         """Generate response based on benchmark type"""
         # Render input content
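The private _render_ceval/_render_mmlu methods become the module-level render_ceval/render_mmlu (defined later in this diff) that take the tokenizer explicitly, so both backends can share them. A quick illustrative call, using a stand-in object to trigger the plain-prompt fallback (hypothetical, not part of the commit):

    class NoTemplateTokenizer:  # hypothetical: lacks apply_chat_template
        pass

    text = render_mmlu(NoTemplateTokenizer(), "What is 2 + 2?", ["3", "4", "5", "6"])
    print(text)
    # -> "You are a multiple-choice question solver. ...\n\nQuestion: What is 2 + 2?\nA. 3\nB. 4\nC. 5\nD. 6\nAnswer:"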
@@ -178,11 +156,11 @@ class InfiniLMBenchmark(BaseBenchmark):
         tokens = self.encode_text(input_content)

         # Delegate to backend-specific generation implementation
-        output_content, avg_time = self._generate_step(
+        output_content = self._generate_step(
             tokens, max_steps, topp_, topk_, temperature_
         )
-        return output_content, avg_time
+        return output_content

     def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
         """
@@ -199,7 +177,7 @@ class InfiniLMBenchmark(BaseBenchmark):
         # Use model's built-in generate() method which properly handles KV cache
         # Pass sampling parameters (temperature, topk, topp) via kwargs
-        output_tokens_list, output_content = self.model.generate(
+        result = self.model.generate(
             input_ids=input_ids,
             max_new_tokens=max_steps,
             tokenizer=self.tokenizer,
@@ -208,14 +186,11 @@ class InfiniLMBenchmark(BaseBenchmark):
             topk=topk_,
             topp=topp_,
         )
+        global TOTAL_TOKENS, TOTAL_TIME
+        TOTAL_TIME += result["total_latency"]
+        TOTAL_TOKENS += result["total_input_tokens"] + result["total_output_tokens"]
-        # Calculate average time (GenerationMixin doesn't return timing info)
-        # We'll use a placeholder since the timing info isn't available
         print("\n")
-        avg_time = 0.0  # GenerationMixin doesn't expose per-step timing
-        print(f"Time per step: N/A (using GenerationMixin.generate)")
-        return output_content, avg_time
+        return result["output_content"]

     def destroy_model_instance(self):
         # Cleanup if needed
@@ -223,6 +198,173 @@ class InfiniLMBenchmark(BaseBenchmark):
         print("Model destroyed")


+class TorchBenchmark(BaseBenchmark):
+    """Torch backend using HuggingFace Transformers"""
+
+    def __init__(self, model_dir_path, device_type_str="cpu", benchmark="ceval"):
+        import torch
+        import transformers
+
+        self.benchmark = benchmark
+        # Device
+        if device_type_str == "nvidia":
+            self.device = torch.device("cuda")
+        elif device_type_str == "cpu":
+            self.device = torch.device("cpu")
+        else:
+            raise ValueError(
+                f"Torch backend unsupported device type: {device_type_str}"
+            )
+        # Load tokenizer
+        with open(os.path.join(model_dir_path, "config.json"), "r") as f:
+            import json
+
+            self.config_dict = json.load(f)
+        model_type = self.config_dict.get("model_type", "")
+        if model_type in ["fm9g", "minicpm", "fm9g7b"]:
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_dir_path, trust_remote_code=True
+            )
+        else:
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_dir_path, trust_remote_code=True
+            )
+        # Load model
+        print("Loading model with torch backend...")
+        self.model = transformers.AutoModelForCausalLM.from_pretrained(
+            model_dir_path,
+            torch_dtype=torch.bfloat16 if self.device.type == "cuda" else torch.float32,
+            trust_remote_code=True,
+        ).to(self.device)
+        self.model.eval()
+        print("Torch model loaded successfully")
+        eos_token_id = self.config_dict.get("eos_token_id")
+        self.eos_token_id = (
+            [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
+        )
+
+    def max_context_len(self):
+        return self.config_dict.get("max_position_embeddings", 2048)
+
+    def render_input_content(self, *args, **kwargs):
+        if self.benchmark == "ceval":
+            return render_ceval(self.tokenizer, *args, **kwargs)
+        elif self.benchmark == "mmlu":
+            return render_mmlu(self.tokenizer, *args, **kwargs)
+        else:
+            raise ValueError(f"Unknown benchmark: {self.benchmark}")
+
+    def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
+        import torch
+        import time
+
+        input_ids = torch.tensor([tokens], device=self.device)
+        if self.device.type == "cuda":
+            torch.cuda.synchronize()
+        start_time = time.perf_counter()
+        outputs = self.model.generate(
+            input_ids=input_ids,
+            max_new_tokens=max_steps,
+            do_sample=temperature_ > 0,
+            temperature=temperature_,
+            top_k=topk_,
+            top_p=topp_,
+            eos_token_id=self.eos_token_id,
+            pad_token_id=2,
+        )
+        # --- end sync ---
+        if self.device.type == "cuda":
+            torch.cuda.synchronize()
+        end_time = time.perf_counter()
+        # ---- post process ----
+        generated_ids = outputs[0][len(tokens):]
+        output_text = self.tokenizer.decode(generated_ids)
+        # ---- stats ----
+        input_tokens = len(tokens)
+        new_tokens = generated_ids.numel()
+        total_tokens = input_tokens + new_tokens
+        total_time = end_time - start_time
+        throughput = total_tokens / total_time if total_time > 0 else 0.0
+        print(output_text)
+        print()
+        print(f"Total time: {total_time * 1000:.2f} ms")
+        print(f"Input tokens: {input_tokens}")
+        print(f"New tokens: {new_tokens}")
+        print(f"Total tokens processed: {total_tokens}")
+        print(f"Throughput: {throughput:.2f} tok/s")
+        global TOTAL_TOKENS, TOTAL_TIME
+        TOTAL_TOKENS += total_tokens
+        TOTAL_TIME += total_time
+        return output_text
+
+    def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0):
+        input_content = self.render_input_content(*args)
+        print(input_content, end="", flush=True)
+        tokens = self.encode_text(input_content)
+        return self._generate_step(tokens, max_steps, topp_, topk_, temperature_)
+
+    def destroy_model_instance(self):
+        del self.model
+        print("Torch model destroyed")
+
+
+def render_ceval(_tokenizer, conversation):
+    """Render C-Eval conversation to input content"""
+    return (
+        _tokenizer.apply_chat_template(
+            conversation=conversation,
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        + "正确答案是"
+    )
+
+
+def render_mmlu(_tokenizer, question, choices):
+    """Render MMLU question and choices to input content"""
+    choices_text = "\n".join(
+        [f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)]
+    )
+    instruction = (
+        "You are a multiple-choice question solver. "
+        "Select the correct option and respond with only the letter A, B, C, or D."
+    )
+    prompt = f"{instruction}\n\nQuestion: {question}\n{choices_text}\nAnswer:"
+    # Use chat template if available, otherwise return plain text
+    if hasattr(_tokenizer, "apply_chat_template"):
+        conversation = [
+            {"role": "system", "content": instruction},
+            {"role": "user", "content": f"{question}\n{choices_text}\nAnswer:"},
+        ]
+        try:
+            return _tokenizer.apply_chat_template(
+                conversation=conversation,
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+        except Exception:
+            return prompt
+    return prompt
+
+
 def extract_answer_ceval(output_content, answer):
     """Extract predicted answer from C-Eval output"""
     output_upper = output_content.upper().strip()
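For context, a hypothetical driver showing how the new TorchBenchmark is used (the model path is a placeholder; test() below constructs it the same way when --backend torch is given):

    bench = TorchBenchmark("/path/to/model_dir", device_type_str="cpu", benchmark="mmlu")
    output = bench.generate(
        "What is 2 + 2?",
        ["3", "4", "5", "6"],
        max_steps=8,
        topp_=1.0,
        topk_=1,
        temperature_=1.0,
    )
    print(output)
    bench.destroy_model_instance()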
@@ -238,7 +380,7 @@ def extract_answer_mmlu(output_content):
     # Find first meaningful token
     match = re.search(r"\b([ABCD])\b", output_upper)
     if match:
-        return ord(match.group(1)) - ord('A')
+        return ord(match.group(1)) - ord("A")
     else:
         match_num = re.search(r"\b([0-3])\b", output_upper)
         if match_num:
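The quote change above is cosmetic; the word-boundary match itself is unchanged. Illustrative inputs for the letter branch (the digit branch handles the last case in the real code):

    import re

    for text in ["The answer is B.", "(c)", "answer: 2"]:
        m = re.search(r"\b([ABCD])\b", text.upper().strip())
        print(text, "->", ord(m.group(1)) - ord("A") if m else None)
    # The answer is B. -> 1
    # (c) -> 2
    # answer: 2 -> None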
@@ -260,44 +402,63 @@ def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=Non
                 {"role": "user", "content": input_content},
             ]
             answer = sample["answer"]
-            output_content, avg_time = model.generate(
-                conversation, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
+            output_content = model.generate(
+                conversation,
+                max_steps=max_new_tokens,
+                topp_=1.0,
+                topk_=1,
+                temperature_=1.0,
             )
             is_correct = extract_answer_ceval(output_content, answer)
-            answers_list.append({"id": sample.get("id", idx), "output_content": output_content, "answer": answer, "is_correct": is_correct, "subject": subject_name})
+            answers_list.append(
+                {
+                    "id": sample.get("id", idx),
+                    "output_content": output_content,
+                    "answer": answer,
+                    "is_correct": is_correct,
+                    "subject": subject_name,
+                }
+            )
             if benchmark == "ceval":
                 print("标准答案:", answer)
         elif benchmark == "mmlu":
-            question = sample['question']
-            choices = sample['choices']
-            answer_idx = sample['answer']  # MMLU answer is 0-3 index
-            output_content, avg_time = model.generate(
-                question, choices, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
+            question = sample["question"]
+            choices = sample["choices"]
+            answer_idx = sample["answer"]  # MMLU answer is 0-3 index
+            output_content = model.generate(
+                question,
+                choices,
+                max_steps=max_new_tokens,
+                topp_=1.0,
+                topk_=1,
+                temperature_=1.0,
+            )
             predicted_answer = extract_answer_mmlu(output_content)
             # Convert answer index to letter for display
             answer_letter = chr(65 + answer_idx) if answer_idx < 4 else "?"
-            predicted_letter = chr(65 + predicted_answer) if predicted_answer is not None and predicted_answer < 4 else "?"
+            predicted_letter = (
+                chr(65 + predicted_answer)
+                if predicted_answer is not None and predicted_answer < 4
+                else "?"
+            )
-            print(f"Sample {idx}: Correct answer: {answer_letter} ({answer_idx}), Predicted: {predicted_letter} ({predicted_answer})")
+            print(
+                f"Sample {idx}: Correct answer: {answer_letter} ({answer_idx}), Predicted: {predicted_letter} ({predicted_answer})"
+            )
-            answers_list.append({"id": idx, "output_content": output_content, "answer": answer_idx, "predicted": predicted_answer, "subject": subject_name})
+            answers_list.append(
+                {
+                    "id": idx,
+                    "output_content": output_content,
+                    "answer": answer_idx,
+                    "predicted": predicted_answer,
+                    "subject": subject_name,
+                }
+            )

     # Evaluate results for this subject
     true_num = 0
@@ -323,8 +484,14 @@ def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=Non
                 print(f"id {id}: Correct")
             else:
                 answer_letter = chr(65 + answer) if answer < 4 else "?"
-                predicted_letter = chr(65 + predicted) if predicted is not None and predicted < 4 else "?"
-                print(f"id {id}: Wrong (correct: {answer_letter}, predicted: {predicted_letter})")
+                predicted_letter = (
+                    chr(65 + predicted)
+                    if predicted is not None and predicted < 4
+                    else "?"
+                )
+                print(
+                    f"id {id}: Wrong (correct: {answer_letter}, predicted: {predicted_letter})"
+                )

     accuracy = true_num / all_num if all_num > 0 else 0.0
     if benchmark == "ceval":
@@ -337,7 +504,7 @@ def evaluate_samples(model, samples, benchmark, max_new_tokens, subject_name=Non
         "correct": true_num,
         "total": all_num,
         "accuracy": accuracy,
-        "answers_list": answers_list
+        "answers_list": answers_list,
     }
@@ -360,9 +527,13 @@ def _load_ceval_from_cache(cache_dir, subject_name, split, ceval_subjects):
                 lower = fname.lower()
                 if split == "test" and "test" not in lower:
                     continue
-                if split == "val" and not any(x in lower for x in ["val", "validation", "dev"]):
+                if split == "val" and not any(
+                    x in lower for x in ["val", "validation", "dev"]
+                ):
                     continue
-                if split == "all" and not any(x in lower for x in ["val", "validation", "dev", "test"]):
+                if split == "all" and not any(
+                    x in lower for x in ["val", "validation", "dev", "test"]
+                ):
                     continue
                 try:
                     ds = Dataset.from_file(os.path.join(root, fname))
@@ -373,7 +544,9 @@ def _load_ceval_from_cache(cache_dir, subject_name, split, ceval_subjects):
                     return records
     # If cache_dir provided and nothing loaded, fail without network
-    raise FileNotFoundError(f"CEval cached data not found for subject '{subject_name}' with splits {split_names}")
+    raise FileNotFoundError(
+        f"CEval cached data not found for subject '{subject_name}' with splits {split_names}"
+    )


 def _load_mmlu_from_cache(cache_dir, subject_name, split, mmlu_subjects):
@@ -381,13 +554,16 @@ def _load_mmlu_from_cache(cache_dir, subject_name, split, mmlu_subjects):
     Load MMLU data from local cache avoiding network calls.
     Scans cached Arrow files under cache_dir/cais___mmlu and filters by split.
     """

     def load_one(subj):
-        split_names = ["test"] if split == "test" else ["validation", "dev"] if split == "val" else ["validation", "dev", "test"]
+        split_names = (
+            ["test"]
+            if split == "test"
+            else (["validation", "dev"] if split == "val" else ["validation", "dev", "test"])
+        )
         base = os.path.join(cache_dir, "cais___mmlu", subj)
@@ -402,9 +578,13 @@ def _load_mmlu_from_cache(cache_dir, subject_name, split, mmlu_subjects):
                 lower = fname.lower()
                 if split == "test" and "test" not in lower:
                     continue
-                if split == "val" and not any(x in lower for x in ["validation", "dev"]):
+                if split == "val" and not any(
+                    x in lower for x in ["validation", "dev"]
+                ):
                     continue
-                if split == "all" and not any(x in lower for x in ["validation", "dev", "test"]):
+                if split == "all" and not any(
+                    x in lower for x in ["validation", "dev", "test"]
+                ):
                     continue
                 try:
                     ds = Dataset.from_file(os.path.join(root, fname))
@@ -413,7 +593,9 @@ def _load_mmlu_from_cache(cache_dir, subject_name, split, mmlu_subjects):
                     continue
         if records:
             return records
-        raise FileNotFoundError(f"MMLU cached data not found for subject '{subj}' with splits {split_names}")
+        raise FileNotFoundError(
+            f"MMLU cached data not found for subject '{subj}' with splits {split_names}"
+        )

     if subject_name == "all":
         # Use hardcoded list of MMLU subjects, excluding "all"
@@ -436,7 +618,7 @@ def test():
     # Parse arguments manually to handle device flags properly
     if len(sys.argv) < 4:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--split {test|val|all}] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)
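With the usage string updated, a torch-backend run might look like: python test_benchmark.py --nvidia /models/llama --bench mmlu --backend torch --split val --num_samples 10 (model path illustrative).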
@@ -449,7 +631,7 @@ def test():
     ndev = 1
     benchmark = None
     subject = "all"  # Shared for both C-Eval and MMLU, can be comma-separated
-    split = "test" # test | val | all
+    split = "test"  # test | val | all
     num_samples = None
    max_new_tokens = 500
    output_csv = None
@@ -517,7 +699,7 @@ def test():
         device_type_str = "hygon"
     else:
         print(
-            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
+            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp|torch] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N] [--output_csv PATH] [--cache_dir PATH]"
         )
         sys.exit(1)
@@ -539,7 +721,12 @@ def test():
     # Create model based on backend (create once, reuse for all subjects)
     if backend != "010":
-        model = InfiniLMBenchmark(model_path, device_type_str, ndev, backend, benchmark)
+        if backend == "torch":
+            model = TorchBenchmark(model_path, device_type_str, benchmark)
+        else:
+            model = InfiniLMBenchmark(
+                model_path, device_type_str, ndev, backend, benchmark
+            )
     else:
         print(f"test 010 backend by scripts/test_ceval.py")
         exit(0)
@@ -610,13 +797,17 @@ def test():
                 records = []
                 for split_name in ["val", "test"]:
                     try:
-                        ds = load_dataset(r"ceval/ceval-exam", name=subj, split=split_name)
+                        ds = load_dataset(
+                            r"ceval/ceval-exam", name=subj, split=split_name
+                        )
                         records.extend(ds.to_list())
                     except Exception:
                         continue
                 if records:
                     return records
-                raise FileNotFoundError(f"No ceval splits found online for subject {subj}")
+                raise FileNotFoundError(
+                    f"No ceval splits found online for subject {subj}"
+                )
             hf_split = "test" if split == "test" else "val"
             ds = load_dataset(r"ceval/ceval-exam", name=subj, split=hf_split)
             data = ds.to_list()
@@ -630,7 +821,9 @@ def test():
                 return samples, "all"
             else:
                 if subj_name not in ceval_subjects:
-                    raise ValueError(f"Unknown C-Eval subject '{subj_name}'. Available subjects: {', '.join(ceval_subjects)}")
+                    raise ValueError(
+                        f"Unknown C-Eval subject '{subj_name}'. Available subjects: {', '.join(ceval_subjects)}"
+                    )
                 return _load_ceval_subject(subj_name), subj_name
         elif benchmark == "mmlu":
@@ -700,35 +893,47 @@ def test():
                 return _load_mmlu_from_cache(cache_dir, subj, split, mmlu_subjects)
             if subj == "all":
                 samples = []
-                splits_to_load = ["test"] if split == "test" else ["validation"] if split == "val" else ["validation", "test"]
+                splits_to_load = (
+                    ["test"]
+                    if split == "test"
+                    else ["validation"] if split == "val" else ["validation", "test"]
+                )
                 # Load each subject individually from hardcoded list, excluding "all"
                 for subject_name in mmlu_subjects:
                     for sp in splits_to_load:
                         try:
                             dataset = load_dataset("cais/mmlu", subject_name, split=sp)
-                            if hasattr(dataset, 'to_list'):
+                            if hasattr(dataset, "to_list"):
                                 samples.extend(dataset.to_list())
                             else:
                                 samples.extend(list(dataset))
                         except Exception:
                             continue
                 if not samples:
-                    raise FileNotFoundError(f"No MMLU data found for any subject in the list")
+                    raise FileNotFoundError(
+                        f"No MMLU data found for any subject in the list"
+                    )
                 return samples, "all"
             else:
-                splits_to_load = ["test"] if split == "test" else ["validation"] if split == "val" else ["validation", "test"]
+                splits_to_load = (
+                    ["test"]
+                    if split == "test"
+                    else ["validation"] if split == "val" else ["validation", "test"]
+                )
                 records = []
                 for sp in splits_to_load:
                     try:
                         dataset = load_dataset("cais/mmlu", subj, split=sp)
-                        if hasattr(dataset, 'to_list'):
+                        if hasattr(dataset, "to_list"):
                             records.extend(dataset.to_list())
                         else:
                             records.extend(list(dataset))
                     except Exception:
                         continue
                 if not records:
-                    raise FileNotFoundError(f"MMLU subject {subj} split(s) {splits_to_load} not found")
+                    raise FileNotFoundError(
+                        f"MMLU subject {subj} split(s) {splits_to_load} not found"
+                    )
                 return records, subj

         def load_subject_samples(subj_name):
@@ -759,7 +964,9 @@ def test():
         if num_samples is not None and num_samples > 0:
             original_count = len(samples)
             samples = samples[:num_samples]
-            print(f"Limited to {len(samples)} samples for validation (from {original_count} total)")
+            print(
+                f"Limited to {len(samples)} samples for validation (from {original_count} total)"
+            )

         # Test with first sample if available
         if len(samples) > 0:
@@ -773,17 +980,34 @@ def test():
                     },
                     {"role": "user", "content": input_content},
                 ]
-                test_output, _ = model.generate(
-                    test_conversation, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
+                test_output = model.generate(
+                    test_conversation,
+                    max_steps=max_new_tokens,
+                    topp_=1.0,
+                    topk_=1,
+                    temperature_=1.0,
                 )
             elif benchmark == "mmlu":
-                question = sample['question']
-                choices = sample['choices']
-                test_output, _ = model.generate(
-                    question, choices, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
+                question = sample["question"]
+                choices = sample["choices"]
+                test_output = model.generate(
+                    question,
+                    choices,
+                    max_steps=max_new_tokens,
+                    topp_=1.0,
+                    topk_=1,
+                    temperature_=1.0,
+                )
             print(f"\nTest output: {test_output}\n")

         # Evaluate samples for this subject
-        result = evaluate_samples(model, samples, benchmark, max_new_tokens, actual_subj_name)
+        result = evaluate_samples(
+            model, samples, benchmark, max_new_tokens, actual_subj_name
+        )
         all_results.append(result)
-        print(f"\nSubject '{actual_subj_name}' completed: {result['correct']}/{result['total']} = {result['accuracy']:.2%}")
+        print(
+            f"\nSubject '{actual_subj_name}' completed: {result['correct']}/{result['total']} = {result['accuracy']:.2%}"
+        )
     except Exception as e:
         print(f"Error evaluating subject '{subj}': {e}")
@@ -792,27 +1016,44 @@ def test():
         model.destroy_model_instance()

     # Calculate overall results
-    overall_correct = sum(r['correct'] for r in all_results)
-    overall_total = sum(r['total'] for r in all_results)
+    overall_correct = sum(r["correct"] for r in all_results)
+    overall_total = sum(r["total"] for r in all_results)
     overall_accuracy = overall_correct / overall_total if overall_total > 0 else 0.0

     print(f"\n{'='*60}")
     print("OVERALL RESULTS")
     print(f"{'='*60}")
     if benchmark == "ceval":
-        print(f"Overall 成绩: {overall_correct}/{overall_total} = {overall_accuracy:.2%}")
+        print(
+            f"Overall 成绩: {overall_correct}/{overall_total} = {overall_accuracy:.2%}"
+        )
     else:
-        print(f"Overall Accuracy: {overall_correct}/{overall_total} = {overall_accuracy:.2%}")
+        print(
+            f"Overall Accuracy: {overall_correct}/{overall_total} = {overall_accuracy:.2%}"
+        )
+    print(f"Total Latency: {TOTAL_TIME} seconds")
+    print(f"Total Tokens Processed: {TOTAL_TOKENS} tokens")
+    print(f"Overall Throughput: {TOTAL_TOKENS / TOTAL_TIME:.2f} tokens/s")

     # Write CSV if output path is specified
     if output_csv:
         print(f"\nWriting results to CSV: {output_csv}")
-        with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
+        with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
             writer = csv.writer(csvfile)
-            writer.writerow(['Subject', 'Correct', 'Total', 'Accuracy'])
+            writer.writerow(["Subject", "Correct", "Total", "Accuracy"])
             for result in all_results:
-                writer.writerow([result['subject'], result['correct'], result['total'], f"{result['accuracy']:.4f}"])
-            writer.writerow(['Overall', overall_correct, overall_total, f"{overall_accuracy:.4f}"])
+                writer.writerow(
+                    [
+                        result["subject"],
+                        result["correct"],
+                        result["total"],
+                        f"{result['accuracy']:.4f}",
+                    ]
+                )
+            writer.writerow(
+                ["Overall", overall_correct, overall_total, f"{overall_accuracy:.4f}"]
+            )
         print(f"CSV file written successfully: {output_csv}")
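One caveat worth noting about the new summary lines: they divide by TOTAL_TIME unconditionally, so if every subject fails before any generation runs, TOTAL_TIME stays 0.0 and the throughput print raises ZeroDivisionError. A defensive variant (illustrative; not what the commit does):

    overall_tput = TOTAL_TOKENS / TOTAL_TIME if TOTAL_TIME > 0 else 0.0
    print(f"Overall Throughput: {overall_tput:.2f} tokens/s")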