Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinilm
Commits
515b9245
Commit
515b9245
authored
Oct 10, 2025
by
PanZezhong
Browse files
feat: add ceval test script
parent
ddea3d19
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
158 additions
and
0 deletions
+158
-0
scripts/test_ceval.py
scripts/test_ceval.py
+158
-0
No files found.
scripts/test_ceval.py
0 → 100644
View file @
515b9245
import sys
import time  # used by JiugeForCeval.generate for per-step timing

from jiuge import *  # wildcard kept before `datasets` import to preserve name-resolution order
from datasets import load_dataset
class JiugeForCeval(JiugeForCauslLM):
    """Jiuge causal-LM wrapper specialised for C-Eval multiple-choice runs.

    Adds a `generate` method that appends a Chinese answer-eliciting cue to
    the chat prompt, greedily decodes token by token, and reports average
    per-step latency.
    """

    def __init__(self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU,
                 ndev=1, max_tokens=None):
        """Forward construction to the base model; no extra state is kept."""
        super().__init__(model_dir_path, device, ndev, max_tokens)

    def generate(self, conversation, max_steps, topp_=1.0, topk_=1,
                 temperature_=1.0):
        """Generate an answer for *conversation* and time each decode step.

        Args:
            conversation: chat messages fed to the tokenizer's chat template.
            max_steps: hard cap on the number of decoding steps.
            topp_, topk_, temperature_: sampling parameters forwarded to
                InferTask (defaults give greedy decoding).

        Returns:
            Tuple ``(output_content, avg_time)``: the decoded text and the
            average milliseconds per step, with the first step excluded as
            warm-up (0.0 when fewer than two steps ran).
        """
        # Append "正确答案是" ("the correct answer is") so the model emits
        # the option letter immediately after the prompt.
        input_content = (
            self.tokenizer.apply_chat_template(
                conversation=conversation,
                add_generation_prompt=True,
                tokenize=False,
            )
            + "正确答案是"
        )
        print(input_content, end="", flush=True)
        tokens = self.tokenizer.encode(input_content)
        infer_task = InferTask(
            0,
            tokens,
            self.max_context_len(),
            temperature_,
            topk_,
            topp_,
            self.eos_token_id,
        )
        infer_task.bind_kvcache(KVCache(self))

        steps = 0
        total_time = 0.0
        output_content = ""
        for step_i in range(max_steps):
            start_time = time.time()
            output_tokens = self.batch_infer_one_round([infer_task])
            end_time = time.time()
            steps += 1
            # Map the token id back to text; "▁" marks a space and
            # "<0x0A>" a newline in the tokenizer's piece vocabulary.
            output_str = (
                self.tokenizer._tokenizer.id_to_token(output_tokens[0])
                .replace("▁", " ")
                .replace("<0x0A>", "\n")
            )
            output_content += output_str
            print(output_str, end="", flush=True)
            if output_tokens[0] in self.eos_token_id:
                break
            infer_task.next(output_tokens[0])
            # First step is treated as warm-up and excluded from timing.
            if step_i > 0:
                total_time += end_time - start_time
        print("\n")
        # Explicit guard instead of the previous `steps - 1 + 1e-9` epsilon,
        # which printed a meaningless huge value when only one step ran.
        timed_steps = steps - 1
        avg_time = total_time * 1000 / timed_steps if timed_steps > 0 else 0.0
        print(f"Time per step: {avg_time:.3f}ms")
        infer_task._kv_cache.drop(self)
        return output_content, avg_time
def test():
    """Run the C-Eval multiple-choice benchmark against a Jiuge model.

    Command line: ``python test_ceval.py <device-flag> <model_dir> [n_device]``.
    Prints each sample's generation, then a per-sample verdict and the
    overall accuracy.
    """
    usage_msg = "Usage: python test_ceval.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device]"
    if len(sys.argv) < 3:
        print(usage_msg)
        sys.exit(1)
    model_path = sys.argv[2]

    # Flag -> device-type dispatch table; replaces a nine-branch if/elif
    # chain that duplicated the usage string.
    device_by_flag = {
        "--cpu": DeviceType.DEVICE_TYPE_CPU,
        "--nvidia": DeviceType.DEVICE_TYPE_NVIDIA,
        "--cambricon": DeviceType.DEVICE_TYPE_CAMBRICON,
        "--ascend": DeviceType.DEVICE_TYPE_ASCEND,
        "--metax": DeviceType.DEVICE_TYPE_METAX,
        "--moore": DeviceType.DEVICE_TYPE_MOORE,
        "--iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR,
        "--kunlun": DeviceType.DEVICE_TYPE_KUNLUN,
        "--hygon": DeviceType.DEVICE_TYPE_HYGON,
    }
    device_type = device_by_flag.get(sys.argv[1])
    if device_type is None:
        print(usage_msg)
        sys.exit(1)

    # https://huggingface.co/datasets/ceval/ceval-exam/tree/main/middle_school_geography
    dataset = load_dataset(r"ceval/ceval-exam", name="middle_school_mathematics")
    # Other subjects that can be swapped in:
    # dataset = load_dataset(r"ceval/ceval-exam", name="high_school_history")
    # dataset = load_dataset(r"ceval/ceval-exam", name="high_school_chinese")
    # dataset = load_dataset(r"ceval/ceval-exam", name="high_school_physics")
    # dataset = load_dataset(r"ceval/ceval-exam", name="middle_school_geography")
    # dataset = load_dataset(r"ceval/ceval-exam", name="middle_school_physics")
    samples = dataset["val"]
    ndev = int(sys.argv[3]) if len(sys.argv) > 3 else 1
    model = JiugeForCeval(model_path, device_type, ndev)

    answers_list = []
    try:
        for sample in samples:
            input_content = (
                f"'question': {sample['question']},'A': {sample['A']},"
                f" 'B': {sample['B']}, 'C': {sample['C']},'D': {sample['D']}。"
            )
            conversation = [
                {
                    "role": "system",
                    "content": "请从question的A,B,C,D四个选项中选择正确的选项。例如,标准答案:A。",
                },
                {"role": "user", "content": input_content},
            ]
            answer = sample["answer"]
            output_content, avg_time = model.generate(
                conversation, 500, topp_=1.0, topk_=1, temperature_=1.0
            )
            print("标准答案:", answer)
            answers_list.append(
                {
                    "id": sample["id"],
                    "output_content": output_content,
                    "answer": answer,
                }
            )
    finally:
        # Release model resources even if a generation step raises.
        model.destroy_model_instance()

    print("-------------------------------------------------------------")
    true_num = 0
    all_num = 0
    for cont in answers_list:
        sample_id = cont["id"]  # renamed from `id` to avoid shadowing the builtin
        output = cont["output_content"]
        answer = cont["answer"]
        all_num += 1
        # The prompt elicits the option letter immediately, so only the
        # first two generated characters are inspected.
        if answer in output[:2]:
            true_num += 1
            print(f"id {sample_id}: ", "正确")
        else:
            print(f"id {sample_id}: ", "错误")
    # Guard against an empty sample set (previously a ZeroDivisionError).
    accuracy = true_num / all_num if all_num else 0.0
    print(f"成绩:{true_num}/{all_num}", accuracy)
# Script entry point: run the C-Eval evaluation when executed directly.
if __name__ == "__main__":
    test()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment