Commit 9710f718 (Unverified)
Authored Sep 30, 2025 by Liangsheng Yin; committed by GitHub on Sep 30, 2025

[Eval] Add `--repeat` in `run_eval` (#11101)

Parent: 91847e38
Showing 1 changed file with 46 additions and 11 deletions.

python/sglang/test/run_eval.py (+46 / -11)
@@ -10,11 +10,29 @@ import time
 from sglang.test.simple_eval_common import (
     ChatCompletionSampler,
+    Eval,
     make_report,
     set_ulimit,
 )


+def run_eval_once(args, base_url: str, eval_obj: Eval) -> dict:
+    sampler = ChatCompletionSampler(
+        model=args.model,
+        max_tokens=getattr(args, "max_tokens", 2048),
+        base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
+        reasoning_effort=getattr(args, "reasoning_effort", None),
+    )
+
+    # Run eval
+    tic = time.perf_counter()
+    result = eval_obj(sampler)
+    latency = time.perf_counter() - tic
+
+    return result, latency, sampler
+
+
 def run_eval(args):
     set_ulimit()
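The new helper packages one full evaluation pass: build a `ChatCompletionSampler` against the target server, time a single call of the eval object, and hand back `(result, latency, sampler)` so `run_eval` can either run it once or fan it out. A minimal, self-contained sketch of that contract, using stub classes in place of `ChatCompletionSampler` and the real `Eval` subclasses (the stubs and their names are illustrative assumptions, not repo code):

```python
import time
from dataclasses import dataclass, field


@dataclass
class StubResult:
    score: float
    metrics: dict = field(default_factory=dict)


class StubEval:
    """Pretend eval object: 'grades' a fixed batch of prompts."""

    def __call__(self, sampler) -> StubResult:
        answers = [sampler(q) for q in ("q1", "q2", "q3", "q4")]
        correct = sum(a == "yes" for a in answers)
        return StubResult(score=correct / len(answers))


def run_once_stub(eval_obj) -> tuple:
    # Same shape as run_eval_once: build a sampler, time one pass, return all three.
    sampler = lambda prompt: "yes"  # stands in for ChatCompletionSampler(...)
    tic = time.perf_counter()
    result = eval_obj(sampler)
    latency = time.perf_counter() - tic
    return result, latency, sampler


result, latency, _ = run_once_stub(StubEval())
print(f"score={result.score:.3f}  latency={latency * 1e3:.3f} ms")
```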
@@ -68,18 +86,32 @@ def run_eval(args):
     else:
         raise ValueError(f"Invalid eval name: {args.eval_name}")

-    sampler = ChatCompletionSampler(
-        model=args.model,
-        max_tokens=getattr(args, "max_tokens", 2048),
-        base_url=base_url,
-        temperature=getattr(args, "temperature", 0.0),
-        reasoning_effort=getattr(args, "reasoning_effort", None),
-    )
+    if getattr(args, "repeat", 1) == 1:
+        result, latency, sampler = run_eval_once(args, base_url, eval_obj)
+    else:
+        from concurrent.futures import ThreadPoolExecutor

-    # Run eval
-    tic = time.perf_counter()
-    result = eval_obj(sampler)
-    latency = time.perf_counter() - tic
+        executor = ThreadPoolExecutor(max_workers=args.repeat)
+        futures = [
+            executor.submit(run_eval_once, args, base_url, eval_obj)
+            for _ in range(args.repeat)
+        ]
+        scores_repeat = []
+        for f in futures:
+            result, latency, sampler = f.result()
+            scores_repeat.append(result.score)
+
+        mean_score = sum(scores_repeat) / len(scores_repeat)
+        scores_repeat = [f"{s:.3f}" for s in scores_repeat]
+        print("=" * 20)
+        print(f"Repeat: {args.repeat}, mean: {mean_score:.3f}")
+        print(f"Scores: {scores_repeat}")
+        print("=" * 20)
+        executor.shutdown()

     # Dump reports
     metrics = result.metrics | {"score": result.score}
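The `--repeat` path is a straightforward thread-pool fan-out: the same `run_eval_once` job is submitted `args.repeat` times, per-run scores are collected from the futures, and the mean is printed alongside the individual scores. A self-contained sketch of that pattern (the stub job and the fixed `repeat = 4` are assumptions for illustration; in the commit the job is `run_eval_once(args, base_url, eval_obj)`):

```python
import random
from concurrent.futures import ThreadPoolExecutor


def run_eval_once_stub() -> float:
    # Stands in for run_eval_once(args, base_url, eval_obj); returns only a score.
    return random.uniform(0.70, 0.75)


repeat = 4
executor = ThreadPoolExecutor(max_workers=repeat)
futures = [executor.submit(run_eval_once_stub) for _ in range(repeat)]

scores = [f.result() for f in futures]
mean_score = sum(scores) / len(scores)

print("=" * 20)
print(f"Repeat: {repeat}, mean: {mean_score:.3f}")
print(f"Scores: {[f'{s:.3f}' for s in scores]}")
print("=" * 20)
executor.shutdown()
```

Since every repeat builds its own sampler and runs in its own worker thread, the repeated evaluations hit the server concurrently rather than back to back.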
@@ -125,6 +157,9 @@ if __name__ == "__main__":
         type=str,
         help="Name or path of the model. If not set, the default model will request /v1/models for conf.",
     )
+    parser.add_argument(
+        "--repeat", type=int, default=1, help="repeat the evaluation n times"
+    )
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
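Note that `run_eval` reads the flag via `getattr(args, "repeat", 1)` rather than `args.repeat`, so a caller that builds its own `argparse.Namespace` without the new argument keeps the old single-run behaviour. A small sketch of that fallback; the legacy caller below is hypothetical:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--repeat", type=int, default=1, help="repeat the evaluation n times"
)

# Parsed from the CLI: the attribute exists, so getattr returns its value.
args = parser.parse_args(["--repeat", "4"])
print(getattr(args, "repeat", 1))  # -> 4

# Hypothetical older caller that constructs args by hand and predates --repeat:
legacy_args = argparse.Namespace(eval_name="mmlu", num_examples=None)
print(getattr(legacy_args, "repeat", 1))  # -> 1 (falls back to a single run)
```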