Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5be1bed7
Unverified
Commit
5be1bed7
authored
Oct 30, 2025
by
Huamin Li
Committed by
GitHub
Oct 30, 2025
Browse files
[CI/Build]Add eval config for Qwen3-235B-A22B-Instruct-2507-FP8 (#27113)
Signed-off-by:
Huamin Li
<
3ericli@gmail.com
>
parent
31b55ffc
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
39 additions
and
4 deletions
+39
-4
.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
...al-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
+14
-0
.buildkite/lm-eval-harness/configs/models-large-h100.txt
.buildkite/lm-eval-harness/configs/models-large-h100.txt
+0
-1
.buildkite/lm-eval-harness/configs/models-large-hopper.txt
.buildkite/lm-eval-harness/configs/models-large-hopper.txt
+1
-0
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+11
-3
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+13
-0
No files found.
.buildkite/lm-eval-harness/configs/Qwen3-235B-A22B-Instruct-2507-FP8.yaml
0 → 100644
View file @
5be1bed7
model_name
:
"
Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
tasks
:
-
name
:
"
mmlu_pro"
metrics
:
-
name
:
"
exact_match,custom-extract"
value
:
0.82
limit
:
250
# will run on 250 * 14 subjects = 3500 samples
num_fewshot
:
5
enforce_eager
:
false
# we use false to speed up the eval process
kv_cache_dtype
:
fp8
# we use fp8 to speed up the eval process
max_model_len
:
40960
apply_chat_template
:
true
fewshot_as_multiturn
:
true
gen_kwargs
:
"
temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
.buildkite/lm-eval-harness/configs/models-large-h100.txt
deleted
100644 → 0
View file @
31b55ffc
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
.buildkite/lm-eval-harness/configs/models-large-hopper.txt
0 → 100644
View file @
5be1bed7
Qwen3-235B-A22B-Instruct-2507-FP8.yaml
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
View file @
5be1bed7
...
@@ -21,10 +21,13 @@ def launch_lm_eval(eval_config, tp_size):
...
@@ -21,10 +21,13 @@ def launch_lm_eval(eval_config, tp_size):
max_model_len
=
eval_config
.
get
(
"max_model_len"
,
4096
)
max_model_len
=
eval_config
.
get
(
"max_model_len"
,
4096
)
batch_size
=
eval_config
.
get
(
"batch_size"
,
"auto"
)
batch_size
=
eval_config
.
get
(
"batch_size"
,
"auto"
)
backend
=
eval_config
.
get
(
"backend"
,
"vllm"
)
backend
=
eval_config
.
get
(
"backend"
,
"vllm"
)
enforce_eager
=
eval_config
.
get
(
"enforce_eager"
,
"true"
)
kv_cache_dtype
=
eval_config
.
get
(
"kv_cache_dtype"
,
"auto"
)
model_args
=
(
model_args
=
(
f
"pretrained=
{
eval_config
[
'model_name'
]
}
,"
f
"pretrained=
{
eval_config
[
'model_name'
]
}
,"
f
"tensor_parallel_size=
{
tp_size
}
,"
f
"tensor_parallel_size=
{
tp_size
}
,"
f
"enforce_eager=true,"
f
"enforce_eager=
{
enforce_eager
}
,"
f
"kv_cache_dtype=
{
kv_cache_dtype
}
,"
f
"add_bos_token=true,"
f
"add_bos_token=true,"
f
"trust_remote_code=
{
trust_remote_code
}
,"
f
"trust_remote_code=
{
trust_remote_code
}
,"
f
"max_model_len=
{
max_model_len
}
,"
f
"max_model_len=
{
max_model_len
}
,"
...
@@ -37,8 +40,13 @@ def launch_lm_eval(eval_config, tp_size):
...
@@ -37,8 +40,13 @@ def launch_lm_eval(eval_config, tp_size):
limit
=
eval_config
[
"limit"
],
limit
=
eval_config
[
"limit"
],
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
# text models. however, this is regressing measured strict-match for
# text models. however, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm.
# existing text models in CI, so only apply it for mm, or when explicitly set
apply_chat_template
=
backend
==
"vllm-vlm"
,
apply_chat_template
=
eval_config
.
get
(
"apply_chat_template"
,
backend
==
"vllm-vlm"
),
fewshot_as_multiturn
=
eval_config
.
get
(
"fewshot_as_multiturn"
,
False
),
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs
=
eval_config
.
get
(
"gen_kwargs"
),
batch_size
=
batch_size
,
batch_size
=
batch_size
,
)
)
return
results
return
results
...
...
.buildkite/test-pipeline.yaml
View file @
5be1bed7
...
@@ -1186,6 +1186,19 @@ steps:
...
@@ -1186,6 +1186,19 @@ steps:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H100 test #####
-
label
:
LM Eval Large Models (H100)
# optional
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_USE_DEEP_GEMM=0
# We found Triton is faster than DeepGEMM for H100
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test #####
##### H200 test #####
-
label
:
Distributed Tests (H200)
# optional
-
label
:
Distributed Tests (H200)
# optional
gpu
:
h200
gpu
:
h200
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment