Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bcf43ab1
Unverified
Commit
bcf43ab1
authored
Dec 04, 2025
by
Zhewen Li
Committed by
GitHub
Dec 04, 2025
Browse files
[CI/Build][AMD] Add Llama4 Maverick FP8 to AMD CI (#28695)
Signed-off-by:
zhewenli
<
zhewenli@meta.com
>
parent
4470ee2f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
158 additions
and
112 deletions
+158
-112
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
.../configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+1
-0
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
+1
-0
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+55
-20
.buildkite/test-amd.yaml
.buildkite/test-amd.yaml
+101
-92
No files found.
.buildkite/lm-eval-harness/configs/Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
View file @
bcf43ab1
...
...
@@ -8,3 +8,4 @@ tasks:
value
:
0.80
limit
:
250
# will run on 250 * 14 subjects = 3500 samples
num_fewshot
:
5
rtol
:
0.05
.buildkite/lm-eval-harness/configs/models-large-rocm.txt
0 → 100644
View file @
bcf43ab1
Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
.buildkite/lm-eval-harness/test_lm_eval_correctness.py
View file @
bcf43ab1
...
...
@@ -9,11 +9,40 @@ pytest -s -v test_lm_eval_correctness.py \
--tp-size=1
"""
import
os
from
contextlib
import
contextmanager
import
lm_eval
import
numpy
as
np
import
yaml
RTOL
=
0.08
DEFAULT_RTOL
=
0.08
@contextmanager
def scoped_env_vars(new_env: "dict[str, str] | None"):
    """Temporarily apply environment variables for the duration of a block.

    Every key in ``new_env`` is written to ``os.environ`` (values are passed
    through ``str()``). When the ``with`` block exits -- normally or through
    an exception -- variables that already existed get their previous value
    back and variables that did not exist are removed again.

    Args:
        new_env: Mapping of variable name to value. ``None`` or an empty
            mapping leaves the environment untouched.

    Yields:
        None. The context manager is used only for its side effect.
    """
    if not new_env:
        # Fast path: nothing to set, so nothing to restore either.
        yield
        return

    # name -> value before the override, or None when the variable was unset.
    # os.environ never stores None, so None is an unambiguous "absent" marker.
    previous_values = {}
    try:
        for name, value in new_env.items():
            # Record the prior state before overwriting so the cleanup below
            # still covers this key if the assignment itself fails.
            previous_values[name] = os.environ.get(name)
            os.environ[name] = str(value)
        yield
    finally:
        # Restore / clean up exactly the keys that were touched.
        for name, previous in previous_values.items():
            if previous is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = previous
def
launch_lm_eval
(
eval_config
,
tp_size
):
...
...
@@ -32,23 +61,26 @@ def launch_lm_eval(eval_config, tp_size):
f
"trust_remote_code=
{
trust_remote_code
}
,"
f
"max_model_len=
{
max_model_len
}
,"
)
results
=
lm_eval
.
simple_evaluate
(
model
=
backend
,
model_args
=
model_args
,
tasks
=
[
task
[
"name"
]
for
task
in
eval_config
[
"tasks"
]],
num_fewshot
=
eval_config
[
"num_fewshot"
],
limit
=
eval_config
[
"limit"
],
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
# text models. however, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm, or explicitly set
apply_chat_template
=
eval_config
.
get
(
"apply_chat_template"
,
backend
==
"vllm-vlm"
),
fewshot_as_multiturn
=
eval_config
.
get
(
"fewshot_as_multiturn"
,
False
),
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs
=
eval_config
.
get
(
"gen_kwargs"
),
batch_size
=
batch_size
,
)
env_vars
=
eval_config
.
get
(
"env_vars"
,
None
)
with
scoped_env_vars
(
env_vars
):
results
=
lm_eval
.
simple_evaluate
(
model
=
backend
,
model_args
=
model_args
,
tasks
=
[
task
[
"name"
]
for
task
in
eval_config
[
"tasks"
]],
num_fewshot
=
eval_config
[
"num_fewshot"
],
limit
=
eval_config
[
"limit"
],
# TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed to help
# text models. however, this is regressing measured strict-match for
# existing text models in CI, so only apply it for mm, or explicitly set
apply_chat_template
=
eval_config
.
get
(
"apply_chat_template"
,
backend
==
"vllm-vlm"
),
fewshot_as_multiturn
=
eval_config
.
get
(
"fewshot_as_multiturn"
,
False
),
# Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
gen_kwargs
=
eval_config
.
get
(
"gen_kwargs"
),
batch_size
=
batch_size
,
)
return
results
...
...
@@ -57,6 +89,8 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
results
=
launch_lm_eval
(
eval_config
,
tp_size
)
rtol
=
eval_config
.
get
(
"rtol"
,
DEFAULT_RTOL
)
success
=
True
for
task
in
eval_config
[
"tasks"
]:
for
metric
in
task
[
"metrics"
]:
...
...
@@ -64,8 +98,9 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
measured_value
=
results
[
"results"
][
task
[
"name"
]][
metric
[
"name"
]]
print
(
f
"
{
task
[
'name'
]
}
|
{
metric
[
'name'
]
}
: "
f
"ground_truth=
{
ground_truth
}
| measured=
{
measured_value
}
"
f
"ground_truth=
{
ground_truth
:.
3
f
}
| "
f
"measured=
{
measured_value
:.
3
f
}
| rtol=
{
rtol
}
"
)
success
=
success
and
np
.
isclose
(
ground_truth
,
measured_value
,
rtol
=
RTOL
)
success
=
success
and
np
.
isclose
(
ground_truth
,
measured_value
,
rtol
=
rtol
)
assert
success
.buildkite/test-amd.yaml
View file @
bcf43ab1
...
...
@@ -718,17 +718,6 @@ steps:
-
uv pip install --system conch-triton-kernels
-
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
-
label
:
LM Eval Small Models
# 15min
timeout_in_minutes
:
20
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-
label
:
OpenAI API correctness
# 10min
timeout_in_minutes
:
15
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
...
...
@@ -974,19 +963,6 @@ steps:
-
pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
-
cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model
# Otherwise, mp_method="spawn" doesn't work
-
label
:
Multi-Modal Accuracy Eval (Small Models)
# 10min
timeout_in_minutes
:
70
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
vllm/multimodal/
-
vllm/inputs/
-
vllm/v1/core/
commands
:
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
label
:
Multi-Modal Models Test (Extended)
1
# 60min
timeout_in_minutes
:
120
mirror_hardwares
:
[
amdexperimental
]
...
...
@@ -1162,21 +1138,6 @@ steps:
# Run all e2e fusion tests
-
pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
label
:
ROCm GPT-OSS Eval
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/"
agent_pool
:
mi325_1
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
optional
:
true
# run on nightlies
source_file_dependencies
:
-
tests/evals/gpt_oss
-
vllm/model_executor/models/gpt_oss.py
-
vllm/model_executor/layers/quantization/mxfp4.py
-
vllm/v1/attention/backends/flashinfer.py
commands
:
-
uv pip install --system 'gpt-oss[eval]==0.0.5'
-
VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric
0.58
-
label
:
Blackwell Quantized MoE Test
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/"
...
...
@@ -1194,16 +1155,6 @@ steps:
commands
:
-
pytest -s -v tests/quantization/test_blackwell_moe.py
-
label
:
Blackwell LM Eval Small Models
timeout_in_minutes
:
120
gpu
:
b200
optional
:
true
# run on nightlies
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
##### 1 GPU test #####
##### multi gpus test #####
...
...
@@ -1380,7 +1331,7 @@ steps:
-
pytest -v -s -x lora/test_llm_with_multi_loras.py
-
pytest -v -s -x lora/test_olmoe_tp.py
# Disabled for now because MXFP4 backend on non-cuda platform
# Disabled for now because MXFP4 backend on non-cuda platform
# doesn't support LoRA yet
#- pytest -v -s -x lora/test_gptoss_tp.py
...
...
@@ -1446,37 +1397,6 @@ steps:
-
TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-
pytest -v -s -x lora/test_mixtral.py
-
label
:
LM Eval Large Models
# optional
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
a100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
##### H100 test #####
-
label
:
LM Eval Large Models (H100)
# optional
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_USE_DEEP_GEMM=0
# We found Triton is faster than DeepGEMM for H100
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
##### H200 test #####
-
label
:
Distributed Tests (H200)
# optional
mirror_hardwares
:
[
amdexperimental
]
...
...
@@ -1508,20 +1428,94 @@ steps:
-
pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
-
pytest -v -s tests/v1/distributed/test_dbo.py
##### RL Integration Tests #####
-
label
:
Prime-RL Integration Test
# 15min
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_2
##### E2E Eval Tests #####
-
label
:
LM Eval Small Models (1 Card)
# 15min
timeout_in_minutes
:
20
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
timeout_in_minutes
:
30
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
-
label
:
Blackwell LM Eval Small Models
timeout_in_minutes
:
120
gpu
:
b200
optional
:
true
# run on nightlies
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
-
label
:
Multi-Modal Accuracy Eval (Small Models)
# 10min
timeout_in_minutes
:
70
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_1
# grade: Blocking
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
vllm/multimodal/
-
vllm/inputs/
-
vllm/v1/core/
commands
:
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-
label
:
LM Eval Large Models (4 Card)
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
a100
optional
:
true
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
num_gpus
:
4
working_dir
:
"
/vllm-workspace
/.buildkite/lm-eval-harness
"
source_file_dependencies
:
-
vllm
/
-
.buildkite/scripts/run-prime-rl-test.sh
-
csrc
/
-
vllm/model_executor/layers/quantization
commands
:
-
bash .buildkite/scripts/run-prime-rl-test.sh
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
-
label
:
LM Eval Large Models (H100)
# optional
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
agent_pool
:
mi325_4
# grade: Blocking
gpu
:
h100
optional
:
true
num_gpus
:
4
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies
:
-
csrc/
-
vllm/model_executor/layers/quantization
commands
:
-
export VLLM_USE_DEEP_GEMM=0
# We found Triton is faster than DeepGEMM for H100
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
-
label
:
ROCm LM Eval Large Models (8 Card)
mirror_hardwares
:
[
amdproduction
]
agent_pool
:
mi325_8
num_gpus
:
8
working_dir
:
"
/vllm-workspace/.buildkite/lm-eval-harness"
commands
:
-
export VLLM_WORKER_MULTIPROC_METHOD=spawn
-
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8
-
label
:
ROCm GPT-OSS Eval
timeout_in_minutes
:
60
working_dir
:
"
/vllm-workspace/"
agent_pool
:
mi325_1
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
optional
:
true
# run on nightlies
source_file_dependencies
:
-
tests/evals/gpt_oss
-
vllm/model_executor/models/gpt_oss.py
-
vllm/model_executor/layers/quantization/mxfp4.py
-
vllm/v1/attention/backends/flashinfer.py
commands
:
-
uv pip install --system 'gpt-oss[eval]==0.0.5'
-
VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric
0.58
-
label
:
DeepSeek V2-Lite Accuracy
mirror_hardwares
:
[
amdexperimental
,
amdproduction
]
...
...
@@ -1554,4 +1548,19 @@ steps:
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
commands
:
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
-
bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2
1
##### RL Integration Tests #####
-
label
:
Prime-RL Integration Test
# 15min
mirror_hardwares
:
[
amdexperimental
]
agent_pool
:
mi325_2
# grade: Blocking
timeout_in_minutes
:
30
optional
:
true
num_gpus
:
2
working_dir
:
"
/vllm-workspace"
source_file_dependencies
:
-
vllm/
-
.buildkite/scripts/run-prime-rl-test.sh
commands
:
-
bash .buildkite/scripts/run-prime-rl-test.sh
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment