Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f8a0acbd
Unverified
Commit
f8a0acbd
authored
Oct 15, 2025
by
Michael Goin
Committed by
GitHub
Oct 15, 2025
Browse files
[CI] Enable Blackwell Llama4 MoE tests (#26731)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
13170343
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
36 additions
and
22 deletions
+36
-22
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+1
-1
tests/quantization/test_blackwell_moe.py
tests/quantization/test_blackwell_moe.py
+35
-21
No files found.
.buildkite/test-pipeline.yaml
View file @
f8a0acbd
...
...
@@ -529,7 +529,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
-
uv pip install --system torchao==0.13.0
-
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
-
VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
--ignore quantization/test_blackwell_moe.py
-
label
:
LM Eval Small Models
# 53min
timeout_in_minutes
:
75
...
...
tests/quantization/test_blackwell_moe.py
View file @
f8a0acbd
...
...
@@ -3,6 +3,7 @@
import
json
import
os
from
typing
import
Any
import
pytest
...
...
@@ -24,12 +25,21 @@ def set_test_environment():
os
.
environ
[
"FLASHINFER_NVCC_THREADS"
]
=
"16"
# dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4,
# "text_config": {"num_layers": 4, "num_hidden_layers": 4}}
dummy_hf_overrides
=
{
"num_layers"
:
4
,
"num_hidden_layers"
:
4
}
# Override the backbone layers to 4 for faster startup
HF_OVERRIDE_TEXT
=
{
"num_layers"
:
4
,
"num_hidden_layers"
:
4
,
}
HF_OVERRIDE_MM
=
{
"text_config"
:
{
"num_layers"
:
4
,
"num_hidden_layers"
:
4
},
}
def
can_initialize
(
model
:
str
,
extra_args
:
list
[
str
]
|
None
=
None
):
def
can_initialize
(
model
:
str
,
hf_overrides
:
dict
[
str
,
Any
]
|
None
=
None
,
extra_args
:
list
[
str
]
|
None
=
None
,
):
# Server arguments
extra_args
=
extra_args
if
extra_args
is
not
None
else
[]
server_args
=
[
...
...
@@ -50,7 +60,7 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
model
,
server_args
,
max_wait_seconds
=
1500
,
# Due to FlashInfer compile
override_hf_configs
=
dummy_
hf_overrides
,
override_hf_configs
=
hf_overrides
,
)
as
server
:
client
=
server
.
get_client
()
# Make a simple request to verify the server works
...
...
@@ -77,28 +87,33 @@ def can_initialize(model: str, extra_args: list[str] | None = None):
def
test_llama4_fp8_tensor_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
,
hf_overrides
=
HF_OVERRIDE_MM
)
@
pytest
.
mark
.
skip
(
reason
=
"Works, but takes too long to run"
)
def
test_llama4_fp8_tensor_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
,
hf_overrides
=
HF_OVERRIDE_MM
)
@
pytest
.
mark
.
skip
(
reason
=
"Works, but takes too long to run"
)
def
test_llama4_nvfp4_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
,
hf_overrides
=
HF_OVERRIDE_MM
)
@
pytest
.
mark
.
skip
(
reason
=
"RuntimeError: No kernel found for the given options"
)
def
test_llama4_nvfp4_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
,
hf_overrides
=
HF_OVERRIDE_MM
)
## DeepSeekV3 ##
...
...
@@ -106,7 +121,7 @@ def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
def
test_deepseek_fp8_block_moe_deep_gemm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_DEEP_GEMM"
,
"1"
)
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
)
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
,
hf_overrides
=
HF_OVERRIDE_TEXT
)
@
pytest
.
mark
.
skip
(
...
...
@@ -118,26 +133,25 @@ def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
def
test_deepseek_fp8_block_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
)
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
,
hf_overrides
=
HF_OVERRIDE_TEXT
)
def
test_deepseek_fp8_block_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
)
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
,
hf_overrides
=
HF_OVERRIDE_TEXT
)
def
test_deepseek_nvfp4_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
can_initialize
(
"nvidia/DeepSeek-R1-0528-FP4-v2"
)
can_initialize
(
"nvidia/DeepSeek-R1-0528-FP4-v2"
,
hf_overrides
=
HF_OVERRIDE_TEXT
)
@
pytest
.
mark
.
skip
(
reason
=
"RuntimeError: No kernel found for the given options"
)
def
test_deepseek_nvfp4_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
can_initialize
(
"nvidia/DeepSeek-R1-0528-FP4-v2"
)
can_initialize
(
"nvidia/DeepSeek-R1-0528-FP4-v2"
,
hf_overrides
=
HF_OVERRIDE_TEXT
)
## GPT-OSS ##
...
...
@@ -145,14 +159,14 @@ def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
def
test_gptoss_mxfp4bf16_moe_flashinfer
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"
,
"1"
)
can_initialize
(
"openai/gpt-oss-20b"
)
can_initialize
(
"openai/gpt-oss-20b"
,
hf_overrides
=
HF_OVERRIDE_TEXT
)
def
test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS"
,
"1"
)
can_initialize
(
"openai/gpt-oss-20b"
)
can_initialize
(
"openai/gpt-oss-20b"
,
hf_overrides
=
HF_OVERRIDE_TEXT
)
def
test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8"
,
"1"
)
can_initialize
(
"openai/gpt-oss-20b"
)
can_initialize
(
"openai/gpt-oss-20b"
,
hf_overrides
=
HF_OVERRIDE_TEXT
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment