Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3b279a84
Unverified
Commit
3b279a84
authored
Oct 02, 2025
by
Michael Goin
Committed by
GitHub
Oct 02, 2025
Browse files
[CI] Add Blackwell DeepSeek FP8 FlashInfer MoE tests (#26040)
Signed-off-by:
mgoin
<
mgoin64@gmail.com
>
parent
5e4a8223
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
29 additions
and
11 deletions
+29
-11
tests/quantization/test_blackwell_moe.py
tests/quantization/test_blackwell_moe.py
+29
-11
No files found.
tests/quantization/test_blackwell_moe.py
View file @
3b279a84
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
import
json
import
json
import
os
import
os
from
typing
import
Optional
import
pytest
import
pytest
...
@@ -20,9 +21,10 @@ os.environ["FLASHINFER_NVCC_THREADS"] = "16"
...
@@ -20,9 +21,10 @@ os.environ["FLASHINFER_NVCC_THREADS"] = "16"
dummy_hf_overrides
=
{
"num_layers"
:
4
,
"num_hidden_layers"
:
4
}
dummy_hf_overrides
=
{
"num_layers"
:
4
,
"num_hidden_layers"
:
4
}
def
can_initialize
(
model
:
str
,
extra_args
:
list
[
str
]):
def
can_initialize
(
model
:
str
,
extra_args
:
Optional
[
list
[
str
]
]
=
None
):
# Server arguments
# Server arguments
extra_args
=
extra_args
if
extra_args
is
not
None
else
[]
server_args
=
[
server_args
=
[
"--max-model-len"
,
"--max-model-len"
,
"2048"
,
"2048"
,
...
@@ -65,7 +67,7 @@ def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
...
@@ -65,7 +67,7 @@ def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
,
[]
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
)
@
pytest
.
mark
.
skip
(
reason
=
"Works, but takes too long to run"
)
@
pytest
.
mark
.
skip
(
reason
=
"Works, but takes too long to run"
)
...
@@ -73,21 +75,21 @@ def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
...
@@ -73,21 +75,21 @@ def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
,
[]
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
)
@
pytest
.
mark
.
skip
(
reason
=
"Works, but takes too long to run"
)
@
pytest
.
mark
.
skip
(
reason
=
"Works, but takes too long to run"
)
def
test_llama4_nvfp4_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_llama4_nvfp4_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
,
[]
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
)
@
pytest
.
mark
.
skip
(
reason
=
"RuntimeError: No kernel found for the given options"
)
@
pytest
.
mark
.
skip
(
reason
=
"RuntimeError: No kernel found for the given options"
)
def
test_llama4_nvfp4_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_llama4_nvfp4_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
,
[]
)
can_initialize
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
)
## DeepSeekV3 ##
## DeepSeekV3 ##
...
@@ -95,21 +97,37 @@ def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
...
@@ -95,21 +97,37 @@ def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
def
test_deepseek_fp8_block_moe_deep_gemm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_deepseek_fp8_block_moe_deep_gemm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_DEEP_GEMM"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_DEEP_GEMM"
,
"1"
)
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
,
[])
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
)
@
pytest
.
mark
.
skip
(
reason
=
(
"Known issue: lack of kernel support. "
"Expected failure: assert self.block_quant is None"
))
def
test_deepseek_fp8_block_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
)
def
test_deepseek_fp8_block_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
can_initialize
(
"deepseek-ai/DeepSeek-V3.1"
)
def
test_deepseek_nvfp4_moe_flashinfer_cutlass
(
def
test_deepseek_nvfp4_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"throughput"
)
can_initialize
(
"nvidia/DeepSeek-R1-0528-FP4-v2"
,
[]
)
can_initialize
(
"nvidia/DeepSeek-R1-0528-FP4-v2"
)
@
pytest
.
mark
.
skip
(
reason
=
"RuntimeError: No kernel found for the given options"
)
@
pytest
.
mark
.
skip
(
reason
=
"RuntimeError: No kernel found for the given options"
)
def
test_deepseek_nvfp4_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_deepseek_nvfp4_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_FP4"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
monkeypatch
.
setenv
(
"VLLM_FLASHINFER_MOE_BACKEND"
,
"latency"
)
can_initialize
(
"nvidia/DeepSeek-R1-0528-FP4-v2"
,
[]
)
can_initialize
(
"nvidia/DeepSeek-R1-0528-FP4-v2"
)
## GPT-OSS ##
## GPT-OSS ##
...
@@ -117,16 +135,16 @@ def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
...
@@ -117,16 +135,16 @@ def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
def
test_gptoss_mxfp4bf16_moe_flashinfer
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_gptoss_mxfp4bf16_moe_flashinfer
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"
,
"1"
)
can_initialize
(
"openai/gpt-oss-20b"
,
[]
)
can_initialize
(
"openai/gpt-oss-20b"
)
def
test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass
(
def
test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS"
,
"1"
)
can_initialize
(
"openai/gpt-oss-20b"
,
[]
)
can_initialize
(
"openai/gpt-oss-20b"
)
def
test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm
(
def
test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm
(
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
:
pytest
.
MonkeyPatch
):
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8"
,
"1"
)
can_initialize
(
"openai/gpt-oss-20b"
,
[]
)
can_initialize
(
"openai/gpt-oss-20b"
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment