# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json
import os
from typing import Optional

import pytest

from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform

# The tests in this module only run on Blackwell GPUs, so skip the whole
# module at import/collection time on any other device.
if not current_platform.is_device_capability(100):
    pytest.skip("This test only runs on Blackwell GPUs (SM100).",
                allow_module_level=True)

# Set process-wide at import time, before any server is launched.
# NOTE(review): presumably the number of nvcc threads FlashInfer uses when
# compiling its kernels -- confirm against the FlashInfer documentation.
os.environ["FLASHINFER_NVCC_THREADS"] = "16"

# HF config overrides handed to every server launched by can_initialize():
# both layer-count keys are set to 4.
# Alternative that also overrides a nested "text_config" (currently unused):
# dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4,
# "text_config": {"num_layers": 4, "num_hidden_layers": 4}}
dummy_hf_overrides = {"num_layers": 4, "num_hidden_layers": 4}


def can_initialize(model: str,
                   extra_args: Optional[list[str]] = None) -> None:
    """Launch a server for ``model`` and check it answers one completion.

    The server is started through ``RemoteOpenAIServer`` with dummy weights
    (``--load-format dummy``) and the module-level ``dummy_hf_overrides``.
    A single two-token, temperature-0 completion is then requested to verify
    that the server works.

    Args:
        model: Model name passed to the server and used in the request.
        extra_args: Additional server CLI arguments, appended after the
            default ones. ``None`` means no additional arguments.

    Raises:
        AssertionError: If the first returned choice has no text.
    """
    # Server arguments
    extra_args = extra_args if extra_args is not None else []
    server_args = [
        "--max-model-len",
        "2048",
        "--max-num-batched-tokens",
        "256",
        "--load-format",
        "dummy",
        "--trust-remote-code",
        # Image limit of 0 per prompt.
        # NOTE(review): presumably disables image inputs -- confirm.
        "--limit-mm-per-prompt",
        json.dumps({"image": 0}),
        *extra_args,
    ]

    # Launch server and make a simple request
    with RemoteOpenAIServer(
            model,
            server_args,
            max_wait_seconds=1000,  # Due to FlashInfer compile
            override_hf_configs=dummy_hf_overrides) as server:
        client = server.get_client()
        # Make a simple request to verify the server works
        completion = client.completions.create(
            model=model,
            prompt=["Hello, World!"],
            temperature=0,
            max_tokens=2,
        )
        print(completion)
        assert completion.choices[0].text is not None


## Llama4 ##


@pytest.mark.skip(reason=(
    "RuntimeError: run_moe() Expected a value of type "
    "'Optional[List[Tensor]]' for argument '_9' but instead found type "
    "'list'."))
def test_llama4_fp8_tensor_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """Smoke-test Llama-4 Scout FP8 with FlashInfer FP8 MoE, "throughput".

    Currently skipped; see the skip reason for the failure being tracked.
    """
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")


@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_fp8_tensor_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    """Smoke-test Llama-4 Scout FP8 with FlashInfer FP8 MoE, "latency".

    Skipped only because of its run time (see the skip reason).
    """
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8")


@pytest.mark.skip(reason="Works, but takes too long to run")
def test_llama4_nvfp4_moe_flashinfer_cutlass(monkeypatch: pytest.MonkeyPatch):
    """Smoke-test Llama-4 Scout FP4 with FlashInfer FP4 MoE, "throughput".

    Skipped only because of its run time (see the skip reason).
    """
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")


@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_llama4_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Smoke-test Llama-4 Scout FP4 with FlashInfer FP4 MoE, "latency".

    Currently skipped; see the skip reason for the failure being tracked.
    """
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4")


## DeepSeekV3 ##


def test_deepseek_fp8_block_moe_deep_gemm(monkeypatch: pytest.MonkeyPatch):
    """Smoke-test DeepSeek-V3.1 with ``VLLM_USE_DEEP_GEMM`` enabled."""
    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1")
    can_initialize("deepseek-ai/DeepSeek-V3.1")


@pytest.mark.skip(
    reason="Known issue: lack of kernel support. "
    "Expected failure: assert self.block_quant is None")
def test_deepseek_fp8_block_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """Smoke-test DeepSeek-V3.1 with FlashInfer FP8 MoE, "throughput".

    Currently skipped; see the skip reason for the known failure.
    """
    # Applied in insertion order, FP8 switch first, then backend selection.
    flashinfer_env = {
        "VLLM_USE_FLASHINFER_MOE_FP8": "1",
        "VLLM_FLASHINFER_MOE_BACKEND": "throughput",
    }
    for env_name, env_value in flashinfer_env.items():
        monkeypatch.setenv(env_name, env_value)

    can_initialize("deepseek-ai/DeepSeek-V3.1")


def test_deepseek_fp8_block_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    """Smoke-test DeepSeek-V3.1 with FlashInfer FP8 MoE, "latency"."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP8", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("deepseek-ai/DeepSeek-V3.1")


def test_deepseek_nvfp4_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """Smoke-test DeepSeek-R1 FP4 with FlashInfer FP4 MoE, "throughput"."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "throughput")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")


@pytest.mark.skip(reason="RuntimeError: No kernel found for the given options")
def test_deepseek_nvfp4_moe_flashinfer_trtllm(monkeypatch: pytest.MonkeyPatch):
    """Smoke-test DeepSeek-R1 FP4 with FlashInfer FP4 MoE, "latency".

    Currently skipped; see the skip reason for the failure being tracked.
    """
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_FP4", "1")
    monkeypatch.setenv("VLLM_FLASHINFER_MOE_BACKEND", "latency")
    can_initialize("nvidia/DeepSeek-R1-0528-FP4-v2")


## GPT-OSS ##


def test_gptoss_mxfp4bf16_moe_flashinfer(monkeypatch: pytest.MonkeyPatch):
    """Smoke-test gpt-oss-20b with ``VLLM_USE_FLASHINFER_MOE_MXFP4_BF16``."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "1")
    can_initialize("openai/gpt-oss-20b")


def test_gptoss_mxfp4mxfp8_moe_flashinfer_cutlass(
        monkeypatch: pytest.MonkeyPatch):
    """Smoke-test gpt-oss-20b with the MXFP4/MXFP8 CUTLASS env switch set."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS", "1")
    can_initialize("openai/gpt-oss-20b")


def test_gptoss_mxfp4mxfp8_moe_flashinfer_trtllm(
        monkeypatch: pytest.MonkeyPatch):
    """Smoke-test gpt-oss-20b with ``VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8``."""
    monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
    can_initialize("openai/gpt-oss-20b")