# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from dataclasses import dataclass

import pytest
import torch

from tests.kernels.moe.utils import (
    batched_moe,
    make_quantized_test_activations,
    make_test_weights,
    naive_batched_moe,
)
from tests.kernels.quant_utils import native_batched_masked_quant_matmul
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
    invoke_moe_batched_triton_kernel,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.platforms import current_platform
from vllm.triton_utils import tl
from vllm.utils.torch_utils import set_random_seed

# (m, n, k) triples used to parametrize test_fused_moe_batched_experts:
# the activation matrix is created as (m, k) and the router scores as (m, e);
# n and k are forwarded to make_test_weights.
MNK_FACTORS = [
    (1, 128, 128),
    (1, 512, 512),
    (1, 1024, 2048),
    (32, 128, 128),
    (32, 512, 512),
    (32, 1024, 2048),
    (45, 128, 2048),
    (45, 1024, 128),
    (64, 512, 512),
    (64, 1024, 2048),
    (222, 128, 2048),
    (222, 1024, 2048),
]
# Expert counts ("e") for test_fused_moe_batched_experts.
NUM_EXPERTS = [8, 64]
# Experts selected per token ("topk"); cases with topk > e are skipped.
TOP_KS = [1, 2, 6]

# Dtypes both tests in this file are parametrized over.
DTYPES = [torch.bfloat16]

# float8_e4m3fn cases are only generated when the current platform does not
# report the fnuz fp8 variant.
if not current_platform.is_fp8_fnuz():
    DTYPES.append(torch.float8_e4m3fn)

# Default config; installed with set_current_vllm_config() around the MoE
# calls in test_fused_moe_batched_experts.
vllm_config = VllmConfig()


@dataclass
class BatchedMMConfig:
bnellnm's avatar
bnellnm committed
53
    in_dtype: torch.dtype
54
    quant_dtype: torch.dtype | None
bnellnm's avatar
bnellnm committed
55
    out_dtype: torch.dtype
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
    num_experts: int
    max_tokens_per_expert: int
    K: int
    N: int


@dataclass
class BatchedMMTensors:
    A: torch.Tensor  # [E, max_tokens, K]
    B: torch.Tensor  # [E, K, N] - column major
    C: torch.Tensor  # [E, max_tokens, N]
    num_expert_tokens: torch.Tensor  # [E]

    @staticmethod
    def make_tensors(config: BatchedMMConfig):
71
72
73
74
75
76
77
78
79
80
        A = (
            torch.randn(
                (config.num_experts, config.max_tokens_per_expert, config.K),
                device="cuda",
                dtype=config.in_dtype,
            )
            / 10
        )
        B = torch.randn(
            (config.num_experts, config.N, config.K),
81
            device="cuda",
82
83
            dtype=config.in_dtype,
        )
84
85
86
        C = torch.zeros(
            (config.num_experts, config.max_tokens_per_expert, config.N),
            device="cuda",
87
88
            dtype=config.out_dtype,
        )
bnellnm's avatar
bnellnm committed
89

90
91
92
93
94
95
96
        num_expert_tokens = torch.randint(
            low=0,
            high=config.max_tokens_per_expert,
            size=(config.num_experts,),
            device="cuda",
            dtype=torch.int32,
        )
97

bnellnm's avatar
bnellnm committed
98
        return BatchedMMTensors(A, B, C, num_expert_tokens)
99
100


101
102
103
104
@pytest.mark.parametrize("num_experts", [8, 32])
@pytest.mark.parametrize("max_tokens_per_expert", [32, 224, 512])
@pytest.mark.parametrize("K", [128, 1024])
@pytest.mark.parametrize("N", [128, 1024])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("per_act_token_quant", [False, True])
def test_batched_mm(
    num_experts: int,
    max_tokens_per_expert: int,
    K: int,
    N: int,
    dtype: torch.dtype,
    block_shape: list[int] | None,
    per_act_token_quant: bool,
):
    """Check the batched Triton MoE matmul kernel against native references.

    The kernel output is compared with ``native_batched_masked_quant_matmul``
    run on the same (possibly quantized) operands; the quantized reference is
    in turn compared with the reference computed from the unquantized
    operands.

    Note: float8_e4m3fn is not supported on CUDA architecture < 89,
    and those tests will be skipped on unsupported hardware.
    """
    set_random_seed(7)

    use_fp8_w8a8 = dtype == torch.float8_e4m3fn

    if use_fp8_w8a8 and not current_platform.has_device_capability(89):
        pytest.skip(
            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
        )

    if (per_act_token_quant or block_shape is not None) and not use_fp8_w8a8:
        pytest.skip("Don't test blocking for non-quantized types.")

    if per_act_token_quant and block_shape is not None:
        pytest.skip("Skip illegal quantization test.")

    # 1-byte dtypes are treated as the quantized dtype on top of bf16
    # activations; wider dtypes run unquantized.
    if dtype.itemsize == 1:
        act_dtype = torch.bfloat16
        quant_dtype = dtype
    else:
        act_dtype = dtype
        quant_dtype = None

    # torch.randint's upper bound is exclusive: counts are in
    # [0, max_tokens_per_expert).
    num_expert_tokens = torch.randint(
        low=0,
        high=max_tokens_per_expert,
        size=(num_experts,),
        device="cuda",
        dtype=torch.int32,
    )

    A, A_q, A_scale = make_quantized_test_activations(
        num_experts,
        max_tokens_per_expert,
        K,
        in_dtype=act_dtype,
        quant_dtype=quant_dtype,
        block_shape=block_shape,
        per_act_token_quant=per_act_token_quant,
    )

    # Only the first weight tuple returned by make_test_weights is used.
    (B, B_q, B_scale, _), _ = make_test_weights(
        num_experts,
        N // 2,
        K,
        in_dtype=act_dtype,
        quant_dtype=quant_dtype,
        block_shape=block_shape,
        per_out_ch_quant=per_act_token_quant,
    )

    out_shape = (num_experts, max_tokens_per_expert, N)
    test_output = torch.zeros(out_shape, dtype=act_dtype, device="cuda")
    ref_output = torch.zeros(out_shape, dtype=act_dtype, device="cuda")
    q_ref_output = torch.zeros(out_shape, dtype=act_dtype, device="cuda")

    compute_tl_dtype = {
        torch.float16: tl.float16,
        torch.bfloat16: tl.bfloat16,
        torch.float32: tl.float32,
    }[test_output.dtype]

    assert A_q.dtype == B_q.dtype

    invoke_moe_batched_triton_kernel(
        A_q,
        B_q,
        test_output,
        num_expert_tokens,
        compute_tl_dtype,
        # Quantization data
        A_scale,
        B_scale,
        None,
        # Quantization schemes
        use_fp8_w8a8,
        False,
        False,
        config={
            "BLOCK_SIZE_M": 16,
            "BLOCK_SIZE_N": 16,
            "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32,
        },
        per_act_token_quant=per_act_token_quant,
        block_shape=block_shape,
    )

    # Reference from the unquantized operands.
    ref_output = native_batched_masked_quant_matmul(
        A,
        B,
        ref_output,
        num_expert_tokens,
    )

    # Reference from the same quantized operands the kernel received.
    q_ref_output = native_batched_masked_quant_matmul(
        A_q,
        B_q,
        q_ref_output,
        num_expert_tokens,
        A_scale,
        B_scale,
        block_shape,
        per_act_token_quant,
    )

    rtol, atol = {
        torch.float16: (6e-2, 6e-2),
        torch.bfloat16: (6e-2, 6e-2),
        torch.float32: (1e-2, 1e-2),
    }[test_output.dtype]

    torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
    torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)


@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("per_act_token_quant", [False, True])
@pytest.mark.parametrize("block_shape", [None, [128, 128]])
@pytest.mark.parametrize("input_scales", [False])
def test_fused_moe_batched_experts(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    dtype: torch.dtype,
    per_act_token_quant: bool,
    block_shape: list[int] | None,
    input_scales: bool,
    workspace_init,
):
    """Compare three MoE implementations on identical inputs.

    ``torch_experts`` provides the baseline, ``naive_batched_moe`` is checked
    against it, and ``batched_moe`` is checked against ``naive_batched_moe``.

    Note: float8_e4m3fn is not supported on CUDA architecture < 89,
    and those tests will be skipped on unsupported hardware.
    """
    set_random_seed(7)

    use_fp8_w8a8 = dtype == torch.float8_e4m3fn

    if use_fp8_w8a8 and not current_platform.has_device_capability(89):
        pytest.skip(
            "Triton limitation: fp8e4nv data type is not supported on CUDA arch < 89"
        )

    if topk > e:
        pytest.skip("topk > e")

    if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
        pytest.skip("Skip quantization test for non-quantized type")

    if per_act_token_quant and block_shape is not None:
        pytest.skip("Skip illegal quantization test.")

    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
    score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)

    # 1-byte dtypes are treated as the quantized dtype on top of bf16
    # activations; wider dtypes run unquantized.
    if dtype.itemsize == 1:
        act_dtype = torch.bfloat16
        quant_dtype = dtype
    else:
        act_dtype = dtype
        quant_dtype = None

    # w1_16 / w2_16 are returned by make_test_weights but not used below.
    (w1_16, w1, w1_s, _), (w2_16, w2, w2_s, _) = make_test_weights(
        e,
        n,
        k,
        block_shape=block_shape,
        in_dtype=act_dtype,
        quant_dtype=quant_dtype,
        per_out_ch_quant=per_act_token_quant,
    )

    # Activation scales are only supplied for quantized runs that request them.
    if input_scales and quant_dtype is not None:
        a1_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
        a2_scale = torch.tensor(1, device="cuda", dtype=torch.float32)
    else:
        a1_scale = None
        a2_scale = None

    with set_current_vllm_config(vllm_config):
        topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)

        baseline_output = torch_experts(
            a,
            w1,
            w2,
            topk_weight,
            topk_ids,
            w1_scale=w1_s,
            w2_scale=w2_s,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            quant_dtype=quant_dtype,
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,
        )

        batched_output = naive_batched_moe(
            a,
            w1,
            w2,
            topk_weight,
            topk_ids,
            w1_scale=w1_s,
            w2_scale=w2_s,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            quant_dtype=quant_dtype,
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,
        )

        triton_output = batched_moe(
            a,
            w1,
            w2,
            topk_weight,
            topk_ids,
            w1_scale=w1_s,
            w2_scale=w2_s,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            quant_dtype=quant_dtype,
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,
        )

    torch.testing.assert_close(batched_output, baseline_output, atol=3e-2, rtol=2e-2)

    torch.testing.assert_close(triton_output, batched_output, atol=2e-2, rtol=2e-2)