test_silu_mul_fp8.py 11.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
import torch
import torch.nn.functional as F

from vllm.utils.import_utils import has_helion

# Skip the whole module at collection time when Helion is missing. This guard
# sits above the vllm.kernels.helion imports that follow, so those imports are
# not attempted when has_helion() reports that Helion is unavailable.
if not has_helion():
    pytest.skip(
        "Helion is not installed. Install with: pip install vllm[helion]",
        allow_module_level=True,
    )

from vllm.kernels.helion.config_manager import ConfigManager
from vllm.kernels.helion.ops.silu_mul_fp8 import (
    pick_silu_mul_fp8_config,
    silu_mul_fp8,
    silu_mul_fp8_baseline,
)


def skip_if_platform_unsupported():
    """Skip the calling test unless the silu_mul_fp8 kernel can run here.

    The test is skipped (via ``pytest.skip``) when:

    * CUDA is not available,
    * the ConfigManager reports no ``silu_mul_fp8`` configs for the current
      GPU platform, or
    * platform detection fails with ImportError, RuntimeError or KeyError.
      In that case the skip reason carries the underlying error, so a broken
      environment can be told apart from a genuinely unsupported GPU.
    """
    # Cheap check first: nothing below is meaningful without a CUDA device.
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    try:
        from vllm.kernels.helion.utils import get_canonical_gpu_name

        platform = get_canonical_gpu_name()

        # Prefer an already-created manager; build one if get_instance()
        # signals (with RuntimeError) that it cannot provide one.
        try:
            config_manager = ConfigManager.get_instance()
        except RuntimeError:
            config_manager = ConfigManager()

        configs = config_manager.get_platform_configs("silu_mul_fp8", platform)
    except (ImportError, RuntimeError, KeyError) as exc:
        # Detection is best-effort: skip rather than fail, but keep the cause
        # visible in the skip reason instead of silently discarding it.
        pytest.skip(
            "Error detecting platform support for silu_mul_fp8 kernel: "
            f"{type(exc).__name__}: {exc}"
        )

    if not configs:
        pytest.skip("Current GPU platform not supported for silu_mul_fp8 kernel")


@pytest.fixture(autouse=True)
def reset_config_manager_singleton():
    """Reset ConfigManager around every test in this module.

    Declared ``autouse``, so it wraps each test without being requested.
    """
    # Setup: drop whatever instance a previous test left, then create a new one.
    # NOTE(review): presumably ConfigManager() registers itself as the
    # singleton returned by get_instance() -- confirm against ConfigManager.
    ConfigManager.reset_instance()
    ConfigManager()
    yield
    # Teardown: executed after the test body has finished.
    ConfigManager.reset_instance()


class TestSiluMulFp8ConfigPicker:
    """Checks which config key pick_silu_mul_fp8_config selects for an input."""

    @staticmethod
    def _pick(input_width, config_keys):
        """Run the picker for a (32, input_width) bfloat16 CUDA input.

        The kernel arguments handed to the picker are the random input tensor
        and a one-element float32 scale tensor holding 0.5.
        """
        x = torch.randn(32, input_width, dtype=torch.bfloat16, device="cuda")
        scale = torch.tensor([0.5], dtype=torch.float32, device="cuda")
        return pick_silu_mul_fp8_config((x, scale), config_keys)

    def test_config_picker_exact_match(self):
        available = [
            "intermediate_2048_batchsize_256",
            "intermediate_4096_batchsize_256",
        ]

        # Input width 4096 (intermediate_size=2048) has a key of its own.
        chosen = self._pick(4096, available)
        assert chosen == "intermediate_2048_batchsize_256"

    def test_config_picker_closest_match(self):
        available = [
            "intermediate_2048_batchsize_256",
            "intermediate_4096_batchsize_256",
        ]

        # Use 7000 (intermediate_size=3500) which is closer to 4096 than 2048
        chosen = self._pick(7000, available)
        assert chosen == "intermediate_4096_batchsize_256"

    def test_config_picker_fallback_to_default(self):
        # Neither key encodes an intermediate size; "default" is expected.
        chosen = self._pick(4096, ["default", "some_other_key"])
        assert chosen == "default"

    def test_config_picker_no_configs(self):
        nothing_available: list[str] = []

        # With no keys to choose from the picker is expected to return None.
        chosen = self._pick(4096, nothing_available)
        assert chosen is None

    @pytest.mark.parametrize("intermediate_size", [2048, 4096, 5120])
    def test_config_picker_different_sizes(self, intermediate_size):
        sizes = (2048, 4096, 5120)
        available = [f"intermediate_{size}_batchsize_256" for size in sizes]

        # The input's last dimension is twice the intermediate size.
        chosen = self._pick(2 * intermediate_size, available)
        assert chosen == f"intermediate_{intermediate_size}_batchsize_256"


class TestSiluMulFp8Correctness:
    """Compares the Helion silu_mul_fp8 kernel with silu_mul_fp8_baseline.

    Every test first calls skip_if_platform_unsupported(), so it is skipped
    rather than failed when the kernel cannot run on the current machine.
    """

    @staticmethod
    def _assert_fp8_close(actual, expected, msg):
        """Assert two FP8 tensors agree after upcasting both to float32.

        FP8 E4M3 has limited precision. Values near quantization boundaries
        can round differently due to intermediate precision differences, so
        the comparison uses atol=0.05 and rtol=0.05.
        """
        torch.testing.assert_close(
            actual.to(torch.float32),
            expected.to(torch.float32),
            atol=0.05,
            rtol=0.05,
            msg=msg,
        )

    @pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
    @pytest.mark.parametrize("intermediate_size", [2048, 3000, 3500, 4096, 5000])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
    def test_silu_mul_fp8_correctness(self, batch_size, intermediate_size, dtype):
        skip_if_platform_unsupported()

        # The kernel input packs both halves along the last dimension.
        input_size = 2 * intermediate_size
        input_tensor = torch.randn(batch_size, input_size, dtype=dtype, device="cuda")
        scale = torch.tensor([0.5], dtype=torch.float32, device="cuda")

        reference_output = silu_mul_fp8_baseline(input_tensor, scale)
        helion_output = silu_mul_fp8(input_tensor, scale)

        assert helion_output.shape == reference_output.shape
        assert helion_output.dtype == torch.float8_e4m3fn
        assert reference_output.dtype == torch.float8_e4m3fn

        self._assert_fp8_close(
            helion_output,
            reference_output,
            msg=f"Mismatch at batch={batch_size}, size={intermediate_size}",
        )

    def test_silu_mul_fp8_shape_inference(self):
        skip_if_platform_unsupported()
        batch_size, input_size = 32, 8192
        intermediate_size = input_size // 2

        input_tensor = torch.randn(
            batch_size, input_size, dtype=torch.bfloat16, device="cuda"
        )
        scale = torch.tensor([0.5], dtype=torch.float32, device="cuda")

        output = silu_mul_fp8(input_tensor, scale)

        # The output keeps the batch dimension and halves the last dimension.
        assert output.shape == (batch_size, intermediate_size)
        assert output.dtype == torch.float8_e4m3fn

    # One test case per scale: a failure for one scale no longer hides the
    # remaining scales, and the failing value shows up in the test id.
    @pytest.mark.parametrize("scale_val", [0.1, 0.5, 1.0, 2.0, 10.0])
    def test_silu_mul_fp8_scale_variations(self, scale_val):
        skip_if_platform_unsupported()
        batch_size, input_size = 16, 4096

        input_tensor = torch.randn(
            batch_size, input_size, dtype=torch.bfloat16, device="cuda"
        )
        scale = torch.tensor([scale_val], dtype=torch.float32, device="cuda")

        reference_output = silu_mul_fp8_baseline(input_tensor, scale)
        helion_output = silu_mul_fp8(input_tensor, scale)

        self._assert_fp8_close(
            helion_output,
            reference_output,
            msg=f"Mismatch for scale={scale_val}",
        )

    @pytest.mark.parametrize(
        "shape",
        [
            (1, 4096),
            (16, 4096),
            (128, 4096),
            (1024, 4096),
            (1, 8192),
            (16, 8192),
            (128, 8192),
        ],
    )
    def test_silu_mul_fp8_various_shapes(self, shape):
        skip_if_platform_unsupported()

        input_tensor = torch.randn(*shape, dtype=torch.bfloat16, device="cuda")
        scale = torch.tensor([0.5], dtype=torch.float32, device="cuda")

        reference_output = silu_mul_fp8_baseline(input_tensor, scale)
        helion_output = silu_mul_fp8(input_tensor, scale)

        assert helion_output.shape == reference_output.shape

        self._assert_fp8_close(
            helion_output, reference_output, msg=f"Mismatch for shape={shape}"
        )


def silu_mul_fp8_pytorch(input: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """Pure PyTorch reference using F.silu.

    This matches vLLM's SiluAndMul.forward_native exactly:
    F.silu(x[..., :d]) * x[..., d:]
    """
    d = input.shape[-1] // 2
    result = F.silu(input[..., :d]) * input[..., d:]
    return (result.to(torch.float32) / scale).to(torch.float8_e4m3fn)


class TestSiluMulFp8PytorchReference:
    """Tests comparing the Helion kernel against silu_mul_fp8_pytorch.

    The reference quantizes with PyTorch's own ``.to(torch.float8_e4m3fn)``
    cast. Comparisons are done in float32 with atol=0.05 and rtol=0.05, the
    same tolerance this file uses for the baseline comparisons.
    """

    @staticmethod
    def _run_reference_and_helion(input_tensor):
        """Return (pytorch_output, helion_output) for a fixed scale of 0.5."""
        scale = torch.tensor([0.5], dtype=torch.float32, device="cuda")
        expected = silu_mul_fp8_pytorch(input_tensor, scale)
        actual = silu_mul_fp8(input_tensor, scale)
        return expected, actual

    @staticmethod
    def _assert_outputs_close(actual, expected, msg):
        """Upcast both FP8 tensors to float32 and compare them."""
        # Tolerance accounts for FP8 quantization boundary effects
        torch.testing.assert_close(
            actual.to(torch.float32),
            expected.to(torch.float32),
            atol=0.05,
            rtol=0.05,
            msg=msg,
        )

    @pytest.mark.parametrize("batch_size", [1, 8, 32, 128, 256])
    @pytest.mark.parametrize("intermediate_size", [1024, 2048, 4096])
    @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
    def test_silu_mul_fp8_vs_pytorch(self, batch_size, intermediate_size, dtype):
        skip_if_platform_unsupported()

        # Last dimension is twice the intermediate size.
        x = torch.randn(batch_size, 2 * intermediate_size, dtype=dtype, device="cuda")
        expected, actual = self._run_reference_and_helion(x)

        assert actual.shape == expected.shape
        assert actual.dtype == torch.float8_e4m3fn

        failure_msg = (
            f"Mismatch at batch={batch_size}, size={intermediate_size}, "
            f"dtype={dtype}"
        )
        self._assert_outputs_close(actual, expected, failure_msg)

    @pytest.mark.parametrize(
        "shape",
        [
            (1, 2, 4096),  # 3D input
            (2, 4, 2048),  # 3D input
            (1, 1, 1, 8192),  # 4D input
        ],
    )
    def test_silu_mul_fp8_multidim_vs_pytorch(self, shape):
        skip_if_platform_unsupported()

        x = torch.randn(*shape, dtype=torch.bfloat16, device="cuda")
        expected, actual = self._run_reference_and_helion(x)

        assert actual.shape == expected.shape

        self._assert_outputs_close(actual, expected, f"Mismatch for shape={shape}")


class TestSiluMulFp8Integration:
    """Checks how silu_mul_fp8 appears in the Helion kernel registry."""

    def test_kernel_registration_integration(self):
        from vllm.kernels.helion.register import get_registered_kernels

        op = "silu_mul_fp8"
        kernels = get_registered_kernels()
        assert op in kernels

        # The registered wrapper must carry the op name and a config picker.
        wrapper = kernels[op]
        assert wrapper.op_name == op
        assert wrapper._config_picker is not None

    def test_fake_impl_functionality(self):
        skip_if_platform_unsupported()
        from vllm.kernels.helion.register import get_registered_kernels

        rows, cols = 32, 4096
        x = torch.randn(rows, cols, dtype=torch.bfloat16, device="cuda")
        scale = torch.tensor([0.5], dtype=torch.float32, device="cuda")

        # Call the wrapper's fake implementation directly on the same inputs.
        fake_output = get_registered_kernels()["silu_mul_fp8"]._fake_impl(x, scale)

        # Same batch dimension, half the last dimension, FP8 on x's device.
        assert fake_output.shape == (rows, cols // 2)
        assert fake_output.dtype == torch.float8_e4m3fn
        assert fake_output.device == x.device