Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/tests/kernels/moe/test_modular_oai_triton_moe.py
+++ b/tests/kernels/moe/test_modular_oai_triton_moe.py
@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
 )
 from vllm.model_executor.layers.utils import shuffle_weight
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 MNK = [
    (1, 512, 384),
@@ -211,7 +212,7 @@ def test_oai_triton_moe(
    unfused: bool,
    workspace_init,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    (
        w1,
        w2,

--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -60,10 +60,14 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
 from vllm.model_executor.models.mixtral import MixtralMoE
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
+from vllm.utils.torch_utils import set_random_seed
+from vllm.v1.worker.workspace import init_workspace_manager
 NUM_EXPERTS = [8, 64, 192]
+NUM_EXPERTS_LARGE = [128, 256]
 EP_SIZE = [1, 4]
 TOP_KS = [2, 6]
+TOP_KS_SMALL = [1, 2]
 MOE_MARLIN_QUANT_TEST_CONFIGS = [
    # AWQ-INT4
@@ -131,6 +135,13 @@ FUSED_MOE_MNK_FACTORS = [
    (40000, 1024, 1024),
 ]
+FUSED_MOE_MNK_FACTORS_SMALL_M = [
+    (1, 128, 128),
+    (1, 2048, 128),
+    (2, 2048, 128),
+    (2, 2048, 511),
+]
 FUSED_MOE_WN16_MNK_FACTORS = [
    (1, 128, 128),
    (1, 1024, 1024),
@@ -233,7 +244,7 @@ def test_fused_moe(
    monkeypatch,
    workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
@@ -328,6 +339,111 @@ def test_fused_moe(
        )
+@pytest.mark.parametrize("m,n,k", FUSED_MOE_MNK_FACTORS_SMALL_M)
+@pytest.mark.parametrize("e", NUM_EXPERTS_LARGE)
+@pytest.mark.parametrize("topk", TOP_KS_SMALL)
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@pytest.mark.parametrize("padding", [True, False])
+@pytest.mark.parametrize("chunk_size", [8192])
+def test_naive_block_assignment_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    padding: bool,
+    chunk_size: int,
+    monkeypatch,
+    workspace_init,
+):
+    """Check both fused MoE entry points against the reference for tiny batches.
+    The parametrization pairs very small token counts (m of 1 or 2, see
+    FUSED_MOE_MNK_FACTORS_SMALL_M) with large expert counts
+    (NUM_EXPERTS_LARGE) and small top-k values (TOP_KS_SMALL). A baseline is
+    obtained from ``run_moe_test(torch_moe, iterative_moe)``; ``fused_moe``
+    and the modular Triton kernel are then each handed to ``run_moe_test``
+    together with that baseline.
+    """
+    # Seed through the same helper the other tests in this file use, so all
+    # MoE tests draw their random inputs via one code path.
+    set_random_seed(7)
+    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
+    #
+    # Setup test data
+    #
+    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10
+    score = torch.randn((m, e), device="cuda", dtype=dtype)
+    # No expert map: every expert is treated as local in this test.
+    e_map = None
+    #
+    # Setup test functions
+    #
+    quant_config = FUSED_MOE_UNQUANTIZED_CONFIG
+    m_fused_moe_fn = modular_triton_fused_moe(quant_config)
+    def m_fused_moe(
+        a: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        score: torch.Tensor,
+        topk: int,
+        global_num_experts: int = -1,
+        expert_map: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        # The modular kernel takes routing results rather than raw scores, so
+        # compute top-k here (renormalize=False, matching fused_moe_fn below).
+        topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
+        return m_fused_moe_fn(
+            a,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+        )
+    fused_moe_fn = functools.partial(fused_moe, renormalize=False)
+    #
+    # Run tests
+    #
+    runner = functools.partial(
+        run_moe_test,
+        a=a,
+        w1=w1,
+        w2=w2,
+        score=score,
+        topk=topk,
+        global_num_experts=e,
+        expert_map=e_map,
+        padding=padding,
+    )
+    # Note: for now use_compile will error out if the problem size is
+    # large enough to trigger chunking. I'm leaving the flag and
+    # setup code in case we are able to revisit this later.
+    use_compile = False
+    use_cudagraph = n >= 1024 and k >= 1024 and current_platform.is_cuda_alike()
+    with set_current_vllm_config(vllm_config):
+        baseline_output = runner(torch_moe, iterative_moe)
+        runner(
+            baseline_output,
+            fused_moe_fn,
+            use_compile=use_compile,
+            use_cudagraph=use_cudagraph,
+        )
+        runner(
+            baseline_output,
+            m_fused_moe,
+            use_compile=use_compile,
+            use_cudagraph=use_cudagraph,
+        )
 @pytest.mark.parametrize("m,n,k", FUSED_MOE_WN16_MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
@@ -466,7 +582,12 @@ def test_fused_moe_wn16(
 )
 @torch.inference_mode()
 def test_mixtral_moe(
-    dist_init, dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, monkeypatch
+    default_vllm_config,
+    dist_init,
+    dtype: torch.dtype,
+    padding: bool,
+    use_rocm_aiter: bool,
+    monkeypatch,
 ):
    """Make sure our Mixtral MoE implementation agrees with the one from
    huggingface."""
@@ -487,6 +608,7 @@ def test_mixtral_moe(
    monkeypatch.setenv("MASTER_ADDR", "localhost")
    monkeypatch.setenv("MASTER_PORT", "12345")
    init_distributed_environment()
+    init_workspace_manager(torch.cuda.current_device())
    # Instantiate our and huggingface's MoE blocks
    vllm_config.compilation_config.static_forward_context = dict()
@@ -540,6 +662,11 @@ def test_mixtral_moe(
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
+        # FIXME (zyongye) fix this after we move self.kernel
+        # assignment in FusedMoE.__init__
+        vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
        # Run forward passes for both MoE blocks
        hf_states, _ = hf_moe.forward(hf_inputs)
        vllm_states = vllm_moe.forward(vllm_inputs)

--- a/tests/kernels/moe/test_moe_align_block_size.py
+++ b/tests/kernels/moe/test_moe_align_block_size.py
@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
+from vllm.utils.torch_utils import set_random_seed
 NUM_TOKENS = [1, 3, 256, 2256, 4096]
 NUM_EXPERTS = [32, 160, 256, 257]
 TOP_KS = [1, 2, 16, 32]
 BLOCK_SIZES = [32, 128]
-current_platform.seed_everything(0)
+set_random_seed(0)
 def _group_tokens_by_expert(

--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
    TopKWeightAndReduceDelegate,
 )
-from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
+from vllm.utils.torch_utils import set_random_seed
 from vllm.v1.worker.workspace import init_workspace_manager
 from ...utils import multi_gpu_test
@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts(
    dtype: torch.dtype,
    workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow(
    if per_act_token_quant and block_shape is not None:
        pytest.skip("Skip illegal quantization combination")
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    m, n, k = mnk
    world_size, dp_size = world_dp_size
    device = "cuda"
@@ -809,7 +809,7 @@ def test_pplx_moe_slow(
    block_shape: list[int] | None,
    use_internode: bool,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    m, n, k = mnk
    world_size, dp_size = world_dp_size
@@ -888,7 +888,7 @@ def _pplx_test_loop(
        new_vllm_config.parallel_config.enable_expert_parallel = True
        _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank)
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    combos = itertools.product(
        PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]]
    )
@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize(
    world_dp_size: tuple[int, int],
    use_internode: bool,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    world_size, dp_size = world_dp_size
    parallel_launch(
        world_size * dp_size,
@@ -1005,7 +1005,7 @@ def test_pplx_moe(
    use_internode: bool,
    use_shared_experts: bool,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    world_size, dp_size = world_dp_size
    parallel_launch(
        world_size,

--- a/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
+++ b/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py
@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+from vllm.utils.torch_utils import set_random_seed
 FLOAT8_DTYPE = torch.float8_e4m3fn
 GROUP_SIZE = 128
@@ -67,8 +68,12 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten
 @pytest.mark.parametrize("T", [128, 256, 512])
 @pytest.mark.parametrize("N", [128 * 2, 256 * 2, 768 * 2, 2048 * 2, 7168 * 2])
+@pytest.mark.skipif(
+    current_platform.is_rocm(),
+    reason="ROCm does not support DeepGemm.",
+)
 def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int):
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")

--- a/tests/kernels/moe/test_triton_moe_no_act_mul.py
+++ b/tests/kernels/moe/test_triton_moe_no_act_mul.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for MoE with non-gated activations (*_no_mul).
+These tests verify that MoE layers work correctly with activations like
+silu_no_mul, gelu_no_mul, relu2_no_mul where the activation output dimension
+equals N (not N // 2 like gated activations).
+"""
+import pytest
+import torch
+from vllm.model_executor.layers.fused_moe.config import (
+    FUSED_MOE_UNQUANTIZED_CONFIG,
+)
+from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
+from vllm.model_executor.layers.fused_moe.utils import (
+    GELU_NO_MUL,
+    RELU2_NO_MUL,
+    SILU_NO_MUL,
+)
+from vllm.platforms import current_platform
+# Test parameters
+M_SIZES = [1, 16, 64]
+N_SIZES = [128, 256]
+K_SIZES = [64, 128]
+TOPK_VALUES = [1, 2]
+NUM_EXPERTS = 8
+NO_MUL_ACTIVATIONS = [SILU_NO_MUL, GELU_NO_MUL, RELU2_NO_MUL]
+def make_test_tensors(
+    m: int,
+    n: int,
+    k: int,
+    num_experts: int,
+    topk: int,
+    dtype: torch.dtype = torch.bfloat16,
+    device: str = "cuda",
+):
+    """Build random inputs for an MoE layer that uses a non-gated activation.
+    Shapes produced (non-gated, i.e. ``*_no_mul`` activations):
+    - hidden_states: (m, k)
+    - w1: (num_experts, n, k), scaled by 0.1
+    - w2: (num_experts, k, n), scaled by 0.1 (last dim is n, not n // 2)
+    - topk_weights: (m, topk) float32, every entry equal to 1 / topk
+    - topk_ids: (m, topk) integers drawn from [0, num_experts)
+    Returns the tuple (hidden_states, w1, w2, topk_weights, topk_ids).
+    """
+    float_opts = {"dtype": dtype, "device": device}
+    # Random draws happen in the order hidden_states, w1, w2, topk_ids.
+    hidden_states = torch.randn((m, k), **float_opts)
+    w1 = 0.1 * torch.randn((num_experts, n, k), **float_opts)
+    w2 = 0.1 * torch.randn((num_experts, k, n), **float_opts)
+    # Uniform routing weights: each selected expert contributes 1 / topk.
+    uniform = torch.ones((m, topk), dtype=torch.float32, device=device)
+    topk_weights = uniform / topk
+    topk_ids = torch.randint(0, num_experts, (m, topk), device=device)
+    return hidden_states, w1, w2, topk_weights, topk_ids
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(80),
+    reason="Requires compute capability >= 8.0",
+)
+@pytest.mark.parametrize("m", M_SIZES)
+@pytest.mark.parametrize("n", N_SIZES)
+@pytest.mark.parametrize("k", K_SIZES)
+@pytest.mark.parametrize("topk", TOPK_VALUES)
+@pytest.mark.parametrize("activation", NO_MUL_ACTIVATIONS)
+@torch.inference_mode()
+def test_triton_experts_no_mul_activation(
+    m: int,
+    n: int,
+    k: int,
+    topk: int,
+    activation: str,
+):
+    """Run TritonExperts with a non-gated activation and sanity-check it.
+    First verifies the workspace/output shapes reported by
+    ``workspace_shapes`` and then calls ``apply`` on flat workspaces of those
+    sizes, checking that the output has shape (m, k), is finite and non-zero.
+    """
+    tensors = make_test_tensors(m, n, k, NUM_EXPERTS, topk)
+    hidden_states, w1, w2, topk_weights, topk_ids = tensors
+    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)
+    ws1_shape, ws2_shape, out_shape = experts.workspace_shapes(
+        M=m,
+        N=n,
+        K=k,
+        topk=topk,
+        global_num_experts=NUM_EXPERTS,
+        local_num_experts=NUM_EXPERTS,
+        expert_tokens_meta=None,
+        activation=activation,
+    )
+    # Both workspaces are expected to be sized on max(N, K): for a no_mul
+    # activation the activation output dim is N itself, not N // 2.
+    expected_ws_shape = (m, topk, max(n, k))
+    assert ws1_shape == expected_ws_shape, (
+        f"workspace1 shape mismatch: expected {(m, topk, max(n, k))}, got {ws1_shape}"
+    )
+    assert ws2_shape == expected_ws_shape, (
+        f"workspace2 shape mismatch: expected {(m, topk, max(n, k))}, got {ws2_shape}"
+    )
+    assert out_shape == (m, k), (
+        f"output shape mismatch: expected {(m, k)}, got {out_shape}"
+    )
+    def _flat_workspace(shape):
+        # 1-D buffer with as many elements as the 3-D workspace shape.
+        return torch.empty(
+            shape[0] * shape[1] * shape[2],
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+    workspace1 = _flat_workspace(ws1_shape)
+    workspace2 = _flat_workspace(ws2_shape)
+    output = hidden_states.new_zeros((m, k))
+    experts.apply(
+        output=output,
+        hidden_states=hidden_states,
+        w1=w1,
+        w2=w2,
+        topk_weights=topk_weights,
+        topk_ids=topk_ids,
+        activation=activation,
+        global_num_experts=NUM_EXPERTS,
+        expert_map=None,
+        a1q_scale=None,
+        a2_scale=None,
+        workspace13=workspace1,
+        workspace2=workspace2,
+        expert_tokens_meta=None,
+        apply_router_weight_on_input=False,
+    )
+    # Sanity checks only: shape, finiteness, and that something was written.
+    assert output.shape == (m, k), f"Expected shape {(m, k)}, got {output.shape}"
+    assert not torch.isnan(output).any(), "Output contains NaN"
+    assert not torch.isinf(output).any(), "Output contains Inf"
+    assert output.abs().sum() > 0, "Output is all zeros"
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(80),
+    reason="Requires compute capability >= 8.0",
+)
+@torch.inference_mode()
+def test_workspace_shapes_no_mul_vs_gated():
+    """Test that workspace shapes differ correctly between gated and non-gated.
+    Queries ``workspace_shapes`` once with SILU_NO_MUL and once with "silu"
+    for the same (M, N, K, topk) and checks the last dim of workspace1
+    against max(activation_out_dim, K), where activation_out_dim is N for
+    the non-gated case and N // 2 for the gated case.
+    """
+    # TritonExperts is already imported at module level; no local import needed.
+    M, N, K, topk = 64, 256, 128, 2
+    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)
+    ws1_no_mul, _, out_no_mul = experts.workspace_shapes(
+        M, N, K, topk, 8, 8, None, SILU_NO_MUL
+    )
+    ws1_gated, _, out_gated = experts.workspace_shapes(
+        M, N, K, topk, 8, 8, None, "silu"
+    )
+    # For no_mul: activation_out_dim = N
+    # For gated: activation_out_dim = N // 2
+    # workspace1 should use max(activation_out_dim, K)
+    activation_out_dim_no_mul = N
+    activation_out_dim_gated = N // 2
+    assert ws1_no_mul[2] == max(activation_out_dim_no_mul, K), (
+        f"no_mul workspace1 last dim should be max({activation_out_dim_no_mul}, {K})"
+    )
+    assert ws1_gated[2] == max(activation_out_dim_gated, K), (
+        f"gated workspace1 last dim should be max({activation_out_dim_gated}, {K})"
+    )
+    # Output shapes should be the same
+    assert out_no_mul == out_gated == (M, K)
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(80),
+    reason="Requires compute capability >= 8.0",
+)
+@torch.inference_mode()
+def test_adjust_n_for_activation():
+    """Test the adjust_N_for_activation method.
+    Gated activations ("silu", "gelu") are expected to yield N // 2, while the
+    ``*_no_mul`` activations are expected to yield N unchanged.
+    """
+    # TritonExperts is already imported at module level; no local import needed.
+    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)
+    N = 256
+    # Gated activations should return N // 2
+    for gated_activation in ("silu", "gelu"):
+        assert experts.adjust_N_for_activation(N, gated_activation) == N // 2
+    # Non-gated activations should return N
+    for no_mul_activation in (SILU_NO_MUL, GELU_NO_MUL, RELU2_NO_MUL):
+        assert experts.adjust_N_for_activation(N, no_mul_activation) == N
--- a/tests/kernels/moe/untest_cutlass_moe.py
+++ b/tests/kernels/moe/untest_cutlass_moe.py
@@ -7,19 +7,25 @@ from math import prod
 import pytest
 import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
    FUSED_MOE_UNQUANTIZED_CONFIG,
+    FusedMoEQuantConfig,
    fp8_w8a8_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-    cutlass_moe_fp8,
+    CutlassExpertsFp8,
    run_cutlass_moe_fp8,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 NUM_EXPERTS = [40, 64]
 TOP_KS = [6, 8]
@@ -149,16 +155,15 @@ class MOETensors8Bit(MOETensors):
 def run_with_expert_maps(
-    num_experts: int, num_local_experts: int, **cutlass_moe_kwargs
+    num_experts: int,
+    num_local_experts: int,
+    quant_config: FusedMoEQuantConfig,
+    **cutlass_moe_kwargs,
 ):
    def slice_experts():
        slice_params = [
-            "w1_q",
+            "w1",
-            "w2_q",
+            "w2",
-            "ab_strides1",
-            "ab_strides2",
-            "c_strides1",
-            "c_strides2",
        ]
        full_tensors = {
            k: v
@@ -166,8 +171,6 @@ def run_with_expert_maps(
            if k in slice_params and k in cutlass_moe_kwargs
        }
-        quant_config = cutlass_moe_kwargs["quant_config"]
        for i in range(0, num_experts, num_local_experts):
            s, e = i, i + num_local_experts
@@ -186,13 +189,23 @@ def run_with_expert_maps(
            new_quant_config._w1.scale = quant_config.w1_scale[s:e]
            new_quant_config._w2.scale = quant_config.w2_scale[s:e]
-            cutlass_moe_kwargs["quant_config"] = new_quant_config
+            yield cutlass_moe_kwargs, new_quant_config
-            yield cutlass_moe_kwargs
+    out_tensor = torch.zeros_like(cutlass_moe_kwargs["hidden_states"])
+    for kwargs, new_quant_config in slice_experts():
-    out_tensor = torch.zeros_like(cutlass_moe_kwargs["a"])
+        kernel = mk.FusedMoEModularKernel(
-    for kwargs in slice_experts():
+            MoEPrepareAndFinalizeNoEP(),
-        out_tensor = out_tensor + cutlass_moe_fp8(**kwargs)
+            CutlassExpertsFp8(
+                out_dtype=kwargs["hidden_states"].dtype,
+                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+                e=kwargs["w2"].shape[0],  # type: ignore[union-attr]
+                n=kwargs["w2"].shape[2],  # type: ignore[union-attr]
+                k=kwargs["w2"].shape[1],  # type: ignore[union-attr]
+                quant_config=new_quant_config,
+                device="cuda",
+            ),
+        )
+        out_tensor = out_tensor + kernel(**kwargs)
    return out_tensor
@@ -229,27 +242,35 @@ def run_8_bit(
    )
    kwargs = {
-        "a": moe_tensors.a,
+        "hidden_states": moe_tensors.a,
-        "w1_q": moe_tensors.w1_q,  # type: ignore[union-attr]
+        "w1": moe_tensors.w1_q,  # type: ignore[union-attr]
-        "w2_q": moe_tensors.w2_q,  # type: ignore[union-attr]
+        "w2": moe_tensors.w2_q,  # type: ignore[union-attr]
        "topk_weights": topk_weights,
        "topk_ids": topk_ids,
-        "ab_strides1": moe_tensors.ab_strides1,
-        "ab_strides2": moe_tensors.ab_strides2,
-        "c_strides1": moe_tensors.c_strides1,
-        "c_strides2": moe_tensors.c_strides2,
-        "quant_config": quant_config,
    }
    num_experts = moe_tensors.w1.size(0)
    with_ep = num_local_experts is not None or num_local_experts == num_experts
    if not with_ep:
-        return cutlass_moe_fp8(**kwargs)
+        kernel = mk.FusedMoEModularKernel(
+            MoEPrepareAndFinalizeNoEP(),
+            CutlassExpertsFp8(
+                out_dtype=moe_tensors.a.dtype,
+                # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
+                e=moe_tensors.w2_q.shape[0],  # type: ignore[union-attr]
+                n=moe_tensors.w2_q.shape[2],  # type: ignore[union-attr]
+                k=moe_tensors.w2_q.shape[1],  # type: ignore[union-attr]
+                quant_config=quant_config,
+                device="cuda",
+            ),
+        )
+        return kernel(**kwargs)
    assert num_local_experts is not None
    return run_with_expert_maps(
        num_experts,
        num_local_experts,  # type: ignore[arg-type]
+        quant_config,
        **kwargs,
    )
@@ -277,7 +298,7 @@ def test_cutlass_moe_8_bit_no_graph(
    workspace_init,
    ep_size: int | None = None,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
    with set_current_vllm_config(vllm_config):
        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch)
@@ -332,7 +353,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
    monkeypatch,
    workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
    with set_current_vllm_config(vllm_config):
        dtype = torch.half
@@ -469,7 +490,7 @@ def test_run_cutlass_moe_fp8(
    ep_size: int,
    workspace_init,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    with set_current_vllm_config(vllm_config):
        mt = MOETensors8Bit.make_moe_tensors_8bit(
            m, k, n, e, per_act_token, per_out_channel

--- a/tests/kernels/moe/untest_moe_permute_unpermute.py
+++ b/tests/kernels/moe/untest_moe_permute_unpermute.py
@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
    moe_unpermute,
 )
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 NUM_EXPERTS = [16, 64, 256]
 TOP_KS = [2, 6, 8]
 EP_SIZE = [1, 4, 16]
-current_platform.seed_everything(0)
+set_random_seed(0)
 if current_platform.is_rocm():
    pytest.skip(
@@ -226,7 +227,7 @@ def test_moe_permute_unpermute(
        n_local_expert, expert_map, _ = determine_expert_map(ep_size, ep_rank, n_expert)
        expert_map = expert_map.cuda()
    start_expert = n_local_expert * ep_rank
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype)
    gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
    topk_weights, topk_ids, token_expert_indices = fused_topk(

--- a/tests/kernels/moe/untest_nvfp4_moe.py
+++ b/tests/kernels/moe/untest_nvfp4_moe.py
@@ -3,6 +3,7 @@
 import pytest
 import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from tests.kernels.moe.utils import make_test_weights
 from tests.kernels.quantization.nvfp4_utils import (
    FLOAT4_E2M1_MAX,
@@ -13,9 +14,15 @@ from tests.kernels.utils import torch_moe
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
+from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+    CutlassExpertsFp4,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.prepare_finalize import (
+    MoEPrepareAndFinalizeNoEP,
+)
 from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -42,7 +49,7 @@ MNK_FACTORS = [
 def test_cutlass_fp4_moe_no_graph(
    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    with set_current_vllm_config(
        VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
    ):
@@ -82,17 +89,21 @@ def test_cutlass_fp4_moe_no_graph(
            w2_scale=w2_blockscale,
        )
-        cutlass_output = cutlass_moe_fp4(
+        kernel = mk.FusedMoEModularKernel(
-            a=a,
+            MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
-            w1_fp4=w1_q,
+            CutlassExpertsFp4(
-            w2_fp4=w2_q,
+                out_dtype=dtype,
+                max_experts_per_worker=e,
+                quant_config=quant_config,
+            ),
+        )
+        cutlass_output = kernel(
+            hidden_states=a,
+            w1=w1_q,
+            w2=w2_q,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
-            quant_config=quant_config,
-            m=m,
-            n=n,
-            k=k,
-            e=e,
        )
        # Reference check:

--- a/tests/kernels/moe/untest_pplx_cutlass_moe.py
+++ b/tests/kernels/moe/untest_pplx_cutlass_moe.py
@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import cdiv
+from vllm.utils.torch_utils import set_random_seed
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx(
    world_dp_size: tuple[int, int],
    use_internode: bool,
 ):
-    current_platform.seed_everything(7)
+    set_random_seed(7)
    with set_current_vllm_config(vllm_config):
        dtype = torch.half

--- a/tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
+++ b/tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
 from vllm.platforms import current_platform
 from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
 from vllm.utils.math_utils import cdiv, round_up
+from vllm.utils.torch_utils import set_random_seed
 if current_platform.is_fp8_fnuz():
    pytest.skip(
@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert):
 @torch.inference_mode()
 def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype):
    group_size = 128
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    tokens_per_expert = torch.randint(
        low=0,

--- a/tests/kernels/quant_utils.py
+++ b/tests/kernels/quant_utils.py
@@ -4,13 +4,13 @@
 import torch
-from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    get_fp8_min_max,
+    group_broadcast,
+)
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
-# Using the default value (240.0) from pytorch will cause accuracy
-# issue on dynamic quantization models. Here use 224.0 for rocm.
-ROCM_FP8FNUZ_MAX = 224.0
 FP8_DTYPE = current_platform.fp8_dtype()
@@ -25,16 +25,12 @@ def ref_dynamic_per_token_quant(
    if scale_ub is not None:
        assert quant_dtype == FP8_DTYPE
-    qtype_traits = (
+    if quant_dtype == torch.int8:
-        torch.iinfo(quant_dtype)
+        qtype_traits = torch.iinfo(quant_dtype)
-        if quant_dtype == torch.int8
+        qtype_traits_min = qtype_traits.min
-        else torch.finfo(quant_dtype)
+        qtype_traits_max = qtype_traits.max
-    )
+    else:
-    use_fp8fnuz = (
+        qtype_traits_min, qtype_traits_max = get_fp8_min_max()
-        current_platform.is_fp8_fnuz() and quant_dtype == current_platform.fp8_dtype()
-    )
-    qtype_traits_max = ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.max
-    qtype_traits_min = -ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.min
    qtype_max = as_float32_tensor(qtype_traits_max)
    s_1 = as_float32_tensor(1.0)
    s_512 = as_float32_tensor(512.0)
@@ -72,17 +68,7 @@ def ref_dynamic_per_token_quant(
 def ref_dynamic_per_tensor_fp8_quant(
    x: torch.Tensor,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    fp8_traits = torch.finfo(FP8_DTYPE)
+    fp8_traits_min, fp8_traits_max = get_fp8_min_max()
-    fp8_traits_max = (
-        ROCM_FP8FNUZ_MAX
-        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
-        else fp8_traits.max
-    )
-    fp8_traits_min = (
-        -ROCM_FP8FNUZ_MAX
-        if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
-        else fp8_traits.min
-    )
    fp8_max = as_float32_tensor(fp8_traits_max)
    one = as_float32_tensor(1.0)

--- a/tests/kernels/quantization/test_awq_triton.py
+++ b/tests/kernels/quantization/test_awq_triton.py
@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
    awq_dequantize_triton,
    awq_gemm_triton,
 )
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 device = "cuda"  
@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
    zeros_cols = qweight_cols
    zeros_dtype = torch.int32
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    qweight = torch.randint(
        0,
@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
    qzeros_rows = scales_rows
    qzeros_cols = qweight_cols
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device)
    qweight = torch.randint(

--- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.scalar_type import ScalarType, scalar_types
+from vllm.utils.torch_utils import set_random_seed
 IS_SUPPORTED_BY_GPU = (
    current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9
@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
 @pytest.mark.parametrize("random_zero", [True, False])
 def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
    num_experts, N, K = shape
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    setup = make_moe_test_setup(
        num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero
    )
@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
    reason="W4A8 Grouped GEMM is not supported on this GPU type.",
 )
 def test_cutlass_w4a8_moe_mm_cuda_graph():
-    current_platform.seed_everything(42)
+    set_random_seed(42)
    # Fixed config for CUDA graph test (single parameter point).
    num_experts = 8
    K = 512

--- a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
@@ -12,6 +12,7 @@ from nvfp4_utils import (
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm
+from vllm.utils.torch_utils import set_random_seed
 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
    if backend == "trtllm" and dtype == torch.float16:
        pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    m, n, packed_k = shape
    k = packed_k * 2
    block_size = 16

--- a/tests/kernels/quantization/test_flashinfer_scaled_mm.py
+++ b/tests/kernels/quantization/test_flashinfer_scaled_mm.py
@@ -6,6 +6,7 @@ import torch
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
+from vllm.utils.torch_utils import set_random_seed
 if not current_platform.has_device_capability(100):
    pytest.skip(
@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
    device: str,
    autotune: bool,
 ) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    m, n, k = shape
    a = torch.randn((m, k), dtype=dtype, device=device)
    b = torch.randn((n, k), dtype=dtype, device=device) / k

--- a/tests/kernels/quantization/test_fp8_min_max_helper.py
+++ b/tests/kernels/quantization/test_fp8_min_max_helper.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for the get_fp8_min_max() helper function.
+These tests verify the FP8 min/max value logic for both standard
+and fnuz (ROCm MI300) dtype handling.
+"""
+from unittest.mock import patch
+import pytest
+import torch
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    get_fp8_min_max,
+)
+class TestGetFp8MinMax:
+    """Test cases for get_fp8_min_max() function."""
+    @patch("vllm.model_executor.layers.quantization.utils.quant_utils.current_platform")
+    def test_standard_fp8_platform(self, mock_platform):  # mock_platform replaces quant_utils.current_platform
+        """Test that standard FP8 platform uses PyTorch's finfo values."""
+        mock_platform.is_fp8_fnuz.return_value = False  # is_fp8_fnuz() reports False
+        mock_platform.fp8_dtype.return_value = torch.float8_e4m3fn  # fp8_dtype() reports e4m3fn
+        fp8_min, fp8_max = get_fp8_min_max()
+        finfo = torch.finfo(torch.float8_e4m3fn)  # reference bounds read directly from PyTorch
+        # Standard FP8 max is 448.0 for e4m3fn
+        assert fp8_max == finfo.max, f"Expected finfo.max={finfo.max}, got {fp8_max}"
+        assert fp8_min == finfo.min, f"Expected finfo.min={finfo.min}, got {fp8_min}"
+    @patch("vllm.model_executor.layers.quantization.utils.quant_utils.current_platform")
+    def test_fnuz_platform_returns_224(self, mock_platform):
+        """Test that fnuz platform returns 224.0."""
+        mock_platform.is_fp8_fnuz.return_value = True  # fp8_dtype is not configured here; presumably unused on the fnuz path — TODO confirm
+        fp8_min, fp8_max = get_fp8_min_max()
+        # fnuz on ROCm MI300 should return a symmetric +/-224.0, not 240.0 (torch.finfo max for float8_e4m3fnuz)
+        assert fp8_max == 224.0, f"Expected 224.0 for fnuz platform, got {fp8_max}"
+        assert fp8_min == -224.0, f"Expected -224.0 for fnuz platform, got {fp8_min}"
+    @patch("vllm.model_executor.layers.quantization.utils.quant_utils.current_platform")
+    def test_non_fnuz_platform_uses_finfo(self, mock_platform):  # NOTE(review): same mock setup and checks as test_standard_fp8_platform; only the messages differ
+        """Test that non-fnuz platform uses finfo values."""
+        mock_platform.is_fp8_fnuz.return_value = False  # is_fp8_fnuz() reports False
+        mock_platform.fp8_dtype.return_value = torch.float8_e4m3fn  # fp8_dtype() reports e4m3fn
+        fp8_min, fp8_max = get_fp8_min_max()
+        finfo = torch.finfo(torch.float8_e4m3fn)  # reference bounds read directly from PyTorch
+        assert fp8_max == finfo.max, (
+            f"Non-fnuz platform should use finfo.max={finfo.max}, got {fp8_max}"
+        )
+        assert fp8_min == finfo.min, (
+            f"Non-fnuz platform should use finfo.min={finfo.min}, got {fp8_min}"
+        )
+if __name__ == "__main__":  # direct execution of this file, as opposed to import/collection
+    pytest.main([__file__, "-v"])  # run pytest on this file only, with verbose output
--- a/tests/kernels/quantization/test_fp8_quant_group.py
+++ b/tests/kernels/quantization/test_fp8_quant_group.py
@@ -7,7 +7,7 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
-from vllm.platforms import current_platform
+from vllm.utils.torch_utils import set_random_seed
 @pytest.mark.parametrize(
@@ -23,14 +23,19 @@ from vllm.platforms import current_platform
 @pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
 def test_quantfp8_group_functionality(
-    batch_size: int, hidden_dim: int, group_size: int, seed: int, use_ue8m0: bool
+    default_vllm_config,
+    batch_size: int,
+    hidden_dim: int,
+    group_size: int,
+    seed: int,
+    use_ue8m0: bool,
 ) -> None:
    """Test QuantFP8 group quantization with various configurations.
    Tests both CUDA and native implementations, column-major scales,
    and verifies consistency between implementations.
    """
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
    expected_num_groups = (hidden_dim + group_size - 1) // group_size
@@ -82,8 +87,10 @@ def test_quantfp8_group_functionality(
 @pytest.mark.parametrize("seed", [42])
 @pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
-def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
+def test_quantfp8_group_multidimensional(
-    current_platform.seed_everything(seed)
+    default_vllm_config, seed: int, use_ue8m0: bool
+) -> None:
+    set_random_seed(seed)
    group_size = 64
@@ -135,8 +142,8 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
 @pytest.mark.parametrize("seed", [42])
 @torch.inference_mode()
-def test_quantfp8_group_edge_cases(seed: int) -> None:
+def test_quantfp8_group_edge_cases(default_vllm_config, seed: int) -> None:
-    current_platform.seed_everything(seed)
+    set_random_seed(seed)
    batch_size = 16
    group_size = 64

--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -12,8 +12,8 @@ from huggingface_hub import snapshot_download
 import vllm._custom_ops as ops
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
-from vllm.platforms import current_platform
 from ...utils import models_path_prefix
+from vllm.utils.torch_utils import set_random_seed
 # GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
 # GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
@@ -95,7 +95,7 @@ def test_dequantize(
 @pytest.mark.parametrize("quant_type", QUANT_TYPES)
 @torch.inference_mode()
 def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
    x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
@@ -138,7 +138,7 @@ def test_mmq(
    dtype: torch.dtype,
    quant_type: GGMLQuantizationType,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    tensors = get_gguf_sample_tensors(hidden_size, quant_type)
    x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
@@ -173,7 +173,7 @@ def test_moe(
    quant_type: GGMLQuantizationType,
    top_k: int,
 ):
-    current_platform.seed_everything(0)
+    set_random_seed(0)
    H, E = 1024, 256
    x = torch.rand((num_tokens, H), dtype=dtype, device="cuda")

--- a/tests/kernels/quantization/test_int8_kernel.py
+++ b/tests/kernels/quantization/test_int8_kernel.py
@@ -107,7 +107,7 @@ SEEDS = [0]
    itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
 )
 @torch.inference_mode()
-def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
+def test_w8a8_fp8_fused_moe(default_vllm_config, M, N, K, E, topk, dtype, seed):
    torch.manual_seed(seed)
    # Initialize int8 quantization parameters
    factor_for_scale = 1e-2