Merge tag 'v0.10.0' into v0.10.0-dev

711aa9d5 · zhuwenwen · 751c492c · 6d8d0a24 · 711aa9d5 · 711aa9d5
Commit 711aa9d5 authored Jul 30, 2025 by zhuwenwen
20 changed files
--- a/tests/kernels/moe/modular_kernel_tools/utils.py
+++ b/tests/kernels/moe/modular_kernel_tools/utils.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+
+import torch
+
+import vllm._custom_ops as ops
+
+
+def per_token_cast_to_fp8(
+        x: torch.Tensor, block_size: int) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a 2-D tensor to float8_e4m3fn using one scale per group of
+    `block_size` consecutive columns within each row.
+
+    The last dimension is zero-padded up to a multiple of `block_size` for
+    the computation; the padding is sliced off the returned data.
+
+    Returns (fp8 data with the same shape as the input `x`,
+             float32 scales of shape (rows, number_of_groups)).
+    """
+    assert x.dim() == 2
+    rows, cols = x.shape
+    remainder = cols % block_size
+    pad = 0 if remainder == 0 else block_size - remainder
+    if pad > 0:
+        x = torch.nn.functional.pad(x, (0, pad), value=0)
+    groups = x.view(rows, -1, block_size)
+    # Per-group absolute maximum, floored at 1e-4 so the division below
+    # never sees a zero.
+    group_amax = groups.abs().float().amax(dim=2).view(rows, -1).clamp(1e-4)
+    # 448.0 is the largest finite float8_e4m3fn value.
+    factor = 448.0 / group_amax.unsqueeze(2)
+    quantized = (groups * factor).to(torch.float8_e4m3fn)
+    scales = (group_amax / 448.0).view(rows, -1)
+    return quantized.view(rows, cols + pad)[:, :cols], scales
+
+
+def per_block_cast_to_fp8(
+        x: torch.Tensor, block_size_k: int,
+        block_size_n: int) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a 2-D tensor to float8_e4m3fn with one scale per 2-D block.
+
+    Blocks are `block_size_k` rows (dim 0) by `block_size_n` columns (dim 1).
+    The input is zero-padded to whole blocks for the computation and the
+    padding is sliced off the returned data.
+
+    Returns (fp8 data with the same shape as `x`,
+             float32 scales of shape (row_blocks, col_blocks)).
+    """
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (
+            int(math.ceil(m / block_size_k)) * block_size_k,
+            int(math.ceil(n / block_size_n)) * block_size_n,
+        ),
+        dtype=x.dtype,
+        device=x.device,
+    )
+    x_padded[:m, :n] = x
+    # (row_blocks, block_size_k, col_blocks, block_size_n). The number of
+    # column blocks must be derived from block_size_n, which is what the
+    # columns were padded to; dividing by block_size_k is only correct for
+    # square blocks.
+    x_view = x_padded.view(-1, block_size_k,
+                           x_padded.size(1) // block_size_n, block_size_n)
+    # Per-block absolute maximum, floored at 1e-4 to avoid dividing by zero.
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    # 448.0 is the largest finite float8_e4m3fn value.
+    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
+    x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
+    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
+    return x_scaled_sub, scales
+
+
+def make_non_quant_weights(
+    e: int,
+    n: int,
+    k: int,
+    dtype: torch.dtype,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """
+    Return random, unquantized weights w1, w2 on the current CUDA device.
+
+    w1 has shape (e, 2 * n, k) and w2 has shape (e, k, n); both are drawn
+    from torch.randn in the requested dtype and divided by 15.
+    """
+    device = torch.cuda.current_device()
+    # w1 is drawn before w2; keep this order so seeded runs stay reproducible.
+    w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 15
+    w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 15
+    return w1, w2
+
+
+def make_block_quant_fp8_weights(
+    e: int,
+    n: int,
+    k: int,
+    block_size: list[int],
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Return block-quantized fp8 weights w1, w2, w1_scale, w2_scale.
+
+    Random bfloat16 weights are generated with make_non_quant_weights and
+    each expert is quantized independently with per_block_cast_to_fp8.
+    block_size is read as [block_n, block_k]. The scale tensors are float32
+    with shape (e, n_tiles, k_tiles) for the corresponding weight.
+    """
+    dtype = torch.bfloat16
+    device = torch.cuda.current_device()
+
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    # Clamp the source weights to the representable fp8 range.
+    w1_bf16, w2_bf16 = make_non_quant_weights(e, n, k, dtype)
+    w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
+    w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
+
+    # Number of scale tiles per weight dimension (ceil division).
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles_w1 = ((2 * n) + block_n - 1) // block_n
+    k_tiles_w1 = (k + block_k - 1) // block_k
+    n_tiles_w2 = (k + block_n - 1) // block_n
+    k_tiles_w2 = (n + block_k - 1) // block_k
+
+    w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn, device=device)
+    w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn, device=device)
+
+    w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1),
+                       device=device,
+                       dtype=torch.float32)
+    w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2),
+                       device=device,
+                       dtype=torch.float32)
+
+    assert w1_s.shape == (e, (2 * n + (block_n - 1)) // block_n,
+                          (k + (block_k - 1)) // block_k)
+    assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2]
+
+    # NOTE(review): the scale buffers tile dim -2 by block_n and dim -1 by
+    # block_k, while per_block_cast_to_fp8 pads dim 0 to block_size_k and
+    # dim 1 to block_size_n. The two agree when block_n == block_k; confirm
+    # the intended pairing before using non-square blocks.
+    for i in range(e):
+        w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
+                                               block_size_k=block_k,
+                                               block_size_n=block_n)
+        w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
+                                               block_size_k=block_k,
+                                               block_size_n=block_n)
+
+    return w1, w2, w1_s, w2_s
+
+
+def make_quant_fp8_weights(
+    e: int,
+    n: int,
+    k: int,
+    per_out_channel_quant: bool,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Return fp8-quantized weights w1, w2, w1_scale, w2_scale.
+
+    Random bfloat16 weights are quantized expert by expert with
+    ops.scaled_fp8_quant. The scale tensors are float32 with shape
+    (e, 2 * n, 1) / (e, k, 1) when per_out_channel_quant is set, and
+    (e, 1, 1) otherwise.
+    """
+    q_dtype = torch.float8_e4m3fn
+
+    w1, w2 = make_non_quant_weights(e, n, k, dtype=torch.bfloat16)
+
+    # w1 -> w1_q, w2 -> w2_q
+    w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype)
+    w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype)
+
+    # One scale per output row, or a single scale per expert.
+    n_b_scales = 2 * n if per_out_channel_quant else 1
+    k_b_scales = k if per_out_channel_quant else 1
+    w1_scale = torch.empty((e, n_b_scales, 1),
+                           device="cuda",
+                           dtype=torch.float32)
+    w2_scale = torch.empty((e, k_b_scales, 1),
+                           device="cuda",
+                           dtype=torch.float32)
+
+    # presumably use_per_token_if_dynamic yields one scale per row of the
+    # weight matrix, matching the buffers above -- verify against
+    # ops.scaled_fp8_quant.
+    for expert in range(e):
+        w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
+            w1[expert], use_per_token_if_dynamic=per_out_channel_quant)
+        w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
+            w2[expert], use_per_token_if_dynamic=per_out_channel_quant)
+    return w1_q, w2_q, w1_scale, w2_scale
--- a/tests/kernels/moe/parallel_utils.py
+++ b/tests/kernels/moe/parallel_utils.py
@@ -4,7 +4,6 @@
 DeepEP test utilities
 """
 import dataclasses
-import importlib
 import os
 import traceback
 from typing import Callable, Optional
@@ -15,10 +14,9 @@ from torch.multiprocessing import (
    spawn)  # pyright: ignore[reportPrivateImportUsage]
 from typing_extensions import Concatenate, ParamSpec

-from vllm.utils import get_open_port
+from vllm.utils import get_open_port, has_deep_ep

-has_deep_ep = importlib.util.find_spec("deep_ep") is not None
-if has_deep_ep:
+if has_deep_ep():
    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
        DeepEPHTPrepareAndFinalize)
    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501

--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -6,7 +6,6 @@ from typing import Optional

 import pytest
 import torch
-import triton.language as tl

 from tests.kernels.moe.utils import (batched_moe,
                                     make_quantized_test_activations,
@@ -18,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
    invoke_moe_batched_triton_kernel)
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.platforms import current_platform
+from vllm.triton_utils import tl

 MNK_FACTORS = [
    (1, 128, 128),

--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
@@ -15,13 +15,13 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
 from vllm.model_executor.layers.fused_moe.fused_moe import (
    fused_topk, modular_triton_fused_moe)
 from vllm.platforms import current_platform
+from vllm.utils import has_deep_gemm
+from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used

-dg_available = False
-try:
-    import deep_gemm
-    dg_available = True
-except ImportError:
-    pass
+dg_available = has_deep_gemm()
+
+if dg_available:
+    from deep_gemm import get_m_alignment_for_contiguous_layout

 if current_platform.get_device_capability() < (9, 0):
    pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
@@ -224,6 +224,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
+@pytest.mark.skipif(is_blackwell_deep_gemm_used(), reason="Not E8M0 scale MOE")
 @torch.inference_mode()
 def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
                                            monkeypatch):
@@ -238,8 +239,7 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
    torch.manual_seed(seed)

    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
-
-    block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
+    block_m = get_m_alignment_for_contiguous_layout()
    block_size = [block_m, block_m]
    dtype = torch.bfloat16


--- a/tests/kernels/moe/test_count_expert_num_tokens.py
+++ b/tests/kernels/moe/test_count_expert_num_tokens.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests compute_expert_num_tokens kernels
+"""
+
+import dataclasses
+from typing import Optional
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
+
+
+@dataclasses.dataclass
+class TestTensors:
+    """
+    Inputs for the count_expert_num_tokens tests.
+
+    topk_ids holds, per token, the ids of its selected experts, shape
+    (num_tokens, num_topk). expert_map, when present, maps a global expert
+    id to a rank-local expert id, with -1 for experts not on the rank.
+    """
+
+    # Helper container, not a test class: stop pytest from trying to
+    # collect it because of the "Test" name prefix.
+    __test__ = False
+
+    topk_ids: torch.Tensor
+    expert_map: Optional[torch.Tensor] = None
+
+    def to_device(self, device: str):
+        """Move all held tensors to `device` (mutates this object)."""
+        self.topk_ids = self.topk_ids.to(device=device)
+        if self.expert_map is not None:
+            self.expert_map = self.expert_map.to(device=device)
+
+    @staticmethod
+    def make(num_tokens: int, num_topk: int, num_experts: int, device: str,
+             topk_ids_dtype: torch.dtype) -> "TestTensors":
+        """
+        Build random topk ids of dtype `topk_ids_dtype`; each token gets
+        `num_topk` distinct expert ids drawn from range(num_experts).
+        """
+
+        # make topk ids
+        topk_ids = torch.empty((num_tokens, num_topk),
+                               device=device,
+                               dtype=torch.int64)
+        for x in range(num_tokens):
+            # randperm guarantees the ids of one token are distinct.
+            topk_ids[x] = torch.randperm(num_experts)[:num_topk]
+        # Honor the requested dtype instead of always returning int64.
+        topk_ids = topk_ids.to(dtype=topk_ids_dtype)
+        return TestTensors(topk_ids=topk_ids)
+
+    def with_ep_rank(self, ep_rank: int, num_global_experts: int,
+                     num_local_experts: int, device: str):
+        """
+        Return a new TestTensors with a copy of topk_ids and an expert map
+        for `ep_rank`: global ids in the rank's contiguous slice map to
+        0..num_local_experts-1, every other id maps to -1.
+        """
+        # make an expert map
+        expert_map = torch.empty((num_global_experts),
+                                 device=device,
+                                 dtype=torch.int32)
+        expert_map.fill_(-1)
+        s = ep_rank * num_local_experts
+        e = s + num_local_experts
+        expert_map[s:e] = torch.tensor(list(range(num_local_experts)),
+                                       device=device)
+
+        return TestTensors(topk_ids=self.topk_ids.clone(),
+                           expert_map=expert_map)
+
+
+def ref_impl(tt: TestTensors, expert_num_tokens: torch.Tensor):
+    """
+    Reference token counting on CPU.
+
+    Moves `tt` to CPU, counts how often each expert id occurs in
+    tt.topk_ids and adds the counts into `expert_num_tokens` in place.
+    Ids are translated through tt.expert_map when one is set; ids equal
+    to -1 (before or after translation) are not counted.
+    """
+    # do the reference in cpu
+    tt.to_device("cpu")
+    unique_ids, occurrences = tt.topk_ids.unique(return_counts=True)
+    has_expert_map = tt.expert_map is not None
+
+    for global_id, n_hits in zip(unique_ids, occurrences):
+        local_id = global_id
+        if has_expert_map and global_id != -1:
+            local_id = tt.expert_map[global_id]
+
+        if local_id != -1:
+            expert_num_tokens[local_id] += n_hits
+
+
+def do_test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
+                                      num_experts: int, ep_size: int,
+                                      topk_ids_dtype: torch.dtype):
+    """
+    Check count_expert_num_tokens against ref_impl for every EP rank.
+
+    The experts are split evenly across `ep_size` ranks. For each rank the
+    kernel is run twice -- once with the rank's expert_map and once on
+    topk ids that were already translated through that map -- and both
+    results must equal the CPU reference exactly.
+    """
+
+    assert num_topk <= num_experts
+
+    tt = TestTensors.make(num_tokens,
+                          num_topk,
+                          num_experts,
+                          topk_ids_dtype=topk_ids_dtype,
+                          device="cpu")
+
+    num_global_experts = num_experts
+    assert num_global_experts % ep_size == 0
+    num_local_experts = num_global_experts // ep_size
+    for ep_rank in range(ep_size):
+        tt_rank = tt.with_ep_rank(ep_rank, num_global_experts,
+                                  num_local_experts, "cpu")
+
+        # Reference counts are accumulated on CPU, then moved to the GPU
+        # for comparison.
+        ref_expert_num_tokens = torch.zeros((num_local_experts),
+                                            device="cpu",
+                                            dtype=torch.int32)
+        ref_impl(tt_rank, ref_expert_num_tokens)
+        ref_expert_num_tokens = ref_expert_num_tokens.to("cuda")
+
+        tt_rank.to_device("cuda")
+        # Test with expert_map
+        triton_expert_num_tokens_w_emap = count_expert_num_tokens(
+            tt_rank.topk_ids, num_local_experts, tt_rank.expert_map)
+
+        # Test without expert map: translate the ids up front so that
+        # non-local experts become -1.
+        topk_ids = tt_rank.expert_map[tt_rank.topk_ids].to(topk_ids_dtype)
+        triton_expert_num_tokens_wo_emap = count_expert_num_tokens(
+            topk_ids, num_local_experts, expert_map=None)
+
+        # Counts are integers, so require an exact match.
+        torch.testing.assert_close(ref_expert_num_tokens,
+                                   triton_expert_num_tokens_w_emap,
+                                   atol=0,
+                                   rtol=0)
+        torch.testing.assert_close(ref_expert_num_tokens,
+                                   triton_expert_num_tokens_wo_emap,
+                                   atol=0,
+                                   rtol=0)
+
+
+@pytest.mark.parametrize(
+    "num_tokens", [1, 4, 8, 11, 19, 128, 127, 405, 1024, 3333, 6666, 7317])
+@pytest.mark.parametrize("num_topk", [2, 6, 8])
+@pytest.mark.parametrize("num_experts", [64])
+@pytest.mark.parametrize("ep_size", [1, 2, 4])
+@pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
+def test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
+                                   num_experts: int, ep_size: int,
+                                   topk_ids_dtype: torch.dtype):
+    """Sweep token counts, top-k values and EP sizes over 64 experts."""
+    do_test_compute_expert_num_tokens(num_tokens, num_topk, num_experts,
+                                      ep_size, topk_ids_dtype)
+
+
+@pytest.mark.parametrize("numel", list(range(1, 8192, 11)))
+@pytest.mark.parametrize("num_experts", [32])
+@pytest.mark.parametrize("ep_size", [2])
+@pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
+def test_compute_expert_num_tokens_from_numel(numel: int, num_experts: int,
+                                              ep_size: int,
+                                              topk_ids_dtype: torch.dtype):
+    """
+    Sweep the total element count of topk_ids; with num_topk=1 the number
+    of elements equals the number of tokens.
+    """
+    do_test_compute_expert_num_tokens(num_tokens=numel,
+                                      num_topk=1,
+                                      num_experts=num_experts,
+                                      ep_size=ep_size,
+                                      topk_ids_dtype=topk_ids_dtype)
--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEModularKernel)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_ep, has_deep_gemm
+from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used

 from .parallel_utils import ProcessGroupInfo, parallel_launch
 from .utils import make_test_weights
@@ -368,6 +369,8 @@ NUM_EXPERTS = [32]
 @pytest.mark.parametrize("world_dp_size", [(2, 1)])
 @requires_deep_ep
 @requires_deep_gemm
+@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
+                    reason="Skipping test for Blackwell DeepGEMM")
 def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
                                topk: int, world_dp_size: tuple[int, int]):
    """
@@ -423,6 +426,8 @@ USE_FP8_DISPATCH = [False]
 @pytest.mark.parametrize("world_dp_size", [(2, 1)])
 @requires_deep_ep
 @requires_deep_gemm
+@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
+                    reason="Skipping test for Blackwell DeepGEMM")
 def test_ll_deepep_deepgemm_moe(
    mnk: tuple[int, int, int],
    num_experts: int,

--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -15,46 +15,17 @@ import torch
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8)
-from vllm.utils import cdiv
+from vllm.utils import has_deep_gemm
+from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8

-has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None
-
-if has_deep_gemm:
-    import deep_gemm
-    BLOCK_M = deep_gemm.get_m_alignment_for_contiguous_layout()
-    BLOCK_SIZE = [BLOCK_M, BLOCK_M]
+BLOCK_SIZE = [128, 128]

 requires_deep_gemm = pytest.mark.skipif(
-    not has_deep_gemm,
+    not has_deep_gemm(),
    reason="Requires deep_gemm kernels",
 )


-def calc_diff(x: torch.Tensor, y: torch.Tensor):
-    x, y = x.double(), y.double()
-    denominator = (x * x + y * y).sum()
-    sim = 2 * (x * y).sum() / denominator
-    return 1 - sim
-
-
-def per_block_cast_to_fp8(
-        x: torch.Tensor,
-        block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
-    assert x.dim() == 2
-    m, n = x.shape
-    x_padded = torch.zeros(
-        (cdiv(m, 128) * 128, cdiv(n, block_size_n) * block_size_n),
-        dtype=x.dtype,
-        device=x.device)
-    x_padded[:m, :n] = x
-    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n)
-    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
-    x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
-    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
-    return x_scaled_sub, scales
-
-
 def make_block_quant_fp8_weights(
    e: int,
    n: int,
@@ -124,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
    topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
    topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)

-    # triton referrence
+    # triton reference
    out_triton = fused_experts(
        hidden_states=tokens_bf16,
        w1=w1,
@@ -155,17 +126,8 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
        block_shape=block_size,
        allow_deep_gemm=True,
    )
-
-    base = out_triton.abs().mean()
-    atol = 0.1 * base.clamp(min=1e-2)  # 10% of mean, but not lower than 1e-3
-    rtol = 0.05
-    # ----- Compare -----
-    torch.testing.assert_close(
-        out_deepgemm.to(torch.float32),
-        out_triton.to(torch.float32),
-        rtol=rtol,
-        atol=float(atol),
-    )
+    diff = calc_diff(out_deepgemm, out_triton)
+    assert diff < 0.001, f"Diff exceeded 1%: {diff}"


 # Note: W1 has shape (E, 2N, K), so N = 512

--- a/tests/kernels/moe/test_modular_kernel_combinations.py
+++ b/tests/kernels/moe/test_modular_kernel_combinations.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import copy
+from itertools import product
+from typing import Optional
+
+import pytest
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.config import VllmConfig, current_platform, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
+    BatchedTritonOrDeepGemmExperts)
+from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
+from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+    BatchedTritonExperts)
+from vllm.model_executor.layers.fused_moe.layer import TritonExperts
+from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+    TritonOrDeepGemmExperts)
+from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
+
+from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors,
+                                          reference_moe_impl,
+                                          run_modular_kernel)
+from .modular_kernel_tools.mk_objects import (
+    MK_FUSED_EXPERT_TYPES, MK_MULTI_GPU_PREPARE_FINALIZE_TYPES,
+    MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES)
+from .modular_kernel_tools.parallel_utils import (ProcessGroupInfo,
+                                                  parallel_launch_with_config)
+
+# TODO (varun): These requirements are very strict and could be relaxed.
+# All three optional packages must be available, otherwise every test
+# carrying the marker below is skipped.
+has_all_packages = (has_deep_ep() and has_deep_gemm() and has_pplx())
+
+meets_package_requirements = pytest.mark.skipif(
+    not has_all_packages,
+    reason="Requires deep_ep & deep_gemm & pplx packages",
+)
+
+
+def rank_worker(
+    pgi: ProcessGroupInfo,
+    vllm_config: VllmConfig,
+    cpu_group,
+    config: Config,
+    weights: WeightTensors,
+):
+    """
+    Per-rank body of a modular-kernel combination test.
+
+    For every (m, topk) pair from config.Ms x config.topks, runs the
+    modular kernel and the reference MoE implementation on freshly made
+    rank inputs and asserts that the outputs are close (atol=rtol=3e-2).
+    """
+    # Seed with the rank so each rank draws different inputs.
+    current_platform.seed_everything(pgi.rank)
+
+    # sanity check
+    from vllm import envs
+    if config.fused_moe_chunk_size is not None:
+        assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE)
+
+    # get weights to this device
+    weights.to_current_device()
+
+    Ms = config.Ms
+    assert isinstance(Ms, list)
+    TOPKs = config.topks
+    assert isinstance(TOPKs, list)
+
+    for m, topk in product(Ms, TOPKs):
+        print(f"Running m={m}, topk={topk} ...")
+        # override m and topk: work on a copy whose Ms/topks hold the single
+        # values under test, leaving the shared config untouched.
+        cfgx = copy.deepcopy(config)
+        cfgx.Ms = m
+        cfgx.topks = topk
+
+        # inputs for rank
+        rank_tensors = RankTensors.make(cfgx, pgi)
+
+        # modular kernel out
+        mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights,
+                                    rank_tensors)
+
+        # The reference implementation runs with vllm_config set as the
+        # current vLLM config.
+        with set_current_vllm_config(vllm_config):
+            ref_out = reference_moe_impl(cfgx, weights, rank_tensors)
+
+        torch.testing.assert_close(ref_out, mk_out, atol=3e-2, rtol=3e-2)
+
+
+def run(config: Config):
+    """
+    Build weights and environment data for `config` and launch rank_worker
+    on config.world_size ranks via parallel_launch_with_config.
+    """
+    assert config.is_valid()
+    print(f"Testing config \n{config.describe()} ...")
+
+    weights: WeightTensors = WeightTensors.make(config)
+
+    vllm_config, env_dict = config.make_env_data()
+    parallel_launch_with_config(config.world_size, rank_worker, vllm_config,
+                                env_dict, config, weights)
+
+
+# Parameter values swept by the tests below. Ms and TOPKs are passed to
+# Config as whole lists and iterated inside rank_worker.
+Ms = [32, 64]
+Ks = [7168]  # hidden sizes
+Ns = [2048]
+TOPKs = [4, 1]
+Es = [32]
+DTYPEs = [torch.bfloat16]
+# None leaves the fused-MoE chunk size unset in the Config.
+FUSED_MOE_CHUNK_SIZEs = [None, 16]
+
+
+def is_nyi_config(config: Config) -> bool:
+    """Return True for configs that are valid but not yet supported."""
+    # We know these configs to be legitimate, but they still fail.
+
+    if (config.fused_experts_type in [
+            BatchedTritonExperts, BatchedTritonOrDeepGemmExperts,
+            TritonExperts, TritonOrDeepGemmExperts
+    ]):
+        # The triton kernels expect both per-act-token-quant and
+        # per-out-ch-quant or neither; exactly one of the two being set
+        # is unsupported.
+        unsupported_quant_config = ((config.is_per_act_token_quant +
+                                     config.is_per_out_ch_quant) == 1)
+        return unsupported_quant_config
+
+    # cutlass kernels don't support expert_maps yet.
+    return config.fused_experts_type == CutlassExpertsFp8
+
+
+@pytest.mark.parametrize("k", Ks)
+@pytest.mark.parametrize("n", Ns)
+@pytest.mark.parametrize("e", Es)
+@pytest.mark.parametrize("dtype", DTYPEs)
+@pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS)
+@pytest.mark.parametrize(
+    "combination",
+    product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
+@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
+@pytest.mark.parametrize("world_size", [2])
+@meets_package_requirements
+def test_modular_kernel_combinations_multigpu(
+        k: int, n: int, e: int, dtype: torch.dtype,
+        quant_config: FusedMoEQuantConfig,
+        combination: tuple[mk.FusedMoEPrepareAndFinalize,
+                           mk.FusedMoEPermuteExpertsUnpermute],
+        fused_moe_chunk_size: Optional[int], world_size: int):
+    """
+    Run every multi-GPU prepare-finalize x fused-experts combination on
+    two ranks; invalid and not-yet-implemented configs are skipped.
+    """
+
+    # combination is a (prepare_finalize_type, fused_experts_type) pair.
+    config = Config(
+        Ms=Ms,
+        K=k,
+        N=n,
+        E=e,
+        topks=TOPKs,
+        dtype=dtype,
+        quant_config=quant_config,
+        prepare_finalize_type=combination[0],
+        fused_experts_type=combination[1],
+        fused_moe_chunk_size=fused_moe_chunk_size,
+        world_size=world_size,
+    )
+    if not config.is_valid():
+        pytest.skip(f"Tests config {config} is not valid. Skipping ...")
+
+    if is_nyi_config(config):
+        pytest.skip(f"Tests config {config} is nyi. Skipping ...")
+
+    print(f"{config.describe()}")
+    run(config)
+
+
+@pytest.mark.parametrize("k", Ks)
+@pytest.mark.parametrize("n", Ns)
+@pytest.mark.parametrize("e", Es)
+@pytest.mark.parametrize("dtype", DTYPEs)
+@pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS)
+@pytest.mark.parametrize(
+    "combination",
+    product(MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
+@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
+@pytest.mark.parametrize("world_size", [1])
+@meets_package_requirements
+def test_modular_kernel_combinations_singlegpu(
+        k: int, n: int, e: int, dtype: torch.dtype,
+        quant_config: FusedMoEQuantConfig,
+        combination: tuple[mk.FusedMoEPrepareAndFinalize,
+                           mk.FusedMoEPermuteExpertsUnpermute],
+        fused_moe_chunk_size: Optional[int], world_size: int):
+    """
+    Run every single-GPU prepare-finalize x fused-experts combination on
+    one rank; invalid and not-yet-implemented configs are skipped.
+    """
+    # combination is a (prepare_finalize_type, fused_experts_type) pair.
+    config = Config(
+        Ms=Ms,
+        K=k,
+        N=n,
+        E=e,
+        topks=TOPKs,
+        dtype=dtype,
+        quant_config=quant_config,
+        prepare_finalize_type=combination[0],
+        fused_experts_type=combination[1],
+        fused_moe_chunk_size=fused_moe_chunk_size,
+        world_size=world_size,
+    )
+
+    if not config.is_valid():
+        pytest.skip(f"Tests config {config} is not valid. Skipping ...")
+
+    if is_nyi_config(config):
+        pytest.skip(f"Tests config {config} is nyi. Skipping ...")
+
+    run(config)
+
+
+if __name__ == '__main__':
+    # Ability to test individual PrepareAndFinalize and FusedExperts combination
+    from .modular_kernel_tools.cli_args import (make_config,
+                                                make_config_arg_parser)
+    # Adjacent string literals are concatenated, so the first sentence
+    # needs its own terminator to keep the help text readable.
+    parser = make_config_arg_parser(description=(
+        "Run single prepare-finalize & fused-experts combination test. "
+        "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "  #noqa: E501
+        "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
+    ))
+    args = parser.parse_args()
+    config = make_config(args)
+
+    run(config)
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -174,6 +174,7 @@ def test_fused_moe(
                                              use_int8_w8a8=False,
                                              use_int8_w8a16=False,
                                              use_int4_w4a16=False,
+                                              use_mxfp4_w4a4=False,
                                              per_act_token_quant=False,
                                              block_shape=None)


--- a/tests/kernels/moe/test_moe_align_block_size.py
+++ b/tests/kernels/moe/test_moe_align_block_size.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import itertools
+"""Tests for the MOE align block size function.
+
+Run `pytest tests/kernels/moe/test_moe_align_block_size.py`.
+"""
+
+from typing import Optional

 import pytest
 import torch

-from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
-    moe_align_block_size_triton)
-
-
-@pytest.mark.parametrize(
-    "block_size,num_tokens,topk,num_experts",
-    list(
-        itertools.product(
-            [32, 64, 128, 256],  # block_size
-            [
-                1,
-                3,
-                7,
-                16,
-                256,
-                2256,
-                4096,
-            ],  # num_tokens
-            [1, 4, 16, 64],  # topk
-            [64, 160, 256, 257, 260, 264],  #  num_experts
-        )),
-)
-def test_moe_align_block_size_compare_implementations(block_size, num_tokens,
-                                                      topk, num_experts):
-    topk_ids = torch.stack([
-        torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
-        for _ in range(num_tokens)
-    ])
+    moe_align_block_size)
+from vllm.platforms import current_platform
+from vllm.utils import round_up
+
+# Parameter grids consumed by test_moe_align_block_size below.
+NUM_TOKENS = [1, 3, 7, 16, 256, 2256, 4096]
+NUM_EXPERTS = [32, 160, 256, 257, 512]
+TOP_KS = [1, 2, 16, 32]
+BLOCK_SIZES = [32, 64, 128, 256]
+# Runs once at import time; presumably seeds the RNGs so the random expert
+# assignments drawn in the tests are reproducible -- TODO confirm.
+current_platform.seed_everything(0)
+
+
+def _group_tokens_by_expert(
+    sorted_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    block_size: int,
+    valid_length: int,
+    total_tokens: int,
+) -> dict:
+    """
+    Collect the token ids assigned to each expert.
+
+    `sorted_ids` is read in consecutive chunks of `block_size` entries; chunk
+    `i` is attributed to the expert stored in `expert_ids[i]`. Only the first
+    `valid_length // block_size` chunks are inspected.
+
+    Entries of `sorted_ids` that are >= `total_tokens` are dropped
+    (presumably padding slots; verify against the caller).
+
+    Returns a dict mapping each expert id to the list of token ids found in
+    that expert's chunks, in the order they appear in `sorted_ids`.
+    """
+    num_blocks = valid_length // block_size
+    expert_tokens: dict[int, list[int]] = {}
+
+    for block_idx in range(num_blocks):
+        expert_id = expert_ids[block_idx].item()
+        block_start = block_idx * block_size
+        block_end = min(block_start + block_size, valid_length)
+
+        block_tokens = sorted_ids[block_start:block_end]
+        # Keep only entries that index a real token slot.
+        valid_tokens = block_tokens[block_tokens < total_tokens]
+
+        # An expert can own several chunks; accumulate across all of them.
+        if expert_id not in expert_tokens:
+            expert_tokens[expert_id] = []
+        expert_tokens[expert_id].extend(valid_tokens.tolist())
+    return expert_tokens
+

+def _verify_expert_level_sorting(
+    actual_sorted_ids: torch.Tensor,
+    golden_sorted_ids: torch.Tensor,
+    expert_ids: torch.Tensor,
+    block_size: int,
+    valid_length: int,
+    total_tokens: int,
+):
+    """
+    Verify that actual_sorted_ids follows the correct expert-level sorting.
+    The kernel implementation may or may not preserve the original token
+    order of topk_ids in the final sorted_ids; however, this does not impact
+    quality.
+
+    Both id tensors are grouped per expert with _group_tokens_by_expert
+    (using the same expert_ids). The check passes when both groupings cover
+    the same experts and, for every expert, contain the same token ids once
+    sorted.
+    """
+    # Group tokens by expert from the golden implementation
+    golden_expert_tokens = _group_tokens_by_expert(golden_sorted_ids,
+                                                   expert_ids, block_size,
+                                                   valid_length, total_tokens)
+
+    actual_expert_tokens = _group_tokens_by_expert(actual_sorted_ids,
+                                                   expert_ids, block_size,
+                                                   valid_length, total_tokens)
+
+    assert set(golden_expert_tokens.keys()) == set(
+        actual_expert_tokens.keys()), (
+            f"Expert IDs mismatch: golden={set(golden_expert_tokens.keys())}, "
+            f"actual={set(actual_expert_tokens.keys())}")
+
+    for expert_id in golden_expert_tokens:
+        golden_tokens = torch.tensor(golden_expert_tokens[expert_id],
+                                     device=actual_sorted_ids.device)
+        actual_tokens = torch.tensor(actual_expert_tokens[expert_id],
+                                     device=actual_sorted_ids.device)
+        # Compare as sorted sequences so that the order of tokens inside an
+        # expert's region does not matter.
+        assert torch.equal(
+            torch.sort(golden_tokens)[0],
+            torch.sort(actual_tokens)[0]), (
+                f"Expert {expert_id} token mismatch: "
+                f"golden={golden_expert_tokens[expert_id]}, "
+                f"actual={actual_expert_tokens[expert_id]}")
+
+
+def torch_moe_align_block_size(
+    topk_ids: torch.Tensor,
+    block_size: int,
+    num_experts: int,
+    expert_map: Optional[torch.Tensor] = None,
+    pad_sorted_ids: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    """
+    Golden torch implementation of moe_align_block_size.
+
+    This function aligns the token distribution across experts to be compatible
+    with block size for matrix multiplication by sorting tokens by expert and
+    padding to block boundaries.
+
+    Returns a tuple (sorted_token_ids, expert_ids, num_tokens_post_pad):
+      * sorted_token_ids: int32 tensor pre-filled with topk_ids.numel(), in
+        which each expert's flattened slot indices are written at the start
+        of that expert's block-aligned region.
+      * expert_ids: int32 tensor with one entry per block holding the id of
+        the expert that owns the block (entries of unused trailing blocks
+        stay 0), indexed through expert_map when one is given.
+      * num_tokens_post_pad: one-element int32 tensor with the sum of the
+        per-expert padded counts.
+    """
    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    if pad_sorted_ids:
+        max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+
+    # Stable sort of the flattened slots by expert id: slots that share an
+    # expert keep their original relative order.
+    flattened_token_indices = torch.arange(topk_ids.numel(),
+                                           device=topk_ids.device,
+                                           dtype=torch.int32)
+    flattened_expert_ids = topk_ids.flatten()
+    sorted_expert_ids, sort_indices = torch.sort(flattened_expert_ids,
+                                                 stable=True)
+    sorted_token_indices = flattened_token_indices[sort_indices]
+
+    # Number of slots routed to each expert.
+    expert_token_counts = torch.zeros(num_experts,
+                                      dtype=torch.int64,
+                                      device=topk_ids.device)
+    for expert_id in range(num_experts):
+        mask = sorted_expert_ids == expert_id
+        expert_token_counts[expert_id] = mask.sum()
+
+    # Per-expert counts rounded up to a multiple of block_size; experts
+    # without any slot keep a padded count of 0.
+    expert_padded_counts = torch.zeros(num_experts,
+                                       dtype=torch.int64,
+                                       device=topk_ids.device)
+    for expert_id in range(num_experts):
+        original_count = expert_token_counts[expert_id]
+        if original_count > 0:
+            expert_padded_counts[expert_id] = (
+                (original_count + block_size - 1) // block_size) * block_size

-    sorted_ids_cuda = torch.empty((max_num_tokens_padded, ),
-                                  dtype=torch.int32,
-                                  device=topk_ids.device)
-    sorted_ids_cuda.fill_(topk_ids.numel())
-    max_num_m_blocks = max_num_tokens_padded // block_size
-    expert_ids_cuda = torch.zeros((max_num_m_blocks, ),
-                                  dtype=torch.int32,
-                                  device=topk_ids.device)
-    num_tokens_post_pad_cuda = torch.empty((1),
-                                           dtype=torch.int32,
-                                           device=topk_ids.device)
-
-    sorted_ids_triton = torch.empty_like(sorted_ids_cuda)
-    sorted_ids_triton.fill_(topk_ids.numel())
-    expert_ids_triton = torch.zeros_like(expert_ids_cuda)
-    num_tokens_post_pad_triton = torch.empty_like(num_tokens_post_pad_cuda)
-
-    ops.moe_align_block_size(
-        topk_ids,
-        num_experts,
+    # The fill value topk_ids.numel() is one past the largest real slot index
+    # (indices come from arange(numel)), so it marks unused positions.
+    sorted_token_ids = torch.full(
+        (max_num_tokens_padded, ),
+        topk_ids.numel(),
+        dtype=torch.int32,
+        device=topk_ids.device,
+    )
+    max_num_blocks = (max_num_tokens_padded + block_size - 1) // block_size
+    expert_ids = torch.zeros(max_num_blocks,
+                             dtype=torch.int32,
+                             device=topk_ids.device)
+
+    # Write each expert's slots at the start of its region and tag every
+    # block of that region with the expert id.
+    current_pos = 0
+    current_block = 0
+    for expert_id in range(num_experts):
+        expert_mask = sorted_expert_ids == expert_id
+        expert_tokens = sorted_token_indices[expert_mask]
+        num_expert_tokens = expert_tokens.shape[0]
+
+        if num_expert_tokens > 0:
+            sorted_token_ids[current_pos:current_pos +
+                             num_expert_tokens] = (expert_tokens)
+
+            expert_blocks_needed = expert_padded_counts[expert_id] // block_size
+            expert_ids[current_block:current_block +
+                       expert_blocks_needed] = (expert_id)
+
+            # Advance by the padded count so the next expert starts on a
+            # block boundary.
+            current_pos += expert_padded_counts[expert_id]
+            current_block += expert_blocks_needed
+
+    total_padded_tokens = expert_padded_counts.sum()
+    num_tokens_post_pad = torch.tensor([total_padded_tokens],
+                                       dtype=torch.int32,
+                                       device=topk_ids.device)
+
+    # Remap the per-block expert ids through expert_map when provided.
+    if expert_map is not None:
+        expert_ids = expert_map[expert_ids]
+    return sorted_token_ids, expert_ids, num_tokens_post_pad
+
+
+@pytest.mark.parametrize("m", NUM_TOKENS)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
+@pytest.mark.parametrize("block_size", BLOCK_SIZES)
+@pytest.mark.parametrize("pad_sorted_ids", [False, True])
+@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
+def test_moe_align_block_size(m: int, topk: int, num_experts: int,
+                              block_size: int, pad_sorted_ids: bool):
+    """Test moe_align_block_size without expert mapping"""
+    # Each token gets `topk` distinct experts (prefix of a random permutation).
+    topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32)
+    for i in range(m):
+        experts = torch.randperm(num_experts, device="cuda")[:topk]
+        topk_ids[i] = experts
+
+    actual_sorted_ids, actual_expert_ids, actual_num_tokens = (
+        moe_align_block_size(
+            topk_ids=topk_ids,
+            block_size=block_size,
+            num_experts=num_experts,
+            pad_sorted_ids=pad_sorted_ids,
+        ))
+    golden_sorted_ids, golden_expert_ids, golden_num_tokens = (
+        torch_moe_align_block_size(
+            topk_ids=topk_ids,
+            block_size=block_size,
+            num_experts=num_experts,
+            pad_sorted_ids=pad_sorted_ids,
+        ))
+
+    # atol=0 / rtol=0: the integer outputs must match exactly.
+    torch.testing.assert_close(actual_num_tokens,
+                               golden_num_tokens,
+                               atol=0,
+                               rtol=0)
+    torch.testing.assert_close(actual_expert_ids,
+                               golden_expert_ids,
+                               atol=0,
+                               rtol=0)
+
+    # For sorted_token_ids, verify block-level correctness rather than exact
+    # order. Tokens within each expert's blocks can be in any order, but
+    # expert regions must be correct.
+    _verify_expert_level_sorting(
+        actual_sorted_ids,
+        golden_sorted_ids,
+        actual_expert_ids,
        block_size,
-        sorted_ids_cuda,
-        expert_ids_cuda,
-        num_tokens_post_pad_cuda,
+        actual_num_tokens.item(),
+        m * topk,
    )

-    moe_align_block_size_triton(
-        topk_ids,
-        num_experts,
+    # Structural invariants of the kernel output, independent of the golden
+    # reference.
+    total_tokens = m * topk
+    assert actual_num_tokens.item() % block_size == 0, (
+        "num_tokens_post_pad should be divisible by block_size")
+    assert actual_num_tokens.item() >= total_tokens, (
+        "num_tokens_post_pad should be at least total_tokens")
+    valid_tokens = actual_sorted_ids[actual_sorted_ids < total_tokens]
+    assert len(valid_tokens) == total_tokens, (
+        f"Should have exactly {total_tokens} valid tokens, "
+        f"got {len(valid_tokens)}")
+    assert (actual_expert_ids >= 0).all() and (
+        actual_expert_ids
+        < num_experts).all(), "expert_ids should contain valid expert indices"
+
+
+@pytest.mark.parametrize("m", [16, 32])
+@pytest.mark.parametrize("topk", [2, 4])
+@pytest.mark.parametrize("num_experts", [8])
+@pytest.mark.parametrize("block_size", [64])
+@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
+def test_moe_align_block_size_with_expert_map(m: int, topk: int,
+                                              num_experts: int,
+                                              block_size: int):
+    """Test moe_align_block_size with expert mapping (EP scenario)"""
+    # Each token gets `topk` distinct experts (prefix of a random permutation).
+    topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32)
+    for i in range(m):
+        experts = torch.randperm(num_experts, device="cuda")[:topk]
+        topk_ids[i] = experts
+
+    # Map every even expert id to a consecutive index 0, 1, 2, ...; all odd
+    # expert ids keep the initial value -1.
+    expert_map = torch.full((num_experts, ),
+                            -1,
+                            device="cuda",
+                            dtype=torch.int32)
+    local_experts = list(range(0, num_experts, 2))
+    for i, expert_id in enumerate(local_experts):
+        expert_map[expert_id] = i
+
+    actual_sorted_ids, actual_expert_ids, actual_num_tokens = (
+        moe_align_block_size(
+            topk_ids=topk_ids,
+            block_size=block_size,
+            num_experts=num_experts,
+            expert_map=expert_map,
+        ))
+    golden_sorted_ids, golden_expert_ids, golden_num_tokens = (
+        torch_moe_align_block_size(
+            topk_ids=topk_ids,
+            block_size=block_size,
+            num_experts=num_experts,
+            expert_map=expert_map,
+        ))
+
+    # atol=0 / rtol=0: the integer outputs must match exactly.
+    torch.testing.assert_close(actual_num_tokens,
+                               golden_num_tokens,
+                               atol=0,
+                               rtol=0)
+    torch.testing.assert_close(actual_expert_ids,
+                               golden_expert_ids,
+                               atol=0,
+                               rtol=0)
+    # Token order inside an expert's region is not compared, only membership.
+    _verify_expert_level_sorting(
+        actual_sorted_ids,
+        golden_sorted_ids,
+        actual_expert_ids,
        block_size,
-        sorted_ids_triton,
-        expert_ids_triton,
-        num_tokens_post_pad_triton,
+        actual_num_tokens.item(),
+        m * topk,
    )

-    assert torch.allclose(expert_ids_cuda, expert_ids_triton), (
-        f"Expert IDs mismatch for block_size={block_size}, "
-        f"num_tokens={num_tokens}, topk={topk}\n"
-        f"CUDA expert_ids: {expert_ids_cuda}\n"
-        f"Triton expert_ids: {expert_ids_triton}")

-    assert torch.allclose(
-        num_tokens_post_pad_cuda, num_tokens_post_pad_triton), (
-            f"Num tokens post pad mismatch for block_size={block_size}, "
-            f"num_tokens={num_tokens}, topk={topk}\n"
-            f"CUDA num_tokens_post_pad: {num_tokens_post_pad_cuda}\n"
-            f"Triton num_tokens_post_pad: {num_tokens_post_pad_triton}")
+def test_moe_align_block_size_deterministic():
+    """Repeated calls on the same topk_ids must return identical outputs."""
+    m, topk, num_experts, block_size = 128, 2, 32, 64
+
+    torch.manual_seed(42)
+    topk_ids = torch.randint(0,
+                             num_experts, (m, topk),
+                             device="cuda",
+                             dtype=torch.int32)

+    # expect the results to be reproducible
+    results = []
+    for _ in range(5):
+        sorted_ids, expert_ids, num_tokens = moe_align_block_size(
+            topk_ids=topk_ids, block_size=block_size, num_experts=num_experts)
+        # Clone so each stored result is an independent copy of the outputs.
+        results.append(
+            (sorted_ids.clone(), expert_ids.clone(), num_tokens.clone()))

-if __name__ == "__main__":
-    pytest.main([__file__])
+    # Every later run is compared element-wise against the first one.
+    for i in range(1, len(results)):
+        assert torch.equal(
+            results[0][0],
+            results[i][0]), ("sorted_ids should be deterministic")
+        assert torch.equal(
+            results[0][1],
+            results[i][1]), ("expert_ids should be deterministic")
+        assert torch.equal(
+            results[0][2],
+            results[i][2]), ("num_tokens should be deterministic")
--- a/tests/kernels/moe/test_mxfp4_moe.py
+++ b/tests/kernels/moe/test_mxfp4_moe.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import importlib
+import importlib.metadata
+from dataclasses import dataclass
+
+import pytest
+import torch
+from packaging import version
+
+def _quark_mxfp4_available() -> bool:
+    """Return True when the `quark` module and amd-quark >= 0.8.99 are present.
+
+    Evaluated once at import time to feed the skipif marker below, so it must
+    never raise: a missing module or missing distribution metadata simply
+    means "not available".
+    """
+    # `import importlib` alone does not guarantee that the `util` submodule
+    # is loaded; import it explicitly before using find_spec.
+    import importlib.util
+
+    if importlib.util.find_spec("quark") is None:
+        return False
+    try:
+        installed = version.parse(importlib.metadata.version("amd-quark"))
+    except importlib.metadata.PackageNotFoundError:
+        # A `quark` module is importable but it does not come from the
+        # amd-quark distribution.
+        return False
+    return installed >= version.parse('0.8.99')
+
+
+QUARK_MXFP4_AVAILABLE = _quark_mxfp4_available()
+
+
+@dataclass
+class ModelCase:
+    """One parametrized case for test_mxfp4_loading_and_execution_moe."""
+    # Model identifier handed to vllm_runner.
+    model_id: str
+    # Tensor-parallel size; the test is skipped when fewer GPUs are visible.
+    tp: int
+
+
+@pytest.mark.parametrize('model_case', [
+    ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
+    ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
+    ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1)
+])
+@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
+                    reason="amd-quark>=0.9 is not available")
+def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
+    """Load each model with load_format="dummy" and run a short greedy
+    generation, asserting only that some output is returned.
+
+    Skipped when fewer than model_case.tp CUDA devices are visible.
+    """
+    if torch.cuda.device_count() < model_case.tp:
+        pytest.skip(f"This test requires >={model_case.tp} gpus, got only "
+                    f"{torch.cuda.device_count()}")
+
+    with vllm_runner(model_case.model_id,
+                     tensor_parallel_size=model_case.tp,
+                     load_format="dummy") as llm:
+
+        # TODO: llm.apply_model(check_model) currently relies on V0 internals.
+        # Re-enable this later.
+        # def check_model(model):
+        #     layer = model.model.layers[0]
+
+        #     qkv_proj = layer.self_attn.qkv_proj
+
+        #     assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
+        #     assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
+
+        #     assert isinstance(layer.mlp.experts.quant_method,
+        #                       QuarkW4A4MXFp4MoEMethod)
+
+        # if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
+        #     llm.apply_model(check_model)
+
+        # Smoke check only: the generated text itself is not validated.
+        output = llm.generate_greedy("Today I am in the French Alps and",
+                                     max_tokens=20)
+        assert output
\ No newline at end of file
--- a/tests/kernels/moe/test_nvfp4_moe.py
+++ b/tests/kernels/moe/test_nvfp4_moe.py
@@ -93,11 +93,11 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
            a1_gscale=a1_gs,
            w1_fp4=w1_q,
            w1_blockscale=w1_blockscale,
-            w1_alphas=(1 / w1_gs),
+            g1_alphas=(1 / w1_gs),
            a2_gscale=a2_gs,
            w2_fp4=w2_q,
            w2_blockscale=w2_blockscale,
-            w2_alphas=(1 / w2_gs),
+            g2_alphas=(1 / w2_gs),
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            m=m,

--- a/tests/kernels/moe/test_pplx_moe.py
+++ b/tests/kernels/moe/test_pplx_moe.py
@@ -32,6 +32,8 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
 from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEModularKernel)
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+    TopKWeightAndReduceDelegate)
 from vllm.platforms import current_platform
 from vllm.utils import round_up

@@ -371,6 +373,7 @@ def pplx_prepare_finalize(
        chunk_topk_weight,
        chunk_topk_ids,
        False,
+        weight_and_reduce_impl=TopKWeightAndReduceDelegate(),
    )

    torch.cuda.synchronize()

--- a/tests/kernels/moe/untest_cutlass_moe.py
+++ b/tests/kernels/moe/untest_cutlass_moe.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import dataclasses
+from math import prod
 from typing import Optional

 import pytest
@@ -8,9 +9,12 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
+from vllm.model_executor.layers.fused_moe.cutlass_moe import (
+    cutlass_moe_fp8, run_cutlass_moe_fp8)
 from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts,
                                                            fused_topk)
+from vllm.model_executor.layers.fused_moe.utils import (
+    moe_kernel_quantize_input)
 from vllm.platforms import current_platform

 NUM_EXPERTS = [40, 64]
@@ -21,6 +25,7 @@ MNK_FACTORS = [
    (2, 1024, 1536),
    (2, 3072, 1024),
    (2, 3072, 1536),
+    (7, 3072, 1536),
    (64, 1024, 1024),
    (64, 1024, 1536),
    (64, 3072, 1024),
@@ -236,6 +241,7 @@ def test_cutlass_moe_8_bit_no_graph(
    per_act_token: bool,
    per_out_ch: bool,
    monkeypatch,
+    ep_size: Optional[int] = None,
 ):
    current_platform.seed_everything(7)
    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
@@ -254,7 +260,13 @@ def test_cutlass_moe_8_bit_no_graph(
        triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights,
                                      topk_ids)

-        cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token)
+        if ep_size is not None:
+            assert e % ep_size == 0, "Cannot distribute experts evenly"
+            number_local_experts = e // ep_size
+        else:
+            number_local_experts = None
+        cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token,
+                                   number_local_experts)

        # Note 5.5 only needed for larger problem sizes, 5 works ok for
        # the rest.
@@ -340,9 +352,62 @@ def test_cutlass_moe_8_bit_EP(
    per_out_channel: bool,
    ep_size: int,
    monkeypatch,
+):
+    test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token,
+                                    per_out_channel, monkeypatch, ep_size)
+
+
+# (m, n, k, topk) tuples used to parametrize test_cutlass_moe_8_bit_EP_large.
+LARGE_MNK_FACTORS = [
+    (1, 8192, 5120, 31),
+    (32768, 1024, 1024, 16),
+    (65536, 512, 1024, 16),
+]
+
+
+@pytest.mark.parametrize("m,n,k,topk", LARGE_MNK_FACTORS)
+@pytest.mark.parametrize("e", [128])
+@pytest.mark.parametrize("per_act_token", [False])
+@pytest.mark.parametrize("per_out_channel", [True])
+@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.skipif(
+    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
+        current_platform.get_device_capability()),
+    reason="Grouped gemm is not supported on this GPU type.")
+def test_cutlass_moe_8_bit_EP_large(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    per_act_token: bool,
+    per_out_channel: bool,
+    ep_size: int,
+    monkeypatch,
+):
+    """Run test_cutlass_moe_8_bit_no_graph on the LARGE_MNK_FACTORS shapes.
+
+    All arguments, including ep_size, are forwarded unchanged; the checks
+    themselves live in test_cutlass_moe_8_bit_no_graph.
+    """
+    test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token,
+                                    per_out_channel, monkeypatch, ep_size)
+
+
+@pytest.mark.parametrize("m,n,k,topk", [(1, 8192, 5120, 31)])
+@pytest.mark.parametrize("e", [128])
+@pytest.mark.parametrize("per_act_token", [False])
+@pytest.mark.parametrize("per_out_channel", [True])
+@pytest.mark.parametrize("ep_size", [8])
+@pytest.mark.skipif(
+    (lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
+        current_platform.get_device_capability()),
+    reason="Grouped gemm is not supported on this GPU type.")
+def test_run_cutlass_moe_fp8(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    per_act_token: bool,
+    per_out_channel: bool,
+    ep_size: int,
 ):
    current_platform.seed_everything(7)
-    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
    with set_current_vllm_config(vllm_config):
        mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
                                                  per_out_channel)
@@ -352,20 +417,53 @@ def test_cutlass_moe_8_bit_EP(
                                               score,
                                               topk,
                                               renormalize=False)
-
-        # Note that we are using the dequantized versions of the tensors.
-        # Using a, w1 and w2 directly results in minor output differences.
-        triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights,
-                                      topk_ids)
-
-        assert e % ep_size == 0, "Cannot distribute experts evenly"
-        cutlass_output = run_8_bit(mt,
-                                   topk_weights,
-                                   topk_ids,
-                                   per_act_token,
-                                   num_local_experts=e // ep_size)
-
-        torch.testing.assert_close(triton_output,
-                                   cutlass_output,
-                                   atol=5e-2,
-                                   rtol=1e-2)
\ No newline at end of file
+        # we want to make sure there is at least one token that's generated in
+        # this expert shard and at least one token that's NOT generated in this
+        # expert shard
+        topk_ids[0][0] = -1
+        topk_ids[0][1] = 1
+
+        workspace13_shape = (m * topk, max(2 * n, k))
+        workspace2_shape = (m * topk, n)
+        output_shape = (m * topk, k)
+
+        workspace13 = torch.empty(prod(workspace13_shape),
+                                  device="cuda",
+                                  dtype=mt.a.dtype)
+        workspace2 = torch.empty(prod(workspace2_shape),
+                                 device="cuda",
+                                 dtype=mt.a.dtype)
+
+        num_local_experts = e // ep_size
+        start, end = 0, num_local_experts
+        expert_map = [-1] * e
+        expert_map[start:end] = list(range(num_local_experts))
+        expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda")
+
+        activation = lambda o, i: torch.ops._C.silu_and_mul(o, i)
+        a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale,
+                                                   torch.float8_e4m3fn,
+                                                   per_act_token)
+        global_num_experts = -1 if mt.w1_q is None else mt.w1_q.size(0)
+        func = lambda output: run_cutlass_moe_fp8(
+            output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation,
+            global_num_experts, expert_map, mt.w1_scale, mt.w2_scale,
+            a1q_scale, None, workspace13, workspace2, None, mt.a.dtype,
+            per_act_token, per_out_channel, False)
+
+        workspace13.random_()
+        output_random_workspace = torch.empty(output_shape,
+                                              device="cuda",
+                                              dtype=mt.a.dtype)
+        func(output_random_workspace)
+
+        workspace13.fill_(0)
+        output_zero_workspace = torch.zeros(output_shape,
+                                            device="cuda",
+                                            dtype=mt.a.dtype)
+        func(output_zero_workspace)
+
+        torch.testing.assert_close(output_random_workspace,
+                                   output_zero_workspace,
+                                   atol=5e-3,
+                                   rtol=1e-3)
--- a/tests/kernels/quantization/test_per_token_group_quant.py
+++ b/tests/kernels/quantization/test_per_token_group_quant.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.quantization.utils import fp8_utils
+
+
+@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
+@pytest.mark.parametrize("column_major", [False, True])
+@pytest.mark.parametrize("scale_ue8m0", [False, True])
+@pytest.mark.parametrize("group_size", [64, 128])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_per_token_group_quant_fp8(shape, column_major: bool,
+                                   scale_ue8m0: bool, group_size: int):
+    """Compare two runs of fp8_utils.per_token_group_quant_fp8 on the same
+    input: one as-is and one with current_platform.is_cuda patched to return
+    False. Quantized values and scales must agree within the tolerances below.
+    """
+    device = "cuda"
+
+    torch.manual_seed(42)
+    num_tokens, hidden_dim = shape
+
+    # Normal samples scaled by 8, generated in bfloat16 on the GPU.
+    x = (torch.randn(
+        (num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8)
+
+    # cuda path
+    out_q, scale = fp8_utils.per_token_group_quant_fp8(
+        x,
+        group_size,
+        column_major_scales=column_major,
+        use_ue8m0=scale_ue8m0,
+    )
+
+    # triton ref
+    # NOTE(review): presumably is_cuda() == False makes the helper fall back
+    # to its Triton implementation -- confirm in fp8_utils.
+    with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+        ref_q, ref_s = fp8_utils.per_token_group_quant_fp8(
+            x,
+            group_size,
+            column_major_scales=column_major,
+            use_ue8m0=scale_ue8m0,
+        )
+
+    assert torch.allclose(out_q.float(), ref_q.float(), atol=0.15, rtol=0.15)
+    assert torch.allclose(scale, ref_s, atol=0.01, rtol=0.01)
--- a/tests/kernels/quantization/untest_block_fp8.py
+++ b/tests/kernels/quantization/untest_block_fp8.py
@@ -8,19 +8,14 @@ import pytest
 import torch

 from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
-                                       native_w8a8_block_matmul,
-                                       per_block_cast_to_fp8)
+                                       native_w8a8_block_matmul)
 from vllm.config import VllmConfig
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    per_token_group_quant_fp8, w8a8_block_fp8_matmul)
+    get_col_major_tma_aligned_tensor, per_token_group_quant_fp8,
+    w8a8_block_fp8_matmul)
 from vllm.platforms import current_platform
-
-dg_available = False
-try:
-    import deep_gemm
-    dg_available = True
-except ImportError:
-    pass
+from vllm.utils import has_deep_gemm
+from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8

 if current_platform.get_device_capability() < (9, 0):
    pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
@@ -106,7 +101,8 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
 @pytest.mark.parametrize(
    "M,N,K,block_size,out_dtype,seed",
    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
-@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
+@pytest.mark.skipif(not has_deep_gemm(),
+                    reason="DeepGemm kernels not available.")
 @torch.inference_mode()
 def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
    # only aligned sizes
@@ -120,9 +116,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
    A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
    B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max

-    _, block_k = block_size[0], block_size[1]
-
-    A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_k)
+    A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1])
    B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32)

    As = As_fp8.to(torch.float32)
@@ -132,14 +126,14 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
                                       out_dtype)

    # Transpose earlier so that the testing will not trigger transposing kernels
-    As_fp8 = deep_gemm.get_col_major_tma_aligned_tensor(As_fp8)
+    As_fp8 = get_col_major_tma_aligned_tensor(As_fp8)

    out = torch.zeros((M, N), device='cuda', dtype=out_dtype)

    assert As_fp8.shape == (M, (K + 127) //
                            128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}"

-    deep_gemm.gemm_fp8_fp8_bf16_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out)
+    fp8_gemm_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out)

    rel_diff = (torch.mean(
        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /

--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
@@ -1072,6 +1072,7 @@ def torch_experts(
    quant_dtype: Optional[torch.dtype] = None,
    per_act_token_quant=False,
    block_shape: Optional[list[int]] = None,
+    apply_router_weights_on_input: bool = False,
 ) -> torch.Tensor:
    assert (global_num_experts == -1
            or (global_num_experts == w1.shape[0] and expert_map is None)
@@ -1081,11 +1082,17 @@ def torch_experts(
    M, K = a.shape
    topk = topk_ids.shape[1]

+    if apply_router_weights_on_input:
+        assert topk == 1
+        a = a * topk_weight.to(a.dtype)
+
    a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)

    out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)

-    a, a_scale = moe_kernel_quantize_input(a, None, quant_dtype,
+    if a1_scale:
+        assert not per_act_token_quant and block_shape is None
+    a, a_scale = moe_kernel_quantize_input(a, a1_scale, quant_dtype,
                                           per_act_token_quant, block_shape)

    num_experts = w1.shape[0]
@@ -1104,6 +1111,7 @@ def torch_experts(
                tmp2 = SiluAndMul()(tmp1)
                out[mask] = tmp2 @ w2[i].transpose(0, 1)
            elif block_shape is not None:
+                # block quantized
                assert (a_scale is not None and w1_scale is not None
                        and w2_scale is not None)
                tmp1 = native_w8a8_block_matmul(a[mask], w1[i], a_scale[mask],
@@ -1121,15 +1129,27 @@ def torch_experts(
                assert (a_scale is not None and w1_scale is not None
                        and w2_scale is not None)
                scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
+
                tmp1 = a[mask].to(f32) * scales
                w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1)
-                tmp1 = tmp1 @ w1_dq
-                tmp2 = SiluAndMul()(tmp1)
+                tmp1 = (tmp1 @ w1_dq).to(out.dtype)
+
+                tmp2 = SiluAndMul()(tmp1).to(out.dtype)
+
+                tmp2, b_scale = moe_kernel_quantize_input(
+                    tmp2, a2_scale, quant_dtype, per_act_token_quant,
+                    block_shape)
+                assert b_scale is not None
+
+                tmp2 = tmp2.to(f32) * b_scale
                w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
                out[mask] = (tmp2 @ w2_dq).to(out.dtype)

-    return (out.view(M, -1, w2.shape[1]).to(f32) *
-            topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)
+    if apply_router_weights_on_input:
+        return out
+    else:
+        return (out.view(M, -1, w2.shape[1]).to(f32) *
+                topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)


 def torch_moe(a: torch.Tensor,

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -234,12 +234,6 @@ def qwen_lora_files():
    return os.path.join(models_path_prefix, "customize/qwen-nl2dsl-lora")


-@pytest.fixture(scope="session")
-def long_context_lora_files_16k_1():
-    # return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1")
-    return os.path.join(models_path_prefix, "SangBinCho/long_context_16k_testing_1")
-
-
 @pytest.fixture
 def llama_2_7b_engine_extra_embeddings():
    cleanup_dist_env_and_memory(shutdown_ray=True)

--- a/tests/lora/test_default_mm_loras.py
+++ b/tests/lora/test_default_mm_loras.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for applying default registered multimodal loras.
+"""
+
+import os
+
+from huggingface_hub import snapshot_download
+
+from vllm.lora.request import LoRARequest
+
+from ..conftest import AudioTestAssets, VllmRunner
+
+# Resolved at import time: snapshot_download returns the local directory of
+# the model snapshot. The LoRA adapters are looked up as sub-directories of
+# that snapshot.
+MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct")
+AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora")
+IMAGE_LORA_PATH = os.path.join(MODEL_PATH, "vision-lora")
+
+AUDIO_PROMPT = "<|user|><|audio_1|>Can you transcribe this audio?<|end|><|assistant|>"  # noqa: E501
+
+# Responses are greedy decoded; we just check the end of
+# the generated text. If the lora is inactive, this model
+# generates commentary on the transcription.
+RESPONSE_SUFFIX_WITH_LORA = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go."  # noqa: E501
+RESPONSE_SUFFIX_WITHOUT_LORA = "Certainly! Here is the transcription of the audio you provided:\n\nThe first words I spoke in the original phonograph record: A little piece of practical poetry. Mary had a little lamb; its fleece was white as snow, and everywhere that Mary went, the lamb was sure to go."  # noqa: E501
+
+# Baseline keyword arguments for the vllm_runner fixture; individual tests
+# layer their own overrides (e.g. default_mm_loras) on top via run_test().
+VLLM_RUNNER_BASE_KWARGS = {
+    "model_name": MODEL_PATH,
+    "dtype": "half",
+    # A real boolean rather than the string "True", which only worked
+    # because any non-empty string is truthy.
+    "enable_lora": True,
+    "max_num_seqs": 2,
+    "max_lora_rank": 320,
+    "max_model_len": 12800,
+    "gpu_memory_utilization": 0.8,
+    "limit_mm_per_prompt": {
+        "audio": 1
+    },
+    "enforce_eager": True,
+}
+
+
+def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
+             **kwargs):
+    """Greedily decode the audio prompt and check how the output ends.
+
+    Args:
+        vllm_runner: Callable used as a context manager that yields the
+            model wrapper; it receives the merged runner kwargs.
+        audio_assets: Indexable collection of audio assets; only the first
+            entry's ``audio_and_sample_rate[0]`` is used.
+        lora_request: Forwarded unchanged to ``generate_greedy``.
+        expected_suffix: String the final generated text must end with.
+        **kwargs: Overrides applied on top of ``VLLM_RUNNER_BASE_KWARGS``.
+    """
+    first_audio = audio_assets[0].audio_and_sample_rate[0]
+    batches = [([AUDIO_PROMPT], [first_audio])]
+
+    # Caller-supplied kwargs win over the shared defaults.
+    runner_kwargs = dict(VLLM_RUNNER_BASE_KWARGS)
+    runner_kwargs.update(kwargs)
+
+    with vllm_runner(**runner_kwargs) as vllm_model:
+        batch_outputs = []
+        for prompts, audios in batches:
+            batch_outputs.append(
+                vllm_model.generate_greedy(
+                    prompts,
+                    max_tokens=128,
+                    audios=audios,
+                    lora_request=lora_request,
+                ))
+
+        # Last batch -> last output -> last field; presumably the generated
+        # text (confirm against VllmRunner.generate_greedy).
+        final_text = batch_outputs[-1][-1][-1]
+        assert final_text.endswith(expected_suffix)
+
+
+def test_active_default_mm_lora(
+    vllm_runner: type[VllmRunner],
+    audio_assets: AudioTestAssets,
+):
+    """Check that a default audio lora is used without a lora_request.
+
+    The speech lora is registered under the "audio" modality and no
+    explicit request is passed; the output must end with the with-lora
+    suffix.
+    """
+    default_loras = {"audio": AUDIO_LORA_PATH}
+    run_test(
+        vllm_runner=vllm_runner,
+        audio_assets=audio_assets,
+        lora_request=None,
+        expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
+        default_mm_loras=default_loras,
+    )
+
+
+def test_inactive_default_mm_lora(
+    vllm_runner: type[VllmRunner],
+    audio_assets: AudioTestAssets,
+):
+    """Check that default loras are filtered by modality.
+
+    Only an "image" default lora is registered while the prompt carries
+    audio only, so the output must end with the without-lora suffix.
+    """
+    default_loras = {"image": IMAGE_LORA_PATH}
+    run_test(
+        vllm_runner=vllm_runner,
+        audio_assets=audio_assets,
+        lora_request=None,
+        expected_suffix=RESPONSE_SUFFIX_WITHOUT_LORA,
+        default_mm_loras=default_loras,
+    )
+
+
+def test_default_mm_lora_succeeds_with_redundant_lora_request(
+    vllm_runner: type[VllmRunner],
+    audio_assets: AudioTestAssets,
+):
+    """Check that passing the default lora explicitly as well still works.
+
+    The same speech lora path is supplied both as the "audio" default and
+    as an explicit lora_request; the with-lora suffix is expected.
+    """
+    explicit_request = LoRARequest("audio", 1, AUDIO_LORA_PATH)
+    default_loras = {"audio": AUDIO_LORA_PATH}
+    run_test(
+        vllm_runner=vllm_runner,
+        audio_assets=audio_assets,
+        lora_request=explicit_request,
+        expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
+        default_mm_loras=default_loras,
+    )
+
+
+def test_default_mm_lora_fails_with_overridden_lora_request(
+    vllm_runner: type[VllmRunner],
+    audio_assets: AudioTestAssets,
+):
+    """Check that an explicit lora_request wins over a conflicting default.
+
+    The "audio" default deliberately points at the vision lora, while the
+    explicit request carries the speech lora; the with-lora suffix is
+    expected.
+
+    NOTE(review): despite "fails" in the name, this test expects generation
+    to succeed using the explicit request.
+    """
+    explicit_request = LoRARequest("speech", 2, AUDIO_LORA_PATH)
+    conflicting_defaults = {"audio": IMAGE_LORA_PATH}
+    run_test(
+        vllm_runner=vllm_runner,
+        audio_assets=audio_assets,
+        lora_request=explicit_request,
+        expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
+        default_mm_loras=conflicting_defaults,
+    )
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -170,7 +170,8 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
            f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
            MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
            str(tp_size), "serialize", "--serialized-directory",
-            str(tmp_path), "--suffix", suffix
+            str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
+            '{"limit_cpu_concurrency": 4}'
        ],
                                check=True,
                                capture_output=True,
@@ -185,27 +186,26 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,

    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
-    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir

-    loaded_vllm_model = LLM(model=model_ref,
-                            load_format="tensorizer",
-                            enable_lora=True,
-                            enforce_eager=True,
-                            model_loader_extra_config=tensorizer_config,
-                            max_num_seqs=13,
-                            tensor_parallel_size=2,
-                            max_loras=2)
+    loaded_llm = LLM(model=model_ref,
+                     load_format="tensorizer",
+                     enable_lora=True,
+                     enforce_eager=True,
+                     model_loader_extra_config=tensorizer_config,
+                     max_num_seqs=13,
+                     tensor_parallel_size=2,
+                     max_loras=2)

-    tensorizer_config_dict = tensorizer_config.to_dict()
+    tc_as_dict = tensorizer_config.to_serializable()

    print("lora adapter created")
-    assert do_sample(loaded_vllm_model,
+    assert do_sample(loaded_llm,
                     sql_lora_files,
-                     tensorizer_config_dict=tensorizer_config_dict,
+                     tensorizer_config_dict=tc_as_dict,
                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT

    print("lora 1")
-    assert do_sample(loaded_vllm_model,
+    assert do_sample(loaded_llm,
                     sql_lora_files,
-                     tensorizer_config_dict=tensorizer_config_dict,
+                     tensorizer_config_dict=tc_as_dict,
                     lora_id=1) == EXPECTED_LORA_OUTPUT