Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
...@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import ( ...@@ -34,6 +34,7 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import (
) )
from vllm.model_executor.layers.utils import shuffle_weight from vllm.model_executor.layers.utils import shuffle_weight
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
MNK = [ MNK = [
(1, 512, 384), (1, 512, 384),
...@@ -211,7 +212,7 @@ def test_oai_triton_moe( ...@@ -211,7 +212,7 @@ def test_oai_triton_moe(
unfused: bool, unfused: bool,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(0) set_random_seed(0)
( (
w1, w1,
w2, w2,
......
...@@ -60,10 +60,14 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w ...@@ -60,10 +60,14 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import quantize_w
from vllm.model_executor.models.mixtral import MixtralMoE from vllm.model_executor.models.mixtral import MixtralMoE
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager
NUM_EXPERTS = [8, 64, 192] NUM_EXPERTS = [8, 64, 192]
NUM_EXPERTS_LARGE = [128, 256]
EP_SIZE = [1, 4] EP_SIZE = [1, 4]
TOP_KS = [2, 6] TOP_KS = [2, 6]
TOP_KS_SMALL = [1, 2]
MOE_MARLIN_QUANT_TEST_CONFIGS = [ MOE_MARLIN_QUANT_TEST_CONFIGS = [
# AWQ-INT4 # AWQ-INT4
...@@ -131,6 +135,13 @@ FUSED_MOE_MNK_FACTORS = [ ...@@ -131,6 +135,13 @@ FUSED_MOE_MNK_FACTORS = [
(40000, 1024, 1024), (40000, 1024, 1024),
] ]
# (m, n, k) problem sizes whose m (row count of the input activations) is
# only 1 or 2; used to parametrize test_naive_block_assignment_moe.
FUSED_MOE_MNK_FACTORS_SMALL_M = [
(1, 128, 128),
(1, 2048, 128),
(2, 2048, 128),
(2, 2048, 511),
]
FUSED_MOE_WN16_MNK_FACTORS = [ FUSED_MOE_WN16_MNK_FACTORS = [
(1, 128, 128), (1, 128, 128),
(1, 1024, 1024), (1, 1024, 1024),
...@@ -233,7 +244,7 @@ def test_fused_moe( ...@@ -233,7 +244,7 @@ def test_fused_moe(
monkeypatch, monkeypatch,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size)) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
...@@ -328,6 +339,111 @@ def test_fused_moe( ...@@ -328,6 +339,111 @@ def test_fused_moe(
) )
@pytest.mark.parametrize("m,n,k", FUSED_MOE_MNK_FACTORS_SMALL_M)
@pytest.mark.parametrize("e", NUM_EXPERTS_LARGE)
@pytest.mark.parametrize("topk", TOP_KS_SMALL)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("padding", [True, False])
@pytest.mark.parametrize("chunk_size", [8192])
def test_naive_block_assignment_moe(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    dtype: torch.dtype,
    padding: bool,
    chunk_size: int,
    monkeypatch,
    workspace_init,
):
    """Run the fused MoE paths with very few tokens and many experts.

    A baseline is produced by ``run_moe_test(torch_moe, iterative_moe)`` and
    then handed, together with each implementation under test (``fused_moe``
    and the modular Triton kernel), back to ``run_moe_test``.
    NOTE(review): ``run_moe_test`` is defined outside this block; it
    presumably compares each output with the baseline -- confirm there.

    Args:
        m, n, k: Problem size; the input is ``(m, k)``, ``w1`` is
            ``(e, 2 * n, k)`` and ``w2`` is ``(e, k, n)``.
        e: Number of experts (also passed as ``global_num_experts``).
        topk: Number of experts selected per token.
        dtype: Element type of the activations, weights and router scores.
        padding: Forwarded unchanged to ``run_moe_test``.
        chunk_size: Value exported as ``VLLM_FUSED_MOE_CHUNK_SIZE``.
        monkeypatch: pytest fixture used to set the environment variable.
        workspace_init: Fixture requested for its side effects only; it is
            not referenced in the body.
    """
    # Seed through the same helper every other test in this module uses,
    # instead of the older current_platform.seed_everything entry point.
    set_random_seed(7)

    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))

    #
    # Setup test data
    #

    a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
    w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10

    score = torch.randn((m, e), device="cuda", dtype=dtype)

    # No expert map: every expert is treated as local.
    e_map = None

    #
    # Setup test functions
    #
    quant_config = FUSED_MOE_UNQUANTIZED_CONFIG

    m_fused_moe_fn = modular_triton_fused_moe(quant_config)

    def m_fused_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        score: torch.Tensor,
        topk: int,
        global_num_experts: int = -1,
        expert_map: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Select experts with ``fused_topk`` and run the modular kernel."""
        # The trailing False is presumably ``renormalize``, matching
        # ``renormalize=False`` on fused_moe_fn below -- TODO confirm.
        topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
        return m_fused_moe_fn(
            a,
            w1,
            w2,
            topk_weights,
            topk_ids,
            global_num_experts=global_num_experts,
            expert_map=expert_map,
        )

    fused_moe_fn = functools.partial(fused_moe, renormalize=False)

    #
    # Run tests
    #
    runner = functools.partial(
        run_moe_test,
        a=a,
        w1=w1,
        w2=w2,
        score=score,
        topk=topk,
        global_num_experts=e,
        expert_map=e_map,
        padding=padding,
    )

    # Note: for now use_compile will error out if the problem size is
    # large enough to trigger chunking. I'm leaving the flag and
    # setup code in case we are able to revisit this later.
    use_compile = False

    # NOTE: every k in FUSED_MOE_MNK_FACTORS_SMALL_M is currently below 1024,
    # so this is False for all parametrizations; the expression is kept so the
    # test picks up CUDA graphs automatically if larger sizes are added.
    use_cudagraph = n >= 1024 and k >= 1024 and current_platform.is_cuda_alike()

    with set_current_vllm_config(vllm_config):
        baseline_output = runner(torch_moe, iterative_moe)
        runner(
            baseline_output,
            fused_moe_fn,
            use_compile=use_compile,
            use_cudagraph=use_cudagraph,
        )
        runner(
            baseline_output,
            m_fused_moe,
            use_compile=use_compile,
            use_cudagraph=use_cudagraph,
        )
@pytest.mark.parametrize("m,n,k", FUSED_MOE_WN16_MNK_FACTORS) @pytest.mark.parametrize("m,n,k", FUSED_MOE_WN16_MNK_FACTORS)
@pytest.mark.parametrize("e", NUM_EXPERTS) @pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("topk", TOP_KS)
...@@ -466,7 +582,12 @@ def test_fused_moe_wn16( ...@@ -466,7 +582,12 @@ def test_fused_moe_wn16(
) )
@torch.inference_mode() @torch.inference_mode()
def test_mixtral_moe( def test_mixtral_moe(
dist_init, dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, monkeypatch default_vllm_config,
dist_init,
dtype: torch.dtype,
padding: bool,
use_rocm_aiter: bool,
monkeypatch,
): ):
"""Make sure our Mixtral MoE implementation agrees with the one from """Make sure our Mixtral MoE implementation agrees with the one from
huggingface.""" huggingface."""
...@@ -487,6 +608,7 @@ def test_mixtral_moe( ...@@ -487,6 +608,7 @@ def test_mixtral_moe(
monkeypatch.setenv("MASTER_ADDR", "localhost") monkeypatch.setenv("MASTER_ADDR", "localhost")
monkeypatch.setenv("MASTER_PORT", "12345") monkeypatch.setenv("MASTER_PORT", "12345")
init_distributed_environment() init_distributed_environment()
init_workspace_manager(torch.cuda.current_device())
# Instantiate our and huggingface's MoE blocks # Instantiate our and huggingface's MoE blocks
vllm_config.compilation_config.static_forward_context = dict() vllm_config.compilation_config.static_forward_context = dict()
...@@ -540,6 +662,11 @@ def test_mixtral_moe( ...@@ -540,6 +662,11 @@ def test_mixtral_moe(
torch.cuda.synchronize() torch.cuda.synchronize()
torch.cuda.empty_cache() torch.cuda.empty_cache()
# FIXME (zyongye) fix this after we move self.kernel
# assignment in FusedMoE.__init__
vllm_moe.experts.quant_method.process_weights_after_loading(vllm_moe.experts)
# Run forward passes for both MoE blocks # Run forward passes for both MoE blocks
hf_states, _ = hf_moe.forward(hf_inputs) hf_states, _ = hf_moe.forward(hf_inputs)
vllm_states = vllm_moe.forward(vllm_inputs) vllm_states = vllm_moe.forward(vllm_inputs)
......
...@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( ...@@ -14,12 +14,13 @@ from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import set_random_seed
NUM_TOKENS = [1, 3, 256, 2256, 4096] NUM_TOKENS = [1, 3, 256, 2256, 4096]
NUM_EXPERTS = [32, 160, 256, 257] NUM_EXPERTS = [32, 160, 256, 257]
TOP_KS = [1, 2, 16, 32] TOP_KS = [1, 2, 16, 32]
BLOCK_SIZES = [32, 128] BLOCK_SIZES = [32, 128]
current_platform.seed_everything(0) set_random_seed(0)
def _group_tokens_by_expert( def _group_tokens_by_expert(
......
...@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK ...@@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularK
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate, TopKWeightAndReduceDelegate,
) )
from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.worker.workspace import init_workspace_manager from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
...@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts( ...@@ -184,7 +184,7 @@ def test_fused_moe_batched_experts(
dtype: torch.dtype, dtype: torch.dtype,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10
...@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow( ...@@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow(
if per_act_token_quant and block_shape is not None: if per_act_token_quant and block_shape is not None:
pytest.skip("Skip illegal quantization combination") pytest.skip("Skip illegal quantization combination")
current_platform.seed_everything(7) set_random_seed(7)
m, n, k = mnk m, n, k = mnk
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
device = "cuda" device = "cuda"
...@@ -809,7 +809,7 @@ def test_pplx_moe_slow( ...@@ -809,7 +809,7 @@ def test_pplx_moe_slow(
block_shape: list[int] | None, block_shape: list[int] | None,
use_internode: bool, use_internode: bool,
): ):
current_platform.seed_everything(7) set_random_seed(7)
m, n, k = mnk m, n, k = mnk
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
...@@ -888,7 +888,7 @@ def _pplx_test_loop( ...@@ -888,7 +888,7 @@ def _pplx_test_loop(
new_vllm_config.parallel_config.enable_expert_parallel = True new_vllm_config.parallel_config.enable_expert_parallel = True
_set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank) _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank)
current_platform.seed_everything(7) set_random_seed(7)
combos = itertools.product( combos = itertools.product(
PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]] PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]]
) )
...@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize( ...@@ -982,7 +982,7 @@ def test_pplx_prepare_finalize(
world_dp_size: tuple[int, int], world_dp_size: tuple[int, int],
use_internode: bool, use_internode: bool,
): ):
current_platform.seed_everything(7) set_random_seed(7)
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
parallel_launch( parallel_launch(
world_size * dp_size, world_size * dp_size,
...@@ -1005,7 +1005,7 @@ def test_pplx_moe( ...@@ -1005,7 +1005,7 @@ def test_pplx_moe(
use_internode: bool, use_internode: bool,
use_shared_experts: bool, use_shared_experts: bool,
): ):
current_platform.seed_everything(7) set_random_seed(7)
world_size, dp_size = world_dp_size world_size, dp_size = world_dp_size
parallel_launch( parallel_launch(
world_size, world_size,
......
...@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( ...@@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import triton from vllm.triton_utils import triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
from vllm.utils.torch_utils import set_random_seed
FLOAT8_DTYPE = torch.float8_e4m3fn FLOAT8_DTYPE = torch.float8_e4m3fn
GROUP_SIZE = 128 GROUP_SIZE = 128
...@@ -67,8 +68,12 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten ...@@ -67,8 +68,12 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten
@pytest.mark.parametrize("T", [128, 256, 512]) @pytest.mark.parametrize("T", [128, 256, 512])
@pytest.mark.parametrize("N", [128 * 2, 256 * 2, 768 * 2, 2048 * 2, 7168 * 2]) @pytest.mark.parametrize("N", [128 * 2, 256 * 2, 768 * 2, 2048 * 2, 7168 * 2])
@pytest.mark.skipif(
current_platform.is_rocm(),
reason="ROCm does not support DeepGemm.",
)
def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int): def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int):
current_platform.seed_everything(42) set_random_seed(42)
input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda") input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda")
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for MoE with non-gated activations (*_no_mul).
These tests verify that MoE layers work correctly with activations like
silu_no_mul, gelu_no_mul, relu2_no_mul where the activation output dimension
equals N (not N // 2 like gated activations).
"""
import pytest
import torch
from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG,
)
from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.utils import (
GELU_NO_MUL,
RELU2_NO_MUL,
SILU_NO_MUL,
)
from vllm.platforms import current_platform
# Test parameters
# M_SIZES / N_SIZES / K_SIZES / TOPK_VALUES are the values swept by the
# "m", "n", "k" and "topk" parametrizations of
# test_triton_experts_no_mul_activation.
M_SIZES = [1, 16, 64]
N_SIZES = [128, 256]
K_SIZES = [64, 128]
TOPK_VALUES = [1, 2]
# Expert count used for the weights and passed as both the global and the
# local expert count in test_triton_experts_no_mul_activation.
NUM_EXPERTS = 8
# Non-gated activation identifiers exercised by the parametrized test.
NO_MUL_ACTIVATIONS = [SILU_NO_MUL, GELU_NO_MUL, RELU2_NO_MUL]
def make_test_tensors(
    m: int,
    n: int,
    k: int,
    num_experts: int,
    topk: int,
    dtype: torch.dtype = torch.bfloat16,
    device: str = "cuda",
):
    """Build random inputs for an MoE layer with a non-gated activation.

    Non-gated (*_no_mul) layout:
      - w1: (E, N, K) - maps K to N
      - w2: (E, K, N) - maps N back to K (N, not N // 2)

    Returns:
        ``(hidden_states, w1, w2, topk_weights, topk_ids)`` where
        ``hidden_states`` is ``(m, k)``, ``topk_weights`` is a float32
        ``(m, topk)`` tensor filled with ``1 / topk`` and ``topk_ids`` is an
        ``(m, topk)`` tensor of random expert indices in ``[0, num_experts)``.
    """
    float_opts = {"dtype": dtype, "device": device}

    # Random draws happen in the order hidden_states, w1, w2, topk_ids.
    tokens = torch.randn(m, k, **float_opts)

    # Weights are scaled by 0.1 after sampling.
    up_proj = torch.randn(num_experts, n, k, **float_opts) * 0.1
    down_proj = torch.randn(num_experts, k, n, **float_opts) * 0.1

    # Every selected expert gets the same routing weight.
    uniform_weights = torch.ones(m, topk, dtype=torch.float32, device=device) / topk
    expert_ids = torch.randint(0, num_experts, (m, topk), device=device)

    return tokens, up_proj, down_proj, uniform_weights, expert_ids
@pytest.mark.skipif(
    not current_platform.has_device_capability(80),
    reason="Requires compute capability >= 8.0",
)
@pytest.mark.parametrize("m", M_SIZES)
@pytest.mark.parametrize("n", N_SIZES)
@pytest.mark.parametrize("k", K_SIZES)
@pytest.mark.parametrize("topk", TOPK_VALUES)
@pytest.mark.parametrize("activation", NO_MUL_ACTIVATIONS)
@torch.inference_mode()
def test_triton_experts_no_mul_activation(
    m: int,
    n: int,
    k: int,
    topk: int,
    activation: str,
):
    """Check TritonExperts workspace sizing and a forward pass for *_no_mul.

    First asserts the shapes reported by ``workspace_shapes`` and then calls
    ``apply`` on freshly allocated buffers, verifying that the output has the
    expected shape, is finite and is not identically zero.
    """
    hidden_states, w1, w2, topk_weights, topk_ids = make_test_tensors(
        m, n, k, NUM_EXPERTS, topk
    )

    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)

    reported_shapes = experts.workspace_shapes(
        M=m,
        N=n,
        K=k,
        topk=topk,
        global_num_experts=NUM_EXPERTS,
        local_num_experts=NUM_EXPERTS,
        expert_tokens_meta=None,
        activation=activation,
    )
    ws1_shape, ws2_shape, out_shape = reported_shapes

    # Both workspaces are expected to be (m, topk, max(n, k)): with a no_mul
    # activation the activation output dimension is N rather than N // 2.
    expected_workspace = (m, topk, max(n, k))
    assert ws1_shape == expected_workspace, (
        f"workspace1 shape mismatch: expected {(m, topk, max(n, k))}, got {ws1_shape}"
    )
    assert ws2_shape == expected_workspace, (
        f"workspace2 shape mismatch: expected {(m, topk, max(n, k))}, got {ws2_shape}"
    )
    assert out_shape == (m, k), (
        f"output shape mismatch: expected {(m, k)}, got {out_shape}"
    )

    def _flat_workspace(shape):
        """Allocate an uninitialised 1-D buffer holding a 3-D workspace."""
        numel = shape[0] * shape[1] * shape[2]
        # new_empty inherits dtype and device from hidden_states.
        return hidden_states.new_empty(numel)

    workspace1 = _flat_workspace(ws1_shape)
    workspace2 = _flat_workspace(ws2_shape)
    output = hidden_states.new_zeros((m, k))

    experts.apply(
        output=output,
        hidden_states=hidden_states,
        w1=w1,
        w2=w2,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        activation=activation,
        global_num_experts=NUM_EXPERTS,
        expert_map=None,
        a1q_scale=None,
        a2_scale=None,
        workspace13=workspace1,
        workspace2=workspace2,
        expert_tokens_meta=None,
        apply_router_weight_on_input=False,
    )

    # Sanity checks only: shape, finiteness and a non-trivial result.
    assert output.shape == (m, k), f"Expected shape {(m, k)}, got {output.shape}"
    assert not output.isnan().any(), "Output contains NaN"
    assert not output.isinf().any(), "Output contains Inf"
    assert output.abs().sum() > 0, "Output is all zeros"
@pytest.mark.skipif(
    not current_platform.has_device_capability(80),
    reason="Requires compute capability >= 8.0",
)
@torch.inference_mode()
def test_workspace_shapes_no_mul_vs_gated():
    """Workspace sizing must follow the activation's output width.

    The last dimension of workspace1 is expected to be
    ``max(activation_out_dim, K)``, with ``activation_out_dim`` equal to N
    for a no_mul activation and N // 2 for a gated one. The output shape is
    expected to be ``(M, K)`` in both cases.
    """
    M, N, K, topk = 64, 256, 128, 2

    # Expected activation output widths for the two activation families.
    activation_out_dim_no_mul = N
    activation_out_dim_gated = N // 2

    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)

    def _shapes_for(activation):
        """Query workspace shapes with 8 global and 8 local experts."""
        return experts.workspace_shapes(M, N, K, topk, 8, 8, None, activation)

    ws1_no_mul, _, out_no_mul = _shapes_for(SILU_NO_MUL)
    ws1_gated, _, out_gated = _shapes_for("silu")

    assert ws1_no_mul[2] == max(activation_out_dim_no_mul, K), (
        f"no_mul workspace1 last dim should be max({activation_out_dim_no_mul}, {K})"
    )
    assert ws1_gated[2] == max(activation_out_dim_gated, K), (
        f"gated workspace1 last dim should be max({activation_out_dim_gated}, {K})"
    )

    # The activation kind must not influence the output shape.
    assert out_no_mul == out_gated == (M, K)
@pytest.mark.skipif(
    not current_platform.has_device_capability(80),
    reason="Requires compute capability >= 8.0",
)
@torch.inference_mode()
def test_adjust_n_for_activation():
    """adjust_N_for_activation halves N for gated activations only."""
    experts = TritonExperts(FUSED_MOE_UNQUANTIZED_CONFIG)
    N = 256

    # Gated activations: expected width is N // 2.
    for gated_activation in ("silu", "gelu"):
        assert experts.adjust_N_for_activation(N, gated_activation) == N // 2

    # Non-gated (*_no_mul) activations: expected width stays N.
    for plain_activation in (SILU_NO_MUL, GELU_NO_MUL, RELU2_NO_MUL):
        assert experts.adjust_N_for_activation(N, plain_activation) == N
...@@ -7,19 +7,25 @@ from math import prod ...@@ -7,19 +7,25 @@ from math import prod
import pytest import pytest
import torch import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FUSED_MOE_UNQUANTIZED_CONFIG, FUSED_MOE_UNQUANTIZED_CONFIG,
FusedMoEQuantConfig,
fp8_w8a8_moe_quant_config, fp8_w8a8_moe_quant_config,
) )
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( from vllm.model_executor.layers.fused_moe.cutlass_moe import (
cutlass_moe_fp8, CutlassExpertsFp8,
run_cutlass_moe_fp8, run_cutlass_moe_fp8,
) )
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
NUM_EXPERTS = [40, 64] NUM_EXPERTS = [40, 64]
TOP_KS = [6, 8] TOP_KS = [6, 8]
...@@ -149,16 +155,15 @@ class MOETensors8Bit(MOETensors): ...@@ -149,16 +155,15 @@ class MOETensors8Bit(MOETensors):
def run_with_expert_maps( def run_with_expert_maps(
num_experts: int, num_local_experts: int, **cutlass_moe_kwargs num_experts: int,
num_local_experts: int,
quant_config: FusedMoEQuantConfig,
**cutlass_moe_kwargs,
): ):
def slice_experts(): def slice_experts():
slice_params = [ slice_params = [
"w1_q", "w1",
"w2_q", "w2",
"ab_strides1",
"ab_strides2",
"c_strides1",
"c_strides2",
] ]
full_tensors = { full_tensors = {
k: v k: v
...@@ -166,8 +171,6 @@ def run_with_expert_maps( ...@@ -166,8 +171,6 @@ def run_with_expert_maps(
if k in slice_params and k in cutlass_moe_kwargs if k in slice_params and k in cutlass_moe_kwargs
} }
quant_config = cutlass_moe_kwargs["quant_config"]
for i in range(0, num_experts, num_local_experts): for i in range(0, num_experts, num_local_experts):
s, e = i, i + num_local_experts s, e = i, i + num_local_experts
...@@ -186,13 +189,23 @@ def run_with_expert_maps( ...@@ -186,13 +189,23 @@ def run_with_expert_maps(
new_quant_config._w1.scale = quant_config.w1_scale[s:e] new_quant_config._w1.scale = quant_config.w1_scale[s:e]
new_quant_config._w2.scale = quant_config.w2_scale[s:e] new_quant_config._w2.scale = quant_config.w2_scale[s:e]
cutlass_moe_kwargs["quant_config"] = new_quant_config yield cutlass_moe_kwargs, new_quant_config
yield cutlass_moe_kwargs out_tensor = torch.zeros_like(cutlass_moe_kwargs["hidden_states"])
for kwargs, new_quant_config in slice_experts():
out_tensor = torch.zeros_like(cutlass_moe_kwargs["a"]) kernel = mk.FusedMoEModularKernel(
for kwargs in slice_experts(): MoEPrepareAndFinalizeNoEP(),
out_tensor = out_tensor + cutlass_moe_fp8(**kwargs) CutlassExpertsFp8(
out_dtype=kwargs["hidden_states"].dtype,
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
e=kwargs["w2"].shape[0], # type: ignore[union-attr]
n=kwargs["w2"].shape[2], # type: ignore[union-attr]
k=kwargs["w2"].shape[1], # type: ignore[union-attr]
quant_config=new_quant_config,
device="cuda",
),
)
out_tensor = out_tensor + kernel(**kwargs)
return out_tensor return out_tensor
...@@ -229,27 +242,35 @@ def run_8_bit( ...@@ -229,27 +242,35 @@ def run_8_bit(
) )
kwargs = { kwargs = {
"a": moe_tensors.a, "hidden_states": moe_tensors.a,
"w1_q": moe_tensors.w1_q, # type: ignore[union-attr] "w1": moe_tensors.w1_q, # type: ignore[union-attr]
"w2_q": moe_tensors.w2_q, # type: ignore[union-attr] "w2": moe_tensors.w2_q, # type: ignore[union-attr]
"topk_weights": topk_weights, "topk_weights": topk_weights,
"topk_ids": topk_ids, "topk_ids": topk_ids,
"ab_strides1": moe_tensors.ab_strides1,
"ab_strides2": moe_tensors.ab_strides2,
"c_strides1": moe_tensors.c_strides1,
"c_strides2": moe_tensors.c_strides2,
"quant_config": quant_config,
} }
num_experts = moe_tensors.w1.size(0) num_experts = moe_tensors.w1.size(0)
with_ep = num_local_experts is not None or num_local_experts == num_experts with_ep = num_local_experts is not None or num_local_experts == num_experts
if not with_ep: if not with_ep:
return cutlass_moe_fp8(**kwargs) kernel = mk.FusedMoEModularKernel(
MoEPrepareAndFinalizeNoEP(),
CutlassExpertsFp8(
out_dtype=moe_tensors.a.dtype,
# NOTE(rob): w2 is shaped as [E, hidden, intermediate]
e=moe_tensors.w2_q.shape[0], # type: ignore[union-attr]
n=moe_tensors.w2_q.shape[2], # type: ignore[union-attr]
k=moe_tensors.w2_q.shape[1], # type: ignore[union-attr]
quant_config=quant_config,
device="cuda",
),
)
return kernel(**kwargs)
assert num_local_experts is not None assert num_local_experts is not None
return run_with_expert_maps( return run_with_expert_maps(
num_experts, num_experts,
num_local_experts, # type: ignore[arg-type] num_local_experts, # type: ignore[arg-type]
quant_config,
**kwargs, **kwargs,
) )
...@@ -277,7 +298,7 @@ def test_cutlass_moe_8_bit_no_graph( ...@@ -277,7 +298,7 @@ def test_cutlass_moe_8_bit_no_graph(
workspace_init, workspace_init,
ep_size: int | None = None, ep_size: int | None = None,
): ):
current_platform.seed_everything(7) set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch)
...@@ -332,7 +353,7 @@ def test_cutlass_moe_8_bit_cuda_graph( ...@@ -332,7 +353,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
monkeypatch, monkeypatch,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
dtype = torch.half dtype = torch.half
...@@ -469,7 +490,7 @@ def test_run_cutlass_moe_fp8( ...@@ -469,7 +490,7 @@ def test_run_cutlass_moe_fp8(
ep_size: int, ep_size: int,
workspace_init, workspace_init,
): ):
current_platform.seed_everything(7) set_random_seed(7)
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
mt = MOETensors8Bit.make_moe_tensors_8bit( mt = MOETensors8Bit.make_moe_tensors_8bit(
m, k, n, e, per_act_token, per_out_channel m, k, n, e, per_act_token, per_out_channel
......
...@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( ...@@ -17,11 +17,12 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
moe_unpermute, moe_unpermute,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
NUM_EXPERTS = [16, 64, 256] NUM_EXPERTS = [16, 64, 256]
TOP_KS = [2, 6, 8] TOP_KS = [2, 6, 8]
EP_SIZE = [1, 4, 16] EP_SIZE = [1, 4, 16]
current_platform.seed_everything(0) set_random_seed(0)
if current_platform.is_rocm(): if current_platform.is_rocm():
pytest.skip( pytest.skip(
...@@ -226,7 +227,7 @@ def test_moe_permute_unpermute( ...@@ -226,7 +227,7 @@ def test_moe_permute_unpermute(
n_local_expert, expert_map, _ = determine_expert_map(ep_size, ep_rank, n_expert) n_local_expert, expert_map, _ = determine_expert_map(ep_size, ep_rank, n_expert)
expert_map = expert_map.cuda() expert_map = expert_map.cuda()
start_expert = n_local_expert * ep_rank start_expert = n_local_expert * ep_rank
current_platform.seed_everything(0) set_random_seed(0)
hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype) hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype)
gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype) gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype)
topk_weights, topk_ids, token_expert_indices = fused_topk( topk_weights, topk_ids, token_expert_indices = fused_topk(
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import pytest import pytest
import torch import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from tests.kernels.moe.utils import make_test_weights from tests.kernels.moe.utils import make_test_weights
from tests.kernels.quantization.nvfp4_utils import ( from tests.kernels.quantization.nvfp4_utils import (
FLOAT4_E2M1_MAX, FLOAT4_E2M1_MAX,
...@@ -13,9 +14,15 @@ from tests.kernels.utils import torch_moe ...@@ -13,9 +14,15 @@ from tests.kernels.utils import torch_moe
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.cutlass_moe import (
CutlassExpertsFp4,
)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
MoEPrepareAndFinalizeNoEP,
)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -42,7 +49,7 @@ MNK_FACTORS = [ ...@@ -42,7 +49,7 @@ MNK_FACTORS = [
def test_cutlass_fp4_moe_no_graph( def test_cutlass_fp4_moe_no_graph(
m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init
): ):
current_platform.seed_everything(7) set_random_seed(7)
with set_current_vllm_config( with set_current_vllm_config(
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
): ):
...@@ -82,17 +89,21 @@ def test_cutlass_fp4_moe_no_graph( ...@@ -82,17 +89,21 @@ def test_cutlass_fp4_moe_no_graph(
w2_scale=w2_blockscale, w2_scale=w2_blockscale,
) )
cutlass_output = cutlass_moe_fp4( kernel = mk.FusedMoEModularKernel(
a=a, MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
w1_fp4=w1_q, CutlassExpertsFp4(
w2_fp4=w2_q, out_dtype=dtype,
max_experts_per_worker=e,
quant_config=quant_config,
),
)
cutlass_output = kernel(
hidden_states=a,
w1=w1_q,
w2=w2_q,
topk_weights=topk_weights, topk_weights=topk_weights,
topk_ids=topk_ids, topk_ids=topk_ids,
quant_config=quant_config,
m=m,
n=n,
k=k,
e=e,
) )
# Reference check: # Reference check:
......
...@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk ...@@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.math_utils import cdiv from vllm.utils.math_utils import cdiv
from vllm.utils.torch_utils import set_random_seed
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch from .parallel_utils import ProcessGroupInfo, parallel_launch
...@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx( ...@@ -290,7 +291,7 @@ def test_cutlass_moe_pplx(
world_dp_size: tuple[int, int], world_dp_size: tuple[int, int],
use_internode: bool, use_internode: bool,
): ):
current_platform.seed_everything(7) set_random_seed(7)
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
dtype = torch.half dtype = torch.half
......
...@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( ...@@ -13,6 +13,7 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm
from vllm.utils.math_utils import cdiv, round_up from vllm.utils.math_utils import cdiv, round_up
from vllm.utils.torch_utils import set_random_seed
if current_platform.is_fp8_fnuz(): if current_platform.is_fp8_fnuz():
pytest.skip( pytest.skip(
...@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert): ...@@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert):
@torch.inference_mode() @torch.inference_mode()
def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype): def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype):
group_size = 128 group_size = 128
current_platform.seed_everything(42) set_random_seed(42)
tokens_per_expert = torch.randint( tokens_per_expert = torch.randint(
low=0, low=0,
......
...@@ -4,13 +4,13 @@ ...@@ -4,13 +4,13 @@
import torch import torch
from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast from vllm.model_executor.layers.quantization.utils.quant_utils import (
get_fp8_min_max,
group_broadcast,
)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm.
ROCM_FP8FNUZ_MAX = 224.0
FP8_DTYPE = current_platform.fp8_dtype() FP8_DTYPE = current_platform.fp8_dtype()
...@@ -25,16 +25,12 @@ def ref_dynamic_per_token_quant( ...@@ -25,16 +25,12 @@ def ref_dynamic_per_token_quant(
if scale_ub is not None: if scale_ub is not None:
assert quant_dtype == FP8_DTYPE assert quant_dtype == FP8_DTYPE
qtype_traits = ( if quant_dtype == torch.int8:
torch.iinfo(quant_dtype) qtype_traits = torch.iinfo(quant_dtype)
if quant_dtype == torch.int8 qtype_traits_min = qtype_traits.min
else torch.finfo(quant_dtype) qtype_traits_max = qtype_traits.max
) else:
use_fp8fnuz = ( qtype_traits_min, qtype_traits_max = get_fp8_min_max()
current_platform.is_fp8_fnuz() and quant_dtype == current_platform.fp8_dtype()
)
qtype_traits_max = ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.max
qtype_traits_min = -ROCM_FP8FNUZ_MAX if use_fp8fnuz else qtype_traits.min
qtype_max = as_float32_tensor(qtype_traits_max) qtype_max = as_float32_tensor(qtype_traits_max)
s_1 = as_float32_tensor(1.0) s_1 = as_float32_tensor(1.0)
s_512 = as_float32_tensor(512.0) s_512 = as_float32_tensor(512.0)
...@@ -72,17 +68,7 @@ def ref_dynamic_per_token_quant( ...@@ -72,17 +68,7 @@ def ref_dynamic_per_token_quant(
def ref_dynamic_per_tensor_fp8_quant( def ref_dynamic_per_tensor_fp8_quant(
x: torch.Tensor, x: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
fp8_traits = torch.finfo(FP8_DTYPE) fp8_traits_min, fp8_traits_max = get_fp8_min_max()
fp8_traits_max = (
ROCM_FP8FNUZ_MAX
if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
else fp8_traits.max
)
fp8_traits_min = (
-ROCM_FP8FNUZ_MAX
if current_platform.is_rocm() and current_platform.is_fp8_fnuz()
else fp8_traits.min
)
fp8_max = as_float32_tensor(fp8_traits_max) fp8_max = as_float32_tensor(fp8_traits_max)
one = as_float32_tensor(1.0) one = as_float32_tensor(1.0)
......
...@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import ( ...@@ -13,7 +13,7 @@ from vllm.model_executor.layers.quantization.awq_triton import (
awq_dequantize_triton, awq_dequantize_triton,
awq_gemm_triton, awq_gemm_triton,
) )
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
device = "cuda" device = "cuda"
...@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): ...@@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size):
zeros_cols = qweight_cols zeros_cols = qweight_cols
zeros_dtype = torch.int32 zeros_dtype = torch.int32
current_platform.seed_everything(0) set_random_seed(0)
qweight = torch.randint( qweight = torch.randint(
0, 0,
...@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size): ...@@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size):
qzeros_rows = scales_rows qzeros_rows = scales_rows
qzeros_cols = qweight_cols qzeros_cols = qweight_cols
current_platform.seed_everything(0) set_random_seed(0)
input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device) input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device)
qweight = torch.randint( qweight = torch.randint(
......
...@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ...@@ -17,6 +17,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types from vllm.scalar_type import ScalarType, scalar_types
from vllm.utils.torch_utils import set_random_seed
IS_SUPPORTED_BY_GPU = ( IS_SUPPORTED_BY_GPU = (
current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9 current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9
...@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor: ...@@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor:
@pytest.mark.parametrize("random_zero", [True, False]) @pytest.mark.parametrize("random_zero", [True, False])
def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero): def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
num_experts, N, K = shape num_experts, N, K = shape
current_platform.seed_everything(42) set_random_seed(42)
setup = make_moe_test_setup( setup = make_moe_test_setup(
num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero
) )
...@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module): ...@@ -308,7 +309,7 @@ class W4A8MoELayer(torch.nn.Module):
reason="W4A8 Grouped GEMM is not supported on this GPU type.", reason="W4A8 Grouped GEMM is not supported on this GPU type.",
) )
def test_cutlass_w4a8_moe_mm_cuda_graph(): def test_cutlass_w4a8_moe_mm_cuda_graph():
current_platform.seed_everything(42) set_random_seed(42)
# Fixed config for CUDA graph test (single parameter point). # Fixed config for CUDA graph test (single parameter point).
num_experts = 8 num_experts = 8
K = 512 K = 512
......
...@@ -12,6 +12,7 @@ from nvfp4_utils import ( ...@@ -12,6 +12,7 @@ from nvfp4_utils import (
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm( ...@@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm(
if backend == "trtllm" and dtype == torch.float16: if backend == "trtllm" and dtype == torch.float16:
pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations") pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")
current_platform.seed_everything(seed) set_random_seed(seed)
m, n, packed_k = shape m, n, packed_k = shape
k = packed_k * 2 k = packed_k * 2
block_size = 16 block_size = 16
......
...@@ -6,6 +6,7 @@ import torch ...@@ -6,6 +6,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm( ...@@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm(
device: str, device: str,
autotune: bool, autotune: bool,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
m, n, k = shape m, n, k = shape
a = torch.randn((m, k), dtype=dtype, device=device) a = torch.randn((m, k), dtype=dtype, device=device)
b = torch.randn((n, k), dtype=dtype, device=device) / k b = torch.randn((n, k), dtype=dtype, device=device) / k
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for the get_fp8_min_max() helper function.
These tests verify the FP8 min/max value logic for both standard
and fnuz (ROCm MI300) dtype handling.
"""
from unittest.mock import patch
import pytest
import torch
from vllm.model_executor.layers.quantization.utils.quant_utils import (
get_fp8_min_max,
)
class TestGetFp8MinMax:
    """Checks on the ``(min, max)`` pair returned by ``get_fp8_min_max()``.

    Each test replaces the ``current_platform`` object referenced by the
    ``quant_utils`` module with a mock, so the result is driven only by the
    mocked ``is_fp8_fnuz()`` / ``fp8_dtype()`` return values.
    """

    # Dotted path of the platform object that every test swaps for a mock.
    _PLATFORM_TARGET = (
        "vllm.model_executor.layers.quantization.utils.quant_utils"
        ".current_platform"
    )

    @patch(_PLATFORM_TARGET)
    def test_standard_fp8_platform(self, mock_platform):
        """A standard (non-fnuz) FP8 platform reports PyTorch's finfo range."""
        mock_platform.configure_mock(
            **{
                "is_fp8_fnuz.return_value": False,
                "fp8_dtype.return_value": torch.float8_e4m3fn,
            }
        )
        finfo = torch.finfo(torch.float8_e4m3fn)

        fp8_min, fp8_max = get_fp8_min_max()

        # For e4m3fn the finfo maximum is 448.0.
        assert fp8_max == finfo.max, f"Expected finfo.max={finfo.max}, got {fp8_max}"
        assert fp8_min == finfo.min, f"Expected finfo.min={finfo.min}, got {fp8_min}"

    @patch(_PLATFORM_TARGET)
    def test_fnuz_platform_returns_224(self, mock_platform):
        """An fnuz platform reports the +/-224.0 range."""
        mock_platform.configure_mock(**{"is_fp8_fnuz.return_value": True})

        fp8_min, fp8_max = get_fp8_min_max()

        # The fnuz path is expected to yield 224.0 rather than 240.0.
        assert fp8_max == 224.0, f"Expected 224.0 for fnuz platform, got {fp8_max}"
        assert fp8_min == -224.0, f"Expected -224.0 for fnuz platform, got {fp8_min}"

    @patch(_PLATFORM_TARGET)
    def test_non_fnuz_platform_uses_finfo(self, mock_platform):
        """When ``is_fp8_fnuz()`` is False the finfo limits are returned."""
        mock_platform.configure_mock(
            **{
                "is_fp8_fnuz.return_value": False,
                "fp8_dtype.return_value": torch.float8_e4m3fn,
            }
        )
        finfo = torch.finfo(torch.float8_e4m3fn)

        fp8_min, fp8_max = get_fp8_min_max()

        assert fp8_max == finfo.max, (
            f"Non-fnuz platform should use finfo.max={finfo.max}, got {fp8_max}"
        )
        assert fp8_min == finfo.min, (
            f"Non-fnuz platform should use finfo.min={finfo.min}, got {fp8_min}"
        )
if __name__ == "__main__":
    # Run directly as a script: hand this file to pytest in verbose mode.
    _pytest_args = [__file__, "-v"]
    pytest.main(_pytest_args)
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -23,14 +23,19 @@ from vllm.platforms import current_platform ...@@ -23,14 +23,19 @@ from vllm.platforms import current_platform
@pytest.mark.parametrize("use_ue8m0", [True, False]) @pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode() @torch.inference_mode()
def test_quantfp8_group_functionality( def test_quantfp8_group_functionality(
batch_size: int, hidden_dim: int, group_size: int, seed: int, use_ue8m0: bool default_vllm_config,
batch_size: int,
hidden_dim: int,
group_size: int,
seed: int,
use_ue8m0: bool,
) -> None: ) -> None:
"""Test QuantFP8 group quantization with various configurations. """Test QuantFP8 group quantization with various configurations.
Tests both CUDA and native implementations, column-major scales, Tests both CUDA and native implementations, column-major scales,
and verifies consistency between implementations. and verifies consistency between implementations.
""" """
current_platform.seed_everything(seed) set_random_seed(seed)
x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8 x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8
expected_num_groups = (hidden_dim + group_size - 1) // group_size expected_num_groups = (hidden_dim + group_size - 1) // group_size
...@@ -82,8 +87,10 @@ def test_quantfp8_group_functionality( ...@@ -82,8 +87,10 @@ def test_quantfp8_group_functionality(
@pytest.mark.parametrize("seed", [42]) @pytest.mark.parametrize("seed", [42])
@pytest.mark.parametrize("use_ue8m0", [True, False]) @pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode() @torch.inference_mode()
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: def test_quantfp8_group_multidimensional(
current_platform.seed_everything(seed) default_vllm_config, seed: int, use_ue8m0: bool
) -> None:
set_random_seed(seed)
group_size = 64 group_size = 64
...@@ -135,8 +142,8 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: ...@@ -135,8 +142,8 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
@pytest.mark.parametrize("seed", [42]) @pytest.mark.parametrize("seed", [42])
@torch.inference_mode() @torch.inference_mode()
def test_quantfp8_group_edge_cases(seed: int) -> None: def test_quantfp8_group_edge_cases(default_vllm_config, seed: int) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
batch_size = 16 batch_size = 16
group_size = 64 group_size = 64
......
...@@ -12,8 +12,8 @@ from huggingface_hub import snapshot_download ...@@ -12,8 +12,8 @@ from huggingface_hub import snapshot_download
import vllm._custom_ops as ops import vllm._custom_ops as ops
from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
from vllm.platforms import current_platform
from ...utils import models_path_prefix from ...utils import models_path_prefix
from vllm.utils.torch_utils import set_random_seed
# GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") # GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
# GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample") # GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
...@@ -95,7 +95,7 @@ def test_dequantize( ...@@ -95,7 +95,7 @@ def test_dequantize(
@pytest.mark.parametrize("quant_type", QUANT_TYPES) @pytest.mark.parametrize("quant_type", QUANT_TYPES)
@torch.inference_mode() @torch.inference_mode()
def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType):
current_platform.seed_everything(0) set_random_seed(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type) tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") x = torch.rand((1, hidden_size), dtype=dtype, device="cuda")
...@@ -138,7 +138,7 @@ def test_mmq( ...@@ -138,7 +138,7 @@ def test_mmq(
dtype: torch.dtype, dtype: torch.dtype,
quant_type: GGMLQuantizationType, quant_type: GGMLQuantizationType,
): ):
current_platform.seed_everything(0) set_random_seed(0)
tensors = get_gguf_sample_tensors(hidden_size, quant_type) tensors = get_gguf_sample_tensors(hidden_size, quant_type)
x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda")
...@@ -173,7 +173,7 @@ def test_moe( ...@@ -173,7 +173,7 @@ def test_moe(
quant_type: GGMLQuantizationType, quant_type: GGMLQuantizationType,
top_k: int, top_k: int,
): ):
current_platform.seed_everything(0) set_random_seed(0)
H, E = 1024, 256 H, E = 1024, 256
x = torch.rand((num_tokens, H), dtype=dtype, device="cuda") x = torch.rand((num_tokens, H), dtype=dtype, device="cuda")
......
...@@ -107,7 +107,7 @@ SEEDS = [0] ...@@ -107,7 +107,7 @@ SEEDS = [0]
itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS), itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
) )
@torch.inference_mode() @torch.inference_mode()
def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): def test_w8a8_fp8_fused_moe(default_vllm_config, M, N, K, E, topk, dtype, seed):
torch.manual_seed(seed) torch.manual_seed(seed)
# Initialize int8 quantization parameters # Initialize int8 quantization parameters
factor_for_scale = 1e-2 factor_for_scale = 1e-2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment