Commit 711aa9d5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.0' into v0.10.0-dev

parents 751c492c 6d8d0a24
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import torch
import vllm._custom_ops as ops
def per_token_cast_to_fp8(
x: torch.Tensor, block_size: int) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
pad_size = (block_size - (n % block_size)) % block_size
x = torch.nn.functional.pad(x,
(0, pad_size), value=0) if pad_size > 0 else x
x_view = x.view(m, -1, block_size)
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
def per_block_cast_to_fp8(
x: torch.Tensor, block_size_k: int,
block_size_n: int) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros(
(
int(math.ceil(m / block_size_k)) * block_size_k,
int(math.ceil(n / block_size_n)) * block_size_n,
),
dtype=x.dtype,
device=x.device,
)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, block_size_k,
x_padded.size(1) // block_size_k, block_size_n)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
return x_scaled_sub, scales
def make_non_quant_weights(
e: int,
n: int,
k: int,
dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Return weights w1, w2
"""
device = torch.cuda.current_device()
w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 15
w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 15
return w1, w2
def make_block_quant_fp8_weights(
e: int,
n: int,
k: int,
block_size: list[int],
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Return weights w1, w2, w1_scale, w2_scale
"""
dtype = torch.bfloat16
device = torch.cuda.current_device()
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
w1_bf16, w2_bf16 = make_non_quant_weights(e, n, k, dtype)
w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
block_n, block_k = block_size[0], block_size[1]
n_tiles_w1 = ((2 * n) + block_n - 1) // block_n
k_tiles_w1 = (k + block_k - 1) // block_k
n_tiles_w2 = (k + block_n - 1) // block_n
k_tiles_w2 = (n + block_k - 1) // block_k
w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn, device=device)
w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn, device=device)
w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1),
device=device,
dtype=torch.float32)
w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2),
device=device,
dtype=torch.float32)
assert w1_s.shape == (e, (2 * n + (block_n - 1)) // block_n,
(k + (block_k - 1)) // block_k)
assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2]
for i in range(e):
w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
block_size_k=block_k,
block_size_n=block_n)
w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
block_size_k=block_k,
block_size_n=block_n)
return w1, w2, w1_s, w2_s
def make_quant_fp8_weights(
e: int,
n: int,
k: int,
per_out_channel_quant: bool,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Return w1, w2, w1_scale, w2_scale
"""
q_dtype = torch.float8_e4m3fn
w1, w2 = make_non_quant_weights(e, n, k, dtype=torch.bfloat16)
# w1 -> w1_q, w2 -> w2_q
w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype)
w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype)
n_b_scales = 2 * n if per_out_channel_quant else 1
k_b_scales = k if per_out_channel_quant else 1
w1_scale = torch.empty((e, n_b_scales, 1),
device="cuda",
dtype=torch.float32)
w2_scale = torch.empty((e, k_b_scales, 1),
device="cuda",
dtype=torch.float32)
for expert in range(e):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
w1[expert], use_per_token_if_dynamic=per_out_channel_quant)
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
w2[expert], use_per_token_if_dynamic=per_out_channel_quant)
return w1_q, w2_q, w1_scale, w2_scale
......@@ -4,7 +4,6 @@
DeepEP test utilities
"""
import dataclasses
import importlib
import os
import traceback
from typing import Callable, Optional
......@@ -15,10 +14,9 @@ from torch.multiprocessing import (
spawn) # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec
from vllm.utils import get_open_port
from vllm.utils import get_open_port, has_deep_ep
has_deep_ep = importlib.util.find_spec("deep_ep") is not None
if has_deep_ep:
if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
DeepEPHTPrepareAndFinalize)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
......
......@@ -6,7 +6,6 @@ from typing import Optional
import pytest
import torch
import triton.language as tl
from tests.kernels.moe.utils import (batched_moe,
make_quantized_test_activations,
......@@ -18,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
invoke_moe_batched_triton_kernel)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.platforms import current_platform
from vllm.triton_utils import tl
MNK_FACTORS = [
(1, 128, 128),
......
......@@ -15,13 +15,13 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_topk, modular_triton_fused_moe)
from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
dg_available = False
try:
import deep_gemm
dg_available = True
except ImportError:
pass
dg_available = has_deep_gemm()
if dg_available:
from deep_gemm import get_m_alignment_for_contiguous_layout
if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
......@@ -224,6 +224,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
@pytest.mark.skipif(is_blackwell_deep_gemm_used(), reason="Not E8M0 scale MOE")
@torch.inference_mode()
def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
monkeypatch):
......@@ -238,8 +239,7 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
torch.manual_seed(seed)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
block_m = get_m_alignment_for_contiguous_layout()
block_size = [block_m, block_m]
dtype = torch.bfloat16
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests compute_expert_num_tokens kernels
"""
import dataclasses
from typing import Optional
import pytest
import torch
from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
@dataclasses.dataclass
class TestTensors:
topk_ids: torch.Tensor
expert_map: Optional[torch.Tensor] = None
def to_device(self, device: str):
self.topk_ids = self.topk_ids.to(device=device)
if self.expert_map is not None:
self.expert_map = self.expert_map.to(device=device)
@staticmethod
def make(num_tokens: int, num_topk: int, num_experts: int, device: str,
topk_ids_dtype: torch.dtype) -> "TestTensors":
# make topk ids
topk_ids = torch.empty((num_tokens, num_topk),
device=device,
dtype=torch.int64)
for x in range(num_tokens):
topk_ids[x] = torch.randperm(num_experts)[:num_topk]
topk_ids = topk_ids.to(dtype=torch.int64)
return TestTensors(topk_ids=topk_ids)
def with_ep_rank(self, ep_rank: int, num_global_experts: int,
num_local_experts: int, device: str):
# make an expert map
expert_map = torch.empty((num_global_experts),
device=device,
dtype=torch.int32)
expert_map.fill_(-1)
s = ep_rank * num_local_experts
e = s + num_local_experts
expert_map[s:e] = torch.tensor(list(range(num_local_experts)),
device=device)
return TestTensors(topk_ids=self.topk_ids.clone(),
expert_map=expert_map)
def ref_impl(tt: TestTensors, expert_num_tokens: torch.Tensor):
# do the reference in cpu
tt.to_device("cpu")
expert_ids, counts = tt.topk_ids.unique(return_counts=True)
for eid, count in zip(expert_ids, counts):
if eid != -1 and tt.expert_map is not None:
eid = tt.expert_map[eid]
if eid == -1:
continue
expert_num_tokens[eid] += count
def do_test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
num_experts: int, ep_size: int,
topk_ids_dtype: torch.dtype):
assert num_topk <= num_experts
tt = TestTensors.make(num_tokens,
num_topk,
num_experts,
topk_ids_dtype=topk_ids_dtype,
device="cpu")
num_global_experts = num_experts
assert num_global_experts % ep_size == 0
num_local_experts = num_global_experts // ep_size
for ep_rank in range(ep_size):
tt_rank = tt.with_ep_rank(ep_rank, num_global_experts,
num_local_experts, "cpu")
ref_expert_num_tokens = torch.zeros((num_local_experts),
device="cpu",
dtype=torch.int32)
ref_impl(tt_rank, ref_expert_num_tokens)
ref_expert_num_tokens = ref_expert_num_tokens.to("cuda")
tt_rank.to_device("cuda")
# Test with expert_map
triton_expert_num_tokens_w_emap = count_expert_num_tokens(
tt_rank.topk_ids, num_local_experts, tt_rank.expert_map)
# Test without expert map
topk_ids = tt_rank.expert_map[tt_rank.topk_ids].to(topk_ids_dtype)
triton_expert_num_tokens_wo_emap = count_expert_num_tokens(
topk_ids, num_local_experts, expert_map=None)
torch.testing.assert_close(ref_expert_num_tokens,
triton_expert_num_tokens_w_emap,
atol=0,
rtol=0)
torch.testing.assert_close(ref_expert_num_tokens,
triton_expert_num_tokens_wo_emap,
atol=0,
rtol=0)
@pytest.mark.parametrize(
"num_tokens", [1, 4, 8, 11, 19, 128, 127, 405, 1024, 3333, 6666, 7317])
@pytest.mark.parametrize("num_topk", [2, 6, 8])
@pytest.mark.parametrize("num_experts", [64])
@pytest.mark.parametrize("ep_size", [1, 2, 4])
@pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
def test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
num_experts: int, ep_size: int,
topk_ids_dtype: torch.dtype):
do_test_compute_expert_num_tokens(num_tokens, num_topk, num_experts,
ep_size, topk_ids_dtype)
@pytest.mark.parametrize("numel", list(range(1, 8192, 11)))
@pytest.mark.parametrize("num_experts", [32])
@pytest.mark.parametrize("ep_size", [2])
@pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
def test_compute_expert_num_tokens_from_numel(numel: int, num_experts: int,
ep_size: int,
topk_ids_dtype: torch.dtype):
do_test_compute_expert_num_tokens(num_tokens=numel,
num_topk=1,
num_experts=num_experts,
ep_size=ep_size,
topk_ids_dtype=topk_ids_dtype)
......@@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
from vllm.platforms import current_platform
from vllm.utils import has_deep_ep, has_deep_gemm
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_test_weights
......@@ -368,6 +369,8 @@ NUM_EXPERTS = [32]
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
@requires_deep_ep
@requires_deep_gemm
@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
reason="Skipping test for Blackwell DeepGEMM")
def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
topk: int, world_dp_size: tuple[int, int]):
"""
......@@ -423,6 +426,8 @@ USE_FP8_DISPATCH = [False]
@pytest.mark.parametrize("world_dp_size", [(2, 1)])
@requires_deep_ep
@requires_deep_gemm
@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
reason="Skipping test for Blackwell DeepGEMM")
def test_ll_deepep_deepgemm_moe(
mnk: tuple[int, int, int],
num_experts: int,
......
......@@ -15,46 +15,17 @@ import torch
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8)
from vllm.utils import cdiv
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8
has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None
if has_deep_gemm:
import deep_gemm
BLOCK_M = deep_gemm.get_m_alignment_for_contiguous_layout()
BLOCK_SIZE = [BLOCK_M, BLOCK_M]
BLOCK_SIZE = [128, 128]
requires_deep_gemm = pytest.mark.skipif(
not has_deep_gemm,
not has_deep_gemm(),
reason="Requires deep_gemm kernels",
)
def calc_diff(x: torch.Tensor, y: torch.Tensor):
x, y = x.double(), y.double()
denominator = (x * x + y * y).sum()
sim = 2 * (x * y).sum() / denominator
return 1 - sim
def per_block_cast_to_fp8(
x: torch.Tensor,
block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros(
(cdiv(m, 128) * 128, cdiv(n, block_size_n) * block_size_n),
dtype=x.dtype,
device=x.device)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
return x_scaled_sub, scales
def make_block_quant_fp8_weights(
e: int,
n: int,
......@@ -124,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)
# triton referrence
# triton reference
out_triton = fused_experts(
hidden_states=tokens_bf16,
w1=w1,
......@@ -155,17 +126,8 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
block_shape=block_size,
allow_deep_gemm=True,
)
base = out_triton.abs().mean()
atol = 0.1 * base.clamp(min=1e-2) # 10% of mean, but not lower than 1e-3
rtol = 0.05
# ----- Compare -----
torch.testing.assert_close(
out_deepgemm.to(torch.float32),
out_triton.to(torch.float32),
rtol=rtol,
atol=float(atol),
)
diff = calc_diff(out_deepgemm, out_triton)
assert diff < 0.001, f"Diff exceeded 1%: {diff}"
# Note: W1 has shape (E, 2N, K), so N = 512
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from itertools import product
from typing import Optional
import pytest
import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.config import VllmConfig, current_platform, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501
BatchedTritonOrDeepGemmExperts)
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedTritonExperts)
from vllm.model_executor.layers.fused_moe.layer import TritonExperts
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts)
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors,
reference_moe_impl,
run_modular_kernel)
from .modular_kernel_tools.mk_objects import (
MK_FUSED_EXPERT_TYPES, MK_MULTI_GPU_PREPARE_FINALIZE_TYPES,
MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES)
from .modular_kernel_tools.parallel_utils import (ProcessGroupInfo,
parallel_launch_with_config)
# TODO (varun): These requirements are very strict and could be relaxed.
has_all_packages = (has_deep_ep() and has_deep_gemm() and has_pplx())
meets_package_requirements = pytest.mark.skipif(
not has_all_packages,
reason="Requires deep_ep & deep_gemm & pplx packages",
)
def rank_worker(
pgi: ProcessGroupInfo,
vllm_config: VllmConfig,
cpu_group,
config: Config,
weights: WeightTensors,
):
current_platform.seed_everything(pgi.rank)
# sanity check
from vllm import envs
if config.fused_moe_chunk_size is not None:
assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE)
# get weights to this device
weights.to_current_device()
Ms = config.Ms
assert isinstance(Ms, list)
TOPKs = config.topks
assert isinstance(TOPKs, list)
for m, topk in product(Ms, TOPKs):
print(f"Running m={m}, topk={topk} ...")
# override m and topk
cfgx = copy.deepcopy(config)
cfgx.Ms = m
cfgx.topks = topk
# inputs for rank
rank_tensors = RankTensors.make(cfgx, pgi)
# modular kernel out
mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights,
rank_tensors)
with set_current_vllm_config(vllm_config):
ref_out = reference_moe_impl(cfgx, weights, rank_tensors)
torch.testing.assert_close(ref_out, mk_out, atol=3e-2, rtol=3e-2)
def run(config: Config):
assert config.is_valid()
print(f"Testing config \n{config.describe()} ...")
weights: WeightTensors = WeightTensors.make(config)
vllm_config, env_dict = config.make_env_data()
parallel_launch_with_config(config.world_size, rank_worker, vllm_config,
env_dict, config, weights)
Ms = [32, 64]
Ks = [7168] # hidden sizes
Ns = [2048]
TOPKs = [4, 1]
Es = [32]
DTYPEs = [torch.bfloat16]
FUSED_MOE_CHUNK_SIZEs = [None, 16]
def is_nyi_config(config: Config) -> bool:
# We know these configs to be legitimate. but still fail.
if (config.fused_experts_type in [
BatchedTritonExperts, BatchedTritonOrDeepGemmExperts,
TritonExperts, TritonOrDeepGemmExperts
]):
# The triton kernels expect both per-act-token-quant and
# per-out-ch-quant or neither.
unsupported_quant_config = ((config.is_per_act_token_quant +
config.is_per_out_ch_quant) == 1)
return unsupported_quant_config
# cutlass kernels dont support expert_maps yet.
return config.fused_experts_type == CutlassExpertsFp8
@pytest.mark.parametrize("k", Ks)
@pytest.mark.parametrize("n", Ns)
@pytest.mark.parametrize("e", Es)
@pytest.mark.parametrize("dtype", DTYPEs)
@pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS)
@pytest.mark.parametrize(
"combination",
product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
@pytest.mark.parametrize("world_size", [2])
@meets_package_requirements
def test_modular_kernel_combinations_multigpu(
k: int, n: int, e: int, dtype: torch.dtype,
quant_config: FusedMoEQuantConfig,
combination: tuple[mk.FusedMoEPrepareAndFinalize,
mk.FusedMoEPermuteExpertsUnpermute],
fused_moe_chunk_size: Optional[int], world_size: int):
config = Config(
Ms=Ms,
K=k,
N=n,
E=e,
topks=TOPKs,
dtype=dtype,
quant_config=quant_config,
prepare_finalize_type=combination[0],
fused_experts_type=combination[1],
fused_moe_chunk_size=fused_moe_chunk_size,
world_size=world_size,
)
if not config.is_valid():
pytest.skip(f"Tests config {config} is not valid. Skipping ...")
if is_nyi_config(config):
pytest.skip(f"Tests config {config} is nyi. Skipping ...")
print(f"{config.describe()}")
run(config)
@pytest.mark.parametrize("k", Ks)
@pytest.mark.parametrize("n", Ns)
@pytest.mark.parametrize("e", Es)
@pytest.mark.parametrize("dtype", DTYPEs)
@pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS)
@pytest.mark.parametrize(
"combination",
product(MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
@pytest.mark.parametrize("world_size", [1])
@meets_package_requirements
def test_modular_kernel_combinations_singlegpu(
k: int, n: int, e: int, dtype: torch.dtype,
quant_config: FusedMoEQuantConfig,
combination: tuple[mk.FusedMoEPrepareAndFinalize,
mk.FusedMoEPermuteExpertsUnpermute],
fused_moe_chunk_size: Optional[int], world_size: int):
config = Config(
Ms=Ms,
K=k,
N=n,
E=e,
topks=TOPKs,
dtype=dtype,
quant_config=quant_config,
prepare_finalize_type=combination[0],
fused_experts_type=combination[1],
fused_moe_chunk_size=fused_moe_chunk_size,
world_size=world_size,
)
if not config.is_valid():
pytest.skip(f"Tests config {config} is not valid. Skipping ...")
if is_nyi_config(config):
pytest.skip(f"Tests config {config} is nyi. Skipping ...")
run(config)
if __name__ == '__main__':
# Ability to test individual PrepareAndFinalize and FusedExperts combination
from .modular_kernel_tools.cli_args import (make_config,
make_config_arg_parser)
parser = make_config_arg_parser(description=(
"Run single prepare-finalize & fused-experts combination test"
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations " #noqa: E501
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
))
args = parser.parse_args()
config = make_config(args)
run(config)
......@@ -174,6 +174,7 @@ def test_fused_moe(
use_int8_w8a8=False,
use_int8_w8a16=False,
use_int4_w4a16=False,
use_mxfp4_w4a4=False,
per_act_token_quant=False,
block_shape=None)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
"""Tests for the MOE align block size function.
Run `pytest tests/kernels/moe/test_moe_align_block_size.py`.
"""
from typing import Optional
import pytest
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size_triton)
@pytest.mark.parametrize(
"block_size,num_tokens,topk,num_experts",
list(
itertools.product(
[32, 64, 128, 256], # block_size
[
1,
3,
7,
16,
256,
2256,
4096,
], # num_tokens
[1, 4, 16, 64], # topk
[64, 160, 256, 257, 260, 264], # num_experts
)),
)
def test_moe_align_block_size_compare_implementations(block_size, num_tokens,
topk, num_experts):
topk_ids = torch.stack([
torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
for _ in range(num_tokens)
])
moe_align_block_size)
from vllm.platforms import current_platform
from vllm.utils import round_up
NUM_TOKENS = [1, 3, 7, 16, 256, 2256, 4096]
NUM_EXPERTS = [32, 160, 256, 257, 512]
TOP_KS = [1, 2, 16, 32]
BLOCK_SIZES = [32, 64, 128, 256]
current_platform.seed_everything(0)
def _group_tokens_by_expert(
sorted_ids: torch.Tensor,
expert_ids: torch.Tensor,
block_size: int,
valid_length: int,
total_tokens: int,
) -> dict:
num_blocks = valid_length // block_size
expert_tokens: dict[int, list[int]] = {}
for block_idx in range(num_blocks):
expert_id = expert_ids[block_idx].item()
block_start = block_idx * block_size
block_end = min(block_start + block_size, valid_length)
block_tokens = sorted_ids[block_start:block_end]
valid_tokens = block_tokens[block_tokens < total_tokens]
if expert_id not in expert_tokens:
expert_tokens[expert_id] = []
expert_tokens[expert_id].extend(valid_tokens.tolist())
return expert_tokens
def _verify_expert_level_sorting(
actual_sorted_ids: torch.Tensor,
golden_sorted_ids: torch.Tensor,
expert_ids: torch.Tensor,
block_size: int,
valid_length: int,
total_tokens: int,
):
"""
Verify that actual_sorted_ids follows the correct expert-level sorting.
The kerne limplementation may or may not preserve original token order
in topk_ids in the final sorted_ids however this does not impact quality.
"""
# Group tokens by expert from the golden implementation
golden_expert_tokens = _group_tokens_by_expert(golden_sorted_ids,
expert_ids, block_size,
valid_length, total_tokens)
actual_expert_tokens = _group_tokens_by_expert(actual_sorted_ids,
expert_ids, block_size,
valid_length, total_tokens)
assert set(golden_expert_tokens.keys()) == set(
actual_expert_tokens.keys()), (
f"Expert IDs mismatch: golden={set(golden_expert_tokens.keys())}, "
f"actual={set(actual_expert_tokens.keys())}")
for expert_id in golden_expert_tokens:
golden_tokens = torch.tensor(golden_expert_tokens[expert_id],
device=actual_sorted_ids.device)
actual_tokens = torch.tensor(actual_expert_tokens[expert_id],
device=actual_sorted_ids.device)
assert torch.equal(
torch.sort(golden_tokens)[0],
torch.sort(actual_tokens)[0]), (
f"Expert {expert_id} token mismatch: "
f"golden={golden_expert_tokens[expert_id]}, "
f"actual={actual_expert_tokens[expert_id]}")
def torch_moe_align_block_size(
topk_ids: torch.Tensor,
block_size: int,
num_experts: int,
expert_map: Optional[torch.Tensor] = None,
pad_sorted_ids: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Golden torch implementation of moe_align_block_size.
This function aligns the token distribution across experts to be compatible
with block size for matrix multiplication by sorting tokens by expert and
padding to block boundaries.
"""
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
if pad_sorted_ids:
max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
flattened_token_indices = torch.arange(topk_ids.numel(),
device=topk_ids.device,
dtype=torch.int32)
flattened_expert_ids = topk_ids.flatten()
sorted_expert_ids, sort_indices = torch.sort(flattened_expert_ids,
stable=True)
sorted_token_indices = flattened_token_indices[sort_indices]
expert_token_counts = torch.zeros(num_experts,
dtype=torch.int64,
device=topk_ids.device)
for expert_id in range(num_experts):
mask = sorted_expert_ids == expert_id
expert_token_counts[expert_id] = mask.sum()
expert_padded_counts = torch.zeros(num_experts,
dtype=torch.int64,
device=topk_ids.device)
for expert_id in range(num_experts):
original_count = expert_token_counts[expert_id]
if original_count > 0:
expert_padded_counts[expert_id] = (
(original_count + block_size - 1) // block_size) * block_size
sorted_ids_cuda = torch.empty((max_num_tokens_padded, ),
dtype=torch.int32,
device=topk_ids.device)
sorted_ids_cuda.fill_(topk_ids.numel())
max_num_m_blocks = max_num_tokens_padded // block_size
expert_ids_cuda = torch.zeros((max_num_m_blocks, ),
dtype=torch.int32,
device=topk_ids.device)
num_tokens_post_pad_cuda = torch.empty((1),
dtype=torch.int32,
device=topk_ids.device)
sorted_ids_triton = torch.empty_like(sorted_ids_cuda)
sorted_ids_triton.fill_(topk_ids.numel())
expert_ids_triton = torch.zeros_like(expert_ids_cuda)
num_tokens_post_pad_triton = torch.empty_like(num_tokens_post_pad_cuda)
ops.moe_align_block_size(
topk_ids,
num_experts,
sorted_token_ids = torch.full(
(max_num_tokens_padded, ),
topk_ids.numel(),
dtype=torch.int32,
device=topk_ids.device,
)
max_num_blocks = (max_num_tokens_padded + block_size - 1) // block_size
expert_ids = torch.zeros(max_num_blocks,
dtype=torch.int32,
device=topk_ids.device)
current_pos = 0
current_block = 0
for expert_id in range(num_experts):
expert_mask = sorted_expert_ids == expert_id
expert_tokens = sorted_token_indices[expert_mask]
num_expert_tokens = expert_tokens.shape[0]
if num_expert_tokens > 0:
sorted_token_ids[current_pos:current_pos +
num_expert_tokens] = (expert_tokens)
expert_blocks_needed = expert_padded_counts[expert_id] // block_size
expert_ids[current_block:current_block +
expert_blocks_needed] = (expert_id)
current_pos += expert_padded_counts[expert_id]
current_block += expert_blocks_needed
total_padded_tokens = expert_padded_counts.sum()
num_tokens_post_pad = torch.tensor([total_padded_tokens],
dtype=torch.int32,
device=topk_ids.device)
if expert_map is not None:
expert_ids = expert_map[expert_ids]
return sorted_token_ids, expert_ids, num_tokens_post_pad
@pytest.mark.parametrize("m", NUM_TOKENS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("pad_sorted_ids", [False, True])
@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
def test_moe_align_block_size(m: int, topk: int, num_experts: int,
block_size: int, pad_sorted_ids: bool):
"""Test moe_align_block_size without expert mapping"""
topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32)
for i in range(m):
experts = torch.randperm(num_experts, device="cuda")[:topk]
topk_ids[i] = experts
actual_sorted_ids, actual_expert_ids, actual_num_tokens = (
moe_align_block_size(
topk_ids=topk_ids,
block_size=block_size,
num_experts=num_experts,
pad_sorted_ids=pad_sorted_ids,
))
golden_sorted_ids, golden_expert_ids, golden_num_tokens = (
torch_moe_align_block_size(
topk_ids=topk_ids,
block_size=block_size,
num_experts=num_experts,
pad_sorted_ids=pad_sorted_ids,
))
torch.testing.assert_close(actual_num_tokens,
golden_num_tokens,
atol=0,
rtol=0)
torch.testing.assert_close(actual_expert_ids,
golden_expert_ids,
atol=0,
rtol=0)
# For sorted_token_ids, verify block-level correctness rather than exact
# order Tokens within each expert's blocks can be in any order, but expert
# regions must be correct
_verify_expert_level_sorting(
actual_sorted_ids,
golden_sorted_ids,
actual_expert_ids,
block_size,
sorted_ids_cuda,
expert_ids_cuda,
num_tokens_post_pad_cuda,
actual_num_tokens.item(),
m * topk,
)
moe_align_block_size_triton(
topk_ids,
num_experts,
total_tokens = m * topk
assert actual_num_tokens.item() % block_size == 0, (
"num_tokens_post_pad should be divisible by block_size")
assert actual_num_tokens.item() >= total_tokens, (
"num_tokens_post_pad should be at least total_tokens")
valid_tokens = actual_sorted_ids[actual_sorted_ids < total_tokens]
assert len(valid_tokens) == total_tokens, (
f"Should have exactly {total_tokens} valid tokens, "
f"got {len(valid_tokens)}")
assert (actual_expert_ids >= 0).all() and (
actual_expert_ids
< num_experts).all(), "expert_ids should contain valid expert indices"
@pytest.mark.parametrize("m", [16, 32])
@pytest.mark.parametrize("topk", [2, 4])
@pytest.mark.parametrize("num_experts", [8])
@pytest.mark.parametrize("block_size", [64])
@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
def test_moe_align_block_size_with_expert_map(m: int, topk: int,
num_experts: int,
block_size: int):
"""Test moe_align_block_size with expert mapping (EP scenario)"""
topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32)
for i in range(m):
experts = torch.randperm(num_experts, device="cuda")[:topk]
topk_ids[i] = experts
expert_map = torch.full((num_experts, ),
-1,
device="cuda",
dtype=torch.int32)
local_experts = list(range(0, num_experts, 2))
for i, expert_id in enumerate(local_experts):
expert_map[expert_id] = i
actual_sorted_ids, actual_expert_ids, actual_num_tokens = (
moe_align_block_size(
topk_ids=topk_ids,
block_size=block_size,
num_experts=num_experts,
expert_map=expert_map,
))
golden_sorted_ids, golden_expert_ids, golden_num_tokens = (
torch_moe_align_block_size(
topk_ids=topk_ids,
block_size=block_size,
num_experts=num_experts,
expert_map=expert_map,
))
torch.testing.assert_close(actual_num_tokens,
golden_num_tokens,
atol=0,
rtol=0)
torch.testing.assert_close(actual_expert_ids,
golden_expert_ids,
atol=0,
rtol=0)
_verify_expert_level_sorting(
actual_sorted_ids,
golden_sorted_ids,
actual_expert_ids,
block_size,
sorted_ids_triton,
expert_ids_triton,
num_tokens_post_pad_triton,
actual_num_tokens.item(),
m * topk,
)
assert torch.allclose(expert_ids_cuda, expert_ids_triton), (
f"Expert IDs mismatch for block_size={block_size}, "
f"num_tokens={num_tokens}, topk={topk}\n"
f"CUDA expert_ids: {expert_ids_cuda}\n"
f"Triton expert_ids: {expert_ids_triton}")
assert torch.allclose(
num_tokens_post_pad_cuda, num_tokens_post_pad_triton), (
f"Num tokens post pad mismatch for block_size={block_size}, "
f"num_tokens={num_tokens}, topk={topk}\n"
f"CUDA num_tokens_post_pad: {num_tokens_post_pad_cuda}\n"
f"Triton num_tokens_post_pad: {num_tokens_post_pad_triton}")
def test_moe_align_block_size_deterministic():
m, topk, num_experts, block_size = 128, 2, 32, 64
torch.manual_seed(42)
topk_ids = torch.randint(0,
num_experts, (m, topk),
device="cuda",
dtype=torch.int32)
# expect the results to be reproducible
results = []
for _ in range(5):
sorted_ids, expert_ids, num_tokens = moe_align_block_size(
topk_ids=topk_ids, block_size=block_size, num_experts=num_experts)
results.append(
(sorted_ids.clone(), expert_ids.clone(), num_tokens.clone()))
if __name__ == "__main__":
pytest.main([__file__])
for i in range(1, len(results)):
assert torch.equal(
results[0][0],
results[i][0]), ("sorted_ids should be deterministic")
assert torch.equal(
results[0][1],
results[i][1]), ("expert_ids should be deterministic")
assert torch.equal(
results[0][2],
results[i][2]), ("num_tokens should be deterministic")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.metadata
from dataclasses import dataclass
import pytest
import torch
from packaging import version
QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
"quark") is not None and version.parse(
importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
@dataclass
class ModelCase:
model_id: str
tp: int
@pytest.mark.parametrize('model_case', [
ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1)
])
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
reason="amd-quark>=0.9 is not available")
def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
if torch.cuda.device_count() < model_case.tp:
pytest.skip(f"This test requires >={model_case.tp} gpus, got only "
f"{torch.cuda.device_count()}")
with vllm_runner(model_case.model_id,
tensor_parallel_size=model_case.tp,
load_format="dummy") as llm:
# TODO: llm.apply_model(check_model) currently relies on V0 internals.
# Re-enable this later.
# def check_model(model):
# layer = model.model.layers[0]
# qkv_proj = layer.self_attn.qkv_proj
# assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
# assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
# assert isinstance(layer.mlp.experts.quant_method,
# QuarkW4A4MXFp4MoEMethod)
# if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
# llm.apply_model(check_model)
output = llm.generate_greedy("Today I am in the French Alps and",
max_tokens=20)
assert output
\ No newline at end of file
......@@ -93,11 +93,11 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
a1_gscale=a1_gs,
w1_fp4=w1_q,
w1_blockscale=w1_blockscale,
w1_alphas=(1 / w1_gs),
g1_alphas=(1 / w1_gs),
a2_gscale=a2_gs,
w2_fp4=w2_q,
w2_blockscale=w2_blockscale,
w2_alphas=(1 / w2_gs),
g2_alphas=(1 / w2_gs),
topk_weights=topk_weights,
topk_ids=topk_ids,
m=m,
......
......@@ -32,6 +32,8 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel)
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate)
from vllm.platforms import current_platform
from vllm.utils import round_up
......@@ -371,6 +373,7 @@ def pplx_prepare_finalize(
chunk_topk_weight,
chunk_topk_ids,
False,
weight_and_reduce_impl=TopKWeightAndReduceDelegate(),
)
torch.cuda.synchronize()
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses
from math import prod
from typing import Optional
import pytest
......@@ -8,9 +9,12 @@ import torch
from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.cutlass_moe import (
cutlass_moe_fp8, run_cutlass_moe_fp8)
from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts,
fused_topk)
from vllm.model_executor.layers.fused_moe.utils import (
moe_kernel_quantize_input)
from vllm.platforms import current_platform
NUM_EXPERTS = [40, 64]
......@@ -21,6 +25,7 @@ MNK_FACTORS = [
(2, 1024, 1536),
(2, 3072, 1024),
(2, 3072, 1536),
(7, 3072, 1536),
(64, 1024, 1024),
(64, 1024, 1536),
(64, 3072, 1024),
......@@ -236,6 +241,7 @@ def test_cutlass_moe_8_bit_no_graph(
per_act_token: bool,
per_out_ch: bool,
monkeypatch,
ep_size: Optional[int] = None,
):
current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
......@@ -254,7 +260,13 @@ def test_cutlass_moe_8_bit_no_graph(
triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights,
topk_ids)
cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token)
if ep_size is not None:
assert e % ep_size == 0, "Cannot distribute experts evenly"
number_local_experts = e // ep_size
else:
number_local_experts = None
cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token,
number_local_experts)
# Note 5.5 only needed for larger problem sizes, 5 works ok for
# the rest.
......@@ -340,9 +352,62 @@ def test_cutlass_moe_8_bit_EP(
per_out_channel: bool,
ep_size: int,
monkeypatch,
):
test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token,
per_out_channel, monkeypatch, ep_size)
LARGE_MNK_FACTORS = [
(1, 8192, 5120, 31),
(32768, 1024, 1024, 16),
(65536, 512, 1024, 16),
]
@pytest.mark.parametrize("m,n,k,topk", LARGE_MNK_FACTORS)
@pytest.mark.parametrize("e", [128])
@pytest.mark.parametrize("per_act_token", [False])
@pytest.mark.parametrize("per_out_channel", [True])
@pytest.mark.parametrize("ep_size", [8])
@pytest.mark.skipif(
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
current_platform.get_device_capability()),
reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_moe_8_bit_EP_large(
m: int,
n: int,
k: int,
e: int,
topk: int,
per_act_token: bool,
per_out_channel: bool,
ep_size: int,
monkeypatch,
):
test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token,
per_out_channel, monkeypatch, ep_size)
@pytest.mark.parametrize("m,n,k,topk", [(1, 8192, 5120, 31)])
@pytest.mark.parametrize("e", [128])
@pytest.mark.parametrize("per_act_token", [False])
@pytest.mark.parametrize("per_out_channel", [True])
@pytest.mark.parametrize("ep_size", [8])
@pytest.mark.skipif(
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
current_platform.get_device_capability()),
reason="Grouped gemm is not supported on this GPU type.")
def test_run_cutlass_moe_fp8(
m: int,
n: int,
k: int,
e: int,
topk: int,
per_act_token: bool,
per_out_channel: bool,
ep_size: int,
):
current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config):
mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
per_out_channel)
......@@ -352,20 +417,53 @@ def test_cutlass_moe_8_bit_EP(
score,
topk,
renormalize=False)
# Note that we are using the dequantized versions of the tensors.
# Using a, w1 and w2 directly results in minor output differences.
triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights,
topk_ids)
assert e % ep_size == 0, "Cannot distribute experts evenly"
cutlass_output = run_8_bit(mt,
topk_weights,
topk_ids,
per_act_token,
num_local_experts=e // ep_size)
torch.testing.assert_close(triton_output,
cutlass_output,
atol=5e-2,
rtol=1e-2)
\ No newline at end of file
# we want to make sure there is at least one token that's generated in
# this expert shard and at least one token that's NOT generated in this
# expert shard
topk_ids[0][0] = -1
topk_ids[0][1] = 1
workspace13_shape = (m * topk, max(2 * n, k))
workspace2_shape = (m * topk, n)
output_shape = (m * topk, k)
workspace13 = torch.empty(prod(workspace13_shape),
device="cuda",
dtype=mt.a.dtype)
workspace2 = torch.empty(prod(workspace2_shape),
device="cuda",
dtype=mt.a.dtype)
num_local_experts = e // ep_size
start, end = 0, num_local_experts
expert_map = [-1] * e
expert_map[start:end] = list(range(num_local_experts))
expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda")
activation = lambda o, i: torch.ops._C.silu_and_mul(o, i)
a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale,
torch.float8_e4m3fn,
per_act_token)
global_num_experts = -1 if mt.w1_q is None else mt.w1_q.size(0)
func = lambda output: run_cutlass_moe_fp8(
output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation,
global_num_experts, expert_map, mt.w1_scale, mt.w2_scale,
a1q_scale, None, workspace13, workspace2, None, mt.a.dtype,
per_act_token, per_out_channel, False)
workspace13.random_()
output_random_workspace = torch.empty(output_shape,
device="cuda",
dtype=mt.a.dtype)
func(output_random_workspace)
workspace13.fill_(0)
output_zero_workspace = torch.zeros(output_shape,
device="cuda",
dtype=mt.a.dtype)
func(output_zero_workspace)
torch.testing.assert_close(output_random_workspace,
output_zero_workspace,
atol=5e-3,
rtol=1e-3)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
import pytest
import torch
from vllm.model_executor.layers.quantization.utils import fp8_utils
@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
@pytest.mark.parametrize("column_major", [False, True])
@pytest.mark.parametrize("scale_ue8m0", [False, True])
@pytest.mark.parametrize("group_size", [64, 128])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_per_token_group_quant_fp8(shape, column_major: bool,
scale_ue8m0: bool, group_size: int):
device = "cuda"
torch.manual_seed(42)
num_tokens, hidden_dim = shape
x = (torch.randn(
(num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8)
# cuda path
out_q, scale = fp8_utils.per_token_group_quant_fp8(
x,
group_size,
column_major_scales=column_major,
use_ue8m0=scale_ue8m0,
)
# triton ref
with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
ref_q, ref_s = fp8_utils.per_token_group_quant_fp8(
x,
group_size,
column_major_scales=column_major,
use_ue8m0=scale_ue8m0,
)
assert torch.allclose(out_q.float(), ref_q.float(), atol=0.15, rtol=0.15)
assert torch.allclose(scale, ref_s, atol=0.01, rtol=0.01)
......@@ -8,19 +8,14 @@ import pytest
import torch
from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
native_w8a8_block_matmul,
per_block_cast_to_fp8)
native_w8a8_block_matmul)
from vllm.config import VllmConfig
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8, w8a8_block_fp8_matmul)
get_col_major_tma_aligned_tensor, per_token_group_quant_fp8,
w8a8_block_fp8_matmul)
from vllm.platforms import current_platform
dg_available = False
try:
import deep_gemm
dg_available = True
except ImportError:
pass
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8
if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
......@@ -106,7 +101,8 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
@pytest.mark.parametrize(
"M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
@pytest.mark.skipif(not has_deep_gemm(),
reason="DeepGemm kernels not available.")
@torch.inference_mode()
def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
# only aligned sizes
......@@ -120,9 +116,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
_, block_k = block_size[0], block_size[1]
A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_k)
A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1])
B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32)
As = As_fp8.to(torch.float32)
......@@ -132,14 +126,14 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
out_dtype)
# Transpose earlier so that the testing will not trigger transposing kernels
As_fp8 = deep_gemm.get_col_major_tma_aligned_tensor(As_fp8)
As_fp8 = get_col_major_tma_aligned_tensor(As_fp8)
out = torch.zeros((M, N), device='cuda', dtype=out_dtype)
assert As_fp8.shape == (M, (K + 127) //
128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}"
deep_gemm.gemm_fp8_fp8_bf16_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out)
fp8_gemm_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out)
rel_diff = (torch.mean(
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
......
......@@ -1072,6 +1072,7 @@ def torch_experts(
quant_dtype: Optional[torch.dtype] = None,
per_act_token_quant=False,
block_shape: Optional[list[int]] = None,
apply_router_weights_on_input: bool = False,
) -> torch.Tensor:
assert (global_num_experts == -1
or (global_num_experts == w1.shape[0] and expert_map is None)
......@@ -1081,11 +1082,17 @@ def torch_experts(
M, K = a.shape
topk = topk_ids.shape[1]
if apply_router_weights_on_input:
assert topk == 1
a = a * topk_weight.to(a.dtype)
a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
a, a_scale = moe_kernel_quantize_input(a, None, quant_dtype,
if a1_scale:
assert not per_act_token_quant and block_shape is None
a, a_scale = moe_kernel_quantize_input(a, a1_scale, quant_dtype,
per_act_token_quant, block_shape)
num_experts = w1.shape[0]
......@@ -1104,6 +1111,7 @@ def torch_experts(
tmp2 = SiluAndMul()(tmp1)
out[mask] = tmp2 @ w2[i].transpose(0, 1)
elif block_shape is not None:
# block quantized
assert (a_scale is not None and w1_scale is not None
and w2_scale is not None)
tmp1 = native_w8a8_block_matmul(a[mask], w1[i], a_scale[mask],
......@@ -1121,15 +1129,27 @@ def torch_experts(
assert (a_scale is not None and w1_scale is not None
and w2_scale is not None)
scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
tmp1 = a[mask].to(f32) * scales
w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1)
tmp1 = tmp1 @ w1_dq
tmp2 = SiluAndMul()(tmp1)
tmp1 = (tmp1 @ w1_dq).to(out.dtype)
tmp2 = SiluAndMul()(tmp1).to(out.dtype)
tmp2, b_scale = moe_kernel_quantize_input(
tmp2, a2_scale, quant_dtype, per_act_token_quant,
block_shape)
assert b_scale is not None
tmp2 = tmp2.to(f32) * b_scale
w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
out[mask] = (tmp2 @ w2_dq).to(out.dtype)
return (out.view(M, -1, w2.shape[1]).to(f32) *
topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)
if apply_router_weights_on_input:
return out
else:
return (out.view(M, -1, w2.shape[1]).to(f32) *
topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)
def torch_moe(a: torch.Tensor,
......
......@@ -234,12 +234,6 @@ def qwen_lora_files():
return os.path.join(models_path_prefix, "customize/qwen-nl2dsl-lora")
@pytest.fixture(scope="session")
def long_context_lora_files_16k_1():
# return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1")
return os.path.join(models_path_prefix, "SangBinCho/long_context_16k_testing_1")
@pytest.fixture
def llama_2_7b_engine_extra_embeddings():
cleanup_dist_env_and_memory(shutdown_ray=True)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for applying default registered multimodal loras.
"""
import os
from huggingface_hub import snapshot_download
from vllm.lora.request import LoRARequest
from ..conftest import AudioTestAssets, VllmRunner
MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct")
AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora")
IMAGE_LORA_PATH = os.path.join(MODEL_PATH, "vision-lora")
AUDIO_PROMPT = "<|user|><|audio_1|>Can you transcribe this audio?<|end|><|assistant|>" # noqa: E501
# Responses are greedy decoded; we just check the end of
# the generated text. If the lora is inactive, this model
# generates commentary on the transcription.
RESPONSE_SUFFIX_WITH_LORA = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
RESPONSE_SUFFIX_WITHOUT_LORA = "Certainly! Here is the transcription of the audio you provided:\n\nThe first words I spoke in the original phonograph record: A little piece of practical poetry. Mary had a little lamb; its fleece was white as snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
VLLM_RUNNER_BASE_KWARGS = {
"model_name": MODEL_PATH,
"dtype": "half",
"enable_lora": "True",
"max_num_seqs": 2,
"max_lora_rank": 320,
"max_model_len": 12800,
"gpu_memory_utilization": 0.8,
"limit_mm_per_prompt": {
"audio": 1
},
"enforce_eager": True,
}
def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
**kwargs):
inputs = [([AUDIO_PROMPT], [audio_assets[0].audio_and_sample_rate[0]])]
# Apply any additional kwargs as overrides to the base kwargs
vllm_runner_kwargs = {**VLLM_RUNNER_BASE_KWARGS, **kwargs}
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
vllm_outputs_with_default_lora = [
vllm_model.generate_greedy(
prompts,
max_tokens=128,
audios=audios,
lora_request=lora_request,
) for prompts, audios in inputs
]
assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(
expected_suffix)
def test_active_default_mm_lora(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
"""Ensure that we can use the default audio lora."""
run_test(
vllm_runner,
audio_assets,
lora_request=None,
default_mm_loras={"audio": AUDIO_LORA_PATH},
expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
)
def test_inactive_default_mm_lora(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
"""Ensure that modalities are filtered properly."""
# Default image lora won't be active since we only pass audio
run_test(
vllm_runner,
audio_assets,
lora_request=None,
default_mm_loras={"image": IMAGE_LORA_PATH},
expected_suffix=RESPONSE_SUFFIX_WITHOUT_LORA,
)
def test_default_mm_lora_succeeds_with_redundant_lora_request(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
"""Ensure that redundantly providing the lora works."""
run_test(
vllm_runner,
audio_assets,
lora_request=LoRARequest("audio", 1, AUDIO_LORA_PATH),
default_mm_loras={"audio": AUDIO_LORA_PATH},
expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
)
def test_default_mm_lora_fails_with_overridden_lora_request(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
"""Ensure that if the lora_request conflicts with default_mm_loras,
we use the lora_request."""
run_test(
vllm_runner,
audio_assets,
lora_request=LoRARequest("speech", 2, AUDIO_LORA_PATH),
default_mm_loras={"audio": IMAGE_LORA_PATH},
expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
)
......@@ -170,7 +170,8 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
str(tp_size), "serialize", "--serialized-directory",
str(tmp_path), "--suffix", suffix
str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
'{"limit_cpu_concurrency": 4}'
],
check=True,
capture_output=True,
......@@ -185,27 +186,26 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
loaded_vllm_model = LLM(model=model_ref,
load_format="tensorizer",
enable_lora=True,
enforce_eager=True,
model_loader_extra_config=tensorizer_config,
max_num_seqs=13,
tensor_parallel_size=2,
max_loras=2)
loaded_llm = LLM(model=model_ref,
load_format="tensorizer",
enable_lora=True,
enforce_eager=True,
model_loader_extra_config=tensorizer_config,
max_num_seqs=13,
tensor_parallel_size=2,
max_loras=2)
tensorizer_config_dict = tensorizer_config.to_dict()
tc_as_dict = tensorizer_config.to_serializable()
print("lora adapter created")
assert do_sample(loaded_vllm_model,
assert do_sample(loaded_llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
tensorizer_config_dict=tc_as_dict,
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
print("lora 1")
assert do_sample(loaded_vllm_model,
assert do_sample(loaded_llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
tensorizer_config_dict=tc_as_dict,
lora_id=1) == EXPECTED_LORA_OUTPUT
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment