Commit 711aa9d5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.0' into v0.10.0-dev

parents 751c492c 6d8d0a24
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
import torch
import vllm._custom_ops as ops
def per_token_cast_to_fp8(
x: torch.Tensor, block_size: int) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
pad_size = (block_size - (n % block_size)) % block_size
x = torch.nn.functional.pad(x,
(0, pad_size), value=0) if pad_size > 0 else x
x_view = x.view(m, -1, block_size)
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
fp8_data = (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
return fp8_data.view(m, n + pad_size)[:, :n], (x_amax / 448.0).view(m, -1)
def per_block_cast_to_fp8(
x: torch.Tensor, block_size_k: int,
block_size_n: int) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros(
(
int(math.ceil(m / block_size_k)) * block_size_k,
int(math.ceil(n / block_size_n)) * block_size_n,
),
dtype=x.dtype,
device=x.device,
)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, block_size_k,
x_padded.size(1) // block_size_k, block_size_n)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
return x_scaled_sub, scales
def make_non_quant_weights(
e: int,
n: int,
k: int,
dtype: torch.dtype,
) -> tuple[torch.Tensor, torch.Tensor]:
"""
Return weights w1, w2
"""
device = torch.cuda.current_device()
w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 15
w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 15
return w1, w2
def make_block_quant_fp8_weights(
e: int,
n: int,
k: int,
block_size: list[int],
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Return weights w1, w2, w1_scale, w2_scale
"""
dtype = torch.bfloat16
device = torch.cuda.current_device()
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max, fp8_min = fp8_info.max, fp8_info.min
w1_bf16, w2_bf16 = make_non_quant_weights(e, n, k, dtype)
w1_bf16 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
w2_bf16 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(dtype=dtype)
block_n, block_k = block_size[0], block_size[1]
n_tiles_w1 = ((2 * n) + block_n - 1) // block_n
k_tiles_w1 = (k + block_k - 1) // block_k
n_tiles_w2 = (k + block_n - 1) // block_n
k_tiles_w2 = (n + block_k - 1) // block_k
w1 = torch.empty_like(w1_bf16, dtype=torch.float8_e4m3fn, device=device)
w2 = torch.empty_like(w2_bf16, dtype=torch.float8_e4m3fn, device=device)
w1_s = torch.empty((e, n_tiles_w1, k_tiles_w1),
device=device,
dtype=torch.float32)
w2_s = torch.empty((e, n_tiles_w2, k_tiles_w2),
device=device,
dtype=torch.float32)
assert w1_s.shape == (e, (2 * n + (block_n - 1)) // block_n,
(k + (block_k - 1)) // block_k)
assert (w2.shape[-2] + block_n - 1) // block_n == w2_s.shape[-2]
for i in range(e):
w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i],
block_size_k=block_k,
block_size_n=block_n)
w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i],
block_size_k=block_k,
block_size_n=block_n)
return w1, w2, w1_s, w2_s
def make_quant_fp8_weights(
e: int,
n: int,
k: int,
per_out_channel_quant: bool,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Return w1, w2, w1_scale, w2_scale
"""
q_dtype = torch.float8_e4m3fn
w1, w2 = make_non_quant_weights(e, n, k, dtype=torch.bfloat16)
# w1 -> w1_q, w2 -> w2_q
w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype)
w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype)
n_b_scales = 2 * n if per_out_channel_quant else 1
k_b_scales = k if per_out_channel_quant else 1
w1_scale = torch.empty((e, n_b_scales, 1),
device="cuda",
dtype=torch.float32)
w2_scale = torch.empty((e, k_b_scales, 1),
device="cuda",
dtype=torch.float32)
for expert in range(e):
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(
w1[expert], use_per_token_if_dynamic=per_out_channel_quant)
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(
w2[expert], use_per_token_if_dynamic=per_out_channel_quant)
return w1_q, w2_q, w1_scale, w2_scale
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
DeepEP test utilities DeepEP test utilities
""" """
import dataclasses import dataclasses
import importlib
import os import os
import traceback import traceback
from typing import Callable, Optional from typing import Callable, Optional
...@@ -15,10 +14,9 @@ from torch.multiprocessing import ( ...@@ -15,10 +14,9 @@ from torch.multiprocessing import (
spawn) # pyright: ignore[reportPrivateImportUsage] spawn) # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec from typing_extensions import Concatenate, ParamSpec
from vllm.utils import get_open_port from vllm.utils import get_open_port, has_deep_ep
has_deep_ep = importlib.util.find_spec("deep_ep") is not None if has_deep_ep():
if has_deep_ep:
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
DeepEPHTPrepareAndFinalize) DeepEPHTPrepareAndFinalize)
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501
......
...@@ -6,7 +6,6 @@ from typing import Optional ...@@ -6,7 +6,6 @@ from typing import Optional
import pytest import pytest
import torch import torch
import triton.language as tl
from tests.kernels.moe.utils import (batched_moe, from tests.kernels.moe.utils import (batched_moe,
make_quantized_test_activations, make_quantized_test_activations,
...@@ -18,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( ...@@ -18,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
invoke_moe_batched_triton_kernel) invoke_moe_batched_triton_kernel)
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.triton_utils import tl
MNK_FACTORS = [ MNK_FACTORS = [
(1, 128, 128), (1, 128, 128),
......
...@@ -15,13 +15,13 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( ...@@ -15,13 +15,13 @@ from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
from vllm.model_executor.layers.fused_moe.fused_moe import ( from vllm.model_executor.layers.fused_moe.fused_moe import (
fused_topk, modular_triton_fused_moe) fused_topk, modular_triton_fused_moe)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
dg_available = False dg_available = has_deep_gemm()
try:
import deep_gemm if dg_available:
dg_available = True from deep_gemm import get_m_alignment_for_contiguous_layout
except ImportError:
pass
if current_platform.get_device_capability() < (9, 0): if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
...@@ -224,6 +224,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed, ...@@ -224,6 +224,7 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
@pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
@pytest.mark.skipif(is_blackwell_deep_gemm_used(), reason="Not E8M0 scale MOE")
@torch.inference_mode() @torch.inference_mode()
def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
monkeypatch): monkeypatch):
...@@ -238,8 +239,7 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, ...@@ -238,8 +239,7 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
torch.manual_seed(seed) torch.manual_seed(seed)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size)) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size))
block_m = get_m_alignment_for_contiguous_layout()
block_m = deep_gemm.get_m_alignment_for_contiguous_layout()
block_size = [block_m, block_m] block_size = [block_m, block_m]
dtype = torch.bfloat16 dtype = torch.bfloat16
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests compute_expert_num_tokens kernels
"""
import dataclasses
from typing import Optional
import pytest
import torch
from vllm.model_executor.layers.fused_moe.utils import count_expert_num_tokens
@dataclasses.dataclass
class TestTensors:
topk_ids: torch.Tensor
expert_map: Optional[torch.Tensor] = None
def to_device(self, device: str):
self.topk_ids = self.topk_ids.to(device=device)
if self.expert_map is not None:
self.expert_map = self.expert_map.to(device=device)
@staticmethod
def make(num_tokens: int, num_topk: int, num_experts: int, device: str,
topk_ids_dtype: torch.dtype) -> "TestTensors":
# make topk ids
topk_ids = torch.empty((num_tokens, num_topk),
device=device,
dtype=torch.int64)
for x in range(num_tokens):
topk_ids[x] = torch.randperm(num_experts)[:num_topk]
topk_ids = topk_ids.to(dtype=torch.int64)
return TestTensors(topk_ids=topk_ids)
def with_ep_rank(self, ep_rank: int, num_global_experts: int,
num_local_experts: int, device: str):
# make an expert map
expert_map = torch.empty((num_global_experts),
device=device,
dtype=torch.int32)
expert_map.fill_(-1)
s = ep_rank * num_local_experts
e = s + num_local_experts
expert_map[s:e] = torch.tensor(list(range(num_local_experts)),
device=device)
return TestTensors(topk_ids=self.topk_ids.clone(),
expert_map=expert_map)
def ref_impl(tt: TestTensors, expert_num_tokens: torch.Tensor):
# do the reference in cpu
tt.to_device("cpu")
expert_ids, counts = tt.topk_ids.unique(return_counts=True)
for eid, count in zip(expert_ids, counts):
if eid != -1 and tt.expert_map is not None:
eid = tt.expert_map[eid]
if eid == -1:
continue
expert_num_tokens[eid] += count
def do_test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
num_experts: int, ep_size: int,
topk_ids_dtype: torch.dtype):
assert num_topk <= num_experts
tt = TestTensors.make(num_tokens,
num_topk,
num_experts,
topk_ids_dtype=topk_ids_dtype,
device="cpu")
num_global_experts = num_experts
assert num_global_experts % ep_size == 0
num_local_experts = num_global_experts // ep_size
for ep_rank in range(ep_size):
tt_rank = tt.with_ep_rank(ep_rank, num_global_experts,
num_local_experts, "cpu")
ref_expert_num_tokens = torch.zeros((num_local_experts),
device="cpu",
dtype=torch.int32)
ref_impl(tt_rank, ref_expert_num_tokens)
ref_expert_num_tokens = ref_expert_num_tokens.to("cuda")
tt_rank.to_device("cuda")
# Test with expert_map
triton_expert_num_tokens_w_emap = count_expert_num_tokens(
tt_rank.topk_ids, num_local_experts, tt_rank.expert_map)
# Test without expert map
topk_ids = tt_rank.expert_map[tt_rank.topk_ids].to(topk_ids_dtype)
triton_expert_num_tokens_wo_emap = count_expert_num_tokens(
topk_ids, num_local_experts, expert_map=None)
torch.testing.assert_close(ref_expert_num_tokens,
triton_expert_num_tokens_w_emap,
atol=0,
rtol=0)
torch.testing.assert_close(ref_expert_num_tokens,
triton_expert_num_tokens_wo_emap,
atol=0,
rtol=0)
@pytest.mark.parametrize(
"num_tokens", [1, 4, 8, 11, 19, 128, 127, 405, 1024, 3333, 6666, 7317])
@pytest.mark.parametrize("num_topk", [2, 6, 8])
@pytest.mark.parametrize("num_experts", [64])
@pytest.mark.parametrize("ep_size", [1, 2, 4])
@pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
def test_compute_expert_num_tokens(num_tokens: int, num_topk: int,
num_experts: int, ep_size: int,
topk_ids_dtype: torch.dtype):
do_test_compute_expert_num_tokens(num_tokens, num_topk, num_experts,
ep_size, topk_ids_dtype)
@pytest.mark.parametrize("numel", list(range(1, 8192, 11)))
@pytest.mark.parametrize("num_experts", [32])
@pytest.mark.parametrize("ep_size", [2])
@pytest.mark.parametrize("topk_ids_dtype", [torch.int64])
def test_compute_expert_num_tokens_from_numel(numel: int, num_experts: int,
ep_size: int,
topk_ids_dtype: torch.dtype):
do_test_compute_expert_num_tokens(num_tokens=numel,
num_topk=1,
num_experts=num_experts,
ep_size=ep_size,
topk_ids_dtype=topk_ids_dtype)
...@@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import ( ...@@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel) FusedMoEModularKernel)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import has_deep_ep, has_deep_gemm from vllm.utils import has_deep_ep, has_deep_gemm
from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used
from .parallel_utils import ProcessGroupInfo, parallel_launch from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_test_weights from .utils import make_test_weights
...@@ -368,6 +369,8 @@ NUM_EXPERTS = [32] ...@@ -368,6 +369,8 @@ NUM_EXPERTS = [32]
@pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("world_dp_size", [(2, 1)])
@requires_deep_ep @requires_deep_ep
@requires_deep_gemm @requires_deep_gemm
@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
reason="Skipping test for Blackwell DeepGEMM")
def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int, def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
topk: int, world_dp_size: tuple[int, int]): topk: int, world_dp_size: tuple[int, int]):
""" """
...@@ -423,6 +426,8 @@ USE_FP8_DISPATCH = [False] ...@@ -423,6 +426,8 @@ USE_FP8_DISPATCH = [False]
@pytest.mark.parametrize("world_dp_size", [(2, 1)]) @pytest.mark.parametrize("world_dp_size", [(2, 1)])
@requires_deep_ep @requires_deep_ep
@requires_deep_gemm @requires_deep_gemm
@pytest.mark.skipif(is_blackwell_deep_gemm_used(),
reason="Skipping test for Blackwell DeepGEMM")
def test_ll_deepep_deepgemm_moe( def test_ll_deepep_deepgemm_moe(
mnk: tuple[int, int, int], mnk: tuple[int, int, int],
num_experts: int, num_experts: int,
......
...@@ -15,46 +15,17 @@ import torch ...@@ -15,46 +15,17 @@ import torch
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8) per_token_group_quant_fp8)
from vllm.utils import cdiv from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import calc_diff, per_block_cast_to_fp8
has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None BLOCK_SIZE = [128, 128]
if has_deep_gemm:
import deep_gemm
BLOCK_M = deep_gemm.get_m_alignment_for_contiguous_layout()
BLOCK_SIZE = [BLOCK_M, BLOCK_M]
requires_deep_gemm = pytest.mark.skipif( requires_deep_gemm = pytest.mark.skipif(
not has_deep_gemm, not has_deep_gemm(),
reason="Requires deep_gemm kernels", reason="Requires deep_gemm kernels",
) )
def calc_diff(x: torch.Tensor, y: torch.Tensor):
x, y = x.double(), y.double()
denominator = (x * x + y * y).sum()
sim = 2 * (x * y).sum() / denominator
return 1 - sim
def per_block_cast_to_fp8(
x: torch.Tensor,
block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros(
(cdiv(m, 128) * 128, cdiv(n, block_size_n) * block_size_n),
dtype=x.dtype,
device=x.device)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
return x_scaled_sub, scales
def make_block_quant_fp8_weights( def make_block_quant_fp8_weights(
e: int, e: int,
n: int, n: int,
...@@ -124,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size): ...@@ -124,7 +95,7 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1) topk_weights, topk_ids = torch.topk(router_logits, k=topk, dim=-1)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1) topk_weights = torch.nn.functional.softmax(topk_weights, dim=-1)
# triton referrence # triton reference
out_triton = fused_experts( out_triton = fused_experts(
hidden_states=tokens_bf16, hidden_states=tokens_bf16,
w1=w1, w1=w1,
...@@ -155,17 +126,8 @@ def run_single_case(m, n, k, topk, num_experts, block_size): ...@@ -155,17 +126,8 @@ def run_single_case(m, n, k, topk, num_experts, block_size):
block_shape=block_size, block_shape=block_size,
allow_deep_gemm=True, allow_deep_gemm=True,
) )
diff = calc_diff(out_deepgemm, out_triton)
base = out_triton.abs().mean() assert diff < 0.001, f"Diff exceeded 1%: {diff}"
atol = 0.1 * base.clamp(min=1e-2) # 10% of mean, but not lower than 1e-3
rtol = 0.05
# ----- Compare -----
torch.testing.assert_close(
out_deepgemm.to(torch.float32),
out_triton.to(torch.float32),
rtol=rtol,
atol=float(atol),
)
# Note: W1 has shape (E, 2N, K), so N = 512 # Note: W1 has shape (E, 2N, K), so N = 512
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import copy
from itertools import product
from typing import Optional
import pytest
import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.config import VllmConfig, current_platform, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501
BatchedTritonOrDeepGemmExperts)
from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
BatchedTritonExperts)
from vllm.model_executor.layers.fused_moe.layer import TritonExperts
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
TritonOrDeepGemmExperts)
from vllm.utils import has_deep_ep, has_deep_gemm, has_pplx
from .modular_kernel_tools.common import (Config, RankTensors, WeightTensors,
reference_moe_impl,
run_modular_kernel)
from .modular_kernel_tools.mk_objects import (
MK_FUSED_EXPERT_TYPES, MK_MULTI_GPU_PREPARE_FINALIZE_TYPES,
MK_QUANT_CONFIGS, MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES)
from .modular_kernel_tools.parallel_utils import (ProcessGroupInfo,
parallel_launch_with_config)
# TODO (varun): These requirements are very strict and could be relaxed.
has_all_packages = (has_deep_ep() and has_deep_gemm() and has_pplx())
meets_package_requirements = pytest.mark.skipif(
not has_all_packages,
reason="Requires deep_ep & deep_gemm & pplx packages",
)
def rank_worker(
pgi: ProcessGroupInfo,
vllm_config: VllmConfig,
cpu_group,
config: Config,
weights: WeightTensors,
):
current_platform.seed_everything(pgi.rank)
# sanity check
from vllm import envs
if config.fused_moe_chunk_size is not None:
assert (config.fused_moe_chunk_size == envs.VLLM_FUSED_MOE_CHUNK_SIZE)
# get weights to this device
weights.to_current_device()
Ms = config.Ms
assert isinstance(Ms, list)
TOPKs = config.topks
assert isinstance(TOPKs, list)
for m, topk in product(Ms, TOPKs):
print(f"Running m={m}, topk={topk} ...")
# override m and topk
cfgx = copy.deepcopy(config)
cfgx.Ms = m
cfgx.topks = topk
# inputs for rank
rank_tensors = RankTensors.make(cfgx, pgi)
# modular kernel out
mk_out = run_modular_kernel(pgi, vllm_config, cfgx, weights,
rank_tensors)
with set_current_vllm_config(vllm_config):
ref_out = reference_moe_impl(cfgx, weights, rank_tensors)
torch.testing.assert_close(ref_out, mk_out, atol=3e-2, rtol=3e-2)
def run(config: Config):
assert config.is_valid()
print(f"Testing config \n{config.describe()} ...")
weights: WeightTensors = WeightTensors.make(config)
vllm_config, env_dict = config.make_env_data()
parallel_launch_with_config(config.world_size, rank_worker, vllm_config,
env_dict, config, weights)
Ms = [32, 64]
Ks = [7168] # hidden sizes
Ns = [2048]
TOPKs = [4, 1]
Es = [32]
DTYPEs = [torch.bfloat16]
FUSED_MOE_CHUNK_SIZEs = [None, 16]
def is_nyi_config(config: Config) -> bool:
# We know these configs to be legitimate. but still fail.
if (config.fused_experts_type in [
BatchedTritonExperts, BatchedTritonOrDeepGemmExperts,
TritonExperts, TritonOrDeepGemmExperts
]):
# The triton kernels expect both per-act-token-quant and
# per-out-ch-quant or neither.
unsupported_quant_config = ((config.is_per_act_token_quant +
config.is_per_out_ch_quant) == 1)
return unsupported_quant_config
# cutlass kernels dont support expert_maps yet.
return config.fused_experts_type == CutlassExpertsFp8
@pytest.mark.parametrize("k", Ks)
@pytest.mark.parametrize("n", Ns)
@pytest.mark.parametrize("e", Es)
@pytest.mark.parametrize("dtype", DTYPEs)
@pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS)
@pytest.mark.parametrize(
"combination",
product(MK_MULTI_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
@pytest.mark.parametrize("world_size", [2])
@meets_package_requirements
def test_modular_kernel_combinations_multigpu(
k: int, n: int, e: int, dtype: torch.dtype,
quant_config: FusedMoEQuantConfig,
combination: tuple[mk.FusedMoEPrepareAndFinalize,
mk.FusedMoEPermuteExpertsUnpermute],
fused_moe_chunk_size: Optional[int], world_size: int):
config = Config(
Ms=Ms,
K=k,
N=n,
E=e,
topks=TOPKs,
dtype=dtype,
quant_config=quant_config,
prepare_finalize_type=combination[0],
fused_experts_type=combination[1],
fused_moe_chunk_size=fused_moe_chunk_size,
world_size=world_size,
)
if not config.is_valid():
pytest.skip(f"Tests config {config} is not valid. Skipping ...")
if is_nyi_config(config):
pytest.skip(f"Tests config {config} is nyi. Skipping ...")
print(f"{config.describe()}")
run(config)
@pytest.mark.parametrize("k", Ks)
@pytest.mark.parametrize("n", Ns)
@pytest.mark.parametrize("e", Es)
@pytest.mark.parametrize("dtype", DTYPEs)
@pytest.mark.parametrize("quant_config", MK_QUANT_CONFIGS)
@pytest.mark.parametrize(
"combination",
product(MK_SINGLE_GPU_PREPARE_FINALIZE_TYPES, MK_FUSED_EXPERT_TYPES))
@pytest.mark.parametrize("fused_moe_chunk_size", FUSED_MOE_CHUNK_SIZEs)
@pytest.mark.parametrize("world_size", [1])
@meets_package_requirements
def test_modular_kernel_combinations_singlegpu(
k: int, n: int, e: int, dtype: torch.dtype,
quant_config: FusedMoEQuantConfig,
combination: tuple[mk.FusedMoEPrepareAndFinalize,
mk.FusedMoEPermuteExpertsUnpermute],
fused_moe_chunk_size: Optional[int], world_size: int):
config = Config(
Ms=Ms,
K=k,
N=n,
E=e,
topks=TOPKs,
dtype=dtype,
quant_config=quant_config,
prepare_finalize_type=combination[0],
fused_experts_type=combination[1],
fused_moe_chunk_size=fused_moe_chunk_size,
world_size=world_size,
)
if not config.is_valid():
pytest.skip(f"Tests config {config} is not valid. Skipping ...")
if is_nyi_config(config):
pytest.skip(f"Tests config {config} is nyi. Skipping ...")
run(config)
if __name__ == '__main__':
# Ability to test individual PrepareAndFinalize and FusedExperts combination
from .modular_kernel_tools.cli_args import (make_config,
make_config_arg_parser)
parser = make_config_arg_parser(description=(
"Run single prepare-finalize & fused-experts combination test"
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations " #noqa: E501
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
))
args = parser.parse_args()
config = make_config(args)
run(config)
...@@ -174,6 +174,7 @@ def test_fused_moe( ...@@ -174,6 +174,7 @@ def test_fused_moe(
use_int8_w8a8=False, use_int8_w8a8=False,
use_int8_w8a16=False, use_int8_w8a16=False,
use_int4_w4a16=False, use_int4_w4a16=False,
use_mxfp4_w4a4=False,
per_act_token_quant=False, per_act_token_quant=False,
block_shape=None) block_shape=None)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools """Tests for the MOE align block size function.
Run `pytest tests/kernels/moe/test_moe_align_block_size.py`.
"""
from typing import Optional
import pytest import pytest
import torch import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
moe_align_block_size_triton) moe_align_block_size)
from vllm.platforms import current_platform
from vllm.utils import round_up
@pytest.mark.parametrize(
"block_size,num_tokens,topk,num_experts", NUM_TOKENS = [1, 3, 7, 16, 256, 2256, 4096]
list( NUM_EXPERTS = [32, 160, 256, 257, 512]
itertools.product( TOP_KS = [1, 2, 16, 32]
[32, 64, 128, 256], # block_size BLOCK_SIZES = [32, 64, 128, 256]
[ current_platform.seed_everything(0)
1,
3,
7, def _group_tokens_by_expert(
16, sorted_ids: torch.Tensor,
256, expert_ids: torch.Tensor,
2256, block_size: int,
4096, valid_length: int,
], # num_tokens total_tokens: int,
[1, 4, 16, 64], # topk ) -> dict:
[64, 160, 256, 257, 260, 264], # num_experts num_blocks = valid_length // block_size
)), expert_tokens: dict[int, list[int]] = {}
)
def test_moe_align_block_size_compare_implementations(block_size, num_tokens, for block_idx in range(num_blocks):
topk, num_experts): expert_id = expert_ids[block_idx].item()
topk_ids = torch.stack([ block_start = block_idx * block_size
torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk] block_end = min(block_start + block_size, valid_length)
for _ in range(num_tokens)
]) block_tokens = sorted_ids[block_start:block_end]
valid_tokens = block_tokens[block_tokens < total_tokens]
if expert_id not in expert_tokens:
expert_tokens[expert_id] = []
expert_tokens[expert_id].extend(valid_tokens.tolist())
return expert_tokens
def _verify_expert_level_sorting(
actual_sorted_ids: torch.Tensor,
golden_sorted_ids: torch.Tensor,
expert_ids: torch.Tensor,
block_size: int,
valid_length: int,
total_tokens: int,
):
"""
Verify that actual_sorted_ids follows the correct expert-level sorting.
The kerne limplementation may or may not preserve original token order
in topk_ids in the final sorted_ids however this does not impact quality.
"""
# Group tokens by expert from the golden implementation
golden_expert_tokens = _group_tokens_by_expert(golden_sorted_ids,
expert_ids, block_size,
valid_length, total_tokens)
actual_expert_tokens = _group_tokens_by_expert(actual_sorted_ids,
expert_ids, block_size,
valid_length, total_tokens)
assert set(golden_expert_tokens.keys()) == set(
actual_expert_tokens.keys()), (
f"Expert IDs mismatch: golden={set(golden_expert_tokens.keys())}, "
f"actual={set(actual_expert_tokens.keys())}")
for expert_id in golden_expert_tokens:
golden_tokens = torch.tensor(golden_expert_tokens[expert_id],
device=actual_sorted_ids.device)
actual_tokens = torch.tensor(actual_expert_tokens[expert_id],
device=actual_sorted_ids.device)
assert torch.equal(
torch.sort(golden_tokens)[0],
torch.sort(actual_tokens)[0]), (
f"Expert {expert_id} token mismatch: "
f"golden={golden_expert_tokens[expert_id]}, "
f"actual={actual_expert_tokens[expert_id]}")
def torch_moe_align_block_size(
topk_ids: torch.Tensor,
block_size: int,
num_experts: int,
expert_map: Optional[torch.Tensor] = None,
pad_sorted_ids: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Golden torch implementation of moe_align_block_size.
This function aligns the token distribution across experts to be compatible
with block size for matrix multiplication by sorting tokens by expert and
padding to block boundaries.
"""
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
if pad_sorted_ids:
max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
flattened_token_indices = torch.arange(topk_ids.numel(),
device=topk_ids.device,
dtype=torch.int32)
flattened_expert_ids = topk_ids.flatten()
sorted_expert_ids, sort_indices = torch.sort(flattened_expert_ids,
stable=True)
sorted_token_indices = flattened_token_indices[sort_indices]
expert_token_counts = torch.zeros(num_experts,
dtype=torch.int64,
device=topk_ids.device)
for expert_id in range(num_experts):
mask = sorted_expert_ids == expert_id
expert_token_counts[expert_id] = mask.sum()
expert_padded_counts = torch.zeros(num_experts,
dtype=torch.int64,
device=topk_ids.device)
for expert_id in range(num_experts):
original_count = expert_token_counts[expert_id]
if original_count > 0:
expert_padded_counts[expert_id] = (
(original_count + block_size - 1) // block_size) * block_size
sorted_ids_cuda = torch.empty((max_num_tokens_padded, ), sorted_token_ids = torch.full(
dtype=torch.int32, (max_num_tokens_padded, ),
device=topk_ids.device) topk_ids.numel(),
sorted_ids_cuda.fill_(topk_ids.numel()) dtype=torch.int32,
max_num_m_blocks = max_num_tokens_padded // block_size device=topk_ids.device,
expert_ids_cuda = torch.zeros((max_num_m_blocks, ), )
dtype=torch.int32, max_num_blocks = (max_num_tokens_padded + block_size - 1) // block_size
device=topk_ids.device) expert_ids = torch.zeros(max_num_blocks,
num_tokens_post_pad_cuda = torch.empty((1), dtype=torch.int32,
dtype=torch.int32, device=topk_ids.device)
device=topk_ids.device)
current_pos = 0
sorted_ids_triton = torch.empty_like(sorted_ids_cuda) current_block = 0
sorted_ids_triton.fill_(topk_ids.numel()) for expert_id in range(num_experts):
expert_ids_triton = torch.zeros_like(expert_ids_cuda) expert_mask = sorted_expert_ids == expert_id
num_tokens_post_pad_triton = torch.empty_like(num_tokens_post_pad_cuda) expert_tokens = sorted_token_indices[expert_mask]
num_expert_tokens = expert_tokens.shape[0]
ops.moe_align_block_size(
topk_ids, if num_expert_tokens > 0:
num_experts, sorted_token_ids[current_pos:current_pos +
num_expert_tokens] = (expert_tokens)
expert_blocks_needed = expert_padded_counts[expert_id] // block_size
expert_ids[current_block:current_block +
expert_blocks_needed] = (expert_id)
current_pos += expert_padded_counts[expert_id]
current_block += expert_blocks_needed
total_padded_tokens = expert_padded_counts.sum()
num_tokens_post_pad = torch.tensor([total_padded_tokens],
dtype=torch.int32,
device=topk_ids.device)
if expert_map is not None:
expert_ids = expert_map[expert_ids]
return sorted_token_ids, expert_ids, num_tokens_post_pad
@pytest.mark.parametrize("m", NUM_TOKENS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("num_experts", NUM_EXPERTS)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("pad_sorted_ids", [False, True])
@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
def test_moe_align_block_size(m: int, topk: int, num_experts: int,
block_size: int, pad_sorted_ids: bool):
"""Test moe_align_block_size without expert mapping"""
topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32)
for i in range(m):
experts = torch.randperm(num_experts, device="cuda")[:topk]
topk_ids[i] = experts
actual_sorted_ids, actual_expert_ids, actual_num_tokens = (
moe_align_block_size(
topk_ids=topk_ids,
block_size=block_size,
num_experts=num_experts,
pad_sorted_ids=pad_sorted_ids,
))
golden_sorted_ids, golden_expert_ids, golden_num_tokens = (
torch_moe_align_block_size(
topk_ids=topk_ids,
block_size=block_size,
num_experts=num_experts,
pad_sorted_ids=pad_sorted_ids,
))
torch.testing.assert_close(actual_num_tokens,
golden_num_tokens,
atol=0,
rtol=0)
torch.testing.assert_close(actual_expert_ids,
golden_expert_ids,
atol=0,
rtol=0)
# For sorted_token_ids, verify block-level correctness rather than exact
# order Tokens within each expert's blocks can be in any order, but expert
# regions must be correct
_verify_expert_level_sorting(
actual_sorted_ids,
golden_sorted_ids,
actual_expert_ids,
block_size, block_size,
sorted_ids_cuda, actual_num_tokens.item(),
expert_ids_cuda, m * topk,
num_tokens_post_pad_cuda,
) )
moe_align_block_size_triton( total_tokens = m * topk
topk_ids, assert actual_num_tokens.item() % block_size == 0, (
num_experts, "num_tokens_post_pad should be divisible by block_size")
assert actual_num_tokens.item() >= total_tokens, (
"num_tokens_post_pad should be at least total_tokens")
valid_tokens = actual_sorted_ids[actual_sorted_ids < total_tokens]
assert len(valid_tokens) == total_tokens, (
f"Should have exactly {total_tokens} valid tokens, "
f"got {len(valid_tokens)}")
assert (actual_expert_ids >= 0).all() and (
actual_expert_ids
< num_experts).all(), "expert_ids should contain valid expert indices"
@pytest.mark.parametrize("m", [16, 32])
@pytest.mark.parametrize("topk", [2, 4])
@pytest.mark.parametrize("num_experts", [8])
@pytest.mark.parametrize("block_size", [64])
@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
def test_moe_align_block_size_with_expert_map(m: int, topk: int,
num_experts: int,
block_size: int):
"""Test moe_align_block_size with expert mapping (EP scenario)"""
topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32)
for i in range(m):
experts = torch.randperm(num_experts, device="cuda")[:topk]
topk_ids[i] = experts
expert_map = torch.full((num_experts, ),
-1,
device="cuda",
dtype=torch.int32)
local_experts = list(range(0, num_experts, 2))
for i, expert_id in enumerate(local_experts):
expert_map[expert_id] = i
actual_sorted_ids, actual_expert_ids, actual_num_tokens = (
moe_align_block_size(
topk_ids=topk_ids,
block_size=block_size,
num_experts=num_experts,
expert_map=expert_map,
))
golden_sorted_ids, golden_expert_ids, golden_num_tokens = (
torch_moe_align_block_size(
topk_ids=topk_ids,
block_size=block_size,
num_experts=num_experts,
expert_map=expert_map,
))
torch.testing.assert_close(actual_num_tokens,
golden_num_tokens,
atol=0,
rtol=0)
torch.testing.assert_close(actual_expert_ids,
golden_expert_ids,
atol=0,
rtol=0)
_verify_expert_level_sorting(
actual_sorted_ids,
golden_sorted_ids,
actual_expert_ids,
block_size, block_size,
sorted_ids_triton, actual_num_tokens.item(),
expert_ids_triton, m * topk,
num_tokens_post_pad_triton,
) )
assert torch.allclose(expert_ids_cuda, expert_ids_triton), (
f"Expert IDs mismatch for block_size={block_size}, "
f"num_tokens={num_tokens}, topk={topk}\n"
f"CUDA expert_ids: {expert_ids_cuda}\n"
f"Triton expert_ids: {expert_ids_triton}")
assert torch.allclose( def test_moe_align_block_size_deterministic():
num_tokens_post_pad_cuda, num_tokens_post_pad_triton), ( m, topk, num_experts, block_size = 128, 2, 32, 64
f"Num tokens post pad mismatch for block_size={block_size}, "
f"num_tokens={num_tokens}, topk={topk}\n" torch.manual_seed(42)
f"CUDA num_tokens_post_pad: {num_tokens_post_pad_cuda}\n" topk_ids = torch.randint(0,
f"Triton num_tokens_post_pad: {num_tokens_post_pad_triton}") num_experts, (m, topk),
device="cuda",
dtype=torch.int32)
# expect the results to be reproducible
results = []
for _ in range(5):
sorted_ids, expert_ids, num_tokens = moe_align_block_size(
topk_ids=topk_ids, block_size=block_size, num_experts=num_experts)
results.append(
(sorted_ids.clone(), expert_ids.clone(), num_tokens.clone()))
if __name__ == "__main__": for i in range(1, len(results)):
pytest.main([__file__]) assert torch.equal(
results[0][0],
results[i][0]), ("sorted_ids should be deterministic")
assert torch.equal(
results[0][1],
results[i][1]), ("expert_ids should be deterministic")
assert torch.equal(
results[0][2],
results[i][2]), ("num_tokens should be deterministic")
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import importlib
import importlib.metadata
from dataclasses import dataclass
import pytest
import torch
from packaging import version
QUARK_MXFP4_AVAILABLE = importlib.util.find_spec(
"quark") is not None and version.parse(
importlib.metadata.version("amd-quark")) >= version.parse('0.8.99')
@dataclass
class ModelCase:
model_id: str
tp: int
@pytest.mark.parametrize('model_case', [
ModelCase("fxmarty/qwen_1.5-moe-a2.7b-mxfp4", tp=1),
ModelCase("fxmarty/deepseek_r1_3_layers_mxfp4", tp=8),
ModelCase("fxmarty/Llama-4-Scout-17B-16E-Instruct-2-layers-mxfp4", tp=1)
])
@pytest.mark.skipif(not QUARK_MXFP4_AVAILABLE,
reason="amd-quark>=0.9 is not available")
def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
if torch.cuda.device_count() < model_case.tp:
pytest.skip(f"This test requires >={model_case.tp} gpus, got only "
f"{torch.cuda.device_count()}")
with vllm_runner(model_case.model_id,
tensor_parallel_size=model_case.tp,
load_format="dummy") as llm:
# TODO: llm.apply_model(check_model) currently relies on V0 internals.
# Re-enable this later.
# def check_model(model):
# layer = model.model.layers[0]
# qkv_proj = layer.self_attn.qkv_proj
# assert isinstance(qkv_proj.quant_method, QuarkLinearMethod)
# assert isinstance(qkv_proj.scheme, QuarkW4A4MXFP4)
# assert isinstance(layer.mlp.experts.quant_method,
# QuarkW4A4MXFp4MoEMethod)
# if model_case.model_id == "fxmarty/qwen_1.5-moe-a2.7b-mxfp4":
# llm.apply_model(check_model)
output = llm.generate_greedy("Today I am in the French Alps and",
max_tokens=20)
assert output
\ No newline at end of file
...@@ -93,11 +93,11 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int, ...@@ -93,11 +93,11 @@ def test_cutlass_fp4_moe_no_graph(m: int, n: int, k: int, e: int, topk: int,
a1_gscale=a1_gs, a1_gscale=a1_gs,
w1_fp4=w1_q, w1_fp4=w1_q,
w1_blockscale=w1_blockscale, w1_blockscale=w1_blockscale,
w1_alphas=(1 / w1_gs), g1_alphas=(1 / w1_gs),
a2_gscale=a2_gs, a2_gscale=a2_gs,
w2_fp4=w2_q, w2_fp4=w2_q,
w2_blockscale=w2_blockscale, w2_blockscale=w2_blockscale,
w2_alphas=(1 / w2_gs), g2_alphas=(1 / w2_gs),
topk_weights=topk_weights, topk_weights=topk_weights,
topk_ids=topk_ids, topk_ids=topk_ids,
m=m, m=m,
......
...@@ -32,6 +32,8 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( ...@@ -32,6 +32,8 @@ from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
from vllm.model_executor.layers.fused_moe.modular_kernel import ( from vllm.model_executor.layers.fused_moe.modular_kernel import (
FusedMoEModularKernel) FusedMoEModularKernel)
from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
TopKWeightAndReduceDelegate)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import round_up from vllm.utils import round_up
...@@ -371,6 +373,7 @@ def pplx_prepare_finalize( ...@@ -371,6 +373,7 @@ def pplx_prepare_finalize(
chunk_topk_weight, chunk_topk_weight,
chunk_topk_ids, chunk_topk_ids,
False, False,
weight_and_reduce_impl=TopKWeightAndReduceDelegate(),
) )
torch.cuda.synchronize() torch.cuda.synchronize()
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import dataclasses import dataclasses
from math import prod
from typing import Optional from typing import Optional
import pytest import pytest
...@@ -8,9 +9,12 @@ import torch ...@@ -8,9 +9,12 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8 from vllm.model_executor.layers.fused_moe.cutlass_moe import (
cutlass_moe_fp8, run_cutlass_moe_fp8)
from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts, from vllm.model_executor.layers.fused_moe.fused_moe import (fused_experts,
fused_topk) fused_topk)
from vllm.model_executor.layers.fused_moe.utils import (
moe_kernel_quantize_input)
from vllm.platforms import current_platform from vllm.platforms import current_platform
NUM_EXPERTS = [40, 64] NUM_EXPERTS = [40, 64]
...@@ -21,6 +25,7 @@ MNK_FACTORS = [ ...@@ -21,6 +25,7 @@ MNK_FACTORS = [
(2, 1024, 1536), (2, 1024, 1536),
(2, 3072, 1024), (2, 3072, 1024),
(2, 3072, 1536), (2, 3072, 1536),
(7, 3072, 1536),
(64, 1024, 1024), (64, 1024, 1024),
(64, 1024, 1536), (64, 1024, 1536),
(64, 3072, 1024), (64, 3072, 1024),
...@@ -236,6 +241,7 @@ def test_cutlass_moe_8_bit_no_graph( ...@@ -236,6 +241,7 @@ def test_cutlass_moe_8_bit_no_graph(
per_act_token: bool, per_act_token: bool,
per_out_ch: bool, per_out_ch: bool,
monkeypatch, monkeypatch,
ep_size: Optional[int] = None,
): ):
current_platform.seed_everything(7) current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
...@@ -254,7 +260,13 @@ def test_cutlass_moe_8_bit_no_graph( ...@@ -254,7 +260,13 @@ def test_cutlass_moe_8_bit_no_graph(
triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights,
topk_ids) topk_ids)
cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token) if ep_size is not None:
assert e % ep_size == 0, "Cannot distribute experts evenly"
number_local_experts = e // ep_size
else:
number_local_experts = None
cutlass_output = run_8_bit(mt, topk_weights, topk_ids, per_act_token,
number_local_experts)
# Note 5.5 only needed for larger problem sizes, 5 works ok for # Note 5.5 only needed for larger problem sizes, 5 works ok for
# the rest. # the rest.
...@@ -340,9 +352,62 @@ def test_cutlass_moe_8_bit_EP( ...@@ -340,9 +352,62 @@ def test_cutlass_moe_8_bit_EP(
per_out_channel: bool, per_out_channel: bool,
ep_size: int, ep_size: int,
monkeypatch, monkeypatch,
):
test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token,
per_out_channel, monkeypatch, ep_size)
LARGE_MNK_FACTORS = [
(1, 8192, 5120, 31),
(32768, 1024, 1024, 16),
(65536, 512, 1024, 16),
]
@pytest.mark.parametrize("m,n,k,topk", LARGE_MNK_FACTORS)
@pytest.mark.parametrize("e", [128])
@pytest.mark.parametrize("per_act_token", [False])
@pytest.mark.parametrize("per_out_channel", [True])
@pytest.mark.parametrize("ep_size", [8])
@pytest.mark.skipif(
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
current_platform.get_device_capability()),
reason="Grouped gemm is not supported on this GPU type.")
def test_cutlass_moe_8_bit_EP_large(
m: int,
n: int,
k: int,
e: int,
topk: int,
per_act_token: bool,
per_out_channel: bool,
ep_size: int,
monkeypatch,
):
test_cutlass_moe_8_bit_no_graph(m, n, k, e, topk, per_act_token,
per_out_channel, monkeypatch, ep_size)
@pytest.mark.parametrize("m,n,k,topk", [(1, 8192, 5120, 31)])
@pytest.mark.parametrize("e", [128])
@pytest.mark.parametrize("per_act_token", [False])
@pytest.mark.parametrize("per_out_channel", [True])
@pytest.mark.parametrize("ep_size", [8])
@pytest.mark.skipif(
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
current_platform.get_device_capability()),
reason="Grouped gemm is not supported on this GPU type.")
def test_run_cutlass_moe_fp8(
m: int,
n: int,
k: int,
e: int,
topk: int,
per_act_token: bool,
per_out_channel: bool,
ep_size: int,
): ):
current_platform.seed_everything(7) current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token,
per_out_channel) per_out_channel)
...@@ -352,20 +417,53 @@ def test_cutlass_moe_8_bit_EP( ...@@ -352,20 +417,53 @@ def test_cutlass_moe_8_bit_EP(
score, score,
topk, topk,
renormalize=False) renormalize=False)
# we want to make sure there is at least one token that's generated in
# Note that we are using the dequantized versions of the tensors. # this expert shard and at least one token that's NOT generated in this
# Using a, w1 and w2 directly results in minor output differences. # expert shard
triton_output = fused_experts(mt.a_d, mt.w1_d, mt.w2_d, topk_weights, topk_ids[0][0] = -1
topk_ids) topk_ids[0][1] = 1
assert e % ep_size == 0, "Cannot distribute experts evenly" workspace13_shape = (m * topk, max(2 * n, k))
cutlass_output = run_8_bit(mt, workspace2_shape = (m * topk, n)
topk_weights, output_shape = (m * topk, k)
topk_ids,
per_act_token, workspace13 = torch.empty(prod(workspace13_shape),
num_local_experts=e // ep_size) device="cuda",
dtype=mt.a.dtype)
torch.testing.assert_close(triton_output, workspace2 = torch.empty(prod(workspace2_shape),
cutlass_output, device="cuda",
atol=5e-2, dtype=mt.a.dtype)
rtol=1e-2)
\ No newline at end of file num_local_experts = e // ep_size
start, end = 0, num_local_experts
expert_map = [-1] * e
expert_map[start:end] = list(range(num_local_experts))
expert_map = torch.tensor(expert_map, dtype=torch.int32, device="cuda")
activation = lambda o, i: torch.ops._C.silu_and_mul(o, i)
a1q, a1q_scale = moe_kernel_quantize_input(mt.a, mt.a_scale,
torch.float8_e4m3fn,
per_act_token)
global_num_experts = -1 if mt.w1_q is None else mt.w1_q.size(0)
func = lambda output: run_cutlass_moe_fp8(
output, a1q, mt.w1_q, mt.w2_q, topk_ids, activation,
global_num_experts, expert_map, mt.w1_scale, mt.w2_scale,
a1q_scale, None, workspace13, workspace2, None, mt.a.dtype,
per_act_token, per_out_channel, False)
workspace13.random_()
output_random_workspace = torch.empty(output_shape,
device="cuda",
dtype=mt.a.dtype)
func(output_random_workspace)
workspace13.fill_(0)
output_zero_workspace = torch.zeros(output_shape,
device="cuda",
dtype=mt.a.dtype)
func(output_zero_workspace)
torch.testing.assert_close(output_random_workspace,
output_zero_workspace,
atol=5e-3,
rtol=1e-3)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from unittest.mock import patch
import pytest
import torch
from vllm.model_executor.layers.quantization.utils import fp8_utils
@pytest.mark.parametrize("shape", [(32, 128), (64, 256), (16, 512)])
@pytest.mark.parametrize("column_major", [False, True])
@pytest.mark.parametrize("scale_ue8m0", [False, True])
@pytest.mark.parametrize("group_size", [64, 128])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
def test_per_token_group_quant_fp8(shape, column_major: bool,
scale_ue8m0: bool, group_size: int):
device = "cuda"
torch.manual_seed(42)
num_tokens, hidden_dim = shape
x = (torch.randn(
(num_tokens, hidden_dim), device=device, dtype=torch.bfloat16) * 8)
# cuda path
out_q, scale = fp8_utils.per_token_group_quant_fp8(
x,
group_size,
column_major_scales=column_major,
use_ue8m0=scale_ue8m0,
)
# triton ref
with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
ref_q, ref_s = fp8_utils.per_token_group_quant_fp8(
x,
group_size,
column_major_scales=column_major,
use_ue8m0=scale_ue8m0,
)
assert torch.allclose(out_q.float(), ref_q.float(), atol=0.15, rtol=0.15)
assert torch.allclose(scale, ref_s, atol=0.01, rtol=0.01)
...@@ -8,19 +8,14 @@ import pytest ...@@ -8,19 +8,14 @@ import pytest
import torch import torch
from tests.kernels.quant_utils import (native_per_token_group_quant_fp8, from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
native_w8a8_block_matmul, native_w8a8_block_matmul)
per_block_cast_to_fp8)
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.model_executor.layers.quantization.utils.fp8_utils import ( from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8, w8a8_block_fp8_matmul) get_col_major_tma_aligned_tensor, per_token_group_quant_fp8,
w8a8_block_fp8_matmul)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils import has_deep_gemm
dg_available = False from vllm.utils.deep_gemm import fp8_gemm_nt, per_block_cast_to_fp8
try:
import deep_gemm
dg_available = True
except ImportError:
pass
if current_platform.get_device_capability() < (9, 0): if current_platform.get_device_capability() < (9, 0):
pytest.skip("FP8 Triton requires CUDA 9.0 or higher", pytest.skip("FP8 Triton requires CUDA 9.0 or higher",
...@@ -106,7 +101,8 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): ...@@ -106,7 +101,8 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"M,N,K,block_size,out_dtype,seed", "M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS))
@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") @pytest.mark.skipif(not has_deep_gemm(),
reason="DeepGemm kernels not available.")
@torch.inference_mode() @torch.inference_mode()
def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
# only aligned sizes # only aligned sizes
...@@ -120,9 +116,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): ...@@ -120,9 +116,7 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
_, block_k = block_size[0], block_size[1] A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_size[1])
A_fp8, As_fp8 = per_token_group_quant_fp8(A_fp32, block_k)
B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32) B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_fp32)
As = As_fp8.to(torch.float32) As = As_fp8.to(torch.float32)
...@@ -132,14 +126,14 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): ...@@ -132,14 +126,14 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
out_dtype) out_dtype)
# Transpose earlier so that the testing will not trigger transposing kernels # Transpose earlier so that the testing will not trigger transposing kernels
As_fp8 = deep_gemm.get_col_major_tma_aligned_tensor(As_fp8) As_fp8 = get_col_major_tma_aligned_tensor(As_fp8)
out = torch.zeros((M, N), device='cuda', dtype=out_dtype) out = torch.zeros((M, N), device='cuda', dtype=out_dtype)
assert As_fp8.shape == (M, (K + 127) // assert As_fp8.shape == (M, (K + 127) //
128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}" 128), f"{As_fp8.shape} != {(M, (K + 127) // 128)}"
deep_gemm.gemm_fp8_fp8_bf16_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out) fp8_gemm_nt((A_fp8, As_fp8), (B_fp8, Bs_fp8), out)
rel_diff = (torch.mean( rel_diff = (torch.mean(
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) / torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
......
...@@ -1072,6 +1072,7 @@ def torch_experts( ...@@ -1072,6 +1072,7 @@ def torch_experts(
quant_dtype: Optional[torch.dtype] = None, quant_dtype: Optional[torch.dtype] = None,
per_act_token_quant=False, per_act_token_quant=False,
block_shape: Optional[list[int]] = None, block_shape: Optional[list[int]] = None,
apply_router_weights_on_input: bool = False,
) -> torch.Tensor: ) -> torch.Tensor:
assert (global_num_experts == -1 assert (global_num_experts == -1
or (global_num_experts == w1.shape[0] and expert_map is None) or (global_num_experts == w1.shape[0] and expert_map is None)
...@@ -1081,11 +1082,17 @@ def torch_experts( ...@@ -1081,11 +1082,17 @@ def torch_experts(
M, K = a.shape M, K = a.shape
topk = topk_ids.shape[1] topk = topk_ids.shape[1]
if apply_router_weights_on_input:
assert topk == 1
a = a * topk_weight.to(a.dtype)
a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K) a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device) out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
a, a_scale = moe_kernel_quantize_input(a, None, quant_dtype, if a1_scale:
assert not per_act_token_quant and block_shape is None
a, a_scale = moe_kernel_quantize_input(a, a1_scale, quant_dtype,
per_act_token_quant, block_shape) per_act_token_quant, block_shape)
num_experts = w1.shape[0] num_experts = w1.shape[0]
...@@ -1104,6 +1111,7 @@ def torch_experts( ...@@ -1104,6 +1111,7 @@ def torch_experts(
tmp2 = SiluAndMul()(tmp1) tmp2 = SiluAndMul()(tmp1)
out[mask] = tmp2 @ w2[i].transpose(0, 1) out[mask] = tmp2 @ w2[i].transpose(0, 1)
elif block_shape is not None: elif block_shape is not None:
# block quantized
assert (a_scale is not None and w1_scale is not None assert (a_scale is not None and w1_scale is not None
and w2_scale is not None) and w2_scale is not None)
tmp1 = native_w8a8_block_matmul(a[mask], w1[i], a_scale[mask], tmp1 = native_w8a8_block_matmul(a[mask], w1[i], a_scale[mask],
...@@ -1121,15 +1129,27 @@ def torch_experts( ...@@ -1121,15 +1129,27 @@ def torch_experts(
assert (a_scale is not None and w1_scale is not None assert (a_scale is not None and w1_scale is not None
and w2_scale is not None) and w2_scale is not None)
scales = a_scale if a_scale.numel() == 1 else a_scale[mask] scales = a_scale if a_scale.numel() == 1 else a_scale[mask]
tmp1 = a[mask].to(f32) * scales tmp1 = a[mask].to(f32) * scales
w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1) w1_dq = (w1[i].to(f32) * w1_scale[i]).transpose(0, 1)
tmp1 = tmp1 @ w1_dq tmp1 = (tmp1 @ w1_dq).to(out.dtype)
tmp2 = SiluAndMul()(tmp1)
tmp2 = SiluAndMul()(tmp1).to(out.dtype)
tmp2, b_scale = moe_kernel_quantize_input(
tmp2, a2_scale, quant_dtype, per_act_token_quant,
block_shape)
assert b_scale is not None
tmp2 = tmp2.to(f32) * b_scale
w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1) w2_dq = (w2[i].to(f32) * w2_scale[i]).transpose(0, 1)
out[mask] = (tmp2 @ w2_dq).to(out.dtype) out[mask] = (tmp2 @ w2_dq).to(out.dtype)
return (out.view(M, -1, w2.shape[1]).to(f32) * if apply_router_weights_on_input:
topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype) return out
else:
return (out.view(M, -1, w2.shape[1]).to(f32) *
topk_weight.view(M, -1, 1)).sum(dim=1).to(out.dtype)
def torch_moe(a: torch.Tensor, def torch_moe(a: torch.Tensor,
......
...@@ -234,12 +234,6 @@ def qwen_lora_files(): ...@@ -234,12 +234,6 @@ def qwen_lora_files():
return os.path.join(models_path_prefix, "customize/qwen-nl2dsl-lora") return os.path.join(models_path_prefix, "customize/qwen-nl2dsl-lora")
@pytest.fixture(scope="session")
def long_context_lora_files_16k_1():
# return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1")
return os.path.join(models_path_prefix, "SangBinCho/long_context_16k_testing_1")
@pytest.fixture @pytest.fixture
def llama_2_7b_engine_extra_embeddings(): def llama_2_7b_engine_extra_embeddings():
cleanup_dist_env_and_memory(shutdown_ray=True) cleanup_dist_env_and_memory(shutdown_ray=True)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for applying default registered multimodal loras.
"""
import os
from huggingface_hub import snapshot_download
from vllm.lora.request import LoRARequest
from ..conftest import AudioTestAssets, VllmRunner
MODEL_PATH = snapshot_download("microsoft/Phi-4-multimodal-instruct")
AUDIO_LORA_PATH = os.path.join(MODEL_PATH, "speech-lora")
IMAGE_LORA_PATH = os.path.join(MODEL_PATH, "vision-lora")
AUDIO_PROMPT = "<|user|><|audio_1|>Can you transcribe this audio?<|end|><|assistant|>" # noqa: E501
# Responses are greedy decoded; we just check the end of
# the generated text. If the lora is inactive, this model
# generates commentary on the transcription.
RESPONSE_SUFFIX_WITH_LORA = "Spoken text: The first words I spoke in the original chronograph, a little piece of practical poetry. Mary had a little lamb, it slept with quite a snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
RESPONSE_SUFFIX_WITHOUT_LORA = "Certainly! Here is the transcription of the audio you provided:\n\nThe first words I spoke in the original phonograph record: A little piece of practical poetry. Mary had a little lamb; its fleece was white as snow, and everywhere that Mary went, the lamb was sure to go." # noqa: E501
VLLM_RUNNER_BASE_KWARGS = {
"model_name": MODEL_PATH,
"dtype": "half",
"enable_lora": "True",
"max_num_seqs": 2,
"max_lora_rank": 320,
"max_model_len": 12800,
"gpu_memory_utilization": 0.8,
"limit_mm_per_prompt": {
"audio": 1
},
"enforce_eager": True,
}
def run_test(vllm_runner, audio_assets, lora_request, expected_suffix,
**kwargs):
inputs = [([AUDIO_PROMPT], [audio_assets[0].audio_and_sample_rate[0]])]
# Apply any additional kwargs as overrides to the base kwargs
vllm_runner_kwargs = {**VLLM_RUNNER_BASE_KWARGS, **kwargs}
with vllm_runner(**vllm_runner_kwargs) as vllm_model:
vllm_outputs_with_default_lora = [
vllm_model.generate_greedy(
prompts,
max_tokens=128,
audios=audios,
lora_request=lora_request,
) for prompts, audios in inputs
]
assert vllm_outputs_with_default_lora[-1][-1][-1].endswith(
expected_suffix)
def test_active_default_mm_lora(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
"""Ensure that we can use the default audio lora."""
run_test(
vllm_runner,
audio_assets,
lora_request=None,
default_mm_loras={"audio": AUDIO_LORA_PATH},
expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
)
def test_inactive_default_mm_lora(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
"""Ensure that modalities are filtered properly."""
# Default image lora won't be active since we only pass audio
run_test(
vllm_runner,
audio_assets,
lora_request=None,
default_mm_loras={"image": IMAGE_LORA_PATH},
expected_suffix=RESPONSE_SUFFIX_WITHOUT_LORA,
)
def test_default_mm_lora_succeeds_with_redundant_lora_request(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
"""Ensure that redundantly providing the lora works."""
run_test(
vllm_runner,
audio_assets,
lora_request=LoRARequest("audio", 1, AUDIO_LORA_PATH),
default_mm_loras={"audio": AUDIO_LORA_PATH},
expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
)
def test_default_mm_lora_fails_with_overridden_lora_request(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
"""Ensure that if the lora_request conflicts with default_mm_loras,
we use the lora_request."""
run_test(
vllm_runner,
audio_assets,
lora_request=LoRARequest("speech", 2, AUDIO_LORA_PATH),
default_mm_loras={"audio": IMAGE_LORA_PATH},
expected_suffix=RESPONSE_SUFFIX_WITH_LORA,
)
...@@ -170,7 +170,8 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, ...@@ -170,7 +170,8 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model", f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size", MODEL_PATH, "--lora-path", lora_path, "--tensor-parallel-size",
str(tp_size), "serialize", "--serialized-directory", str(tp_size), "serialize", "--serialized-directory",
str(tmp_path), "--suffix", suffix str(tmp_path), "--suffix", suffix, "--serialization-kwargs",
'{"limit_cpu_concurrency": 4}'
], ],
check=True, check=True,
capture_output=True, capture_output=True,
...@@ -185,27 +186,26 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, ...@@ -185,27 +186,26 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
model_uri = tmp_path / "vllm" / model_ref / suffix / model_name model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
loaded_vllm_model = LLM(model=model_ref, loaded_llm = LLM(model=model_ref,
load_format="tensorizer", load_format="tensorizer",
enable_lora=True, enable_lora=True,
enforce_eager=True, enforce_eager=True,
model_loader_extra_config=tensorizer_config, model_loader_extra_config=tensorizer_config,
max_num_seqs=13, max_num_seqs=13,
tensor_parallel_size=2, tensor_parallel_size=2,
max_loras=2) max_loras=2)
tensorizer_config_dict = tensorizer_config.to_dict() tc_as_dict = tensorizer_config.to_serializable()
print("lora adapter created") print("lora adapter created")
assert do_sample(loaded_vllm_model, assert do_sample(loaded_llm,
sql_lora_files, sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict, tensorizer_config_dict=tc_as_dict,
lora_id=0) == EXPECTED_NO_LORA_OUTPUT lora_id=0) == EXPECTED_NO_LORA_OUTPUT
print("lora 1") print("lora 1")
assert do_sample(loaded_vllm_model, assert do_sample(loaded_llm,
sql_lora_files, sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict, tensorizer_config_dict=tc_as_dict,
lora_id=1) == EXPECTED_LORA_OUTPUT lora_id=1) == EXPECTED_LORA_OUTPUT
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment