Commit a99300bd authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev

parents cc3e01c7 5438967f
......@@ -17,6 +17,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import (
from vllm.platforms import current_platform
from vllm.utils import cdiv
from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch
try:
......@@ -76,6 +77,7 @@ def pplx_cutlass_moe(
assert torch.cuda.current_device() == pgi.local_rank
num_tokens, hidden_dim = a.shape
intermediate_dim = w2.shape[2]
num_experts = w1.shape[0]
block_size = hidden_dim # TODO support more cases
device = pgi.device
......@@ -124,8 +126,27 @@ def pplx_cutlass_moe(
num_local_experts=num_local_experts,
num_dispatchers=num_dispatchers)
ab_strides1 = torch.full((num_local_experts, ),
hidden_dim,
device="cuda",
dtype=torch.int64)
ab_strides2 = torch.full((num_local_experts, ),
intermediate_dim,
device="cuda",
dtype=torch.int64)
c_strides1 = torch.full((num_local_experts, ),
2 * intermediate_dim,
device="cuda",
dtype=torch.int64)
c_strides2 = torch.full((num_local_experts, ),
hidden_dim,
device="cuda",
dtype=torch.int64)
experts = CutlassBatchedExpertsFp8(num_local_experts, num_dispatchers,
out_dtype, per_act_token, per_out_ch)
out_dtype, per_act_token, per_out_ch,
ab_strides1, ab_strides2, c_strides1,
c_strides2)
fused_cutlass_experts = FusedMoEModularKernel(
prepare_finalize,
......@@ -227,6 +248,7 @@ def _pplx_moe(
@pytest.mark.parametrize("per_out_ch", [True, False])
@pytest.mark.parametrize("world_dp_size", [[2, 1]]) #, [4, 2]])
@pytest.mark.parametrize("use_internode", [False])
@multi_gpu_test(num_gpus=2)
@pytest.mark.skipif(
(lambda x: x is None or not ops.cutlass_group_gemm_supported(x.to_int()))(
current_platform.get_device_capability()),
......
......@@ -24,7 +24,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
current_platform.seed_everything(seed)
# Input tensor of shape (E, T, 2*H)
y = torch.randn((E, T, 2 * H), dtype=torch.float32, device="cuda")
y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
tokens_per_expert = torch.randint(
low=0,
high=T,
......@@ -74,7 +74,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
y_se = y_s[e]
y_qe = y_q[e]
torch.testing.assert_close(y_se[:nt], ref_s[:nt])
torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2)
torch.testing.assert_close(
y_qe[:nt].to(torch.float32),
ref_q[:nt].to(torch.float32),
......
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the AWQ Triton kernel.
Run `pytest tests/kernels/test_awq_triton.py`.
Run `pytest tests/kernels/quantization/test_awq_triton.py`.
"""
import pytest
import torch
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the CUTLASS W4A8 kernel.
Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`.
"""
from dataclasses import dataclass
from typing import Optional
import pytest
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.quant_utils import (
pack_rows, quantize_weights)
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
# unit tests to a common utility function. Currently the use of
# `is_quant_method_supported` conflates kernels with quantization methods
# an assumption which is breaking down as quantizations methods can have
# have kernels and some kernels support multiple quantization methods.
IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9
MNK_SHAPES = [(1, 128, 128), (1, 512, 1024), (1, 4096, 4096), (1, 8192, 28672),
(13, 8192, 4096), (26, 4096, 8192), (64, 4096, 4096),
(64, 8192, 28672), (257, 128, 4096), (257, 4096, 4096),
(1024, 4096, 8192), (1024, 8192, 4096)]
# TODO(czhu): get supported schedules from fn
SCHEDULES = [
'128x16_1x1x1', '256x16_1x1x1', '128x32_1x1x1', '256x32_1x1x1',
'128x64_1x1x1', '256x64_1x1x1', '128x128_1x1x1', '256x128_1x1x1',
'128x256_1x1x1', '128x256_2x1x1'
]
@dataclass
class TypeConfig:
act_type: torch.dtype
weight_type: ScalarType
output_type: Optional[torch.dtype]
group_scale_type: Optional[torch.dtype]
channel_scale_type: Optional[torch.dtype]
token_scale_type: Optional[torch.dtype]
@dataclass
class Tensors:
w_ref: torch.Tensor
a_ref: torch.Tensor
a: torch.Tensor
w_q: torch.Tensor
w_g_s: torch.Tensor
w_ch_s: torch.Tensor
w_tok_s: torch.Tensor
# (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints,
# Ch Scales Type, Tok Scales Type)
TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype],
Optional[torch.dtype], bool]
TEST_TYPES = [
*(
TypeConfig(act_type=torch.float8_e4m3fn,
weight_type=w_type,
output_type=o_type,
group_scale_type=torch.float8_e4m3fn,
channel_scale_type=torch.float32,
token_scale_type=torch.float32)
for w_type in [scalar_types.int4]
# TODO(czhu): fp16 out type
for o_type in [torch.bfloat16]),
]
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
# unit tests to a common utility function. Currently the use of
# `is_quant_method_supported` conflates kernels with quantization methods
# an assumption which is breaking down as quantizations methods can have
# have kernels and some kernels support multiple quantization methods.
IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
# For testing quantized linear kernels
def to_fp8(tensor: torch.Tensor):
finfo = torch.finfo(torch.float8_e4m3fn)
return tensor.clamp(min=finfo.min,
max=finfo.max).to(dtype=torch.float8_e4m3fn)
def cutlass_quantize_and_pack(atype: torch.dtype,
w: torch.Tensor,
wtype: ScalarType,
stype: Optional[torch.dtype],
group_size: Optional[int],
zero_points: bool = False):
assert wtype.is_integer(), "TODO: support floating point weights"
w_ref, w_q, w_s, w_zp = quantize_weights(w,
wtype,
group_size=group_size,
zero_points=zero_points)
# since scales are cast to fp8, we need to compute w_ref this way
w_ref = ((w_q).to(torch.float32) * w_s.to(atype).to(
torch.float32).repeat_interleave(group_size, dim=0)).to(atype)
# bit mask prevents sign extending int4 when packing
w_q = pack_rows(w_q & 0x0F, wtype.size_bits, *w_q.shape)
w_q = w_q.t().contiguous().t() # convert to col major
w_q_packed = ops.cutlass_encode_and_reorder_int4b(w_q)
w_s_packed = ops.cutlass_pack_scale_fp8(w_s.to(atype))
return w_ref, w_q_packed, w_s_packed, w_zp
def create_test_tensors(shape: tuple[int, int, int], types: TypeConfig,
group_size: Optional[int]) -> Tensors:
m, n, k = shape
print("create_test_tensors, shape:", shape, "types:", types, "group_size:",
group_size)
a = to_fp8(torch.randn((m, k), device="cuda"))
w = to_fp8(torch.randn((k, n), device="cuda"))
if types.group_scale_type is not None:
w = w.to(types.group_scale_type)
if w.dtype.itemsize == 1:
w = w.to(torch.float16)
w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
a.dtype, w, types.weight_type, types.group_scale_type, group_size,
False)
a_ref = a.to(torch.float32)
w_ref = w_ref.to(torch.float32)
# for the practical use case we need per-tok scales for fp8 activations
w_tok_s = torch.randn((m, ), device='cuda', dtype=types.token_scale_type)
# weights are already per-group quantized, use placeholder here
w_ch_s = torch.ones((n, ), device='cuda', dtype=types.channel_scale_type)
return Tensors(w_ref=w_ref,
a_ref=a_ref,
a=a,
w_q=w_q_packed,
w_g_s=w_s,
w_ch_s=w_ch_s,
w_tok_s=w_tok_s)
def mm_test_helper(types: TypeConfig,
tensors: Tensors,
group_size: Optional[int] = None,
schedule: Optional[str] = None):
# CUTLASS upstream uses fp8 with fastaccum as reference
# https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L406
output_ref = torch._scaled_mm(
tensors.a_ref.to(types.act_type),
tensors.w_ref.to(types.act_type).t().contiguous().t(), # col major
tensors.w_tok_s.unsqueeze(1),
tensors.w_ch_s.unsqueeze(0),
out_dtype=types.output_type,
use_fast_accum=True)
output = ops.cutlass_w4a8_mm(
a=tensors.a,
b_q=tensors.w_q,
b_group_scales=tensors.w_g_s,
b_group_size=group_size,
b_channel_scales=tensors.w_ch_s,
a_token_scales=tensors.w_tok_s,
)
print(output)
print(output_ref)
torch.testing.assert_close(output,
output_ref.to(output.dtype),
rtol=1e-3,
atol=1e-3)
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
reason="CUTLASS W4A8 is not supported on this GPU type.")
@pytest.mark.parametrize("shape",
MNK_SHAPES,
ids=lambda x: "x".join(str(v) for v in x))
@pytest.mark.parametrize("types", TEST_TYPES)
@pytest.mark.parametrize("schedule", SCHEDULES)
def test_cutlass_w4a8(shape, types: TypeConfig, schedule):
group_sizes = [128]
for group_size in group_sizes:
tensors = create_test_tensors(shape, types, group_size)
mm_test_helper(types, tensors, group_size, schedule)
# Test to make sure cuda graphs work
class W4A8Layer(torch.nn.Module):
def __init__(self, **kwargs):
super().__init__()
self.kwargs = kwargs
def forward(self, a):
return ops.cutlass_w4a8_mm(a=a, **self.kwargs)
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
reason="CUTLASS W4A8 is not supported on this GPU type.")
def test_w4a8_cuda_graph():
m, n, k = 512, 4096, 4096
a = to_fp8(torch.randn((m, k), device="cuda"))
b = to_fp8(torch.randn((k, n), device="cuda"))
wtype = scalar_types.int4
stype = torch.float8_e4m3fn
group_size = 128
zero_points = False
w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
a.dtype, b.to(torch.float16), wtype, stype, group_size, zero_points)
w_tok_s = torch.randn((m, ), device='cuda', dtype=torch.float32)
w_ch_s = torch.ones((n, ), device='cuda', dtype=torch.float32)
# Construct a trivial model with a single layer that calls the kernel
model = W4A8Layer(
b_q=w_q_packed,
b_group_scales=w_s,
b_group_size=group_size,
b_channel_scales=w_ch_s,
a_token_scales=w_tok_s,
)
output_ref = torch._scaled_mm(
a,
w_ref.to(a.dtype).t().contiguous().t(), # col major
w_tok_s.unsqueeze(1),
w_ch_s.unsqueeze(0),
out_dtype=torch.bfloat16,
use_fast_accum=True)
# Run the model with a cuda graph
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
output = model(a)
output.zero_()
g.replay()
torch.testing.assert_close(output, output_ref, rtol=1e-3, atol=1e-3)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
if not current_platform.has_device_capability(100):
pytest.skip(
reason=
"Flashinfer FP8 gemms requires compute capability of 10.0 or above.",
allow_module_level=True,
)
DTYPES = [torch.float16, torch.bfloat16]
# m, n, k
SHAPES = [(128, 128, 64), (128, 128, 128), (256, 128, 64), (128, 256, 128)]
PAD_SHAPES = [(150, 128, 64), (128, 128, 96)]
SHAPES.extend(PAD_SHAPES)
SEEDS = [42]
CUDA_DEVICES = ["cuda:0"]
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("autotune", [False, True])
@torch.inference_mode()
def test_flashinfer_fp8_gemm(
dtype: torch.dtype,
shape: tuple[int, int, int],
use_bias: bool,
seed: int,
device: str,
autotune: bool,
) -> None:
current_platform.seed_everything(seed)
m, n, k = shape
a = torch.randn((m, k), dtype=dtype, device=device)
b = torch.randn((n, k), dtype=dtype, device=device) / k
a_fp8, a_scale = ops.scaled_fp8_quant(a)
b_fp8, b_scale = ops.scaled_fp8_quant(b)
expected_out = torch.mm(
a_scale * a_fp8.to(dtype=torch.float32),
b_scale * b_fp8.to(dtype=torch.float32).t(),
).to(dtype=dtype)
if use_bias:
bias = torch.randn((n, ), dtype=dtype, device=device)
expected_out = expected_out + bias
else:
bias = None
import flashinfer
with flashinfer.autotune(autotune):
out = flashinfer_scaled_fp8_mm(
a_fp8,
b_fp8.t(),
a_scale,
b_scale,
dtype,
bias=bias,
)
torch.testing.assert_close(out, expected_out, atol=1e-2, rtol=1e-2)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
if not current_platform.has_device_capability(100):
pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
allow_module_level=True)
DTYPES = [torch.float16, torch.bfloat16]
SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)]
SEEDS = [42]
CUDA_DEVICES = ['cuda:0']
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
BLOCK_SIZE = 16
def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor,
global_scale: torch.Tensor,
ref_output_scale: torch.Tensor) -> torch.Tensor:
silu_and_mul_out = silu_and_mul.forward_native(x)
assert not current_platform.is_rocm()
assert silu_and_mul_out.ndim >= 1, (
f'input.ndim needs to be >= 1, but got {silu_and_mul_out.ndim}.')
other_dims = 1 if silu_and_mul_out.ndim == 1 else -1
silu_and_mul_out = silu_and_mul_out.reshape(other_dims,
silu_and_mul_out.shape[-1])
m, n = silu_and_mul_out.shape
device = silu_and_mul_out.device
# Two fp4 values will be packed into an uint8.
out = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
output_scale = ref_output_scale
torch.ops._C.scaled_fp4_quant(out, silu_and_mul_out, output_scale,
global_scale)
return out, output_scale
def ops_impl(x: torch.Tensor, global_scale: torch.Tensor,
ref_output_scale: torch.Tensor) -> torch.Tensor:
out_shape = (x.shape[0], x.shape[1] // 4)
output_scale = ref_output_scale
out = torch.empty(out_shape, dtype=torch.uint8, device=x.device)
torch.ops._C.silu_and_mul_nvfp4_quant(out, output_scale, x, global_scale)
return out, output_scale
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_quantize_to_fp4(
dtype: torch.dtype,
shape: tuple[int, int],
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
m, n = shape
x = torch.randn((m, n), dtype=dtype)
tensor_amax = torch.abs(x).max().to(torch.float32)
global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
block_size = 16
assert n % block_size == 0, (
f'last dim has to be multiple of 16, but got {n}.')
assert x.dtype in (torch.float16, torch.bfloat16), (
f'input.dtype needs to be fp16 or bf16 but got {x.dtype}.')
round_up = lambda x, y: (x + y - 1) // y * y
rounded_m = round_up(x.shape[0], 128)
scale_n = x.shape[1] // (2 * block_size)
rounded_n = round_up(scale_n, 4)
output_scale = torch.empty((rounded_m, rounded_n // 4),
device=x.device,
dtype=torch.int32)
layer = SiluAndMul()
ref_out, ref_out_scale = ref_impl(layer, x, global_scale, output_scale)
fusion_out, fusion_out_scale = ops_impl(x, global_scale, output_scale)
assert ref_out.dtype == torch.uint8
assert fusion_out.dtype == torch.uint8
assert ref_out.shape == fusion_out.shape
assert ref_out_scale.dtype == torch.int32
assert fusion_out_scale.dtype == torch.int32
assert ref_out_scale.shape == fusion_out_scale.shape
# Allow up to 2% of mismatched values since BF16 has accuracy issues.
mis_threshold = 0.02
atol = 0.4
rtol = 0.4
ref_logits = ref_out[-1]
fusion_logits = fusion_out[-1]
mis_count = torch.sum(
torch.abs(fusion_logits - ref_logits) > (atol +
rtol * torch.abs(ref_logits)))
mis_ratio = mis_count / fusion_logits.numel()
assert mis_ratio < mis_threshold, \
f"Mismatch ratio {mis_ratio} exceeds threshold {mis_threshold}"
torch.testing.assert_close(ref_out_scale, fusion_out_scale)
opcheck(torch.ops._C.silu_and_mul_nvfp4_quant,
(fusion_out, fusion_out_scale, x, global_scale))
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_scaled_mm kernel
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
"""
import os
import importlib
......
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for sparse cutlass kernels
Run `pytest tests/kernels/test_semi_structured.py`.
Run `pytest tests/kernels/quantization/test_cutlass_2of4_sparse.py`.
"""
import pytest
......
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for cutlass kernels
Run `pytest tests/kernels/test_cutlass.py`.
Run `pytest tests/kernels/quantization/test_cutlass_scaled_mm.py`.
"""
import random
......@@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
expert_offsets = torch.zeros((num_experts + 1),
device=device,
dtype=torch.int32)
dtype=torch.int64)
problem_sizes = torch.zeros((num_experts, 3),
device=device,
......
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the machete kernel.
Run `pytest tests/kernels/test_machete_mm.py`.
Run `pytest tests/kernels/quantization/test_machete_mm.py`.
"""
import math
......@@ -95,23 +95,23 @@ TEST_TYPES = [
token_scale_type=None)
for w_type in [scalar_types.uint4, scalar_types.uint8]
for a_type in [torch.float16, torch.bfloat16]),
# QQQ style
*(TypeConfig(act_type=torch.int8,
weight_type=scalar_types.uint4b8,
output_type=torch.float16,
group_scale_type=group_scale_type,
group_zero_type=None,
channel_scale_type=torch.float,
token_scale_type=torch.float)
for group_scale_type in [None, torch.float16]),
*(TypeConfig(act_type=torch.float8_e4m3fn,
weight_type=scalar_types.uint4b8,
output_type=torch.float16,
group_scale_type=group_scale_type,
group_zero_type=None,
channel_scale_type=torch.float,
token_scale_type=torch.float)
for group_scale_type in [None, torch.float16]),
# # QQQ style
# *(TypeConfig(act_type=torch.int8,
# weight_type=scalar_types.uint4b8,
# output_type=torch.float16,
# group_scale_type=group_scale_type,
# group_zero_type=None,
# channel_scale_type=torch.float,
# token_scale_type=torch.float)
# for group_scale_type in [None, torch.float16]),
# *(TypeConfig(act_type=torch.float8_e4m3fn,
# weight_type=scalar_types.uint4b8,
# output_type=torch.float16,
# group_scale_type=group_scale_type,
# group_zero_type=None,
# channel_scale_type=torch.float,
# token_scale_type=torch.float)
# for group_scale_type in [None, torch.float16]),
]
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
......
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the marlin kernel.
Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
Run `pytest tests/kernels/quantization/test_marlin_gemm.py`.
"""
import pytest
import torch
......@@ -13,11 +13,7 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
from vllm.model_executor.layers.quantization.qqq import (
MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N,
MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx,
marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales,
query_marlin_supported_quant_types)
......@@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
marlin_weights)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import ( # noqa: E501
marlin_qqq_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
from vllm.scalar_type import scalar_types
......@@ -449,68 +443,6 @@ def test_hqq_marlin_gemm(
assert max_diff < 0.04
@pytest.mark.skipif(not is_quant_method_supported("qqq"),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS)
@pytest.mark.parametrize("group_size", MARLIN_QQQ_SUPPORTED_GROUP_SIZES)
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
def test_marlin_qqq_gemm(
k_chunk,
n_chunk,
num_bits,
group_size,
mnk_factors,
):
int8_traits = torch.iinfo(torch.int8)
m_factor, n_factor, k_factor = mnk_factors
size_m = m_factor
size_k = k_chunk * k_factor
size_n = n_chunk * n_factor
a_input = rand_data((size_m, size_k))
b_weight = rand_data((size_k, size_n))
# Quantize activations
s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to(
torch.float)
q_a = (a_input / s_a).round().clamp(int8_traits.min,
int8_traits.max).to(torch.int8)
# Quantize weights
w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \
marlin_qqq_quantize(b_weight, num_bits, group_size)
workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
MARLIN_QQQ_MAX_PARALLEL)
opcheck(torch.ops._C.marlin_qqq_gemm,
(q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
b_weight.shape[1], a_input.shape[1]))
output = ops.marlin_qqq_gemm(
q_a,
marlin_qqq_q_w,
s_a,
marlin_qqq_s_channel,
marlin_qqq_s_group,
workspace.scratch,
a_input.shape[0],
b_weight.shape[1],
a_input.shape[1],
)
output_ref = torch.matmul(q_a.half() * s_a.half(), w_ref)
torch.cuda.synchronize()
max_diff = compute_max_diff(output, output_ref)
assert max_diff < 0.04
def test_marlin_gemm_subset_input():
quant_type = scalar_types.uint4b8
group_size = 128
......@@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m):
max_diff = compute_max_diff(output, output_ref)
assert max_diff < 0.04
def test_marlin_gemm_opcheck():
size_m = 2048
size_n = 4096
size_k = 4096
a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16)
w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32)
s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16)
wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL).scratch
x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
torch.testing.assert_close(x, y)
opcheck(torch.ops._C.marlin_gemm, (a, w, s, wk, size_m, size_n, size_k))
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_scaled_mm kernel
Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
"""
import os
import importlib
from typing import Optional
import pytest
import torch
from vllm.platforms import current_platform
from ...utils import models_path_prefix
device = "cuda"
triton_scaled_mm_module = importlib.import_module(
"vllm.model_executor.layers.quantization.compressed_tensors."
"triton_scaled_mm")
triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
def torch_scaled_mm(a: torch.Tensor,
b: torch.Tensor,
scale_a: torch.Tensor,
scale_b: torch.Tensor,
out_dtype: type[torch.dtype],
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
out = torch.mm(a.to(torch.float32), b.to(torch.float32))
out = scale_a * out
out = scale_b.T * out
out = out.to(out_dtype)
if bias is not None:
out = out + bias
return out
def get_8bit_types():
types = [torch.int8]
if current_platform.supports_fp8():
types.append(current_platform.fp8_dtype())
return types
# This test is to check regressions for int8 support on ROCm.
@pytest.mark.parametrize("model_path", [
os.path.join(models_path_prefix, "neuralmagic/Llama-3.2-1B-quantized.w8a8"),
])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [10])
@pytest.mark.skipif(not current_platform.is_rocm(),
reason="Should only run on ROCm")
def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path,
max_tokens, num_logprobs):
dtype = "bfloat16"
with vllm_runner(model_path, dtype=dtype) as vllm_model:
vllm_model.generate_greedy_logprobs(example_prompts, max_tokens,
num_logprobs)
MNK_FACTORS = [
(1, 256, 128),
(33, 256, 496),
(64, 971, 1024),
(64, 20486, 128),
(512, 256, 496),
(512, 20486, 1024),
]
@pytest.mark.parametrize("M,N,K", MNK_FACTORS)
@pytest.mark.parametrize("out_dtype", [torch.bfloat16])
@pytest.mark.parametrize("in_dtype", get_8bit_types())
@pytest.mark.parametrize("use_scalar_scale_a", [True, False])
@pytest.mark.parametrize("use_scalar_scale_b", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
use_scalar_scale_b, use_bias):
is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t
).is_floating_point()
current_platform.seed_everything(0)
# NOTE: There are cases, where if the matrix is large enough, an output
# like 65504.4 can be produced, and can easily turn into inf when
# multiplied when using float16/bfloat16. This means one function, e.g.,
# testing function, and another function, e.g. golden function, can
# produce a non-inf value while the other produces an inf value, and
# will cause assert_close/allclose to fail, even though if overflow
# wouldn't have occurred, the values would have been "close."
#
# So, the values here are kept small enough to avoid this situation.
if is_floating_point_type(in_dtype):
a = (0.25 * torch.rand(
(M, K), dtype=torch.float32, device=device)).to(in_dtype)
b = (0.25 * torch.rand(
(K, N), dtype=torch.float32, device=device)).to(in_dtype)
else:
a = torch.randint(-32, 32, (M, K), dtype=in_dtype, device=device)
b = torch.randint(-32, 32, (K, N), dtype=in_dtype, device=device)
if use_scalar_scale_a:
scale_a = torch.rand((1, 1), device=device)
else:
scale_a = 0.25 * torch.rand((M, 1), device=device)
if use_scalar_scale_b:
scale_b = torch.rand((1, 1), device=device)
else:
scale_b = 0.25 * torch.rand((N, 1), device=device)
bias = None
if use_bias:
bias = torch.rand((N, ), device=device, dtype=out_dtype)
c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1)
......@@ -10,13 +10,19 @@ import pytest
import torch
from packaging import version
from vllm import SamplingParams
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
create_standard_kv_cache_spec,
create_vllm_config)
from vllm.v1.attention.backends.flex_attention import (
FlexAttentionMetadataBuilder)
from ..utils import models_path_prefix
from ..models.utils import check_embeddings_close
from ..models.utils import check_embeddings_close, check_logprobs_close
TORCH_VERSION = version.parse(torch.__version__)
MINIMUM_TORCH_VERSION = version.parse("2.7.0")
DIRECT_BUILD_VERSION = version.parse("2.9.dev0")
def set_seed(seed):
......@@ -36,22 +42,18 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
the default backend, ensuring they are identical when using the same seed.
the default backend, ensuring they are similar when using the same seed.
"""
model_name = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
seed = 42
max_tokens = 24
num_logprobs = 5
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
]
sampling_params = SamplingParams(temperature=0.0,
top_p=1.0,
seed=seed,
max_tokens=max_tokens)
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
......@@ -63,7 +65,8 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True) as llm_flex:
output_flex = llm_flex.generate(prompts, sampling_params)
output_flex = llm_flex.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)
# Run with default backend
with monkeypatch.context() as m:
......@@ -73,20 +76,17 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True) as llm_default:
output_default = llm_default.generate(prompts, sampling_params)
# Compare outputs from both backends
for i, (flex_result,
default_result) in enumerate(zip(output_flex, output_default)):
prompt = prompts[i]
flex_text = flex_result[1][0]
default_text = default_result[1][0]
assert flex_text == default_text, (
f"FlexAttention output doesn't match default for: {prompt!r}\n"
f"FlexAttention: {flex_text!r}\n"
f"Default: {default_text!r}")
enforce_eager=True,
gpu_memory_utilization=0.85) as llm_default:
output_default = llm_default.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=output_flex,
outputs_1_lst=output_default,
name_0="flex",
name_1="default",
)
@pytest.mark.skipif(
......@@ -138,5 +138,70 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
)
@pytest.mark.skipif(
not torch.cuda.is_available() or TORCH_VERSION < DIRECT_BUILD_VERSION,
reason="CUDA not available or PyTorch version < 2.7",
)
def test_block_mask_direct_vs_slow_path():
"""Test that direct path block mask is a superset of slow path.
The direct path may include extra blocks for performance (over-estimation),
but must include all blocks that the slow path determines are necessary.
"""
device = torch.device("cuda")
vllm_config = create_vllm_config(model_name="meta-llama/Meta-Llama-3-8B",
block_size=16,
max_model_len=1024)
kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
# Use a mixed batch that will create groups spanning multiple sequences
batch_spec = BatchSpec(seq_lens=[35, 64, 128, 256],
query_lens=[33, 5, 32, 64],
name="test_mixed_batch")
common_attn_metadata = create_common_attn_metadata(
batch_spec, vllm_config.cache_config.block_size, device)
builder = FlexAttentionMetadataBuilder(kv_cache_spec, [], vllm_config,
device)
metadata_direct = builder.build(common_prefix_len=0,
common_attn_metadata=common_attn_metadata)
builder.direct_build = False
metadata_slow = builder.build(common_prefix_len=0,
common_attn_metadata=common_attn_metadata)
assert metadata_direct.block_mask is not None
assert metadata_slow.block_mask is not None
# Extract block indices for comparison, B, H are the same
direct_indices = metadata_direct.block_mask.kv_indices[0, 0]
slow_indices = metadata_slow.block_mask.kv_indices[0, 0]
direct_num = metadata_direct.block_mask.kv_num_blocks[0, 0]
slow_num = metadata_slow.block_mask.kv_num_blocks[0, 0]
# main test: every block needed by slow path must be in direct path
num_groups = direct_num.shape[0]
all_contained = True
missing_details = []
for group_idx in range(num_groups):
direct_blocks = set(
direct_indices[group_idx, :direct_num[group_idx]].tolist())
slow_blocks = set(
slow_indices[group_idx, :slow_num[group_idx]].tolist())
missing_blocks = slow_blocks - direct_blocks
if missing_blocks:
all_contained = False
missing_details.append(
f"Group {group_idx}: missing {sorted(missing_blocks)}")
assert all_contained, (
"Direct path is missing blocks required by slow path:\n" +
"\n".join(missing_details))
if __name__ == "__main__":
pytest.main([__file__])
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for FlexAttention backend vs default backend"""
from typing import Optional
import pytest
import torch
from tests.kernels.utils import to_int8
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
if not current_platform.is_cpu():
pytest.skip("skipping CPU-only tests", allow_module_level=True)
NK_FACTORS = [
(256, 128),
(4096, 4096),
(16384, 4096),
(1023, 491),
(1001, 15),
]
M_FACTORS = [
(16, 1, 32, 128, 64),
(1, 17, 1, 31, 17),
]
CACHE_SIZES = [2]
DTYPE = [torch.bfloat16]
def rand_int8(shape: tuple, device: str = "cpu"):
return to_int8(torch.rand(shape, device=device) * 255 - 128)
def ref_int8_scaled_mm(
a: torch.Tensor,
b: torch.Tensor,
scale_a: torch.Tensor,
scale_b: torch.Tensor,
azp: Optional[torch.Tensor],
bias: Optional[torch.Tensor],
output_type: torch.dtype,
):
if azp is not None:
a = a.to(dtype=torch.float32) - azp.to(dtype=torch.float32)
output = torch.mm((scale_a * a.to(dtype=torch.float32)),
(scale_b * b.to(dtype=torch.float32)))
if bias is not None:
output += bias.float()
return output.to(dtype=output_type)
def onednn_int8_gemm_test_helper(primitive_cache_size: int,
m: int,
n: int,
k: int,
per_tensor_a_quant: bool,
per_tensor_b_quant: bool,
use_azp: bool,
use_bias: bool,
out_dtype: torch.dtype = torch.bfloat16,
device: str = "cpu"):
# Test for a oneDNN kernel with per-tensor / per-token activation
# quantization and per-tensor / per-output channel weight quantization.
a = to_int8(torch.randn((m, k), device=device) * 5)
b = to_int8(torch.randn((n, k), device=device).t() * 5)
a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1)
b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n)
scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32))
scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32))
if use_azp:
azp = torch.rand(a_scales_shape, dtype=torch.float32) * 10 + 1.5
azp = (azp / scale_a).round().to(dtype=torch.int32)
azp_adj = scale_b * b.sum(dim=0, keepdim=True, dtype=torch.float32)
else:
azp = None
azp_adj = None
if use_bias:
bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
else:
bias = None
handler = ops.create_onednn_scaled_mm(
b,
scale_b,
out_dtype,
not per_tensor_a_quant,
use_azp,
primitive_cache_size,
)
out = torch.zeros((m, n), dtype=out_dtype)
ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, bias)
baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, bias, out_dtype)
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
if use_bias:
# To test runtime bias setting
out = torch.zeros((m, n), dtype=out_dtype)
ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, None)
baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, None,
out_dtype)
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
@pytest.mark.parametrize("n,k", NK_FACTORS)
@pytest.mark.parametrize("m_list", M_FACTORS)
@pytest.mark.parametrize("per_tensor_a_scale", [True, False])
@pytest.mark.parametrize("per_tensor_b_scale", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("use_azp", [True, False])
@pytest.mark.parametrize("output_type", DTYPE)
@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES)
def test_onednn_int8_scaled_gemm(
n: int,
k: int,
m_list: tuple[int],
per_tensor_a_scale: bool,
per_tensor_b_scale: bool,
use_bias: bool,
use_azp: bool,
output_type: torch.dtype,
primitive_cache_size: int,
):
for m in m_list:
onednn_int8_gemm_test_helper(
primitive_cache_size=primitive_cache_size,
m=m,
n=n,
k=k,
per_tensor_a_quant=per_tensor_a_scale,
per_tensor_b_quant=per_tensor_b_scale,
use_bias=use_bias,
use_azp=use_azp,
out_dtype=output_type,
)
......@@ -3,7 +3,7 @@
import tempfile
from collections import OrderedDict
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock
import pytest
import os
......@@ -11,8 +11,6 @@ import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
import vllm
from vllm.config import LoRAConfig
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
......@@ -22,7 +20,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models.interfaces import SupportsLoRA
from vllm.platforms import current_platform
from ..utils import models_path_prefix
......@@ -106,6 +103,7 @@ def dummy_model() -> nn.Module:
]))
model.config = MagicMock()
model.embedding_modules = {"lm_head": "lm_head"}
model.unpadded_vocab_size = 32000
return model
......@@ -139,6 +137,8 @@ def dummy_model_gate_up() -> nn.Module:
],
}
model.embedding_modules = {"lm_head": "lm_head"}
model.unpadded_vocab_size = 32000
return model
......@@ -223,40 +223,6 @@ def tinyllama_lora_files():
return os.path.join(models_path_prefix, "jashing/tinyllama-colorist-lora")
@pytest.fixture(scope="session")
def phi2_lora_files():
# return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
return os.path.join(models_path_prefix, "isotr0py/phi-2-test-sql-lora")
@pytest.fixture(scope="session")
def qwen_lora_files():
# return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
return os.path.join(models_path_prefix, "customize/qwen-nl2dsl-lora")
@pytest.fixture
def llama_2_7b_engine_extra_embeddings():
cleanup_dist_env_and_memory(shutdown_ray=True)
get_model_old = get_model
def get_model_patched(**kwargs):
kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4,
max_lora_rank=8)
return get_model_old(**kwargs)
with patch("vllm.worker.model_runner.get_model", get_model_patched):
engine = vllm.LLM(os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"), enable_lora=False)
yield engine.llm_engine
del engine
cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture
def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
model_runner.model)
@pytest.fixture
def reset_default_device():
"""
......
......@@ -6,7 +6,6 @@ import time
import os
import pytest
import vllm.envs as env
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
......@@ -100,12 +99,10 @@ async def test_add_lora(chatglm3_lora_files):
# Run with warmup
add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
add_lora_results = await asyncio.gather(*add_lora_tasks)
if env.VLLM_USE_V1:
# Test that all all_lora calls are successful.
assert all(add_lora_results)
else:
# No way to check V0 engine results as the calls just return None.
pass
# Test that all all_lora calls are successful.
assert all(add_lora_results)
time_with_add_lora = await requests_processing_time(
llm, warmup_run_requests)
......
......@@ -89,6 +89,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
# more GPU memory causing vLLM to OOM
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
......@@ -97,7 +100,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=True,
enable_chunked_prefill=True)
enable_chunked_prefill=True,
gpu_memory_utilization=0.85)
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
......
......@@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
......@@ -347,7 +347,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@torch.inference_mode()
# @pytest.mark.skip(
# reason="Fails when loras are in any slot other than the first.")
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
......@@ -486,7 +486,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
@pytest.mark.parametrize("stage", STAGES)
......@@ -620,12 +620,15 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
@pytest.mark.parametrize("bias_enabled", [True, False])
def test_linear_replicated(dist_init, num_loras, device, stage,
bias_enabled) -> None:
def test_linear_replicated(
dist_init,
num_loras,
device,
stage,
) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
......@@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16,
bias_enabled=bias_enabled)
lora_config = LoRAConfig(
max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16,
)
def create_random_linear_replicated_layer():
......@@ -651,10 +655,6 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
lora_linear.create_lora_weights(max_loras, lora_config)
assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
lora_linear.lora_b_stacked) == 1)
if bias_enabled:
assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
else:
assert lora_linear.lora_bias_stacked is None
return linear, lora_linear
for i in range(NUM_RANDOM_SEEDS):
......@@ -734,14 +734,13 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("orientation", ["row", "column"])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
@pytest.mark.parametrize("bias_enabled", [True, False])
def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
device, stage, bias_enabled) -> None:
device, stage) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
......@@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
bias_enabled=bias_enabled)
lora_config = LoRAConfig(
max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
)
def create_random_linear_parallel_layer():
if orientation == "row":
......@@ -777,10 +777,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
lora_linear.create_lora_weights(max_loras, lora_config)
assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
lora_linear.lora_b_stacked) == 1)
if bias_enabled:
assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
else:
assert lora_linear.lora_bias_stacked is None
return linear, lora_linear
for i in range(NUM_RANDOM_SEEDS):
......@@ -860,14 +857,13 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("repeats", [1, 2, 3])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
@pytest.mark.parametrize("bias_enabled", [True, False])
def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
device, stage, bias_enabled) -> None:
device, stage) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
......@@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
bias_enabled=bias_enabled)
lora_config = LoRAConfig(
max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
)
def create_column_parallel_packed_layer():
if repeats == 2:
......@@ -924,10 +921,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
model_config=FakeConfig())
assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
lora_linear.lora_b_stacked) == n_slices)
if bias_enabled:
assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
else:
assert lora_linear.lora_bias_stacked is None
return linear, lora_linear
for i in range(NUM_RANDOM_SEEDS):
......
......@@ -114,8 +114,7 @@ def test_llama_lora(sql_lora_files):
enable_lora=True,
# also test odd max_num_seqs
max_num_seqs=13,
max_loras=4,
enable_chunked_prefill=True)
max_loras=4)
generate_and_test(llm, sql_lora_files)
......@@ -129,7 +128,6 @@ def test_llama_lora_tp4(sql_lora_files):
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=4,
enable_chunked_prefill=True,
)
generate_and_test(llm, sql_lora_files)
......@@ -145,7 +143,6 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
max_loras=4,
tensor_parallel_size=4,
fully_sharded_loras=True,
enable_chunked_prefill=True,
)
generate_and_test(llm, sql_lora_files)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Script to test multi loras service with tp >= 2
This script contains:
1. test multi loras service with tp >= 2
2. test multi loras request
"""
import pytest
from tests.utils import multi_gpu_test
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
......@@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync():
output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output)
def test_multiple_lora_requests():
llm = LLM(
model=MODEL_PATH,
enable_lora=True,
max_loras=4,
max_lora_rank=LORA_RANK,
max_model_len=512,
gpu_memory_utilization=0.5,
enforce_eager=True,
)
PROMPTS = ["Hello, my name is"] * 2
LORA_NAME = "Alice"
lora_request = [
LoRARequest(LORA_NAME + str(idx), idx + 1,
LORA_NAME_PATH_MAP[LORA_NAME])
for idx in range(len(PROMPTS))
]
# Multiple SamplingParams should be matched with each prompt
outputs = llm.generate(PROMPTS, lora_request=lora_request)
assert len(PROMPTS) == len(outputs)
# Exception raised, if the size of params does not match the size of prompts
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
# Single LoRARequest should be applied to every prompt
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment