Commit d2b52805 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori

parents 9a521c23 5438967f
......@@ -24,7 +24,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
current_platform.seed_everything(seed)
# Input tensor of shape (E, T, 2*H)
y = torch.randn((E, T, 2 * H), dtype=torch.float32, device="cuda")
y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
tokens_per_expert = torch.randint(
low=0,
high=T,
......@@ -74,7 +74,7 @@ def test_silu_mul_fp8_quant_deep_gemm(E, T, H, group_size, seed):
y_se = y_s[e]
y_qe = y_q[e]
torch.testing.assert_close(y_se[:nt], ref_s[:nt])
torch.testing.assert_close(y_se[:nt], ref_s[:nt], atol=1e-4, rtol=1e-2)
torch.testing.assert_close(
y_qe[:nt].to(torch.float32),
ref_q[:nt].to(torch.float32),
......
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the AWQ Triton kernel.
Run `pytest tests/kernels/test_awq_triton.py`.
Run `pytest tests/kernels/quantization/test_awq_triton.py`.
"""
import pytest
import torch
......
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for sparse cutlass kernels
Run `pytest tests/kernels/test_semi_structured.py`.
Run `pytest tests/kernels/quantization/test_cutlass_2of4_sparse.py`.
"""
import pytest
......
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for cutlass kernels
Run `pytest tests/kernels/test_cutlass.py`.
Run `pytest tests/kernels/quantization/test_cutlass_scaled_mm.py`.
"""
import random
......@@ -535,7 +535,7 @@ def test_cutlass_fp8_group_gemm(num_experts: int, per_act_token: bool,
expert_offsets = torch.zeros((num_experts + 1),
device=device,
dtype=torch.int32)
dtype=torch.int64)
problem_sizes = torch.zeros((num_experts, 3),
device=device,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the CUTLASS W4A8 kernel.
Run `pytest tests/kernels/quantization/test_cutlass_w4a8.py`.
"""
from dataclasses import dataclass
from typing import Optional
import pytest
import torch
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.quant_utils import (
pack_rows, quantize_weights)
from vllm.platforms import current_platform
from vllm.scalar_type import ScalarType, scalar_types
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
# unit tests to a common utility function. Currently the use of
# `is_quant_method_supported` conflates kernels with quantization methods
# an assumption which is breaking down as quantizations methods can have
# have kernels and some kernels support multiple quantization methods.
IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9
MNK_SHAPES = [(1, 128, 128), (1, 512, 1024), (1, 4096, 4096), (1, 8192, 28672),
(13, 8192, 4096), (26, 4096, 8192), (64, 4096, 4096),
(64, 8192, 28672), (257, 128, 4096), (257, 4096, 4096),
(1024, 4096, 8192), (1024, 8192, 4096)]
# TODO(czhu): get supported schedules from fn
SCHEDULES = [
'128x16_1x1x1', '256x16_1x1x1', '128x32_1x1x1', '256x32_1x1x1',
'128x64_1x1x1', '256x64_1x1x1', '128x128_1x1x1', '256x128_1x1x1',
'128x256_1x1x1', '128x256_2x1x1'
]
@dataclass
class TypeConfig:
act_type: torch.dtype
weight_type: ScalarType
output_type: Optional[torch.dtype]
group_scale_type: Optional[torch.dtype]
channel_scale_type: Optional[torch.dtype]
token_scale_type: Optional[torch.dtype]
@dataclass
class Tensors:
w_ref: torch.Tensor
a_ref: torch.Tensor
a: torch.Tensor
w_q: torch.Tensor
w_g_s: torch.Tensor
w_ch_s: torch.Tensor
w_tok_s: torch.Tensor
# (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints,
# Ch Scales Type, Tok Scales Type)
TestTypeTuple = tuple[list[torch.dtype], ScalarType, Optional[torch.dtype],
Optional[torch.dtype], bool]
TEST_TYPES = [
*(
TypeConfig(act_type=torch.float8_e4m3fn,
weight_type=w_type,
output_type=o_type,
group_scale_type=torch.float8_e4m3fn,
channel_scale_type=torch.float32,
token_scale_type=torch.float32)
for w_type in [scalar_types.int4]
# TODO(czhu): fp16 out type
for o_type in [torch.bfloat16]),
]
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
# unit tests to a common utility function. Currently the use of
# `is_quant_method_supported` conflates kernels with quantization methods
# an assumption which is breaking down as quantizations methods can have
# have kernels and some kernels support multiple quantization methods.
IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90)
# For testing quantized linear kernels
def to_fp8(tensor: torch.Tensor):
finfo = torch.finfo(torch.float8_e4m3fn)
return tensor.clamp(min=finfo.min,
max=finfo.max).to(dtype=torch.float8_e4m3fn)
def cutlass_quantize_and_pack(atype: torch.dtype,
w: torch.Tensor,
wtype: ScalarType,
stype: Optional[torch.dtype],
group_size: Optional[int],
zero_points: bool = False):
assert wtype.is_integer(), "TODO: support floating point weights"
w_ref, w_q, w_s, w_zp = quantize_weights(w,
wtype,
group_size=group_size,
zero_points=zero_points)
# since scales are cast to fp8, we need to compute w_ref this way
w_ref = ((w_q).to(torch.float32) * w_s.to(atype).to(
torch.float32).repeat_interleave(group_size, dim=0)).to(atype)
# bit mask prevents sign extending int4 when packing
w_q = pack_rows(w_q & 0x0F, wtype.size_bits, *w_q.shape)
w_q = w_q.t().contiguous().t() # convert to col major
w_q_packed = ops.cutlass_encode_and_reorder_int4b(w_q)
w_s_packed = ops.cutlass_pack_scale_fp8(w_s.to(atype))
return w_ref, w_q_packed, w_s_packed, w_zp
def create_test_tensors(shape: tuple[int, int, int], types: TypeConfig,
group_size: Optional[int]) -> Tensors:
m, n, k = shape
print("create_test_tensors, shape:", shape, "types:", types, "group_size:",
group_size)
a = to_fp8(torch.randn((m, k), device="cuda"))
w = to_fp8(torch.randn((k, n), device="cuda"))
if types.group_scale_type is not None:
w = w.to(types.group_scale_type)
if w.dtype.itemsize == 1:
w = w.to(torch.float16)
w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
a.dtype, w, types.weight_type, types.group_scale_type, group_size,
False)
a_ref = a.to(torch.float32)
w_ref = w_ref.to(torch.float32)
# for the practical use case we need per-tok scales for fp8 activations
w_tok_s = torch.randn((m, ), device='cuda', dtype=types.token_scale_type)
# weights are already per-group quantized, use placeholder here
w_ch_s = torch.ones((n, ), device='cuda', dtype=types.channel_scale_type)
return Tensors(w_ref=w_ref,
a_ref=a_ref,
a=a,
w_q=w_q_packed,
w_g_s=w_s,
w_ch_s=w_ch_s,
w_tok_s=w_tok_s)
def mm_test_helper(types: TypeConfig,
tensors: Tensors,
group_size: Optional[int] = None,
schedule: Optional[str] = None):
# CUTLASS upstream uses fp8 with fastaccum as reference
# https://github.com/NVIDIA/cutlass/blob/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu#L406
output_ref = torch._scaled_mm(
tensors.a_ref.to(types.act_type),
tensors.w_ref.to(types.act_type).t().contiguous().t(), # col major
tensors.w_tok_s.unsqueeze(1),
tensors.w_ch_s.unsqueeze(0),
out_dtype=types.output_type,
use_fast_accum=True)
output = ops.cutlass_w4a8_mm(
a=tensors.a,
b_q=tensors.w_q,
b_group_scales=tensors.w_g_s,
b_group_size=group_size,
b_channel_scales=tensors.w_ch_s,
a_token_scales=tensors.w_tok_s,
)
print(output)
print(output_ref)
torch.testing.assert_close(output,
output_ref.to(output.dtype),
rtol=1e-3,
atol=1e-3)
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
reason="CUTLASS W4A8 is not supported on this GPU type.")
@pytest.mark.parametrize("shape",
MNK_SHAPES,
ids=lambda x: "x".join(str(v) for v in x))
@pytest.mark.parametrize("types", TEST_TYPES)
@pytest.mark.parametrize("schedule", SCHEDULES)
def test_cutlass_w4a8(shape, types: TypeConfig, schedule):
group_sizes = [128]
for group_size in group_sizes:
tensors = create_test_tensors(shape, types, group_size)
mm_test_helper(types, tensors, group_size, schedule)
# Test to make sure cuda graphs work
class W4A8Layer(torch.nn.Module):
def __init__(self, **kwargs):
super().__init__()
self.kwargs = kwargs
def forward(self, a):
return ops.cutlass_w4a8_mm(a=a, **self.kwargs)
@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU,
reason="CUTLASS W4A8 is not supported on this GPU type.")
def test_w4a8_cuda_graph():
m, n, k = 512, 4096, 4096
a = to_fp8(torch.randn((m, k), device="cuda"))
b = to_fp8(torch.randn((k, n), device="cuda"))
wtype = scalar_types.int4
stype = torch.float8_e4m3fn
group_size = 128
zero_points = False
w_ref, w_q_packed, w_s, _ = cutlass_quantize_and_pack(
a.dtype, b.to(torch.float16), wtype, stype, group_size, zero_points)
w_tok_s = torch.randn((m, ), device='cuda', dtype=torch.float32)
w_ch_s = torch.ones((n, ), device='cuda', dtype=torch.float32)
# Construct a trivial model with a single layer that calls the kernel
model = W4A8Layer(
b_q=w_q_packed,
b_group_scales=w_s,
b_group_size=group_size,
b_channel_scales=w_ch_s,
a_token_scales=w_tok_s,
)
output_ref = torch._scaled_mm(
a,
w_ref.to(a.dtype).t().contiguous().t(), # col major
w_tok_s.unsqueeze(1),
w_ch_s.unsqueeze(0),
out_dtype=torch.bfloat16,
use_fast_accum=True)
# Run the model with a cuda graph
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
output = model(a)
output.zero_()
g.replay()
torch.testing.assert_close(output, output_ref, rtol=1e-3, atol=1e-3)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm
if not current_platform.has_device_capability(100):
pytest.skip(
reason=
"Flashinfer FP8 gemms requires compute capability of 10.0 or above.",
allow_module_level=True,
)
DTYPES = [torch.float16, torch.bfloat16]
# m, n, k
SHAPES = [(128, 128, 64), (128, 128, 128), (256, 128, 64), (128, 256, 128)]
PAD_SHAPES = [(150, 128, 64), (128, 128, 96)]
SHAPES.extend(PAD_SHAPES)
SEEDS = [42]
CUDA_DEVICES = ["cuda:0"]
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@pytest.mark.parametrize("autotune", [False, True])
@torch.inference_mode()
def test_flashinfer_fp8_gemm(
dtype: torch.dtype,
shape: tuple[int, int, int],
use_bias: bool,
seed: int,
device: str,
autotune: bool,
) -> None:
current_platform.seed_everything(seed)
m, n, k = shape
a = torch.randn((m, k), dtype=dtype, device=device)
b = torch.randn((n, k), dtype=dtype, device=device) / k
a_fp8, a_scale = ops.scaled_fp8_quant(a)
b_fp8, b_scale = ops.scaled_fp8_quant(b)
expected_out = torch.mm(
a_scale * a_fp8.to(dtype=torch.float32),
b_scale * b_fp8.to(dtype=torch.float32).t(),
).to(dtype=dtype)
if use_bias:
bias = torch.randn((n, ), dtype=dtype, device=device)
expected_out = expected_out + bias
else:
bias = None
import flashinfer
with flashinfer.autotune(autotune):
out = flashinfer_scaled_fp8_mm(
a_fp8,
b_fp8.t(),
a_scale,
b_scale,
dtype,
bias=bias,
)
torch.testing.assert_close(out, expected_out, atol=1e-2, rtol=1e-2)
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the machete kernel.
Run `pytest tests/kernels/test_machete_mm.py`.
Run `pytest tests/kernels/quantization/test_machete_mm.py`.
"""
import math
......@@ -95,23 +95,23 @@ TEST_TYPES = [
token_scale_type=None)
for w_type in [scalar_types.uint4, scalar_types.uint8]
for a_type in [torch.float16, torch.bfloat16]),
# QQQ style
*(TypeConfig(act_type=torch.int8,
weight_type=scalar_types.uint4b8,
output_type=torch.float16,
group_scale_type=group_scale_type,
group_zero_type=None,
channel_scale_type=torch.float,
token_scale_type=torch.float)
for group_scale_type in [None, torch.float16]),
*(TypeConfig(act_type=torch.float8_e4m3fn,
weight_type=scalar_types.uint4b8,
output_type=torch.float16,
group_scale_type=group_scale_type,
group_zero_type=None,
channel_scale_type=torch.float,
token_scale_type=torch.float)
for group_scale_type in [None, torch.float16]),
# # QQQ style
# *(TypeConfig(act_type=torch.int8,
# weight_type=scalar_types.uint4b8,
# output_type=torch.float16,
# group_scale_type=group_scale_type,
# group_zero_type=None,
# channel_scale_type=torch.float,
# token_scale_type=torch.float)
# for group_scale_type in [None, torch.float16]),
# *(TypeConfig(act_type=torch.float8_e4m3fn,
# weight_type=scalar_types.uint4b8,
# output_type=torch.float16,
# group_scale_type=group_scale_type,
# group_zero_type=None,
# channel_scale_type=torch.float,
# token_scale_type=torch.float)
# for group_scale_type in [None, torch.float16]),
]
# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel
......
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the marlin kernel.
Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
Run `pytest tests/kernels/quantization/test_marlin_gemm.py`.
"""
import pytest
import torch
......@@ -13,11 +13,7 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
from vllm.model_executor.layers.quantization.qqq import (
MARLIN_QQQ_MAX_PARALLEL, MARLIN_QQQ_MIN_THREAD_N,
MARLIN_QQQ_SUPPORTED_GROUP_SIZES, MARLIN_QQQ_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
MARLIN_SUPPORTED_GROUP_SIZES, marlin_make_empty_g_idx,
marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales,
query_marlin_supported_quant_types)
......@@ -31,8 +27,6 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
marlin_weights)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_qqq import ( # noqa: E501
marlin_qqq_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
from vllm.scalar_type import scalar_types
......@@ -449,68 +443,6 @@ def test_hqq_marlin_gemm(
assert max_diff < 0.04
@pytest.mark.skipif(not is_quant_method_supported("qqq"),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@pytest.mark.parametrize("num_bits", MARLIN_QQQ_SUPPORTED_NUM_BITS)
@pytest.mark.parametrize("group_size", MARLIN_QQQ_SUPPORTED_GROUP_SIZES)
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
def test_marlin_qqq_gemm(
k_chunk,
n_chunk,
num_bits,
group_size,
mnk_factors,
):
int8_traits = torch.iinfo(torch.int8)
m_factor, n_factor, k_factor = mnk_factors
size_m = m_factor
size_k = k_chunk * k_factor
size_n = n_chunk * n_factor
a_input = rand_data((size_m, size_k))
b_weight = rand_data((size_k, size_n))
# Quantize activations
s_a = a_input.abs().max(dim=-1, keepdim=True)[0].div(int8_traits.max).to(
torch.float)
q_a = (a_input / s_a).round().clamp(int8_traits.min,
int8_traits.max).to(torch.int8)
# Quantize weights
w_ref, marlin_qqq_q_w, marlin_qqq_s_group, marlin_qqq_s_channel = \
marlin_qqq_quantize(b_weight, num_bits, group_size)
workspace = MarlinWorkspace(size_n, MARLIN_QQQ_MIN_THREAD_N,
MARLIN_QQQ_MAX_PARALLEL)
opcheck(torch.ops._C.marlin_qqq_gemm,
(q_a, marlin_qqq_q_w, s_a, marlin_qqq_s_channel,
marlin_qqq_s_group, workspace.scratch, a_input.shape[0],
b_weight.shape[1], a_input.shape[1]))
output = ops.marlin_qqq_gemm(
q_a,
marlin_qqq_q_w,
s_a,
marlin_qqq_s_channel,
marlin_qqq_s_group,
workspace.scratch,
a_input.shape[0],
b_weight.shape[1],
a_input.shape[1],
)
output_ref = torch.matmul(q_a.half() * s_a.half(), w_ref)
torch.cuda.synchronize()
max_diff = compute_max_diff(output, output_ref)
assert max_diff < 0.04
def test_marlin_gemm_subset_input():
quant_type = scalar_types.uint4b8
group_size = 128
......@@ -602,18 +534,3 @@ def test_marlin_gemm_with_bias(size_m):
max_diff = compute_max_diff(output, output_ref)
assert max_diff < 0.04
def test_marlin_gemm_opcheck():
size_m = 2048
size_n = 4096
size_k = 4096
a = torch.rand((size_m, size_n), device='cuda', dtype=torch.float16)
w = torch.randint(-5, 5, (256, 8192), device='cuda', dtype=torch.int32)
s = torch.full((32, size_k), 0.125, device='cuda', dtype=torch.float16)
wk = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL).scratch
x = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
y = torch.ops._C.marlin_gemm(a, w, s, wk, size_m, size_n, size_k)
torch.testing.assert_close(x, y)
opcheck(torch.ops._C.marlin_gemm, (a, w, s, wk, size_m, size_n, size_k))
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from tests.kernels.utils import opcheck
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
if not current_platform.has_device_capability(100):
pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
allow_module_level=True)
DTYPES = [torch.float16, torch.bfloat16]
SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)]
SEEDS = [42]
CUDA_DEVICES = ['cuda:0']
FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
BLOCK_SIZE = 16
def ref_impl(silu_and_mul: SiluAndMul, x: torch.Tensor,
global_scale: torch.Tensor,
ref_output_scale: torch.Tensor) -> torch.Tensor:
silu_and_mul_out = silu_and_mul.forward_native(x)
assert not current_platform.is_rocm()
assert silu_and_mul_out.ndim >= 1, (
f'input.ndim needs to be >= 1, but got {silu_and_mul_out.ndim}.')
other_dims = 1 if silu_and_mul_out.ndim == 1 else -1
silu_and_mul_out = silu_and_mul_out.reshape(other_dims,
silu_and_mul_out.shape[-1])
m, n = silu_and_mul_out.shape
device = silu_and_mul_out.device
# Two fp4 values will be packed into an uint8.
out = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
output_scale = ref_output_scale
torch.ops._C.scaled_fp4_quant(out, silu_and_mul_out, output_scale,
global_scale)
return out, output_scale
def ops_impl(x: torch.Tensor, global_scale: torch.Tensor,
ref_output_scale: torch.Tensor) -> torch.Tensor:
out_shape = (x.shape[0], x.shape[1] // 4)
output_scale = ref_output_scale
out = torch.empty(out_shape, dtype=torch.uint8, device=x.device)
torch.ops._C.silu_and_mul_nvfp4_quant(out, output_scale, x, global_scale)
return out, output_scale
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_quantize_to_fp4(
dtype: torch.dtype,
shape: tuple[int, int],
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
torch.set_default_device(device)
m, n = shape
x = torch.randn((m, n), dtype=dtype)
tensor_amax = torch.abs(x).max().to(torch.float32)
global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
block_size = 16
assert n % block_size == 0, (
f'last dim has to be multiple of 16, but got {n}.')
assert x.dtype in (torch.float16, torch.bfloat16), (
f'input.dtype needs to be fp16 or bf16 but got {x.dtype}.')
round_up = lambda x, y: (x + y - 1) // y * y
rounded_m = round_up(x.shape[0], 128)
scale_n = x.shape[1] // (2 * block_size)
rounded_n = round_up(scale_n, 4)
output_scale = torch.empty((rounded_m, rounded_n // 4),
device=x.device,
dtype=torch.int32)
layer = SiluAndMul()
ref_out, ref_out_scale = ref_impl(layer, x, global_scale, output_scale)
fusion_out, fusion_out_scale = ops_impl(x, global_scale, output_scale)
assert ref_out.dtype == torch.uint8
assert fusion_out.dtype == torch.uint8
assert ref_out.shape == fusion_out.shape
assert ref_out_scale.dtype == torch.int32
assert fusion_out_scale.dtype == torch.int32
assert ref_out_scale.shape == fusion_out_scale.shape
# Allow up to 2% of mismatched values since BF16 has accuracy issues.
mis_threshold = 0.02
atol = 0.4
rtol = 0.4
ref_logits = ref_out[-1]
fusion_logits = fusion_out[-1]
mis_count = torch.sum(
torch.abs(fusion_logits - ref_logits) > (atol +
rtol * torch.abs(ref_logits)))
mis_ratio = mis_count / fusion_logits.numel()
assert mis_ratio < mis_threshold, \
f"Mismatch ratio {mis_ratio} exceeds threshold {mis_threshold}"
torch.testing.assert_close(ref_out_scale, fusion_out_scale)
opcheck(torch.ops._C.silu_and_mul_nvfp4_quant,
(fusion_out, fusion_out_scale, x, global_scale))
......@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_scaled_mm kernel
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
Run `pytest tests/kernels/quantization/test_triton_scaled_mm.py`.
"""
import importlib
from typing import Optional
......
......@@ -9,12 +9,17 @@ import pytest
import torch
from packaging import version
from vllm import SamplingParams
from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
create_standard_kv_cache_spec,
create_vllm_config)
from vllm.v1.attention.backends.flex_attention import (
FlexAttentionMetadataBuilder)
from ..models.utils import check_embeddings_close
from ..models.utils import check_embeddings_close, check_logprobs_close
TORCH_VERSION = version.parse(torch.__version__)
MINIMUM_TORCH_VERSION = version.parse("2.7.0")
DIRECT_BUILD_VERSION = version.parse("2.9.dev0")
def set_seed(seed):
......@@ -34,22 +39,18 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
the default backend, ensuring they are identical when using the same seed.
the default backend, ensuring they are similar when using the same seed.
"""
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
seed = 42
max_tokens = 24
num_logprobs = 5
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
]
sampling_params = SamplingParams(temperature=0.0,
top_p=1.0,
seed=seed,
max_tokens=max_tokens)
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
......@@ -61,7 +62,8 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True) as llm_flex:
output_flex = llm_flex.generate(prompts, sampling_params)
output_flex = llm_flex.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)
# Run with default backend
with monkeypatch.context() as m:
......@@ -71,20 +73,17 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True) as llm_default:
output_default = llm_default.generate(prompts, sampling_params)
# Compare outputs from both backends
for i, (flex_result,
default_result) in enumerate(zip(output_flex, output_default)):
prompt = prompts[i]
flex_text = flex_result[1][0]
default_text = default_result[1][0]
assert flex_text == default_text, (
f"FlexAttention output doesn't match default for: {prompt!r}\n"
f"FlexAttention: {flex_text!r}\n"
f"Default: {default_text!r}")
enforce_eager=True,
gpu_memory_utilization=0.85) as llm_default:
output_default = llm_default.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs)
check_logprobs_close(
outputs_0_lst=output_flex,
outputs_1_lst=output_default,
name_0="flex",
name_1="default",
)
@pytest.mark.skipif(
......@@ -136,5 +135,70 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
)
@pytest.mark.skipif(
not torch.cuda.is_available() or TORCH_VERSION < DIRECT_BUILD_VERSION,
reason="CUDA not available or PyTorch version < 2.7",
)
def test_block_mask_direct_vs_slow_path():
"""Test that direct path block mask is a superset of slow path.
The direct path may include extra blocks for performance (over-estimation),
but must include all blocks that the slow path determines are necessary.
"""
device = torch.device("cuda")
vllm_config = create_vllm_config(model_name="meta-llama/Meta-Llama-3-8B",
block_size=16,
max_model_len=1024)
kv_cache_spec = create_standard_kv_cache_spec(vllm_config)
# Use a mixed batch that will create groups spanning multiple sequences
batch_spec = BatchSpec(seq_lens=[35, 64, 128, 256],
query_lens=[33, 5, 32, 64],
name="test_mixed_batch")
common_attn_metadata = create_common_attn_metadata(
batch_spec, vllm_config.cache_config.block_size, device)
builder = FlexAttentionMetadataBuilder(kv_cache_spec, [], vllm_config,
device)
metadata_direct = builder.build(common_prefix_len=0,
common_attn_metadata=common_attn_metadata)
builder.direct_build = False
metadata_slow = builder.build(common_prefix_len=0,
common_attn_metadata=common_attn_metadata)
assert metadata_direct.block_mask is not None
assert metadata_slow.block_mask is not None
# Extract block indices for comparison, B, H are the same
direct_indices = metadata_direct.block_mask.kv_indices[0, 0]
slow_indices = metadata_slow.block_mask.kv_indices[0, 0]
direct_num = metadata_direct.block_mask.kv_num_blocks[0, 0]
slow_num = metadata_slow.block_mask.kv_num_blocks[0, 0]
# main test: every block needed by slow path must be in direct path
num_groups = direct_num.shape[0]
all_contained = True
missing_details = []
for group_idx in range(num_groups):
direct_blocks = set(
direct_indices[group_idx, :direct_num[group_idx]].tolist())
slow_blocks = set(
slow_indices[group_idx, :slow_num[group_idx]].tolist())
missing_blocks = slow_blocks - direct_blocks
if missing_blocks:
all_contained = False
missing_details.append(
f"Group {group_idx}: missing {sorted(missing_blocks)}")
assert all_contained, (
"Direct path is missing blocks required by slow path:\n" +
"\n".join(missing_details))
if __name__ == "__main__":
pytest.main([__file__])
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for FlexAttention backend vs default backend"""
from typing import Optional
import pytest
import torch
from tests.kernels.utils import to_int8
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
if not current_platform.is_cpu():
pytest.skip("skipping CPU-only tests", allow_module_level=True)
NK_FACTORS = [
(256, 128),
(4096, 4096),
(16384, 4096),
(1023, 491),
(1001, 15),
]
M_FACTORS = [
(16, 1, 32, 128, 64),
(1, 17, 1, 31, 17),
]
CACHE_SIZES = [2]
DTYPE = [torch.bfloat16]
def rand_int8(shape: tuple, device: str = "cpu"):
return to_int8(torch.rand(shape, device=device) * 255 - 128)
def ref_int8_scaled_mm(
a: torch.Tensor,
b: torch.Tensor,
scale_a: torch.Tensor,
scale_b: torch.Tensor,
azp: Optional[torch.Tensor],
bias: Optional[torch.Tensor],
output_type: torch.dtype,
):
if azp is not None:
a = a.to(dtype=torch.float32) - azp.to(dtype=torch.float32)
output = torch.mm((scale_a * a.to(dtype=torch.float32)),
(scale_b * b.to(dtype=torch.float32)))
if bias is not None:
output += bias.float()
return output.to(dtype=output_type)
def onednn_int8_gemm_test_helper(primitive_cache_size: int,
m: int,
n: int,
k: int,
per_tensor_a_quant: bool,
per_tensor_b_quant: bool,
use_azp: bool,
use_bias: bool,
out_dtype: torch.dtype = torch.bfloat16,
device: str = "cpu"):
# Test for a oneDNN kernel with per-tensor / per-token activation
# quantization and per-tensor / per-output channel weight quantization.
a = to_int8(torch.randn((m, k), device=device) * 5)
b = to_int8(torch.randn((n, k), device=device).t() * 5)
a_scales_shape = (1, 1) if per_tensor_a_quant else (m, 1)
b_scales_shape = (1, 1) if per_tensor_b_quant else (1, n)
scale_a = (torch.randn(a_scales_shape, device=device, dtype=torch.float32))
scale_b = (torch.randn(b_scales_shape, device=device, dtype=torch.float32))
if use_azp:
azp = torch.rand(a_scales_shape, dtype=torch.float32) * 10 + 1.5
azp = (azp / scale_a).round().to(dtype=torch.int32)
azp_adj = scale_b * b.sum(dim=0, keepdim=True, dtype=torch.float32)
else:
azp = None
azp_adj = None
if use_bias:
bias = torch.rand((n, ), device=device, dtype=out_dtype) * 10
else:
bias = None
handler = ops.create_onednn_scaled_mm(
b,
scale_b,
out_dtype,
not per_tensor_a_quant,
use_azp,
primitive_cache_size,
)
out = torch.zeros((m, n), dtype=out_dtype)
ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, bias)
baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, bias, out_dtype)
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
if use_bias:
# To test runtime bias setting
out = torch.zeros((m, n), dtype=out_dtype)
ops.onednn_scaled_mm(handler, a, out, scale_a, azp, azp_adj, None)
baseline = ref_int8_scaled_mm(a, b, scale_a, scale_b, azp, None,
out_dtype)
torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
@pytest.mark.parametrize("n,k", NK_FACTORS)
@pytest.mark.parametrize("m_list", M_FACTORS)
@pytest.mark.parametrize("per_tensor_a_scale", [True, False])
@pytest.mark.parametrize("per_tensor_b_scale", [True, False])
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("use_azp", [True, False])
@pytest.mark.parametrize("output_type", DTYPE)
@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES)
def test_onednn_int8_scaled_gemm(
n: int,
k: int,
m_list: tuple[int],
per_tensor_a_scale: bool,
per_tensor_b_scale: bool,
use_bias: bool,
use_azp: bool,
output_type: torch.dtype,
primitive_cache_size: int,
):
for m in m_list:
onednn_int8_gemm_test_helper(
primitive_cache_size=primitive_cache_size,
m=m,
n=n,
k=k,
per_tensor_a_quant=per_tensor_a_scale,
per_tensor_b_quant=per_tensor_b_scale,
use_bias=use_bias,
use_azp=use_azp,
out_dtype=output_type,
)
......@@ -3,15 +3,13 @@
import tempfile
from collections import OrderedDict
from unittest.mock import MagicMock, patch
from unittest.mock import MagicMock
import pytest
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
import vllm
from vllm.config import LoRAConfig
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
......@@ -21,7 +19,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.models.interfaces import SupportsLoRA
from vllm.platforms import current_platform
......@@ -104,6 +101,7 @@ def dummy_model() -> nn.Module:
]))
model.config = MagicMock()
model.embedding_modules = {"lm_head": "lm_head"}
model.unpadded_vocab_size = 32000
return model
......@@ -137,6 +135,8 @@ def dummy_model_gate_up() -> nn.Module:
],
}
model.embedding_modules = {"lm_head": "lm_head"}
model.unpadded_vocab_size = 32000
return model
......@@ -216,34 +216,6 @@ def tinyllama_lora_files():
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
@pytest.fixture(scope="session")
def phi2_lora_files():
return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora")
@pytest.fixture
def llama_2_7b_engine_extra_embeddings():
cleanup_dist_env_and_memory(shutdown_ray=True)
get_model_old = get_model
def get_model_patched(**kwargs):
kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4,
max_lora_rank=8)
return get_model_old(**kwargs)
with patch("vllm.worker.model_runner.get_model", get_model_patched):
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
yield engine.llm_engine
del engine
cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture
def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
model_runner.model)
@pytest.fixture
def reset_default_device():
"""
......
......@@ -5,7 +5,6 @@ import time
import pytest
import vllm.envs as env
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
......@@ -98,12 +97,10 @@ async def test_add_lora(chatglm3_lora_files):
# Run with warmup
add_lora_tasks = [llm.add_lora(lr) for lr in warmup_run_requests]
add_lora_results = await asyncio.gather(*add_lora_tasks)
if env.VLLM_USE_V1:
# Test that all all_lora calls are successful.
assert all(add_lora_results)
else:
# No way to check V0 engine results as the calls just return None.
pass
# Test that all all_lora calls are successful.
assert all(add_lora_results)
time_with_add_lora = await requests_processing_time(
llm, warmup_run_requests)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
MODEL_PATH = "baichuan-inc/Baichuan-7B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format(
query=
"What is the average, minimum, and maximum age of all singers from France?" # noqa: E501
),
PROMPT_TEMPLATE.format(
query=
"Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501
),
]
print(prompts)
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts: list[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts
def test_baichuan_lora(baichuan_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=64,
trust_remote_code=True)
expected_lora_output = [
"SELECT count(*) FROM singer",
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE Country = 'France'", # noqa: E501
"SELECT name , country , age FROM singer ORDER BY age ASC",
]
output1 = do_sample(llm, baichuan_lora_files, lora_id=1)
for i in range(len(expected_lora_output)):
assert output1[i] == expected_lora_output[i]
output2 = do_sample(llm, baichuan_lora_files, lora_id=2)
for i in range(len(expected_lora_output)):
assert output2[i] == expected_lora_output[i]
@pytest.mark.parametrize("fully_sharded", [True, False])
def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
num_gpus_available, fully_sharded):
if num_gpus_available < 4:
pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
llm_tp1 = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_lora_rank=64,
trust_remote_code=True,
fully_sharded_loras=fully_sharded)
output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
del llm_tp1
cleanup_dist_env_and_memory()
llm_tp2 = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_lora_rank=64,
tensor_parallel_size=2,
trust_remote_code=True,
fully_sharded_loras=fully_sharded)
output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)
del llm_tp2
cleanup_dist_env_and_memory()
assert output_tp1 == output_tp2
llm_tp4 = vllm.LLM(MODEL_PATH,
enable_lora=True,
max_num_seqs=16,
max_loras=4,
max_lora_rank=64,
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=fully_sharded)
output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)
del llm_tp4
cleanup_dist_env_and_memory()
assert output_tp1 == output_tp4
......@@ -87,6 +87,9 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
# https://github.com/NVIDIA/nccl/issues/1790, set a lower value for
# gpu_memory_utilization here because NCCL >= 2.26.3 seems to use
# more GPU memory causing vLLM to OOM
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
enable_lora=True,
......@@ -95,7 +98,8 @@ def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
tensor_parallel_size=4,
trust_remote_code=True,
fully_sharded_loras=True,
enable_chunked_prefill=True)
enable_chunked_prefill=True,
gpu_memory_utilization=0.85)
output1 = do_sample(llm, chatglm3_lora_files, lora_id=1)
for i in range(len(EXPECTED_LORA_OUTPUT)):
assert output1[i] == EXPECTED_LORA_OUTPUT[i]
......
......@@ -243,7 +243,7 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
......@@ -347,7 +347,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@torch.inference_mode()
# @pytest.mark.skip(
# reason="Fails when loras are in any slot other than the first.")
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
......@@ -486,7 +486,7 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
@pytest.mark.parametrize("stage", STAGES)
......@@ -620,12 +620,15 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
@pytest.mark.parametrize("bias_enabled", [True, False])
def test_linear_replicated(dist_init, num_loras, device, stage,
bias_enabled) -> None:
def test_linear_replicated(
dist_init,
num_loras,
device,
stage,
) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
......@@ -634,10 +637,11 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16,
bias_enabled=bias_enabled)
lora_config = LoRAConfig(
max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16,
)
def create_random_linear_replicated_layer():
......@@ -651,10 +655,6 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
lora_linear.create_lora_weights(max_loras, lora_config)
assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
lora_linear.lora_b_stacked) == 1)
if bias_enabled:
assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
else:
assert lora_linear.lora_bias_stacked is None
return linear, lora_linear
for i in range(NUM_RANDOM_SEEDS):
......@@ -734,14 +734,13 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("orientation", ["row", "column"])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
@pytest.mark.parametrize("bias_enabled", [True, False])
def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
device, stage, bias_enabled) -> None:
device, stage) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
......@@ -750,11 +749,12 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
bias_enabled=bias_enabled)
lora_config = LoRAConfig(
max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
)
def create_random_linear_parallel_layer():
if orientation == "row":
......@@ -777,10 +777,7 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
lora_linear.create_lora_weights(max_loras, lora_config)
assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
lora_linear.lora_b_stacked) == 1)
if bias_enabled:
assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
else:
assert lora_linear.lora_bias_stacked is None
return linear, lora_linear
for i in range(NUM_RANDOM_SEEDS):
......@@ -860,14 +857,13 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
@torch.inference_mode()
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8])
@pytest.mark.parametrize("num_loras", [1, 2, 4])
@pytest.mark.parametrize("repeats", [1, 2, 3])
@pytest.mark.parametrize("fully_shard", [True, False])
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
@pytest.mark.parametrize("bias_enabled", [True, False])
def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
device, stage, bias_enabled) -> None:
device, stage) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
......@@ -876,11 +872,12 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
bias_enabled=bias_enabled)
lora_config = LoRAConfig(
max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
)
def create_column_parallel_packed_layer():
if repeats == 2:
......@@ -924,10 +921,7 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
model_config=FakeConfig())
assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len(
lora_linear.lora_b_stacked) == n_slices)
if bias_enabled:
assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices
else:
assert lora_linear.lora_bias_stacked is None
return linear, lora_linear
for i in range(NUM_RANDOM_SEEDS):
......
......@@ -113,8 +113,7 @@ def test_llama_lora(sql_lora_files):
enable_lora=True,
# also test odd max_num_seqs
max_num_seqs=13,
max_loras=4,
enable_chunked_prefill=True)
max_loras=4)
generate_and_test(llm, sql_lora_files)
......@@ -128,7 +127,6 @@ def test_llama_lora_tp4(sql_lora_files):
max_num_seqs=16,
max_loras=4,
tensor_parallel_size=4,
enable_chunked_prefill=True,
)
generate_and_test(llm, sql_lora_files)
......@@ -144,7 +142,6 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
max_loras=4,
tensor_parallel_size=4,
fully_sharded_loras=True,
enable_chunked_prefill=True,
)
generate_and_test(llm, sql_lora_files)
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Script to test multi loras service with tp >= 2
This script contains:
1. test multi loras service with tp >= 2
2. test multi loras request
"""
import pytest
from tests.utils import multi_gpu_test
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest
......@@ -156,3 +160,34 @@ def test_multi_loras_with_tp_sync():
output_text = call_llm_get_outputs(prompt, "Alice")
check_outputs(output_text, expected_output)
def test_multiple_lora_requests():
llm = LLM(
model=MODEL_PATH,
enable_lora=True,
max_loras=4,
max_lora_rank=LORA_RANK,
max_model_len=512,
gpu_memory_utilization=0.5,
enforce_eager=True,
)
PROMPTS = ["Hello, my name is"] * 2
LORA_NAME = "Alice"
lora_request = [
LoRARequest(LORA_NAME + str(idx), idx + 1,
LORA_NAME_PATH_MAP[LORA_NAME])
for idx in range(len(PROMPTS))
]
# Multiple SamplingParams should be matched with each prompt
outputs = llm.generate(PROMPTS, lora_request=lora_request)
assert len(PROMPTS) == len(outputs)
# Exception raised, if the size of params does not match the size of prompts
with pytest.raises(ValueError):
outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
# Single LoRARequest should be applied to every prompt
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)
......@@ -21,6 +21,8 @@ from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
WorkerLoRAManager)
from vllm.platforms import current_platform
from .utils import create_peft_lora
EMBEDDING_MODULES = {
"embed_tokens": "input_embeddings",
"lm_head": "output_embeddings",
......@@ -35,17 +37,6 @@ DEVICES = ([
DEFAULT_DTYPE = torch.get_default_dtype()
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch):
"""
Some tests depend on V0 internals. Since both V0 and V1 use the same
LoRAModelManager it is okay to just test V0.
"""
with monkeypatch.context() as m:
m.setenv('VLLM_USE_V1', '0')
yield
@pytest.mark.parametrize("device", DEVICES)
def test_from_lora_tensors(sql_lora_files, device):
tensors = load_file(
......@@ -326,7 +317,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
max_loras=2,
lora_dtype=DEFAULT_DTYPE),
device=device)
assert all(x is None for x in manager.lora_index_to_id)
# Add up to capacity
......@@ -430,32 +420,40 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
@pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device):
def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device,
tmp_path):
lora_config = LoRAConfig(max_lora_rank=8,
max_cpu_loras=4,
max_loras=4,
lora_dtype=DEFAULT_DTYPE)
dummy_lora_files = f"{tmp_path}/lora_adapter"
os.makedirs(dummy_lora_files, exist_ok=True)
create_peft_lora(
dummy_model,
save_dir=dummy_lora_files,
target_modules=["layer1.dense1", "dense2"],
lora_dtype=DEFAULT_DTYPE,
)
worker_adapter_manager = LRUCacheWorkerLoRAManager(
4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
lora_config.lora_extra_vocab_size, lora_config, device,
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
worker_adapter_manager.create_lora_manager(
llama_2_7b_model_extra_embeddings)
4, 2,
dummy_model.unpadded_vocab_size - lora_config.lora_extra_vocab_size,
lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
worker_adapter_manager.create_lora_manager(dummy_model)
mapping = LoRAMapping([], [])
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("2", 2, sql_lora_files)
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {1, 2}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("3", 3, sql_lora_files),
LoRARequest("4", 4, sql_lora_files)
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("3", 3, dummy_lora_files),
LoRARequest("4", 4, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
......@@ -464,9 +462,9 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("2", 2, sql_lora_files),
LoRARequest("5", 5, sql_lora_files)
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files),
LoRARequest("5", 5, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
......@@ -475,9 +473,9 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("1", 1, sql_lora_files),
LoRARequest("1", 1, sql_lora_files)
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
......@@ -486,9 +484,9 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4
worker_adapter_manager.set_active_adapters([
LoRARequest("6", 6, sql_lora_files),
LoRARequest("7", 7, sql_lora_files),
LoRARequest("8", 8, sql_lora_files)
LoRARequest("6", 6, dummy_lora_files),
LoRARequest("7", 7, dummy_lora_files),
LoRARequest("8", 8, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
......@@ -499,11 +497,11 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
# Over capacity
with pytest.raises(RuntimeError):
worker_adapter_manager.set_active_adapters([
LoRARequest("10", 10, sql_lora_files),
LoRARequest("11", 11, sql_lora_files),
LoRARequest("12", 12, sql_lora_files),
LoRARequest("13", 13, sql_lora_files),
LoRARequest("14", 14, sql_lora_files)
LoRARequest("10", 10, dummy_lora_files),
LoRARequest("11", 11, dummy_lora_files),
LoRARequest("12", 12, dummy_lora_files),
LoRARequest("13", 13, dummy_lora_files),
LoRARequest("14", 14, dummy_lora_files)
], mapping)
assert worker_adapter_manager.device == device
......@@ -512,33 +510,41 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
@pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device):
def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device,
tmp_path):
# Should remove every LoRA not specified in the request.
lora_config = LoRAConfig(max_lora_rank=8,
max_cpu_loras=4,
max_loras=4,
lora_dtype=DEFAULT_DTYPE)
worker_adapter_manager = WorkerLoRAManager(
4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size -
4, 2, dummy_model_gate_up.unpadded_vocab_size -
lora_config.lora_extra_vocab_size, lora_config, device,
EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES)
worker_adapter_manager.create_lora_manager(
llama_2_7b_model_extra_embeddings)
worker_adapter_manager.create_lora_manager(dummy_model_gate_up)
dummy_lora_files = f"{tmp_path}/lora_adapter"
os.makedirs(dummy_lora_files, exist_ok=True)
create_peft_lora(
dummy_model_gate_up,
save_dir=dummy_lora_files,
target_modules=["layer1.dense1", "dense2"],
lora_dtype=DEFAULT_DTYPE,
)
mapping = LoRAMapping([], [])
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("2", 2, sql_lora_files)
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {1, 2}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("3", 3, sql_lora_files),
LoRARequest("4", 4, sql_lora_files)
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("3", 3, dummy_lora_files),
LoRARequest("4", 4, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {1, 3, 4}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
......@@ -546,9 +552,9 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("2", 2, sql_lora_files),
LoRARequest("5", 5, sql_lora_files)
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("2", 2, dummy_lora_files),
LoRARequest("5", 5, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {1, 2, 5}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
......@@ -556,9 +562,9 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5
worker_adapter_manager.set_active_adapters([
LoRARequest("1", 1, sql_lora_files),
LoRARequest("1", 1, sql_lora_files),
LoRARequest("1", 1, sql_lora_files)
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files),
LoRARequest("1", 1, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {1}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1
......@@ -566,9 +572,9 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None
worker_adapter_manager.set_active_adapters([
LoRARequest("6", 6, sql_lora_files),
LoRARequest("7", 7, sql_lora_files),
LoRARequest("8", 8, sql_lora_files)
LoRARequest("6", 6, dummy_lora_files),
LoRARequest("7", 7, dummy_lora_files),
LoRARequest("8", 8, dummy_lora_files)
], mapping)
assert worker_adapter_manager.list_adapters() == {6, 7, 8}
assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8
......@@ -578,11 +584,11 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
# Over capacity
with pytest.raises(RuntimeError):
worker_adapter_manager.set_active_adapters([
LoRARequest("10", 10, sql_lora_files),
LoRARequest("11", 11, sql_lora_files),
LoRARequest("12", 12, sql_lora_files),
LoRARequest("13", 13, sql_lora_files),
LoRARequest("14", 14, sql_lora_files)
LoRARequest("10", 10, dummy_lora_files),
LoRARequest("11", 11, dummy_lora_files),
LoRARequest("12", 12, dummy_lora_files),
LoRARequest("13", 13, dummy_lora_files),
LoRARequest("14", 14, dummy_lora_files)
], mapping)
assert worker_adapter_manager.device == device
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment