Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
...@@ -7,7 +7,7 @@ import torch ...@@ -7,7 +7,7 @@ import torch
from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.quant_utils import ref_dynamic_per_token_quant
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm._custom_ops import scaled_int8_quant from vllm._custom_ops import scaled_int8_quant
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float] DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
...@@ -48,7 +48,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): ...@@ -48,7 +48,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
def test_dynamic_scaled_int8_quant( def test_dynamic_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
...@@ -74,7 +74,7 @@ def test_dynamic_scaled_int8_quant( ...@@ -74,7 +74,7 @@ def test_dynamic_scaled_int8_quant(
def test_dynamic_scaled_int8_azp_quant( def test_dynamic_scaled_int8_azp_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8) int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
...@@ -115,7 +115,7 @@ def test_dynamic_scaled_int8_azp_quant( ...@@ -115,7 +115,7 @@ def test_dynamic_scaled_int8_azp_quant(
def test_static_scaled_int8_quant( def test_static_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8) int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
...@@ -148,7 +148,7 @@ def test_static_scaled_int8_azp_quant( ...@@ -148,7 +148,7 @@ def test_static_scaled_int8_azp_quant(
scale: float, scale: float,
azp: int, azp: int,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8) int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
......
...@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m ...@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available(): if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True) pytest.skip("CUDA required for these tests.", allow_module_level=True)
...@@ -205,7 +206,7 @@ LLAMA_MODELS = { ...@@ -205,7 +206,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def _seed_each_test(): def _seed_each_test():
current_platform.seed_everything(0) set_random_seed(0)
np.random.seed(0) np.random.seed(0)
torch.random.manual_seed(0) torch.random.manual_seed(0)
......
...@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm ...@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available(): if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True) pytest.skip("CUDA required for these tests.", allow_module_level=True)
...@@ -193,7 +194,7 @@ LLAMA_MODELS = { ...@@ -193,7 +194,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def _seed_each_test(): def _seed_each_test():
current_platform.seed_everything(0) set_random_seed(0)
np.random.seed(0) np.random.seed(0)
torch.random.manual_seed(0) torch.random.manual_seed(0)
......
...@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import ( ...@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from vllm._custom_ops import scaled_fp4_quant from vllm._custom_ops import scaled_fp4_quant
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -30,10 +31,11 @@ BLOCK_SIZE = 16 ...@@ -30,10 +31,11 @@ BLOCK_SIZE = 16
@pytest.mark.parametrize("shape", SHAPES) @pytest.mark.parametrize("shape", SHAPES)
@torch.inference_mode() @torch.inference_mode()
def test_silu_mul_nvfp4_quant( def test_silu_mul_nvfp4_quant(
default_vllm_config,
dtype: torch.dtype, dtype: torch.dtype,
shape: tuple[int, int], shape: tuple[int, int],
) -> None: ) -> None:
current_platform.seed_everything(42) set_random_seed(42)
device = "cuda:0" device = "cuda:0"
torch.set_default_device(device) torch.set_default_device(device)
......
...@@ -11,7 +11,9 @@ import pytest ...@@ -11,7 +11,9 @@ import pytest
import torch import torch
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...utils import models_path_prefix from ...utils import models_path_prefix
from vllm.utils.torch_utils import set_random_seed
device = "cuda" device = "cuda"
...@@ -86,7 +88,7 @@ def test_scaled_mm( ...@@ -86,7 +88,7 @@ def test_scaled_mm(
): ):
is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point() is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point()
current_platform.seed_everything(0) set_random_seed(0)
# NOTE: There are cases, where if the matrix is large enough, an output # NOTE: There are cases, where if the matrix is large enough, an output
# like 65504.4 can be produced, and can easily turn into inf when # like 65504.4 can be produced, and can easily turn into inf when
......
...@@ -24,6 +24,10 @@ from vllm.utils.deep_gemm import ( ...@@ -24,6 +24,10 @@ from vllm.utils.deep_gemm import (
per_block_cast_to_fp8, per_block_cast_to_fp8,
should_use_deepgemm_for_fp8_linear, should_use_deepgemm_for_fp8_linear,
) )
from vllm.utils.flashinfer import (
flashinfer_fp8_blockscale_gemm,
has_flashinfer_fp8_blockscale_gemm,
)
from vllm.utils.import_utils import has_deep_gemm from vllm.utils.import_utils import has_deep_gemm
if current_platform.get_device_capability() < (9, 0): if current_platform.get_device_capability() < (9, 0):
...@@ -205,3 +209,50 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): ...@@ -205,3 +209,50 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32)) torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
) / torch.mean(torch.abs(ref_out.to(torch.float32))) ) / torch.mean(torch.abs(ref_out.to(torch.float32)))
assert rel_diff < 0.001 assert rel_diff < 0.001
@pytest.mark.skipif(
    current_platform.is_fp8_fnuz(),
    reason="This platform supports e4m3fnuz, not e4m3fn.",
)
@pytest.mark.parametrize(
    "M,N,K,block_size,out_dtype,seed",
    itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
)
@torch.inference_mode()
def test_w8a8_block_fp8_flashinfer_matmul(M, N, K, block_size, out_dtype, seed):
    """Check the FlashInfer FP8 block-scale GEMM against the native reference.

    The reference path quantizes both operands to FP8 and multiplies them with
    ``native_w8a8_block_matmul``. The FlashInfer path is given the bf16
    activations with ``input_scale=None`` plus the pre-quantized weight and
    its scales. The mean relative difference between the two outputs must stay
    below 0.001.

    The case is skipped when the FlashInfer kernel is unavailable or when
    ``K`` is not a multiple of 128 or ``N`` is not a multiple of 64.
    """
    if not has_flashinfer_fp8_blockscale_gemm():
        pytest.skip(
            "FlashInfer block GEMM not available (requires SM90+ and FlashInfer)"
        )

    # Only aligned problem sizes are exercised.
    misaligned = K % 128 != 0 or N % 64 != 0
    if misaligned:
        pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")

    torch.manual_seed(seed)
    fp8_max = torch.finfo(torch.float8_e4m3fn).max

    # Draw activations first, then weights, so the RNG stream order is fixed.
    activations = (torch.rand(M, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
    weights = (torch.rand(N, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max

    # Reference operands: quantized activations and block-quantized weights.
    act_fp8, act_scale_raw = per_token_group_quant_fp8(
        activations, block_size[1], use_ue8m0=False
    )
    weight_fp8, weight_scale_raw = per_block_cast_to_fp8(
        weights, block_size, use_ue8m0=False
    )
    act_scale = act_scale_raw.to(torch.float32)
    weight_scale = weight_scale_raw.to(torch.float32)

    expected = native_w8a8_block_matmul(
        act_fp8, weight_fp8, act_scale, weight_scale, block_size, out_dtype
    )

    # NOTE(review): the kernel gets unquantized bf16 activations and no input
    # scale - presumably it quantizes them internally; confirm against the
    # flashinfer_fp8_blockscale_gemm wrapper.
    actual = flashinfer_fp8_blockscale_gemm(
        input=activations,
        weight=weight_fp8,
        input_scale=None,
        weight_scale=weight_scale,
        out_dtype=out_dtype,
    )

    expected_bf16 = expected.to(torch.bfloat16)
    abs_error = torch.abs(actual.to(torch.bfloat16) - expected_bf16)
    rel_diff = torch.mean(abs_error) / torch.mean(torch.abs(expected_bf16))
    assert rel_diff < 0.001
...@@ -11,7 +11,11 @@ from tests.kernels.quant_utils import ( ...@@ -11,7 +11,11 @@ from tests.kernels.quant_utils import (
ref_dynamic_per_token_quant, ref_dynamic_per_token_quant,
) )
from tests.kernels.utils import opcheck from tests.kernels.utils import opcheck
from vllm.model_executor.layers.quantization.utils.quant_utils import (
scaled_quantize,
)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float] DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
...@@ -21,10 +25,18 @@ SEEDS = [0] ...@@ -21,10 +25,18 @@ SEEDS = [0]
def opcheck_fp8_quant( def opcheck_fp8_quant(
output, input, scale=None, scale_ub=None, use_per_token_if_dynamic=False output,
input,
scale=None,
scale_ub=None,
use_per_token_if_dynamic=False,
group_shape=None,
): ):
if scale is not None: if scale is not None:
opcheck(torch.ops._C.static_scaled_fp8_quant, (output, input, scale)) opcheck(
torch.ops._C.static_scaled_fp8_quant,
(output, input, scale, group_shape),
)
elif use_per_token_if_dynamic: elif use_per_token_if_dynamic:
scale = torch.empty( scale = torch.empty(
(input.shape[0], 1), device=input.device, dtype=torch.float32 (input.shape[0], 1), device=input.device, dtype=torch.float32
...@@ -51,7 +63,7 @@ def opcheck_fp8_quant( ...@@ -51,7 +63,7 @@ def opcheck_fp8_quant(
def test_dynamic_per_token_fp8_quant( def test_dynamic_per_token_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
x = ( x = (
torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6 torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6
...@@ -81,7 +93,7 @@ def test_dynamic_per_token_fp8_quant( ...@@ -81,7 +93,7 @@ def test_dynamic_per_token_fp8_quant(
def test_dynamic_per_tensor_fp8_quant( def test_dynamic_per_tensor_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
...@@ -101,7 +113,7 @@ def test_dynamic_per_tensor_fp8_quant( ...@@ -101,7 +113,7 @@ def test_dynamic_per_tensor_fp8_quant(
@torch.inference_mode() @torch.inference_mode()
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
def test_fp8_quant_large(seed: int) -> None: def test_fp8_quant_large(seed: int) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings
hidden_size = 1152 # Smallest hidden_size to reproduce the error hidden_size = 1152 # Smallest hidden_size to reproduce the error
...@@ -117,4 +129,93 @@ def test_fp8_quant_large(seed: int) -> None: ...@@ -117,4 +129,93 @@ def test_fp8_quant_large(seed: int) -> None:
ref_out = ref_out.to(dtype=dtype) ref_out = ref_out.to(dtype=dtype)
ops_out = ops_out.to(dtype=dtype) ops_out = ops_out.to(dtype=dtype)
torch.testing.assert_close(ref_out, ops_out) torch.testing.assert_close(ref_out, ops_out)
\ No newline at end of file
# Test static FP8 quantization with 2D group scales.
# Each entry is a (group_m, group_n) pair; -1 means the group spans the full
# extent of that dimension (see the normalization at the top of
# test_static_fp8_quant_group_2d).
GROUP_SHAPES_2D = [
(-1, -1), # Per-tensor
(-1, 1), # Per-channel
(1, -1), # Per-token
(-1, 128), # Per-head quantization
(1, 128), # DeepSeek-style per-token-per-group (group_m=1, group_n=128)
(128, 128), # DeepSeek-style block quantization
(1, 64), # Smaller group size
(1, 16), # Small group (scalar path in kernel)
(4, 256), # Non-trivial both dimensions
]
# Use sizes divisible by all group shapes, so the divisibility skip in
# test_static_fp8_quant_group_2d is not expected to trigger for these values.
NUM_TOKENS_GROUP = [128, 512]
HIDDEN_SIZES_GROUP = [256, 1024, 2048]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
@pytest.mark.parametrize("group_shape", GROUP_SHAPES_2D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_static_fp8_quant_group_2d(
    num_tokens: int,
    hidden_size: int,
    group_shape: tuple[int, int],
    dtype: torch.dtype,
    seed: int,
) -> None:
    """Test static FP8 quantization with 2D group scales using scaled_quantize.

    ``scaled_quantize`` produces the reference output and the scales; the same
    scales are then fed to ``ops.scaled_fp8_quant`` together with
    ``group_shape``. The op must return the scales unchanged and an output
    within ``rtol=0.12`` of the reference.

    Args:
        num_tokens: Number of rows of the random input.
        hidden_size: Number of columns of the random input.
        group_shape: ``(group_m, group_n)``; ``-1`` means the full extent of
            that dimension.
        dtype: Dtype of the random input.
        seed: Seed passed to ``set_random_seed``.
    """
    # Normalize group_shape (-1 means full extent)
    norm_group_m = num_tokens if group_shape[0] == -1 else group_shape[0]
    norm_group_n = hidden_size if group_shape[1] == -1 else group_shape[1]

    # Skip if sizes are not divisible by group shape
    if num_tokens % norm_group_m != 0 or hidden_size % norm_group_n != 0:
        pytest.skip(
            f"Skipping: ({num_tokens}, {hidden_size}) not divisible by "
            f"group_shape ({group_shape[0]}, {group_shape[1]})"
        )

    # Seed through set_random_seed, like the other tests in this file.
    set_random_seed(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
    ref_out, scale = scaled_quantize(
        x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
    )
    ops_out, ops_scale = ops.scaled_fp8_quant(x, scale=scale, group_shape=group_shape)

    torch.testing.assert_close(scale, ops_scale)
    torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)

    # NOTE(review): group_shape is left at its default (None) here, unlike the
    # 1D-scale test - presumably the op infers the grouping from the 2D scale
    # shape; confirm against the static_scaled_fp8_quant op.
    opcheck_fp8_quant(ops_out, x, scale=scale)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("group_shape", [(1, -1), (-1, 1)])  # per-token, per-channel
@torch.inference_mode()
def test_static_fp8_quant_1d_scale(
    num_tokens: int,
    hidden_size: int,
    dtype: torch.dtype,
    seed: int,
    group_shape: tuple[int, int],
) -> None:
    """Test static FP8 quantization with 1D scale (per-token or per-channel).

    The reference scales from ``scaled_quantize`` are flattened to 1D before
    being passed to ``ops.scaled_fp8_quant`` with an explicit ``group_shape``.
    The op must return the flattened scales unchanged and an output within
    ``rtol=0.12`` of the reference.

    Args:
        num_tokens: Number of rows of the random input.
        hidden_size: Number of columns of the random input.
        dtype: Dtype of the random input.
        seed: Seed passed to ``set_random_seed``.
        group_shape: ``(1, -1)`` or ``(-1, 1)``.
    """
    # Seed through set_random_seed, like the other tests in this file.
    set_random_seed(seed)

    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
    ref_out, scale_2d = scaled_quantize(
        x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
    )

    # Flatten scale to 1D for testing 1D scale path
    scale_1d = scale_2d.flatten()
    ops_out, ops_scale = ops.scaled_fp8_quant(
        x, scale=scale_1d, group_shape=group_shape
    )

    torch.testing.assert_close(scale_1d, ops_scale)
    torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)

    opcheck_fp8_quant(ops_out, x, scale=scale_1d, group_shape=group_shape)
...@@ -6,6 +6,7 @@ import torch ...@@ -6,6 +6,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types from vllm.scalar_type import scalar_types
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -134,7 +135,7 @@ def test_quantize_to_fp4( ...@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device(device) torch.set_default_device(device)
m, n = shape m, n = shape
...@@ -156,7 +157,7 @@ def test_quantize_to_fp4( ...@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
@torch.inference_mode() @torch.inference_mode()
def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None: def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
dtype = torch.float16 dtype = torch.float16
current_platform.seed_everything(42) set_random_seed(42)
torch.set_default_device("cuda:0") torch.set_default_device("cuda:0")
m, n = pad_shape m, n = pad_shape
......
...@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt ...@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100): if not current_platform.has_device_capability(100):
pytest.skip( pytest.skip(
...@@ -59,7 +60,7 @@ def test_nvfp4_gemm( ...@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
seed: int, seed: int,
device: str, device: str,
) -> None: ) -> None:
current_platform.seed_everything(seed) set_random_seed(seed)
m, n, packed_k = shape m, n, packed_k = shape
k = packed_k * 2 k = packed_k * 2
block_size = 16 block_size = 16
......
...@@ -9,6 +9,7 @@ from vllm._custom_ops import ( ...@@ -9,6 +9,7 @@ from vllm._custom_ops import (
apply_repetition_penalties_torch, apply_repetition_penalties_torch,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025] NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
# [stress, stress, stress, Qwen, llama 4] # [stress, stress, stress, Qwen, llama 4]
...@@ -38,7 +39,7 @@ def test_apply_repetition_penalties( ...@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
Test the apply_repetition_penalties custom op Test the apply_repetition_penalties custom op
against a reference implementation. against a reference implementation.
""" """
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device("cuda:0") torch.set_default_device("cuda:0")
# Create test data # Create test data
...@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None: ...@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
dtype = torch.float32 dtype = torch.float32
seed = 0 seed = 0
current_platform.seed_everything(seed) set_random_seed(seed)
torch.set_default_device("cuda:0") torch.set_default_device("cuda:0")
# Create test data # Create test data
......
...@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import ( ...@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import (
layernorm_fn, layernorm_fn,
rms_norm_ref, rms_norm_ref,
) )
from vllm.platforms import current_platform from vllm.utils.torch_utils import set_random_seed
def layer_norm_ref( def layer_norm_ref(
...@@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic( ...@@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic(
is_rms_norm: bool, is_rms_norm: bool,
) -> None: ) -> None:
"""Test basic layer norm forward pass without z (gate) tensor.""" """Test basic layer norm forward pass without z (gate) tensor."""
current_platform.seed_everything(seed) set_random_seed(seed)
device = torch.device("cuda:0") device = torch.device("cuda:0")
# Create inputs # Create inputs
...@@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate( ...@@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate(
is_rms_norm: bool, is_rms_norm: bool,
) -> None: ) -> None:
"""Test layer norm forward pass with z (gate) tensor.""" """Test layer norm forward pass with z (gate) tensor."""
current_platform.seed_everything(42) set_random_seed(42)
device = torch.device("cuda:0") device = torch.device("cuda:0")
# Create inputs # Create inputs
...@@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups( ...@@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups(
f"hidden_size {hidden_size} not divisible by group_size {group_size}" f"hidden_size {hidden_size} not divisible by group_size {group_size}"
) )
current_platform.seed_everything(42) set_random_seed(42)
device = torch.device("cuda:0") device = torch.device("cuda:0")
# Create inputs # Create inputs
...@@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block( ...@@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block(
dtype: torch.dtype, dtype: torch.dtype,
) -> None: ) -> None:
"""Test that rows_per_block logic works correctly for various M sizes.""" """Test that rows_per_block logic works correctly for various M sizes."""
current_platform.seed_everything(42) set_random_seed(42)
device = torch.device("cuda:0") device = torch.device("cuda:0")
hidden_size = 1024 hidden_size = 1024
...@@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block( ...@@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block(
def test_strided_input(dtype: torch.dtype) -> None: def test_strided_input(dtype: torch.dtype) -> None:
"""Test that the kernel handles non-contiguous (strided) """Test that the kernel handles non-contiguous (strided)
inputs correctly.""" inputs correctly."""
current_platform.seed_everything(42) set_random_seed(42)
device = torch.device("cuda:0") device = torch.device("cuda:0")
num_tokens = 128 num_tokens = 128
hidden_size = 1024 hidden_size = 1024
...@@ -318,7 +318,7 @@ def test_output_buffer_provided( ...@@ -318,7 +318,7 @@ def test_output_buffer_provided(
dtype: torch.dtype, dtype: torch.dtype,
) -> None: ) -> None:
"""Test that the kernel works when an output buffer is provided.""" """Test that the kernel works when an output buffer is provided."""
current_platform.seed_everything(42) set_random_seed(42)
device = torch.device("cuda:0") device = torch.device("cuda:0")
# Create inputs # Create inputs
...@@ -359,7 +359,7 @@ def test_multidimensional_input( ...@@ -359,7 +359,7 @@ def test_multidimensional_input(
dtype: torch.dtype, dtype: torch.dtype,
) -> None: ) -> None:
"""Test that the autograd function handles multidimensional inputs.""" """Test that the autograd function handles multidimensional inputs."""
current_platform.seed_everything(42) set_random_seed(42)
device = torch.device("cuda:0") device = torch.device("cuda:0")
hidden_size = shape[-1] hidden_size = shape[-1]
......
...@@ -42,7 +42,7 @@ def set_seed(seed): ...@@ -42,7 +42,7 @@ def set_seed(seed):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7", reason="CUDA not available or PyTorch version < 2.7",
) )
def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): def test_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend. """Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with This test compares the outputs from the FlexAttention backend with
...@@ -59,35 +59,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -59,35 +59,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
] ]
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: set_seed(seed)
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") with vllm_runner(
model_name,
set_seed(seed) runner="generate",
with vllm_runner( tensor_parallel_size=1,
model_name, num_gpu_blocks_override=128,
runner="generate", enforce_eager=True,
tensor_parallel_size=1, attention_config={"backend": "FLEX_ATTENTION"},
num_gpu_blocks_override=128, ) as llm_flex:
enforce_eager=True, output_flex = llm_flex.generate_greedy_logprobs(
) as llm_flex: prompts, max_tokens, num_logprobs
output_flex = llm_flex.generate_greedy_logprobs( )
prompts, max_tokens, num_logprobs
)
# Run with default backend # Run with default backend
with monkeypatch.context() as m: set_seed(seed)
set_seed(seed) with vllm_runner(
with vllm_runner( model_name,
model_name, runner="generate",
runner="generate", tensor_parallel_size=1,
tensor_parallel_size=1, num_gpu_blocks_override=128,
num_gpu_blocks_override=128, enforce_eager=True,
enforce_eager=True, gpu_memory_utilization=0.85,
gpu_memory_utilization=0.85, ) as llm_default:
) as llm_default: output_default = llm_default.generate_greedy_logprobs(
output_default = llm_default.generate_greedy_logprobs( prompts, max_tokens, num_logprobs
prompts, max_tokens, num_logprobs )
)
check_logprobs_close( check_logprobs_close(
outputs_0_lst=output_flex, outputs_0_lst=output_flex,
...@@ -101,7 +98,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -101,7 +98,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION, not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7", reason="CUDA not available or PyTorch version < 2.7",
) )
def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): def test_encoder_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend. """Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with This test compares the outputs from the FlexAttention backend with
...@@ -115,30 +112,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch): ...@@ -115,30 +112,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
] ]
# Run with flex attention # Run with flex attention
with monkeypatch.context() as m: with vllm_runner(
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") model_name,
with vllm_runner( runner="pooling",
model_name, dtype=torch.bfloat16,
runner="pooling", tensor_parallel_size=1,
dtype=torch.bfloat16, max_model_len=100,
tensor_parallel_size=1, enforce_eager=True,
max_model_len=100, attention_config={"backend": "FLEX_ATTENTION"},
enforce_eager=True, ) as llm_flex:
) as llm_flex: flex_outputs = llm_flex.embed(prompts)
flex_outputs = llm_flex.embed(prompts)
# Run with default backend # Run with default backend
with ( with vllm_runner(
monkeypatch.context() as m, model_name,
vllm_runner( runner="pooling",
model_name, dtype=torch.bfloat16,
runner="pooling", tensor_parallel_size=1,
dtype=torch.bfloat16, max_model_len=100,
tensor_parallel_size=1, enforce_eager=True,
max_model_len=100, ) as llm_default:
enforce_eager=True,
) as llm_default,
):
default_outputs = llm_default.embed(prompts) default_outputs = llm_default.embed(prompts)
check_embeddings_close( check_embeddings_close(
......
...@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: ...@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() @torch.inference_mode()
def test_silu_and_mul( def test_silu_and_mul(
default_vllm_config,
num_tokens: int, num_tokens: int,
hidden_size: int, hidden_size: int,
dtype: torch.dtype, dtype: torch.dtype,
......
...@@ -13,11 +13,11 @@ import torch ...@@ -13,11 +13,11 @@ import torch
from torch._prims_common import TensorLikeType from torch._prims_common import TensorLikeType
from tests.kernels.quant_utils import native_w8a8_block_matmul from tests.kernels.quant_utils import native_w8a8_block_matmul
from vllm.attention.backends.abstract import AttentionType
from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
from vllm.utils.torch_utils import make_tensor_with_pad from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.attention.backend import AttentionType
# For now, disable "test_aot_dispatch_dynamic" since there are some # For now, disable "test_aot_dispatch_dynamic" since there are some
# bugs related to this test in PyTorch 2.4. # bugs related to this test in PyTorch 2.4.
......
...@@ -84,7 +84,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA): ...@@ -84,7 +84,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@pytest.fixture @pytest.fixture
def dummy_model() -> nn.Module: def dummy_model(default_vllm_config) -> nn.Module:
model = DummyLoRAModel( model = DummyLoRAModel(
OrderedDict( OrderedDict(
[ [
...@@ -117,7 +117,7 @@ def dummy_model() -> nn.Module: ...@@ -117,7 +117,7 @@ def dummy_model() -> nn.Module:
@pytest.fixture @pytest.fixture
def dummy_model_gate_up() -> nn.Module: def dummy_model_gate_up(default_vllm_config) -> nn.Module:
model = DummyLoRAModel( model = DummyLoRAModel(
OrderedDict( OrderedDict(
[ [
...@@ -214,6 +214,31 @@ def qwen25vl_lora_files(): ...@@ -214,6 +214,31 @@ def qwen25vl_lora_files():
return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon") return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")
@pytest.fixture(scope="session")
def qwen2vl_language_lora_files():
    """Session fixture: result of ``snapshot_download`` for the language LoRA.

    Presumably a local snapshot path - confirm against ``snapshot_download``.
    """
    repo = "prashanth058/qwen2vl-flickr-lora-language"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen2vl_vision_tower_connector_lora_files():
    """Session fixture: ``snapshot_download`` result for the tower-connector LoRA.

    Presumably a local snapshot path - confirm against ``snapshot_download``.
    """
    repo = "prashanth058/qwen2vl-flickr-lora-tower-connector"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen2vl_vision_tower_lora_files():
    """Session fixture: result of ``snapshot_download`` for the tower LoRA.

    Presumably a local snapshot path - confirm against ``snapshot_download``.
    """
    repo = "prashanth058/qwen2vl-flickr-lora-tower"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen25vl_vision_lora_files():
    """Session fixture: ``snapshot_download`` result for the Qwen2.5-VL LoRA.

    Presumably a local snapshot path - confirm against ``snapshot_download``.
    """
    repo = "EpochEcho/qwen2.5-3b-vl-lora-vision-connector"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen3vl_vision_lora_files():
    """Session fixture: ``snapshot_download`` result for the Qwen3-VL LoRA.

    Presumably a local snapshot path - confirm against ``snapshot_download``.
    """
    repo = "EpochEcho/qwen3-4b-vl-lora-vision-connector"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def tinyllama_lora_files(): def tinyllama_lora_files():
# return snapshot_download(repo_id="jashing/tinyllama-colorist-lora") # return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
......
...@@ -18,8 +18,8 @@ from vllm.distributed.parallel_state import ( ...@@ -18,8 +18,8 @@ from vllm.distributed.parallel_state import (
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
) )
from vllm.lora.ops.triton_ops import fused_moe_lora from vllm.lora.ops.triton_ops import fused_moe_lora
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_port from vllm.utils.network_utils import get_open_port
from vllm.utils.torch_utils import set_random_seed
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel( ...@@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel(
seed, seed,
): ):
torch.set_default_device(device) torch.set_default_device(device)
current_platform.seed_everything(seed) set_random_seed(seed)
# the number of randomly generated sentences. # the number of randomly generated sentences.
num_sequences = 10 num_sequences = 10
# generate data # generate data
...@@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded( ...@@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded(
seed, seed,
column_parallel, column_parallel,
): ):
current_platform.seed_everything(seed) set_random_seed(seed)
# the number of randomly generated sentences. # the number of randomly generated sentences.
num_sequences = 10 num_sequences = 10
# generate data # generate data
...@@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel( ...@@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
def _get_shard_slice(shard_size): def _get_shard_slice(shard_size):
return slice(local_rank * shard_size, (local_rank + 1) * shard_size) return slice(local_rank * shard_size, (local_rank + 1) * shard_size)
current_platform.seed_everything(seed) set_random_seed(seed)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.cuda.set_device(device)
......
...@@ -34,9 +34,9 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of ...@@ -34,9 +34,9 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of
###Response:<|end|><|start|>assistant<|channel|>final<|message|>""" # noqa: E501 ###Response:<|end|><|start|>assistant<|channel|>final<|message|>""" # noqa: E501
EXPECTED_LORA_OUTPUT = [ EXPECTED_LORA_OUTPUT = [
"SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;", "SELECT avg(Working_Horses) FROM farm WHERE Total_Horses > 5000",
"SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;", "SELECT max(Cows) , min(Cows) FROM farm",
"SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;", "SELECT max(Cows) , min(Cows) FROM farm",
] ]
...@@ -69,38 +69,54 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None: ...@@ -69,38 +69,54 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i]) assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
def test_gpt_oss_lora(gptoss20b_lora_files): @pytest.mark.parametrize("mxfp4_use_marlin", [True, False])
llm = vllm.LLM( def test_gpt_oss_lora(
MODEL_PATH, monkeypatch: pytest.MonkeyPatch, gptoss20b_lora_files, mxfp4_use_marlin
max_model_len=1024, ):
enable_lora=True, with monkeypatch.context() as m:
max_loras=4, m.setenv("VLLM_MXFP4_USE_MARLIN", "1" if mxfp4_use_marlin else "0")
max_lora_rank=8, llm = vllm.LLM(
compilation_config=vllm.config.CompilationConfig( # Avoid OOM MODEL_PATH,
cudagraph_specialize_lora=False, max_model_len=1024,
), enable_lora=True,
) max_loras=4,
max_lora_rank=8,
generate_and_test(llm, gptoss20b_lora_files, lora_id=1) max_num_seqs=2,
generate_and_test(llm, gptoss20b_lora_files, lora_id=2) max_num_batched_tokens=2048,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False,
),
)
generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("fully_sharded_loras", [False, True]) @pytest.mark.parametrize("fully_sharded_loras", [False, True])
def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras): @pytest.mark.parametrize("mxfp4_use_marlin", [True, False])
llm = vllm.LLM( def test_gpt_oss_lora_tp2(
MODEL_PATH, monkeypatch: pytest.MonkeyPatch,
max_model_len=1024, gptoss20b_lora_files,
enable_lora=True, fully_sharded_loras,
max_loras=2, mxfp4_use_marlin,
max_lora_rank=8, ):
max_num_seqs=16, with monkeypatch.context() as m:
tensor_parallel_size=2, m.setenv("VLLM_MXFP4_USE_MARLIN", "1" if mxfp4_use_marlin else "0")
fully_sharded_loras=fully_sharded_loras, llm = vllm.LLM(
compilation_config=vllm.config.CompilationConfig( # Avoid OOM MODEL_PATH,
cudagraph_specialize_lora=False, max_model_len=1024,
), enable_lora=True,
) max_loras=2,
max_num_seqs=2,
generate_and_test(llm, gptoss20b_lora_files, lora_id=1) max_num_batched_tokens=2048,
generate_and_test(llm, gptoss20b_lora_files, lora_id=2) tensor_parallel_size=2,
gpu_memory_utilization=0.8,
fully_sharded_loras=fully_sharded_loras,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False,
),
)
generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
...@@ -43,8 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ...@@ -43,8 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding, VocabParallelEmbedding,
get_masked_input_and_mask, get_masked_input_and_mask,
) )
from vllm.model_executor.utils import set_random_seed
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
from .utils import DummyLoRAManager from .utils import DummyLoRAManager
...@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool: ...@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: def test_embeddings(
default_vllm_config, dist_init, num_loras, device, vocab_size, stage
) -> None:
# For multi-GPU testing of Triton kernel, we must explicitly set the CUDA # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
# device, see: https://github.com/triton-lang/triton/issues/2925 # device, see: https://github.com/triton-lang/triton/issues/2925
# Same below. # Same below.
...@@ -261,11 +263,11 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: ...@@ -261,11 +263,11 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
torch.set_default_device(device) torch.set_default_device(device)
max_loras = 8 max_loras = 8
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig( lora_config = LoRAConfig(
max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16 max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
) )
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def create_random_embedding_layer(): def create_random_embedding_layer():
embedding = VocabParallelEmbedding(vocab_size, 256) embedding = VocabParallelEmbedding(vocab_size, 256)
...@@ -353,18 +355,18 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: ...@@ -353,18 +355,18 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_lm_head_logits_processor( def test_lm_head_logits_processor(
dist_init, num_loras, device, vocab_size, stage default_vllm_config, dist_init, num_loras, device, vocab_size, stage
) -> None: ) -> None:
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
torch.cuda.set_device(device) torch.cuda.set_device(device)
torch.set_default_device(device) torch.set_default_device(device)
max_loras = 8 max_loras = 8
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig( lora_config = LoRAConfig(
max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16 max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
) )
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def _pretest(): def _pretest():
linear = ParallelLMHead( linear = ParallelLMHead(
...@@ -470,6 +472,7 @@ def test_lm_head_logits_processor( ...@@ -470,6 +472,7 @@ def test_lm_head_logits_processor(
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_linear_replicated( def test_linear_replicated(
default_vllm_config,
dist_init, dist_init,
num_loras, num_loras,
device, device,
...@@ -480,13 +483,13 @@ def test_linear_replicated( ...@@ -480,13 +483,13 @@ def test_linear_replicated(
max_loras = 8 max_loras = 8
torch.set_default_device(device) torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig( lora_config = LoRAConfig(
max_loras=max_loras, max_loras=max_loras,
max_lora_rank=8, max_lora_rank=8,
lora_dtype=torch.float16, lora_dtype=torch.float16,
) )
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def create_random_linear_replicated_layer(): def create_random_linear_replicated_layer():
linear = ReplicatedLinear(4096, 4096, bias=False, params_dtype=torch.float16) linear = ReplicatedLinear(4096, 4096, bias=False, params_dtype=torch.float16)
...@@ -580,21 +583,21 @@ def test_linear_replicated( ...@@ -580,21 +583,21 @@ def test_linear_replicated(
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_linear_parallel( def test_linear_parallel(
dist_init, num_loras, orientation, fully_shard, device, stage default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage
) -> None: ) -> None:
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
torch.cuda.set_device(device) torch.cuda.set_device(device)
max_loras = 8 max_loras = 8
torch.set_default_device(device) torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig( lora_config = LoRAConfig(
max_loras=max_loras, max_loras=max_loras,
max_lora_rank=8, max_lora_rank=8,
fully_sharded_loras=fully_shard, fully_sharded_loras=fully_shard,
lora_dtype=torch.float16, lora_dtype=torch.float16,
) )
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def create_random_linear_parallel_layer(): def create_random_linear_parallel_layer():
if orientation == "row": if orientation == "row":
...@@ -705,21 +708,21 @@ def test_linear_parallel( ...@@ -705,21 +708,21 @@ def test_linear_parallel(
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_column_parallel_packed( def test_column_parallel_packed(
dist_init, num_loras, repeats, fully_shard, device, stage default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage
) -> None: ) -> None:
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
torch.cuda.set_device(device) torch.cuda.set_device(device)
max_loras = 8 max_loras = 8
torch.set_default_device(device) torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig( lora_config = LoRAConfig(
max_loras=max_loras, max_loras=max_loras,
max_lora_rank=8, max_lora_rank=8,
fully_sharded_loras=fully_shard, fully_sharded_loras=fully_shard,
lora_dtype=torch.float16, lora_dtype=torch.float16,
) )
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def create_column_parallel_packed_layer(): def create_column_parallel_packed_layer():
if repeats == 2: if repeats == 2:
...@@ -851,7 +854,7 @@ def test_column_parallel_packed( ...@@ -851,7 +854,7 @@ def test_column_parallel_packed(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)) "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))
) )
def test_vocab_parallel_embedding_indices(tp_size, seed): def test_vocab_parallel_embedding_indices(tp_size, seed, default_vllm_config):
random.seed(seed) random.seed(seed)
vocab_size = random.randint(4000, 64000) vocab_size = random.randint(4000, 64000)
added_vocab_size = random.randint(0, 1024) added_vocab_size = random.randint(0, 1024)
......
...@@ -77,11 +77,18 @@ def do_sample( ...@@ -77,11 +77,18 @@ def do_sample(
if lora_id if lora_id
else None, else None,
) )
# Print the outputs. lora_request = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
generated_texts: list[str] = [] generated_texts: list[str] = []
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
# The output should include correct lora_request info
if lora_request is not None:
assert output.lora_request.lora_name == lora_request.lora_name
assert output.lora_request.lora_int_id == lora_request.lora_int_id
assert output.lora_request.lora_path == lora_request.lora_path
else:
assert output.lora_request is None
generated_texts.append(generated_text) generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts return generated_texts
......
...@@ -18,6 +18,7 @@ from vllm.lora.layers import ( ...@@ -18,6 +18,7 @@ from vllm.lora.layers import (
from vllm.lora.lora_model import LoRAModel from vllm.lora.lora_model import LoRAModel
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.model_manager import ( from vllm.lora.model_manager import (
DEFAULT_LANGUAGE_WRAPPER_KEY,
LoRAMapping, LoRAMapping,
LoRAModelManager, LoRAModelManager,
LRUCacheLoRAModelManager, LRUCacheLoRAModelManager,
...@@ -110,7 +111,7 @@ def create_packed_lora( ...@@ -110,7 +111,7 @@ def create_packed_lora(
return LoRAModel(lora_id, 8, loras) return LoRAModel(lora_id, 8, loras)
def test_replace_submodules(dist_init, dummy_model): def test_replace_submodules(default_vllm_config, dist_init, dummy_model):
model = dummy_model model = dummy_model
manager = LoRAModelManager( manager = LoRAModelManager(
model, model,
...@@ -132,7 +133,7 @@ def test_replace_submodules(dist_init, dummy_model): ...@@ -132,7 +133,7 @@ def test_replace_submodules(dist_init, dummy_model):
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lora_model_manager(dist_init, dummy_model, device): def test_lora_model_manager(default_vllm_config, dist_init, dummy_model, device):
model = dummy_model model = dummy_model
model_lora1 = create_lora( model_lora1 = create_lora(
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device 1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
...@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device): ...@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device):
assert manager.activate_adapter(2) assert manager.activate_adapter(2)
assert manager.lora_index_to_id[0] == 3 assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] == 2 assert manager.lora_index_to_id[1] == 2
assert manager.device == device assert manager.device == device
assert manager.punica_wrapper.device == device assert (
manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
== device
)
assert hasattr(manager, "supported_lora_modules") assert hasattr(manager, "supported_lora_modules")
assert sorted(manager.supported_lora_modules) == [ assert sorted(manager.supported_lora_modules) == [
"dense1", "dense1",
...@@ -196,7 +199,9 @@ def test_lora_model_manager(dist_init, dummy_model, device): ...@@ -196,7 +199,9 @@ def test_lora_model_manager(dist_init, dummy_model, device):
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): def test_lora_lru_cache_model_manager(
default_vllm_config, dist_init, dummy_model, device
):
model = dummy_model model = dummy_model
model_lora1 = create_lora( model_lora1 = create_lora(
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device 1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
...@@ -278,13 +283,15 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): ...@@ -278,13 +283,15 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
assert manager.remove_adapter(3) assert manager.remove_adapter(3)
with pytest.raises(ValueError): with pytest.raises(ValueError):
assert manager.pin_adapter(3) assert manager.pin_adapter(3)
assert (
assert manager.punica_wrapper.device == device manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
== device
)
assert manager.device == device assert manager.device == device
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lru_lora_model_manager(dist_init, dummy_model, device): def test_lru_lora_model_manager(default_vllm_config, dist_init, dummy_model, device):
# This tests just the LRU cache functionality, everything else is # This tests just the LRU cache functionality, everything else is
# tested in test_lora_model_manager # tested in test_lora_model_manager
model = dummy_model model = dummy_model
...@@ -402,12 +409,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): ...@@ -402,12 +409,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert manager.remove_oldest_adapter() assert manager.remove_oldest_adapter()
assert set(manager.list_adapters()) == {1} assert set(manager.list_adapters()) == {1}
assert manager.punica_wrapper.device == device assert (
manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
== device
)
assert manager.device == device assert manager.device == device
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_path): def test_lru_cache_worker_adapter_manager(
default_vllm_config, dist_init, dummy_model, device, tmp_path
):
lora_config = LoRAConfig( lora_config = LoRAConfig(
max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
) )
...@@ -514,11 +526,16 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa ...@@ -514,11 +526,16 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
) )
assert worker_adapter_manager.device == device assert worker_adapter_manager.device == device
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert punica_wrapper.device == device
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path): def test_worker_adapter_manager(
default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
):
# Should remove every LoRA not specified in the request. # Should remove every LoRA not specified in the request.
lora_config = LoRAConfig( lora_config = LoRAConfig(
max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
...@@ -618,11 +635,14 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path ...@@ -618,11 +635,14 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
) )
assert worker_adapter_manager.device == device assert worker_adapter_manager.device == device
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert punica_wrapper.device == device
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_packed_loras(dist_init, dummy_model_gate_up, device): def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, device):
model = dummy_model_gate_up model = dummy_model_gate_up
model_lora = create_packed_lora( model_lora = create_packed_lora(
1, 1,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment