Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -7,7 +7,7 @@ import torch
from tests.kernels.quant_utils import ref_dynamic_per_token_quant
from tests.kernels.utils import opcheck
from vllm._custom_ops import scaled_int8_quant
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
......@@ -48,7 +48,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
def test_dynamic_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
......@@ -74,7 +74,7 @@ def test_dynamic_scaled_int8_quant(
def test_dynamic_scaled_int8_azp_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
......@@ -115,7 +115,7 @@ def test_dynamic_scaled_int8_azp_quant(
def test_static_scaled_int8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
......@@ -148,7 +148,7 @@ def test_static_scaled_int8_azp_quant(
scale: float,
azp: int,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
int8_traits = torch.iinfo(torch.int8)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300
......
......@@ -24,6 +24,7 @@ from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_m
from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True)
......@@ -205,7 +206,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True)
def _seed_each_test():
current_platform.seed_everything(0)
set_random_seed(0)
np.random.seed(0)
torch.random.manual_seed(0)
......
......@@ -25,6 +25,7 @@ from vllm import _custom_ops as ops # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not torch.cuda.is_available():
pytest.skip("CUDA required for these tests.", allow_module_level=True)
......@@ -193,7 +194,7 @@ LLAMA_MODELS = {
@pytest.fixture(autouse=True)
def _seed_each_test():
current_platform.seed_everything(0)
set_random_seed(0)
np.random.seed(0)
torch.random.manual_seed(0)
......
......@@ -11,6 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from vllm._custom_ops import scaled_fp4_quant
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
......@@ -30,10 +31,11 @@ BLOCK_SIZE = 16
@pytest.mark.parametrize("shape", SHAPES)
@torch.inference_mode()
def test_silu_mul_nvfp4_quant(
default_vllm_config,
dtype: torch.dtype,
shape: tuple[int, int],
) -> None:
current_platform.seed_everything(42)
set_random_seed(42)
device = "cuda:0"
torch.set_default_device(device)
......
......@@ -11,7 +11,9 @@ import pytest
import torch
from vllm.platforms import current_platform
from ...utils import models_path_prefix
from vllm.utils.torch_utils import set_random_seed
device = "cuda"
......@@ -86,7 +88,7 @@ def test_scaled_mm(
):
is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point()
current_platform.seed_everything(0)
set_random_seed(0)
# NOTE: There are cases, where if the matrix is large enough, an output
# like 65504.4 can be produced, and can easily turn into inf when
......
......@@ -24,6 +24,10 @@ from vllm.utils.deep_gemm import (
per_block_cast_to_fp8,
should_use_deepgemm_for_fp8_linear,
)
from vllm.utils.flashinfer import (
flashinfer_fp8_blockscale_gemm,
has_flashinfer_fp8_blockscale_gemm,
)
from vllm.utils.import_utils import has_deep_gemm
if current_platform.get_device_capability() < (9, 0):
......@@ -205,3 +209,50 @@ def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed):
torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))
) / torch.mean(torch.abs(ref_out.to(torch.float32)))
assert rel_diff < 0.001
@pytest.mark.skipif(
current_platform.is_fp8_fnuz(),
reason="This platform supports e4m3fnuz, not e4m3fn.",
)
@pytest.mark.parametrize(
"M,N,K,block_size,out_dtype,seed",
itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS),
)
@torch.inference_mode()
def test_w8a8_block_fp8_flashinfer_matmul(M, N, K, block_size, out_dtype, seed):
if not has_flashinfer_fp8_blockscale_gemm():
pytest.skip(
"FlashInfer block GEMM not available (requires SM90+ and FlashInfer)"
)
# only aligned sizes
if K % 128 != 0 or N % 64 != 0:
pytest.skip(f"Skipping test; invalid size {M}, {N}, {K}")
torch.manual_seed(seed)
fp8_info = torch.finfo(torch.float8_e4m3fn)
fp8_max = fp8_info.max
A_bf16 = (torch.rand(M, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
B_bf16 = (torch.rand(N, K, dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
A_fp8, As_fp8 = per_token_group_quant_fp8(A_bf16, block_size[1], use_ue8m0=False)
B_fp8, Bs_fp8 = per_block_cast_to_fp8(B_bf16, block_size, use_ue8m0=False)
As = As_fp8.to(torch.float32)
Bs = Bs_fp8.to(torch.float32)
ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
out = flashinfer_fp8_blockscale_gemm(
input=A_bf16,
weight=B_fp8,
input_scale=None,
weight_scale=Bs,
out_dtype=out_dtype,
)
rel_diff = torch.mean(
torch.abs(out.to(torch.bfloat16) - ref_out.to(torch.bfloat16))
) / torch.mean(torch.abs(ref_out.to(torch.bfloat16)))
assert rel_diff < 0.001
......@@ -11,7 +11,11 @@ from tests.kernels.quant_utils import (
ref_dynamic_per_token_quant,
)
from tests.kernels.utils import opcheck
from vllm.model_executor.layers.quantization.utils.quant_utils import (
scaled_quantize,
)
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
DTYPES = [torch.bfloat16, torch.float]
HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193]
......@@ -21,10 +25,18 @@ SEEDS = [0]
def opcheck_fp8_quant(
output, input, scale=None, scale_ub=None, use_per_token_if_dynamic=False
output,
input,
scale=None,
scale_ub=None,
use_per_token_if_dynamic=False,
group_shape=None,
):
if scale is not None:
opcheck(torch.ops._C.static_scaled_fp8_quant, (output, input, scale))
opcheck(
torch.ops._C.static_scaled_fp8_quant,
(output, input, scale, group_shape),
)
elif use_per_token_if_dynamic:
scale = torch.empty(
(input.shape[0], 1), device=input.device, dtype=torch.float32
......@@ -51,7 +63,7 @@ def opcheck_fp8_quant(
def test_dynamic_per_token_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
x = (
torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6
......@@ -81,7 +93,7 @@ def test_dynamic_per_token_fp8_quant(
def test_dynamic_per_tensor_fp8_quant(
num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
......@@ -101,7 +113,7 @@ def test_dynamic_per_tensor_fp8_quant(
@torch.inference_mode()
@pytest.mark.parametrize("seed", SEEDS)
def test_fp8_quant_large(seed: int) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings
hidden_size = 1152 # Smallest hidden_size to reproduce the error
......@@ -117,4 +129,93 @@ def test_fp8_quant_large(seed: int) -> None:
ref_out = ref_out.to(dtype=dtype)
ops_out = ops_out.to(dtype=dtype)
torch.testing.assert_close(ref_out, ops_out)
\ No newline at end of file
torch.testing.assert_close(ref_out, ops_out)
# Test static FP8 quantization with 2D group scales
GROUP_SHAPES_2D = [
(-1, -1), # Per-tensor
(-1, 1), # Per-channel
(1, -1), # Per-token
(-1, 128), # Per-head quantization
(1, 128), # DeepSeek-style per-token-per-group (group_m=1, group_n=128)
(128, 128), # DeepSeek-style block quantization
(1, 64), # Smaller group size
(1, 16), # Small group (scalar path in kernel)
(4, 256), # Non-trivial both dimensions
]
# Use sizes divisible by all group shapes
NUM_TOKENS_GROUP = [128, 512]
HIDDEN_SIZES_GROUP = [256, 1024, 2048]
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
@pytest.mark.parametrize("group_shape", GROUP_SHAPES_2D)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_static_fp8_quant_group_2d(
num_tokens: int,
hidden_size: int,
group_shape: tuple[int, int],
dtype: torch.dtype,
seed: int,
) -> None:
"""Test static FP8 quantization with 2D group scales using scaled_quantize."""
# Normalize group_shape (-1 means full extent)
norm_group_m = num_tokens if group_shape[0] == -1 else group_shape[0]
norm_group_n = hidden_size if group_shape[1] == -1 else group_shape[1]
# Skip if sizes are not divisible by group shape
if num_tokens % norm_group_m != 0 or hidden_size % norm_group_n != 0:
pytest.skip(
f"Skipping: ({num_tokens}, {hidden_size}) not divisible by "
f"group_shape ({group_shape[0]}, {group_shape[1]})"
)
current_platform.seed_everything(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
ref_out, scale = scaled_quantize(
x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
)
ops_out, ops_scale = ops.scaled_fp8_quant(x, scale=scale, group_shape=group_shape)
torch.testing.assert_close(scale, ops_scale)
torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)
opcheck_fp8_quant(ops_out, x, scale=scale)
@pytest.mark.parametrize("num_tokens", NUM_TOKENS_GROUP)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES_GROUP)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.parametrize("group_shape", [(1, -1), (-1, 1)]) # per-token, per-channel
@torch.inference_mode()
def test_static_fp8_quant_1d_scale(
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
seed: int,
group_shape: tuple[int, int],
) -> None:
"""Test static FP8 quantization with 1D scale (per-token or per-channel)."""
current_platform.seed_everything(seed)
x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda")
ref_out, scale_2d = scaled_quantize(
x, group_shape, FP8_DTYPE, compute_dtype=torch.float32
)
# Flatten scale to 1D for testing 1D scale path
scale_1d = scale_2d.flatten()
ops_out, ops_scale = ops.scaled_fp8_quant(
x, scale=scale_1d, group_shape=group_shape
)
torch.testing.assert_close(scale_1d, ops_scale)
torch.testing.assert_close(ref_out.float(), ops_out.float(), rtol=0.12, atol=0.0)
opcheck_fp8_quant(ops_out, x, scale=scale_1d, group_shape=group_shape)
......@@ -6,6 +6,7 @@ import torch
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
......@@ -134,7 +135,7 @@ def test_quantize_to_fp4(
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device(device)
m, n = shape
......@@ -156,7 +157,7 @@ def test_quantize_to_fp4(
@torch.inference_mode()
def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
dtype = torch.float16
current_platform.seed_everything(42)
set_random_seed(42)
torch.set_default_device("cuda:0")
m, n = pad_shape
......
......@@ -6,6 +6,7 @@ from nvfp4_utils import FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX, dequantize_nvfp4_to_dt
from vllm import _custom_ops as ops
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
if not current_platform.has_device_capability(100):
pytest.skip(
......@@ -59,7 +60,7 @@ def test_nvfp4_gemm(
seed: int,
device: str,
) -> None:
current_platform.seed_everything(seed)
set_random_seed(seed)
m, n, packed_k = shape
k = packed_k * 2
block_size = 16
......
......@@ -9,6 +9,7 @@ from vllm._custom_ops import (
apply_repetition_penalties_torch,
)
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
# [stress, stress, stress, Qwen, llama 4]
......@@ -38,7 +39,7 @@ def test_apply_repetition_penalties(
Test the apply_repetition_penalties custom op
against a reference implementation.
"""
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device("cuda:0")
# Create test data
......@@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None:
dtype = torch.float32
seed = 0
current_platform.seed_everything(seed)
set_random_seed(seed)
torch.set_default_device("cuda:0")
# Create test data
......
......@@ -10,7 +10,7 @@ from vllm.model_executor.layers.fla.ops.layernorm_guard import (
layernorm_fn,
rms_norm_ref,
)
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
def layer_norm_ref(
......@@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic(
is_rms_norm: bool,
) -> None:
"""Test basic layer norm forward pass without z (gate) tensor."""
current_platform.seed_everything(seed)
set_random_seed(seed)
device = torch.device("cuda:0")
# Create inputs
......@@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate(
is_rms_norm: bool,
) -> None:
"""Test layer norm forward pass with z (gate) tensor."""
current_platform.seed_everything(42)
set_random_seed(42)
device = torch.device("cuda:0")
# Create inputs
......@@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups(
f"hidden_size {hidden_size} not divisible by group_size {group_size}"
)
current_platform.seed_everything(42)
set_random_seed(42)
device = torch.device("cuda:0")
# Create inputs
......@@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block(
dtype: torch.dtype,
) -> None:
"""Test that rows_per_block logic works correctly for various M sizes."""
current_platform.seed_everything(42)
set_random_seed(42)
device = torch.device("cuda:0")
hidden_size = 1024
......@@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block(
def test_strided_input(dtype: torch.dtype) -> None:
"""Test that the kernel handles non-contiguous (strided)
inputs correctly."""
current_platform.seed_everything(42)
set_random_seed(42)
device = torch.device("cuda:0")
num_tokens = 128
hidden_size = 1024
......@@ -318,7 +318,7 @@ def test_output_buffer_provided(
dtype: torch.dtype,
) -> None:
"""Test that the kernel works when an output buffer is provided."""
current_platform.seed_everything(42)
set_random_seed(42)
device = torch.device("cuda:0")
# Create inputs
......@@ -359,7 +359,7 @@ def test_multidimensional_input(
dtype: torch.dtype,
) -> None:
"""Test that the autograd function handles multidimensional inputs."""
current_platform.seed_everything(42)
set_random_seed(42)
device = torch.device("cuda:0")
hidden_size = shape[-1]
......
......@@ -42,7 +42,7 @@ def set_seed(seed):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7",
)
def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
def test_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
......@@ -59,35 +59,32 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
) as llm_flex:
output_flex = llm_flex.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
attention_config={"backend": "FLEX_ATTENTION"},
) as llm_flex:
output_flex = llm_flex.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
# Run with default backend
with monkeypatch.context() as m:
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
gpu_memory_utilization=0.85,
) as llm_default:
output_default = llm_default.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
set_seed(seed)
with vllm_runner(
model_name,
runner="generate",
tensor_parallel_size=1,
num_gpu_blocks_override=128,
enforce_eager=True,
gpu_memory_utilization=0.85,
) as llm_default:
output_default = llm_default.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs
)
check_logprobs_close(
outputs_0_lst=output_flex,
......@@ -101,7 +98,7 @@ def test_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
reason="CUDA not available or PyTorch version < 2.7",
)
def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
def test_encoder_flex_attention_vs_default_backend(vllm_runner):
"""Test that FlexAttention produces the same outputs as the default backend.
This test compares the outputs from the FlexAttention backend with
......@@ -115,30 +112,26 @@ def test_encoder_flex_attention_vs_default_backend(vllm_runner, monkeypatch):
]
# Run with flex attention
with monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
with vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_flex:
flex_outputs = llm_flex.embed(prompts)
with vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
attention_config={"backend": "FLEX_ATTENTION"},
) as llm_flex:
flex_outputs = llm_flex.embed(prompts)
# Run with default backend
with (
monkeypatch.context() as m,
vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_default,
):
with vllm_runner(
model_name,
runner="pooling",
dtype=torch.bfloat16,
tensor_parallel_size=1,
max_model_len=100,
enforce_eager=True,
) as llm_default:
default_outputs = llm_default.embed(prompts)
check_embeddings_close(
......
......@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_silu_and_mul(
default_vllm_config,
num_tokens: int,
hidden_size: int,
dtype: torch.dtype,
......
......@@ -13,11 +13,11 @@ import torch
from torch._prims_common import TensorLikeType
from tests.kernels.quant_utils import native_w8a8_block_matmul
from vllm.attention.backends.abstract import AttentionType
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
from vllm.utils.torch_utils import make_tensor_with_pad
from vllm.v1.attention.backend import AttentionType
# For now, disable "test_aot_dispatch_dynamic" since there are some
# bugs related to this test in PyTorch 2.4.
......
......@@ -84,7 +84,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@pytest.fixture
def dummy_model() -> nn.Module:
def dummy_model(default_vllm_config) -> nn.Module:
model = DummyLoRAModel(
OrderedDict(
[
......@@ -117,7 +117,7 @@ def dummy_model() -> nn.Module:
@pytest.fixture
def dummy_model_gate_up() -> nn.Module:
def dummy_model_gate_up(default_vllm_config) -> nn.Module:
model = DummyLoRAModel(
OrderedDict(
[
......@@ -214,6 +214,31 @@ def qwen25vl_lora_files():
return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")
@pytest.fixture(scope="session")
def qwen2vl_language_lora_files():
return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-language")
@pytest.fixture(scope="session")
def qwen2vl_vision_tower_connector_lora_files():
return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower-connector")
@pytest.fixture(scope="session")
def qwen2vl_vision_tower_lora_files():
return snapshot_download(repo_id="prashanth058/qwen2vl-flickr-lora-tower")
@pytest.fixture(scope="session")
def qwen25vl_vision_lora_files():
return snapshot_download(repo_id="EpochEcho/qwen2.5-3b-vl-lora-vision-connector")
@pytest.fixture(scope="session")
def qwen3vl_vision_lora_files():
return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")
@pytest.fixture(scope="session")
def tinyllama_lora_files():
# return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
......
......@@ -18,8 +18,8 @@ from vllm.distributed.parallel_state import (
get_tensor_model_parallel_world_size,
)
from vllm.lora.ops.triton_ops import fused_moe_lora
from vllm.platforms import current_platform
from vllm.utils.network_utils import get_open_port
from vllm.utils.torch_utils import set_random_seed
@pytest.fixture(autouse=True)
......@@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel(
seed,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
set_random_seed(seed)
# the number of randomly generated sentences.
num_sequences = 10
# generate data
......@@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded(
seed,
column_parallel,
):
current_platform.seed_everything(seed)
set_random_seed(seed)
# the number of randomly generated sentences.
num_sequences = 10
# generate data
......@@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel(
def _get_shard_slice(shard_size):
return slice(local_rank * shard_size, (local_rank + 1) * shard_size)
current_platform.seed_everything(seed)
set_random_seed(seed)
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
......
......@@ -34,9 +34,9 @@ The Competition_ID of competition_record is the foreign key of Competition_ID of
###Response:<|end|><|start|>assistant<|channel|>final<|message|>""" # noqa: E501
EXPECTED_LORA_OUTPUT = [
"SELECT AVG(Working_Horses) FROM farm WHERE Total_Horses > 5000;",
"SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
"SELECT MAX(Cows) AS Max_Cows, MIN(Cows) AS Min_Cows FROM farm;",
"SELECT avg(Working_Horses) FROM farm WHERE Total_Horses > 5000",
"SELECT max(Cows) , min(Cows) FROM farm",
"SELECT max(Cows) , min(Cows) FROM farm",
]
......@@ -69,38 +69,54 @@ def generate_and_test(llm: vllm.LLM, lora_path: str, lora_id: int) -> None:
assert generated_texts[i].startswith(EXPECTED_LORA_OUTPUT[i])
def test_gpt_oss_lora(gptoss20b_lora_files):
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=8,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False,
),
)
generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
@pytest.mark.parametrize("mxfp4_use_marlin", [True, False])
def test_gpt_oss_lora(
monkeypatch: pytest.MonkeyPatch, gptoss20b_lora_files, mxfp4_use_marlin
):
with monkeypatch.context() as m:
m.setenv("VLLM_MXFP4_USE_MARLIN", "1" if mxfp4_use_marlin else "0")
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=4,
max_lora_rank=8,
max_num_seqs=2,
max_num_batched_tokens=2048,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False,
),
)
generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("fully_sharded_loras", [False, True])
def test_gpt_oss_lora_tp2(gptoss20b_lora_files, fully_sharded_loras):
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=2,
max_lora_rank=8,
max_num_seqs=16,
tensor_parallel_size=2,
fully_sharded_loras=fully_sharded_loras,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False,
),
)
generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
@pytest.mark.parametrize("mxfp4_use_marlin", [True, False])
def test_gpt_oss_lora_tp2(
monkeypatch: pytest.MonkeyPatch,
gptoss20b_lora_files,
fully_sharded_loras,
mxfp4_use_marlin,
):
with monkeypatch.context() as m:
m.setenv("VLLM_MXFP4_USE_MARLIN", "1" if mxfp4_use_marlin else "0")
llm = vllm.LLM(
MODEL_PATH,
max_model_len=1024,
enable_lora=True,
max_loras=2,
max_num_seqs=2,
max_num_batched_tokens=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.8,
fully_sharded_loras=fully_sharded_loras,
compilation_config=vllm.config.CompilationConfig( # Avoid OOM
cudagraph_specialize_lora=False,
),
)
generate_and_test(llm, gptoss20b_lora_files, lora_id=1)
generate_and_test(llm, gptoss20b_lora_files, lora_id=2)
......@@ -43,8 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding,
get_masked_input_and_mask,
)
from vllm.model_executor.utils import set_random_seed
from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed
from .utils import DummyLoRAManager
......@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES)
def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
def test_embeddings(
default_vllm_config, dist_init, num_loras, device, vocab_size, stage
) -> None:
# For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
# device, see: https://github.com/triton-lang/triton/issues/2925
# Same below.
......@@ -261,11 +263,11 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
torch.set_default_device(device)
max_loras = 8
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(
max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
)
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def create_random_embedding_layer():
embedding = VocabParallelEmbedding(vocab_size, 256)
......@@ -353,18 +355,18 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
@pytest.mark.parametrize("stage", STAGES)
def test_lm_head_logits_processor(
dist_init, num_loras, device, vocab_size, stage
default_vllm_config, dist_init, num_loras, device, vocab_size, stage
) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
torch.set_default_device(device)
max_loras = 8
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(
max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16
)
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def _pretest():
linear = ParallelLMHead(
......@@ -470,6 +472,7 @@ def test_lm_head_logits_processor(
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
def test_linear_replicated(
default_vllm_config,
dist_init,
num_loras,
device,
......@@ -480,13 +483,13 @@ def test_linear_replicated(
max_loras = 8
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(
max_loras=max_loras,
max_lora_rank=8,
lora_dtype=torch.float16,
)
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def create_random_linear_replicated_layer():
linear = ReplicatedLinear(4096, 4096, bias=False, params_dtype=torch.float16)
......@@ -580,21 +583,21 @@ def test_linear_replicated(
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
def test_linear_parallel(
dist_init, num_loras, orientation, fully_shard, device, stage
default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage
) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
max_loras = 8
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(
max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
)
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def create_random_linear_parallel_layer():
if orientation == "row":
......@@ -705,21 +708,21 @@ def test_linear_parallel(
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES)
def test_column_parallel_packed(
dist_init, num_loras, repeats, fully_shard, device, stage
default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage
) -> None:
if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
max_loras = 8
torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device, max_loras=max_loras)
assert check_punica_wrapper(punica_wrapper)
lora_config = LoRAConfig(
max_loras=max_loras,
max_lora_rank=8,
fully_sharded_loras=fully_shard,
lora_dtype=torch.float16,
)
punica_wrapper = get_punica_wrapper(8192, 256, device, lora_config=lora_config)
assert check_punica_wrapper(punica_wrapper)
def create_column_parallel_packed_layer():
if repeats == 2:
......@@ -851,7 +854,7 @@ def test_column_parallel_packed(
@pytest.mark.parametrize(
"seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))
)
def test_vocab_parallel_embedding_indices(tp_size, seed):
def test_vocab_parallel_embedding_indices(tp_size, seed, default_vllm_config):
random.seed(seed)
vocab_size = random.randint(4000, 64000)
added_vocab_size = random.randint(0, 1024)
......
......@@ -77,11 +77,18 @@ def do_sample(
if lora_id
else None,
)
# Print the outputs.
lora_request = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
generated_texts: list[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
# The output should include correct lora_request info
if lora_request is not None:
assert output.lora_request.lora_name == lora_request.lora_name
assert output.lora_request.lora_int_id == lora_request.lora_int_id
assert output.lora_request.lora_path == lora_request.lora_path
else:
assert output.lora_request is None
generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
return generated_texts
......
......@@ -18,6 +18,7 @@ from vllm.lora.layers import (
from vllm.lora.lora_model import LoRAModel
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.model_manager import (
DEFAULT_LANGUAGE_WRAPPER_KEY,
LoRAMapping,
LoRAModelManager,
LRUCacheLoRAModelManager,
......@@ -110,7 +111,7 @@ def create_packed_lora(
return LoRAModel(lora_id, 8, loras)
def test_replace_submodules(dist_init, dummy_model):
def test_replace_submodules(default_vllm_config, dist_init, dummy_model):
model = dummy_model
manager = LoRAModelManager(
model,
......@@ -132,7 +133,7 @@ def test_replace_submodules(dist_init, dummy_model):
@pytest.mark.parametrize("device", DEVICES)
def test_lora_model_manager(dist_init, dummy_model, device):
def test_lora_model_manager(default_vllm_config, dist_init, dummy_model, device):
model = dummy_model
model_lora1 = create_lora(
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
......@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device):
assert manager.activate_adapter(2)
assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] == 2
assert manager.device == device
assert manager.punica_wrapper.device == device
assert (
manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
== device
)
assert hasattr(manager, "supported_lora_modules")
assert sorted(manager.supported_lora_modules) == [
"dense1",
......@@ -196,7 +199,9 @@ def test_lora_model_manager(dist_init, dummy_model, device):
@pytest.mark.parametrize("device", DEVICES)
def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
def test_lora_lru_cache_model_manager(
default_vllm_config, dist_init, dummy_model, device
):
model = dummy_model
model_lora1 = create_lora(
1, model, ["layer1.dense1", "dense2", "lm_head"], device=device
......@@ -278,13 +283,15 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
assert manager.remove_adapter(3)
with pytest.raises(ValueError):
assert manager.pin_adapter(3)
assert manager.punica_wrapper.device == device
assert (
manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
== device
)
assert manager.device == device
@pytest.mark.parametrize("device", DEVICES)
def test_lru_lora_model_manager(dist_init, dummy_model, device):
def test_lru_lora_model_manager(default_vllm_config, dist_init, dummy_model, device):
# This tests just the LRU cache functionality, everything else is
# tested in test_lora_model_manager
model = dummy_model
......@@ -402,12 +409,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert manager.remove_oldest_adapter()
assert set(manager.list_adapters()) == {1}
assert manager.punica_wrapper.device == device
assert (
manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
== device
)
assert manager.device == device
@pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_path):
def test_lru_cache_worker_adapter_manager(
default_vllm_config, dist_init, dummy_model, device, tmp_path
):
lora_config = LoRAConfig(
max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
)
......@@ -514,11 +526,16 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
)
assert worker_adapter_manager.device == device
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert punica_wrapper.device == device
@pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path):
def test_worker_adapter_manager(
default_vllm_config, dist_init, dummy_model_gate_up, device, tmp_path
):
# Should remove every LoRA not specified in the request.
lora_config = LoRAConfig(
max_lora_rank=8, max_cpu_loras=4, max_loras=4, lora_dtype=DEFAULT_DTYPE
......@@ -618,11 +635,14 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
)
assert worker_adapter_manager.device == device
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert punica_wrapper.device == device
@pytest.mark.parametrize("device", DEVICES)
def test_packed_loras(dist_init, dummy_model_gate_up, device):
def test_packed_loras(default_vllm_config, dist_init, dummy_model_gate_up, device):
model = dummy_model_gate_up
model_lora = create_packed_lora(
1,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment