Commit 3fb4b5fa authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.0' into v0.18.0-ori

parents bcf25339 89138b21
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import vllm.config
from tests.compile.backend import TestBackend
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from vllm._aiter_ops import is_aiter_found_and_supported, rocm_aiter_ops
from vllm.compilation.passes.fusion.matcher_utils import ROTARY_OP
from vllm.compilation.passes.fusion.rope_kvcache_fusion import RopeKVCacheFusionPass
from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
from vllm.compilation.passes.utility.scatter_split_replace import (
ScatterSplitReplacementPass,
)
from vllm.compilation.passes.utility.split_coalescing import SplitCoalescingPass
from vllm.config import (
CacheConfig,
CompilationConfig,
CompilationMode,
ModelConfig,
PassConfig,
VllmConfig,
)
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.platforms import current_platform
from vllm.v1.attention.backend import (
AttentionBackend,
CommonAttentionMetadata,
)
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.kv_cache_interface import AttentionSpec
# ATen overload looked for in the pre-fusion graph when the RoPE custom op is
# disabled (see QKRoPEKVCacheTestModel.ops_in_model_before).
INDEX_SELECT_OP = torch.ops.aten.index.Tensor
# Handle to vLLM's unified KV cache update custom op.
VLLM_UNIFIED_KV_CACHE_UPDATE_OP = torch.ops.vllm.unified_kv_cache_update
# Platform-specific FP8 dtype; used as the KV cache dtype when the configured
# cache dtype string starts with "fp8".
FP8_DTYPE = current_platform.fp8_dtype()
class QKRoPEKVCacheTestModel(torch.nn.Module):
    """Test module that applies RoPE to Q/K and then updates the KV cache.

    The forward pass splits a packed QKV tensor, runs the rotary embedding on
    Q and K, and calls ``unified_kv_cache_update`` instead of running a full
    attention forward. An ``Attention`` layer is constructed so that the layer
    metadata is registered for the fusion pass.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        attn_backend: AttentionBackendEnum,
        num_heads: int,
        num_kv_heads: int,
        head_size: int,
        is_neox: bool,
        dtype: torch.dtype,
        device: torch.device,
        prefix: str = "model.layers.0.self_attn.attn",
    ):
        """Build the rotary embedding, attention layer and metadata builder.

        Args:
            vllm_config: Config supplying the cache and quantization settings.
            attn_backend: Attention backend enum whose class is used.
            num_heads: Number of query heads.
            num_kv_heads: Number of key/value heads.
            head_size: Size of each head (also used as the rotary dim).
            is_neox: Whether to use the NeoX rotary style.
            dtype: Model dtype.
            device: Device for the scales, metadata builder and KV cache.
            prefix: Layer name under which the attention layer is registered.
        """
        super().__init__()
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_size = head_size
        self.block_size = vllm_config.cache_config.block_size
        self.q_size = num_heads * head_size
        self.kv_size = num_kv_heads * head_size
        self.is_neox = is_neox
        self.dtype = dtype
        self.device = device
        self.layer_name = prefix

        self.rotary_emb = RotaryEmbedding(
            head_size,
            rotary_dim=head_size,
            max_position_embeddings=4096,
            base=10000,
            is_neox_style=is_neox,
            dtype=self.dtype,
        )

        # Whether to check for the RoPE custom op or component index_select
        self.enable_rope_custom_op = self.rotary_emb.enabled()

        # Register layer metadata for the fusion pass via Attention.
        self.attn = Attention(
            num_heads=num_heads,
            head_size=head_size,
            scale=1.0 / head_size**0.5,
            num_kv_heads=num_kv_heads,
            cache_config=vllm_config.cache_config,
            quant_config=vllm_config.quant_config,
            prefix=prefix,
            attn_backend=attn_backend.get_class(),
        )
        self.attn_backend: type[AttentionBackend] = self.attn.get_attn_backend()
        assert not self.attn_backend.forward_includes_kv_cache_update, (
            f"Attention backend {self.attn_backend} does not support fuse_rope_kvcache."
        )
        self.attn._k_scale = self.attn._k_scale.to(device)
        self.attn._v_scale = self.attn._v_scale.to(device)

        kv_cache_dtype_str = vllm_config.cache_config.cache_dtype
        self.kv_cache_dtype = (
            FP8_DTYPE if kv_cache_dtype_str.startswith("fp8") else self.dtype
        )

        # Initialize attn MetadataBuilder
        self.builder = self.attn.attn_backend.get_builder_cls()(
            kv_cache_spec=AttentionSpec(
                block_size=self.block_size,
                num_kv_heads=self.num_kv_heads,
                head_size=head_size,
                dtype=self.kv_cache_dtype,
            ),
            layer_names=[self.attn.layer_name],
            vllm_config=vllm_config,
            device=device,
        )

    def build_attn_metadata(self, batch_size: int) -> CommonAttentionMetadata:
        """Initialize attention metadata.

        Builds metadata for ``batch_size`` requests with a sequence and query
        length of 1 each, and installs a fresh zero-filled KV cache on the
        attention layer.
        """
        # Create common attn metadata
        batch_spec = BatchSpec(seq_lens=[1] * batch_size, query_lens=[1] * batch_size)
        common_attn_metadata = create_common_attn_metadata(
            batch_spec, self.block_size, self.device, arange_block_indices=True
        )

        max_blocks = (max(batch_spec.seq_lens) + self.block_size - 1) // self.block_size
        num_blocks = batch_size * max_blocks

        # Fetch the attention backend and kv cache shape and stride order
        attn_backend = self.attn.attn_backend
        kv_cache_shape = attn_backend.get_kv_cache_shape(
            num_blocks, self.block_size, self.num_kv_heads, self.head_size
        )
        try:
            kv_cache_stride_order = attn_backend.get_kv_cache_stride_order()
        except (AttributeError, NotImplementedError):
            # Backend does not define a stride order: use the identity layout.
            kv_cache_stride_order = tuple(range(len(kv_cache_shape)))

        kv_cache_shape = tuple(kv_cache_shape[i] for i in kv_cache_stride_order)
        inv_order = [
            kv_cache_stride_order.index(i) for i in range(len(kv_cache_stride_order))
        ]

        # Create dummy KV cache. Allocate directly in the stride-ordered shape
        # reported by the backend rather than from a hand-computed element
        # count, so the allocation always matches the backend's layout.
        raw_tensor = torch.zeros(
            kv_cache_shape,
            dtype=self.kv_cache_dtype,
            device=self.device,
        )
        # Permute back so the logical dimension order matches the backend's
        # kv cache shape while the memory keeps the stride order above.
        kv_cache = raw_tensor.permute(*inv_order)

        self.attn.kv_cache = [kv_cache]

        # Build attn metadata
        attn_metadata = self.builder.build(
            common_prefix_len=0, common_attn_metadata=common_attn_metadata
        )

        return attn_metadata

    def forward(
        self, qkv: torch.Tensor, positions: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Apply RoPE to Q/K and issue the KV cache update.

        Returns:
            ``(q, k, v, kv_cache_dummy_dep)`` where q/k/v are viewed as
            ``(-1, heads, head_size)`` and the last item is the value returned
            by ``unified_kv_cache_update``.
        """
        # Create copy so inplace ops do not modify the original tensors
        qkv = qkv.clone()
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)

        # Instead of a full forward pass, match only the KV cache update op here
        q = q.view(-1, self.num_heads, self.head_size)
        k = k.view(-1, self.num_kv_heads, self.head_size)
        v = v.view(-1, self.num_kv_heads, self.head_size)
        kv_cache_dummy_dep = torch.ops.vllm.unified_kv_cache_update(
            k, v, self.layer_name
        )
        return q, k, v, kv_cache_dummy_dep

    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
        """Return the ops expected in the graph before the fusion pass."""
        ops = []
        if self.enable_rope_custom_op:
            if rocm_aiter_ops.is_triton_rotary_embed_enabled():
                ops.append(torch.ops.vllm.rocm_aiter_triton_rotary_embedding.default)
            else:
                ops.append(ROTARY_OP)
        else:
            ops.append(INDEX_SELECT_OP)
        ops.append(torch.ops.vllm.unified_kv_cache_update.default)
        return ops

    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
        """Return the ops expected in the graph after the fusion pass."""
        return [torch.ops.vllm.fused_rope_and_unified_kv_cache_update.default]
@pytest.mark.parametrize(
    "attn_backend",
    [
        AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
        AttentionBackendEnum.TRITON_ATTN,
        AttentionBackendEnum.ROCM_ATTN,
        AttentionBackendEnum.ROCM_AITER_FA,
    ],
)
@pytest.mark.parametrize("enable_rope_custom_op", [True])  # [True, False])
@pytest.mark.parametrize("enable_aiter_triton_rope", [True, False])
@pytest.mark.parametrize("num_heads", [64])
@pytest.mark.parametrize("num_kv_heads", [8])
@pytest.mark.parametrize("head_size", [64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("is_neox", [True, False])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.skipif(
    not is_aiter_found_and_supported(),
    reason="Only test on ROCm with AITER installed and supported",
)
def test_rope_kvcache_fusion(
    attn_backend: AttentionBackendEnum,
    enable_rope_custom_op: bool,
    enable_aiter_triton_rope: bool,
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    block_size: int,
    is_neox: bool,
    dtype: torch.dtype,
    kv_cache_dtype: str,
    monkeypatch: pytest.MonkeyPatch,
):
    """Compare the eager model against the compiled, fused model.

    Runs the model once eagerly and once through ``torch.compile`` with the
    RoPE + KV-cache fusion pass, then checks that exactly one match was fused,
    that the expected ops appear before/after the passes, and that q, k, v and
    the KV cache agree within tolerance.
    """
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(0)

    custom_ops: list[str] = ["+rotary_embedding"] if enable_rope_custom_op else []

    model_config = ModelConfig(dtype=dtype)
    cache_config = CacheConfig(block_size=block_size, cache_dtype=kv_cache_dtype)
    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        custom_ops=custom_ops,
        pass_config=PassConfig(fuse_rope_kvcache=True, eliminate_noops=True),
    )
    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        compilation_config=compilation_config,
    )

    with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
        # Select the AITER code paths via environment, then re-read them.
        triton_rope_flag = "1" if enable_aiter_triton_rope else "0"
        m.setenv("VLLM_ROCM_USE_AITER", "1")
        m.setenv("VLLM_ROCM_USE_AITER_TRITON_ROPE", triton_rope_flag)
        rocm_aiter_ops.refresh_env_variables()

        model = QKRoPEKVCacheTestModel(
            vllm_config=vllm_config,
            attn_backend=attn_backend,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_size=head_size,
            is_neox=is_neox,
            dtype=dtype,
            device=torch.get_default_device(),
        )

        fusion_pass = RopeKVCacheFusionPass(vllm_config)
        backend = TestBackend(
            NoOpEliminationPass(vllm_config),
            SplitCoalescingPass(vllm_config),
            ScatterSplitReplacementPass(vllm_config),
            fusion_pass,
            PostCleanupPass(vllm_config),
        )

        num_tokens = 5
        qkv_width = num_heads * head_size + 2 * num_kv_heads * head_size
        qkv = torch.randn(num_tokens, qkv_width, dtype=dtype)
        pos = torch.arange(num_tokens, dtype=torch.long)

        # Separate copies for the eager run; the originals feed the compiled run.
        qkv_unfused = qkv.clone()
        pos_unfused = pos.clone()

        # Eager (unfused) reference run.
        with set_forward_context(None, vllm_config):
            forward_context = get_forward_context()
            attn_metadata = model.build_attn_metadata(num_tokens)
            forward_context.slot_mapping = {
                model.layer_name: attn_metadata.slot_mapping
            }
            q_unfused, k_unfused, v_unfused, dummy = model(qkv_unfused, pos_unfused)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
            kv_cache_unfused = attn_layer.kv_cache[forward_context.virtual_engine]
        del dummy

        torch._dynamo.mark_dynamic(qkv, 0)
        torch._dynamo.mark_dynamic(pos, 0)

        # Compiled (fused) run; build_attn_metadata installs a fresh KV cache.
        with set_forward_context(None, vllm_config):
            model_fused = torch.compile(model, backend=backend)
            forward_context = get_forward_context()
            attn_metadata = model_fused.build_attn_metadata(num_tokens)
            forward_context.slot_mapping = {
                model.layer_name: attn_metadata.slot_mapping
            }
            q_fused, k_fused, v_fused, dummy = model_fused(qkv, pos)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
            kv_cache_fused = attn_layer.kv_cache[forward_context.virtual_engine]
        del dummy

        assert fusion_pass.matched_count == 1

        backend.check_before_ops(model.ops_in_model_before())
        backend.check_after_ops(model.ops_in_model_after())

        # Same value is used for both absolute and relative tolerance.
        tolerance = 2e-3 if dtype == torch.float16 else 1e-2

        torch.testing.assert_close(q_unfused, q_fused, atol=tolerance, rtol=tolerance)
        torch.testing.assert_close(k_unfused, k_fused, atol=tolerance, rtol=tolerance)
        torch.testing.assert_close(v_unfused, v_fused, atol=tolerance, rtol=tolerance)
        # Cannot compare fp8_* directly here, view as model dtype instead
        torch.testing.assert_close(
            kv_cache_unfused.view(dtype),
            kv_cache_fused.view(dtype),
            atol=tolerance,
            rtol=tolerance,
        )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import torch.nn as nn
import vllm
from tests.compile.backend import TestBackend
from vllm.compilation.passes.utility.scatter_split_replace import (
ScatterSplitReplacementPass,
)
from vllm.compilation.passes.utility.split_coalescing import SplitCoalescingPass
from vllm.config import CompilationConfig, CompilationMode, VllmConfig
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
class ScatterSplitReplacementModel(nn.Module):
    """Model with a rope+getitem+slice_scatter+split_with_sizes sequence."""

    def __init__(
        self,
        num_heads: int,
        num_kv_heads: int,
        head_size: int,
        dtype: torch.dtype,
    ):
        """Record the Q and KV widths and create a NeoX-style rotary embedding."""
        super().__init__()
        self.q_size = num_heads * head_size
        self.kv_size = num_kv_heads * head_size
        rope_kwargs = dict(
            rotary_dim=head_size,
            max_position_embeddings=4096,
            base=10000,
            is_neox_style=True,
            dtype=dtype,
        )
        self.rotary_emb = RotaryEmbedding(head_size, **rope_kwargs)

    def forward(self, qkv: torch.Tensor, positions: torch.Tensor):
        """Split packed QKV, rotate Q/K, and return each with a distinct offset."""
        # Create copy so inplace ops do not modify the original tensors
        packed = qkv.clone()
        split_sizes = [self.q_size, self.kv_size, self.kv_size]
        q, k, v = packed.split(split_sizes, dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        # Distinct offsets keep the three outputs distinguishable.
        return q + 1, k + 2, v + 3

    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
        """Ops listed as present before the replacement pass."""
        aten = torch.ops.aten
        return [
            aten.slice_scatter.default,
            aten.split_with_sizes.default,
            aten.getitem.default,
        ]

    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
        """Ops listed as present after the replacement pass."""
        return [torch.ops.aten.getitem.default]
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scatter_split_replace(dtype):
    """Check eager vs. compiled outputs match and slice_scatter is removed.

    After SplitCoalescingPass and ScatterSplitReplacementPass the compiled
    graph must contain no ``slice_scatter`` and exactly one
    ``split_with_sizes``.
    """
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(0)

    num_heads, num_kv_heads, head_size = 8, 4, 64

    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        custom_ops=["+rotary_embedding"],
    )
    vllm_config = VllmConfig(compilation_config=compilation_config)

    with vllm.config.set_current_vllm_config(vllm_config):
        # ScatterSplitReplacementPass requires SplitCoalescingPass to be run before it
        coalesce_pass = SplitCoalescingPass(vllm_config)
        replace_pass = ScatterSplitReplacementPass(vllm_config)
        backend = TestBackend(coalesce_pass, replace_pass)

        model = ScatterSplitReplacementModel(num_heads, num_kv_heads, head_size, dtype)

        num_tokens = 5
        qkv_width = num_heads * head_size + 2 * num_kv_heads * head_size
        qkv = torch.randn(num_tokens, qkv_width, dtype=dtype)
        pos = torch.arange(num_tokens, dtype=torch.long)

        # Eager reference on copies of the inputs.
        result_eager = model(qkv.clone(), pos.clone())

        torch._dynamo.mark_dynamic(qkv, 0)
        torch._dynamo.mark_dynamic(pos, 0)
        model_compiled = torch.compile(model, backend=backend)
        result_compiled = model_compiled(qkv, pos)

        for expected, actual in zip(result_eager, result_compiled):
            torch.testing.assert_close(expected, actual)

        aten = torch.ops.aten
        assert backend.op_count(aten.slice_scatter.default) == 0
        assert backend.op_count(aten.split_with_sizes.default) == 1
...@@ -26,22 +26,14 @@ from vllm.config import ( ...@@ -26,22 +26,14 @@ from vllm.config import (
VllmConfig, VllmConfig,
set_current_vllm_config, set_current_vllm_config,
) )
from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.kernels.linear import (
from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
CutlassFP8ScaledMMLinearKernel, CutlassFP8ScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.flashinfer import (
FlashInferFP8ScaledMMLinearKernel, FlashInferFP8ScaledMMLinearKernel,
) FP8ScaledMMLinearKernel,
from vllm.model_executor.layers.quantization.kernels.scaled_mm.pytorch import (
PerTensorTorchFP8ScaledMMLinearKernel, PerTensorTorchFP8ScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.rocm import (
ROCmFP8ScaledMMLinearKernel, ROCmFP8ScaledMMLinearKernel,
) )
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 from vllm.model_executor.layers.activation import SiluAndMul
FP8ScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, GroupShape,
...@@ -190,8 +182,24 @@ TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS ...@@ -190,8 +182,24 @@ TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
"model_class, enable_quant_fp8_custom_op, force_kernel", "model_class, enable_quant_fp8_custom_op, force_kernel",
list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS))
+ [ + [
(TestSiluMulNvfp4QuantModel, False, None), pytest.param(
(TestSiluMulGroupFp8QuantModel, False, None), TestSiluMulNvfp4QuantModel,
False,
None,
marks=pytest.mark.skipif(
not current_platform.is_cuda(), reason="CUDA only"
),
),
# GroupFP8Quant fusion only works with AITER on ROCm,
# and enable_quant_fp8_custom_op must be True.
pytest.param(
TestSiluMulGroupFp8QuantModel,
True,
None,
marks=pytest.mark.skipif(
not current_platform.is_rocm(), reason="ROCm only"
),
),
], ],
) )
@pytest.mark.skipif( @pytest.mark.skipif(
...@@ -209,6 +217,7 @@ def test_fusion_silu_and_mul_quant( ...@@ -209,6 +217,7 @@ def test_fusion_silu_and_mul_quant(
enable_silu_mul_custom_op: bool, enable_silu_mul_custom_op: bool,
enable_quant_fp8_custom_op: bool, enable_quant_fp8_custom_op: bool,
force_kernel: FP8ScaledMMLinearKernel | None, force_kernel: FP8ScaledMMLinearKernel | None,
monkeypatch: pytest.MonkeyPatch,
): ):
if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported(): if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported():
pytest.skip("NVFP4 is not supported on this GPU.") pytest.skip("NVFP4 is not supported on this GPU.")
...@@ -235,13 +244,16 @@ def test_fusion_silu_and_mul_quant( ...@@ -235,13 +244,16 @@ def test_fusion_silu_and_mul_quant(
), ),
) )
with set_current_vllm_config(config): with set_current_vllm_config(config), monkeypatch.context() as m:
fusion_passes = [ActivationQuantFusionPass(config)] fusion_passes = [ActivationQuantFusionPass(config)]
if IS_AITER_FOUND: if IS_AITER_FOUND and model_class is TestSiluMulGroupFp8QuantModel:
from vllm._aiter_ops import rocm_aiter_ops
from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( from vllm.compilation.passes.fusion.rocm_aiter_fusion import (
RocmAiterSiluMulFp8GroupQuantFusionPass, RocmAiterSiluMulFp8GroupQuantFusionPass,
) )
m.setenv("VLLM_ROCM_USE_AITER", "1")
rocm_aiter_ops.refresh_env_variables()
fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)]
passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)] passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)]
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
import functools import functools
import hashlib import hashlib
import multiprocessing import multiprocessing
import os
import pickle import pickle
import tempfile import tempfile
from contextlib import contextmanager from contextlib import contextmanager
...@@ -14,9 +15,12 @@ import pytest ...@@ -14,9 +15,12 @@ import pytest
import torch import torch
import vllm.model_executor.layers.activation import vllm.model_executor.layers.activation
from vllm.compilation.backends import VllmBackend
from vllm.compilation.caching import ( from vllm.compilation.caching import (
StandaloneCompiledArtifacts, StandaloneCompiledArtifacts,
VllmSerializableFunction,
) )
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import ( from vllm.config import (
CompilationConfig, CompilationConfig,
...@@ -156,6 +160,26 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch): ...@@ -156,6 +160,26 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
assert torch.allclose(ret, expected) assert torch.allclose(ret, expected)
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
    """Round-trip a traced function containing a ``slice`` through serialization.

    The generated code of the deserialized graph module must equal the code of
    the original traced module.
    """

    def foo(x: torch.Tensor):
        return x[slice(0, x.shape[0])]

    vllm_config = make_vllm_config()

    example_input = torch.randn(10, 10)
    torch._dynamo.mark_dynamic(example_input, 0)

    traced = torch.fx.symbolic_trace(foo)
    # The slice bound must appear as a traced value in the generated code.
    assert "getitem_1 = x[slice(0, getitem, None)]" in traced.code

    with use_vllm_config(vllm_config):
        serializable = VllmSerializableFunction(traced, (example_input,), "", foo)
        payload = VllmSerializableFunction.serialize_compile_artifacts(serializable)
        restored = VllmSerializableFunction.deserialize_compile_artifacts(payload)

    assert traced.code == restored.graph_module.code
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch): def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
""" """
...@@ -700,3 +724,156 @@ class TestStandaloneCompiledArtifactsIntegration: ...@@ -700,3 +724,156 @@ class TestStandaloneCompiledArtifactsIntegration:
("mod3", "shape3"), ("mod3", "shape3"),
]: ]:
assert cache.get(submod, shape) == shared_data assert cache.get(submod, shape) == shared_data
def test_functorch_config(self):
    """Check that ``bundled_autograd_cache`` is True when the backend runs.

    The function is constructed while ``bundled_autograd_cache=True`` is
    patched in; the loaded function is then invoked while the ambient
    setting is False, and a stand-in backend records the functorch config it
    observes.
    """
    vllm_config = make_vllm_config()
    example_inputs = (torch.randn(10, 10),)

    def add_1(x: torch.Tensor):
        return x + 1

    gm = torch._dynamo.functional_export.dynamo_graph_capture_for_export(add_1)(
        *example_inputs
    )

    # Reset codegen and the dynamo bytecode (un)flatten hooks on the
    # captured module before wrapping it.
    gm.graph._codegen = torch.fx.graph.CodeGen()
    gm._dynamo_bytecode_flatten = None
    gm._dynamo_bytecode_unflatten = None

    with (
        torch._functorch.config.patch(bundled_autograd_cache=False),
        set_current_vllm_config(vllm_config),
    ):
        # Construct the function with the flag temporarily enabled.
        with torch._functorch.config.patch(bundled_autograd_cache=True):
            fn = VllmSerializableFunction(gm, example_inputs, "", add_1)

        payload = VllmSerializableFunction.serialize_compile_artifacts(fn)

        # Filled in by the stand-in backend below.
        config = None

        def backend(*args, **kwargs) -> VllmSerializableFunction:
            nonlocal config
            # bundled_autograd_cache should be True even compiler backend
            # runs with bundled_autograd_cache=False in ambient context.
            config = torch._functorch.config.save_config_portable()
            return fn

        loaded_fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
        # Replace VllmBackend.__call__ so the call below records the config.
        with patch.object(VllmBackend, "__call__", backend):
            loaded_fn(*example_inputs)

        assert isinstance(config, dict)
        assert "bundled_autograd_cache" in config
        assert config["bundled_autograd_cache"] is True
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_disable_compile_cache_skips_aot_save(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be saved."""
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    args = (torch.randn(10, 10),)
    expected = reference_fn(*args)
    vllm_config = make_vllm_config()

    # One AOT compile is expected, with nothing saved to or loaded from disk.
    expected_counts = compilation_counter.expect(
        num_aot_compiles=1,
        num_aot_artifacts_saved=0,
        num_aot_artifacts_loaded=0,
    )
    with use_vllm_config(vllm_config), expected_counts:
        compiled_mod = CompiledMod(vllm_config=vllm_config)
        actual = compiled_mod(*args)
        assert torch.allclose(actual, expected)

    # No cached artifact should exist on disk
    aot_dir = os.path.join(fresh_vllm_cache, "torch_compile_cache", "torch_aot_compile")
    if not os.path.isdir(aot_dir):
        return
    for root, _dirs, files in os.walk(aot_dir):
        for f in files:
            assert f != "model", (
                f"AOT artifact unexpectedly saved at {os.path.join(root, f)}"
            )
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_disable_compile_cache_skips_aot_load(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be loaded."""
    args = (torch.randn(10, 10),)

    # Phase 1: compile and save with cache enabled
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    first_config = make_vllm_config()
    saved_once = compilation_counter.expect(num_aot_artifacts_saved=1)
    with use_vllm_config(first_config), saved_once:
        CompiledMod(vllm_config=first_config)(*args)

    # Phase 2: disable cache, compile again — should NOT load from disk
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
    disable_envs_cache()
    torch._dynamo.reset()

    second_config = make_vllm_config()
    fresh_compile_only = compilation_counter.expect(
        num_aot_compiles=1,
        num_aot_artifacts_saved=0,
        num_aot_artifacts_loaded=0,
    )
    with use_vllm_config(second_config), fresh_compile_only:
        mod = CompiledMod(vllm_config=second_config)
        mod(*args)
        assert not mod.was_aot_compile_fn_loaded_from_disk
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_aot_counters_on_save_and_load(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """Verify AOT counters are incremented correctly on save and load."""
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    args = (torch.randn(10, 10),)

    # Phase 1: fresh compile + save
    save_config = make_vllm_config()
    compile_and_save = compilation_counter.expect(
        num_aot_compiles=1,
        num_aot_artifacts_saved=1,
        num_aot_artifacts_loaded=0,
    )
    with use_vllm_config(save_config), compile_and_save:
        CompiledMod(vllm_config=save_config)(*args)

    # Phase 2: load from cache
    monkeypatch.setenv("VLLM_FORCE_AOT_LOAD", "1")
    disable_envs_cache()

    load_config = make_vllm_config()
    load_only = compilation_counter.expect(
        num_aot_compiles=0,
        num_aot_artifacts_saved=0,
        num_aot_artifacts_loaded=1,
    )
    with use_vllm_config(load_config), load_only:
        CompiledMod(vllm_config=load_config)(*args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from torch._dynamo.utils import counters
from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
    """Cold-start compile of a small MoE model compiles three unique subgraphs.

    Asserts three AOT autograd cache misses and zero hits.
    """
    env_overrides = {
        # Run in same process so we can access PyTorch's internal counters
        "VLLM_ENABLE_V1_MULTIPROCESSING": "0",
        # I'm not sure if this is going to affect the numbers
        "VLLM_USE_AOT_COMPILE": "0",
        # Force cold compilation
        "VLLM_DISABLE_COMPILE_CACHE": "1",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
    )

    counters.clear()

    llm_kwargs = dict(
        model="microsoft/Phi-tiny-MoE-instruct",
        max_model_len=256,
        load_format="dummy",  # make the model loading faster
        compilation_config=compilation_config,
        num_gpu_blocks_override=8,  # make the model loading faster
    )
    _ = LLM(**llm_kwargs)

    # vLLM-compile cold start is special. By default, we do
    # one full dynamo capture of the entire forward pass.
    # The forward pass consists of 32 transformer layers.
    # Then, we split on the attention operation. This results in
    # 33 subgraphs (not including the attention operation).
    # We then generate compiled artifacts for the unique subgraphs.
    #
    # There are actually only 3 unique subgraphs for this model
    # (all of its transformer layers are the same modulo weights);
    # this is true for most vLLM models.
    # So we test that during cold start, we are only compiling
    # for 3 unique subgraphs.
    aot_autograd_counts = counters["aot_autograd"]
    assert aot_autograd_counts["autograd_cache_miss"] == 3
    assert aot_autograd_counts["autograd_cache_hit"] == 0
...@@ -73,6 +73,7 @@ def test_compile_ranges(use_fresh_inductor_cache): ...@@ -73,6 +73,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
Range(start=16, end=16), Range(start=16, end=16),
Range(start=9, end=32), Range(start=9, end=32),
Range(start=64, end=64), Range(start=64, end=64),
Range(start=128, end=128),
Range(start=33, end=8192), Range(start=33, end=8192),
] ]
) )
...@@ -85,7 +86,7 @@ def test_compile_ranges(use_fresh_inductor_cache): ...@@ -85,7 +86,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
), ),
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
compile_ranges_split_points=[8, 32], compile_ranges_endpoints=[8, 32],
compile_sizes=[16, 64, 128], compile_sizes=[16, 64, 128],
inductor_compile_config={ inductor_compile_config={
"post_grad_custom_post_pass": post_grad_range_checker, "post_grad_custom_post_pass": post_grad_range_checker,
...@@ -95,21 +96,21 @@ def test_compile_ranges(use_fresh_inductor_cache): ...@@ -95,21 +96,21 @@ def test_compile_ranges(use_fresh_inductor_cache):
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
model = TestModel(vllm_config=vllm_config, prefix="").eval() model = TestModel(vllm_config=vllm_config, prefix="").eval()
# Number of compilations: 3 for each compile range + 2 compile sizes # Number of compilations: 3 compile ranges + 3 compile sizes
batch_sizes = [1, 4, 16, 24, 48, 64, 8192] batch_sizes = [1, 4, 16, 24, 48, 64, 8192]
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=1, num_graphs_seen=1,
num_piecewise_graphs_seen=1, num_piecewise_graphs_seen=1,
num_backend_compilations=5, num_backend_compilations=6,
): ):
run_model(vllm_config, model, batch_sizes) run_model(vllm_config, model, batch_sizes)
assert post_grad_range_checker.num_calls == 5 assert post_grad_range_checker.num_calls == 6
def test_compile_config_get_compile_ranges(): def test_compile_config_get_compile_ranges():
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
compile_ranges_split_points=[8, 32], compile_ranges_endpoints=[8, 32],
) )
VllmConfig( VllmConfig(
scheduler_config=SchedulerConfig( scheduler_config=SchedulerConfig(
...@@ -126,6 +127,88 @@ def test_compile_config_get_compile_ranges(): ...@@ -126,6 +127,88 @@ def test_compile_config_get_compile_ranges():
] ]
class PostGradStaticShapeChecker(InductorPass):
    """Asserts that compile_sizes entries produce graphs with fully concrete
    (non-symbolic) shapes, and compile_ranges entries have symbolic shapes."""

    def __init__(self):
        # Graphs seen for a single-size range with no symbolic tensor dims.
        self.num_static_calls = 0
        # Graphs seen for a multi-size range with at least one symbolic dim.
        self.num_dynamic_calls = 0

    def __call__(self, graph: fx.Graph):
        """Inspect tensor metadata on every node and update the counters."""
        from torch.fx.experimental.symbolic_shapes import is_symbolic

        compile_range = get_pass_context().compile_range
        expect_static = compile_range.is_single_size()

        for node in graph.nodes:
            # Prefer "val"; fall back to "example_value" when it is absent.
            meta_value = node.meta.get("val")
            if meta_value is None:
                meta_value = node.meta.get("example_value")
            if not isinstance(meta_value, torch.Tensor):
                continue

            symbolic_dim_found = any(is_symbolic(dim) for dim in meta_value.shape)
            if expect_static:
                assert not symbolic_dim_found, (
                    f"compile_sizes entry {compile_range}: "
                    f"node '{node.name}' has symbolic shape "
                    f"{meta_value.shape}"
                )
            elif symbolic_dim_found:
                # compile_ranges should have at least some
                # symbolic shapes (the batch dimension)
                self.num_dynamic_calls += 1
                return

        if expect_static:
            self.num_static_calls += 1

    def uuid(self) -> str:
        """Return the hash of an empty state dict."""
        return InductorPass.hash_dict({})
def test_compile_sizes_produce_static_shapes(use_fresh_inductor_cache):
    """Verify that compile_sizes entries are compiled with fully concrete
    shapes (no SymInts), while compile_ranges entries retain dynamic shapes."""
    checker = PostGradStaticShapeChecker()
    torch.set_default_device("cuda")

    scheduler_config = SchedulerConfig(
        max_num_batched_tokens=8192,
        max_model_len=8192,
        is_encoder_decoder=False,
    )
    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        compile_ranges_endpoints=[8],
        compile_sizes=[16],
        inductor_compile_config={"post_grad_custom_post_pass": checker},
    )
    vllm_config = VllmConfig(
        scheduler_config=scheduler_config,
        compilation_config=compilation_config,
    )

    with set_current_vllm_config(vllm_config):
        model = TestModel(vllm_config=vllm_config, prefix="").eval()

        # 3 compilations: Range(1,8), Range(9,8192), single-size 16
        expected_counts = compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=1,
            num_backend_compilations=3,
        )
        with expected_counts:
            run_model(vllm_config, model, [1, 16, 64])

        # compile_sizes=16 should produce static shapes
        assert checker.num_static_calls == 1, (
            f"Expected 1 static compilation, got {checker.num_static_calls}"
        )
        # compile_ranges should produce dynamic shapes
        assert checker.num_dynamic_calls == 2, (
            f"Expected 2 dynamic compilations, got {checker.num_dynamic_calls}"
        )
def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache): def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
# To force multiple compilations, we disable the compile cache # To force multiple compilations, we disable the compile cache
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
...@@ -148,7 +231,7 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache): ...@@ -148,7 +231,7 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
scheduler_config=scheduler_config, scheduler_config=scheduler_config,
compilation_config=CompilationConfig( compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE, mode=CompilationMode.VLLM_COMPILE,
compile_ranges_split_points=[8], compile_ranges_endpoints=[8],
inductor_compile_config={ inductor_compile_config={
"post_grad_custom_post_pass": post_grad_range_checker, "post_grad_custom_post_pass": post_grad_range_checker,
}, },
......
...@@ -421,6 +421,7 @@ def test_cudagraph_sizes_post_init( ...@@ -421,6 +421,7 @@ def test_cudagraph_sizes_post_init(
fuse_norm_quant=True, fuse_norm_quant=True,
fuse_act_quant=True, fuse_act_quant=True,
eliminate_noops=True, eliminate_noops=True,
sp_min_token_num=512 if enable_sp else None,
), ),
cudagraph_mode=cudagraph_mode, cudagraph_mode=cudagraph_mode,
) )
...@@ -569,3 +570,45 @@ def test_compile_sizes_padding_validation(): ...@@ -569,3 +570,45 @@ def test_compile_sizes_padding_validation():
assert sorted(config.compile_sizes) == [3, 5, 7] assert sorted(config.compile_sizes) == [3, 5, 7]
dispatcher = CudagraphDispatcher(_create_vllm_config_for_validation(config)) dispatcher = CudagraphDispatcher(_create_vllm_config_for_validation(config))
dispatcher.initialize_cudagraph_keys(CUDAGraphMode.NONE) # Should not raise dispatcher.initialize_cudagraph_keys(CUDAGraphMode.NONE) # Should not raise
@pytest.mark.parametrize(
    "capture_sizes, max_size, num_blocks, expected_sizes, expected_max",
    [
        # Sizes larger than num_blocks are dropped; the max shrinks with them.
        (
            [1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
            512,
            200,
            [1, 2, 4, 8, 16, 32, 64, 128],
            128,
        ),
        # num_blocks exceeds every size: nothing is removed.
        ([1, 2, 4, 8, 16], 16, 1000, [1, 2, 4, 8, 16], 16),
        # num_blocks equals the largest size: nothing is removed.
        ([1, 2, 4, 8, 16, 32], 32, 32, [1, 2, 4, 8, 16, 32], 32),
        # num_blocks is below the smallest size: every size is removed.
        ([8, 16, 32], 32, 4, [], 0),
        # Non-positive num_blocks: the configuration is expected to be untouched.
        ([1, 2, 4], 4, 0, [1, 2, 4], 4),
    ],
)
def test_adjust_cudagraph_sizes_for_mamba_cache(
    capture_sizes, max_size, num_blocks, expected_sizes, expected_max
):
    """Check that capture sizes are capped by the number of Mamba cache blocks.

    Each case builds a ``CompilationConfig`` with the given capture sizes,
    applies ``adjust_cudagraph_sizes_for_mamba_cache(num_blocks)`` and compares
    the resulting size list and maximum against the expected values.

    See: https://github.com/vllm-project/vllm/issues/34094
    """
    config = CompilationConfig(
        cudagraph_capture_sizes=capture_sizes,
        max_cudagraph_capture_size=max_size,
        cudagraph_mode=CUDAGraphMode.NONE,
    )

    config.adjust_cudagraph_sizes_for_mamba_cache(num_blocks)

    assert config.cudagraph_capture_sizes == expected_sizes
    assert config.max_cudagraph_capture_size == expected_max

    # The "last element is the max" invariant only applies to non-empty lists.
    if not expected_sizes:
        return
    assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size
...@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch ...@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
expected_num_backend_compilations = 4 expected_num_backend_compilations = 4
# A has support_torch_compile but enable_if fn returns False # A has support_torch_compile but enable_if fn returns False
# enalbe_if will be True for B, so we expect mod1 and mod2 # enable_if will be True for B, so we expect mod1 and mod2
# to be compiled # to be compiled
with compilation_counter.expect( with compilation_counter.expect(
num_graphs_seen=2, num_graphs_seen=2,
......
...@@ -99,8 +99,8 @@ def test_dynamic_shapes_compilation( ...@@ -99,8 +99,8 @@ def test_dynamic_shapes_compilation(
# Clean up GPU memory # Clean up GPU memory
del model del model
gc.collect() gc.collect()
torch.cuda.empty_cache() torch.accelerator.empty_cache()
torch.cuda.synchronize() torch.accelerator.synchronize()
print("GPU memory cleared") print("GPU memory cleared")
......
...@@ -7,7 +7,7 @@ import pytest ...@@ -7,7 +7,7 @@ import pytest
import torch import torch
from torch.fx.experimental.proxy_tensor import make_fx from torch.fx.experimental.proxy_tensor import make_fx
from vllm.compilation.backends import split_graph from vllm.compilation.backends import _is_empty_allocation_node, split_graph
from vllm.compilation.passes.fx_utils import find_op_nodes from vllm.compilation.passes.fx_utils import find_op_nodes
# This import automatically registers `torch.ops.silly.attention` # This import automatically registers `torch.ops.silly.attention`
...@@ -184,3 +184,146 @@ def test_consecutive_ops_in_split(): ...@@ -184,3 +184,146 @@ def test_consecutive_ops_in_split():
assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [ assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [
"call_function" "call_function"
] + ["output"] ] + ["output"]
def _get_empty_nodes(split_item):
    """Return the nodes of ``split_item``'s graph that are empty allocations.

    A node qualifies when ``_is_empty_allocation_node`` returns a truthy value
    for it; graph order is preserved.
    """
    empty_nodes = []
    for node in split_item.graph.graph.nodes:
        if _is_empty_allocation_node(node):
            empty_nodes.append(node)
    return empty_nodes
def _subgraphs_with_empty_nodes(split_items, *, is_splitting_graph):
    """Select split items of one kind that contain an empty allocation node.

    Args:
        split_items: Iterable of split items, as returned by ``split_graph``.
        is_splitting_graph: Keep only items whose ``is_splitting_graph``
            attribute equals this value.

    Returns:
        The matching items, in their original order.
    """
    selected = []
    for split_item in split_items:
        same_kind = split_item.is_splitting_graph == is_splitting_graph
        # Only inspect the graph for items of the requested kind.
        if same_kind and _get_empty_nodes(split_item):
            selected.append(split_item)
    return selected
def test_empty_only_partition_stays_separate_after_splitting_predecessor():
    """
    An empty-only subgraph whose sole predecessor is a splitting-op subgraph
    must stay in its own partition instead of being merged.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        y = torch.sin(x)
        out = torch.empty_like(y)
        torch.ops.aten.cos.out(y, out=out)
        return out

    sample = torch.randn(4, 3)
    traced = make_fx(model_fn)(sample)

    split_gm, split_items = split_graph(traced, ["aten::sin", "aten::cos.out"])

    # Expected partitions for this pattern: [sin], [empty_like], [cos.out].
    assert len(split_items) == 3, (
        "Empty-only partition should not merge into splitting-op subgraph"
    )

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    # The split module must compute the same result as the traced graph.
    reference = traced(sample)
    after_split = split_gm(sample)
    assert torch.allclose(reference, after_split), "Output mismatch after split"
def test_empty_only_partition_is_merged():
    """
    When a non-splitting predecessor exists, an empty-only subgraph should be
    merged into it, and the merged empty node must not end up inside a
    splitting-op subgraph.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        base = x + 1
        y = torch.sin(base)
        out = torch.empty_like(base)
        torch.ops.aten.cos.out(base, out=out)
        return out + y

    sample = torch.randn(4, 3)
    traced = make_fx(model_fn)(sample)

    split_ops = ["aten::sin", "aten::cos.out"]
    split_gm, split_items = split_graph(traced, split_ops)

    # Expected partitions: [add, empty_like], [sin], [cos.out], [add].
    assert len(split_items) == 4, (
        "Empty-only partition should be merged into non-splitting predecessor"
    )

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    non_splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=False
    )
    assert len(non_splitting_with_empty) == 1, (
        "Exactly one non-splitting subgraph should contain the merged empty node"
    )
    merged_subgraph = non_splitting_with_empty[0]
    assert len(_get_empty_nodes(merged_subgraph)) == 1, (
        "Expected exactly one empty allocation node in merged subgraph"
    )

    # The split module must compute the same result as the traced graph.
    reference = traced(sample)
    after_split = split_gm(sample)
    assert torch.allclose(reference, after_split), "Output mismatch after split"
def test_builtin_empty_only_partition_is_merged():
    """
    Empty-only partitions must also be merged when ``torch.empty``/``empty_like``
    show up as builtin call targets (as in Dynamo graphs) rather than as aten
    OpOverloads. The graph here is produced with ``torch.fx.symbolic_trace``.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        hidden = x + 1
        out1 = torch.empty_like(hidden)
        torch.ops.silly.attention(hidden, hidden, hidden, out1)
        out2 = torch.empty_like(hidden)
        torch.ops.silly.attention(out1, out1, hidden, out2)
        return out2 + hidden

    traced = torch.fx.symbolic_trace(model_fn)
    split_gm, split_items = split_graph(traced, ["silly::attention"])

    # If empty-only partitions were not merged, the graph would split into:
    # [add, empty_like], [attention], [empty_like], [attention], [add].
    assert len(split_items) == 4, "Builtin empty-only partition should be merged"

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    non_splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=False
    )
    assert len(non_splitting_with_empty) == 1, (
        "Exactly one non-splitting subgraph should contain merged empty nodes"
    )
    merged_subgraph = non_splitting_with_empty[0]
    assert len(_get_empty_nodes(merged_subgraph)) == 2, (
        "Expected two builtin empty_like nodes in merged non-splitting subgraph"
    )

    # Run both modules on a CUDA tensor and compare the results.
    sample = torch.randn(2, 3, device="cuda")
    reference = traced(sample)
    after_split = split_gm(sample)
    assert torch.allclose(reference, after_split), "Output mismatch after split"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.compilation.passes.fusion.sequence_parallelism import (
SP_MIN_HIDDEN_SIZE,
SP_MIN_PER_GPU_SIZE_MB,
get_sequence_parallelism_threshold,
)
class TestGetSequenceParallelismThreshold:
    """Unit tests for ``get_sequence_parallelism_threshold``.

    Every test uses the ``mock_cuda_platform`` fixture as a context manager to
    control the reported platform and device capability.
    """

    def test_non_cuda_returns_none(self, mock_cuda_platform):
        """A non-CUDA platform yields no threshold."""
        with mock_cuda_platform(is_cuda=False):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
        assert threshold is None

    def test_unsupported_device_capability_returns_none(self, mock_cuda_platform):
        """An unsupported capability such as sm80 yields no threshold."""
        with mock_cuda_platform(capability=(8, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
        assert threshold is None

    def test_small_hidden_size_returns_none(self, mock_cuda_platform):
        """On H100, a hidden_size under the minimum yields no threshold."""
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=4096,  # 4096 < 8192
                tp_size=2,
                element_size=2,
            )
        assert threshold is None

    def test_h100_large_model_returns_threshold(self, mock_cuda_platform):
        """On H100, a sufficiently large hidden_size yields the computed value."""
        hidden_size = 8192
        tp_size = 2
        element_size = 2  # float16/bfloat16

        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=hidden_size,
                tp_size=tp_size,
                element_size=element_size,
            )

        # Verify calculation: (8 * 2 * 1024 * 1024) // (8192 * 2) = 1024
        bytes_per_mib = 1024 * 1024
        min_total_bytes = SP_MIN_PER_GPU_SIZE_MB[90] * tp_size * bytes_per_mib
        expected = int(min_total_bytes // (hidden_size * element_size))
        assert threshold == expected
        assert threshold == 1024

    @pytest.mark.parametrize(
        "hidden_size,tp_size,element_size,expected",
        [
            # Hidden size exactly at the minimum, tp_size=1:
            # (8 * 1 * 1024 * 1024) // (8192 * 2) = 512
            (8192, 1, 2, 512),
            # Doubling the hidden size halves the token threshold:
            # (8 * 1 * 1024 * 1024) // (16384 * 2) = 256
            (16384, 1, 2, 256),
            # A larger tp_size raises the token threshold:
            # (8 * 4 * 1024 * 1024) // (8192 * 2) = 2048
            (8192, 4, 2, 2048),
            # A larger element_size (fp32) lowers the token threshold:
            # (8 * 2 * 1024 * 1024) // (8192 * 4) = 512
            (8192, 2, 4, 512),
        ],
    )
    def test_threshold_calculation_variations(
        self, mock_cuda_platform, hidden_size, tp_size, element_size, expected
    ):
        """The threshold follows the formula across parameter combinations."""
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=hidden_size,
                tp_size=tp_size,
                element_size=element_size,
            )
        assert threshold == expected

    def test_hidden_size_boundary(self, mock_cuda_platform):
        """Behaviour flips exactly at the minimum hidden_size."""
        min_hidden_size = SP_MIN_HIDDEN_SIZE[90]

        with mock_cuda_platform(capability=(9, 0)):
            # One below the minimum: no threshold.
            below = get_sequence_parallelism_threshold(
                hidden_size=min_hidden_size - 1,
                tp_size=2,
                element_size=2,
            )
            assert below is None

            # Exactly at the minimum: a threshold is produced.
            at_minimum = get_sequence_parallelism_threshold(
                hidden_size=min_hidden_size,
                tp_size=2,
                element_size=2,
            )
            assert at_minimum is not None
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Cold start and warm start tests for vLLM-compile.
Cold start runs in a forked child (must fork before CUDA init) which
populates on-disk caches and asserts cold-start counters. Warm start
then runs in the parent with clean in-memory state but populated caches.
"""
import multiprocessing as mp
from torch._dynamo.utils import counters
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
MODEL = "microsoft/Phi-tiny-MoE-instruct"
def _run_vllm(vllm_runner):
    """Start and immediately tear down a vLLM runner for ``MODEL``.

    The runner is created with dummy weights, ``VLLM_COMPILE`` mode and CUDA
    graphs disabled. The ``with`` body is empty: entering and leaving the
    context is all this helper does.
    """
    compile_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,
    )
    runner_kwargs = dict(
        trust_remote_code=False,
        max_model_len=256,
        max_num_batched_tokens=1024,
        load_format="dummy",
        compilation_config=compile_config,
        num_gpu_blocks_override=8,
    )
    with vllm_runner(MODEL, **runner_kwargs):
        pass
def _cold_start(vllm_runner):
    """Run vLLM once against empty caches and assert the cold-start counters.

    Expects 3 compiled artifacts saved and none loaded, then checks the
    ``aot_autograd`` dynamo counters: 33 total, 3 cache misses, 0 cache hits.
    """
    counters.clear()

    expected_artifacts = dict(
        num_compiled_artifacts_saved=3,
        num_compiled_artifacts_loaded=0,
    )
    with compilation_counter.expect(**expected_artifacts):
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 33
    assert aot_stats["autograd_cache_miss"] == 3
    assert aot_stats["autograd_cache_hit"] == 0
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
    """Cold start in a forked child, then warm start in this process."""
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    # The cold start must run in a forked child, and the fork has to happen
    # before CUDA is initialized in this process.
    # The model has 32 identical transformer layers, which produce
    # 33 subgraphs after splitting on attention — only 3 are unique.
    fork_ctx = mp.get_context("fork")
    child = fork_ctx.Process(target=_cold_start, args=(vllm_runner,))
    child.start()
    child.join()
    assert child.exitcode == 0, "Cold-start child failed"

    # Warm start — compiled artifacts are loaded from the disk cache.
    counters.clear()
    expected_artifacts = dict(
        num_compiled_artifacts_loaded=3,
        num_compiled_artifacts_saved=0,
    )
    with compilation_counter.expect(**expected_artifacts):
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 30
    # On the warm path the aot_autograd cache should record neither a miss
    # nor a hit, i.e. no disk I/O is triggered at that level.
    assert aot_stats["autograd_cache_miss"] == 0
    assert aot_stats["autograd_cache_hit"] == 0
...@@ -109,9 +109,9 @@ def test_vllm_structured_logging_artifacts(use_fresh_inductor_cache): ...@@ -109,9 +109,9 @@ def test_vllm_structured_logging_artifacts(use_fresh_inductor_cache):
f"got {len(vllm_piecewise_split_graph)}" f"got {len(vllm_piecewise_split_graph)}"
) )
compile_start_artifacts = capture.get("artifact", "vllm_piecewise_compile_start") compile_start_artifacts = capture.get("artifact", "vllm_piecewise_compile_start")
assert len(compile_start_artifacts) == 2, ( assert len(compile_start_artifacts) == 4, (
"Expected 2 vllm_piecewise_compile_start " "Expected 4 vllm_piecewise_compile_start "
"(one for dynamic ranges, one for compile size), " "(2 subgraphs x 2 ranges each: dynamic + compile size), "
f"got {len(compile_start_artifacts)}" f"got {len(compile_start_artifacts)}"
) )
submod_dumps = capture.get("graph_dump", r"vllm_submod_.*") submod_dumps = capture.get("graph_dump", r"vllm_submod_.*")
......
...@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch): ...@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
f"Expected {expected1}, got {result1}" f"Expected {expected1}, got {result1}"
) )
# Second call should triger another compilation # Second call should trigger another compilation
x2 = torch.tensor([1, 2, 3]) x2 = torch.tensor([1, 2, 3])
result2 = wrapper(x2) result2 = wrapper(x2)
expected2 = torch.tensor([100, 200, 300]) expected2 = torch.tensor([100, 200, 300])
......
...@@ -78,3 +78,34 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch): ...@@ -78,3 +78,34 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
) )
ray.shutdown() ray.shutdown()
def test_unrecognized_env(monkeypatch):
    """Unknown ``VLLM_*`` variables raise only when validation is opted into."""
    import os

    from vllm.envs import environment_variables

    # Drop unrecognized VLLM_* variables already present in the environment so
    # they cannot interfere with the checks below.
    stale_names = [
        name
        for name in os.environ
        if name.startswith("VLLM_") and name not in environment_variables
    ]
    for name in stale_names:
        monkeypatch.delenv(name, raising=False)

    # With fail_on_environ_validation=True, an unrecognized vLLM environment
    # variable must make config creation fail.
    monkeypatch.setenv("VLLM_UNRECOGNIZED_ENV_VAR", "some_value")
    strict_args = EngineArgs(fail_on_environ_validation=True)
    with pytest.raises(ValueError, match="Unknown vLLM environment variable detected"):
        strict_args.create_engine_config()

    # With default arguments the same variable is tolerated.
    default_args = EngineArgs()
    default_args.create_engine_config()

    # Once the variable is gone, strict validation passes as well.
    monkeypatch.delenv("VLLM_UNRECOGNIZED_ENV_VAR")
    strict_args = EngineArgs(fail_on_environ_validation=True)
    strict_args.create_engine_config()
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
import pytest import pytest
from vllm.config.model import ModelConfig
from vllm.config.multimodal import MultiModalConfig from vllm.config.multimodal import MultiModalConfig
from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.registry import AttentionBackendEnum
...@@ -23,3 +24,20 @@ def test_mm_encoder_attn_backend_hash_updates(): ...@@ -23,3 +24,20 @@ def test_mm_encoder_attn_backend_hash_updates():
mm_encoder_attn_backend=AttentionBackendEnum.FLASH_ATTN mm_encoder_attn_backend=AttentionBackendEnum.FLASH_ATTN
).compute_hash() ).compute_hash()
assert base_hash != overridden_hash assert base_hash != overridden_hash
def test_language_model_only_does_not_affect_mm_hash():
    """The multimodal config hash must ignore ``language_model_only``.

    The flag does not affect the ViT computation graph, so toggling it should
    leave ``MultiModalConfig.compute_hash()`` unchanged.
    """
    default_hash = MultiModalConfig().compute_hash()
    flagged_hash = MultiModalConfig(language_model_only=True).compute_hash()

    assert default_hash == flagged_hash
def test_language_model_only_affects_model_hash():
    """The model config hash must depend on ``language_model_only``.

    The flag affects the LM computation graph, so toggling it should change
    ``ModelConfig.compute_hash()``.
    """
    model_name = "llava-hf/llava-1.5-7b-hf"

    default_hash = ModelConfig(model_name).compute_hash()
    flagged_hash = ModelConfig(model_name, language_model_only=True).compute_hash()

    assert default_hash != flagged_hash
...@@ -176,16 +176,20 @@ def init_test_http_connection(): ...@@ -176,16 +176,20 @@ def init_test_http_connection():
@pytest.fixture @pytest.fixture
def dist_init(): def dist_init():
from tests.utils import ensure_current_vllm_config
temp_file = tempfile.mkstemp()[1] temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1, with ensure_current_vllm_config():
rank=0, init_distributed_environment(
distributed_init_method=f"file://{temp_file}", world_size=1,
local_rank=0, rank=0,
backend="nccl", distributed_init_method=f"file://{temp_file}",
) local_rank=0,
initialize_model_parallel(1, 1) backend="nccl",
yield )
initialize_model_parallel(1, 1)
yield
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
...@@ -419,18 +423,16 @@ class HfRunner: ...@@ -419,18 +423,16 @@ class HfRunner:
self.tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast" = ( self.tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast" = (
AutoTokenizer.from_pretrained( AutoTokenizer.from_pretrained(
model_name, model_name,
dtype=dtype,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
) )
# don't put this import at the top level # don't put this import at the top level
# it will call torch.cuda.device_count() # it will call torch.accelerator.device_count()
from transformers import AutoProcessor from transformers import AutoProcessor
self.processor = AutoProcessor.from_pretrained( self.processor = AutoProcessor.from_pretrained(
model_name, model_name,
dtype=dtype,
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
) )
if skip_tokenizer_init: if skip_tokenizer_init:
...@@ -792,7 +794,6 @@ class VllmRunner: ...@@ -792,7 +794,6 @@ class VllmRunner:
tensor_parallel_size: int = 1, tensor_parallel_size: int = 1,
block_size: int = 16 if not torch.xpu.is_available() else 64, block_size: int = 16 if not torch.xpu.is_available() else 64,
enable_chunked_prefill: bool | None = False, enable_chunked_prefill: bool | None = False,
swap_space: int = 4,
enforce_eager: bool | None = False, enforce_eager: bool | None = False,
# Set this to avoid hanging issue # Set this to avoid hanging issue
default_torch_num_threads: int | None = None, default_torch_num_threads: int | None = None,
...@@ -829,7 +830,6 @@ class VllmRunner: ...@@ -829,7 +830,6 @@ class VllmRunner:
trust_remote_code=trust_remote_code, trust_remote_code=trust_remote_code,
dtype=dtype, dtype=dtype,
seed=seed, seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats, disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
...@@ -841,7 +841,10 @@ class VllmRunner: ...@@ -841,7 +841,10 @@ class VllmRunner:
def get_inputs( def get_inputs(
self, self,
prompts: list[str] | list[torch.Tensor] | list[list[int]], prompts: list[str]
| list[torch.Tensor]
| list[list[int]]
| list[dict[str, Any]],
images: PromptImageInput | None = None, images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None, videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None, audios: PromptAudioInput | None = None,
...@@ -855,26 +858,32 @@ class VllmRunner: ...@@ -855,26 +858,32 @@ class VllmRunner:
inputs = list[dict[str, Any]]() inputs = list[dict[str, Any]]()
for i, prompt in enumerate(prompts): for i, prompt in enumerate(prompts):
prompt_dict = dict[str, Any]() # If we're passing an encoder/decoder prompt, we assume it
if isinstance(prompt, str): # already contains the multimodal data in the prompt
prompt_dict["prompt"] = prompt if isinstance(prompt, dict):
elif isinstance(prompt, list): assert images is None and audios is None and videos is None
prompt_dict["prompt_token_ids"] = prompt inputs.append(prompt.copy())
else: else:
prompt_dict["prompt_embeds"] = prompt prompt_dict = dict[str, Any]()
if isinstance(prompt, str):
multi_modal_data = dict[str, Any]() prompt_dict["prompt"] = prompt
if images is not None and (image := images[i]) is not None: elif isinstance(prompt, list):
multi_modal_data["image"] = image prompt_dict["prompt_token_ids"] = prompt
if videos is not None and (video := videos[i]) is not None: else:
multi_modal_data["video"] = video prompt_dict["prompt_embeds"] = prompt
if audios is not None and (audio := audios[i]) is not None:
multi_modal_data["audio"] = audio multi_modal_data = dict[str, Any]()
if images is not None and (image := images[i]) is not None:
multi_modal_data["image"] = image
if videos is not None and (video := videos[i]) is not None:
multi_modal_data["video"] = video
if audios is not None and (audio := audios[i]) is not None:
multi_modal_data["audio"] = audio
if multi_modal_data: if multi_modal_data:
prompt_dict["multi_modal_data"] = multi_modal_data prompt_dict["multi_modal_data"] = multi_modal_data
inputs.append(prompt_dict) inputs.append(prompt_dict)
return inputs return inputs
...@@ -1138,6 +1147,15 @@ class VllmRunner: ...@@ -1138,6 +1147,15 @@ class VllmRunner:
return self return self
def __exit__(self, exc_type, exc_value, traceback): def __exit__(self, exc_type, exc_value, traceback):
# Explicitly shutdown the engine core to release GPU resources
# This is needed because when executing consecutive tests, the GC
# might not be fast enough in shutting down the llm engine. This can lead to OOMs
# because when the next test starts some GPU memory is still in use.
try:
self.llm.llm_engine.engine_core.shutdown()
except Exception:
# Ignore shutdown errors as cleanup will still proceed
pass
del self.llm del self.llm
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
...@@ -1517,7 +1535,7 @@ def clean_gpu_memory_between_tests(): ...@@ -1517,7 +1535,7 @@ def clean_gpu_memory_between_tests():
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
num_gpus = torch.cuda.device_count() num_gpus = torch.accelerator.device_count()
if num_gpus > 0: if num_gpus > 0:
try: try:
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
...@@ -1531,7 +1549,7 @@ def clean_gpu_memory_between_tests(): ...@@ -1531,7 +1549,7 @@ def clean_gpu_memory_between_tests():
# Clean up GPU memory after the test # Clean up GPU memory after the test
if torch.cuda.is_available(): if torch.cuda.is_available():
torch.cuda.empty_cache() torch.accelerator.empty_cache()
gc.collect() gc.collect()
...@@ -1546,6 +1564,14 @@ def use_fresh_inductor_cache(): ...@@ -1546,6 +1564,14 @@ def use_fresh_inductor_cache():
yield yield
@pytest.fixture
def fresh_vllm_cache(monkeypatch, use_fresh_inductor_cache):
    """Point ``VLLM_CACHE_ROOT`` at a throwaway directory for one test.

    Depends on ``use_fresh_inductor_cache`` so the inductor cache is fresh too.
    Yields the temporary directory path; the directory is removed when the
    fixture is torn down.
    """
    with tempfile.TemporaryDirectory() as cache_root:
        monkeypatch.setenv("VLLM_CACHE_ROOT", cache_root)
        yield cache_root
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
def enable_pickle(monkeypatch): def enable_pickle(monkeypatch):
"""`LLM.apply_model` requires pickling a function.""" """`LLM.apply_model` requires pickling a function."""
......
...@@ -14,7 +14,7 @@ import torch # noqa: E402 ...@@ -14,7 +14,7 @@ import torch # noqa: E402
from vllm.platforms import current_platform # noqa: F401, E402 from vllm.platforms import current_platform # noqa: F401, E402
os.environ["CUDA_VISIBLE_DEVICES"] = "0" os.environ["CUDA_VISIBLE_DEVICES"] = "0"
count = torch.cuda.device_count() count = torch.accelerator.device_count()
if count == 0: if count == 0:
sys.exit(0) # Skip: no GPUs available sys.exit(0) # Skip: no GPUs available
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for CUDA forward compatibility path logic in env_override.py.
Verifies the opt-in LD_LIBRARY_PATH manipulation for CUDA compat libs,
including env var parsing, path detection, and deduplication.
"""
import os
from unittest.mock import patch
import pytest
# Import the functions directly (they're module-level in env_override)
# We must import them without triggering the module-level side effects,
# so we import the functions by name after the module is already loaded.
from vllm.env_override import (
_get_torch_cuda_version,
_maybe_set_cuda_compatibility_path,
)
class TestCudaCompatibilityEnvParsing:
    """Parsing of the ``VLLM_ENABLE_CUDA_COMPATIBILITY`` environment variable."""

    def test_disabled_by_default(self, monkeypatch):
        """With the variable absent, no compat path is added."""
        for name in ("VLLM_ENABLE_CUDA_COMPATIBILITY", "LD_LIBRARY_PATH"):
            monkeypatch.delenv(name, raising=False)

        _maybe_set_cuda_compatibility_path()

        # LD_LIBRARY_PATH is either still unset or empty.
        assert os.environ.get("LD_LIBRARY_PATH", "") == ""

    @pytest.mark.parametrize("value", ["0", "false", "False", "no", ""])
    def test_disabled_values(self, monkeypatch, value):
        """Falsy settings must not activate the compat path."""
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        # LD_LIBRARY_PATH should stay unset or at least free of compat entries.
        library_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert "compat" not in library_path

    @pytest.mark.parametrize("value", ["1", "true", "True", " 1 ", " TRUE "])
    def test_enabled_values_with_valid_path(self, monkeypatch, tmp_path, value):
        """Truthy settings activate the compat path when the directory exists."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        compat_str = str(compat_dir)

        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", compat_str)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        library_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert compat_str in library_path
class TestCudaCompatibilityPathDetection:
    """Test path detection: custom override, conda, default.

    Each test enables the feature via ``VLLM_ENABLE_CUDA_COMPATIBILITY=1``,
    arranges which candidate directories exist, calls
    ``_maybe_set_cuda_compatibility_path()`` and inspects ``LD_LIBRARY_PATH``.
    """

    def test_custom_path_override(self, monkeypatch, tmp_path):
        """VLLM_CUDA_COMPATIBILITY_PATH takes highest priority."""
        custom_dir = tmp_path / "my-compat"
        custom_dir.mkdir()
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(custom_dir))
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert ld_path.startswith(str(custom_dir))

    def test_conda_prefix_fallback(self, monkeypatch, tmp_path):
        """Falls back to $CONDA_PREFIX/cuda-compat if custom not set."""
        conda_dir = tmp_path / "conda-env"
        compat_dir = conda_dir / "cuda-compat"
        compat_dir.mkdir(parents=True)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
        monkeypatch.setenv("CONDA_PREFIX", str(conda_dir))
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert str(compat_dir) in ld_path

    def test_no_valid_path_does_nothing(self, monkeypatch):
        """When enabled but no valid path exists, LD_LIBRARY_PATH unchanged."""
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", "/nonexistent/path")
        monkeypatch.delenv("CONDA_PREFIX", raising=False)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        # With no torch CUDA version, the default-path fallback has nothing
        # to derive a directory from.
        with patch("vllm.env_override._get_torch_cuda_version", return_value=None):
            _maybe_set_cuda_compatibility_path()

        assert os.environ.get("LD_LIBRARY_PATH", "") == ""

    def test_default_cuda_path_fallback(self, monkeypatch, tmp_path):
        """Falls back to /usr/local/cuda-{ver}/compat via torch version."""
        default_compat = "/usr/local/cuda-12.8/compat"
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
        monkeypatch.delenv("CONDA_PREFIX", raising=False)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        # Capture the real implementation BEFORE patching. The patch target
        # "vllm.env_override.os.path.isdir" resolves to the shared ``os.path``
        # module, so ``os.path.isdir`` looked up inside the fake would be the
        # mock itself and would recurse without bound for any other path.
        real_isdir = os.path.isdir

        def fake_isdir(path):
            # Pretend only the default compat directory exists; defer to the
            # real filesystem check for everything else.
            return path == default_compat or real_isdir(path)

        with (
            patch("vllm.env_override._get_torch_cuda_version", return_value="12.8"),
            patch("vllm.env_override.os.path.isdir", side_effect=fake_isdir),
        ):
            _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert default_compat in ld_path
class TestCudaCompatibilityLdPathManipulation:
    """Prepending and de-duplication of the compat entry in LD_LIBRARY_PATH."""

    def test_prepends_to_empty_ld_path(self, monkeypatch, tmp_path):
        """With LD_LIBRARY_PATH unset, it becomes exactly the compat path."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        compat_str = str(compat_dir)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", compat_str)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        assert os.environ["LD_LIBRARY_PATH"] == compat_str

    def test_prepends_to_existing_ld_path(self, monkeypatch, tmp_path):
        """The compat path goes first and existing entries are kept."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        compat_str = str(compat_dir)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", compat_str)
        monkeypatch.setenv("LD_LIBRARY_PATH", "/usr/lib:/other/lib")

        _maybe_set_cuda_compatibility_path()

        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
        assert entries[0] == compat_str
        assert "/usr/lib" in entries
        assert "/other/lib" in entries

    def test_deduplicates_existing_compat_path(self, monkeypatch, tmp_path):
        """A compat path already listed is moved to the front, not repeated."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        compat_str = str(compat_dir)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", compat_str)
        monkeypatch.setenv(
            "LD_LIBRARY_PATH",
            f"/usr/lib:{compat_dir}:/other/lib",
        )

        _maybe_set_cuda_compatibility_path()

        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
        assert entries[0] == compat_str
        assert entries.count(compat_str) == 1

    def test_already_at_front_is_noop(self, monkeypatch, tmp_path):
        """A compat path that is already first leaves LD_LIBRARY_PATH as is."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        initial_value = f"{compat_dir}:/usr/lib"
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir))
        monkeypatch.setenv("LD_LIBRARY_PATH", initial_value)

        _maybe_set_cuda_compatibility_path()

        assert os.environ["LD_LIBRARY_PATH"] == initial_value
class TestGetTorchCudaVersion:
    """Tests for the ``_get_torch_cuda_version()`` helper."""

    def test_returns_string_when_torch_available(self):
        """The helper returns either ``None`` or a string such as '12.8'."""
        # torch is installed in vllm's environment, but the reported value may
        # still be None, so both outcomes are accepted.
        cuda_version = _get_torch_cuda_version()
        assert cuda_version is None or isinstance(cuda_version, str)

    def test_returns_none_when_torch_missing(self):
        """With ``find_spec`` reporting no torch, the helper returns ``None``."""
        missing_torch = patch(
            "vllm.env_override.importlib.util.find_spec",
            return_value=None,
        )
        with missing_torch:
            assert _get_torch_cuda_version() is None
...@@ -7,7 +7,6 @@ from vllm.entrypoints.llm import LLM ...@@ -7,7 +7,6 @@ from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_computed_prefix_blocks(model: str): def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and # This test checks if the engine generates completions both with and
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment