Commit 3fb4b5fa authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.0' into v0.18.0-ori

parents bcf25339 89138b21
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import vllm.config
from tests.compile.backend import TestBackend
from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
from vllm._aiter_ops import is_aiter_found_and_supported, rocm_aiter_ops
from vllm.compilation.passes.fusion.matcher_utils import ROTARY_OP
from vllm.compilation.passes.fusion.rope_kvcache_fusion import RopeKVCacheFusionPass
from vllm.compilation.passes.utility.noop_elimination import NoOpEliminationPass
from vllm.compilation.passes.utility.post_cleanup import PostCleanupPass
from vllm.compilation.passes.utility.scatter_split_replace import (
ScatterSplitReplacementPass,
)
from vllm.compilation.passes.utility.split_coalescing import SplitCoalescingPass
from vllm.config import (
CacheConfig,
CompilationConfig,
CompilationMode,
ModelConfig,
PassConfig,
VllmConfig,
)
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.model_executor.layers.attention import Attention
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
from vllm.platforms import current_platform
from vllm.v1.attention.backend import (
AttentionBackend,
CommonAttentionMetadata,
)
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from vllm.v1.kv_cache_interface import AttentionSpec
INDEX_SELECT_OP = torch.ops.aten.index.Tensor
VLLM_UNIFIED_KV_CACHE_UPDATE_OP = torch.ops.vllm.unified_kv_cache_update
FP8_DTYPE = current_platform.fp8_dtype()
class QKRoPEKVCacheTestModel(torch.nn.Module):
    """Test module that applies RoPE to Q/K and then updates the KV cache.

    The forward pass stops after ``unified_kv_cache_update`` rather than
    running attention, so the traced graph only contains the ops that the
    RoPE + KV cache fusion pass is meant to match.
    """

    def __init__(
        self,
        vllm_config: VllmConfig,
        attn_backend: AttentionBackendEnum,
        num_heads: int,
        num_kv_heads: int,
        head_size: int,
        is_neox: bool,
        dtype: torch.dtype,
        device: torch.device,
        prefix: str = "model.layers.0.self_attn.attn",
    ):
        super().__init__()
        cache_config = vllm_config.cache_config

        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_size = head_size
        self.block_size = cache_config.block_size
        self.q_size = num_heads * head_size
        self.kv_size = num_kv_heads * head_size
        self.is_neox = is_neox
        self.dtype = dtype
        self.device = device
        self.layer_name = prefix

        self.rotary_emb = RotaryEmbedding(
            head_size,
            rotary_dim=head_size,
            max_position_embeddings=4096,
            base=10000,
            is_neox_style=is_neox,
            dtype=dtype,
        )
        # Decides whether the pre-fusion graph is checked for the RoPE custom
        # op or for the index_select op it is composed of.
        self.enable_rope_custom_op = self.rotary_emb.enabled()

        # The Attention layer registers the layer metadata that the fusion
        # pass looks up.
        self.attn = Attention(
            num_heads=num_heads,
            head_size=head_size,
            scale=1.0 / head_size**0.5,
            num_kv_heads=num_kv_heads,
            cache_config=cache_config,
            quant_config=vllm_config.quant_config,
            prefix=prefix,
            attn_backend=attn_backend.get_class(),
        )
        self.attn_backend: type[AttentionBackend] = self.attn.get_attn_backend()
        assert not self.attn_backend.forward_includes_kv_cache_update, (
            f"Attention backend {self.attn_backend} does not support fuse_rope_kvcache."
        )
        self.attn._k_scale = self.attn._k_scale.to(device)
        self.attn._v_scale = self.attn._v_scale.to(device)

        # Any "fp8*" cache dtype string maps to the platform FP8 dtype;
        # everything else reuses the model dtype.
        if cache_config.cache_dtype.startswith("fp8"):
            self.kv_cache_dtype = FP8_DTYPE
        else:
            self.kv_cache_dtype = self.dtype

        # Set up the attention metadata builder for this single layer.
        builder_cls = self.attn.attn_backend.get_builder_cls()
        kv_cache_spec = AttentionSpec(
            block_size=self.block_size,
            num_kv_heads=self.num_kv_heads,
            head_size=head_size,
            dtype=self.kv_cache_dtype,
        )
        self.builder = builder_cls(
            kv_cache_spec=kv_cache_spec,
            layer_names=[self.attn.layer_name],
            vllm_config=vllm_config,
            device=device,
        )

    def build_attn_metadata(self, batch_size: int) -> CommonAttentionMetadata:
        """Allocate a zeroed KV cache on the layer and build attention metadata.

        Every request in the batch has sequence length 1 and query length 1.
        """
        ones = [1] * batch_size
        batch_spec = BatchSpec(seq_lens=ones, query_lens=[1] * batch_size)
        common_attn_metadata = create_common_attn_metadata(
            batch_spec, self.block_size, self.device, arange_block_indices=True
        )

        # Ceil-divide the longest sequence by the block size.
        longest_seq = max(batch_spec.seq_lens)
        max_blocks = (longest_seq + self.block_size - 1) // self.block_size
        num_blocks = batch_size * max_blocks

        # Ask the backend for the logical cache shape and its stride order.
        backend = self.attn.attn_backend
        kv_cache_shape = backend.get_kv_cache_shape(
            num_blocks, self.block_size, self.num_kv_heads, self.head_size
        )
        try:
            kv_cache_stride_order = backend.get_kv_cache_stride_order()
        except (AttributeError, NotImplementedError):
            # Backend does not define a stride order: use the identity order.
            kv_cache_stride_order = tuple(range(len(kv_cache_shape)))

        kv_cache_shape = tuple(kv_cache_shape[dim] for dim in kv_cache_stride_order)
        inv_order = [
            kv_cache_stride_order.index(dim)
            for dim in range(len(kv_cache_stride_order))
        ]

        # Dummy KV cache: a flat zero buffer viewed in the permuted shape and
        # then permuted back with the inverse order.
        num_elements = (
            2 * num_blocks * self.block_size * self.num_kv_heads * self.head_size
        )
        flat_cache = torch.zeros(
            num_elements,
            dtype=self.kv_cache_dtype,
            device=self.device,
        )
        kv_cache = flat_cache.view(kv_cache_shape).permute(*inv_order)
        self.attn.kv_cache = [kv_cache]

        return self.builder.build(
            common_prefix_len=0, common_attn_metadata=common_attn_metadata
        )

    def forward(
        self, qkv: torch.Tensor, positions: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Split QKV, rotate Q/K and write K/V into the KV cache.

        Returns the per-head views of q, k and v together with the dummy
        dependency tensor returned by the cache update op.
        """
        # Work on a copy so in-place ops leave the caller's tensor untouched.
        qkv = qkv.clone()
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        q, k = self.rotary_emb(positions, q, k)

        # Only the KV cache update op is needed here, not a full attention
        # forward pass.
        q = q.view(-1, self.num_heads, self.head_size)
        k = k.view(-1, self.num_kv_heads, self.head_size)
        v = v.view(-1, self.num_kv_heads, self.head_size)
        kv_cache_dummy_dep = torch.ops.vllm.unified_kv_cache_update(
            k, v, self.layer_name
        )
        return q, k, v, kv_cache_dummy_dep

    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
        """Ops expected in the graph before the fusion pass runs."""
        if not self.enable_rope_custom_op:
            rope_op = INDEX_SELECT_OP
        elif rocm_aiter_ops.is_triton_rotary_embed_enabled():
            rope_op = torch.ops.vllm.rocm_aiter_triton_rotary_embedding.default
        else:
            rope_op = ROTARY_OP
        return [rope_op, torch.ops.vllm.unified_kv_cache_update.default]

    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
        """Ops expected in the graph after the fusion pass runs."""
        return [torch.ops.vllm.fused_rope_and_unified_kv_cache_update.default]
@pytest.mark.parametrize(
    "attn_backend",
    [
        AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
        AttentionBackendEnum.TRITON_ATTN,
        AttentionBackendEnum.ROCM_ATTN,
        AttentionBackendEnum.ROCM_AITER_FA,
    ],
)
# Only True is exercised; the [True, False] variant is currently disabled.
@pytest.mark.parametrize("enable_rope_custom_op", [True])
@pytest.mark.parametrize("enable_aiter_triton_rope", [True, False])
@pytest.mark.parametrize("num_heads", [64])
@pytest.mark.parametrize("num_kv_heads", [8])
@pytest.mark.parametrize("head_size", [64])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("is_neox", [True, False])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.skipif(
    not is_aiter_found_and_supported(),
    reason="Only test on ROCm with AITER installed and supported",
)
def test_rope_kvcache_fusion(
    attn_backend: AttentionBackendEnum,
    enable_rope_custom_op: bool,
    enable_aiter_triton_rope: bool,
    num_heads: int,
    num_kv_heads: int,
    head_size: int,
    block_size: int,
    is_neox: bool,
    dtype: torch.dtype,
    kv_cache_dtype: str,
    monkeypatch: pytest.MonkeyPatch,
):
    """Check that RoPE + KV cache update is fused and matches the eager result.

    The model is run once eagerly and once through ``torch.compile`` with the
    fusion pass pipeline; q/k/v outputs and the resulting KV caches of both
    runs are compared.
    """
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(0)

    custom_ops: list[str] = ["+rotary_embedding"] if enable_rope_custom_op else []

    pass_config = PassConfig(
        fuse_rope_kvcache=True,
        eliminate_noops=True,
    )
    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        custom_ops=custom_ops,
        pass_config=pass_config,
    )
    vllm_config = VllmConfig(
        model_config=ModelConfig(dtype=dtype),
        cache_config=CacheConfig(
            block_size=block_size,
            cache_dtype=kv_cache_dtype,
        ),
        compilation_config=compilation_config,
    )

    with vllm.config.set_current_vllm_config(vllm_config), monkeypatch.context() as m:
        # Enable AITER and select the Triton RoPE variant under test, then
        # make the ops wrapper re-read the environment.
        m.setenv("VLLM_ROCM_USE_AITER", "1")
        m.setenv(
            "VLLM_ROCM_USE_AITER_TRITON_ROPE", "1" if enable_aiter_triton_rope else "0"
        )
        rocm_aiter_ops.refresh_env_variables()

        model = QKRoPEKVCacheTestModel(
            vllm_config=vllm_config,
            attn_backend=attn_backend,
            num_heads=num_heads,
            num_kv_heads=num_kv_heads,
            head_size=head_size,
            is_neox=is_neox,
            dtype=dtype,
            device=torch.get_default_device(),
        )

        fusion_pass = RopeKVCacheFusionPass(vllm_config)
        backend = TestBackend(
            NoOpEliminationPass(vllm_config),
            SplitCoalescingPass(vllm_config),
            ScatterSplitReplacementPass(vllm_config),
            fusion_pass,
            PostCleanupPass(vllm_config),
        )

        num_tokens = 5
        qkv_width = num_heads * head_size + 2 * num_kv_heads * head_size
        qkv = torch.randn(num_tokens, qkv_width, dtype=dtype)
        pos = torch.arange(num_tokens, dtype=torch.long)

        # Reference run: eager model on copies of the inputs.
        qkv_unfused = qkv.clone()
        pos_unfused = pos.clone()
        with set_forward_context(None, vllm_config):
            forward_context = get_forward_context()
            attn_metadata = model.build_attn_metadata(num_tokens)
            forward_context.slot_mapping = {
                model.layer_name: attn_metadata.slot_mapping
            }
            q_unfused, k_unfused, v_unfused, dummy = model(qkv_unfused, pos_unfused)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
            kv_cache_unfused = attn_layer.kv_cache[forward_context.virtual_engine]
        del dummy

        # Compiled run: the token dimension is marked dynamic on both inputs.
        torch._dynamo.mark_dynamic(qkv, 0)
        torch._dynamo.mark_dynamic(pos, 0)
        with set_forward_context(None, vllm_config):
            model_fused = torch.compile(model, backend=backend)
            forward_context = get_forward_context()
            attn_metadata = model_fused.build_attn_metadata(num_tokens)
            forward_context.slot_mapping = {
                model.layer_name: attn_metadata.slot_mapping
            }
            q_fused, k_fused, v_fused, dummy = model_fused(qkv, pos)
            attn_layer = forward_context.no_compile_layers[model.layer_name]
            kv_cache_fused = attn_layer.kv_cache[forward_context.virtual_engine]
        del dummy

        # Exactly one match, and the graph ops changed as expected.
        assert fusion_pass.matched_count == 1
        backend.check_before_ops(model.ops_in_model_before())
        backend.check_after_ops(model.ops_in_model_after())

        atol, rtol = (2e-3, 2e-3) if dtype == torch.float16 else (1e-2, 1e-2)

        torch.testing.assert_close(q_unfused, q_fused, atol=atol, rtol=rtol)
        torch.testing.assert_close(k_unfused, k_fused, atol=atol, rtol=rtol)
        torch.testing.assert_close(v_unfused, v_fused, atol=atol, rtol=rtol)
        # fp8_* tensors cannot be compared directly, so both caches are viewed
        # as the model dtype first.
        torch.testing.assert_close(
            kv_cache_unfused.view(dtype),
            kv_cache_fused.view(dtype),
            atol=atol,
            rtol=rtol,
        )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
import torch.nn as nn
import vllm
from tests.compile.backend import TestBackend
from vllm.compilation.passes.utility.scatter_split_replace import (
ScatterSplitReplacementPass,
)
from vllm.compilation.passes.utility.split_coalescing import SplitCoalescingPass
from vllm.config import CompilationConfig, CompilationMode, VllmConfig
from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
class ScatterSplitReplacementModel(nn.Module):
    """Model producing a rope+getitem+slice_scatter+split_with_sizes sequence."""

    def __init__(
        self,
        num_heads: int,
        num_kv_heads: int,
        head_size: int,
        dtype: torch.dtype,
    ):
        super().__init__()
        # Widths of the Q part and of each K/V part in the packed QKV tensor.
        self.q_size = num_heads * head_size
        self.kv_size = num_kv_heads * head_size
        self.rotary_emb = RotaryEmbedding(
            head_size,
            rotary_dim=head_size,
            max_position_embeddings=4096,
            base=10000,
            is_neox_style=True,
            dtype=dtype,
        )

    def forward(self, qkv: torch.Tensor, positions: torch.Tensor):
        """Split QKV, rotate Q/K and return each part shifted by a constant."""
        # Clone first so in-place ops cannot modify the caller's tensor.
        qkv = qkv.clone()
        split_sizes = [self.q_size, self.kv_size, self.kv_size]
        q, k, v = qkv.split(split_sizes, dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        return q + 1, k + 2, v + 3

    def ops_in_model_before(self) -> list[torch._ops.OpOverload]:
        """Ops expected in the graph before the replacement pass."""
        aten = torch.ops.aten
        return [
            aten.slice_scatter.default,
            aten.split_with_sizes.default,
            aten.getitem.default,
        ]

    def ops_in_model_after(self) -> list[torch._ops.OpOverload]:
        """Ops expected in the graph after the replacement pass."""
        return [torch.ops.aten.getitem.default]
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scatter_split_replace(dtype):
    """Check that ScatterSplitReplacementPass removes slice_scatter.

    The model is run eagerly and through ``torch.compile`` with the
    split-coalescing and scatter-split-replacement passes. The outputs must
    match, and the compiled graph must contain no ``slice_scatter`` and
    exactly one ``split_with_sizes``.
    """
    torch.set_default_device("cuda")
    torch.set_default_dtype(dtype)
    torch.manual_seed(0)

    num_heads = 8
    num_kv_heads = 4
    head_size = 64

    vllm_config = VllmConfig(
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            custom_ops=["+rotary_embedding"],
        ),
    )

    with vllm.config.set_current_vllm_config(vllm_config):
        # ScatterSplitReplacementPass requires SplitCoalescingPass to be run before it
        coalesce_pass = SplitCoalescingPass(vllm_config)
        replace_pass = ScatterSplitReplacementPass(vllm_config)
        passes = [coalesce_pass, replace_pass]
        backend = TestBackend(*passes)

        model = ScatterSplitReplacementModel(num_heads, num_kv_heads, head_size, dtype)

        T = 5
        qkv = torch.randn(
            T, num_heads * head_size + 2 * num_kv_heads * head_size, dtype=dtype
        )
        pos = torch.arange(T, dtype=torch.long)

        # Eager reference on copies of the inputs.
        qkv_eager = qkv.clone()
        pos_eager = pos.clone()
        result_eager = model(qkv_eager, pos_eager)

        # Compiled run with the token dimension marked dynamic.
        torch._dynamo.mark_dynamic(qkv, 0)
        torch._dynamo.mark_dynamic(pos, 0)
        model_compiled = torch.compile(model, backend=backend)
        result_compiled = model_compiled(qkv, pos)

        # strict=True: a differing number of outputs must fail the test
        # instead of silently skipping the unmatched tensors.
        for eager, compiled in zip(result_eager, result_compiled, strict=True):
            torch.testing.assert_close(eager, compiled)

        assert backend.op_count(torch.ops.aten.slice_scatter.default) == 0
        assert backend.op_count(torch.ops.aten.split_with_sizes.default) == 1
......@@ -26,22 +26,14 @@ from vllm.config import (
VllmConfig,
set_current_vllm_config,
)
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.quantization.kernels.scaled_mm.cutlass import (
from vllm.model_executor.kernels.linear import (
CutlassFP8ScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.flashinfer import (
FlashInferFP8ScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.pytorch import (
FP8ScaledMMLinearKernel,
PerTensorTorchFP8ScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.rocm import (
ROCmFP8ScaledMMLinearKernel,
)
from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501
FP8ScaledMMLinearKernel,
)
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape,
......@@ -190,8 +182,24 @@ TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
"model_class, enable_quant_fp8_custom_op, force_kernel",
list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS))
+ [
(TestSiluMulNvfp4QuantModel, False, None),
(TestSiluMulGroupFp8QuantModel, False, None),
pytest.param(
TestSiluMulNvfp4QuantModel,
False,
None,
marks=pytest.mark.skipif(
not current_platform.is_cuda(), reason="CUDA only"
),
),
# GroupFP8Quant fusion only works with AITER on ROCm,
# and enable_quant_fp8_custom_op must be True.
pytest.param(
TestSiluMulGroupFp8QuantModel,
True,
None,
marks=pytest.mark.skipif(
not current_platform.is_rocm(), reason="ROCm only"
),
),
],
)
@pytest.mark.skipif(
......@@ -209,6 +217,7 @@ def test_fusion_silu_and_mul_quant(
enable_silu_mul_custom_op: bool,
enable_quant_fp8_custom_op: bool,
force_kernel: FP8ScaledMMLinearKernel | None,
monkeypatch: pytest.MonkeyPatch,
):
if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported():
pytest.skip("NVFP4 is not supported on this GPU.")
......@@ -235,13 +244,16 @@ def test_fusion_silu_and_mul_quant(
),
)
with set_current_vllm_config(config):
with set_current_vllm_config(config), monkeypatch.context() as m:
fusion_passes = [ActivationQuantFusionPass(config)]
if IS_AITER_FOUND:
if IS_AITER_FOUND and model_class is TestSiluMulGroupFp8QuantModel:
from vllm._aiter_ops import rocm_aiter_ops
from vllm.compilation.passes.fusion.rocm_aiter_fusion import (
RocmAiterSiluMulFp8GroupQuantFusionPass,
)
m.setenv("VLLM_ROCM_USE_AITER", "1")
rocm_aiter_ops.refresh_env_variables()
fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)]
passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)]
......
......@@ -4,6 +4,7 @@
import functools
import hashlib
import multiprocessing
import os
import pickle
import tempfile
from contextlib import contextmanager
......@@ -14,9 +15,12 @@ import pytest
import torch
import vllm.model_executor.layers.activation
from vllm.compilation.backends import VllmBackend
from vllm.compilation.caching import (
StandaloneCompiledArtifacts,
VllmSerializableFunction,
)
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (
CompilationConfig,
......@@ -156,6 +160,26 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
assert torch.allclose(ret, expected)
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
    """A graph containing a ``slice`` object must survive a serialize /
    deserialize round trip with its generated code unchanged."""

    def foo(x: torch.Tensor):
        return x[slice(0, x.shape[0])]

    vllm_config = make_vllm_config()

    example_input = torch.randn(10, 10)
    torch._dynamo.mark_dynamic(example_input, 0)

    gm = torch.fx.symbolic_trace(foo)
    # Make sure the traced code really contains a slice expression.
    assert "getitem_1 = x[slice(0, getitem, None)]" in gm.code

    with use_vllm_config(vllm_config):
        serializable = VllmSerializableFunction(gm, (example_input,), "", foo)
        payload = VllmSerializableFunction.serialize_compile_artifacts(serializable)
        fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
        assert gm.code == fn.graph_module.code
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_cache_load_returns_tuple_consistency(monkeypatch: pytest.MonkeyPatch):
"""
......@@ -700,3 +724,156 @@ class TestStandaloneCompiledArtifactsIntegration:
("mod3", "shape3"),
]:
assert cache.get(submod, shape) == shared_data
def test_functorch_config(self):
    """Check that the functorch config active when a VllmSerializableFunction
    is created is the one seen by the backend after a serialize/deserialize
    round trip, even if the ambient config differs."""
    vllm_config = make_vllm_config()
    example_inputs = (torch.randn(10, 10),)

    def add_1(x: torch.Tensor):
        return x + 1

    # Capture add_1 as a graph module via dynamo's export-style graph capture.
    gm = torch._dynamo.functional_export.dynamo_graph_capture_for_export(add_1)(
        *example_inputs
    )
    # Replace the codegen with a plain FX CodeGen and clear the dynamo
    # bytecode flatten/unflatten attributes.
    # NOTE(review): presumably needed so the module can be serialized — confirm.
    gm.graph._codegen = torch.fx.graph.CodeGen()
    gm._dynamo_bytecode_flatten = None
    gm._dynamo_bytecode_unflatten = None

    # Ambient context: bundled_autograd_cache=False.
    with (
        torch._functorch.config.patch(bundled_autograd_cache=False),
        set_current_vllm_config(vllm_config),
    ):
        # Only the construction of fn happens with bundled_autograd_cache=True.
        with torch._functorch.config.patch(bundled_autograd_cache=True):
            fn = VllmSerializableFunction(gm, example_inputs, "", add_1)

        payload = VllmSerializableFunction.serialize_compile_artifacts(fn)

        # Filled in by the stub backend below with the config it observes.
        config = None

        def backend(*args, **kwargs) -> VllmSerializableFunction:
            nonlocal config
            # bundled_autograd_cache should be True here even though the
            # compiler backend runs with bundled_autograd_cache=False in the
            # ambient context.
            config = torch._functorch.config.save_config_portable()
            return fn

        loaded_fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
        # Swap VllmBackend.__call__ for the stub so calling the loaded
        # function records the functorch config instead of compiling.
        with patch.object(VllmBackend, "__call__", backend):
            loaded_fn(*example_inputs)

        assert isinstance(config, dict)
        assert "bundled_autograd_cache" in config
        assert config["bundled_autograd_cache"] is True
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_disable_compile_cache_skips_aot_save(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be saved."""
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    args = (torch.randn(10, 10),)
    expected = reference_fn(*args)
    vllm_config = make_vllm_config()

    # One AOT compile is expected, with nothing saved and nothing loaded.
    expected_counters = {
        "num_aot_compiles": 1,
        "num_aot_artifacts_saved": 0,
        "num_aot_artifacts_loaded": 0,
    }
    with (
        use_vllm_config(vllm_config),
        compilation_counter.expect(**expected_counters),
    ):
        mod = CompiledMod(vllm_config=vllm_config)
        actual = mod(*args)
    assert torch.allclose(actual, expected)

    # No cached artifact should exist on disk
    aot_dir = os.path.join(fresh_vllm_cache, "torch_compile_cache", "torch_aot_compile")
    if not os.path.isdir(aot_dir):
        return
    for root, _dirs, files in os.walk(aot_dir):
        for f in files:
            assert f != "model", (
                f"AOT artifact unexpectedly saved at {os.path.join(root, f)}"
            )
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_disable_compile_cache_skips_aot_load(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """When VLLM_DISABLE_COMPILE_CACHE=1, AOT artifacts must not be loaded."""
    # Phase 1: with the cache enabled, compile once so an artifact is saved.
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    args = (torch.randn(10, 10),)
    vllm_config = make_vllm_config()
    with (
        use_vllm_config(vllm_config),
        compilation_counter.expect(num_aot_artifacts_saved=1),
    ):
        warmup_mod = CompiledMod(vllm_config=vllm_config)
        warmup_mod(*args)

    # Phase 2: disable the cache and compile again. The artifact saved in
    # phase 1 must NOT be loaded from disk.
    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
    disable_envs_cache()
    torch._dynamo.reset()

    vllm_config = make_vllm_config()
    no_cache_counters = {
        "num_aot_compiles": 1,
        "num_aot_artifacts_saved": 0,
        "num_aot_artifacts_loaded": 0,
    }
    with (
        use_vllm_config(vllm_config),
        compilation_counter.expect(**no_cache_counters),
    ):
        mod = CompiledMod(vllm_config=vllm_config)
        mod(*args)
        assert not mod.was_aot_compile_fn_loaded_from_disk
@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
def test_aot_counters_on_save_and_load(
    monkeypatch: pytest.MonkeyPatch, fresh_vllm_cache: str
):
    """Verify AOT counters are incremented correctly on save and load."""
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "1")
    disable_envs_cache()

    args = (torch.randn(10, 10),)

    # Phase 1: a fresh compile that saves one artifact and loads none.
    fresh_compile_counters = {
        "num_aot_compiles": 1,
        "num_aot_artifacts_saved": 1,
        "num_aot_artifacts_loaded": 0,
    }
    vllm_config = make_vllm_config()
    with (
        use_vllm_config(vllm_config),
        compilation_counter.expect(**fresh_compile_counters),
    ):
        CompiledMod(vllm_config=vllm_config)(*args)

    # Phase 2: force loading; no compile and no save, exactly one load.
    monkeypatch.setenv("VLLM_FORCE_AOT_LOAD", "1")
    disable_envs_cache()

    cache_load_counters = {
        "num_aot_compiles": 0,
        "num_aot_artifacts_saved": 0,
        "num_aot_artifacts_loaded": 1,
    }
    vllm_config = make_vllm_config()
    with (
        use_vllm_config(vllm_config),
        compilation_counter.expect(**cache_load_counters),
    ):
        CompiledMod(vllm_config=vllm_config)(*args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from torch._dynamo.utils import counters
from vllm import LLM
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
    """A cold start of a MoE model must compile only the unique subgraphs."""
    env_overrides = {
        # Run in the same process so PyTorch's internal counters are visible.
        "VLLM_ENABLE_V1_MULTIPROCESSING": "0",
        # Unclear whether this affects the numbers; pinned to "0" regardless.
        "VLLM_USE_AOT_COMPILE": "0",
        # Force cold compilation.
        "VLLM_DISABLE_COMPILE_CACHE": "1",
    }
    for name, value in env_overrides.items():
        monkeypatch.setenv(name, value)

    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
    )

    counters.clear()

    _ = LLM(
        model="microsoft/Phi-tiny-MoE-instruct",
        max_model_len=256,
        load_format="dummy",  # make the model loading faster
        compilation_config=compilation_config,
        num_gpu_blocks_override=8,  # make the model loading faster
    )

    # vLLM-compile cold start is special. By default there is one full dynamo
    # capture of the entire forward pass, which consists of 32 transformer
    # layers. The graph is then split on the attention operation, giving
    # 33 subgraphs (not counting the attention operation itself), and
    # compiled artifacts are generated for the unique subgraphs only.
    #
    # This model has only 3 unique subgraphs (all transformer layers are the
    # same modulo weights), which holds for most vLLM models. So during cold
    # start we expect to compile exactly 3 unique subgraphs.
    aot_autograd_counters = counters["aot_autograd"]
    assert aot_autograd_counters["autograd_cache_miss"] == 3
    assert aot_autograd_counters["autograd_cache_hit"] == 0
......@@ -73,6 +73,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
Range(start=16, end=16),
Range(start=9, end=32),
Range(start=64, end=64),
Range(start=128, end=128),
Range(start=33, end=8192),
]
)
......@@ -85,7 +86,7 @@ def test_compile_ranges(use_fresh_inductor_cache):
),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
compile_ranges_split_points=[8, 32],
compile_ranges_endpoints=[8, 32],
compile_sizes=[16, 64, 128],
inductor_compile_config={
"post_grad_custom_post_pass": post_grad_range_checker,
......@@ -95,21 +96,21 @@ def test_compile_ranges(use_fresh_inductor_cache):
with set_current_vllm_config(vllm_config):
model = TestModel(vllm_config=vllm_config, prefix="").eval()
# Number of compilations: 3 for each compile range + 2 compile sizes
# Number of compilations: 3 compile ranges + 3 compile sizes
batch_sizes = [1, 4, 16, 24, 48, 64, 8192]
with compilation_counter.expect(
num_graphs_seen=1,
num_piecewise_graphs_seen=1,
num_backend_compilations=5,
num_backend_compilations=6,
):
run_model(vllm_config, model, batch_sizes)
assert post_grad_range_checker.num_calls == 5
assert post_grad_range_checker.num_calls == 6
def test_compile_config_get_compile_ranges():
compilation_config = CompilationConfig(
compile_ranges_split_points=[8, 32],
compile_ranges_endpoints=[8, 32],
)
VllmConfig(
scheduler_config=SchedulerConfig(
......@@ -126,6 +127,88 @@ def test_compile_config_get_compile_ranges():
]
class PostGradStaticShapeChecker(InductorPass):
    """Post-grad pass that checks shape staticness per compile range.

    For a single-size compile range (a compile_sizes entry) every tensor in
    the graph must have fully concrete shapes. For any other range the pass
    counts the graph as dynamic once it finds a tensor with a symbolic
    dimension.
    """

    def __init__(self):
        # Graphs seen for single-size ranges / for ranges with symbolic shapes.
        self.num_static_calls = 0
        self.num_dynamic_calls = 0

    def __call__(self, graph: fx.Graph):
        from torch.fx.experimental.symbolic_shapes import is_symbolic

        compile_range = get_pass_context().compile_range
        is_single = compile_range.is_single_size()

        for node in graph.nodes:
            meta = node.meta
            # Prefer "val" and fall back to "example_value" when it is absent.
            val = meta.get("val")
            if val is None:
                val = meta.get("example_value")
            if not isinstance(val, torch.Tensor):
                continue

            has_symbolic = any(is_symbolic(dim) for dim in val.shape)
            if is_single:
                assert not has_symbolic, (
                    f"compile_sizes entry {compile_range}: "
                    f"node '{node.name}' has symbolic shape "
                    f"{val.shape}"
                )
            elif has_symbolic:
                # compile_ranges should have at least some symbolic shapes
                # (the batch dimension); the first one found is enough.
                self.num_dynamic_calls += 1
                return

        if is_single:
            self.num_static_calls += 1

    def uuid(self) -> str:
        """Return the hash of an empty state dict."""
        empty_state: dict[str, Any] = {}
        return InductorPass.hash_dict(empty_state)
def test_compile_sizes_produce_static_shapes(use_fresh_inductor_cache):
    """compile_sizes entries must be compiled with fully concrete shapes (no
    SymInts), while compile_ranges entries keep dynamic shapes."""
    checker = PostGradStaticShapeChecker()
    torch.set_default_device("cuda")

    scheduler_config = SchedulerConfig(
        max_num_batched_tokens=8192,
        max_model_len=8192,
        is_encoder_decoder=False,
    )
    compilation_config = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        compile_ranges_endpoints=[8],
        compile_sizes=[16],
        inductor_compile_config={
            "post_grad_custom_post_pass": checker,
        },
    )
    vllm_config = VllmConfig(
        scheduler_config=scheduler_config,
        compilation_config=compilation_config,
    )

    with set_current_vllm_config(vllm_config):
        model = TestModel(vllm_config=vllm_config, prefix="").eval()
        batch_sizes = [1, 16, 64]

        # Three compilations: Range(1,8), Range(9,8192) and single-size 16.
        with compilation_counter.expect(
            num_graphs_seen=1,
            num_piecewise_graphs_seen=1,
            num_backend_compilations=3,
        ):
            run_model(vllm_config, model, batch_sizes)

        # The compile_sizes=16 entry is the only static-shape compilation.
        assert checker.num_static_calls == 1, (
            f"Expected 1 static compilation, got {checker.num_static_calls}"
        )
        # Both compile ranges are dynamic-shape compilations.
        assert checker.num_dynamic_calls == 2, (
            f"Expected 2 dynamic compilations, got {checker.num_dynamic_calls}"
        )
def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
# To force multiple compilations, we disable the compile cache
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
......@@ -148,7 +231,7 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache):
scheduler_config=scheduler_config,
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
compile_ranges_split_points=[8],
compile_ranges_endpoints=[8],
inductor_compile_config={
"post_grad_custom_post_pass": post_grad_range_checker,
},
......
......@@ -421,6 +421,7 @@ def test_cudagraph_sizes_post_init(
fuse_norm_quant=True,
fuse_act_quant=True,
eliminate_noops=True,
sp_min_token_num=512 if enable_sp else None,
),
cudagraph_mode=cudagraph_mode,
)
......@@ -569,3 +570,45 @@ def test_compile_sizes_padding_validation():
assert sorted(config.compile_sizes) == [3, 5, 7]
dispatcher = CudagraphDispatcher(_create_vllm_config_for_validation(config))
dispatcher.initialize_cudagraph_keys(CUDAGraphMode.NONE) # Should not raise
@pytest.mark.parametrize(
    "capture_sizes, max_size, num_blocks, expected_sizes, expected_max",
    [
        # Normal capping: only sizes <= num_blocks are kept.
        (
            [1, 2, 4, 8, 16, 32, 64, 128, 256, 512],
            512,
            200,
            [1, 2, 4, 8, 16, 32, 64, 128],
            128,
        ),
        # num_blocks >= max: nothing needs capping.
        ([1, 2, 4, 8, 16], 16, 1000, [1, 2, 4, 8, 16], 16),
        # Exact boundary, num_blocks == max: still no capping.
        ([1, 2, 4, 8, 16, 32], 32, 32, [1, 2, 4, 8, 16, 32], 32),
        # num_blocks below the smallest size: every size is dropped.
        ([8, 16, 32], 32, 4, [], 0),
        # num_blocks <= 0: early return, config left unchanged.
        ([1, 2, 4], 4, 0, [1, 2, 4], 4),
    ],
)
def test_adjust_cudagraph_sizes_for_mamba_cache(
    capture_sizes, max_size, num_blocks, expected_sizes, expected_max
):
    """Cudagraph capture sizes must be capped so they fit the available
    Mamba cache blocks.

    See: https://github.com/vllm-project/vllm/issues/34094
    """
    config = CompilationConfig(
        cudagraph_capture_sizes=capture_sizes,
        max_cudagraph_capture_size=max_size,
        cudagraph_mode=CUDAGraphMode.NONE,
    )

    config.adjust_cudagraph_sizes_for_mamba_cache(num_blocks)

    assert config.cudagraph_capture_sizes == expected_sizes
    assert config.max_cudagraph_capture_size == expected_max
    if not expected_sizes:
        return
    # Invariant: the last capture size equals max_cudagraph_capture_size.
    assert config.cudagraph_capture_sizes[-1] == config.max_cudagraph_capture_size
......@@ -234,7 +234,7 @@ def test_conditional_compile_enable_if(use_inductor_graph_partition, monkeypatch
expected_num_backend_compilations = 4
# A has support_torch_compile but enable_if fn returns False
# enalbe_if will be True for B, so we expect mod1 and mod2
# enable_if will be True for B, so we expect mod1 and mod2
# to be compiled
with compilation_counter.expect(
num_graphs_seen=2,
......
......@@ -99,8 +99,8 @@ def test_dynamic_shapes_compilation(
# Clean up GPU memory
del model
gc.collect()
torch.cuda.empty_cache()
torch.cuda.synchronize()
torch.accelerator.empty_cache()
torch.accelerator.synchronize()
print("GPU memory cleared")
......
......@@ -7,7 +7,7 @@ import pytest
import torch
from torch.fx.experimental.proxy_tensor import make_fx
from vllm.compilation.backends import split_graph
from vllm.compilation.backends import _is_empty_allocation_node, split_graph
from vllm.compilation.passes.fx_utils import find_op_nodes
# This import automatically registers `torch.ops.silly.attention`
......@@ -184,3 +184,146 @@ def test_consecutive_ops_in_split():
assert [node.op for node in splitting_gm.graph.nodes] == ["placeholder"] + 2 * [
"call_function"
] + ["output"]
def _get_empty_nodes(split_item):
return [
node for node in split_item.graph.graph.nodes if _is_empty_allocation_node(node)
]
def _subgraphs_with_empty_nodes(split_items, *, is_splitting_graph):
return [
split_item
for split_item in split_items
if split_item.is_splitting_graph == is_splitting_graph
and _get_empty_nodes(split_item)
]
def test_empty_only_partition_stays_separate_after_splitting_predecessor():
    """
    An empty-only subgraph must stay on its own when its only predecessor is
    a splitting-op subgraph.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        y = torch.sin(x)
        out = torch.empty_like(y)
        torch.ops.aten.cos.out(y, out=out)
        return out

    x = torch.randn(4, 3)
    gm = make_fx(model_fn)(x)

    split_gm, split_items = split_graph(gm, ["aten::sin", "aten::cos.out"])

    # Expected partitioning for this pattern: [sin], [empty_like], [cos.out].
    num_partitions = len(split_items)
    assert num_partitions == 3, (
        "Empty-only partition should not merge into splitting-op subgraph"
    )

    splitting_with_empty = _subgraphs_with_empty_nodes(
        split_items, is_splitting_graph=True
    )
    assert len(splitting_with_empty) == 0, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in splitting_with_empty]}"
    )

    # The split graph must compute the same result as the original graph.
    output_original = gm(x)
    output_split = split_gm(x)
    assert torch.allclose(output_original, output_split), "Output mismatch after split"
def test_empty_only_partition_is_merged():
    """
    When a non-splitting predecessor exists, an empty-only subgraph is merged
    into it. The merged empty node must never land in a splitting-op subgraph.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        base = x + 1
        y = torch.sin(base)
        out = torch.empty_like(base)
        torch.ops.aten.cos.out(base, out=out)
        return out + y

    sample = torch.randn(4, 3)
    traced = make_fx(model_fn)(sample)
    partitioned, split_items = split_graph(traced, ["aten::sin", "aten::cos.out"])

    # Expected partitioning: [add, empty_like], [sin], [cos.out], [add].
    num_partitions = len(split_items)
    assert num_partitions == 4, (
        "Empty-only partition should be merged into non-splitting predecessor"
    )

    offending = _subgraphs_with_empty_nodes(split_items, is_splitting_graph=True)
    assert not offending, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in offending]}"
    )

    holders = _subgraphs_with_empty_nodes(split_items, is_splitting_graph=False)
    assert len(holders) == 1, (
        "Exactly one non-splitting subgraph should contain the merged empty node"
    )
    merged_empty_nodes = _get_empty_nodes(holders[0])
    assert len(merged_empty_nodes) == 1, (
        "Expected exactly one empty allocation node in merged subgraph"
    )

    # The split module must compute the same result as the traced original.
    reference = traced(sample)
    candidate = partitioned(sample)
    assert torch.allclose(reference, candidate), "Output mismatch after split"
def test_builtin_empty_only_partition_is_merged():
    """
    Dynamo graphs may carry torch.empty/empty_like as builtin call targets
    rather than aten OpOverloads. Empty-only partitions built from such
    nodes must still be merged.
    """

    def model_fn(x: torch.Tensor) -> torch.Tensor:
        hidden = x + 1
        out1 = torch.empty_like(hidden)
        torch.ops.silly.attention(hidden, hidden, hidden, out1)
        out2 = torch.empty_like(hidden)
        torch.ops.silly.attention(out1, out1, hidden, out2)
        return out2 + hidden

    traced = torch.fx.symbolic_trace(model_fn)
    partitioned, split_items = split_graph(traced, ["silly::attention"])

    # Without the empty-only merge the graph would split into:
    # [add, empty_like], [attention], [empty_like], [attention], [add].
    num_partitions = len(split_items)
    assert num_partitions == 4, "Builtin empty-only partition should be merged"

    offending = _subgraphs_with_empty_nodes(split_items, is_splitting_graph=True)
    assert not offending, (
        "Splitting-op subgraphs should not contain empty allocation nodes: "
        f"{[item.submod_name for item in offending]}"
    )

    holders = _subgraphs_with_empty_nodes(split_items, is_splitting_graph=False)
    assert len(holders) == 1, (
        "Exactly one non-splitting subgraph should contain merged empty nodes"
    )
    merged_empty_nodes = _get_empty_nodes(holders[0])
    assert len(merged_empty_nodes) == 2, (
        "Expected two builtin empty_like nodes in merged non-splitting subgraph"
    )

    # The split module must compute the same result as the traced original.
    sample = torch.randn(2, 3, device="cuda")
    reference = traced(sample)
    candidate = partitioned(sample)
    assert torch.allclose(reference, candidate), "Output mismatch after split"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.compilation.passes.fusion.sequence_parallelism import (
SP_MIN_HIDDEN_SIZE,
SP_MIN_PER_GPU_SIZE_MB,
get_sequence_parallelism_threshold,
)
class TestGetSequenceParallelismThreshold:
    """Tests for get_sequence_parallelism_threshold function."""

    def test_non_cuda_returns_none(self, mock_cuda_platform):
        """A platform that is not CUDA yields no threshold."""
        with mock_cuda_platform(is_cuda=False):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
        assert threshold is None

    def test_unsupported_device_capability_returns_none(self, mock_cuda_platform):
        """An unsupported device capability (e.g., sm80) yields no threshold."""
        with mock_cuda_platform(capability=(8, 0)):
            threshold = get_sequence_parallelism_threshold(
                hidden_size=8192, tp_size=2, element_size=2
            )
        assert threshold is None

    def test_small_hidden_size_returns_none(self, mock_cuda_platform):
        """H100 with a hidden_size under the minimum yields no threshold."""
        # 4096 < 8192, so the hidden size is below the minimum.
        too_small = {"hidden_size": 4096, "tp_size": 2, "element_size": 2}
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(**too_small)
        assert threshold is None

    def test_h100_large_model_returns_threshold(self, mock_cuda_platform):
        """H100 with a large enough hidden_size returns the computed threshold."""
        # element_size=2 corresponds to float16/bfloat16.
        shape = {"hidden_size": 8192, "tp_size": 2, "element_size": 2}
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(**shape)

        # Verify calculation: (8 * 2 * 1024 * 1024) // (8192 * 2) = 1024
        mib = 1024 * 1024
        min_total_bytes = SP_MIN_PER_GPU_SIZE_MB[90] * shape["tp_size"] * mib
        bytes_per_token = shape["hidden_size"] * shape["element_size"]
        expected = int(min_total_bytes // bytes_per_token)
        assert threshold == expected
        assert threshold == 1024

    @pytest.mark.parametrize(
        "hidden_size,tp_size,element_size,expected",
        [
            # Boundary: exactly at min hidden size threshold, tp_size=1
            # (8 * 1 * 1024 * 1024) // (8192 * 2) = 512
            (8192, 1, 2, 512),
            # Larger hidden size reduces token threshold
            # (8 * 1 * 1024 * 1024) // (16384 * 2) = 256
            (16384, 1, 2, 256),
            # Larger tp_size increases token threshold
            # (8 * 4 * 1024 * 1024) // (8192 * 2) = 2048
            (8192, 4, 2, 2048),
            # Larger element_size (fp32) reduces token threshold
            # (8 * 2 * 1024 * 1024) // (8192 * 4) = 512
            (8192, 2, 4, 512),
        ],
    )
    def test_threshold_calculation_variations(
        self, mock_cuda_platform, hidden_size, tp_size, element_size, expected
    ):
        """Check the computed threshold for several parameter combinations."""
        call_kwargs = {
            "hidden_size": hidden_size,
            "tp_size": tp_size,
            "element_size": element_size,
        }
        with mock_cuda_platform(capability=(9, 0)):
            threshold = get_sequence_parallelism_threshold(**call_kwargs)
        assert threshold == expected

    def test_hidden_size_boundary(self, mock_cuda_platform):
        """Check behavior right at the minimum hidden_size boundary."""
        boundary = SP_MIN_HIDDEN_SIZE[90]
        with mock_cuda_platform(capability=(9, 0)):
            # One below the minimum: no threshold.
            below = get_sequence_parallelism_threshold(
                hidden_size=boundary - 1,
                tp_size=2,
                element_size=2,
            )
            assert below is None

            # Exactly the minimum: a threshold is produced.
            at_boundary = get_sequence_parallelism_threshold(
                hidden_size=boundary,
                tp_size=2,
                element_size=2,
            )
            assert at_boundary is not None
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Cold start and warm start tests for vLLM-compile.
Cold start runs in a forked child (must fork before CUDA init) which
populates on-disk caches and asserts cold-start counters. Warm start
then runs in the parent with clean in-memory state but populated caches.
"""
import multiprocessing as mp
from torch._dynamo.utils import counters
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
MODEL = "microsoft/Phi-tiny-MoE-instruct"
def _run_vllm(vllm_runner):
    """Start and immediately tear down a vLLM runner for ``MODEL``.

    The runner uses dummy weights, VLLM_COMPILE mode and no CUDA graphs;
    nothing is generated, the context is only entered and exited.
    """
    compile_cfg = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,
    )
    runner_ctx = vllm_runner(
        MODEL,
        trust_remote_code=False,
        max_model_len=256,
        max_num_batched_tokens=1024,
        load_format="dummy",
        compilation_config=compile_cfg,
        num_gpu_blocks_override=8,
    )
    with runner_ctx:
        pass
def _cold_start(vllm_runner):
    """Run vLLM once with empty caches and check the cold-start counters.

    Expects three compiled artifacts to be saved and none loaded, then
    checks the AOT autograd totals, misses and hits recorded by Dynamo.
    """
    counters.clear()
    expected_artifacts = compilation_counter.expect(
        num_compiled_artifacts_saved=3,
        num_compiled_artifacts_loaded=0,
    )
    with expected_artifacts:
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 33
    assert aot_stats["autograd_cache_miss"] == 3
    assert aot_stats["autograd_cache_hit"] == 0
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
    """Check compile counters for a cold start followed by a warm start."""
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    # Cold start in a forked child (must fork before CUDA init).
    # This model has 32 identical transformer layers which produce
    # 33 subgraphs after splitting on attention — only 3 are unique.
    fork_ctx = mp.get_context("fork")
    child = fork_ctx.Process(target=_cold_start, args=(vllm_runner,))
    child.start()
    child.join()
    assert child.exitcode == 0, "Cold-start child failed"

    # Warm start — compiled artifacts loaded from disk cache.
    counters.clear()
    expected_artifacts = compilation_counter.expect(
        num_compiled_artifacts_loaded=3,
        num_compiled_artifacts_saved=0,
    )
    with expected_artifacts:
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 30
    assert aot_stats["autograd_cache_miss"] == 0
    # No miss at aot_autograd level causing disk I/O.
    assert aot_stats["autograd_cache_hit"] == 0
......@@ -109,9 +109,9 @@ def test_vllm_structured_logging_artifacts(use_fresh_inductor_cache):
f"got {len(vllm_piecewise_split_graph)}"
)
compile_start_artifacts = capture.get("artifact", "vllm_piecewise_compile_start")
assert len(compile_start_artifacts) == 2, (
"Expected 2 vllm_piecewise_compile_start "
"(one for dynamic ranges, one for compile size), "
assert len(compile_start_artifacts) == 4, (
"Expected 4 vllm_piecewise_compile_start "
"(2 subgraphs x 2 ranges each: dynamic + compile size), "
f"got {len(compile_start_artifacts)}"
)
submod_dumps = capture.get("graph_dump", r"vllm_submod_.*")
......
......@@ -95,7 +95,7 @@ def test_torch_compile_wrapper(use_bytecode_hook, monkeypatch):
f"Expected {expected1}, got {result1}"
)
# Second call should triger another compilation
# Second call should trigger another compilation
x2 = torch.tensor([1, 2, 3])
result2 = wrapper(x2)
expected2 = torch.tensor([100, 200, 300])
......
......@@ -78,3 +78,34 @@ def test_ray_runtime_env(monkeypatch: pytest.MonkeyPatch):
)
ray.shutdown()
def test_unrecognized_env(monkeypatch):
    """Unknown VLLM_* variables only fail config creation when validation is on."""
    import os

    from vllm.envs import environment_variables

    # Drop pre-existing unrecognized VLLM_* variables so they cannot interfere.
    stale_names = [
        name
        for name in os.environ
        if name.startswith("VLLM_") and name not in environment_variables
    ]
    for name in stale_names:
        monkeypatch.delenv(name, raising=False)

    # With fail_on_environ_validation=True, an unrecognized vLLM environment
    # variable must raise.
    monkeypatch.setenv("VLLM_UNRECOGNIZED_ENV_VAR", "some_value")
    strict_args = EngineArgs(fail_on_environ_validation=True)
    with pytest.raises(ValueError, match="Unknown vLLM environment variable detected"):
        strict_args.create_engine_config()

    # With the default arguments, the same variable is tolerated.
    lenient_args = EngineArgs()
    lenient_args.create_engine_config()

    # Once the variable is gone, strict validation passes as well.
    monkeypatch.delenv("VLLM_UNRECOGNIZED_ENV_VAR")
    strict_args = EngineArgs(fail_on_environ_validation=True)
    strict_args.create_engine_config()
......@@ -3,6 +3,7 @@
import pytest
from vllm.config.model import ModelConfig
from vllm.config.multimodal import MultiModalConfig
from vllm.v1.attention.backends.registry import AttentionBackendEnum
......@@ -23,3 +24,20 @@ def test_mm_encoder_attn_backend_hash_updates():
mm_encoder_attn_backend=AttentionBackendEnum.FLASH_ATTN
).compute_hash()
assert base_hash != overridden_hash
def test_language_model_only_does_not_affect_mm_hash():
    """The multimodal config hash must ignore language_model_only.

    The flag does not affect the ViT computation graph, so toggling it
    should leave the multimodal hash unchanged.
    """
    default_cfg = MultiModalConfig()
    lm_only_cfg = MultiModalConfig(language_model_only=True)
    assert default_cfg.compute_hash() == lm_only_cfg.compute_hash()
def test_language_model_only_affects_model_hash():
    """The model config hash must change with language_model_only.

    The flag affects the LM computation graph, so toggling it should
    produce a different model hash.
    """
    model = "llava-hf/llava-1.5-7b-hf"
    default_cfg = ModelConfig(model)
    lm_only_cfg = ModelConfig(model, language_model_only=True)
    assert default_cfg.compute_hash() != lm_only_cfg.compute_hash()
......@@ -176,16 +176,20 @@ def init_test_http_connection():
@pytest.fixture
def dist_init():
from tests.utils import ensure_current_vllm_config
temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)
initialize_model_parallel(1, 1)
yield
with ensure_current_vllm_config():
init_distributed_environment(
world_size=1,
rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend="nccl",
)
initialize_model_parallel(1, 1)
yield
cleanup_dist_env_and_memory()
......@@ -419,18 +423,16 @@ class HfRunner:
self.tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast" = (
AutoTokenizer.from_pretrained(
model_name,
dtype=dtype,
trust_remote_code=trust_remote_code,
)
)
# don't put this import at the top level
# it will call torch.cuda.device_count()
# it will call torch.accelerator.device_count()
from transformers import AutoProcessor
self.processor = AutoProcessor.from_pretrained(
model_name,
dtype=dtype,
trust_remote_code=trust_remote_code,
)
if skip_tokenizer_init:
......@@ -792,7 +794,6 @@ class VllmRunner:
tensor_parallel_size: int = 1,
block_size: int = 16 if not torch.xpu.is_available() else 64,
enable_chunked_prefill: bool | None = False,
swap_space: int = 4,
enforce_eager: bool | None = False,
# Set this to avoid hanging issue
default_torch_num_threads: int | None = None,
......@@ -829,7 +830,6 @@ class VllmRunner:
trust_remote_code=trust_remote_code,
dtype=dtype,
seed=seed,
swap_space=swap_space,
enforce_eager=enforce_eager,
disable_log_stats=disable_log_stats,
tensor_parallel_size=tensor_parallel_size,
......@@ -841,7 +841,10 @@ class VllmRunner:
def get_inputs(
self,
prompts: list[str] | list[torch.Tensor] | list[list[int]],
prompts: list[str]
| list[torch.Tensor]
| list[list[int]]
| list[dict[str, Any]],
images: PromptImageInput | None = None,
videos: PromptVideoInput | None = None,
audios: PromptAudioInput | None = None,
......@@ -855,26 +858,32 @@ class VllmRunner:
inputs = list[dict[str, Any]]()
for i, prompt in enumerate(prompts):
prompt_dict = dict[str, Any]()
if isinstance(prompt, str):
prompt_dict["prompt"] = prompt
elif isinstance(prompt, list):
prompt_dict["prompt_token_ids"] = prompt
# If we're passing an encoder/decoder prompt, we assume it
# already contains the multimodal data in the prompt
if isinstance(prompt, dict):
assert images is None and audios is None and videos is None
inputs.append(prompt.copy())
else:
prompt_dict["prompt_embeds"] = prompt
multi_modal_data = dict[str, Any]()
if images is not None and (image := images[i]) is not None:
multi_modal_data["image"] = image
if videos is not None and (video := videos[i]) is not None:
multi_modal_data["video"] = video
if audios is not None and (audio := audios[i]) is not None:
multi_modal_data["audio"] = audio
prompt_dict = dict[str, Any]()
if isinstance(prompt, str):
prompt_dict["prompt"] = prompt
elif isinstance(prompt, list):
prompt_dict["prompt_token_ids"] = prompt
else:
prompt_dict["prompt_embeds"] = prompt
multi_modal_data = dict[str, Any]()
if images is not None and (image := images[i]) is not None:
multi_modal_data["image"] = image
if videos is not None and (video := videos[i]) is not None:
multi_modal_data["video"] = video
if audios is not None and (audio := audios[i]) is not None:
multi_modal_data["audio"] = audio
if multi_modal_data:
prompt_dict["multi_modal_data"] = multi_modal_data
if multi_modal_data:
prompt_dict["multi_modal_data"] = multi_modal_data
inputs.append(prompt_dict)
inputs.append(prompt_dict)
return inputs
......@@ -1138,6 +1147,15 @@ class VllmRunner:
return self
def __exit__(self, exc_type, exc_value, traceback):
    """Shut down the engine core, drop the LLM and clean up on context exit."""
    # Shut the engine core down explicitly so GPU resources are released
    # right away. With consecutive tests the GC may be too slow to tear the
    # llm engine down, leaving GPU memory in use when the next test starts
    # and leading to OOMs.
    try:
        engine_core = self.llm.llm_engine.engine_core
        engine_core.shutdown()
    except Exception:
        # Best effort only: the cleanup below still runs if shutdown fails.
        pass

    del self.llm
    cleanup_dist_env_and_memory()
......@@ -1517,7 +1535,7 @@ def clean_gpu_memory_between_tests():
from tests.utils import wait_for_gpu_memory_to_clear
num_gpus = torch.cuda.device_count()
num_gpus = torch.accelerator.device_count()
if num_gpus > 0:
try:
wait_for_gpu_memory_to_clear(
......@@ -1531,7 +1549,7 @@ def clean_gpu_memory_between_tests():
# Clean up GPU memory after the test
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.accelerator.empty_cache()
gc.collect()
......@@ -1546,6 +1564,14 @@ def use_fresh_inductor_cache():
yield
@pytest.fixture
def fresh_vllm_cache(monkeypatch, use_fresh_inductor_cache):
    """Point VLLM_CACHE_ROOT at a temporary directory for one test.

    Depends on ``use_fresh_inductor_cache`` so the inductor cache is fresh
    too. Yields the temporary directory path, which is removed afterwards.
    """
    with tempfile.TemporaryDirectory() as cache_root:
        monkeypatch.setenv("VLLM_CACHE_ROOT", cache_root)
        yield cache_root
@pytest.fixture(scope="function")
def enable_pickle(monkeypatch):
"""`LLM.apply_model` requires pickling a function."""
......
......@@ -14,7 +14,7 @@ import torch # noqa: E402
from vllm.platforms import current_platform # noqa: F401, E402
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
count = torch.cuda.device_count()
count = torch.accelerator.device_count()
if count == 0:
sys.exit(0) # Skip: no GPUs available
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for CUDA forward compatibility path logic in env_override.py.
Verifies the opt-in LD_LIBRARY_PATH manipulation for CUDA compat libs,
including env var parsing, path detection, and deduplication.
"""
import os
from unittest.mock import patch
import pytest
# Import the functions directly (they're module-level in env_override)
# We must import them without triggering the module-level side effects,
# so we import the functions by name after the module is already loaded.
from vllm.env_override import (
_get_torch_cuda_version,
_maybe_set_cuda_compatibility_path,
)
class TestCudaCompatibilityEnvParsing:
    """Test VLLM_ENABLE_CUDA_COMPATIBILITY env var parsing."""

    def test_disabled_by_default(self, monkeypatch):
        """Without the enable variable, no compat path is added."""
        for name in ("VLLM_ENABLE_CUDA_COMPATIBILITY", "LD_LIBRARY_PATH"):
            monkeypatch.delenv(name, raising=False)

        _maybe_set_cuda_compatibility_path()

        # LD_LIBRARY_PATH must be either absent or empty.
        assert not os.environ.get("LD_LIBRARY_PATH")

    @pytest.mark.parametrize("value", ["0", "false", "False", "no", ""])
    def test_disabled_values(self, monkeypatch, value):
        """Falsy spellings of the enable variable leave the compat path off."""
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        # LD_LIBRARY_PATH should not be set (or remain empty).
        current_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert "compat" not in current_ld_path

    @pytest.mark.parametrize("value", ["1", "true", "True", " 1 ", " TRUE "])
    def test_enabled_values_with_valid_path(self, monkeypatch, tmp_path, value):
        """Truthy spellings enable the compat path when the directory exists."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        compat_str = str(compat_dir)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", value)
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", compat_str)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        current_ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert compat_str in current_ld_path
class TestCudaCompatibilityPathDetection:
    """Test path detection: custom override, conda, default."""

    def test_custom_path_override(self, monkeypatch, tmp_path):
        """VLLM_CUDA_COMPATIBILITY_PATH takes highest priority."""
        custom_dir = tmp_path / "my-compat"
        custom_dir.mkdir()
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(custom_dir))
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert ld_path.startswith(str(custom_dir))

    def test_conda_prefix_fallback(self, monkeypatch, tmp_path):
        """Falls back to $CONDA_PREFIX/cuda-compat if custom not set."""
        conda_dir = tmp_path / "conda-env"
        compat_dir = conda_dir / "cuda-compat"
        compat_dir.mkdir(parents=True)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
        monkeypatch.setenv("CONDA_PREFIX", str(conda_dir))
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert str(compat_dir) in ld_path

    def test_no_valid_path_does_nothing(self, monkeypatch):
        """When enabled but no valid path exists, LD_LIBRARY_PATH unchanged."""
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", "/nonexistent/path")
        monkeypatch.delenv("CONDA_PREFIX", raising=False)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        # No torch CUDA version means no default path can be derived either.
        with patch("vllm.env_override._get_torch_cuda_version", return_value=None):
            _maybe_set_cuda_compatibility_path()

        assert os.environ.get("LD_LIBRARY_PATH", "") == ""

    def test_default_cuda_path_fallback(self, monkeypatch, tmp_path):
        """Falls back to /usr/local/cuda-{ver}/compat via torch version."""
        default_compat = "/usr/local/cuda-12.8/compat"
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.delenv("VLLM_CUDA_COMPATIBILITY_PATH", raising=False)
        monkeypatch.delenv("CONDA_PREFIX", raising=False)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        # Capture the real implementation BEFORE patching. The patch target
        # "vllm.env_override.os.path.isdir" resolves through env_override's
        # `os` attribute (presumably the stdlib module), so it replaces the
        # shared os.path.isdir. Looking it up again inside the side effect
        # would hit the mock and recurse for any path other than the default.
        real_isdir = os.path.isdir

        def fake_isdir(path):
            # Pretend only the default compat directory exists; defer to the
            # real filesystem check for everything else.
            return path == default_compat or real_isdir(path)

        with (
            patch("vllm.env_override._get_torch_cuda_version", return_value="12.8"),
            patch("vllm.env_override.os.path.isdir", side_effect=fake_isdir),
        ):
            _maybe_set_cuda_compatibility_path()

        ld_path = os.environ.get("LD_LIBRARY_PATH", "")
        assert default_compat in ld_path
class TestCudaCompatibilityLdPathManipulation:
    """Test LD_LIBRARY_PATH prepend and deduplication logic."""

    def test_prepends_to_empty_ld_path(self, monkeypatch, tmp_path):
        """With no LD_LIBRARY_PATH, the compat path becomes its whole value."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        compat_str = str(compat_dir)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", compat_str)
        monkeypatch.delenv("LD_LIBRARY_PATH", raising=False)

        _maybe_set_cuda_compatibility_path()

        assert os.environ["LD_LIBRARY_PATH"] == compat_str

    def test_prepends_to_existing_ld_path(self, monkeypatch, tmp_path):
        """The compat path goes in front of entries that were already there."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        compat_str = str(compat_dir)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", compat_str)
        monkeypatch.setenv("LD_LIBRARY_PATH", "/usr/lib:/other/lib")

        _maybe_set_cuda_compatibility_path()

        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
        assert entries[0] == compat_str
        assert "/usr/lib" in entries
        assert "/other/lib" in entries

    def test_deduplicates_existing_compat_path(self, monkeypatch, tmp_path):
        """A compat path already listed is moved to the front, not duplicated."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        compat_str = str(compat_dir)
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", compat_str)
        monkeypatch.setenv(
            "LD_LIBRARY_PATH",
            f"/usr/lib:{compat_dir}:/other/lib",
        )

        _maybe_set_cuda_compatibility_path()

        entries = os.environ["LD_LIBRARY_PATH"].split(os.pathsep)
        assert entries[0] == compat_str
        assert entries.count(compat_str) == 1

    def test_already_at_front_is_noop(self, monkeypatch, tmp_path):
        """A compat path that is already first leaves LD_LIBRARY_PATH as is."""
        compat_dir = tmp_path / "compat"
        compat_dir.mkdir()
        original = f"{compat_dir}:/usr/lib"
        monkeypatch.setenv("VLLM_ENABLE_CUDA_COMPATIBILITY", "1")
        monkeypatch.setenv("VLLM_CUDA_COMPATIBILITY_PATH", str(compat_dir))
        monkeypatch.setenv("LD_LIBRARY_PATH", original)

        _maybe_set_cuda_compatibility_path()

        assert os.environ["LD_LIBRARY_PATH"] == original
class TestGetTorchCudaVersion:
    """Test _get_torch_cuda_version() helper."""

    def test_returns_string_when_torch_available(self):
        """The helper returns either None or a version string like '12.8'."""
        detected = _get_torch_cuda_version()
        # torch is installed in vllm's environment
        assert detected is None or isinstance(detected, str)

    def test_returns_none_when_torch_missing(self):
        """The helper returns None when torch cannot be found."""
        # Simulate a missing torch by making find_spec report no spec.
        no_spec = patch(
            "vllm.env_override.importlib.util.find_spec",
            return_value=None,
        )
        with no_spec:
            assert _get_torch_cuda_version() is None
......@@ -7,7 +7,6 @@ from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment