Commit 38d80967 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

parents 33650733 880c741b
......@@ -4,9 +4,9 @@
Test (piecewise) compilation with a simple model where multiple submodules
are compiled and graph captured separately.
"""
import torch
from torch import nn
from torch.library import Library
from vllm.compilation.backends import set_model_tag
from vllm.compilation.counter import compilation_counter
......@@ -15,10 +15,9 @@ from vllm.compilation.decorators import (ignore_torch_compile,
from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
VllmConfig, set_current_vllm_config)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import direct_register_custom_op
# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT") # noqa
# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
BATCH_SIZE = 32
MLP_SIZE = 128
......@@ -26,27 +25,6 @@ HIDDEN_SIZE = 1024
RANDOM_SEED = 0
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
out.copy_(q)
out += k
out += v
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
return
direct_register_custom_op(
op_name="attention",
op_func=silly_attention,
mutates_args=["out"],
fake_impl=silly_attention_fake,
target_lib=silly_lib,
)
@support_torch_compile
class ParentModel(nn.Module):
......@@ -134,7 +112,7 @@ class SimpleModelWithTwoGraphs(ParentModel):
# Test will fail without set_model_tag here with error:
# "ValueError: too many values to unpack (expected 3)"
# This is because CompiledAttention and CompiledAttentionTwo
# have different implmentations but the same torch.compile
# have different implementations but the same torch.compile
# cache dir will be used as default prefix is 'model_tag'
with set_model_tag("attn_one"):
self.attn_one = CompiledAttention(
......
......@@ -4,10 +4,10 @@
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
"""
import pytest
import torch
from torch import nn
from torch.library import Library
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
......@@ -15,35 +15,9 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
VllmConfig, set_current_vllm_config)
from vllm.envs import VLLM_USE_V1
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import direct_register_custom_op
global_counter = 0
# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT") # noqa
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
global global_counter
global_counter += 1
print(f"{global_counter=}")
out.copy_(q)
out[0] += 1
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
return
direct_register_custom_op(
op_name="attention",
op_func=silly_attention,
mutates_args=["out"],
fake_impl=silly_attention_fake,
target_lib=silly_lib,
)
# This import automatically registers `torch.ops.silly.attention`
from ..silly_attention import get_global_counter, reset_global_counter
@support_torch_compile
......@@ -59,8 +33,7 @@ class SillyModel(nn.Module):
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Overall effect:
x += 1
x[0] += 2
x = 3 * x + 19
global_counter += 2
"""
x = x + 1
......@@ -78,6 +51,7 @@ class SillyModel(nn.Module):
@pytest.mark.parametrize("use_inductor", [True, False])
@torch.inference_mode()
def test_simple_piecewise_compile(use_inductor):
assert VLLM_USE_V1
......@@ -121,13 +95,12 @@ def test_simple_piecewise_compile(use_inductor):
model(torch.randn(1).cuda())
input = torch.zeros(2).cuda()
global global_counter
global_counter = 0
reset_global_counter()
with set_forward_context(
None,
vllm_config=vllm_config,
cudagraph_runtime_mode=CUDAGraphMode.PIECEWISE,
batch_descriptor=BatchDescriptor(num_tokens=2, )):
output = model(input)
assert global_counter == 2
assert torch.allclose(output.cpu(), torch.tensor([3., 1.]))
assert get_global_counter() == 2
assert torch.allclose(output.cpu(), torch.tensor([19.0, 19.0]))
......@@ -14,38 +14,15 @@ from typing import Any, Optional
import pytest
import torch
from torch import nn
from torch.library import Library
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import support_torch_compile
from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
VllmConfig, set_current_vllm_config)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import direct_register_custom_op
# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT") # noqa
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
out.copy_(q)
out += k
out += v
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
return
direct_register_custom_op(
op_name="attention",
op_func=silly_attention,
mutates_args=["out"],
fake_impl=silly_attention_fake,
target_lib=silly_lib,
)
# This import automatically registers `torch.ops.silly.attention`
from .. import silly_attention # noqa: F401
@dataclass
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Shared PyTorch custom silly attention for compilation tests.
Centralizes custom operation definitions to avoid duplicate registrations.
"""
import torch
from torch.library import Library
from vllm.utils import direct_register_custom_op
# Shared library for all compilation test operations
# Using "silly" namespace to match existing test expectations
# import this file will automatically register
# torch ops for testing (like silly.attention)
silly_lib = Library("silly", "FRAGMENT")
# Global counter that counts the number of times attention is invoked
_global_counter = 0
def get_global_counter():
"""Get the current global counter value"""
return _global_counter
def reset_global_counter():
"""Reset the global counter to 0"""
global _global_counter
_global_counter = 0
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
"""
Unified attention implementation that depends on
all inputs and affects the output.
Always increments a global counter that tests can use or ignore.
"""
global _global_counter
# Always increment the global counter
_global_counter += 1
# Unified implementation that depends on all inputs
out.copy_(q + k + v)
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
"""Fake implementation for testing"""
return
# Register the unified attention operation
direct_register_custom_op(
op_name="attention",
op_func=silly_attention,
mutates_args=["out"],
fake_impl=silly_attention_fake,
target_lib=silly_lib,
)
......@@ -23,7 +23,7 @@ class TestSetting:
fullgraph: bool
# we cannot afford testing the full Catesian product
# we cannot afford testing the full Cartesian product
# of all models and all levels
@pytest.mark.parametrize(
"test_setting",
......@@ -62,8 +62,12 @@ class TestSetting:
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model_args=[
"--runner", "pooling", "--dtype", "bfloat16",
"--max-model-len", "2048"
"--runner",
"pooling",
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
],
pp_size=1,
tp_size=1,
......@@ -71,17 +75,15 @@ class TestSetting:
method="encode",
fullgraph=True,
),
# TODO: bert models are not supported in V1 yet
# # encoder-based embedding model (BERT)
# TestSetting(
# model="BAAI/bge-base-en-v1.5",
# model_args=["--runner", "pooling"],
# pp_size=1,
# tp_size=1,
# attn_backend="XFORMERS",
# method="encode",
# fullgraph=True,
# ),
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--runner", "pooling"],
pp_size=1,
tp_size=1,
attn_backend="FLASH_ATTN",
method="encode",
fullgraph=True,
),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
......@@ -92,7 +94,8 @@ class TestSetting:
method="generate_with_image",
fullgraph=False,
),
])
],
)
def test_compile_correctness(
monkeypatch: pytest.MonkeyPatch,
test_setting: TestSetting,
......
......@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from torch import nn
from torch.library import Library
from vllm.compilation.counter import compilation_counter
from vllm.compilation.decorators import (ignore_torch_compile,
......@@ -10,36 +9,14 @@ from vllm.compilation.decorators import (ignore_torch_compile,
from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
CUDAGraphMode, VllmConfig, set_current_vllm_config)
from vllm.forward_context import BatchDescriptor, set_forward_context
from vllm.utils import direct_register_custom_op
# create a library to hold the custom op
silly_lib = Library("silly", "FRAGMENT") # noqa
# This import automatically registers `torch.ops.silly.attention`
from . import silly_attention # noqa: F401
BATCH_SIZE = 32
MLP_SIZE = 128
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
out.copy_(q)
out += k
out += v
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
out: torch.Tensor) -> None:
return
direct_register_custom_op(
op_name="attention",
op_func=silly_attention,
mutates_args=["out"],
fake_impl=silly_attention_fake,
target_lib=silly_lib,
)
@torch.inference_mode
def run_model(vllm_config: VllmConfig, model: nn.Module,
cudagraph_runtime_mode: CUDAGraphMode):
......@@ -151,7 +128,7 @@ def test_ignore_torch_compile_decorator():
run_model(vllm_config, mod_C, cudagraph_runtime_mode)
# Only enable torch.compile if
# Only enable torch.compile if
# vllm_config.cache_config.kv_sharing_fast_prefill=True
@support_torch_compile(enable_if=lambda vllm_config: vllm_config.cache_config.
kv_sharing_fast_prefill)
......@@ -173,7 +150,7 @@ class B(nn.Module):
return x
# Only enable torch.compile if
# Only enable torch.compile if
# vllm_config.cache_config.kv_sharing_fast_prefill=False
@support_torch_compile(enable_if=lambda vllm_config: not vllm_config.
cache_config.kv_sharing_fast_prefill)
......
......@@ -15,9 +15,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, QuantKey, ScaleDesc)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
Fp8LinearOp, maybe_create_device_identity)
Fp8LinearOp, cutlass_fp8_supported, maybe_create_device_identity)
from vllm.platforms import current_platform
from ..utils import override_cutlass_fp8_supported
from .backend import TestBackend
FP8_DTYPE = current_platform.fp8_dtype()
......@@ -26,9 +27,9 @@ FP8_DTYPE = current_platform.fp8_dtype()
class TestModel(torch.nn.Module):
def __init__(self, hidden_size: int, eps: float, static: bool,
force_fp8_e4m3fnuz: bool, *args, **kwargs):
cuda_force_torch: bool, *args, **kwargs):
super().__init__(*args, **kwargs)
self.force_fp8_e4m3fnuz = force_fp8_e4m3fnuz
self.cuda_force_torch = cuda_force_torch
self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
......@@ -42,11 +43,12 @@ class TestModel(torch.nn.Module):
torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
for _ in range(2)
]
self.fp8_linear = Fp8LinearOp(
force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
act_quant_static=static,
act_quant_group_shape=group_shape,
)
with override_cutlass_fp8_supported(not cuda_force_torch):
self.fp8_linear = Fp8LinearOp(
act_quant_static=static,
act_quant_group_shape=group_shape,
)
def forward(self, x):
resid = torch.sqrt(x)
......@@ -81,11 +83,14 @@ class TestModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049])
@pytest.mark.parametrize("eps", [1e-5, 1e-6])
@pytest.mark.parametrize("static", [True, False])
@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
# cuda_force_torch used to test torch code path on platforms that
# cutlass_fp8_supported() == True.
@pytest.mark.parametrize("cuda_force_torch",
[True, False] if cutlass_fp8_supported() else [True])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
reason="Only test on CUDA and ROCm")
def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
force_fp8_e4m3fnuz):
cuda_force_torch):
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
torch.manual_seed(1)
......@@ -102,7 +107,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
fusion_pass = FusionPass.instance(vllm_config)
backend = TestBackend(noop_pass, fusion_pass)
model = TestModel(hidden_size, eps, static, force_fp8_e4m3fnuz)
model = TestModel(hidden_size, eps, static, cuda_force_torch)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size)
......
......@@ -40,13 +40,12 @@ backend_unfused: Optional[TestBackend] = None
@pytest.mark.parametrize(
"model, quant_key",
[("amd/Llama-3.1-8B-Instruct-FP8-KV", kFp8StaticTensorSym)])
@pytest.mark.parametrize(
"use_triton_fa", [True, False] if current_platform.is_rocm() else [False])
@pytest.mark.parametrize("use_triton_fa", [True, False])
@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Only test CUDA and ROCm")
def test_attention_fusion(example_prompts, monkeypatch, model: str,
quant_key: QuantKey, use_triton_fa: bool):
@pytest.mark.skipif(not current_platform.is_rocm(),
reason="V0 attn quant fusion only on ROCm")
def test_attention_fusion_v0(example_prompts, monkeypatch, model: str,
quant_key: QuantKey, use_triton_fa: bool):
# Clean Dynamo cache to avoid reusing other test cases
# (for some reason the reset at the end is not enough)
torch._dynamo.reset()
......@@ -69,13 +68,17 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
backend="tests.compile.test_fusion_attn.backend_unfused",
custom_ops=["+quant_fp8"],
)
vllm_config = VllmConfig(compilation_config=compile_config)
vllm_config = VllmConfig(compilation_config=compile_config,
model_config=ModelConfig(
model=model,
dtype=torch.bfloat16,
))
backend_unfused = TestBackend(NoOpEliminationPass(vllm_config))
llm = LLM(model,
enforce_eager=True,
compilation_config=compile_config,
gpu_memory_utilization=0.9,
gpu_memory_utilization=0.5,
max_model_len=2048)
sampling_params = SamplingParams(temperature=0.0,
......@@ -93,7 +96,11 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
backend="tests.compile.test_fusion_attn.backend",
custom_ops=["+quant_fp8"],
)
vllm_config = VllmConfig(compilation_config=compile_config)
vllm_config = VllmConfig(compilation_config=compile_config,
model_config=ModelConfig(
model=model,
dtype=torch.bfloat16,
))
# AttnFusionPass needs attention layers to be registered in config upon init
# so we initialize it during compilation.
......@@ -102,7 +109,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
llm2 = LLM(model,
enforce_eager=True,
compilation_config=compile_config,
gpu_memory_utilization=0.9,
gpu_memory_utilization=0.5,
max_model_len=2048)
# check support
......@@ -171,6 +178,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
cache_config=vllm_config.cache_config,
prefix="model.layers.0.self_attn.attn",
)
self.attn._k_scale = self.attn._k_scale.to(device)
self.attn._v_scale = self.attn._v_scale.to(device)
self.block_size = 16
......@@ -188,7 +197,7 @@ class AttentionQuantPatternModel(torch.nn.Module):
device=self.device,
)
def build_attn_metadata(self, batch_size: int):
def build_attn_metadata(self, batch_size: int, use_hnd: bool):
"""Initialize attention metadata."""
# Create common attn metadata
......@@ -205,10 +214,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
num_blocks = batch_size * max_blocks
# Create dummy KV cache for FlashInfer TRTLLM
# - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
# - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
# Create kv_cache in HND layout and permute to NHD layout
# (later will be permuted back to HND layout in forward pass)
# - NHD: [num_blocks, block_size, num_kv_heads, head_size]
# - HND: [num_blocks, num_kv_heads, block_size, head_size]
kv_cache = torch.zeros(num_blocks,
2,
self.num_kv_heads,
......@@ -216,7 +223,17 @@ class AttentionQuantPatternModel(torch.nn.Module):
self.head_size,
dtype=self.kv_cache_dtype,
device=self.device)
kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
if current_platform.is_rocm():
# k/v as 1st dimention
if use_hnd:
kv_cache = kv_cache.permute(1, 0, 2, 3, 4)
else:
kv_cache = kv_cache.permute(1, 0, 3, 2, 4)
else:
# k/v as 2nd dimention
# Create kv_cache in HND layout and permute to NHD layout
# (later will be permuted back to HND layout in forward pass)
kv_cache = kv_cache.permute(0, 1, 3, 2, 4)
self.attn.kv_cache = [kv_cache]
# Build attn metadata
......@@ -296,28 +313,51 @@ class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
out_dtype=attn_output.dtype)
@pytest.mark.parametrize("num_qo_heads, num_kv_heads", [(64, 8), (40, 8)])
if current_platform.is_cuda():
MODELS = [("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
TestAttentionFp8StaticQuantPatternModel),
("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
TestAttentionNvfp4QuantPatternModel)]
HEADS = [(64, 8), (40, 8)]
elif current_platform.is_rocm():
MODELS = [("amd/Llama-3.1-8B-Instruct-FP8-KV",
TestAttentionFp8StaticQuantPatternModel)]
HEADS = [(32, 8), (40, 8)]
else:
MODELS = []
HEADS = []
@pytest.mark.parametrize("num_qo_heads, num_kv_heads", HEADS)
@pytest.mark.parametrize("head_size", [128])
@pytest.mark.parametrize("batch_size", [7, 256, 533])
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("model_name, model_class",
[("nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
TestAttentionFp8StaticQuantPatternModel),
("nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
TestAttentionNvfp4QuantPatternModel)])
@pytest.mark.parametrize("backend", [_Backend.FLASHINFER])
@pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")
@pytest.mark.parametrize("batch_size",
[7, 256, 533] if current_platform.is_cuda() else [8])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("model_name, model_class", MODELS)
@pytest.mark.parametrize("backend", [_Backend.FLASHINFER] if
current_platform.is_cuda() else [_Backend.ROCM_FLASH])
@pytest.mark.parametrize(
"split_attention",
[False, True] if current_platform.is_rocm() else [False])
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Only test ROCm or CUDA")
@pytest.mark.skipif(not current_platform.supports_fp8(), reason="Need FP8")
@pytest.mark.skipif(not current_platform.is_device_capability((10, 0)),
reason="Only test on SM100(Blackwell)")
@pytest.mark.skipif(current_platform.is_cuda()
and not current_platform.is_device_capability((10, 0)),
reason="On CUDA only test on SM100(Blackwell)")
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Only test ROCm or CUDA")
def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
head_size: int, batch_size: int,
dtype: torch.dtype, model_name: str,
model_class: type[AttentionQuantPatternModel],
backend: _Backend, monkeypatch, dist_init):
backend: _Backend, split_attention: bool,
monkeypatch, dist_init):
"""Test AttentionStaticQuantPattern fusion pass"""
monkeypatch.setenv("VLLM_USE_V1", "1")
if split_attention:
monkeypatch.setenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "1")
device = torch.device("cuda:0")
torch.manual_seed(42)
......@@ -326,6 +366,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
model_config=ModelConfig(
model=model_name,
max_model_len=2048,
dtype=dtype,
),
scheduler_config=SchedulerConfig(max_num_seqs=1024),
compilation_config=CompilationConfig(
......@@ -368,7 +409,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
forward_ctx = get_forward_context()
forward_ctx.attn_metadata = model_unfused.build_attn_metadata(
batch_size)
batch_size, use_hnd=split_attention)
# Run model directly without compilation and fusion
result_unfused = model_unfused(q, k, v)
......@@ -389,7 +430,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
model_fused = model_fused.to(device)
forward_ctx = get_forward_context()
forward_ctx.attn_metadata = model_fused.build_attn_metadata(batch_size)
forward_ctx.attn_metadata = model_fused.build_attn_metadata(
batch_size, use_hnd=split_attention)
# Create test backend with fusion passes enabled
noop_pass = NoOpEliminationPass(vllm_config)
......@@ -404,12 +446,19 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
assert model_compiled.attn._o_scale_float is None
result_fused_1 = model_compiled(q, k, v)
# After the 1st round of the forward pass, output quant scale should be
# loaded into the attn layer's _o_scale_float, the 2nd round should
# reuse the loaded _o_scale_float
assert model_compiled.attn._o_scale_float is not None
result_fused_2 = model_compiled(q, k, v)
assert model_compiled.attn._o_scale_float is not None
if backend == _Backend.FLASHINFER:
# With the Flashinfer backend after the 1st round of the forward
# pass, output quant scale should be loaded into the attn layer's
# _o_scale_float, the 2nd round should reuse the loaded
# _o_scale_float
assert model_compiled.attn._o_scale_float is not None
result_fused_2 = model_compiled(q, k, v)
assert model_compiled.attn._o_scale_float is not None
torch.testing.assert_close(result_unfused,
result_fused_2,
atol=1e-2,
rtol=1e-2)
# Check attn fusion support
quant_key = model_class.quant_key
......@@ -444,12 +493,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
assert attn_nodes_post[0].kwargs.get("output_block_scale") is not None, \
"Attention should have output_block_scale after FP4 fusion" # noqa: E501
# Check that results are closed
# Check that results are close
torch.testing.assert_close(result_unfused,
result_fused_1,
atol=1e-2,
rtol=1e-2)
torch.testing.assert_close(result_unfused,
result_fused_2,
atol=1e-2,
rtol=1e-2)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import cast
import pytest
import torch
import vllm.envs as envs
from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
# yapf conflicts with isort for this block
# yapf: disable
......@@ -17,9 +20,10 @@ from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.quantization.utils.quant_utils import (
GroupShape, kFp8StaticTensorSym, kNvfp4Quant)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
Fp8LinearOp)
Fp8LinearOp, cutlass_fp8_supported)
from vllm.platforms import current_platform
from ..utils import override_cutlass_fp8_supported
from .backend import TestBackend
FP8_DTYPE = current_platform.fp8_dtype()
......@@ -32,7 +36,7 @@ def is_nvfp4_supported():
class TestSiluMulFp8QuantModel(torch.nn.Module):
def __init__(self, hidden_size: int, force_fp8_e4m3fnuz: bool, **kwargs):
def __init__(self, hidden_size: int, cuda_force_torch: bool, **kwargs):
super().__init__()
self.silu_and_mul = SiluAndMul()
self.wscale = torch.rand(1, dtype=torch.float32)
......@@ -40,11 +44,11 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
self.fp8_linear = Fp8LinearOp(
force_fp8_e4m3fnuz=force_fp8_e4m3fnuz,
act_quant_static=True,
act_quant_group_shape=GroupShape.PER_TENSOR,
)
with override_cutlass_fp8_supported(not cuda_force_torch):
self.fp8_linear = Fp8LinearOp(
act_quant_static=True,
act_quant_group_shape=GroupShape.PER_TENSOR,
)
def forward(self, x):
y = self.silu_and_mul(x)
......@@ -63,24 +67,27 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
class TestSiluMulNvfp4QuantModel(torch.nn.Module):
def __init__(self, hidden_size: int, **kwargs):
def __init__(self, hidden_size: int, x: torch.Tensor, **kwargs):
super().__init__()
self.silu_and_mul = SiluAndMul()
self.w = torch.randint(256, (hidden_size, hidden_size // 2),
dtype=FP4_DTYPE)
self.wscale = torch.randn(hidden_size,
hidden_size // 16).to(dtype=FP8_DTYPE)
self.wscale2 = torch.rand(1, dtype=torch.float32)
self.scale = torch.rand(1, dtype=torch.float32)
# create nvfp4 weight
w = torch.rand((hidden_size, hidden_size))
self.w, self.w_block_scale, self.w_global_scale = quant_nvfp4_tensor(w)
# get global scale offline
_, _, self.y_global_scale = quant_nvfp4_tensor(self.silu_and_mul(x))
self.alpha = 1.0 / (self.w_global_scale * self.y_global_scale)
def forward(self, x):
y = self.silu_and_mul(x)
y_quant, y_block_scale = scaled_fp4_quant(y, 1 / self.scale)
y_quant, y_block_scale = scaled_fp4_quant(y, self.y_global_scale)
out = cutlass_scaled_fp4_mm(a=y_quant,
b=self.w,
block_scale_a=y_block_scale,
block_scale_b=self.wscale,
alpha=self.scale * self.wscale2,
block_scale_b=self.w_block_scale,
alpha=self.alpha,
out_dtype=y.dtype)
return out
......@@ -94,19 +101,25 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
@pytest.mark.parametrize("num_tokens", [64])
@pytest.mark.parametrize("hidden_size", [128])
@pytest.mark.parametrize(
"model_class", [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
if is_nvfp4_supported() else [TestSiluMulFp8QuantModel])
@pytest.mark.parametrize("force_fp8_e4m3fnuz", [True, False])
"model_class",
cast(list[type], [TestSiluMulFp8QuantModel, TestSiluMulNvfp4QuantModel]
if is_nvfp4_supported() else [TestSiluMulFp8QuantModel]))
# cuda_force_torch used to test torch code path on platforms that
# cutlass_fp8_supported() == True.
@pytest.mark.parametrize("cuda_force_torch",
[True, False] if cutlass_fp8_supported() else [True])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"],
reason="Only test on CUDA and ROCm")
def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
force_fp8_e4m3fnuz):
if model_class == TestSiluMulNvfp4QuantModel and force_fp8_e4m3fnuz:
cuda_force_torch):
if model_class == TestSiluMulNvfp4QuantModel and cuda_force_torch:
pytest.skip("Duplicate tests for NVFP4")
torch.set_default_device("cuda")
torch.set_default_dtype(torch.float16)
x = torch.rand(num_tokens, hidden_size * 2)
# Reshape pass is needed for the fusion pass to work
config = VllmConfig()
config.compilation_config = CompilationConfig(
......@@ -115,10 +128,10 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
backend = TestBackend(NoOpEliminationPass(config), fusion_pass)
model = model_class(hidden_size=hidden_size,
force_fp8_e4m3fnuz=force_fp8_e4m3fnuz)
cuda_force_torch=cuda_force_torch,
x=x)
# First dimension dynamic
x = torch.rand(num_tokens, hidden_size * 2)
torch._dynamo.mark_dynamic(x, 0)
result = model(x)
......@@ -127,10 +140,15 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
result2 = model2(x)
# Check that it gives the same answer
if model_class == TestSiluMulFp8QuantModel:
atol, rtol = 1e-3, 1e-3
elif model_class == TestSiluMulNvfp4QuantModel:
atol, rtol = 1e-1, 1e-1
torch.testing.assert_close(result[0].to(dtype=torch.float16),
result2[0].to(dtype=torch.float16),
atol=1e-3,
rtol=1e-3)
atol=atol,
rtol=rtol)
# In pre-nodes, quant op should be present and fused kernels should not
backend.check_before_ops(model.ops_in_model_before())
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
from tblib import pickling_support
# Install support for pickling exceptions so that we can nicely propagate
# failures from tests running in a subprocess.
# This should be run before any custom exception subclasses are defined.
pickling_support.install()
import http.server
import json
import math
import mimetypes
import os
import socket
import tempfile
import threading
from collections.abc import Generator
from enum import Enum
from typing import Any, Callable, Optional, TypedDict, TypeVar, Union, cast
......@@ -32,6 +47,7 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.logger import init_logger
from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.sequence import Logprob
......@@ -1253,3 +1269,119 @@ def cli_config_file():
def cli_config_file_with_model():
"""Return the path to the CLI config file with model."""
return os.path.join(_TEST_DIR, "config", "test_config_with_model.yaml")
class AssetHandler(http.server.BaseHTTPRequestHandler):
# _IMAGE_CACHE : Dict[str, bytes] = {}
def log_message(self, *args, **kwargs):
pass
def do_GET(self):
# Accepts paths like: /1280px-Venn_diagram_rgb.jpg
filename = self.path.lstrip("/")
if not filename or "." not in filename:
self.send_error(404, "Missing filename (expected /<name>.<ext>)")
return
base, ext = filename.rsplit(".", 1)
ext = ext.lower()
if ext not in ["jpg", "png"]:
self.send_error(404, f"Unsupported extension: .{ext}")
return
try:
data = ImageAsset(base).read_bytes(ext=ext)
except Exception as e:
self.send_error(500, f"Failed to load asset: {ext} {base} {e} ")
return
ctype, _ = mimetypes.guess_type(filename)
if ctype is None:
ctype = {"jpg": "image/jpg", "png": "image/png"}[ext]
self.send_response(200)
self.send_header("Content-Type", ctype)
self.send_header("Content-Length", str(len(data)))
self.end_headers()
self.wfile.write(data)
def _find_free_port() -> int:
with socket.socket() as s:
s.bind(("127.0.0.1", 0))
return s.getsockname()[1]
class LocalAssetServer:
address: str
port: int
server: Optional[http.server.ThreadingHTTPServer]
thread: Optional[threading.Thread]
def __init__(self, address: str = "127.0.0.1") -> None:
self.address = address
self.port = -1
self.server = None
self.thread = None
def __enter__(self):
self.port = _find_free_port()
self.server = http.server.ThreadingHTTPServer(
(self.address, self.port), AssetHandler)
self.thread = threading.Thread(target=self.server.serve_forever,
daemon=True)
self.thread.start()
return self
def __exit__(self, exc_type, exc_value, traceback):
if self.server:
self.server.shutdown()
del self.server
if self.thread:
self.thread.join()
del self.thread
if exc_type is None:
return None
return False
@property
def base_url(self) -> str:
assert self.port is not None
return f"http://{self.address}:{self.port}"
def url_for(self, name: str) -> str:
"""e.g., name='RGBA_comp.png' -> 'http://127.0.0.1:PORT/RGBA_comp.png'"""
return f"{self.base_url}/{name}"
def get_image_asset(self, name: str) -> Image.Image:
return fetch_image(self.url_for(name))
@pytest.fixture(scope="session")
def local_asset_server() -> Generator[LocalAssetServer, None, None]:
"""
Starts a thread based HTTP server bound to 127.0.0.1 on a random free port.
The server currently servers images at:
http://127.0.0.1:<port>/<name>.<ext>
"""
with LocalAssetServer() as srv:
yield srv
@pytest.fixture
def image_url(request, local_asset_server) -> str:
# request.param is one of the IMAGE_ASSETS filenames
name = request.param
return local_asset_server.url_for(name)
@pytest.fixture
def image_urls(request, local_asset_server) -> list[str]:
"""Indirect fixture: takes a list of names, returns list of full URLs."""
names: list[str] = request.param
return [local_asset_server.url_for(name) for name in names]
......@@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
@pytest.mark.parametrize("seed", [1])
def test_auto_prefix_caching_after_eviction_start(baseline_llm_generator,
test_llm_generator):
"""Verify block manager v2 with auto prefix caching could works normal
"""Verify block manager v2 with auto prefix caching could work normally
even when eviction started.
With APC enabled, all blocks are held by native block at the beginning.
Then blocks are managed by evictor instead. If cache hit at the evitor's
Then blocks are managed by evictor instead. If cache hit at the evictor's
block, then it could be reused, or we need to recompute its kv cache.
"""
output_len = 10
......
......@@ -10,7 +10,8 @@ import pytest # noqa
import torch
from torch import Use # noqa
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.config import CacheConfig, SchedulerConfig
from vllm.config.lora import LoRAConfig
from vllm.core.interfaces import AllocStatus
from vllm.core.scheduler import Scheduler, SchedulingBudget
from vllm.lora.request import LoRARequest
......@@ -641,7 +642,7 @@ def test_schedule_decode_blocks_to_copy_update():
# Nothing is preempted.
assert output.blocks_to_swap_out == []
# Since append_slot returns the source -> dist mapping, it should
# applied.
# be applied.
assert output.blocks_to_copy == [(2, 3)]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import BaseIncrementalDetokenizer
@pytest.fixture(params=[True, False])
def include_stop_str_in_output(request):
return request.param
class _DummyDetokenizer(BaseIncrementalDetokenizer):
def __init__(self, request: EngineCoreRequest):
super().__init__(request)
def decode_next(self, next_token_id: int) -> str:
# Map token id to single ASCII character for deterministic testing.
return chr(next_token_id)
def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
params = SamplingParams(
stop=stop,
include_stop_str_in_output=include_stop_str_in_output,
min_tokens=min_tokens)
# Keep other fields minimal for unit test purposes.
req = EngineCoreRequest(
request_id="test",
prompt_token_ids=[],
mm_features=None,
sampling_params=params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None,
)
return req
def test_stop_string_while_stop_token_terminates(
include_stop_str_in_output: bool):
"""
This test verifies that the detokenizer correctly handles the case where
the generated token sequence contains both:
- a stop token
- an <eos> token
The detokenizer should respect the stop string and truncate the output
accordingly.
Imagine the following sequence:
- "abcdeZ" is generated, where "Z" is the <eos> token.
- "cd" is the stop string.
If include_stop_str_in_output=False, the detokenizer should truncate the
output to "ab" because the stop string "cd" is excluded.
If include_stop_str_in_output=True, the detokenizer should include the stop
string "cd" in the output, resulting in "abcd".
This verifies the behavioral change introduced in BaseIncrementalDetokenizer
where stop-string evaluation occurs before the early-return on
stop_terminated.
"""
# Generate text "abcdeZ" and tokenize it.
generated_text = "abcde"
eos_token = "Z"
stop_string = "cd"
generated_text = generated_text + eos_token
token_ids = [ord(c) for c in generated_text]
# Create a request with the stop string and initialize the detokenizer.
req = _make_request(stop=[stop_string],
include_stop_str_in_output=include_stop_str_in_output)
detok = _DummyDetokenizer(req)
# Simulate that the last token ('Z') is a stop token (stop_terminated=True).
result = detok.update(new_token_ids=token_ids, stop_terminated=True)
# The update should not report a stop string
assert result == stop_string
# Output text should reflect stop-string handling:
# - include_stop_str_in_output=False => exclude "cd" => "ab"
# - include_stop_str_in_output=True => include "cd" => "abcd"
expected_text = "abcd" if include_stop_str_in_output else "ab"
assert detok.output_text == expected_text
# The skipped final token should still be recorded in token_ids.
assert detok.output_token_ids == token_ids
# get_next_output_text should return the full text when finished=True.
# (Buffering only applies during streaming when finished=False.)
assert detok.get_next_output_text(finished=True,
delta=False) == expected_text
......@@ -8,7 +8,7 @@ import msgspec.msgpack
import pytest
import zmq
from vllm.config import KVEventsConfig
from vllm.config.kv_events import KVEventsConfig
from vllm.distributed.kv_events import EventPublisherFactory
from .test_events import SampleBatch
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import json
import os
from dataclasses import dataclass
from typing import Literal, NamedTuple, Optional
import pytest
from vllm.config import RunnerOption
from vllm.logger import init_logger
from ..models.registry import HF_EXAMPLE_MODELS
from ..utils import compare_two_settings, create_new_process_for_each_test
logger = init_logger("test_context_parallel")
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple):
tp_size: int
pp_size: int
dcp_size: int
eager_mode: bool
chunked_prefill: bool
class CPTestOptions(NamedTuple):
multi_node_only: bool
load_format: Optional[str] = None
@dataclass
class CPTestSettings:
parallel_setups: list[ParallelSetup]
# NOTE: the length of distributed_backends and
# vllm_major_versions should be the same, and they
# are first zipped together to iterate over all
# test settings.
distributed_backends: list[str]
# vllm major version: "0" for V0, "1" for V1
vllm_major_versions: list[str]
runner: RunnerOption
test_options: CPTestOptions
def __post_init__(self):
if len(self.distributed_backends) != len(self.vllm_major_versions):
raise ValueError(
f"Length mismatch: distributed_backends "
f"({len(self.distributed_backends)}) != "
f"vllm_major_versions ({len(self.vllm_major_versions)})")
@staticmethod
def detailed(
*,
tp_base: int = 4,
pp_base: int = 1,
dcp_base: int = 1,
multi_node_only: bool = False,
runner: RunnerOption = "auto",
load_format: Optional[str] = None,
):
parallel_setups = []
for eager_mode_val in [False]:
for pp_multiplier in [1]:
for dcp_multiplier in [2, 4]:
for chunked_prefill_val in [True]:
parallel_setups.append(
ParallelSetup(tp_size=tp_base,
pp_size=pp_multiplier * pp_base,
dcp_size=dcp_multiplier * dcp_base,
eager_mode=eager_mode_val,
chunked_prefill=chunked_prefill_val))
return CPTestSettings(
parallel_setups=parallel_setups,
distributed_backends=["mp"],
vllm_major_versions=["1"],
runner=runner,
test_options=CPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
)
def iter_params(self, model_id: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for backend, vllm_major_version in zip(self.distributed_backends,
self.vllm_major_versions):
yield (model_id, parallel_setup, backend, vllm_major_version,
self.runner, opts)
def _compare_cp_with_tp(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available: int,
*,
method: Literal["generate"],
is_multimodal: bool,
):
(
tp_size,
pp_size,
dcp_size,
eager_mode,
chunked_prefill,
) = parallel_setup
multi_node_only, load_format = test_options
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_transformers_version(on_fail="skip")
trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
if load_format == "dummy":
# Avoid OOM
text_overrides = {
"num_hidden_layers": 4,
"hidden_size": 512,
"intermediate_size": 800,
"num_attention_heads": 4,
"num_key_value_heads": 1,
}
if is_multimodal:
hf_overrides.update({"text_config": text_overrides})
else:
hf_overrides.update(text_overrides)
else:
model_info.check_available_online(on_fail="skip")
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend")
if multi_node_only and not VLLM_MULTI_NODE:
pytest.skip("Not in multi-node setting")
common_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"8",
]
if chunked_prefill:
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if runner != "auto":
common_args.extend(["--runner", runner])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
common_args.extend(["--tokenizer-mode", tokenizer_mode])
if load_format:
common_args.extend(["--load-format", load_format])
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
cp_env = tp_env = {
"VLLM_USE_V1":
vllm_major_version, # Note(hc): DCP only support V1 engine only
}
cp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
"--decode-context-parallel-size",
str(dcp_size),
"--distributed-executor-backend",
distributed_backend,
]
tp_args = [
*common_args,
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
"--distributed-executor-backend",
distributed_backend,
]
try:
compare_two_settings(model_id,
cp_args,
tp_args,
cp_env,
tp_env,
method=method,
max_wait_seconds=720)
except Exception:
testing_ray_compiled_graph = cp_env is not None
if testing_ray_compiled_graph and vllm_major_version == "0":
# Ray Compiled Graph tests are flaky for V0,
# so we don't want to fail the test
logger.exception("Ray Compiled Graph tests failed")
else:
raise
CP_TEXT_GENERATION_MODELS = {
# [MLA attention only]
"deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
}
CP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"deepseek-ai/DeepSeek-V2-Lite-Chat",
]
@pytest.mark.parametrize(
("model_id", "parallel_setup", "distributed_backend", "vllm_major_version",
"runner", "test_options"),
[
params for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
for params in settings.iter_params(model_id)
if model_id in CP_TEST_MODELS
],
)
@create_new_process_for_each_test()
def test_cp_generation(
model_id: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
vllm_major_version: str,
runner: RunnerOption,
test_options: CPTestOptions,
num_gpus_available,
):
_compare_cp_with_tp(model_id,
parallel_setup,
distributed_backend,
vllm_major_version,
runner,
test_options,
num_gpus_available,
method="generate",
is_multimodal=False)
......@@ -298,6 +298,8 @@ def _compare_tp(
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
hf_config = get_config(model_id, trust_remote_code)
skip_tokenizer_init = model_info.skip_tokenizer_init
max_num_seqs = model_info.max_num_seqs
dtype = "float16"
if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
......@@ -351,6 +353,10 @@ def _compare_tp(
common_args.extend(["--load-format", load_format])
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
if skip_tokenizer_init:
common_args.append("--skip-tokenizer-init")
if max_num_seqs:
common_args.extend(["--max-num-seqs", f"{max_num_seqs}"])
specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
testing_ray_compiled_graph = False
......
......@@ -178,6 +178,7 @@ def _compare_sp(
trust_remote_code = model_info.trust_remote_code
tokenizer_mode = model_info.tokenizer_mode
hf_overrides = model_info.hf_overrides
skip_tokenizer_init = model_info.skip_tokenizer_init
if load_format == "dummy":
# Avoid OOM
......@@ -227,6 +228,8 @@ def _compare_sp(
common_args.extend(["--load-format", load_format])
if hf_overrides:
common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
if skip_tokenizer_init:
common_args.append("--skip-tokenizer-init")
compilation_config = {
'level': 3,
......
......@@ -63,6 +63,7 @@ def clear_cache():
current_platform.is_cpu(),
reason="CPU backend is not currently supported with encoder/decoder models"
)
@pytest.mark.skip(reason="bart not supported in V1")
def test_encoder_decoder_e2e(
hf_runner,
vllm_runner,
......
......@@ -167,7 +167,7 @@ def test_get_kwargs():
# dict should have json tip in help
json_tip = "Should either be a valid JSON string or JSON keys"
assert json_tip in kwargs["json_tip"]["help"]
# nested config should should construct the nested config
# nested config should construct the nested config
assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2)
......@@ -287,15 +287,6 @@ def test_prefix_cache_default():
},
"mm-processor-kwargs"
),
(
'{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}',
{
"cast_logits_dtype": "bfloat16",
"sequence_parallel_norm": True,
"sequence_parallel_norm_threshold": 2048,
},
"override-neuron-config"
),
])
# yapf: enable
def test_composite_arg_parser(arg, expected, option):
......
......@@ -25,7 +25,7 @@ class CustomUniExecutor(UniProcExecutor):
timeout: Optional[float] = None,
args: tuple = (),
kwargs: Optional[dict] = None) -> list[Any]:
# Drop marker to show that this was ran
# Drop marker to show that this was run
with open(".marker", "w"):
...
return super().collective_rpc(method, timeout, args, kwargs)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment