"vllm/vscode:/vscode.git/clone" did not exist on "ec68d53b2b75eb5480270c67676b126079998f5a"
Commit a3f8d5dd authored by zhuwenwen
Browse files

Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

parents 8d75f22e f34eca5f
...@@ -29,7 +29,8 @@ from vllm.multimodal.utils import ( ...@@ -29,7 +29,8 @@ from vllm.multimodal.utils import (
encode_image_base64, encode_image_base64,
encode_video_base64, encode_video_base64,
) )
from vllm.tokenizers import MistralTokenizer, get_tokenizer from vllm.tokenizers import get_tokenizer
from vllm.tokenizers.mistral import MistralTokenizer
from vllm.utils.serial_utils import tensor2base64 from vllm.utils.serial_utils import tensor2base64
from ..models.registry import HF_EXAMPLE_MODELS from ..models.registry import HF_EXAMPLE_MODELS
...@@ -796,9 +797,13 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid( ...@@ -796,9 +797,13 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
"content": "<|image_1|>\nWhat's in this image?", "content": "<|image_1|>\nWhat's in this image?",
} }
] ]
assert mm_data is not None assert mm_data is not None
assert "image" in mm_data assert "image" in mm_data
assert mm_data["image"] is None assert isinstance(mm_data["image"], list)
assert len(mm_data["image"]) == 1
assert mm_data["image"][0] is None
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
...@@ -825,10 +830,11 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid( ...@@ -825,10 +830,11 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
# Should have audio in mm_data as None (UUID provided) # Should have audio in mm_data as None (UUID provided)
assert mm_data is not None assert mm_data is not None
assert "audio" in mm_data assert "audio" in mm_data
assert mm_data["audio"] is None assert isinstance(mm_data["audio"], list)
assert len(mm_data["audio"]) == 1
assert mm_data["audio"][0] is None
# UUID should be recorded # UUID should be recorded
assert mm_uuids is not None
assert "audio" in mm_uuids
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[uuid]) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[uuid])
...@@ -1121,10 +1127,105 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( ...@@ -1121,10 +1127,105 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
mm_data = await mm_future mm_data = await mm_future
assert mm_data is not None assert mm_data is not None
assert "image" in mm_data assert "image" in mm_data
assert mm_data["image"] is None assert isinstance(mm_data["image"], list)
assert len(mm_data["image"]) == 1
assert mm_data["image"][0] is None
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid]) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
def test_parse_chat_messages_empty_dict_image_embeds(
    phi3v_model_config_image_embeds,
):
    """An empty ``image_embeds`` dict must be parsed without raising.

    The image placeholder is still expected in the rendered prompt,
    ``mm_data["image"]`` is expected to be an empty dict, and one ``None``
    UUID is expected because the request did not supply any.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_embeds", "image_embeds": {}},
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ]
    expected_conversation = [
        {
            "role": "user",
            "content": "<|image_1|>\nWhat's in this image?",
        }
    ]

    conversation, mm_data, mm_uuids = parse_chat_messages(
        messages,
        phi3v_model_config_image_embeds,
        content_format="string",
    )

    # Conversation structure
    assert conversation == expected_conversation

    # Multimodal payload: an empty dictionary of embeddings
    assert mm_data is not None
    assert "image" in mm_data
    image_data = mm_data["image"]
    assert isinstance(image_data, dict)
    assert len(image_data) == 0

    # No UUID was provided, so a single None entry is expected
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
def test_parse_chat_messages_multiple_dict_image_embeds(
    phi3v_model_config_image_embeds,
):
    """Test that multiple dictionaries for image_embeds is handled without errors.

    Two ``image_embeds`` content parts are sent, each carrying the same two
    named embedding fields. The parsed ``mm_data["image"]`` is expected to be
    a dict keyed by field name whose tensors have the shape of the full
    per-field batches created below.
    """
    # Create two sample image embedding tensors (one row per image)
    batch_size = 2
    image_embedding_1 = torch.randn(batch_size, 256, 1024)
    image_embedding_2 = torch.randn(batch_size, 3)

    # One content part per image; iterating a tensor yields its rows
    image_parts = [
        {
            "type": "image_embeds",
            "image_embeds": {
                "image_embedding_1": tensor2base64(first_row),
                "image_embedding_2": tensor2base64(second_row),
            },
        }
        for first_row, second_row in zip(image_embedding_1, image_embedding_2)
    ]
    text_part = {"type": "text", "text": "Describe these two images."}

    conversation, mm_data, mm_uuids = parse_chat_messages(
        [
            {
                "role": "user",
                "content": image_parts + [text_part],
            }
        ],
        phi3v_model_config_image_embeds,
        content_format="string",
    )

    # Verify conversation structure
    assert conversation == [
        {
            "role": "user",
            "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
        }
    ]

    # Verify mm_data contains a dictionary of multi-embeddings
    assert mm_data is not None
    assert "image" in mm_data
    assert isinstance(mm_data["image"], dict)
    # The dict is keyed by embedding field, not by image, so compare the
    # field names rather than the dict length against the batch size (the two
    # numbers only coincide because there happen to be two of each).
    assert set(mm_data["image"]) == {"image_embedding_1", "image_embedding_2"}

    # Verify each embedding has the correct shape
    assert isinstance(mm_data["image"]["image_embedding_1"], torch.Tensor)
    assert mm_data["image"]["image_embedding_1"].shape == image_embedding_1.shape
    assert isinstance(mm_data["image"]["image_embedding_2"], torch.Tensor)
    assert mm_data["image"]["image_embedding_2"].shape == image_embedding_2.shape

    # Verify UUIDs (None since we didn't provide any): one entry per image
    _assert_mm_uuids(mm_uuids, batch_size, expected_uuids=[None] * batch_size)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_parse_chat_messages_multiple_images_async( async def test_parse_chat_messages_multiple_images_async(
phi3v_model_config, phi3v_model_config,
......
...@@ -7,7 +7,8 @@ import math ...@@ -7,7 +7,8 @@ import math
import pytest import pytest
import torch import torch
from vllm.platforms import current_platform from vllm.platforms import CpuArchEnum, current_platform
from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
if not current_platform.is_cpu(): if not current_platform.is_cpu():
pytest.skip("skipping CPU-only tests", allow_module_level=True) pytest.skip("skipping CPU-only tests", allow_module_level=True)
...@@ -36,6 +37,21 @@ SEQ_LENS = [ # (q_len, kv_len) ...@@ -36,6 +37,21 @@ SEQ_LENS = [ # (q_len, kv_len)
] ]
def get_attn_isa(
    block_size: int | None = None,
    dtype: torch.dtype | None = None,
):
    """Return the attention ISA name to use for a test.

    When both ``block_size`` and ``dtype`` are truthy the decision is
    delegated to ``_get_attn_isa``. Otherwise the name is derived from the
    host: ``"neon"`` on an Arm CPU, ``"amx"`` when torch reports AMX tile
    support, and ``"vec"`` in every other case.
    """
    if block_size and dtype:
        return _get_attn_isa(dtype, block_size)

    if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
        return "neon"

    return "amx" if torch._C._cpu._is_amx_tile_supported() else "vec"
# rand number generation takes too much time, cache rand tensors # rand number generation takes too much time, cache rand tensors
@functools.lru_cache(maxsize=128, typed=False) @functools.lru_cache(maxsize=128, typed=False)
def tensor_cache( def tensor_cache(
...@@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16( ...@@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16(
) )
@pytest.mark.parametrize("seq_lens", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", [96, 128])
@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
@pytest.mark.parametrize("dtype", QTYPES)
@pytest.mark.parametrize("soft_cap", [None])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [False])
@pytest.mark.parametrize("use_sink", [False])
@pytest.mark.parametrize("isa", ["neon"])
@pytest.mark.skipif(
    current_platform.get_cpu_architecture() != CpuArchEnum.ARM,
    reason="Not an Arm CPU.",
)
def test_varlen_with_paged_kv_normal_neon(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],
    head_size: int,
    sliding_window: int | None,
    dtype: torch.dtype,
    block_size: int,
    soft_cap: float | None,
    num_blocks: int,
    use_alibi: bool,
    use_sink: bool,
    isa: str,
) -> None:
    """Run the shared varlen paged-KV check with the "neon" ISA.

    Skipped unless the current platform reports an Arm CPU. Soft-cap, ALiBi
    and sink are each parametrized with a single disabled value.
    """
    # Every parametrized value is forwarded unchanged, by keyword.
    call_kwargs = {
        "seq_lens": seq_lens,
        "num_heads": num_heads,
        "head_size": head_size,
        "sliding_window": sliding_window,
        "dtype": dtype,
        "block_size": block_size,
        "soft_cap": soft_cap,
        "num_blocks": num_blocks,
        "use_alibi": use_alibi,
        "use_sink": use_sink,
        "isa": isa,
    }
    varlen_with_paged_kv(**call_kwargs)
@pytest.mark.parametrize("seq_lens", SEQ_LENS) @pytest.mark.parametrize("seq_lens", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", [96]) @pytest.mark.parametrize("head_size", [96])
...@@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16( ...@@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [False]) @pytest.mark.parametrize("use_alibi", [False])
@pytest.mark.parametrize("use_sink", [False]) @pytest.mark.parametrize("use_sink", [False])
@pytest.mark.parametrize( @pytest.mark.parametrize("isa", [get_attn_isa()])
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
)
def test_varlen_with_paged_kv_softcap( def test_varlen_with_paged_kv_softcap(
seq_lens: list[tuple[int, int]], seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int], num_heads: tuple[int, int],
...@@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap( ...@@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [True]) @pytest.mark.parametrize("use_alibi", [True])
@pytest.mark.parametrize("use_sink", [False]) @pytest.mark.parametrize("use_sink", [False])
@pytest.mark.parametrize( @pytest.mark.parametrize("isa", [get_attn_isa()])
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
)
def test_varlen_with_paged_kv_alibi( def test_varlen_with_paged_kv_alibi(
seq_lens: list[tuple[int, int]], seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int], num_heads: tuple[int, int],
...@@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi( ...@@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi(
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("use_alibi", [False]) @pytest.mark.parametrize("use_alibi", [False])
@pytest.mark.parametrize("use_sink", [True]) @pytest.mark.parametrize("use_sink", [True])
@pytest.mark.parametrize( @pytest.mark.parametrize("isa", [get_attn_isa()])
"isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
)
def test_varlen_with_paged_kv_sink( def test_varlen_with_paged_kv_sink(
seq_lens: list[tuple[int, int]], seq_lens: list[tuple[int, int]],
num_heads: tuple[int, int], num_heads: tuple[int, int],
......
...@@ -32,8 +32,8 @@ def cal_diff( ...@@ -32,8 +32,8 @@ def cal_diff(
CUTLASS_MLA_UNSUPPORTED_REASON = ( CUTLASS_MLA_UNSUPPORTED_REASON = (
"Cutlass MLA Requires compute capability of 10 or above." "Cutlass MLA Requires compute capability of 100 or above."
if not current_platform.is_device_capability(100) if not current_platform.is_device_capability_family(100)
else "Cutlass MLA is supported" else "Cutlass MLA is supported"
) )
......
...@@ -11,7 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import ( ...@@ -11,7 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.math_utils import round_up from vllm.utils.math_utils import round_up
if not current_platform.is_device_capability(100): if not current_platform.is_device_capability_family(100):
pytest.skip( pytest.skip(
"This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True "This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True
) )
...@@ -443,7 +443,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( ...@@ -443,7 +443,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2]) output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])
if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE: if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
rtol, atol = 1e-1, 2e-1 rtol, atol = 3e-1, 4e-1
elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE: elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
rtol, atol = 4e-2, 6e-2 rtol, atol = 4e-2, 6e-2
elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype: elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:
......
...@@ -7,6 +7,7 @@ import torch ...@@ -7,6 +7,7 @@ import torch
from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.attention.ops.triton_unified_attention import unified_attention
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.math_utils import next_power_of_2
NUM_HEADS = [(4, 4), (8, 2)] NUM_HEADS = [(4, 4), (8, 2)]
HEAD_SIZES = [128, 256] HEAD_SIZES = [128, 256]
...@@ -22,6 +23,10 @@ QDTYPES = ( ...@@ -22,6 +23,10 @@ QDTYPES = (
# one value small enough to test the schema op check # one value small enough to test the schema op check
NUM_BLOCKS = [32768, 2048] NUM_BLOCKS = [32768, 2048]
# 0: use 2D kernel for decode
# 8: use 3D kernel for decode
SEQ_THRESHOLD_3D_VALUES = [0, 8]
def ref_paged_attn( def ref_paged_attn(
query: torch.Tensor, query: torch.Tensor,
...@@ -92,6 +97,7 @@ def ref_paged_attn( ...@@ -92,6 +97,7 @@ def ref_paged_attn(
@pytest.mark.parametrize("soft_cap", [None, 50.0]) @pytest.mark.parametrize("soft_cap", [None, 50.0])
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
@pytest.mark.parametrize("q_dtype", QDTYPES) @pytest.mark.parametrize("q_dtype", QDTYPES)
@pytest.mark.parametrize("seq_threshold_3D", SEQ_THRESHOLD_3D_VALUES)
@torch.inference_mode() @torch.inference_mode()
def test_triton_unified_attn( def test_triton_unified_attn(
seq_lens: list[tuple[int, int]], seq_lens: list[tuple[int, int]],
...@@ -103,6 +109,7 @@ def test_triton_unified_attn( ...@@ -103,6 +109,7 @@ def test_triton_unified_attn(
soft_cap: float | None, soft_cap: float | None,
num_blocks: int, num_blocks: int,
q_dtype: torch.dtype | None, q_dtype: torch.dtype | None,
seq_threshold_3D: int,
) -> None: ) -> None:
torch.set_default_device("cuda") torch.set_default_device("cuda")
...@@ -152,6 +159,21 @@ def test_triton_unified_attn( ...@@ -152,6 +159,21 @@ def test_triton_unified_attn(
k_descale = torch.rand(scale_shape, dtype=torch.float32) k_descale = torch.rand(scale_shape, dtype=torch.float32)
v_descale = torch.rand(scale_shape, dtype=torch.float32) v_descale = torch.rand(scale_shape, dtype=torch.float32)
num_par_softmax_segments = 16
head_size_padded = next_power_of_2(head_size)
softmax_segm_output = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments, head_size_padded),
dtype=torch.float32,
)
softmax_segm_max = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments),
dtype=torch.float32,
)
softmax_segm_expsum = torch.empty(
(seq_threshold_3D, num_query_heads, num_par_softmax_segments),
dtype=torch.float32,
)
unified_attention( unified_attention(
q=maybe_quantized_query, q=maybe_quantized_query,
k=maybe_quantized_key_cache, k=maybe_quantized_key_cache,
...@@ -169,6 +191,11 @@ def test_triton_unified_attn( ...@@ -169,6 +191,11 @@ def test_triton_unified_attn(
q_descale=q_descale, q_descale=q_descale,
k_descale=k_descale, k_descale=k_descale,
v_descale=v_descale, v_descale=v_descale,
seq_threshold_3D=seq_threshold_3D,
num_par_softmax_segments=num_par_softmax_segments,
softmax_segm_output=softmax_segm_output,
softmax_segm_max=softmax_segm_max,
softmax_segm_expsum=softmax_segm_expsum,
) )
ref_output = ref_paged_attn( ref_output = ref_paged_attn(
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for ApplyRotaryEmb CustomOp dispatch behavior.
This test ensures that RotaryEmbedding classes correctly call the appropriate
ApplyRotaryEmb methods based on the calling context:
1. RotaryEmbedding.forward_native() -> ApplyRotaryEmb.forward_native()
2. RotaryEmbedding.forward_cuda() -> ApplyRotaryEmb.forward() (auto-dispatch)
3. RotaryEmbedding.forward_hip() -> ApplyRotaryEmb.forward() (auto-dispatch)
"""
from dataclasses import dataclass
import pytest
import torch
from vllm.config import (
CompilationConfig,
VllmConfig,
get_cached_compilation_config,
set_current_vllm_config,
)
from vllm.platforms import current_platform
CUDA_DEVICES = ["cuda:0"]
@dataclass
class RotaryEmbeddingTestCase:
    """One scenario for the RotaryEmbedding dispatch tests."""

    # Label for the scenario
    name: str
    # RotaryEmbedding class to instantiate
    rope_class: type
    # Keyword arguments passed to ``rope_class``
    rope_kwargs: dict
    # Method to invoke: forward_native, forward_cuda, forward
    method_name: str
    # (num_tokens,) or (3, num_tokens) or (4, num_tokens)
    positions_shape: tuple
    # Should call ApplyRotaryEmb.forward_native()
    expect_forward_native: bool
    # Should call ApplyRotaryEmb.forward()
    expect_forward: bool
def get_test_cases() -> list[RotaryEmbeddingTestCase]:
    """Build the list of dispatch scenarios, one or more per rope class.

    The rope classes are imported inside the function body, so importing this
    module does not import them.
    """
    from vllm.model_executor.layers.rotary_embedding.ernie45_vl_rope import (
        Ernie4_5_VLRotaryEmbedding,
    )
    from vllm.model_executor.layers.rotary_embedding.mrope import MRotaryEmbedding
    from vllm.model_executor.layers.rotary_embedding.xdrope import XDRotaryEmbedding

    shared_kwargs = {
        "head_size": 128,
        "rotary_dim": 128,
        "max_position_embeddings": 4096,
        "base": 10000,
        "is_neox_style": True,
        "dtype": torch.bfloat16,
    }

    def kwargs_with(**extra) -> dict:
        """Return a fresh dict: the shared kwargs followed by ``extra``."""
        return {**shared_kwargs, **extra}

    # MRotaryEmbedding: native path with multimodal (2D) positions
    mrope_native = RotaryEmbeddingTestCase(
        name="MRotaryEmbedding.forward_native",
        rope_class=MRotaryEmbedding,
        rope_kwargs=kwargs_with(mrope_section=[16, 24, 24]),
        method_name="forward_native",
        positions_shape=(3, 32),
        expect_forward_native=True,
        expect_forward=False,
    )
    # MRotaryEmbedding: 1D positions trigger the apply_rotary_emb path
    mrope_cuda_1d = RotaryEmbeddingTestCase(
        name="MRotaryEmbedding.forward_cuda_1d",
        rope_class=MRotaryEmbedding,
        rope_kwargs=kwargs_with(mrope_section=[16, 24, 24]),
        method_name="forward_cuda",
        positions_shape=(32,),
        expect_forward_native=False,
        expect_forward=True,
    )
    # XDRotaryEmbedding: 4D positions for P/W/H/T
    xdrope_forward = RotaryEmbeddingTestCase(
        name="XDRotaryEmbedding.forward",
        rope_class=XDRotaryEmbedding,
        rope_kwargs=kwargs_with(
            scaling_alpha=1.0,
            xdrope_section=[16, 16, 16, 16],
        ),
        method_name="forward",
        positions_shape=(4, 32),
        expect_forward_native=False,
        expect_forward=True,
    )
    # Ernie4_5_VLRotaryEmbedding: native path with multimodal (2D) positions
    ernie_native = RotaryEmbeddingTestCase(
        name="Ernie4_5_VLRotaryEmbedding.forward_native",
        rope_class=Ernie4_5_VLRotaryEmbedding,
        rope_kwargs=kwargs_with(mrope_section=[22, 22, 20]),
        method_name="forward_native",
        positions_shape=(3, 32),
        expect_forward_native=True,
        expect_forward=False,
    )

    return [mrope_native, mrope_cuda_1d, xdrope_forward, ernie_native]
def run_dispatch_test(
    test_case: RotaryEmbeddingTestCase,
    device: str,
):
    """Call one rope method and check which ApplyRotaryEmb entry point ran.

    The rope module is built under a config that enables the
    ``apply_rotary_emb`` custom op. ``forward_native`` and ``forward`` on the
    module's ``apply_rotary_emb`` attribute are temporarily replaced by
    recording wrappers, the method named by ``test_case.method_name`` is
    invoked on random inputs, and the recorded calls are compared with the
    expectations in ``test_case``. The original methods are reassigned even
    when an assertion fails.
    """
    config = VllmConfig(
        compilation_config=CompilationConfig(custom_ops=["all", "+apply_rotary_emb"])
    )
    get_cached_compilation_config.cache_clear()

    with set_current_vllm_config(config):
        rope = test_case.rope_class(**test_case.rope_kwargs).to(device=device)
        apply_rotary_emb = rope.apply_rotary_emb

        # Guard against a mis-configured test: the custom op must be enabled
        if test_case.expect_forward_native:
            assert (
                apply_rotary_emb._forward_method != apply_rotary_emb.forward_native
            ), "Test setup error: ApplyRotaryEmb custom op should be enabled"

        # Record which entry points are hit while the method under test runs
        calls = {"forward_native_called": False, "forward_called": False}
        saved_forward_native = apply_rotary_emb.forward_native
        saved_forward = apply_rotary_emb.forward

        def tracked_forward_native(*args, **kwargs):
            calls["forward_native_called"] = True
            return saved_forward_native(*args, **kwargs)

        def tracked_forward(*args, **kwargs):
            calls["forward_called"] = True
            return saved_forward(*args, **kwargs)

        apply_rotary_emb.forward_native = tracked_forward_native
        apply_rotary_emb.forward = tracked_forward

        try:
            num_q_heads, num_kv_heads = 8, 2
            num_tokens = test_case.positions_shape[-1]
            head_size = test_case.rope_kwargs["head_size"]
            max_position = test_case.rope_kwargs["max_position_embeddings"]

            positions = torch.randint(
                0, max_position // 4, test_case.positions_shape, device=device
            )
            query = torch.randn(
                num_tokens,
                num_q_heads * head_size,
                dtype=torch.bfloat16,
                device=device,
            )
            key = torch.randn(
                num_tokens,
                num_kv_heads * head_size,
                dtype=torch.bfloat16,
                device=device,
            )

            # Invoke the method under test on copies of the inputs
            getattr(rope, test_case.method_name)(positions, query.clone(), key.clone())

            # Compare recorded calls with the expectations
            if test_case.expect_forward_native:
                assert calls["forward_native_called"], (
                    f"{test_case.name} should call ApplyRotaryEmb.forward_native()"
                )
            if test_case.expect_forward:
                assert calls["forward_called"], (
                    f"{test_case.name} should call ApplyRotaryEmb.forward()"
                )
            else:
                assert not calls["forward_called"], (
                    f"{test_case.name} should NOT call ApplyRotaryEmb.forward(). "
                    "Bug: when +apply_rotary_emb is enabled, forward_native() "
                    "incorrectly dispatches to CUDA/HIP kernels."
                )
        finally:
            # Put the original methods back regardless of the outcome
            apply_rotary_emb.forward_native = saved_forward_native
            apply_rotary_emb.forward = saved_forward
@pytest.mark.skipif(
    not current_platform.is_cuda_alike(), reason="Skipping CUDA/ROCm only tests."
)
@pytest.mark.parametrize("test_case", get_test_cases(), ids=lambda tc: tc.name)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_rotary_embedding_dispatch(
    test_case: RotaryEmbeddingTestCase,
    device: str,
):
    """
    Check that each RotaryEmbedding scenario reaches the expected
    ApplyRotaryEmb method.

    - forward_native methods should call ApplyRotaryEmb.forward_native()
    - forward_cuda/forward methods should call ApplyRotaryEmb.forward()

    Skipped when the current platform is not CUDA-alike.
    """
    run_dispatch_test(test_case=test_case, device=device)
...@@ -116,7 +116,6 @@ def test_mrope( ...@@ -116,7 +116,6 @@ def test_mrope(
mrope_helper_class = get_rope( mrope_helper_class = get_rope(
head_size=head_dim, head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position, max_position=max_position,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
rope_parameters=config.rope_parameters, rope_parameters=config.rope_parameters,
...@@ -185,7 +184,6 @@ def test_mrope_torch_compile_tracing( ...@@ -185,7 +184,6 @@ def test_mrope_torch_compile_tracing(
mrope_helper_class = get_rope( mrope_helper_class = get_rope(
head_size=head_dim, head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position, max_position=max_position,
is_neox_style=is_neox_style, is_neox_style=is_neox_style,
rope_parameters=config.rope_parameters, rope_parameters=config.rope_parameters,
......
...@@ -83,8 +83,12 @@ def test_rotary_embedding( ...@@ -83,8 +83,12 @@ def test_rotary_embedding(
torch.set_default_device(device) torch.set_default_device(device)
if rotary_dim is None: if rotary_dim is None:
rotary_dim = head_size rotary_dim = head_size
rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} rope_parameters = {
rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters) "rope_type": "default",
"rope_theta": rope_theta,
"partial_rotary_factor": rotary_dim / head_size,
}
rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
rope = rope.to(dtype=dtype, device=torch.get_default_device()) rope = rope.to(dtype=dtype, device=torch.get_default_device())
positions = torch.randint(0, max_position, (batch_size, seq_len)) positions = torch.randint(0, max_position, (batch_size, seq_len))
...@@ -150,9 +154,9 @@ def test_rope_module_cache(): ...@@ -150,9 +154,9 @@ def test_rope_module_cache():
if rotary_dim is None: if rotary_dim is None:
rotary_dim = head_size rotary_dim = head_size
rope_parameters["rope_theta"] = rope_theta rope_parameters["rope_theta"] = rope_theta
rope_parameters["partial_rotary_factor"] = rotary_dim / head_size
rope = get_rope( rope = get_rope(
head_size, head_size,
rotary_dim,
max_position, max_position,
is_neox_style, is_neox_style,
rope_parameters, rope_parameters,
...@@ -177,9 +181,9 @@ def test_rope_module_cache(): ...@@ -177,9 +181,9 @@ def test_rope_module_cache():
if rotary_dim is None: if rotary_dim is None:
rotary_dim = head_size rotary_dim = head_size
rope_parameters["rope_theta"] = rope_theta rope_parameters["rope_theta"] = rope_theta
rope_parameters["partial_rotary_factor"] = rotary_dim / head_size
rope = get_rope( rope = get_rope(
head_size, head_size,
rotary_dim,
max_position, max_position,
is_neox_style, is_neox_style,
rope_parameters, rope_parameters,
......
...@@ -594,7 +594,8 @@ def make_modular_kernel( ...@@ -594,7 +594,8 @@ def make_modular_kernel(
) )
modular_kernel = mk.FusedMoEModularKernel( modular_kernel = mk.FusedMoEModularKernel(
prepare_finalize=prepare_finalize, fused_experts=fused_experts prepare_finalize=prepare_finalize,
fused_experts=fused_experts,
) )
return modular_kernel return modular_kernel
......
...@@ -27,7 +27,7 @@ BLOCK_SIZE = [128, 128] ...@@ -27,7 +27,7 @@ BLOCK_SIZE = [128, 128]
@pytest.mark.parametrize("N", [512, 1024]) # intermediate dim per expert @pytest.mark.parametrize("N", [512, 1024]) # intermediate dim per expert
@pytest.mark.parametrize("topk", [2, 4]) @pytest.mark.parametrize("topk", [2, 4])
def test_batched_deepgemm_vs_triton( def test_batched_deepgemm_vs_triton(
E: int, T: int, K: int, N: int, topk: int, monkeypatch E: int, T: int, K: int, N: int, topk: int, monkeypatch, workspace_init
): ):
"""Compare BatchedDeepGemmExperts to BatchedTritonExperts.""" """Compare BatchedDeepGemmExperts to BatchedTritonExperts."""
......
...@@ -248,6 +248,7 @@ def test_fused_moe_batched_experts( ...@@ -248,6 +248,7 @@ def test_fused_moe_batched_experts(
per_act_token_quant: bool, per_act_token_quant: bool,
block_shape: list[int] | None, block_shape: list[int] | None,
input_scales: bool, input_scales: bool,
workspace_init,
): ):
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89, """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
and those tests will be skipped on unsupported hardware.""" and those tests will be skipped on unsupported hardware."""
......
...@@ -137,7 +137,7 @@ def setup_cuda(): ...@@ -137,7 +137,7 @@ def setup_cuda():
@pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode() @torch.inference_mode()
def test_w8a8_block_fp8_fused_moe( def test_w8a8_block_fp8_fused_moe(
M, N, K, E, topk, block_size, dtype, seed, monkeypatch M, N, K, E, topk, block_size, dtype, seed, monkeypatch, workspace_init
): ):
if topk > E: if topk > E:
pytest.skip(f"Skipping test; topk={topk} > E={E}") pytest.skip(f"Skipping test; topk={topk} > E={E}")
......
...@@ -274,6 +274,7 @@ def test_cutlass_moe_8_bit_no_graph( ...@@ -274,6 +274,7 @@ def test_cutlass_moe_8_bit_no_graph(
per_act_token: bool, per_act_token: bool,
per_out_ch: bool, per_out_ch: bool,
monkeypatch, monkeypatch,
workspace_init,
ep_size: int | None = None, ep_size: int | None = None,
): ):
current_platform.seed_everything(7) current_platform.seed_everything(7)
...@@ -329,6 +330,7 @@ def test_cutlass_moe_8_bit_cuda_graph( ...@@ -329,6 +330,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
per_act_token: bool, per_act_token: bool,
per_out_ch: bool, per_out_ch: bool,
monkeypatch, monkeypatch,
workspace_init,
): ):
current_platform.seed_everything(7) current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
...@@ -385,9 +387,19 @@ def test_cutlass_moe_8_bit_EP( ...@@ -385,9 +387,19 @@ def test_cutlass_moe_8_bit_EP(
per_out_channel: bool, per_out_channel: bool,
ep_size: int, ep_size: int,
monkeypatch, monkeypatch,
workspace_init,
): ):
test_cutlass_moe_8_bit_no_graph( test_cutlass_moe_8_bit_no_graph(
m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size m,
n,
k,
e,
topk,
per_act_token,
per_out_channel,
monkeypatch,
workspace_init,
ep_size,
) )
...@@ -419,9 +431,19 @@ def test_cutlass_moe_8_bit_EP_large( ...@@ -419,9 +431,19 @@ def test_cutlass_moe_8_bit_EP_large(
per_out_channel: bool, per_out_channel: bool,
ep_size: int, ep_size: int,
monkeypatch, monkeypatch,
workspace_init,
): ):
test_cutlass_moe_8_bit_no_graph( test_cutlass_moe_8_bit_no_graph(
m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size m,
n,
k,
e,
topk,
per_act_token,
per_out_channel,
monkeypatch,
workspace_init,
ep_size,
) )
...@@ -445,6 +467,7 @@ def test_run_cutlass_moe_fp8( ...@@ -445,6 +467,7 @@ def test_run_cutlass_moe_fp8(
per_act_token: bool, per_act_token: bool,
per_out_channel: bool, per_out_channel: bool,
ep_size: int, ep_size: int,
workspace_init,
): ):
current_platform.seed_everything(7) current_platform.seed_everything(7)
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
......
...@@ -29,6 +29,7 @@ from vllm.utils.deep_gemm import ( ...@@ -29,6 +29,7 @@ from vllm.utils.deep_gemm import (
is_deep_gemm_supported, is_deep_gemm_supported,
) )
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch from .parallel_utils import ProcessGroupInfo, parallel_launch
...@@ -363,6 +364,9 @@ def _test_deepep_deepgemm_moe( ...@@ -363,6 +364,9 @@ def _test_deepep_deepgemm_moe(
w1_scale: torch.Tensor, w1_scale: torch.Tensor,
w2_scale: torch.Tensor, w2_scale: torch.Tensor,
): ):
device = torch.device(f"cuda:{pgi.local_rank}")
init_workspace_manager(device)
current_platform.seed_everything(pgi.rank) current_platform.seed_everything(pgi.rank)
w1 = w1.to(device=torch.cuda.current_device()) w1 = w1.to(device=torch.cuda.current_device())
...@@ -445,6 +449,7 @@ def test_ht_deepep_deepgemm_moe( ...@@ -445,6 +449,7 @@ def test_ht_deepep_deepgemm_moe(
topk: int, topk: int,
world_dp_size: tuple[int, int], world_dp_size: tuple[int, int],
disable_deepgemm_ue8m0, disable_deepgemm_ue8m0,
workspace_init,
): ):
""" """
Tests for High-Throughput DeepEP + DeepGemm integration. Tests for High-Throughput DeepEP + DeepGemm integration.
...@@ -518,6 +523,7 @@ def test_ll_deepep_deepgemm_moe( ...@@ -518,6 +523,7 @@ def test_ll_deepep_deepgemm_moe(
block_size: list[int], block_size: list[int],
world_dp_size: tuple[int, int], world_dp_size: tuple[int, int],
disable_deepgemm_ue8m0, disable_deepgemm_ue8m0,
workspace_init,
): ):
""" """
Tests for Low-Latency DeepEP + DeepGemm integration. Tests for Low-Latency DeepEP + DeepGemm integration.
......
...@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( ...@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.import_utils import has_deep_ep from vllm.utils.import_utils import has_deep_ep
from vllm.v1.worker.workspace import init_workspace_manager
from ...utils import multi_gpu_test from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch from .parallel_utils import ProcessGroupInfo, parallel_launch
...@@ -342,6 +343,9 @@ def _deep_ep_moe( ...@@ -342,6 +343,9 @@ def _deep_ep_moe(
use_fp8_dispatch: bool, use_fp8_dispatch: bool,
per_act_token_quant: bool, per_act_token_quant: bool,
): ):
device = torch.device(f"cuda:{pgi.local_rank}")
init_workspace_manager(device)
if not low_latency_mode: if not low_latency_mode:
assert not use_fp8_dispatch, ( assert not use_fp8_dispatch, (
"FP8 dispatch interface is available only in low-latency mode" "FP8 dispatch interface is available only in low-latency mode"
...@@ -437,6 +441,7 @@ def test_deep_ep_moe( ...@@ -437,6 +441,7 @@ def test_deep_ep_moe(
topk: int, topk: int,
world_dp_size: tuple[int, int], world_dp_size: tuple[int, int],
per_act_token_quant: bool, per_act_token_quant: bool,
workspace_init,
): ):
low_latency_mode = False low_latency_mode = False
use_fp8_dispatch = False use_fp8_dispatch = False
...@@ -492,6 +497,7 @@ def test_low_latency_deep_ep_moe( ...@@ -492,6 +497,7 @@ def test_low_latency_deep_ep_moe(
topk: int, topk: int,
world_dp_size: tuple[int, int], world_dp_size: tuple[int, int],
use_fp8_dispatch: bool, use_fp8_dispatch: bool,
workspace_init,
): ):
low_latency_mode = True low_latency_mode = True
......
...@@ -143,7 +143,7 @@ NUM_EXPERTS = [32] ...@@ -143,7 +143,7 @@ NUM_EXPERTS = [32]
@pytest.mark.parametrize("topk", TOPKS) @pytest.mark.parametrize("topk", TOPKS)
@pytest.mark.parametrize("num_experts", NUM_EXPERTS) @pytest.mark.parametrize("num_experts", NUM_EXPERTS)
@pytest.mark.skipif(not is_deep_gemm_supported(), reason="Requires deep_gemm kernels") @pytest.mark.skipif(not is_deep_gemm_supported(), reason="Requires deep_gemm kernels")
def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch): def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch, workspace_init):
with monkeypatch.context() as mp: with monkeypatch.context() as mp:
mp.setenv("VLLM_USE_DEEP_GEMM", "1") mp.setenv("VLLM_USE_DEEP_GEMM", "1")
......
...@@ -5,6 +5,7 @@ from dataclasses import dataclass ...@@ -5,6 +5,7 @@ from dataclasses import dataclass
import pytest import pytest
import torch import torch
import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig, FusedMoEQuantConfig,
...@@ -107,6 +108,19 @@ class TestData: ...@@ -107,6 +108,19 @@ class TestData:
layer.w2_input_scale = a2_scale layer.w2_input_scale = a2_scale
layer.w13_weight_scale = w13_weight_scale layer.w13_weight_scale = w13_weight_scale
layer.w2_weight_scale = w2_weight_scale layer.w2_weight_scale = w2_weight_scale
# Setup dummy config.
layer.moe_parallel_config = mk.FusedMoEParallelConfig(
tp_size=1,
pcp_size=1,
dp_size=1,
ep_size=1,
tp_rank=1,
pcp_rank=1,
dp_rank=1,
ep_rank=1,
use_ep=False,
all2all_backend="naive",
)
register_moe_scaling_factors(layer) register_moe_scaling_factors(layer)
...@@ -206,6 +220,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( ...@@ -206,6 +220,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
topk: int, topk: int,
activation: str, activation: str,
monkeypatch, monkeypatch,
workspace_init,
): ):
current_platform.seed_everything(7) current_platform.seed_everything(7)
monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
......
...@@ -51,7 +51,14 @@ MNK_FACTORS = [ ...@@ -51,7 +51,14 @@ MNK_FACTORS = [
@pytest.mark.parametrize("activation", ["silu_and_mul", "relu2"]) @pytest.mark.parametrize("activation", ["silu_and_mul", "relu2"])
@torch.inference_mode() @torch.inference_mode()
def test_flashinfer_fp4_moe_no_graph( def test_flashinfer_fp4_moe_no_graph(
m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, activation: str m: int,
n: int,
k: int,
e: int,
topk: int,
dtype: torch.dtype,
activation: str,
workspace_init,
): ):
current_platform.seed_everything(7) current_platform.seed_everything(7)
with set_current_vllm_config( with set_current_vllm_config(
......
...@@ -269,7 +269,7 @@ class Case: ...@@ -269,7 +269,7 @@ class Case:
) )
@pytest.mark.parametrize("num_token", [2]) @pytest.mark.parametrize("num_token", [2])
@pytest.mark.parametrize("tp", [1, 2, 4, 8]) @pytest.mark.parametrize("tp", [1, 2, 4, 8])
def test_equiv(num_token, a_dtype, w_dtype, tp): def test_equiv(num_token, a_dtype, w_dtype, tp, workspace_init):
from triton_kernels.tensor_details import layout from triton_kernels.tensor_details import layout
if not hasattr(layout, "make_default_matmul_mxfp4_w_layout"): if not hasattr(layout, "make_default_matmul_mxfp4_w_layout"):
......
...@@ -16,6 +16,7 @@ from vllm.platforms import current_platform ...@@ -16,6 +16,7 @@ from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.utils.torch_utils import cuda_device_count_stateless
from vllm.v1.worker.workspace import init_workspace_manager
from .modular_kernel_tools.common import ( from .modular_kernel_tools.common import (
Config, Config,
...@@ -77,6 +78,10 @@ def rank_worker( ...@@ -77,6 +78,10 @@ def rank_worker(
weights: WeightTensors, weights: WeightTensors,
verbose: bool, verbose: bool,
): ):
# Initialize workspace manager in child process
device = torch.device(f"cuda:{pgi.local_rank}")
init_workspace_manager(device)
current_platform.seed_everything(pgi.rank) current_platform.seed_everything(pgi.rank)
# sanity check # sanity check
...@@ -300,6 +305,7 @@ def test_modular_kernel_combinations_singlegpu( ...@@ -300,6 +305,7 @@ def test_modular_kernel_combinations_singlegpu(
chunk_size: int | None, chunk_size: int | None,
world_size: int, world_size: int,
pytestconfig, pytestconfig,
workspace_init,
): ):
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89, """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
and those tests will be skipped on unsupported hardware.""" and those tests will be skipped on unsupported hardware."""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment