Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

a3f8d5dd · zhuwenwen · 8d75f22e · f34eca5f · a3f8d5dd · a3f8d5dd
Commit a3f8d5dd authored Dec 17, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -29,7 +29,8 @@ from vllm.multimodal.utils import (
    encode_image_base64,
    encode_video_base64,
 )
-from vllm.tokenizers import MistralTokenizer, get_tokenizer
+from vllm.tokenizers import get_tokenizer
+from vllm.tokenizers.mistral import MistralTokenizer
 from vllm.utils.serial_utils import tensor2base64
 from ..models.registry import HF_EXAMPLE_MODELS
@@ -796,9 +797,13 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
            "content": "<|image_1|>\nWhat's in this image?",
        }
    ]
    assert mm_data is not None
    assert "image" in mm_data
-    assert mm_data["image"] is None
+    assert isinstance(mm_data["image"], list)
+    assert len(mm_data["image"]) == 1
+    assert mm_data["image"][0] is None
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
@@ -825,10 +830,11 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
    # Should have audio in mm_data as None (UUID provided)
    assert mm_data is not None
    assert "audio" in mm_data
-    assert mm_data["audio"] is None
+    assert isinstance(mm_data["audio"], list)
+    assert len(mm_data["audio"]) == 1
+    assert mm_data["audio"][0] is None
    # UUID should be recorded
-    assert mm_uuids is not None
-    assert "audio" in mm_uuids
    _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[uuid])
@@ -1121,10 +1127,105 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
    mm_data = await mm_future
    assert mm_data is not None
    assert "image" in mm_data
-    assert mm_data["image"] is None
+    assert isinstance(mm_data["image"], list)
+    assert len(mm_data["image"]) == 1
+    assert mm_data["image"][0] is None
    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[uuid])
+def test_parse_chat_messages_empty_dict_image_embeds(
+    phi3v_model_config_image_embeds,
+):
+    """An empty ``image_embeds`` dict must parse without raising."""
+    user_message = {
+        "role": "user",
+        "content": [
+            {"type": "image_embeds", "image_embeds": {}},
+            {"type": "text", "text": "What's in this image?"},
+        ],
+    }
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [user_message],
+        phi3v_model_config_image_embeds,
+        content_format="string",
+    )
+    # The image part becomes a placeholder line ahead of the text part.
+    expected_conversation = [
+        {
+            "role": "user",
+            "content": "<|image_1|>\nWhat's in this image?",
+        }
+    ]
+    assert conversation == expected_conversation
+    # The image entry must exist and be a dict holding no embeddings.
+    assert mm_data is not None
+    assert "image" in mm_data
+    image_data = mm_data["image"]
+    assert isinstance(image_data, dict)
+    assert len(image_data) == 0
+    # No UUID was supplied, so the single image slot is expected to be None.
+    _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None])
+def test_parse_chat_messages_multiple_dict_image_embeds(
+    phi3v_model_config_image_embeds,
+):
+    """Test that multiple dictionaries for image_embeds is handled without errors.
+    One ``image_embeds`` part is sent per image item; every part carries the
+    same two named embedding fields. The parsed ``mm_data["image"]`` is checked
+    to be a dict keyed by field name whose tensors have the shape of the full
+    per-field input tensors.
+    """
+    # Create two sample image embedding tensors; the leading dimension is the
+    # number of image items placed in the message.
+    batch_size = 2
+    image_embedding_1 = torch.randn(batch_size, 256, 1024)
+    image_embedding_2 = torch.randn(batch_size, 3)
+    # Field names used as keys in every image_embeds dict below.
+    embedding_fields = {"image_embedding_1", "image_embedding_2"}
+    conversation, mm_data, mm_uuids = parse_chat_messages(
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_embeds",
+                        "image_embeds": {
+                            "image_embedding_1": tensor2base64(p),
+                            "image_embedding_2": tensor2base64(i),
+                        },
+                    }
+                    for p, i in zip(image_embedding_1, image_embedding_2)
+                ]
+                + [
+                    {"type": "text", "text": "Describe these two images."},
+                ],
+            }
+        ],
+        phi3v_model_config_image_embeds,
+        content_format="string",
+    )
+    # Verify conversation structure
+    assert conversation == [
+        {
+            "role": "user",
+            "content": "<|image_1|>\n<|image_2|>\nDescribe these two images.",
+        }
+    ]
+    # Verify mm_data contains a dictionary of multi-embeddings
+    assert mm_data is not None
+    assert "image" in mm_data
+    assert isinstance(mm_data["image"], dict)
+    # The dict is keyed by embedding field, so its size is the number of
+    # fields, not the number of images. Comparing it to batch_size only
+    # worked because both happen to be 2; check the keys explicitly instead.
+    assert set(mm_data["image"]) == embedding_fields
+    # Verify each embedding has the correct shape
+    assert isinstance(mm_data["image"]["image_embedding_1"], torch.Tensor)
+    assert mm_data["image"]["image_embedding_1"].shape == image_embedding_1.shape
+    assert isinstance(mm_data["image"]["image_embedding_2"], torch.Tensor)
+    assert mm_data["image"]["image_embedding_2"].shape == image_embedding_2.shape
+    # Verify UUIDs (None since we didn't provide any): one slot per image item.
+    _assert_mm_uuids(mm_uuids, batch_size, expected_uuids=[None] * batch_size)
 @pytest.mark.asyncio
 async def test_parse_chat_messages_multiple_images_async(
    phi3v_model_config,

--- a/tests/kernels/attention/test_cpu_attn.py
+++ b/tests/kernels/attention/test_cpu_attn.py
@@ -7,7 +7,8 @@ import math
 import pytest
 import torch
-from vllm.platforms import current_platform
+from vllm.platforms import CpuArchEnum, current_platform
+from vllm.v1.attention.backends.cpu_attn import _get_attn_isa
 if not current_platform.is_cpu():
    pytest.skip("skipping CPU-only tests", allow_module_level=True)
@@ -36,6 +37,21 @@ SEQ_LENS = [  # (q_len, kv_len)
 ]
+def get_attn_isa(
+    block_size: int | None = None,
+    dtype: torch.dtype | None = None,
+):
+    """Return the attention ISA name used to parametrize the tests below.
+    When both ``block_size`` and ``dtype`` are supplied (and truthy) the choice
+    is delegated to ``_get_attn_isa``. Otherwise it is derived from the host:
+    ``"neon"`` on Arm, ``"amx"`` when torch reports AMX tile support, and
+    ``"vec"`` in every other case.
+    """
+    if block_size and dtype:
+        return _get_attn_isa(dtype, block_size)
+    # No explicit configuration given: probe the platform instead.
+    if current_platform.get_cpu_architecture() == CpuArchEnum.ARM:
+        return "neon"
+    if torch._C._cpu._is_amx_tile_supported():
+        return "amx"
+    return "vec"
 # rand number generation takes too much time, cache rand tensors
 @functools.lru_cache(maxsize=128, typed=False)
 def tensor_cache(
@@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16(
    )
+# Arm-only variant: the ISA is pinned to "neon" and block sizes 96 and 128 are
+# covered; soft cap, ALiBi and sink are all disabled for this test.
+@pytest.mark.parametrize("seq_lens", SEQ_LENS)
+@pytest.mark.parametrize("num_heads", NUM_HEADS)
+@pytest.mark.parametrize("head_size", HEAD_SIZES)
+@pytest.mark.parametrize("block_size", [96, 128])
+@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS)
+@pytest.mark.parametrize("dtype", QTYPES)
+@pytest.mark.parametrize("soft_cap", [None])
+@pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
+@pytest.mark.parametrize("use_alibi", [False])
+@pytest.mark.parametrize("use_sink", [False])
+@pytest.mark.parametrize("isa", ["neon"])
+@pytest.mark.skipif(
+    current_platform.get_cpu_architecture() != CpuArchEnum.ARM,
+    reason="Not an Arm CPU.",
+)
+def test_varlen_with_paged_kv_normal_neon(
+    seq_lens: list[tuple[int, int]],
+    num_heads: tuple[int, int],
+    head_size: int,
+    sliding_window: int | None,
+    dtype: torch.dtype,
+    block_size: int,
+    soft_cap: float | None,
+    num_blocks: int,
+    use_alibi: bool,
+    use_sink: bool,
+    isa: str,
+) -> None:
+    """Run the shared ``varlen_with_paged_kv`` check with the "neon" ISA.
+    Skipped unless the current platform reports an Arm CPU architecture. All
+    parametrized values are forwarded unchanged to ``varlen_with_paged_kv``.
+    """
+    varlen_with_paged_kv(
+        seq_lens=seq_lens,
+        num_heads=num_heads,
+        head_size=head_size,
+        sliding_window=sliding_window,
+        dtype=dtype,
+        block_size=block_size,
+        soft_cap=soft_cap,
+        num_blocks=num_blocks,
+        use_alibi=use_alibi,
+        use_sink=use_sink,
+        isa=isa,
+    )
 @pytest.mark.parametrize("seq_lens", SEQ_LENS)
 @pytest.mark.parametrize("num_heads", NUM_HEADS)
 @pytest.mark.parametrize("head_size", [96])
@@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16(
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("use_alibi", [False])
 @pytest.mark.parametrize("use_sink", [False])
-@pytest.mark.parametrize(
+@pytest.mark.parametrize("isa", [get_attn_isa()])
-    "isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
-)
 def test_varlen_with_paged_kv_softcap(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],
@@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap(
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("use_alibi", [True])
 @pytest.mark.parametrize("use_sink", [False])
-@pytest.mark.parametrize(
+@pytest.mark.parametrize("isa", [get_attn_isa()])
-    "isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
-)
 def test_varlen_with_paged_kv_alibi(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],
@@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi(
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("use_alibi", [False])
 @pytest.mark.parametrize("use_sink", [True])
-@pytest.mark.parametrize(
+@pytest.mark.parametrize("isa", [get_attn_isa()])
-    "isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
-)
 def test_varlen_with_paged_kv_sink(
    seq_lens: list[tuple[int, int]],
    num_heads: tuple[int, int],

--- a/tests/kernels/attention/test_cutlass_mla_decode.py
+++ b/tests/kernels/attention/test_cutlass_mla_decode.py
@@ -32,8 +32,8 @@ def cal_diff(
 CUTLASS_MLA_UNSUPPORTED_REASON = (
-    "Cutlass MLA Requires compute capability of 10 or above."
+    "Cutlass MLA Requires compute capability of 100 or above."
-    if not current_platform.is_device_capability(100)
+    if not current_platform.is_device_capability_family(100)
    else "Cutlass MLA is supported"
 )

--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -11,7 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
 from vllm.platforms import current_platform
 from vllm.utils.math_utils import round_up
-if not current_platform.is_device_capability(100):
+if not current_platform.is_device_capability_family(100):
    pytest.skip(
        "This TRTLLM kernel requires NVIDIA Blackwell.", allow_module_level=True
    )
@@ -443,7 +443,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
        output_trtllm = output_trtllm.reshape(-1, query.shape[1], query.shape[2])
    if q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP4_DTYPE:
-        rtol, atol = 1e-1, 2e-1
+        rtol, atol = 3e-1, 4e-1
    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == FP8_DTYPE:
        rtol, atol = 4e-2, 6e-2
    elif q_quant_dtype == FP8_DTYPE and o_quant_dtype == dtype:

--- a/tests/kernels/attention/test_triton_unified_attention.py
+++ b/tests/kernels/attention/test_triton_unified_attention.py
@@ -7,6 +7,7 @@ import torch
 from vllm.attention.ops.triton_unified_attention import unified_attention
 from vllm.platforms import current_platform
+from vllm.utils.math_utils import next_power_of_2
 NUM_HEADS = [(4, 4), (8, 2)]
 HEAD_SIZES = [128, 256]
@@ -22,6 +23,10 @@ QDTYPES = (
 # one value small enough to test the schema op check
 NUM_BLOCKS = [32768, 2048]
+# 0: use 2D kernel for decode
+# 8: use 3D kernel for decode
+SEQ_THRESHOLD_3D_VALUES = [0, 8]
 def ref_paged_attn(
    query: torch.Tensor,
@@ -92,6 +97,7 @@ def ref_paged_attn(
 @pytest.mark.parametrize("soft_cap", [None, 50.0])
 @pytest.mark.parametrize("num_blocks", NUM_BLOCKS)
 @pytest.mark.parametrize("q_dtype", QDTYPES)
+@pytest.mark.parametrize("seq_threshold_3D", SEQ_THRESHOLD_3D_VALUES)
 @torch.inference_mode()
 def test_triton_unified_attn(
    seq_lens: list[tuple[int, int]],
@@ -103,6 +109,7 @@ def test_triton_unified_attn(
    soft_cap: float | None,
    num_blocks: int,
    q_dtype: torch.dtype | None,
+    seq_threshold_3D: int,
 ) -> None:
    torch.set_default_device("cuda")
@@ -152,6 +159,21 @@ def test_triton_unified_attn(
        k_descale = torch.rand(scale_shape, dtype=torch.float32)
        v_descale = torch.rand(scale_shape, dtype=torch.float32)
+    num_par_softmax_segments = 16
+    head_size_padded = next_power_of_2(head_size)
+    softmax_segm_output = torch.empty(
+        (seq_threshold_3D, num_query_heads, num_par_softmax_segments, head_size_padded),
+        dtype=torch.float32,
+    )
+    softmax_segm_max = torch.empty(
+        (seq_threshold_3D, num_query_heads, num_par_softmax_segments),
+        dtype=torch.float32,
+    )
+    softmax_segm_expsum = torch.empty(
+        (seq_threshold_3D, num_query_heads, num_par_softmax_segments),
+        dtype=torch.float32,
+    )
    unified_attention(
        q=maybe_quantized_query,
        k=maybe_quantized_key_cache,
@@ -169,6 +191,11 @@ def test_triton_unified_attn(
        q_descale=q_descale,
        k_descale=k_descale,
        v_descale=v_descale,
+        seq_threshold_3D=seq_threshold_3D,
+        num_par_softmax_segments=num_par_softmax_segments,
+        softmax_segm_output=softmax_segm_output,
+        softmax_segm_max=softmax_segm_max,
+        softmax_segm_expsum=softmax_segm_expsum,
    )
    ref_output = ref_paged_attn(

--- a/tests/kernels/core/test_apply_rotary_emb.py
+++ b/tests/kernels/core/test_apply_rotary_emb.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Tests for ApplyRotaryEmb CustomOp dispatch behavior.
+This test ensures that RotaryEmbedding classes correctly call the appropriate
+ApplyRotaryEmb methods based on the calling context:
+1. RotaryEmbedding.forward_native() -> ApplyRotaryEmb.forward_native()
+2. RotaryEmbedding.forward_cuda() -> ApplyRotaryEmb.forward() (auto-dispatch)
+3. RotaryEmbedding.forward_hip() -> ApplyRotaryEmb.forward() (auto-dispatch)
+"""
+from dataclasses import dataclass
+import pytest
+import torch
+from vllm.config import (
+    CompilationConfig,
+    VllmConfig,
+    get_cached_compilation_config,
+    set_current_vllm_config,
+)
+from vllm.platforms import current_platform
+CUDA_DEVICES = ["cuda:0"]
+@dataclass
+class RotaryEmbeddingTestCase:
+    """Test case configuration for RotaryEmbedding dispatch tests."""
+    # Label for the case; used as the pytest ID and in assertion messages.
+    name: str
+    # Class to instantiate, called as rope_class(**rope_kwargs).
+    rope_class: type
+    # Constructor keyword arguments; "head_size" and "max_position_embeddings"
+    # are also read back when the test inputs are generated.
+    rope_kwargs: dict
+    # Name of the method looked up on the rope instance and invoked.
+    method_name: str  # forward_native, forward_cuda, forward
+    # Shape of the positions tensor; the last dimension is the token count.
+    positions_shape: tuple  # (num_tokens,) or (3, num_tokens) or (4, num_tokens)
+    expect_forward_native: bool  # Should call ApplyRotaryEmb.forward_native()
+    expect_forward: bool  # Should call ApplyRotaryEmb.forward()
+def get_test_cases() -> list[RotaryEmbeddingTestCase]:
+    """Build the dispatch scenarios, one per RotaryEmbedding method covered."""
+    # Function-scope imports: the rope classes are loaded only when the cases
+    # are built.
+    from vllm.model_executor.layers.rotary_embedding.ernie45_vl_rope import (
+        Ernie4_5_VLRotaryEmbedding,
+    )
+    from vllm.model_executor.layers.rotary_embedding.mrope import MRotaryEmbedding
+    from vllm.model_executor.layers.rotary_embedding.xdrope import XDRotaryEmbedding
+    # Constructor arguments shared by every case; each case merges its own
+    # extras into a fresh dict so no kwargs object is shared between cases.
+    shared_kwargs = dict(
+        head_size=128,
+        rotary_dim=128,
+        max_position_embeddings=4096,
+        base=10000,
+        is_neox_style=True,
+        dtype=torch.bfloat16,
+    )
+    # MRotaryEmbedding: native path with 2D (multimodal) positions.
+    mrope_native = RotaryEmbeddingTestCase(
+        name="MRotaryEmbedding.forward_native",
+        rope_class=MRotaryEmbedding,
+        rope_kwargs=shared_kwargs | {"mrope_section": [16, 24, 24]},
+        method_name="forward_native",
+        positions_shape=(3, 32),
+        expect_forward_native=True,
+        expect_forward=False,
+    )
+    # MRotaryEmbedding: CUDA path; 1D positions trigger the apply_rotary_emb path.
+    mrope_cuda_1d = RotaryEmbeddingTestCase(
+        name="MRotaryEmbedding.forward_cuda_1d",
+        rope_class=MRotaryEmbedding,
+        rope_kwargs=shared_kwargs | {"mrope_section": [16, 24, 24]},
+        method_name="forward_cuda",
+        positions_shape=(32,),
+        expect_forward_native=False,
+        expect_forward=True,
+    )
+    # XDRotaryEmbedding: positions carry 4 rows (P/W/H/T).
+    xdrope_forward = RotaryEmbeddingTestCase(
+        name="XDRotaryEmbedding.forward",
+        rope_class=XDRotaryEmbedding,
+        rope_kwargs=shared_kwargs
+        | {"scaling_alpha": 1.0, "xdrope_section": [16, 16, 16, 16]},
+        method_name="forward",
+        positions_shape=(4, 32),
+        expect_forward_native=False,
+        expect_forward=True,
+    )
+    # Ernie4_5_VLRotaryEmbedding: native path with 2D (multimodal) positions.
+    ernie_native = RotaryEmbeddingTestCase(
+        name="Ernie4_5_VLRotaryEmbedding.forward_native",
+        rope_class=Ernie4_5_VLRotaryEmbedding,
+        rope_kwargs=shared_kwargs | {"mrope_section": [22, 22, 20]},
+        method_name="forward_native",
+        positions_shape=(3, 32),
+        expect_forward_native=True,
+        expect_forward=False,
+    )
+    return [mrope_native, mrope_cuda_1d, xdrope_forward, ernie_native]
+def run_dispatch_test(
+    test_case: RotaryEmbeddingTestCase,
+    device: str,
+):
+    """Run a dispatch test for a RotaryEmbedding class.
+    Builds the rope module described by ``test_case`` on ``device``, replaces
+    ``forward_native`` and ``forward`` on its ``apply_rotary_emb`` attribute
+    with call-tracking wrappers, invokes the method under test once on random
+    inputs, and asserts that the recorded calls match the ``expect_*`` flags.
+    The original methods are restored afterwards, even if an assertion fails.
+    """
+    # Config intended to enable the apply_rotary_emb custom op (the setup
+    # assertion below checks this for the forward_native cases).
+    vllm_config = VllmConfig(
+        compilation_config=CompilationConfig(custom_ops=["all", "+apply_rotary_emb"])
+    )
+    # Drop any cached compilation config before entering the config context;
+    # presumably so the config above is the one picked up (TODO confirm).
+    get_cached_compilation_config.cache_clear()
+    with set_current_vllm_config(vllm_config):
+        rope = test_case.rope_class(**test_case.rope_kwargs).to(device=device)
+        apply_rotary_emb = rope.apply_rotary_emb
+        # Verify custom op is enabled
+        # (only checked for cases that expect the native path to be used).
+        if test_case.expect_forward_native:
+            assert (
+                apply_rotary_emb._forward_method != apply_rotary_emb.forward_native
+            ), "Test setup error: ApplyRotaryEmb custom op should be enabled"
+        # Setup call tracking
+        call_tracker = {"forward_native_called": False, "forward_called": False}
+        # Keep the originals so the wrappers can delegate and so they can be
+        # restored in the finally block.
+        original_forward_native = apply_rotary_emb.forward_native
+        original_forward = apply_rotary_emb.forward
+        def tracked_forward_native(*args, **kwargs):
+            call_tracker["forward_native_called"] = True
+            return original_forward_native(*args, **kwargs)
+        def tracked_forward(*args, **kwargs):
+            call_tracker["forward_called"] = True
+            return original_forward(*args, **kwargs)
+        # Patch the wrappers onto this instance only.
+        apply_rotary_emb.forward_native = tracked_forward_native
+        apply_rotary_emb.forward = tracked_forward
+        try:
+            # The token count is always the last dimension of the positions shape.
+            num_tokens = test_case.positions_shape[-1]
+            num_q_heads = 8
+            num_kv_heads = 2
+            head_size = test_case.rope_kwargs["head_size"]
+            max_position = test_case.rope_kwargs["max_position_embeddings"]
+            # Random positions drawn from [0, max_position // 4).
+            positions = torch.randint(
+                0, max_position // 4, test_case.positions_shape, device=device
+            )
+            # Query and key are (num_tokens, num_heads * head_size) bfloat16 tensors.
+            query = torch.randn(
+                num_tokens, num_q_heads * head_size, dtype=torch.bfloat16, device=device
+            )
+            key = torch.randn(
+                num_tokens,
+                num_kv_heads * head_size,
+                dtype=torch.bfloat16,
+                device=device,
+            )
+            # Call the method under test
+            # (on clones, so the tensors created above are not the ones passed in).
+            method = getattr(rope, test_case.method_name)
+            method(positions, query.clone(), key.clone())
+            # Verify expectations
+            if test_case.expect_forward_native:
+                assert call_tracker["forward_native_called"], (
+                    f"{test_case.name} should call ApplyRotaryEmb.forward_native()"
+                )
+            if not test_case.expect_forward:
+                assert not call_tracker["forward_called"], (
+                    f"{test_case.name} should NOT call ApplyRotaryEmb.forward(). "
+                    "Bug: when +apply_rotary_emb is enabled, forward_native() "
+                    "incorrectly dispatches to CUDA/HIP kernels."
+                )
+            if test_case.expect_forward:
+                assert call_tracker["forward_called"], (
+                    f"{test_case.name} should call ApplyRotaryEmb.forward()"
+                )
+        finally:
+            # Always undo the patching so the rope instance is left unmodified.
+            apply_rotary_emb.forward_native = original_forward_native
+            apply_rotary_emb.forward = original_forward
+@pytest.mark.skipif(
+    not current_platform.is_cuda_alike(),
+    reason="Skipping CUDA/ROCm only tests.",
+)
+@pytest.mark.parametrize("test_case", get_test_cases(), ids=lambda case: case.name)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_rotary_embedding_dispatch(
+    test_case: RotaryEmbeddingTestCase,
+    device: str,
+):
+    """
+    Check which ApplyRotaryEmb entry point each RotaryEmbedding method reaches.
+    - cases flagged ``expect_forward_native`` must hit ApplyRotaryEmb.forward_native()
+    - cases flagged ``expect_forward`` must hit ApplyRotaryEmb.forward()
+    """
+    run_dispatch_test(test_case=test_case, device=device)
--- a/tests/kernels/core/test_mrope.py
+++ b/tests/kernels/core/test_mrope.py
@@ -116,7 +116,6 @@ def test_mrope(
    mrope_helper_class = get_rope(
        head_size=head_dim,
-        rotary_dim=head_dim,
        max_position=max_position,
        is_neox_style=is_neox_style,
        rope_parameters=config.rope_parameters,
@@ -185,7 +184,6 @@ def test_mrope_torch_compile_tracing(
    mrope_helper_class = get_rope(
        head_size=head_dim,
-        rotary_dim=head_dim,
        max_position=max_position,
        is_neox_style=is_neox_style,
        rope_parameters=config.rope_parameters,

--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -83,8 +83,12 @@ def test_rotary_embedding(
    torch.set_default_device(device)
    if rotary_dim is None:
        rotary_dim = head_size
-    rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+    rope_parameters = {
-    rope = get_rope(head_size, rotary_dim, max_position, is_neox_style, rope_parameters)
+        "rope_type": "default",
+        "rope_theta": rope_theta,
+        "partial_rotary_factor": rotary_dim / head_size,
+    }
+    rope = get_rope(head_size, max_position, is_neox_style, rope_parameters)
    rope = rope.to(dtype=dtype, device=torch.get_default_device())
    positions = torch.randint(0, max_position, (batch_size, seq_len))
@@ -150,9 +154,9 @@ def test_rope_module_cache():
        if rotary_dim is None:
            rotary_dim = head_size
        rope_parameters["rope_theta"] = rope_theta
+        rope_parameters["partial_rotary_factor"] = rotary_dim / head_size
        rope = get_rope(
            head_size,
-            rotary_dim,
            max_position,
            is_neox_style,
            rope_parameters,
@@ -177,9 +181,9 @@ def test_rope_module_cache():
        if rotary_dim is None:
            rotary_dim = head_size
        rope_parameters["rope_theta"] = rope_theta
+        rope_parameters["partial_rotary_factor"] = rotary_dim / head_size
        rope = get_rope(
            head_size,
-            rotary_dim,
            max_position,
            is_neox_style,
            rope_parameters,

--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -594,7 +594,8 @@ def make_modular_kernel(
    )
    modular_kernel = mk.FusedMoEModularKernel(
-        prepare_finalize=prepare_finalize, fused_experts=fused_experts
+        prepare_finalize=prepare_finalize,
+        fused_experts=fused_experts,
    )
    return modular_kernel

--- a/tests/kernels/moe/test_batched_deepgemm.py
+++ b/tests/kernels/moe/test_batched_deepgemm.py
@@ -27,7 +27,7 @@ BLOCK_SIZE = [128, 128]
 @pytest.mark.parametrize("N", [512, 1024])  # intermediate dim per expert
 @pytest.mark.parametrize("topk", [2, 4])
 def test_batched_deepgemm_vs_triton(
-    E: int, T: int, K: int, N: int, topk: int, monkeypatch
+    E: int, T: int, K: int, N: int, topk: int, monkeypatch, workspace_init
 ):
    """Compare BatchedDeepGemmExperts to BatchedTritonExperts."""

--- a/tests/kernels/moe/test_batched_moe.py
+++ b/tests/kernels/moe/test_batched_moe.py
@@ -248,6 +248,7 @@ def test_fused_moe_batched_experts(
    per_act_token_quant: bool,
    block_shape: list[int] | None,
    input_scales: bool,
+    workspace_init,
 ):
    """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
    and those tests will be skipped on unsupported hardware."""

--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
@@ -137,7 +137,7 @@ def setup_cuda():
 @pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
 def test_w8a8_block_fp8_fused_moe(
-    M, N, K, E, topk, block_size, dtype, seed, monkeypatch
+    M, N, K, E, topk, block_size, dtype, seed, monkeypatch, workspace_init
 ):
    if topk > E:
        pytest.skip(f"Skipping test; topk={topk} > E={E}")

--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -274,6 +274,7 @@ def test_cutlass_moe_8_bit_no_graph(
    per_act_token: bool,
    per_out_ch: bool,
    monkeypatch,
+    workspace_init,
    ep_size: int | None = None,
 ):
    current_platform.seed_everything(7)
@@ -329,6 +330,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
    per_act_token: bool,
    per_out_ch: bool,
    monkeypatch,
+    workspace_init,
 ):
    current_platform.seed_everything(7)
    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")
@@ -385,9 +387,19 @@ def test_cutlass_moe_8_bit_EP(
    per_out_channel: bool,
    ep_size: int,
    monkeypatch,
+    workspace_init,
 ):
    test_cutlass_moe_8_bit_no_graph(
-        m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size
+        m,
+        n,
+        k,
+        e,
+        topk,
+        per_act_token,
+        per_out_channel,
+        monkeypatch,
+        workspace_init,
+        ep_size,
    )
@@ -419,9 +431,19 @@ def test_cutlass_moe_8_bit_EP_large(
    per_out_channel: bool,
    ep_size: int,
    monkeypatch,
+    workspace_init,
 ):
    test_cutlass_moe_8_bit_no_graph(
-        m, n, k, e, topk, per_act_token, per_out_channel, monkeypatch, ep_size
+        m,
+        n,
+        k,
+        e,
+        topk,
+        per_act_token,
+        per_out_channel,
+        monkeypatch,
+        workspace_init,
+        ep_size,
    )
@@ -445,6 +467,7 @@ def test_run_cutlass_moe_fp8(
    per_act_token: bool,
    per_out_channel: bool,
    ep_size: int,
+    workspace_init,
 ):
    current_platform.seed_everything(7)
    with set_current_vllm_config(vllm_config):

--- a/tests/kernels/moe/test_deepep_deepgemm_moe.py
+++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py
@@ -29,6 +29,7 @@ from vllm.utils.deep_gemm import (
    is_deep_gemm_supported,
 )
 from vllm.utils.import_utils import has_deep_ep, has_deep_gemm
+from vllm.v1.worker.workspace import init_workspace_manager
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
@@ -363,6 +364,9 @@ def _test_deepep_deepgemm_moe(
    w1_scale: torch.Tensor,
    w2_scale: torch.Tensor,
 ):
+    device = torch.device(f"cuda:{pgi.local_rank}")
+    init_workspace_manager(device)
    current_platform.seed_everything(pgi.rank)
    w1 = w1.to(device=torch.cuda.current_device())
@@ -445,6 +449,7 @@ def test_ht_deepep_deepgemm_moe(
    topk: int,
    world_dp_size: tuple[int, int],
    disable_deepgemm_ue8m0,
+    workspace_init,
 ):
    """
    Tests for High-Throughput DeepEP + DeepGemm integration.
@@ -518,6 +523,7 @@ def test_ll_deepep_deepgemm_moe(
    block_size: list[int],
    world_dp_size: tuple[int, int],
    disable_deepgemm_ue8m0,
+    workspace_init,
 ):
    """
    Tests for Low-Latency DeepEP + DeepGemm integration.

--- a/tests/kernels/moe/test_deepep_moe.py
+++ b/tests/kernels/moe/test_deepep_moe.py
@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_deep_ep
+from vllm.v1.worker.workspace import init_workspace_manager
 from ...utils import multi_gpu_test
 from .parallel_utils import ProcessGroupInfo, parallel_launch
@@ -342,6 +343,9 @@ def _deep_ep_moe(
    use_fp8_dispatch: bool,
    per_act_token_quant: bool,
 ):
+    device = torch.device(f"cuda:{pgi.local_rank}")
+    init_workspace_manager(device)
    if not low_latency_mode:
        assert not use_fp8_dispatch, (
            "FP8 dispatch interface is available only in low-latency mode"
@@ -437,6 +441,7 @@ def test_deep_ep_moe(
    topk: int,
    world_dp_size: tuple[int, int],
    per_act_token_quant: bool,
+    workspace_init,
 ):
    low_latency_mode = False
    use_fp8_dispatch = False
@@ -492,6 +497,7 @@ def test_low_latency_deep_ep_moe(
    topk: int,
    world_dp_size: tuple[int, int],
    use_fp8_dispatch: bool,
+    workspace_init,
 ):
    low_latency_mode = True

--- a/tests/kernels/moe/test_deepgemm.py
+++ b/tests/kernels/moe/test_deepgemm.py
@@ -143,7 +143,7 @@ NUM_EXPERTS = [32]
 @pytest.mark.parametrize("topk", TOPKS)
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS)
 @pytest.mark.skipif(not is_deep_gemm_supported(), reason="Requires deep_gemm kernels")
-def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch):
+def test_deepgemm_vs_triton(m, n, k, topk, num_experts, monkeypatch, workspace_init):
    with monkeypatch.context() as mp:
        mp.setenv("VLLM_USE_DEEP_GEMM", "1")

--- a/tests/kernels/moe/test_flashinfer.py
+++ b/tests/kernels/moe/test_flashinfer.py
@@ -5,6 +5,7 @@ from dataclasses import dataclass
 import pytest
 import torch
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEQuantConfig,
@@ -107,6 +108,19 @@ class TestData:
        layer.w2_input_scale = a2_scale
        layer.w13_weight_scale = w13_weight_scale
        layer.w2_weight_scale = w2_weight_scale
+        # Setup dummy config.
+        layer.moe_parallel_config = mk.FusedMoEParallelConfig(
+            tp_size=1,
+            pcp_size=1,
+            dp_size=1,
+            ep_size=1,
+            tp_rank=1,
+            pcp_rank=1,
+            dp_rank=1,
+            ep_rank=1,
+            use_ep=False,
+            all2all_backend="naive",
+        )
        register_moe_scaling_factors(layer)
@@ -206,6 +220,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
    topk: int,
    activation: str,
    monkeypatch,
+    workspace_init,
 ):
    current_platform.seed_everything(7)
    monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192")

--- a/tests/kernels/moe/test_flashinfer_moe.py
+++ b/tests/kernels/moe/test_flashinfer_moe.py
@@ -51,7 +51,14 @@ MNK_FACTORS = [
 @pytest.mark.parametrize("activation", ["silu_and_mul", "relu2"])
 @torch.inference_mode()
 def test_flashinfer_fp4_moe_no_graph(
-    m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, activation: str
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    activation: str,
+    workspace_init,
 ):
    current_platform.seed_everything(7)
    with set_current_vllm_config(

--- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py
+++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py
@@ -269,7 +269,7 @@ class Case:
 )
 @pytest.mark.parametrize("num_token", [2])
 @pytest.mark.parametrize("tp", [1, 2, 4, 8])
-def test_equiv(num_token, a_dtype, w_dtype, tp):
+def test_equiv(num_token, a_dtype, w_dtype, tp, workspace_init):
    from triton_kernels.tensor_details import layout
    if not hasattr(layout, "make_default_matmul_mxfp4_w_layout"):

--- a/tests/kernels/moe/test_modular_kernel_combinations.py
+++ b/tests/kernels/moe/test_modular_kernel_combinations.py
@@ -16,6 +16,7 @@ from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe
 from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx
 from vllm.utils.torch_utils import cuda_device_count_stateless
+from vllm.v1.worker.workspace import init_workspace_manager
 from .modular_kernel_tools.common import (
    Config,
@@ -77,6 +78,10 @@ def rank_worker(
    weights: WeightTensors,
    verbose: bool,
 ):
+    # Initialize workspace manager in child process
+    device = torch.device(f"cuda:{pgi.local_rank}")
+    init_workspace_manager(device)
    current_platform.seed_everything(pgi.rank)
    # sanity check
@@ -300,6 +305,7 @@ def test_modular_kernel_combinations_singlegpu(
    chunk_size: int | None,
    world_size: int,
    pytestconfig,
+    workspace_init,
 ):
    """Note: float8_e4m3fn is not supported on CUDA architecture < 89,
    and those tests will be skipped on unsupported hardware."""