Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori

a3f8d5dd · zhuwenwen · 8d75f22e · f34eca5f · a3f8d5dd · a3f8d5dd
Commit a3f8d5dd authored Dec 17, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -77,7 +77,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    return parser.parse_args()

--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -158,7 +158,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )

--- a/examples/offline_inference/qwen3_omni/only_thinker.py
+++ b/examples/offline_inference/qwen3_omni/only_thinker.py
@@ -158,7 +158,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -118,6 +118,32 @@ def run_bee(questions: list[str], modality: str) -> ModelRequestData:
    )
+def run_bagel(questions: list[str], modality: str) -> ModelRequestData:
+    """Prepare engine arguments and prompts for ByteDance-Seed/BAGEL-7B-MoT.
+
+    Only the "image" modality is accepted. Every question is wrapped in a
+    single-turn chat prompt with one image placeholder ahead of the text.
+    """
+    assert modality == "image"
+    bagel_engine_args = EngineArgs(
+        model="ByteDance-Seed/BAGEL-7B-MoT",
+        trust_remote_code=True,
+        max_model_len=8192,
+        max_num_seqs=2,
+        limit_mm_per_prompt={modality: 1},
+    )
+    chat_prompts = []
+    for text in questions:
+        chat_prompts.append(
+            f"<|im_start|>user\n<|image_pad|>\n{text}<|im_end|>\n"
+            f"<|im_start|>assistant\n"
+        )
+    return ModelRequestData(engine_args=bagel_engine_args, prompts=chat_prompts)
 # BLIP-2
 def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -1832,6 +1858,7 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
 model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
+    "bagel": run_bagel,
    "bee": run_bee,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
@@ -2031,7 +2058,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )

--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -1382,7 +1382,7 @@ def run_generate(
    model,
    question: str,
    image_urls: list[str],
-    seed: int | None,
+    seed: int,
    tensor_parallel_size: int | None,
 ):
    req_data = model_example_map[model](question, image_urls)
@@ -1416,7 +1416,7 @@ def run_chat(
    model: str,
    question: str,
    image_urls: list[str],
-    seed: int | None,
+    seed: int,
    tensor_parallel_size: int | None,
 ):
    req_data = model_example_map[model](question, image_urls)
@@ -1494,7 +1494,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    parser.add_argument(

--- a/examples/online_serving/run_cluster.sh
+++ b/examples/online_serving/run_cluster.sh
@@ -21,7 +21,7 @@
 #         --worker \
 #         /abs/path/to/huggingface/cache \
 #         -e VLLM_HOST_IP=<worker_node_ip>
-# 
+#
 # Each worker requires a unique VLLM_HOST_IP value.
 # Keep each terminal session open. Closing a session stops the associated Ray
 # node and thereby shuts down the entire cluster.
@@ -59,6 +59,34 @@ if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
    exit 1
 fi
+# Extract VLLM_HOST_IP from ADDITIONAL_ARGS (e.g. "-e VLLM_HOST_IP=...").
+VLLM_HOST_IP=""
+for ((i = 0; i < ${#ADDITIONAL_ARGS[@]}; i++)); do
+    arg="${ADDITIONAL_ARGS[$i]}"
+    case "${arg}" in
+        -e)
+            next="${ADDITIONAL_ARGS[$((i + 1))]:-}"
+            if [[ "${next}" == VLLM_HOST_IP=* ]]; then
+                VLLM_HOST_IP="${next#VLLM_HOST_IP=}"
+                break
+            fi
+            ;;
+        -eVLLM_HOST_IP=* | VLLM_HOST_IP=*)
+            VLLM_HOST_IP="${arg#*=}"
+            break
+            ;;
+    esac
+done
+# For the head node, HEAD_NODE_ADDRESS and VLLM_HOST_IP should be consistent.
+if [[ "${NODE_TYPE}" == "--head" && -n "${VLLM_HOST_IP}" ]]; then
+    if [[ "${VLLM_HOST_IP}" != "${HEAD_NODE_ADDRESS}" ]]; then
+        echo "Warning: VLLM_HOST_IP (${VLLM_HOST_IP}) differs from head_node_ip (${HEAD_NODE_ADDRESS})."
+        echo "Using VLLM_HOST_IP as the head node address."
+        HEAD_NODE_ADDRESS="${VLLM_HOST_IP}"
+    fi
+fi
 # Generate a unique container name with random suffix.
 # Docker container names must be unique on each host.
 # The random suffix allows multiple Ray containers to run simultaneously on the same machine,
@@ -74,36 +102,17 @@ cleanup() {
 trap cleanup EXIT
 # Build the Ray start command based on the node role.
-# The head node manages the cluster and accepts connections on port 6379, 
+# The head node manages the cluster and accepts connections on port 6379,
 # while workers connect to the head's address.
 RAY_START_CMD="ray start --block"
 if [ "${NODE_TYPE}" == "--head" ]; then
-    RAY_START_CMD+=" --head --port=6379"
+    RAY_START_CMD+=" --head --node-ip-address=${HEAD_NODE_ADDRESS} --port=6379"
 else
-    RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
-fi
-# Parse VLLM_HOST_IP from additional args if present.
+    RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
-# This is needed for multi-NIC configurations where Ray needs explicit IP bindings.
+    if [ -n "${VLLM_HOST_IP}" ]; then
-VLLM_HOST_IP=""
+        RAY_START_CMD+=" --node-ip-address=${VLLM_HOST_IP}"
-for arg in "${ADDITIONAL_ARGS[@]}"; do
-    if [[ $arg == "-e" ]]; then
-        continue
-    fi
-    if [[ $arg == VLLM_HOST_IP=* ]]; then
-        VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
-        break
    fi
-done
-# Build Ray IP environment variables if VLLM_HOST_IP is set.
-# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
-RAY_IP_VARS=()
-if [ -n "${VLLM_HOST_IP}" ]; then
-    RAY_IP_VARS=(
-        -e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
-        -e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
-    )
 fi
 # Launch the container with the assembled parameters.
@@ -118,6 +127,5 @@ docker run \
    --shm-size 10.24g \
    --gpus all \
    -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
-    "${RAY_IP_VARS[@]}" \
    "${ADDITIONAL_ARGS[@]}" \
    "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
--- a/examples/online_serving/structured_outputs/structured_outputs.py
+++ b/examples/online_serving/structured_outputs/structured_outputs.py
@@ -112,7 +112,7 @@ PARAMS: dict[ConstraintsFormat, dict[str, Any]] = {
        "messages": [
            {
                "role": "user",
-                "content": "Generate an SQL query to show the 'username' and 'email'from the 'users' table.",
+                "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.",
            }
        ],
        "extra_body": {

--- a/examples/pooling/plugin/prithvi_geospatial_mae_client.py
+++ b/examples/pooling/plugin/prithvi_geospatial_mae_client.py
@@ -16,7 +16,7 @@ import requests
 # - start vllm in serving mode with the below args
 #   --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
 #   --model-impl terratorch
-#   --task embed --trust-remote-code
+#   --trust-remote-code
 #   --skip-tokenizer-init --enforce-eager
 #   --io-processor-plugin terratorch_segmentation
 #   --enable-mm-embeds

--- a/examples/pooling/pooling/vision_language_pooling.py
+++ b/examples/pooling/pooling/vision_language_pooling.py
@@ -305,7 +305,7 @@ def get_query(modality: QueryModality):
    raise ValueError(msg)
-def run_encode(model: str, modality: QueryModality, seed: int | None):
+def run_encode(model: str, modality: QueryModality, seed: int):
    query = get_query(modality)
    req_data = model_example_map[model](query)
@@ -335,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: int | None):
        print("-" * 50)
-def run_score(model: str, modality: QueryModality, seed: int | None):
+def run_score(model: str, modality: QueryModality, seed: int):
    query = get_query(modality)
    req_data = model_example_map[model](query)
@@ -390,7 +390,7 @@ def parse_args():
    parser.add_argument(
        "--seed",
        type=int,
-        default=None,
+        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    return parser.parse_args()

--- a/examples/pooling/score/qwen3_reranker.py
+++ b/examples/pooling/score/qwen3_reranker.py
--- a/examples/pooling/score/jinaai_rerank_client.py
+++ b/examples/pooling/score/jinaai_rerank_client.py
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -51,6 +51,7 @@ hooks:
  - docs/mkdocs/hooks/remove_announcement.py
  - docs/mkdocs/hooks/generate_examples.py
  - docs/mkdocs/hooks/generate_argparse.py
+  - docs/mkdocs/hooks/generate_metrics.py
  - docs/mkdocs/hooks/url_schemes.py
 plugins:

--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -50,4 +50,5 @@ ijson # Required for mistral streaming tool parser
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
 anthropic == 0.71.0
 model-hosting-container-standards >= 0.1.9, < 1.0.0
\ No newline at end of file
+mcp
\ No newline at end of file
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -75,7 +75,7 @@ torchgeo==0.7.0
 mteb==2.1.2
 # Data processing
-xgrammar==0.1.27
+xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84
 # Test async scheduling
 # Utilities

--- a/tests/benchmarks/test_param_sweep.py
+++ b/tests/benchmarks/test_param_sweep.py
@@ -23,14 +23,6 @@ class TestParameterSweepItem:
                {"compilation_config.use_inductor_graph_partition": True},
                "--compilation-config.use_inductor_graph_partition=true",
            ),
-            (
-                {"compilation_config.use_inductor": False},
-                "--compilation-config.use_inductor=false",
-            ),
-            (
-                {"compilation_config.use_inductor": True},
-                "--compilation-config.use_inductor=true",
-            ),
        ],
    )
    def test_nested_boolean_params(self, input_dict, expected):

--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -20,13 +20,14 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer
 from ...utils import flat_product, multi_gpu_test
-is_blackwell = lambda: current_platform.is_device_capability(100)
+is_blackwell = lambda: current_platform.is_device_capability_family(100)
 """Are we running on Blackwell, a lot of tests depend on it"""
 class Matches(NamedTuple):
    attention_fusion: int = 0
    allreduce_fusion: int = 0
+    rms_quant_norm_fusion: int = 0
    sequence_parallel: int = 0
    async_tp: int = 0
@@ -40,6 +41,7 @@ class ModelBackendTestCase(NamedTuple):
 MODELS_FP8: list[ModelBackendTestCase] = []
 MODELS_FP4: list[ModelBackendTestCase] = []
+MODELS_GROUP_FP8: list[ModelBackendTestCase] = []
 MODELS: list[ModelBackendTestCase] = []  # tp-only
 if current_platform.is_cuda():
@@ -138,6 +140,17 @@ elif current_platform.is_rocm():
 CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
+def has_cuda_graph_wrapper_metadata() -> bool:
+    """Report whether torch._inductor.utils exposes CUDAGraphWrapperMetadata.
+
+    Returns:
+        True when the module imports and has the attribute; False when the
+        attribute is absent or the module cannot be imported at all.
+    """
+    from importlib import import_module
+
+    try:
+        module = import_module("torch._inductor.utils")
+    except ImportError:
+        # This helper runs at collection time inside a skipif marker, so an
+        # unavailable module must yield a skip rather than a collection error.
+        return False
+    return hasattr(module, "CUDAGraphWrapperMetadata")
 @pytest.mark.parametrize(
    "model_name, model_kwargs, backend, matches, custom_ops",
    # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
@@ -145,7 +158,20 @@ CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
    # quant_fp4 only has the custom impl
    + list(flat_product(MODELS_FP4, [""])),
 )
-@pytest.mark.parametrize("inductor_graph_partition", [True, False])
+@pytest.mark.parametrize(
+    "inductor_graph_partition",
+    [
+        pytest.param(
+            True,
+            # The partitioned case is skipped unless this torch build
+            # provides torch._inductor.utils.CUDAGraphWrapperMetadata.
+            marks=pytest.mark.skipif(
+                not has_cuda_graph_wrapper_metadata(),
+                # Trailing space matters: adjacent literals are concatenated.
+                reason="This test requires "
+                "torch._inductor.utils.CUDAGraphWrapperMetadata to run",
+            ),
+        ),
+        False,
+    ],
+)
 def test_attn_quant(
    model_name: str,
    model_kwargs: dict[str, Any],
@@ -474,3 +500,81 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
    compilation_config.compile_ranges_split_points = (
        llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
    )
+if current_platform.is_cuda():
+    MODELS_GROUP_FP8 = [
+        ModelBackendTestCase(
+            model_name="Qwen/Qwen3-30B-A3B-FP8",
+            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
+            backend=AttentionBackendEnum.TRITON_ATTN,
+            matches=Matches(
+                rms_quant_norm_fusion=48,
+            ),
+        ),
+    ]
+CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"]
+@pytest.mark.parametrize(
+    "model_name, model_kwargs, backend, matches, custom_ops",
+    # Test rms norm+group quant_fp8 fusion
+    list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)),
+)
+@pytest.mark.parametrize("inductor_graph_partition", [True, False])
+# TODO: remove skip after we fix the fusion thoroughly
+@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell")
+def test_rms_group_quant(
+    model_name: str,
+    model_kwargs: dict[str, Any],
+    backend: AttentionBackendEnum,
+    matches: Matches,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    caplog_mp_spawn,
+    monkeypatch,
+):
+    """Check that RMS norm + group FP8 quant fusion replaces the expected patterns.
+
+    The model is run with ``fuse_norm_quant`` enabled while debug logs are
+    captured; the single "Replaced N patterns" line logged from fusion.py
+    must report ``matches.rms_quant_norm_fusion`` replacements.
+    """
+    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition requires torch>=2.9")
+    # custom_ops arrives as a comma-separated string; an empty string means none.
+    custom_ops_list = custom_ops.split(",") if custom_ops else []
+    # Choose the CUDA graph mode and splitting ops that go with the
+    # graph-partition setting under test.
+    if inductor_graph_partition:
+        mode = CUDAGraphMode.FULL_AND_PIECEWISE
+        splitting_ops: list[str] | None = None
+    else:
+        mode = CUDAGraphMode.FULL_DECODE_ONLY
+        splitting_ops = []
+    # Disable the compile cache to make sure custom passes run.
+    # Otherwise, we can't verify fusion happened through the logs.
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+    # To capture subprocess logs, we need to know whether spawn or fork is used.
+    # Force spawn as it is more general.
+    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+    monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+    compilation_config = CompilationConfig(
+        # Testing properties
+        custom_ops=custom_ops_list,
+        use_inductor_graph_partition=inductor_graph_partition,
+        cudagraph_mode=mode,
+        splitting_ops=splitting_ops,
+        # Common
+        mode=CompilationMode.VLLM_COMPILE,
+        pass_config=PassConfig(eliminate_noops=True, fuse_norm_quant=True),
+        # Inductor caches custom passes by default as well via uuid
+        inductor_compile_config={"force_disable_caches": True},
+    )
+    with caplog_mp_spawn(logging.DEBUG) as log_holder:
+        run_model(compilation_config, model_name, **model_kwargs)
+    # Collect the replacement counts from the "Replaced N patterns" lines
+    # that fusion.py writes to the captured log.
+    log_matches = re.findall(
+        r"\[fusion.py:\d+] Replaced (\d+) patterns",
+        log_holder.text,
+    )
+    # Exactly one such line is expected; dump the full log on failure.
+    assert len(log_matches) == 1, log_holder.text
+    assert int(log_matches[0]) == matches.rms_quant_norm_fusion
--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
-import logging
 from contextlib import nullcontext
 from unittest.mock import patch
@@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass
 from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig
 from vllm.config.compilation import CompilationMode, PassConfig
 from vllm.engine.arg_utils import EngineArgs
-from vllm.logger import _print_warning_once
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import _is_torch_equal_or_newer
@@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor():
        ),
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
-            pass_config={"enable_attn_fusion": True, "enable_noop": True},
+            pass_config={"fuse_attn_quant": True, "eliminate_noops": True},
            custom_ops=["+quant_fp8"],
            cudagraph_mode=CUDAGraphMode.PIECEWISE,
        ),
@@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init(
            vllm_config.compilation_config.max_cudagraph_capture_size
            == expected_max_size
        )
-def test_pass_config_deprecation(caplog_vllm):
-    caplog_vllm.set_level(logging.WARNING)
-    # Clear cache to ensure warnings are re-issued
-    _print_warning_once.cache_clear()
-    # Test enable_fusion -> fuse_norm_quant, fuse_act_quant
-    caplog_vllm.clear()
-    config = PassConfig(enable_fusion=True)
-    assert "enable_fusion is deprecated" in caplog_vllm.text
-    assert config.fuse_norm_quant is True
-    assert config.fuse_act_quant is True
-    assert config.enable_fusion is True
-    # Test enable_attn_fusion -> fuse_attn_quant
-    caplog_vllm.clear()
-    config = PassConfig(enable_attn_fusion=True)
-    assert "enable_attn_fusion is deprecated" in caplog_vllm.text
-    assert config.fuse_attn_quant is True
-    assert config.enable_attn_fusion is True
-    # Test enable_noop -> eliminate_noops
-    caplog_vllm.clear()
-    config = PassConfig(enable_noop=True)
-    assert "enable_noop is deprecated" in caplog_vllm.text
-    assert config.eliminate_noops is True
-    assert config.enable_noop is True
-    # Test enable_sequence_parallelism -> enable_sp
-    caplog_vllm.clear()
-    config = PassConfig(enable_sequence_parallelism=True)
-    assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text
-    assert config.enable_sp is True
-    assert config.enable_sequence_parallelism is True
-    # Test enable_async_tp -> fuse_gemm_comms
-    caplog_vllm.clear()
-    config = PassConfig(enable_async_tp=True)
-    assert "enable_async_tp is deprecated" in caplog_vllm.text
-    assert config.fuse_gemm_comms is True
-    assert config.enable_async_tp is True
-    # Test enable_fi_allreduce_fusion -> fuse_allreduce_rms
-    caplog_vllm.clear()
-    config = PassConfig(enable_fi_allreduce_fusion=True)
-    assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text
-    assert config.fuse_allreduce_rms is True
-    assert config.enable_fi_allreduce_fusion is True
-    # Test hash consistency
-    config_old = PassConfig(enable_fusion=True)
-    config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True)
-    assert config_old.compute_hash() == config_new.compute_hash()
-    config_old = PassConfig(enable_async_tp=True)
-    config_new = PassConfig(fuse_gemm_comms=True)
-    assert config_old.compute_hash() == config_new.compute_hash()
--- a/tests/compile/test_dynamic_shapes_compilation.py
+++ b/tests/compile/test_dynamic_shapes_compilation.py
@@ -36,7 +36,7 @@ def get_test_models():
        DynamicShapesType.BACKED_SIZE_OBLIVIOUS,
    ],
 )
-@pytest.mark.parametrize("use_aot_compile", ["0"])
+@pytest.mark.parametrize("use_aot_compile", ["0", "1"])
 @pytest.mark.parametrize("use_bytecode_hook", [True, False])
 @pytest.mark.parametrize("evaluate_guards", [False, True])
 @pytest.mark.skipif(
@@ -54,6 +54,12 @@ def test_dynamic_shapes_compilation(
    if use_bytecode_hook and shapes_type == DynamicShapesType.UNBACKED:
        pytest.skip("UNBACKED dynamic shapes require VLLM_USE_BYTECODE_HOOK=0")
+    if evaluate_guards and shapes_type == DynamicShapesType.UNBACKED:
+        pytest.skip("unbacked dynamic shapes do not add guards")
+    # use_aot_compile is parametrized as the string "0" or "1"; both strings
+    # are truthy, so compare against "1" instead of relying on truthiness.
+    if evaluate_guards and use_aot_compile == "1":
+        pytest.skip("evaluate_guards requires use_aot_compile=0")
    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", use_aot_compile)
    monkeypatch.setenv("VLLM_USE_BYTECODE_HOOK", "1" if use_bytecode_hook else "0")
@@ -120,7 +126,7 @@ def test_model_specialization_with_evaluate_guards(
        and dynamic_shapes_type == DynamicShapesType.BACKED
        and evaluate_guards
    ):
-        pytest.skip("evaluate_guards for backed does not work with aot_compile =1")
+        pytest.skip("evaluate_guards for backed does not work with aot_compile=1")
    @support_torch_compile
    class ModelWithSizeCheck(torch.nn.Module):

--- a/tests/compile/test_functionalization.py
+++ b/tests/compile/test_functionalization.py
@@ -128,14 +128,12 @@ class TestFusedAddRMSNorm(torch.nn.Module):
 class TestRotaryEmbedding(torch.nn.Module):
-    def __init__(self, head_dim=64, rotary_dim=None, max_position=2048, base=10000):
+    def __init__(self, head_dim=64, max_position=2048, base=10000):
        super().__init__()
        self.head_dim = head_dim
-        self.rotary_dim = rotary_dim or head_dim
        self.rotary_emb = get_rope(
            self.head_dim,
-            rotary_dim=self.rotary_dim,
            max_position=max_position,
            rope_parameters={"rope_type": "default", "rope_theta": base},
        )
@@ -170,7 +168,6 @@ class TestRotaryEmbeddingSliceScatter(torch.nn.Module):
        self.rotary_emb = get_rope(
            self.head_dim,
-            rotary_dim=self.head_dim,
            max_position=max_position,
            rope_parameters={"rope_type": "default", "rope_theta": base},
        )

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -202,6 +202,27 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
        cleanup_dist_env_and_memory()
+@pytest.fixture
+def workspace_init():
+    """Set up the workspace manager around a test and reset it afterwards.
+
+    When CUDA is available the manager is initialized on ``cuda:0`` before
+    the test body runs; ``reset_workspace_manager`` is called on teardown
+    either way. Tests that build a full vLLM engine should NOT request this
+    fixture, because the engine initializes the workspace manager itself.
+    """
+    from vllm.v1.worker.workspace import (
+        init_workspace_manager,
+        reset_workspace_manager,
+    )
+
+    cuda_present = torch.cuda.is_available()
+    if cuda_present:
+        init_workspace_manager(torch.device("cuda:0"))
+    yield
+    reset_workspace_manager()
 @pytest.fixture(autouse=True)
 def dynamo_reset():
    yield
@@ -681,10 +702,16 @@ class HfRunner:
                **kwargs,
            )
+            # Encoder-decoder models return decoder_hidden_states instead of
+            # hidden_states
+            hidden_states = (
+                getattr(output, "hidden_states", None) or output.decoder_hidden_states
+            )
            (
                seq_logprobs_lst,
                output_len,
-            ) = self._hidden_states_to_logprobs(output.hidden_states, num_logprobs)
+            ) = self._hidden_states_to_logprobs(hidden_states, num_logprobs)
            all_logprobs.append(seq_logprobs_lst)
            seq_ids = output.sequences[0]
@@ -741,7 +768,7 @@ class VllmRunner:
        tokenizer_name: str | None = None,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = True,
-        seed: int | None = 0,
+        seed: int = 0,
        max_model_len: int | None = 1024,
        dtype: str = "auto",
        disable_log_stats: bool = True,