Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/tests/v1/e2e/test_async_scheduling.py
+++ b/tests/v1/e2e/test_async_scheduling.py
@@ -30,8 +30,9 @@ example_prompts = [first_prompt, "In one word, the capital of France is "] + [

 default_params = dict(
    temperature=0.0,  # greedy
-    max_tokens=23,
-    min_tokens=18,
+    max_tokens=30,
+    # spec decoding currently doesn't support min_tokens
+    # min_tokens=28,
 )


@@ -50,6 +51,14 @@ def test_without_spec_decoding(
        dict(logprobs=2),
        dict(logprobs=2, presence_penalty=-1.0),
        dict(structured_outputs=struct_outputs),
+        dict(
+            structured_outputs=struct_outputs,
+            logprobs=2,
+        ),
+        dict(
+            structured_outputs=struct_outputs,
+            presence_penalty=-1.0,
+        ),
        dict(
            structured_outputs=struct_outputs,
            logprobs=2,
@@ -86,7 +95,7 @@ def test_without_spec_decoding(
    run_tests(monkeypatch, MODEL, test_configs, test_sampling_params)


-def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
+def test_with_spec_decoding(sample_json_schema, monkeypatch: pytest.MonkeyPatch):
    """Test consistency and acceptance rates with some different combos of
    preemption, executor, async scheduling, prefill chunking,
    spec decoding model length.
@@ -100,9 +109,20 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
    # Set small draft model len to force doesn't-fit-in-drafter case.
    spec_config_short = spec_config | {"max_model_len": 50}

+    struct_outputs = StructuredOutputsParams(json=sample_json_schema)
+
    test_sampling_params = [
        dict(),
+        dict(presence_penalty=-1.0),
+        dict(bad_words=["the", " the"]),
        dict(logprobs=2),
+        dict(logprobs=2, presence_penalty=-1.0),
+        dict(structured_outputs=struct_outputs),
+        dict(
+            structured_outputs=struct_outputs,
+            logprobs=2,
+            presence_penalty=-1.0,
+        ),
    ]

    # test_preemption, executor, async_scheduling,
@@ -142,18 +162,12 @@ def run_tests(
    """Test consistency of combos of async scheduling, preemption,
    uni/multiproc executor with spec decoding."""

+    # Determine attention config based on platform
+    attention_config = {"backend": "FLEX_ATTENTION"}
+
    with monkeypatch.context() as m:
-        # avoid precision errors
-        if current_platform.is_rocm():
-            if is_testing_with_spec_decoding:
-                # Use TRITON_ATTN for spec decoding test for consistency
-                m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
-            else:
-                m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA")
-        else:
-            m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
        # lock matmul precision to full FP32 (IEEE)
-        m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee")
+        m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest")
        # m.setenv("VLLM_BATCH_INVARIANT", "1")
        outputs: list[tuple[str, list, list]] = []
        for n, (
@@ -174,6 +188,7 @@ def run_tests(
                spec_config,
                test_prefill_chunking=test_prefill_chunking,
                is_testing_with_spec_decoding=is_testing_with_spec_decoding,
+                attention_config=attention_config,
            )
            outputs.append(test_results)

@@ -204,15 +219,7 @@ def run_tests(
                    name_1=f"config=[{test_config}], params={params}",
                )

-                # On ROCm with TRITON_ATTN (spec decoding test), skip strict
-                # logprobs comparison when logprobs are requested
-                skip_logprobs_check = (
-                    current_platform.is_rocm()
-                    and params.get("logprobs")
-                    and is_testing_with_spec_decoding
-                )
-                if not skip_logprobs_check:
-                    assert _all_logprobs_match(base_logprobs, test_logprobs)
+                assert _all_logprobs_match(base_logprobs, test_logprobs)

                if (
                    base_acceptance_rate is not None
@@ -262,6 +269,7 @@ def run_test(
    spec_config: dict[str, Any] | None,
    test_prefill_chunking: bool,
    is_testing_with_spec_decoding: bool = False,
+    attention_config: dict[str, Any] | None = None,
 ):
    spec_decoding = spec_config is not None
    cache_arg: dict[str, Any] = (
@@ -281,14 +289,6 @@ def run_test(
    print(f"---- TESTING {test_str}: {test_config}")
    print("-" * 80)

-    # On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for
-    # spec decoding test (TRITON_ATTN) for better precision.
-    # On others: always use float32.
-    if current_platform.is_rocm() and not is_testing_with_spec_decoding:
-        dtype = "float16"
-    else:
-        dtype = "float32"
-
    with VllmRunner(
        model,
        max_model_len=512,
@@ -298,9 +298,10 @@ def run_test(
        # enforce_eager=True,
        async_scheduling=async_scheduling,
        distributed_executor_backend=executor,
-        dtype=dtype,
+        dtype="float32",
        speculative_config=spec_config,
        disable_log_stats=False,
+        attention_config=attention_config,
        **cache_arg,
    ) as vllm_model:
        results = []
@@ -358,12 +359,7 @@ def _all_logprobs_match(req_a, req_b) -> bool:


 def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool:
-    if current_platform.is_rocm():
-        # ROCm has higher numerical variance
-        # due to use of float16.
-        rel_tol, abs_tol = 5e-2, 1e-5
-    else:
-        rel_tol, abs_tol = 1e-3, 1e-6
+    rel_tol, abs_tol = 1e-3, 1e-6
    return (
        len(lps_a) == len(lps_b)
        and lps_a.keys() == lps_b.keys()

--- a/tests/v1/e2e/test_async_spec_decode.py
+++ b/tests/v1/e2e/test_async_spec_decode.py
@@ -19,7 +19,7 @@ def sync_tracker():
    Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
    lazy init syncs. Prints stack traces immediately when syncs occur.
    """
-    from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+    from vllm.v1.attention.backend import CommonAttentionMetadata

    # Shared counter for cross-process communication (inherited by fork)
    sync_count = multiprocessing.Value("i", 0)

--- a/tests/v1/e2e/test_spec_decode.py
+++ b/tests/v1/e2e/test_spec_decode.py
@@ -445,25 +445,26 @@ def test_eagle_correctness(
    should be the same when using eagle speculative decoding.
    model_setup: (method, model_name, eagle_model_name, tp_size)
    """
-    with monkeypatch.context() as m:
-        if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
-            # Scout requires default backend selection
-            # because vision encoder has head_dim 88 being incompatible
-            #  with FLASH_ATTN and needs to fall back to Flex Attn
-
-            # pass if not ROCm
-            if current_platform.is_rocm():
-                # TODO: Enable Flex Attn for spec_decode on ROCm
-                pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
-        else:
-            m.setenv("VLLM_MLA_DISABLE", "1")
-            m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
+    # Determine attention config
+    # Scout requires default backend selection because vision encoder has
+    # head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
+    # to Flex Attn
+    if "Llama-4-Scout" in model_setup[1] and attn_backend == "FLASH_ATTN":
+        if current_platform.is_rocm():
+            # TODO: Enable Flex Attn for spec_decode on ROCm
+            pytest.skip("Flex Attn for spec_decode not supported on ROCm currently")
+        attention_config = None  # Let it fall back to default
+    else:
+        attention_config = {"backend": attn_backend}
+
+    if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
+        pytest.skip(
+            "TRITON_ATTN does not support "
+            "multi-token eagle spec decode on current platform"
+        )

-        if attn_backend == "TRITON_ATTN" and not current_platform.is_rocm():
-            pytest.skip(
-                "TRITON_ATTN does not support "
-                "multi-token eagle spec decode on current platform"
-            )
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_MLA_DISABLE", "1")

        if attn_backend == "ROCM_AITER_FA" and current_platform.is_rocm():
            if "deepseek" in model_setup[1].lower():
@@ -478,7 +479,10 @@ def test_eagle_correctness(
        max_num_batched_tokens = 128 if enable_chunked_prefill else max_model_len

        ref_llm = LLM(
-            model=model_name, max_model_len=max_model_len, tensor_parallel_size=tp_size
+            model=model_name,
+            max_model_len=max_model_len,
+            tensor_parallel_size=tp_size,
+            attention_config=attention_config,
        )
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        del ref_llm
@@ -499,6 +503,7 @@ def test_eagle_correctness(
            max_num_batched_tokens=max_num_batched_tokens,
            enable_chunked_prefill=enable_chunked_prefill,
            model_impl=model_impl,
+            attention_config=attention_config,
        )
        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
        matches = 0

--- a/tests/v1/e2e/untest_cascade_attention.py
+++ b/tests/v1/e2e/untest_cascade_attention.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+import os
 import pytest

 from vllm import LLM, SamplingParams

 from ...utils import create_new_process_for_each_test
+from ...utils import models_path_prefix


 @create_new_process_for_each_test()
 @pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
-def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
+def test_cascade_attention(example_system_message, attn_backend):
    prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"

    if attn_backend == "FLASHINFER":
@@ -19,19 +21,18 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
            "needs investigation. See issue #25679."
        )

-    with monkeypatch.context() as m:
-        m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
-
-        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
-        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
-
-        # No cascade attention.
-        single_prompt = [example_system_message + prompt]
-        responses = llm.generate(single_prompt, sampling_params)
-        ref_output = responses[0].outputs[0].text
-
-        # (Probably) Use cascade attention.
-        prompts = [example_system_message + prompt] * 64
-        responses = llm.generate(prompts, sampling_params)
-        for response in responses:
-            assert response.outputs[0].text == ref_output
\ No newline at end of file
+    llm = LLM(
+        model=os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"), attention_config={"backend": attn_backend}
+    )
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+
+    # No cascade attention.
+    single_prompt = [example_system_message + prompt]
+    responses = llm.generate(single_prompt, sampling_params)
+    ref_output = responses[0].outputs[0].text
+
+    # (Probably) Use cascade attention.
+    prompts = [example_system_message + prompt] * 64
+    responses = llm.generate(prompts, sampling_params)
+    for response in responses:
+        assert response.outputs[0].text == ref_output
--- a/tests/v1/ec_connector/integration/test_epd_correctness.py
+++ b/tests/v1/ec_connector/integration/test_epd_correctness.py
@@ -31,7 +31,7 @@ import openai
 import requests

 from vllm.assets.image import ImageAsset
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url

 MAX_OUTPUT_LEN = 256

@@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
                "content": [
                    {
                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image;base64,{encode_image_base64(image_1)}"
-                        },
+                        "image_url": {"url": encode_image_url(image_1)},
                    },
                    {"type": "text", "text": "What's in this image?"},
                ],
@@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
                "content": [
                    {
                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image;base64,{encode_image_base64(image_2)}"
-                        },
+                        "image_url": {"url": encode_image_url(image_2)},
                    },
                    {
                        "type": "image_url",

--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
@@ -12,6 +12,13 @@ from vllm import SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.config import VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionRequest,
+    ChatCompletionResponse,
+    ErrorResponse,
+)
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.inputs import PromptType
 from vllm.outputs import RequestOutput
 from vllm.platforms import current_platform
@@ -255,7 +262,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):

        # Use multi-abort to abort multiple requests at once
        abort_request_ids = [request_ids[i] for i in REQUEST_IDS_TO_ABORT]
-        await engine.abort(abort_request_ids)
+        await engine.abort(abort_request_ids, internal=False)

        # Wait for all tasks to complete
        results = await asyncio.gather(*tasks, return_exceptions=True)
@@ -486,6 +493,60 @@ async def test_dp_rank_argument():
                pass


+@pytest.mark.asyncio(scope="module")
+async def test_header_dp_rank_argument():
+    with ExitStack() as after:
+        with set_default_torch_num_threads(1):
+            engine = AsyncLLM.from_engine_args(TEXT_ENGINE_ARGS)
+        after.callback(engine.shutdown)
+
+        MODEL_NAME = "test-model"
+        BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
+
+        # Create models first
+        models = OpenAIServingModels(
+            engine_client=engine,
+            base_model_paths=BASE_MODEL_PATHS,
+        )
+
+        # Create serving chat instance
+        serving_chat = OpenAIServingChat(
+            engine_client=engine,
+            models=models,
+            response_role="assistant",
+            chat_template=None,
+            chat_template_content_format="auto",
+            request_logger=None,
+        )
+        # Create a chat completion request
+        req = ChatCompletionRequest(
+            model=MODEL_NAME,
+            messages=[{"role": "user", "content": TEXT_PROMPT}],
+            max_tokens=100,
+            temperature=1.0,
+            seed=33,
+        )
+        # Test 1: Valid DP rank (0)
+        mock_raw_request = MagicMock()
+        mock_raw_request.headers = {"X-data-parallel-rank": "0"}
+        mock_raw_request.state = MagicMock()
+
+        # Should succeed with valid rank
+        response = await serving_chat.create_chat_completion(req, mock_raw_request)
+        assert isinstance(response, ChatCompletionResponse), (
+            "Expected a ChatCompletionResponse for valid DP rank"
+        )
+
+        # Test 2: Out-of-range DP rank (1)
+        mock_raw_request.headers = {"X-data-parallel-rank": "1"}
+
+        # should return ErrorResponse for out-of-range rank
+        response2 = await serving_chat.create_chat_completion(req, mock_raw_request)
+        assert isinstance(response2, ErrorResponse), (
+            "Expected an ErrorResponse for out-of-range DP rank"
+        )
+
+
 @pytest.mark.asyncio
 async def test_check_health():
    """Test that check_health returns normally for healthy engine
@@ -550,7 +611,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
        await asyncio.sleep(0.5)

        # Abort the request
-        await engine.abort(request_id)
+        await engine.abort(request_id, internal=False)

        # Wait for generation to complete and return final output
        final_output = await generated

--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -42,10 +42,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
 PROMPT = "I am Gyoubu Masataka Oniwa"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids

+_REQUEST_COUNTER = 0
+

 def make_request() -> EngineCoreRequest:
+    global _REQUEST_COUNTER
+    _REQUEST_COUNTER += 1
+    request_id = f"request-{_REQUEST_COUNTER}"
    return EngineCoreRequest(
-        request_id=str(uuid.uuid4()),
+        request_id=request_id,
+        external_req_id=f"{request_id}-{uuid.uuid4()}",
        prompt_token_ids=PROMPT_TOKENS,
        mm_features=None,
        sampling_params=SamplingParams(),

--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -2,12 +2,14 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import asyncio
+import importlib
 import os
 import signal
 import time
 import uuid
 from dataclasses import dataclass
 from threading import Thread
+from types import SimpleNamespace
 from typing import Any
 from unittest.mock import MagicMock

@@ -25,7 +27,11 @@ from vllm.usage.usage_lib import UsageContext
 from vllm.utils.torch_utils import set_default_torch_num_threads
 from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.core import EngineCore
-from vllm.v1.engine.core_client import AsyncMPClient, EngineCoreClient, SyncMPClient
+from vllm.v1.engine.core_client import (
+    AsyncMPClient,
+    EngineCoreClient,
+    SyncMPClient,
+)
 from vllm.v1.engine.utils import CoreEngineProcManager
 from vllm.v1.executor.abstract import Executor

@@ -33,14 +39,19 @@ from ...distributed.conftest import MockSubscriber
 from ...utils import create_new_process_for_each_test
 from ...utils import models_path_prefix

-if not current_platform.is_cuda():
-    pytest.skip(reason="V1 currently only supported on CUDA.", allow_module_level=True)
+if not current_platform.is_cuda_alike():
+    pytest.skip(
+        reason="V1 currently only supported on CUDA-alike platforms.",
+        allow_module_level=True,
+    )

 MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
 TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
 PROMPT = "Hello my name is Robert and I love quantization kernels"
 PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids

+_REQUEST_COUNTER = 0
+

 def make_request(
    params: SamplingParams, prompt_tokens_ids: list[int] | None = None
@@ -48,8 +59,12 @@ def make_request(
    if not prompt_tokens_ids:
        prompt_tokens_ids = PROMPT_TOKENS

+    global _REQUEST_COUNTER
+    _REQUEST_COUNTER += 1
+    request_id = f"request-{_REQUEST_COUNTER}"
    return EngineCoreRequest(
-        request_id=str(uuid.uuid4()),
+        request_id=request_id,
+        external_req_id=f"{request_id}-{uuid.uuid4()}",
        prompt_token_ids=prompt_tokens_ids,
        mm_features=None,
        sampling_params=params,
@@ -62,6 +77,92 @@ def make_request(
    )


+def _reload_envs_module():
+    import vllm.envs as envs_mod
+
+    cache_clear = getattr(getattr(envs_mod, "__getattr__", None), "cache_clear", None)
+    if cache_clear is not None:
+        cache_clear()
+    return importlib.reload(envs_mod)
+
+
+def _reload_core_client_module():
+    module = importlib.import_module("vllm.v1.engine.core_client")
+    return importlib.reload(module)
+
+
+def test_mp_client_uses_env_timeout(monkeypatch: pytest.MonkeyPatch):
+    timeout_value = 654
+    monkeypatch.setenv("VLLM_ENGINE_READY_TIMEOUT_S", str(timeout_value))
+
+    # Ensure that the environment variable is loaded if caching is enabled
+    _reload_envs_module()
+    core_client_mod = _reload_core_client_module()
+
+    poll_timeouts: list[int] = []
+
+    class ShadowSocket:
+        def poll(self, timeout: int) -> int:
+            # Capture the timeout value for each poll call
+            poll_timeouts.append(timeout)
+            return 1
+
+        def recv_multipart(self):
+            return (b"\x00\x00", b"ready")
+
+    class DummySocket:
+        def send_multipart(self, _msg, *, copy: bool = False, track: bool = False):
+            if track:
+                return SimpleNamespace(done=True)
+
+        def recv_multipart(self, *, copy: bool = False):
+            return (b"", b"")
+
+        def close(self, *, linger: int = 0):
+            pass
+
+        def bind(self, _address):
+            pass
+
+        def connect(self, _address):
+            pass
+
+        def setsockopt(self, *_args, **_kwargs):
+            pass
+
+    monkeypatch.setattr(core_client_mod.zmq.Socket, "shadow", lambda *_: ShadowSocket())
+    monkeypatch.setattr(
+        core_client_mod, "make_zmq_socket", lambda *_, **__: DummySocket()
+    )
+
+    parallel_config = SimpleNamespace(
+        data_parallel_size=1,
+        data_parallel_rank=0,
+        data_parallel_index=0,
+        data_parallel_size_local=1,
+        data_parallel_rank_local=None,
+        data_parallel_hybrid_lb=False,
+        data_parallel_external_lb=False,
+    )
+    vllm_config = SimpleNamespace(parallel_config=parallel_config)
+
+    client = core_client_mod.MPClient(
+        asyncio_mode=False,
+        vllm_config=vllm_config,
+        executor_class=object,
+        log_stats=False,
+        client_addresses={
+            "input_address": "inproc://input",
+            "output_address": "inproc://output",
+        },
+    )
+    try:
+        # timeout_value is in seconds, but poll receives milliseconds
+        assert poll_timeouts == [timeout_value * 1000]
+    finally:
+        client.shutdown()
+
+
 def loop_until_done(client: EngineCoreClient, outputs: dict):
    while True:
        engine_core_outputs = client.get_output().outputs
@@ -638,6 +739,7 @@ def test_kv_cache_events(
        )
        assert event.parent_block_hash is None, "Parent block hash should be None"
        assert event.lora_id is None, "Lora id should be None"
+        assert event.lora_name is None, "Lora name should be None"
        assert len(event.token_ids) == num_blocks * block_size, (
            "Token ids should be the same as the custom tokens"
        )

--- a/tests/v1/engine/test_fast_incdec_prefix_err.py
+++ b/tests/v1/engine/test_fast_incdec_prefix_err.py
@@ -29,6 +29,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
    params = SamplingParams(skip_special_tokens=True)
    request = EngineCoreRequest(
        request_id="test",
+        external_req_id="test-ext",
        prompt_token_ids=prompt_token_ids,
        mm_features=None,
        sampling_params=params,

--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
@@ -59,12 +59,12 @@ def test_incremental_detokenization(
    output_processor = OutputProcessor(
        dummy_test_vectors.tokenizer, log_stats=False, stream_interval=stream_interval
    )
-    engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)

    # Make N requests.
    requests = [
        EngineCoreRequest(
-            request_id=f"request-{idx}",
+            request_id=f"request-{idx}-int",
+            external_req_id=f"request-{idx}",
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@@ -84,6 +84,11 @@ def test_incremental_detokenization(
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]

+    engine_core = MockEngineCore(
+        tokens_list=dummy_test_vectors.generation_tokens,
+        request_ids=[req.request_id for req in requests],
+    )
+
    # Add requests to the detokenizer.
    for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
        output_processor.add_request(request, prompt)
@@ -270,12 +275,28 @@ def _validate_logprobs(
                    # the logprob token id at this sequence position
                    decoded_token = pos_logprob_dict[lp_tok].decoded_token
                    ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, lp_tok)
-                    assert decoded_token == ref_decoded_token, (
-                        f"Sampled logprob token id {lp_tok} decodes to"
-                        f" {ref_decoded_token} but Logprob decoded"
-                        f" token is {decoded_token} instead"
-                        f" (at position {idx})"
-                    )
+
+                    # With UTF-8 correction logic, tokens ending with "�"
+                    # (incomplete byte sequences) are corrected to either
+                    # empty string or proper UTF-8 characters
+                    if ref_decoded_token.endswith("�"):
+                        # Token needs UTF-8 correction
+                        assert not decoded_token.endswith("�"), (
+                            f"Sampled logprob token id {lp_tok} decodes to"
+                            f" '{ref_decoded_token}' (ends with replacement char)"
+                            f" but corrected decoded token '{decoded_token}'"
+                            f" still ends with replacement char"
+                            f" (at position {idx}). UTF-8 correction should"
+                            f" have removed it."
+                        )
+                    else:
+                        # No correction needed, should match exactly
+                        assert decoded_token == ref_decoded_token, (
+                            f"Sampled logprob token id {lp_tok} decodes to"
+                            f" {ref_decoded_token} but Logprob decoded"
+                            f" token is {decoded_token} instead"
+                            f" (at position {idx})"
+                        )

                ref_cumulative_logprob += pos_logprob_dict[sampled_token].logprob
            # Assert that cumulative logprobs are correct
@@ -416,12 +437,28 @@ def _validate_logprobs(
                    # the logprob token id at this sequence position
                    decoded_token = pos_logprob_dict[plp_tok].decoded_token
                    ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, plp_tok)
-                    assert decoded_token == ref_decoded_token, (
-                        f"Prompt logprob token id {plp_tok} decodes to"
-                        f" {ref_decoded_token} but Logprob decoded"
-                        f" token is {decoded_token} instead"
-                        f" (at position {idx})"
-                    )
+
+                    # With UTF-8 correction logic, tokens ending with "�"
+                    # (incomplete byte sequences) are corrected to either
+                    # empty string or proper UTF-8 characters
+                    if ref_decoded_token.endswith("�"):
+                        # Token needs UTF-8 correction
+                        assert not decoded_token.endswith("�"), (
+                            f"Prompt logprob token id {plp_tok} decodes to"
+                            f" '{ref_decoded_token}' (ends with replacement char)"
+                            f" but corrected decoded token '{decoded_token}'"
+                            f" still ends with replacement char"
+                            f" (at position {idx}). UTF-8 correction should"
+                            f" have removed it."
+                        )
+                    else:
+                        # No correction needed, should match exactly
+                        assert decoded_token == ref_decoded_token, (
+                            f"Prompt logprob token id {plp_tok} decodes to"
+                            f" {ref_decoded_token} but Logprob decoded"
+                            f" token is {decoded_token} instead"
+                            f" (at position {idx})"
+                        )
        else:
            # Prompt logprobs disabled for this request
            assert prompt_logprobs is None
@@ -439,15 +476,6 @@ def test_logprobs_processor(
    dummy_test_vectors,
 ):
    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
-    engine_core = MockEngineCore(
-        tokens_list=dummy_test_vectors.generation_tokens,
-        generated_logprobs_raw=None
-        if num_sample_logprobs is None
-        else dummy_test_vectors.generation_logprobs,
-        prompt_logprobs_raw=None
-        if num_prompt_logprobs is None
-        else dummy_test_vectors.prompt_logprobs,
-    )

    # Make N requests.
    request_id_list = [
@@ -455,7 +483,8 @@ def test_logprobs_processor(
    ]
    requests = [
        EngineCoreRequest(
-            request_id=request_id_list[idx],
+            request_id=request_id_list[idx] + "-int",
+            external_req_id=request_id_list[idx],
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@@ -477,6 +506,17 @@ def test_logprobs_processor(
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]

+    engine_core = MockEngineCore(
+        tokens_list=dummy_test_vectors.generation_tokens,
+        generated_logprobs_raw=None
+        if num_sample_logprobs is None
+        else dummy_test_vectors.generation_logprobs,
+        prompt_logprobs_raw=None
+        if num_prompt_logprobs is None
+        else dummy_test_vectors.prompt_logprobs,
+        request_ids=[req.request_id for req in requests],
+    )
+
    # Add requests to the detokenizer.
    for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
        output_processor.add_request(request, prompt)
@@ -622,19 +662,12 @@ def test_stop_token(
        ]
    prompt_string = dummy_test_vectors.prompt_strings[0]
    prompt_tokens = dummy_test_vectors.prompt_tokens[0]
-    engine_core = MockEngineCore(
-        tokens_list=[generation_tokens],
-        generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
-        prompt_logprobs_raw=None,
-        eos_token_id=eos_token_id,
-        stop_token_ids=stop_token_ids,
-        ignore_eos=ignore_eos,
-    )

    # Make request.
    request_id = "request-0"
    request = EngineCoreRequest(
        request_id=request_id,
+        external_req_id=request_id + "-ext",
        prompt_token_ids=prompt_tokens,
        mm_features=None,
        eos_token_id=eos_token_id,
@@ -656,6 +689,16 @@ def test_stop_token(
        pooling_params=None,
    )

+    engine_core = MockEngineCore(
+        tokens_list=[generation_tokens],
+        generated_logprobs_raw=[generation_logprobs] if do_logprobs else None,
+        prompt_logprobs_raw=None,
+        eos_token_id=eos_token_id,
+        stop_token_ids=stop_token_ids,
+        ignore_eos=ignore_eos,
+        request_ids=[request.request_id],
+    )
+
    # Add request to the detokenizer.
    output_processor.add_request(request, prompt_string)

@@ -721,13 +764,6 @@ def test_stop_string(
    dummy_test_vectors,
 ):
    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
-    engine_core = MockEngineCore(
-        tokens_list=dummy_test_vectors.generation_tokens,
-        generated_logprobs_raw=dummy_test_vectors.generation_logprobs
-        if num_sample_logprobs
-        else None,
-        prompt_logprobs_raw=None,
-    )

    # Make N requests.
    request_id_list = [
@@ -735,7 +771,8 @@ def test_stop_string(
    ]
    requests = [
        EngineCoreRequest(
-            request_id=request_id_list[idx],
+            request_id=request_id_list[idx] + "-int",
+            external_req_id=request_id_list[idx],
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@@ -757,6 +794,15 @@ def test_stop_string(
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]

+    engine_core = MockEngineCore(
+        tokens_list=dummy_test_vectors.generation_tokens,
+        generated_logprobs_raw=dummy_test_vectors.generation_logprobs
+        if num_sample_logprobs
+        else None,
+        prompt_logprobs_raw=None,
+        request_ids=[req.request_id for req in requests],
+    )
+
    # Add requests to the detokenizer.
    for request, prompt in zip(requests, dummy_test_vectors.prompt_strings):
        output_processor.add_request(request, prompt)
@@ -814,9 +860,12 @@ def test_stop_string(
    for idx, (ref_gen_str, stop_str) in enumerate(
        zip(dummy_test_vectors.generation_strings, STOP_STRINGS)
    ):
-        # Request should be aborted.
+        # Request should be aborted (check internal ID in abort list).
+        internal_request_id = f"request-{idx}-int"
+        assert internal_request_id in aborted
+
+        # Use external ID for collecting outputs
        request_id = f"request-{idx}"
-        assert request_id in aborted

        # Collected values that were generated.
        gen_str = gen_strings[request_id]
@@ -849,13 +898,13 @@ def test_stop_string(

 def test_iteration_stats(dummy_test_vectors):
    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
-    engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
    engine_core_timestamp = time.monotonic()

    # Make N requests.
    requests = [
        EngineCoreRequest(
            request_id=f"request-{idx}",
+            external_req_id=f"request-{idx}-ext",
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@@ -869,6 +918,11 @@ def test_iteration_stats(dummy_test_vectors):
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]

+    engine_core = MockEngineCore(
+        dummy_test_vectors.generation_tokens,
+        request_ids=[req.request_id for req in requests],
+    )
+
    # Add all requests except one to the OutputProcessor.
    num_active = len(dummy_test_vectors.generation_tokens) - 1
    for request in requests[:num_active]:
@@ -923,7 +977,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    output_processor = OutputProcessor(
        dummy_test_vectors.tokenizer, log_stats=log_stats
    )
-    engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
    engine_core_timestamp = time.monotonic()

    # Create LoRA requests
@@ -937,7 +990,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    lora_assignments = [lora1, lora2, None]
    requests = [
        EngineCoreRequest(
-            request_id=f"request-{idx}",
+            request_id=f"request-{idx}-int",
+            external_req_id=f"request-{idx}",
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@@ -951,6 +1005,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
        for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens)
    ]

+    engine_core = MockEngineCore(
+        dummy_test_vectors.generation_tokens,
+        request_ids=[req.request_id for req in requests],
+    )
+
    # Add all requests to the OutputProcessor
    for request in requests:
        output_processor.add_request(request, None)
@@ -1016,9 +1075,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    outputs = EngineCoreOutputs(
        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
    )
-    # Find and mark request-0 as finished (it uses lora-1)
+    # Find and mark request-0-int as finished (it uses lora-1)
    for output in outputs.outputs:
-        if output.request_id == "request-0":
+        if output.request_id == "request-0-int":
            output.finish_reason = FinishReason.LENGTH
            break

@@ -1041,9 +1100,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    outputs = EngineCoreOutputs(
        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
    )
-    # Find and mark request-1 as finished (it uses lora-2)
+    # Find and mark request-1-int as finished (it uses lora-2)
    for output in outputs.outputs:
-        if output.request_id == "request-1":
+        if output.request_id == "request-1-int":
            output.finish_reason = FinishReason.LENGTH
            break

@@ -1065,9 +1124,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
    outputs = EngineCoreOutputs(
        outputs=engine_core.get_outputs(), scheduler_stats=SchedulerStats()
    )
-    # Find and mark request-2 as finished (it has no LoRA)
+    # Find and mark request-2-int as finished (it has no LoRA)
    for output in outputs.outputs:
-        if output.request_id == "request-2":
+        if output.request_id == "request-2-int":
            output.finish_reason = FinishReason.LENGTH
            break

@@ -1108,7 +1167,9 @@ async def test_request_output_collector():
            for idx in range(NUM_REQS)
        ]

-    collector = RequestOutputCollector(RequestOutputKind.DELTA)
+    collector = RequestOutputCollector(
+        RequestOutputKind.DELTA, request_id="my-request-id-int"
+    )

    # CASE 1: Put then get.
    outputs = make_outputs()
@@ -1164,7 +1225,9 @@ async def test_request_output_collector():
 @pytest.mark.asyncio
 async def test_cumulative_output_collector_n():
    """Test collector correctly handles multiple outputs by index."""
-    collector = RequestOutputCollector(RequestOutputKind.CUMULATIVE)
+    collector = RequestOutputCollector(
+        RequestOutputKind.CUMULATIVE, request_id="my-request-id-int"
+    )
    outputs = [
        RequestOutput(
            request_id="my-request-id",
@@ -1243,11 +1306,13 @@ async def test_cumulative_output_collector_n():


 @pytest.mark.parametrize("runner", ["generate", "pooling"])
-def test_abort_requests(runner: str, dummy_test_vectors):
+@pytest.mark.parametrize("abort_by", ["internal", "external"])
+def test_abort_requests(runner: str, abort_by: str, dummy_test_vectors):
    output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
    requests = [
        EngineCoreRequest(
            request_id=f"request-{idx}",
+            external_req_id=f"external-{idx}",
            prompt_token_ids=prompt_tokens,
            mm_features=None,
            eos_token_id=None,
@@ -1266,8 +1331,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
            output_kind = request.sampling_params.output_kind
        else:
            output_kind = request.pooling_params.output_kind
-        queue = RequestOutputCollector(output_kind=output_kind)
+        queue = RequestOutputCollector(
+            output_kind=output_kind, request_id=request.request_id
+        )
        output_processor.add_request(request, None, queue=queue)

    for request in requests:
-        output_processor.abort_requests([request.request_id])
+        if abort_by == "internal":
+            output_processor.abort_requests([request.request_id], internal=True)
+        else:
+            output_processor.abort_requests([request.external_req_id], internal=False)
--- a/tests/v1/engine/test_parallel_sampling.py
+++ b/tests/v1/engine/test_parallel_sampling.py
@@ -4,11 +4,12 @@
 from vllm import SamplingParams
 from vllm.outputs import CompletionOutput
 from vllm.sampling_params import RequestOutputKind
+from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.parallel_sampling import ParentRequest


 def test_parent_request_to_output_stream() -> None:
-    parent_request = ParentRequest("parent_id", SamplingParams(n=2))
+    parent_request = ParentRequest(make_request(SamplingParams(n=2)))
    parent_request.child_requests = {"child_id_0", "child_id_1"}
    output_0 = CompletionOutput(
        index=0, text="child 0", token_ids=[], cumulative_logprob=None, logprobs=None
@@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
        index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
    )
    # Request not finished
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
-        "child_id_1", output_1
-    )
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
-        "child_id_1", output_1
-    )
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
+    assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
+    assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)

    # output_1 finished
    output_1.finish_reason = "ended"
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert ("parent_id", [output_1], False) == parent_request.get_outputs(
-        "child_id_1", output_1
-    )
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
+    assert ([output_1], False) == parent_request.get_outputs("child_id_1", output_1)
    # Finished output_1 had already returned, DO NOT returned again
-    assert ("parent_id", [output_0], False) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert parent_request.get_outputs("child_id_1", output_1) == (
-        "parent_id",
-        [],
-        False,
-    )
+    assert ([output_0], False) == parent_request.get_outputs("child_id_0", output_0)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)

    # output_0 finished
    output_0.finish_reason = "ended"
-    assert ("parent_id", [output_0], True) == parent_request.get_outputs(
-        "child_id_0", output_0
-    )
-    assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
+    assert ([output_0], True) == parent_request.get_outputs("child_id_0", output_0)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], True)
    # Finished output_0 had already returned, DO NOT returned again
-    assert parent_request.get_outputs("child_id_0", output_0) == ("parent_id", [], True)
-    assert parent_request.get_outputs("child_id_1", output_1) == ("parent_id", [], True)
+    assert parent_request.get_outputs("child_id_0", output_0) == ([], True)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], True)


 def test_parent_request_to_output_final_only() -> None:
    parent_request = ParentRequest(
-        "parent_id", SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY)
+        make_request(SamplingParams(n=2, output_kind=RequestOutputKind.FINAL_ONLY))
    )
    parent_request.child_requests = {"child_id_0", "child_id_1"}
    output_0 = CompletionOutput(
@@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
        index=1, text="child 1", token_ids=[], cumulative_logprob=None, logprobs=None
    )
    # Request not finished, return nothing
-    assert parent_request.get_outputs("child_id_0", output_0) == (
-        "parent_id",
-        [],
-        False,
-    )
-    assert parent_request.get_outputs("child_id_1", output_1) == (
-        "parent_id",
-        [],
-        False,
-    )
+    assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
    # output_1 finished, but outputs won't be returned until all child requests finished
    output_1.finish_reason = "ended"
-    assert parent_request.get_outputs("child_id_0", output_0) == (
-        "parent_id",
-        [],
-        False,
-    )
-    assert parent_request.get_outputs("child_id_1", output_1) == (
-        "parent_id",
-        [],
-        False,
-    )
+    assert parent_request.get_outputs("child_id_0", output_0) == ([], False)
+    assert parent_request.get_outputs("child_id_1", output_1) == ([], False)
    # output_0 finished, as all child requests finished, the output would be returned
    output_0.finish_reason = "ended"
-    assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
+    assert ([output_0, output_1], True) == parent_request.get_outputs(
        "child_id_0", output_0
    )
-    assert ("parent_id", [output_0, output_1], True) == parent_request.get_outputs(
+    assert ([output_0, output_1], True) == parent_request.get_outputs(
        "child_id_1", output_1
    )
+
+
+def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
+    return EngineCoreRequest(
+        request_id="parent_id",
+        external_req_id="ext_parent_id",
+        prompt_token_ids=None,
+        mm_features=None,
+        sampling_params=sampling_params,
+        pooling_params=None,
+        eos_token_id=None,
+        arrival_time=0.0,
+        lora_request=None,
+        cache_salt=None,
+        data_parallel_rank=None,
+    )
--- a/tests/v1/engine/test_preprocess_error_handling.py
+++ b/tests/v1/engine/test_preprocess_error_handling.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch.cuda
+
+from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
+from vllm.v1.engine import EngineCoreRequest
+from vllm.v1.engine.core import EngineCore
+
+MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
+
+
+def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch):
+    """Test that preprocessing errors are handled gracefully."""
+
+    if current_platform.is_rocm() or current_platform.is_xpu():
+        pytest.skip(
+            "Skipped on ROCm/XPU: this test only works with 'fork', "
+            "but ROCm/XPU uses 'spawn'."
+        )
+
+    assert not torch.cuda.is_initialized(), (
+        "fork needs to be used for the engine "
+        "core process and this isn't possible if cuda is already initialized"
+    )
+
+    # Store original method to call for non-failing requests
+    original_preprocess = EngineCore.preprocess_add_request
+
+    # Monkeypatch to make preprocess_add_request raise an exception
+    # only for requests with "FAIL" in the first token
+    def conditional_failing_preprocess(self, request: EngineCoreRequest):
+        # Fail if the first token id is 333
+        if request.prompt_token_ids and request.prompt_token_ids[0] == 333:
+            raise ValueError("Simulated preprocessing error!")
+        return original_preprocess(self, request)
+
+    monkeypatch.setattr(
+        EngineCore, "preprocess_add_request", conditional_failing_preprocess
+    )
+
+    llm = LLM(model=MODEL_NAME)
+
+    # Create a failing request by crafting a request with an invalid token
+    # We need to use a direct approach since LLM.generate tokenizes for us
+    from vllm.inputs import TokensPrompt
+
+    # This should raise an exception due to the preprocessing failure
+    # Special token id to trigger the failure
+    failing_prompt = TokensPrompt(prompt_token_ids=[333])
+    outputs = llm.generate(failing_prompt, SamplingParams(max_tokens=10))  # type: ignore
+    assert len(outputs) == 1
+    assert len(outputs[0].outputs[0].token_ids) == 0
+    assert outputs[0].finished
+    assert outputs[0].outputs[0].finish_reason == "error"
+
+    # Verify the engine is still functional with a normal request
+    outputs = llm.generate("Hello, my name is", SamplingParams(max_tokens=10))
+    assert len(outputs) == 1
+    assert len(outputs[0].outputs[0].token_ids) > 0
+    assert outputs[0].outputs[0].finish_reason in ("stop", "length")
--- a/tests/v1/engine/test_process_multi_modal_uuids.py
+++ b/tests/v1/engine/test_process_multi_modal_uuids.py
@@ -6,6 +6,7 @@ import pytest
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.multimodal import MultiModalUUIDDict
 from vllm.sampling_params import SamplingParams
 from vllm.v1.engine import input_processor as input_processor_mod
 from vllm.v1.engine.input_processor import InputProcessor
@@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
        monkeypatch, mm_cache_gb=0.0, enable_prefix_caching=False
    )

-    captured: dict[str, object] = {}
+    captured: dict[str, MultiModalUUIDDict] = {}

    def fake_preprocess(
        prompt, *, tokenization_kwargs=None, lora_request=None, mm_uuids=None
@@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
    )

    # Expect request-id-based overrides are passed through
-    assert captured["mm_uuids"] == {
-        "image": [f"{request_id}-image-0", f"{request_id}-image-1"],
-        "video": [f"{request_id}-video-0"],
-    }
+    mm_uuids = captured["mm_uuids"]
+    assert set(mm_uuids.keys()) == {"image", "video"}
+    assert len(mm_uuids["image"]) == 2
+    assert len(mm_uuids["video"]) == 1
+    assert mm_uuids["image"][0].startswith(f"{request_id}-image-") and mm_uuids[
+        "image"
+    ][0].endswith("-0")
+    assert mm_uuids["image"][1].startswith(f"{request_id}-image-") and mm_uuids[
+        "image"
+    ][1].endswith("-1")
+    assert mm_uuids["video"][0].startswith(f"{request_id}-video-") and mm_uuids[
+        "video"
+    ][0].endswith("-0")
--- a/tests/v1/engine/utils.py
+++ b/tests/v1/engine/utils.py
@@ -343,6 +343,7 @@ class MockEngineCore:
        eos_token_id: int | None = None,
        stop_token_ids: list[int] | None = None,
        ignore_eos: bool = False,
+        request_ids: list[str] | None = None,
    ) -> None:
        self.num_requests = len(tokens_list)
        self.tokens_list = tokens_list
@@ -355,6 +356,11 @@ class MockEngineCore:
        self.eos_token_id = eos_token_id
        self.stop_token_ids = stop_token_ids
        self.ignore_eos = ignore_eos
+        self.request_ids = (
+            request_ids
+            if request_ids is not None
+            else [f"request-{i}" for i in range(self.num_requests)]
+        )

    def get_outputs(self) -> list[EngineCoreOutput]:
        do_logprobs = self.do_logprobs
@@ -386,7 +392,7 @@ class MockEngineCore:
                    prompt_logprobs = None
                new_token_id = token_ids[token_idx]
                output = EngineCoreOutput(
-                    request_id=f"request-{req_idx}",
+                    request_id=self.request_ids[req_idx],
                    new_token_ids=[new_token_id],
                    new_logprobs=logprobs,
                    new_prompt_logprobs_tensors=prompt_logprobs,

--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -610,7 +610,7 @@ Make the response as short as possible.


 @pytest.mark.parametrize(
-    "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config",  # noqa: E501
+    "model_name, backend, tokenizer_mode, reasoning_parser, speculative_config, async_scheduling",  # noqa: E501
    [
        (
            "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
@@ -618,8 +618,10 @@ Make the response as short as possible.
            "auto",
            "deepseek_r1",
            NGRAM_SPEC_CONFIG,
+            False,
        ),
-        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None),
+        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, False),
+        ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None, True),
    ],
 )
 def test_structured_output_with_reasoning_matrices(
@@ -628,6 +630,7 @@ def test_structured_output_with_reasoning_matrices(
    reasoning_parser: str,
    model_name: str,
    speculative_config: dict[str, Any] | None,
+    async_scheduling: bool,
 ):
    if current_platform.is_tpu() and speculative_config:
        pytest.skip("TPU does not support speculative decoding")
@@ -648,6 +651,7 @@ def test_structured_output_with_reasoning_matrices(
        ),
        tokenizer_mode=tokenizer_mode,
        speculative_config=speculative_config,
+        async_scheduling=async_scheduling,
    )
    tokenizer = llm.get_tokenizer()
    reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)(

--- a/tests/v1/entrypoints/openai/serving_responses/test_image.py
+++ b/tests/v1/entrypoints/openai/serving_responses/test_image.py
@@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio

 from tests.utils import RemoteOpenAIServer
-from vllm.multimodal.utils import encode_image_base64
+from vllm.multimodal.utils import encode_image_url

 # Use a small vision model for testing
 MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
@@ -52,9 +52,9 @@ async def client(image_server):


 @pytest.fixture(scope="session")
-def base64_encoded_image(local_asset_server) -> dict[str, str]:
+def url_encoded_image(local_asset_server) -> dict[str, str]:
    return {
-        image_url: encode_image_base64(local_asset_server.get_image_asset(image_url))
+        image_url: encode_image_url(local_asset_server.get_image_asset(image_url))
        for image_url in TEST_IMAGE_ASSETS
    }

@@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    raw_image_url: str,
-    base64_encoded_image: dict[str, str],
+    url_encoded_image: dict[str, str],
 ):
    content_text = "What's in this image?"
    messages = [
@@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
            "content": [
                {
                    "type": "input_image",
-                    "image_url": f"data:image/jpeg;base64,{base64_encoded_image[raw_image_url]}",  # noqa: E501
+                    "image_url": url_encoded_image[raw_image_url],
                    "detail": "auto",
                },
                {"type": "input_text", "text": content_text},

--- a/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
@@ -5,24 +5,37 @@ set -euo pipefail
 SCRIPT="v1/kv_connector/nixl_integration/run_accuracy_test.sh"

 # Define test configurations
-configs=(
+tp_configs=(
  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2"
  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2"
+  "GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1"
  "GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA case
  "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
-  "DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) 
+  "GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
 )
+dp_ep_configs=(
+"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP1, D-DPEP=2 (TP=1) 
+"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny" # MLA+P-TP2, D-DPEP=2 (TP=1) 
+)
+
+# Select config array based on DP_EP env var
+if [[ -n "${DP_EP:-}" ]]; then
+  configs=("${dp_ep_configs[@]}")
+  echo "DP_EP is set, using dp_ep_configs"
+else
+  configs=("${tp_configs[@]}")
+fi

 run_tests() {
  local label=$1
-  local extra_env=$2
+  local extra_args=$2

  echo "=== Running tests (${label}) ==="
  for cfg in "${configs[@]}"; do
-    echo "-> Running with ${cfg} ${extra_env:+and ${extra_env}}"
+    echo "-> Running with ${cfg} ${extra_args:+and ${extra_args}}"
    # Use 'env' to safely set variables without eval
-    if ! env ${extra_env} ${cfg} bash "${SCRIPT}"; then
-      echo "❌ Test failed for config: ${cfg} ${extra_env:+(${extra_env})}"
+    if ! env ${cfg} bash "${SCRIPT}" ${extra_args}; then
+      echo "❌ Test failed for config: ${cfg} ${extra_args:+(${extra_args})}"
      exit 1
    fi
  done
@@ -34,8 +47,8 @@ run_tests "default backend" ""

 # Check if FLASHINFER is set (non-empty)
 if [[ -n "${FLASHINFER:-}" ]]; then
-  echo "FLASHINFER is set, rerunning with VLLM_ATTENTION_BACKEND=FLASHINFER"
-  run_tests "FLASHINFER backend" "VLLM_ATTENTION_BACKEND=FLASHINFER"
+  echo "FLASHINFER is set, rerunning with --attention-backend FLASHINFER"
+  run_tests "FLASHINFER backend" "--attention-backend FLASHINFER"
 else
  echo "FLASHINFER not set, skipping FLASHINFER runs."
 fi
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -3,21 +3,29 @@ set -xe

 # Parse command line arguments
 KV_BUFFER_DEVICE="cuda"  # Default to cuda
+ATTENTION_BACKEND=""  # Default to empty (use vllm default)
 while [[ $# -gt 0 ]]; do
  case $1 in
    --kv_buffer_device)
      KV_BUFFER_DEVICE="$2"
      shift 2
      ;;
+    --attention-backend)
+      ATTENTION_BACKEND="$2"
+      shift 2
+      ;;
    *)
      echo "Unknown option $1"
-      echo "Usage: $0 [--kv_buffer_device <cuda|cpu>]"
+      echo "Usage: $0 [--kv_buffer_device <cuda|cpu>] [--attention-backend <backend>]"
      exit 1
      ;;
  esac
 done

 echo "Running accuracy tests with kv_buffer_device=$KV_BUFFER_DEVICE"
+if [[ -n "$ATTENTION_BACKEND" ]]; then
+  echo "Using attention backend: $ATTENTION_BACKEND"
+fi

 DECODER_KV_LAYOUT=${DECODER_KV_LAYOUT:-"HND"} # Default to HND, optional NHD
 if [[ "$DECODER_KV_LAYOUT" == "NHD" ]]; then
@@ -148,6 +156,11 @@ run_tests_for_model() {
    --tensor-parallel-size $PREFILLER_TP_SIZE \
    --kv-transfer-config '$KV_CONFIG'"

+    # Add attention backend config if specified
+    if [[ -n "$ATTENTION_BACKEND" ]]; then
+      BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
+    fi
+
    if [ -n "$model_args" ]; then
    FULL_CMD="$BASE_CMD $model_args"
    else
@@ -188,7 +201,12 @@ run_tests_for_model() {
    --block-size ${DECODE_BLOCK_SIZE} \
    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
    --kv-transfer-config '$KV_CONFIG'"
-  
+
+    # Add attention backend config if specified
+    if [[ -n "$ATTENTION_BACKEND" ]]; then
+      BASE_CMD="${BASE_CMD} --attention-backend=$ATTENTION_BACKEND"
+    fi
+
  # DP-EP attention mode
  if [[ -z "$DP_EP" ]]; then
    BASE_CMD="${BASE_CMD} --tensor-parallel-size $DECODER_TP_SIZE"

--- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py
+++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py
@@ -14,12 +14,12 @@ from unittest.mock import patch

 import pytest

-from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory
 from vllm.distributed.kv_transfer.kv_connector.v1 import (
    KVConnectorBase_V1,
    KVConnectorRole,
 )
+from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput

 from .utils import create_scheduler, create_vllm_config

--- a/tests/v1/kv_connector/unit/test_config.py
+++ b/tests/v1/kv_connector/unit/test_config.py
@@ -15,11 +15,12 @@ pytestmark = pytest.mark.cpu_test
    [
        ("native", 4.0, 1, 1, "OffloadingConnector", 4.0 * (1 << 30)),
        # bytes per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
-        ("native", 8.0, 2, 2, "OffloadingConnector", 8.0 * (1 << 30) / 4),
+        ("native", 8.0, 2, 2, "OffloadingConnector", 8.0 * (1 << 30)),
        ("lmcache", 4.0, 1, 1, "LMCacheConnectorV1", 4.0),
        # size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
        ("lmcache", 8.0, 2, 2, "LMCacheConnectorV1", 2.0),
-        (None, None, 1, 1, None, None),
+        # When kv_offloading_size is None, offloading is disabled (backend is ignored)
+        ("native", None, 1, 1, None, None),
    ],
 )
 def test_kv_connector(
@@ -54,8 +55,7 @@ def test_kv_connector(
    assert kv_transfer_config.kv_role == "kv_both"

    if kv_offloading_backend == "native":
-        assert kv_connector_extra_config["kv_bytes_per_rank"] == expected_bytes
-        assert kv_connector_extra_config["num_cpu_blocks"] == 0
+        assert kv_connector_extra_config["cpu_bytes_to_use"] == expected_bytes
        # Existing config should be preserved
        assert kv_connector_extra_config["existing_key"] == "existing_value"
    elif kv_offloading_backend == "lmcache":
@@ -63,3 +63,19 @@ def test_kv_connector(
        assert kv_connector_extra_config["lmcache.max_local_cpu_size"] == expected_bytes
        # Existing config should be replaced
        assert "existing_key" not in kv_connector_extra_config
+
+
+def test_kv_offloading_size_only_uses_native_default():
+    """Test that setting only kv_offloading_size enables native offloading."""
+    vllm_config = VllmConfig(
+        cache_config=CacheConfig(
+            kv_offloading_size=4.0,
+            # kv_offloading_backend not set, should default to "native"
+        ),
+    )
+
+    kv_transfer_config = vllm_config.kv_transfer_config
+    kv_connector_extra_config = kv_transfer_config.kv_connector_extra_config
+    assert kv_transfer_config.kv_connector == "OffloadingConnector"
+    assert kv_transfer_config.kv_role == "kv_both"
+    assert kv_connector_extra_config["cpu_bytes_to_use"] == 4.0 * (1 << 30)