Merge tag 'v0.8.3' into v0.8.3-dev

fcfc474d · zhuwenwen · bb94d2e5 · 296c6572 · fcfc474d · fcfc474d
Commit fcfc474d authored Apr 09, 2025 by zhuwenwen
20 changed files
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
+# SPDX-License-Identifier: Apache-2.0
+import json
+import re
+from copy import deepcopy
+from unittest.mock import MagicMock
+
+import pytest
+from pydantic import TypeAdapter
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              ChatCompletionToolsParam)
+from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
+
+EXAMPLE_TOOLS = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type":
+                        "string",
+                        "description":
+                        "The city to find the weather for"
+                        ", e.g. 'San Francisco'",
+                    },
+                },
+                "required": ["city"],
+                "additionalProperties": False
+            },
+        },
+        "strict": True
+    },
+    {
+        "type": "function",
+        "function": {
+            "name": "get_forecast",
+            "description": "Get the weather forecast for a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type":
+                        "string",
+                        "description":
+                        "The city to get the forecast for, e.g. 'New York'",
+                    },
+                    "days": {
+                        "type":
+                        "integer",
+                        "description":
+                        "Number of days to get the forecast for (1-7)",
+                    },
+                },
+                "required": ["city", "days"],
+                "additionalProperties": False
+            },
+        },
+        "strict": True
+    },
+]
+
+
+def _compile_and_check(tools: list[ChatCompletionToolsParam], sample_output,
+                       should_match: bool):
+    self = MagicMock(tool_choice="required", tools=tools)
+    schema = ChatCompletionRequest._get_guided_json_from_tool(self)
+    assert isinstance(schema, dict)
+
+    # use build_regex_from_schema used in JSONLogitsProcessor to create Guide
+    from outlines_core.fsm.json_schema import build_regex_from_schema
+    regex = build_regex_from_schema(json.dumps(schema))
+    compiled = re.compile(regex)
+    matches = compiled.fullmatch(json.dumps(sample_output)) is not None
+
+    assert matches == should_match
+
+
+VALID_TOOL_OUTPUTS = [
+    ([{
+        "name": "get_current_weather",
+        "parameters": {
+            "city": "Vienna"
+        }
+    }], True),
+    ([{
+        "name": "get_current_weather",
+        "parameters": {
+            "city": "Vienna"
+        }
+    }, {
+        "name": "get_current_weather",
+        "parameters": {
+            "city": "Berlin"
+        }
+    }], True),
+    ([{
+        "name": "get_forecast",
+        "parameters": {
+            "city": "Vienna",
+            "days": 7
+        }
+    }], True),
+    ([{
+        "name": "get_forecast",
+        "parameters": {
+            "city": "Vienna",
+            "days": 7
+        }
+    }, {
+        "name": "get_current_weather",
+        "parameters": {
+            "city": "Vienna"
+        }
+    }], True),
+    ([{
+        "name": "get_forecast",
+        "parameters": {
+            "city": "Vienna",
+            "days": 7
+        }
+    }, {
+        "name": "get_current_weather",
+        "parameters": {
+            "city": "Vienna"
+        }
+    }, {
+        "name": "get_forecast",
+        "parameters": {
+            "city": "Berlin",
+            "days": 7
+        }
+    }, {
+        "name": "get_current_weather",
+        "parameters": {
+            "city": "Berlin"
+        }
+    }], True),
+]
+
+VALID_TOOLS = [t[0] for t in VALID_TOOL_OUTPUTS]
+
+
+@pytest.mark.parametrize(
+    "sample_output, should_match",
+    VALID_TOOL_OUTPUTS + [
+        (None, False),
+        ([], False),  # empty list cannot be generated
+        ({}, False),  # empty object cannot be generated
+        ([{}], False),  # list with empty object cannot be generated
+        (
+            [{  # function without required parameters cannot be generated
+                "name": "get_current_weather"
+            }],
+            False),
+        (
+            [{  # function without required parameters cannot be generated
+                "name": "get_current_weather",
+                "parameters": {}
+            }],
+            False),
+        (
+            [{  # function without required parameters cannot be generated
+                "name": "get_current_weather",
+                "parameters": None
+            }],
+            False),
+        (
+            {  # tool call without lists cannot be generated
+                "name": "get_current_weather",
+                "parameters": {
+                    "city": "Vienna"
+                }
+            },
+            False),
+        (
+            [{  # tool call with extra parameters cannot be generated
+                "name": "get_current_weather",
+                "parameters": {
+                    "city": "Vienna",
+                    "extra": "value"
+                }
+            }],
+            False),
+        (
+            [{  # tool call where parameters are first cannot be generated
+                "parameters": {
+                    "city": "Vienna"
+                },
+                "name": "get_current_weather"
+            }],
+            False),
+        (
+            [{  # tool call without all required parameters cannot be generated
+                "name": "get_forecast",
+                "parameters": {
+                    "city": "Vienna"
+                }
+            }],
+            False),
+        (  # tool call with incorrect name/parameters cannot be generated
+            [{
+                "name": "get_weather",
+                "parameters": {
+                    "city": "Vienna",
+                    "days": 7
+                }
+            }], False),
+        (  #  tool call with both valid and empty function cannot be generated
+            [{
+                "name": "get_current_weather",
+                "parameters": {
+                    "city": "Vienna"
+                }
+            }, {}], False),
+    ])
+def test_guided_json(sample_output, should_match):
+    _compile_and_check(tools=TypeAdapter(
+        list[ChatCompletionToolsParam]).validate_python(EXAMPLE_TOOLS),
+                       sample_output=sample_output,
+                       should_match=should_match)
+
+
+def update_parameters_none(
+        tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
+    tool.function.parameters = None
+    return tool
+
+
+def update_parameters_empty_dict(
+        tool: ChatCompletionToolsParam) -> ChatCompletionToolsParam:
+    tool.function.parameters = {}
+    return tool
+
+
+@pytest.mark.parametrize(
+    "sample_output, should_match",
+    [
+        (None, False),
+        ([], False),  # empty list cannot be generated
+        ({}, False),  # empty object cannot be generated
+        ([{}], False),  # list with empty object cannot be generated
+        (
+            [{  # function without required parameters cannot be generated
+                "name": "get_current_weather"
+            }],
+            False),
+        (
+            [{  # function without required parameters cannot be generated
+                "name": "get_current_weather",
+                "parameters": None
+            }],
+            False),
+        (
+            [{  # function with extra parameters cannot be generated
+                "name": "get_current_weather",
+                "parameters": {
+                    "extra": "value"
+                }
+            }],
+            False),
+        (
+            [{  # only function with empty parameters object is valid
+                "name": "get_current_weather",
+                "parameters": {}
+            }],
+            True),
+    ])
+@pytest.mark.parametrize(
+    "update_parameters",
+    [update_parameters_none, update_parameters_empty_dict])
+def test_guided_json_without_parameters(sample_output, should_match,
+                                        update_parameters):
+    updated_tools = [deepcopy(EXAMPLE_TOOLS[0])]
+    tools = TypeAdapter(
+        list[ChatCompletionToolsParam]).validate_python(updated_tools)
+    tools = list(map(update_parameters, tools))
+    assert all([
+        tool.function.parameters is None or tool.function.parameters == {}
+        for tool in tools
+    ])
+    _compile_and_check(tools=tools,
+                       sample_output=sample_output,
+                       should_match=should_match)
+
+
+@pytest.mark.parametrize("output", VALID_TOOLS)
+@pytest.mark.parametrize("empty_params", [False, True])
+@pytest.mark.parametrize("delta_len", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+def test_streaming_output_valid(output, empty_params, delta_len):
+    self = MagicMock()
+
+    output = deepcopy(output)
+    if empty_params:
+        output = [{"name": o["name"], "parameters": {}} for o in output]
+    output_json = json.dumps(output)
+
+    previous_text = ""
+    function_name_returned = False
+    messages = []
+    for i in range(0, len(output_json), delta_len):
+        delta_text = output_json[i:i + delta_len]
+        current_text = previous_text + delta_text
+
+        delta_message, function_name_returned = (
+            OpenAIServingChat.extract_tool_call_required_streaming(
+                self,
+                previous_text=previous_text,
+                current_text=current_text,
+                delta_text=delta_text,
+                function_name_returned=function_name_returned))
+
+        if delta_message:
+            messages.append(delta_message)
+
+        previous_text = current_text
+
+    assert len(messages) > 0
+    combined_messages = "["
+    for message in messages:
+        if message.tool_calls[0].function.name:
+            if len(combined_messages) > 1:
+                combined_messages += "},"
+
+            combined_messages += '{"name": "' + \
+                message.tool_calls[0].function.name  + \
+                    '", "parameters": ' + \
+                        message.tool_calls[0].function.arguments
+        else:
+            combined_messages += message.tool_calls[0].function.arguments
+    combined_messages += "}]"
+    assert json.loads(combined_messages) == output
+    assert json.dumps(json.loads(combined_messages)) == output_json
--- a/tests/tpu/untest_compilation.py
+++ b/tests/tpu/untest_compilation.py
@@ -5,12 +5,8 @@ import os
 import tempfile

 import depyf
-import pytest

-from vllm.config import CompilationLevel

-
-@pytest.mark.skip(reason="Not working; needs investigation.")
 def test_tpu_compilation():
    temp_dir = tempfile.mkdtemp()
    with depyf.prepare_debug(temp_dir):
@@ -22,27 +18,24 @@ def test_tpu_compilation():
            "The greatest glory in living lies not in never falling,",
        ]
        answers = [
-            " or, through inaction, allow a human being to come to harm.",
-            " what is essential is invisible to the eye.",
-            " but in rising every time we fall.",
+            " or, through inaction",
+            " what is essential ",
+            " but in rising ",
        ]
-        N = 1
+
        # Currently, top-p sampling is disabled. `top_p` should be 1.0.
+        N = 1
        sampling_params = SamplingParams(temperature=0.7,
                                         top_p=1.0,
                                         n=N,
                                         max_tokens=16)

-        # Set `enforce_eager=True` to avoid ahead-of-time compilation.
-        # In real workloads, `enforace_eager` should be `False`.
+        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
+                  max_num_batched_tokens=256,
+                  max_model_len=256,
+                  max_num_seqs=32,
+                  enforce_eager=False)

-        # disable custom dispatcher, let Dynamo takes over
-        # all the control
-        llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct",
-                  max_model_len=512,
-                  max_num_seqs=64,
-                  enforce_eager=True,
-                  compilation_config={"level": CompilationLevel.DYNAMO_AS_IS})
        outputs = llm.generate(prompts, sampling_params)
        for output, answer in zip(outputs, answers):
            prompt = output.prompt
@@ -56,16 +49,11 @@ def test_tpu_compilation():
    for i, compiled_code in enumerate(compiled_codes):
        print("{} file: {}".format(i + 1, compiled_code))

-    # We should only trigger Dynamo compilation 4 times:
-    # 1. forward pass (symbolic)
-    # 2. compute_logits (symbolic)
-    # 3. forward pass (shape 16)
-    # 4. forward pass (shape 32)
-    # and later calls should not trigger Dynamo compilation again.
-    # NOTE: It might still trigger XLA compilation.
-
+    # We should only trigger Dynamo compilation 2 times:
+    # 1. Forward pass without kv_caches
+    # 2. Forward pass with kv_caches
    # Check we have 4 compiled codes
-    assert len(compiled_codes) == 4
+    assert len(compiled_codes) == 2

    kv_cache_prefix = "kv_cache"
    attn_prefix = "ragged_paged_attention"
@@ -77,24 +65,13 @@ def test_tpu_compilation():
    for i, compiled_fn in enumerate(compiled_fns):
        print("{} file: {}".format(i + 1, compiled_fn))

-    # The first compilation is symbolic, so it should not have any kv_caches
+    # The first compilation should not have any kv_caches
    with open(compiled_fns[0]) as f:
        content = f.read()
        assert kv_cache_prefix not in content

-    # The second compilation is symbolic, so it should not have any kv_caches
-    with open(compiled_fns[1]) as f:
-        content = f.read()
-        assert kv_cache_prefix not in content
-
-    # The third compilation is shape 16, so it should have kv_caches and the
+    # The second compilation should have kv_caches and the
    # ragged_paged_attention
-    with open(compiled_fns[2]) as f:
-        content = f.read()
-        assert (kv_cache_prefix in content and attn_prefix in content)
-
-    # The forth compilation is shape 32, so it should have kv_caches and the
-    # ragged_paged_attention
-    with open(compiled_fns[3]) as f:
+    with open(compiled_fns[1]) as f:
        content = f.read()
        assert (kv_cache_prefix in content and attn_prefix in content)
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -110,6 +110,9 @@ class RemoteOpenAIServer:
        self.host = str(args.host or 'localhost')
        self.port = int(args.port)

+        self.show_hidden_metrics = \
+            args.show_hidden_metrics_for_version is not None
+
        # download the model before starting the server to avoid timeout
        is_local = os.path.isdir(model)
        if not is_local:
@@ -323,6 +326,37 @@ def _test_completion_close(
    return results


+def _test_chat(
+    client: openai.OpenAI,
+    model: str,
+    prompt: str,
+):
+    results = []
+
+    messages = [{
+        "role": "user",
+        "content": [{
+            "type": "text",
+            "text": prompt
+        }]
+    }]
+
+    # test with text prompt
+    chat_response = client.chat.completions.create(model=model,
+                                                   messages=messages,
+                                                   max_tokens=5,
+                                                   temperature=0.0)
+
+    results.append({
+        "test": "completion_close",
+        "text": chat_response.choices[0].message.content,
+        "finish_reason": chat_response.choices[0].finish_reason,
+        "usage": chat_response.usage,
+    })
+
+    return results
+
+
 def _test_embeddings(
    client: openai.OpenAI,
    model: str,
@@ -518,6 +552,8 @@ def compare_all_settings(model: str,
                results += _test_completion(client, model, prompt, token_ids)
            elif method == "generate_close":
                results += _test_completion_close(client, model, prompt)
+            elif method == "generate_chat":
+                results += _test_chat(client, model, prompt)
            elif method == "generate_with_image":
                results += _test_image_text(
                    client, model,
@@ -585,7 +621,6 @@ def multi_process_parallel(
    # as compared to multiprocessing.
    # NOTE: We need to set working_dir for distributed tests,
    # otherwise we may get import errors on ray workers
-    # ray.init(num_gpus=tp_size, runtime_env={"working_dir": VLLM_PATH}) xiabo
    # NOTE: Force ray not to use gitignore file as excluding, otherwise
    # it will not move .so files to working dir.
    # So we have to manually add some of large directories

--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -5,8 +5,12 @@ import torch

 from vllm.multimodal.inputs import MultiModalKwargs
 from vllm.sampling_params import SamplingParams
-from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
-                                         KVCacheBlock, PrefixCachingMetrics,
+from vllm.utils import sha256
+# disable yapf here as it formats differently than isort such that both fail
+# yapf: disable
+from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType,
+                                         FreeKVCacheBlockQueue, KVCacheBlock,
+                                         PrefixCachingMetrics,
                                         generate_block_hash_extra_keys,
                                         hash_block_tokens,
                                         hash_request_tokens,
@@ -16,6 +20,8 @@ from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
 from vllm.v1.metrics.stats import PrefixCacheStats
 from vllm.v1.request import Request

+# yapf: enable
+

 def make_request(request_id,
                 prompt_token_ids,
@@ -40,6 +46,12 @@ def make_request(request_id,
    )


+def test_none_hash():
+    assert NONE_HASH is not None
+    assert isinstance(NONE_HASH, int)
+    assert NONE_HASH != 0
+
+
 def test_kv_cache_block():
    # Test KVCacheBlock initialization
    block = KVCacheBlock(block_id=0)
@@ -190,21 +202,23 @@ def test_generate_block_hash_extra_keys_no_mm_inputs():
    assert next_mm_idx == 0


-def test_hash_block_tokens():
+@pytest.mark.parametrize("hash_fn", [sha256, hash])
+def test_hash_block_tokens(hash_fn):
    parent_block_hash = 123
    curr_block_token_ids = (1, 2, 3)
    extra_keys = ("key1", "key2")

-    block_hash = hash_block_tokens(parent_block_hash, curr_block_token_ids,
-                                   extra_keys)
+    block_hash = hash_block_tokens(hash_fn, parent_block_hash,
+                                   curr_block_token_ids, extra_keys)
    assert isinstance(block_hash, BlockHashType)
-    assert block_hash.hash_value == hash(
+    assert block_hash.hash_value == hash_fn(
        (parent_block_hash, curr_block_token_ids, extra_keys))
    assert block_hash.token_ids == curr_block_token_ids
    assert block_hash.extra_keys == extra_keys


-def test_hash_request_tokens():
+@pytest.mark.parametrize("hash_fn", [sha256, hash])
+def test_hash_request_tokens(hash_fn):
    request = make_request(
        request_id=0,
        prompt_token_ids=[_ for _ in range(6)],
@@ -219,7 +233,7 @@ def test_hash_request_tokens():
    )

    block_size = 3
-    block_hashes = hash_request_tokens(block_size, request)
+    block_hashes = hash_request_tokens(hash_fn, block_size, request)

    assert len(block_hashes) == 2
    assert isinstance(block_hashes[0], BlockHashType)
@@ -234,7 +248,8 @@ def test_hash_request_tokens():
    assert block_hashes[1].extra_keys == ("hash2", )


-def test_hash_tokens_different_mm_input():
+@pytest.mark.parametrize("hash_fn", [sha256, hash])
+def test_hash_tokens_different_mm_input(hash_fn):
    request1 = make_request(
        request_id=0,
        prompt_token_ids=[_ for _ in range(6)],
@@ -260,13 +275,14 @@ def test_hash_tokens_different_mm_input():
        mm_hashes=["hash3", "hash2"],
    )
    block_size = 3
-    block_hashes1 = hash_request_tokens(block_size, request1)
-    block_hashes2 = hash_request_tokens(block_size, request2)
+    block_hashes1 = hash_request_tokens(hash_fn, block_size, request1)
+    block_hashes2 = hash_request_tokens(hash_fn, block_size, request2)
    assert block_hashes1[0] != block_hashes2[0]
    assert block_hashes1[1] != block_hashes2[1]


-def test_hash_request_tokens_no_mm_inputs():
+@pytest.mark.parametrize("hash_fn", [sha256, hash])
+def test_hash_request_tokens_no_mm_inputs(hash_fn):
    request = make_request(
        request_id=0,
        prompt_token_ids=[_ for _ in range(6)],
@@ -275,7 +291,7 @@ def test_hash_request_tokens_no_mm_inputs():
    )

    block_size = 3
-    block_hashes = hash_request_tokens(block_size, request)
+    block_hashes = hash_request_tokens(hash_fn, block_size, request)

    assert len(block_hashes) == 2
    assert block_hashes[0].token_ids == (0, 1, 2)

--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
@@ -4,14 +4,17 @@
 from typing import Optional

 import pytest
+import torch

 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams
-from vllm.utils import cdiv
+from vllm.utils import cdiv, sha256
 from vllm.v1.core.block_pool import BlockPool
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
 from vllm.v1.core.kv_cache_utils import (BlockHashType, KVCacheBlock,
                                         hash_block_tokens)
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
+                                        KVCacheGroupSpec)


 def make_request(request_id,
@@ -39,16 +42,31 @@ def make_request(request_id,
    )


-def test_prefill():
+def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
+    return KVCacheConfig(
+        num_blocks=num_blocks,
+        tensors={},
+        kv_cache_groups=[
+            KVCacheGroupSpec(['layer'],
+                             FullAttentionSpec(block_size, 1, 1, torch.float32,
+                                               False))
+        ],
+    )
+
+
+@pytest.mark.parametrize("hash_algo", ["sha256", "hash"])
+def test_prefill(hash_algo):
    manager = KVCacheManager(
-        block_size=16,
-        num_gpu_blocks=10,
+        make_kv_cache_config(16, 11),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
+        caching_hash_algo=hash_algo,
        num_preallocate_tokens=16,
    )

+    # choose the hash function according to the parameter
+    hash_fn = sha256 if hash_algo == "sha256" else hash
+
    # Complete 3 blocks (48 tokens)
    common_token_ids = [i for i in range(3) for _ in range(16)]

@@ -62,19 +80,20 @@ def test_prefill():
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, 55, computed_blocks)
-    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
+    assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]

    # Check full block metadata
    parent_block_hash = None
-    for block_id in (0, 1, 2):
-        block_tokens = tuple(all_token_ids[block_id * 16:(block_id + 1) * 16])
-        block_hash = hash_block_tokens(parent_block_hash, block_tokens)
+    for block_id in (1, 2, 3):
+        block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
+        block_hash = hash_block_tokens(hash_fn, parent_block_hash,
+                                       block_tokens)
        assert manager.block_pool.blocks[block_id].block_hash == block_hash
        assert manager.block_pool.blocks[block_id].ref_cnt == 1
        parent_block_hash = block_hash.hash_value

    # Check partial/preallocated block metadata
-    for block_id in (3, 4):
+    for block_id in (4, 5):
        assert manager.block_pool.blocks[block_id].block_hash is None
        assert manager.block_pool.blocks[block_id].ref_cnt == 1

@@ -84,11 +103,11 @@ def test_prefill():
    req1 = make_request("1", common_token_ids + unique_token_ids)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
+    assert [b.block_id for b in computed_blocks] == [1, 2, 3]
    assert num_computed_tokens == 3 * 16
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
-    assert [b.block_id for b in blocks] == [5, 6]
+    assert [b.block_id for b in blocks] == [6, 7]
    for block in computed_blocks:
        assert block.ref_cnt == 2

@@ -101,14 +120,14 @@ def test_prefill():
    # All blocks should be available.
    assert manager.block_pool.free_block_queue.num_free_blocks == 10
    # The order should be
-    # [unallocated (7, 8, 9)]
-    # [unique_req0 (4, 3)]
-    # [unique_req1 (6, 5)]
-    # [common (2, 1, 0)]
+    # [unallocated (8, 9, 10)]
+    # [unique_req0 (5, 4)]
+    # [unique_req1 (7, 6)]
+    # [common (3, 2, 1)]
    assert [
        b.block_id
        for b in manager.block_pool.free_block_queue.get_all_free_blocks()
-    ] == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0]
+    ] == [8, 9, 10, 5, 4, 7, 6, 3, 2, 1]

    # Cache hit in the common prefix when the original block is already free.
    # Incomplete 1 block (6 tokens)
@@ -116,11 +135,11 @@ def test_prefill():
    req2 = make_request("2", common_token_ids + unique_token_ids)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(manager.req_to_block_hashes[req2.request_id]) == 3
-    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
+    assert [b.block_id for b in computed_blocks] == [1, 2, 3]
    assert num_computed_tokens == 3 * 16
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
-    assert [b.block_id for b in blocks] == [7, 8]
+    assert [b.block_id for b in blocks] == [8, 9]

    # Although we only have 5 free blocks, we have 8 blocks in
    # the free block queue due to lazy removal.
@@ -142,7 +161,7 @@ def test_prefill():
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
    # This block ID order also checks the eviction order.
-    assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
+    assert [b.block_id for b in blocks] == [10, 5, 4, 7, 6, 9, 8, 3, 2, 1]
    assert manager.block_pool.free_block_queue.num_free_blocks == 0
    assert manager.block_pool.free_block_queue.free_list_head is None
    assert manager.block_pool.free_block_queue.free_list_tail is None
@@ -156,13 +175,13 @@ def test_prefill_plp():
    3. Schedule plp request; no hit should occur; validate blocks
    '''
    manager = KVCacheManager(
-        block_size=16,
-        num_gpu_blocks=10,
+        make_kv_cache_config(16, 11),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )
+    # the default hash function is hash
+    hash_fn = hash

    # Complete 3 blocks (48 tokens)
    common_token_ids = [i for i in range(3) for _ in range(16)]
@@ -178,20 +197,21 @@ def test_prefill_plp():
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, 55, computed_blocks)
-    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
+    assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
    req0_block_hashes = [b.block_hash for b in blocks]

    # Check full block metadata
    parent_block_hash = None
-    for block_id in (0, 1, 2):
-        block_tokens = tuple(all_token_ids[block_id * 16:(block_id + 1) * 16])
-        block_hash = hash_block_tokens(parent_block_hash, block_tokens)
+    for block_id in (1, 2, 3):
+        block_tokens = tuple(all_token_ids[(block_id - 1) * 16:block_id * 16])
+        block_hash = hash_block_tokens(hash_fn, parent_block_hash,
+                                       block_tokens)
        assert manager.block_pool.blocks[block_id].block_hash == block_hash
        assert manager.block_pool.blocks[block_id].ref_cnt == 1
        parent_block_hash = block_hash.hash_value

    # Check partial/preallocated block metadata
-    for block_id in (3, 4):
+    for block_id in (4, 5):
        assert manager.block_pool.blocks[block_id].block_hash is None
        assert manager.block_pool.blocks[block_id].ref_cnt == 1

@@ -202,11 +222,11 @@ def test_prefill_plp():
    req1 = make_request("1", common_token_ids + unique_token_ids)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(manager.req_to_block_hashes[req1.request_id]) == 3
-    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
+    assert [b.block_id for b in computed_blocks] == [1, 2, 3]
    assert num_computed_tokens == 3 * 16
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
-    assert [b.block_id for b in blocks] == [5, 6]
+    assert [b.block_id for b in blocks] == [6, 7]
    for block in computed_blocks:
        assert block.ref_cnt == 2

@@ -219,14 +239,14 @@ def test_prefill_plp():
    # All blocks should be available.
    assert manager.block_pool.free_block_queue.num_free_blocks == 10
    # The order should be
-    # [unallocated (7, 8, 9)]
-    # [unique_req0 (4, 3)]
-    # [unique_req1 (6, 5)]
-    # [common (2, 1, 0)]
+    # [unallocated (8, 9, 10)]
+    # [unique_req0 (5, 4)]
+    # [unique_req1 (7, 6)]
+    # [common (3, 2, 1)]
    assert [
        b.block_id
        for b in manager.block_pool.free_block_queue.get_all_free_blocks()
-    ] == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0]
+    ] == [8, 9, 10, 5, 4, 7, 6, 3, 2, 1]

    # Request #2 is a prompt-logprobs request:
    # NO cache hit in the common prefix; duplicates request #0 cached blocks
@@ -242,7 +262,7 @@ def test_prefill_plp():
    block_ids = [b.block_id for b in blocks]
    # Duplicate cached blocks have different ids but same hashes vs request #0
    assert [b.block_hash for b in blocks] == req0_block_hashes
-    assert block_ids != [0, 1, 2, 3, 4]
+    assert block_ids != [1, 2, 3, 4, 5]

    # Request #2 block hashes are valid since request #0 hashes are.
    # Check block reference counts.
@@ -254,10 +274,8 @@ def test_prefill_plp():

 def test_decode():
    manager = KVCacheManager(
-        block_size=16,
-        num_gpu_blocks=10,
+        make_kv_cache_config(16, 11),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )
@@ -273,7 +291,7 @@ def test_decode():
    assert not computed_blocks
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, 55, computed_blocks)
-    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
+    assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]

    # Append slots without allocating a new block.
    req0.num_computed_tokens = 55
@@ -307,10 +325,8 @@ def test_decode():

 def test_evict():
    manager = KVCacheManager(
-        block_size=16,
-        num_gpu_blocks=10,
+        make_kv_cache_config(16, 11),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )
@@ -341,15 +357,15 @@ def test_evict():
    assert [
        b.block_id
        for b in manager.block_pool.free_block_queue.get_all_free_blocks()
-    ] == [6, 5, 4, 3, 2, 1, 0, 9, 8, 7]
+    ] == [7, 6, 5, 4, 3, 2, 1, 10, 9, 8]

    # Touch the first 2 blocks.
    req2 = make_request("2", list(range(2 * 16 + 3)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
-    assert [b.block_id for b in computed_blocks] == [0, 1]
+    assert [b.block_id for b in computed_blocks] == [1, 2]
    assert num_computed_tokens == 2 * 16
    blocks = manager.allocate_slots(req2, 3, computed_blocks)
-    assert [b.block_id for b in blocks] == [6, 5]
+    assert [b.block_id for b in blocks] == [7, 6]
    assert manager.block_pool.free_block_queue.num_free_blocks == 6


@@ -360,10 +376,8 @@ def test_hash_block_correct_reuse():
    """
    block_size = 16
    manager = KVCacheManager(
-        block_size=block_size,
-        num_gpu_blocks=1,
+        make_kv_cache_config(16, 2),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )
@@ -399,10 +413,8 @@ def test_computed_blocks_not_evicted():
    """
    block_size = 16
    manager = KVCacheManager(
-        block_size=block_size,
-        num_gpu_blocks=2,
+        make_kv_cache_config(block_size, 3),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )
@@ -415,7 +427,7 @@ def test_computed_blocks_not_evicted():
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, num_tokens, computed_blocks)
    assert len(blocks) == 1
-    assert blocks[0].block_id == 0
+    assert blocks[0].block_id == 1

    # Allocate another block.
    req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
@@ -424,7 +436,7 @@ def test_computed_blocks_not_evicted():
    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req1, num_tokens, computed_blocks)
    assert len(blocks) == 1
-    assert blocks[0].block_id == 1
+    assert blocks[0].block_id == 2

    # Free the blocks.
    manager.free(req0)
@@ -435,13 +447,13 @@ def test_computed_blocks_not_evicted():
    req2 = make_request("2", list(range(num_tokens * 2)))
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(computed_blocks) == 1
-    assert computed_blocks[0].block_id == 0
+    assert computed_blocks[0].block_id == 1
    assert num_computed_tokens == block_size

    blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
                                    computed_blocks)
    assert len(blocks) == 1
-    assert blocks[0].block_id == 1
+    assert blocks[0].block_id == 2


 def test_basic_prefix_caching_disabled():
@@ -450,10 +462,8 @@ def test_basic_prefix_caching_disabled():
    """
    block_size = 4
    manager = KVCacheManager(
-        block_size=block_size,
-        num_gpu_blocks=4,
+        make_kv_cache_config(block_size, 5),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=False,
        num_preallocate_tokens=0,
    )
@@ -493,10 +503,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
    This tests that the preallocated blocks are correctly added.
    """
    manager = KVCacheManager(
-        block_size=block_size,
-        num_gpu_blocks=10,
+        make_kv_cache_config(block_size, 11),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=num_preallocate_tokens,
    )
@@ -522,7 +530,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
    assert len(blocks) == 1 + num_preallocated_blocks


-def test_cache_blocks():
+@pytest.mark.parametrize("hash_fn", [sha256, hash])
+def test_cache_blocks(hash_fn):
    """
    This is a unit test that tests the correctness of the _cache_full_blocks
    function of KVCacheManager.
@@ -550,6 +559,7 @@ def test_cache_blocks():
        num_cached_blocks=0,
        num_full_blocks=2,
        block_size=block_size,
+        hash_fn=hash_fn,
    )

    assert len(block_pool.cached_block_hash_to_block) == 2
@@ -564,6 +574,7 @@ def test_cache_blocks():
        num_cached_blocks=2,
        num_full_blocks=3,
        block_size=block_size,
+        hash_fn=hash_fn,
    )
    assert len(block_pool.cached_block_hash_to_block) == 3
    assert blocks[0].block_hash is not None
@@ -574,10 +585,8 @@ def test_mm_prefix_caching():
    This tests that the multi-modal prefix caching is correct.
    """
    manager = KVCacheManager(
-        block_size=16,
-        num_gpu_blocks=10,
+        make_kv_cache_config(16, 11),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=16,
    )
@@ -617,7 +626,7 @@ def test_mm_prefix_caching():
    assert block_hashes[2].extra_keys == ("bbb", )

    blocks = manager.allocate_slots(req0, 59, computed_blocks)
-    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
+    assert [b.block_id for b in blocks] == [1, 2, 3, 4, 5]
    req0.num_computed_tokens = 59

    # Append slots without allocating a new block.
@@ -655,10 +664,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
    """
    block_size = 16
    manager = KVCacheManager(
-        block_size=block_size,
-        num_gpu_blocks=10,
+        make_kv_cache_config(block_size, 11),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )
@@ -711,10 +718,8 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():

 def test_reset_prefix_cache():
    manager = KVCacheManager(
-        block_size=16,
-        num_gpu_blocks=10,
+        make_kv_cache_config(16, 11),
        max_model_len=8192,
-        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )
@@ -724,7 +729,7 @@ def test_reset_prefix_cache():
    all_token_ids = full_block_token_ids + unique_token_ids
    req0 = make_request("0", all_token_ids)
    blocks = manager.allocate_slots(req0, 55)
-    assert [b.block_id for b in blocks] == [0, 1, 2, 3]
+    assert [b.block_id for b in blocks] == [1, 2, 3, 4]

    unique_token_ids = [4] * 7
    all_token_ids = full_block_token_ids + unique_token_ids
@@ -733,7 +738,7 @@ def test_reset_prefix_cache():
    assert len(manager.req_to_block_hashes[req1.request_id]) == 3
    assert len(computed_blocks) == 3
    blocks = manager.allocate_slots(req1, 7, computed_blocks)
-    assert [b.block_id for b in blocks] == [4]
+    assert [b.block_id for b in blocks] == [5]

    # Failed to reset prefix cache because some blocks are not freed yet.
    assert not manager.reset_prefix_cache()

--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -2,12 +2,15 @@
 from typing import Optional

 import pytest
+import torch

 from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
+from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
+                                        KVCacheGroupSpec)
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.structured_output import StructuredOutputManager
@@ -20,9 +23,10 @@ def create_scheduler(
    max_num_seqs: int = 16,
    max_num_batched_tokens: int = 8192,
    enable_prefix_caching: Optional[bool] = None,
+    long_prefill_token_threshold: int = 0,
 ) -> Scheduler:
    '''Create scheduler under test.
-    
+
    Args:
      model: model under test
      max_num_seqs: max sequences to schedule
@@ -38,6 +42,7 @@ def create_scheduler(
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
        max_model_len=max_num_batched_tokens,
+        long_prefill_token_threshold=long_prefill_token_threshold,
    )
    model_config = ModelConfig(
        model=model,
@@ -64,13 +69,21 @@ def create_scheduler(
        model_config=model_config,
        cache_config=cache_config,
    )
+    kv_cache_config = KVCacheConfig(
+        num_blocks=10000,  # A large number of blocks to hold all requests
+        tensors={},
+        kv_cache_groups=[
+            KVCacheGroupSpec(['layer'],
+                             FullAttentionSpec(16, 1, 1, torch.float32, False))
+        ],
+    )
    cache_config.num_gpu_blocks = 10000
    return Scheduler(
        scheduler_config,
        model_config,
        cache_config,
-        speculative_config=None,
        lora_config=None,
+        kv_cache_config=kv_cache_config,
        log_stats=True,
        structured_output_manager=StructuredOutputManager(vllm_config),
    )
@@ -242,7 +255,9 @@ def test_schedule_partial_requests():
    model_runner_output = ModelRunnerOutput(
        req_ids=[request.request_id for request in requests],
        req_id_to_index=req_to_index,
-        sampled_token_ids=[[0] for _ in range(len(requests))],
+        # Only the first request has a sampled token id because
+        # the rest requests are still being prefilled.
+        sampled_token_ids=[[0], [], []],
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
@@ -263,6 +278,86 @@ def test_schedule_partial_requests():
    assert requests[2].request_id not in output.num_scheduled_tokens


+@pytest.mark.parametrize("enable_prefix_caching", [True, False])
+def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
+    """Test scheduling behavior with concurrent partial requests.
+
+    This test verifies that: there are multiple long prefill requests in the
+    RUNNING state, and we can schedule them together.
+
+    """
+    scheduler = create_scheduler(
+        model="facebook/opt-125m",
+        max_num_batched_tokens=1024,
+        long_prefill_token_threshold=400,
+        enable_prefix_caching=enable_prefix_caching,
+    )
+    requests = create_requests(
+        num_requests=3,
+        num_tokens=800,
+    )
+    for request in requests:
+        scheduler.add_request(request)
+
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == 3
+    assert len(output.scheduled_cached_reqs) == 0
+    assert len(output.finished_req_ids) == 0
+
+    # The first request is scheduled partially - 400.
+    assert output.num_scheduled_tokens[requests[0].request_id] == 400
+    # The second request is scheduled partially - 400.
+    assert output.num_scheduled_tokens[requests[1].request_id] == 400
+    # The third request is also scheduled partially - 1024 - 400 - 400 = 224.
+    assert output.num_scheduled_tokens[requests[2].request_id] == 224
+    req_to_index = {
+        request.request_id: i
+        for i, request in enumerate(requests)
+    }
+    model_runner_output = ModelRunnerOutput(
+        req_ids=[request.request_id for request in requests],
+        req_id_to_index=req_to_index,
+        sampled_token_ids=[[] for _ in range(len(requests))],
+        spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+    )
+    scheduler.update_from_output(output, model_runner_output)
+
+    # Schedule the next step. All three requests are running.
+    # Processed the remaining prefills of the first and second requests.
+    output1 = scheduler.schedule()
+    assert len(scheduler.running) == 3
+    assert len(output1.scheduled_new_reqs) == 0
+    assert len(output1.scheduled_cached_reqs) == 3
+    assert len(output1.finished_req_ids) == 0
+    assert output1.num_scheduled_tokens[requests[0].request_id] == 400
+    assert output1.num_scheduled_tokens[requests[1].request_id] == 400
+    assert output1.num_scheduled_tokens[requests[2].request_id] == 224
+
+    # Schedule the third step. All three requests are running.
+    # First and second requests are in the decode stage.
+    # All the remaining tokens in the third request are processed.
+    model_runner_output = ModelRunnerOutput(
+        req_ids=[request.request_id for request in requests],
+        req_id_to_index=req_to_index,
+        sampled_token_ids=[[0], [0]] + [[] for _ in range(len(requests) - 2)],
+        spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+    )
+    scheduler.update_from_output(output1, model_runner_output)
+    output2 = scheduler.schedule()
+    assert len(scheduler.running) == 3
+    assert len(output2.scheduled_new_reqs) == 0
+    assert len(output2.scheduled_cached_reqs) == 3
+    assert len(output2.finished_req_ids) == 0
+    assert output2.num_scheduled_tokens[requests[0].request_id] == 1
+    assert output2.num_scheduled_tokens[requests[1].request_id] == 1
+    assert output2.num_scheduled_tokens[
+        requests[2].request_id] == 800 - 224 - 224
+
+
 def test_stop_via_update_from_output():
    """Test stopping behavior through update_from_output"""
    scheduler = create_scheduler()
@@ -516,3 +611,99 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool],
        prompt_logprobs_dict={},
    )
    scheduler.update_from_output(scheduler_output1, model_runner_output)
+
+
+# Note - these test cases mirror some of those in test_rejection_sampler.py
+@pytest.mark.parametrize(
+    "spec_tokens,output_tokens,expected",
+    [
+        ([[1, 2, 3]], [[1, 2, 3, 4]], (3, 3)),  # perfect match
+        ([[1, 2, 3]], [[1, 5]], (3, 1)),  # early mismatch
+        ([[1, 2], [3]], [[1, 2, 5], [3, 4]], (3, 3)),  # multiple sequences
+        ([[1]], [[1, 2]], (1, 1)),  # single token sequence
+        ([[]], [[5]], (0, 0)),  # empty sequence
+        ([[1, 2, 3], [4, 5, 6]], [[1, 2, 7], [4, 8]],
+         (6, 3)),  # multiple mismatches
+    ])
+def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
+    """Test scheduling behavior with speculative decoding.
+
+    This test verifies that:
+    1. Speculated tokens get scheduled correctly
+    2. Spec decoding stats properly count number of draft and accepted tokens
+    """
+    scheduler = create_scheduler()
+    requests = create_requests(num_requests=len(spec_tokens), num_tokens=1)
+    req_ids = []
+    req_to_index = {}
+    for i, request in enumerate(requests):
+        scheduler.add_request(request)
+        req_ids.append(request.request_id)
+        req_to_index[request.request_id] = i
+
+    # Schedule a decode, which will also draft speculative tokens
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == len(requests)
+    assert output.total_num_scheduled_tokens == len(requests)
+    for i in range(len(requests)):
+        req_id = requests[i].request_id
+        assert output.num_scheduled_tokens[req_id] == 1
+        assert req_id not in output.scheduled_spec_decode_tokens
+
+    model_runner_output = ModelRunnerOutput(
+        req_ids=req_ids,
+        req_id_to_index=req_to_index,
+        sampled_token_ids=[[0] for _ in range(len(requests))],
+        spec_token_ids=spec_tokens,
+        logprobs=None,
+        prompt_logprobs_dict={},
+    )
+    engine_core_outputs = scheduler.update_from_output(output,
+                                                       model_runner_output)
+
+    for i in range(len(requests)):
+        running_req = scheduler.running[i]
+        # The prompt token
+        assert running_req.num_computed_tokens == 1
+        # The prompt token and the sampled token
+        assert running_req.num_tokens == 2
+        # The prompt token, the sampled token, and the speculated tokens
+        assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i])
+
+    # No draft or accepted tokens counted yet
+    assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None
+
+    # Schedule the speculated tokens for validation
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == 0
+    # The sampled token and speculated tokens
+    assert output.total_num_scheduled_tokens == \
+        len(requests) + sum(len(ids) for ids in spec_tokens)
+    for i in range(len(requests)):
+        req_id = requests[i].request_id
+        assert output.num_scheduled_tokens[req_id] == 1 + len(spec_tokens[i])
+        if spec_tokens[i]:
+            assert len(output.scheduled_spec_decode_tokens[req_id]) == \
+                len(spec_tokens[i])
+        else:
+            assert req_id not in output.scheduled_spec_decode_tokens
+
+    model_runner_output = ModelRunnerOutput(
+        req_ids=req_ids,
+        req_id_to_index=req_to_index,
+        sampled_token_ids=output_tokens,
+        spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+    )
+    engine_core_outputs = scheduler.update_from_output(output,
+                                                       model_runner_output)
+
+    scheduler_stats = engine_core_outputs.scheduler_stats
+    if expected[0] == 0:
+        assert scheduler_stats.spec_decoding_stats is None
+    else:
+        assert scheduler_stats.spec_decoding_stats is not None
+        stats = scheduler_stats.spec_decoding_stats
+        assert stats.num_draft_tokens == expected[0]
+        assert stats.num_accepted_tokens == expected[1]
--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+import pytest
+
+from vllm import LLM
+
+if os.getenv("VLLM_USE_V1", "0") != "1":
+    pytest.skip("Test package requires V1", allow_module_level=True)
+
+MODEL = "meta-llama/Llama-3.2-1B"
+PROMPT = "Hello my name is Robert and I"
+
+
+@pytest.fixture(scope="module")
+def model() -> LLM:
+    return LLM(MODEL,
+               enforce_eager=True,
+               enable_prefix_caching=True,
+               long_prefill_token_threshold=2,
+               max_num_batched_tokens=6,
+               max_num_seqs=3)
+
+
+def test_concurrent_partial_prefill(model):
+    outputs = model.generate([PROMPT] * 3)
+    assert len(outputs) == 3
+    for output in outputs:
+        assert len(output.outputs) == 1
--- a/tests/v1/core/test_specialized_manager.py
+++ b/tests/v1/core/test_specialized_manager.py
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+
+from vllm.v1.core.block_pool import BlockPool
+from vllm.v1.core.kv_cache_utils import BlockHashType, KVCacheBlock
+from vllm.v1.core.specialized_manager import SlidingWindowManager
+from vllm.v1.kv_cache_interface import SlidingWindowSpec
+
+
+def test_sliding_window_possible_cached_prefix():
+    sliding_window_spec = SlidingWindowSpec(
+        block_size=2,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+        sliding_window=4,
+        use_mla=False,
+    )
+
+    block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True)
+    manager = SlidingWindowManager(sliding_window_spec, block_pool)
+
+    def run_one_case(block_is_cached, expect_length):
+        block_hash_list = [
+            BlockHashType(i, ()) for i in range(len(block_is_cached))
+        ]
+
+        block_pool.cached_block_hash_to_block.clear()
+
+        # Mock the block pool with the cached blocks
+        for i, (block_hash,
+                is_cached) in enumerate(zip(block_hash_list, block_is_cached)):
+            if is_cached:
+                block_pool.cached_block_hash_to_block[block_hash] = {
+                    i: block_pool.blocks[i + 10]
+                }
+
+        computed_blocks = manager.find_longest_cache_hit(block_hash_list)
+        assert len(computed_blocks) == expect_length
+
+        assert all(block == block_pool.null_block
+                   for block in computed_blocks[:expect_length - 2])
+        for i in range(2):
+            if i < expect_length:
+                block_index = expect_length - i - 1
+                assert computed_blocks[
+                    block_index].block_id == block_index + 10
+
+    run_one_case([False] * 10, 0)
+    run_one_case([True], 1)
+    run_one_case([True, False], 1)
+    run_one_case([True, True], 2)
+    run_one_case([True, True, False], 2)
+    run_one_case([True, True, True], 3)
+    run_one_case([True, True, True, False], 3)
+    run_one_case([
+        True, True, False, True, False, False, True, True, False, True, True,
+        True
+    ], 12)
+    run_one_case([
+        True, True, False, True, False, False, True, True, False, False, False
+    ], 8)
+    run_one_case([
+        True, True, False, True, False, False, True, True, False, False, False,
+        True
+    ], 8)
+
+
+def test_sliding_window_remove_skipped_blocks():
+    sliding_window_spec = SlidingWindowSpec(
+        block_size=2,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+        sliding_window=4,
+        use_mla=False,
+    )
+
+    block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True)
+
+    manager = SlidingWindowManager(sliding_window_spec, block_pool)
+
+    null_block_id = block_pool.null_block.block_id
+
+    def id_to_block_table(ids):
+        return [
+            KVCacheBlock(id_)
+            if id_ != null_block_id else block_pool.null_block for id_ in ids
+        ]
+
+    def assert_block_id(block_table, ids):
+        for block, id_ in zip(block_table, ids):
+            if id_ == null_block_id:
+                assert block == block_pool.null_block
+            else:
+                assert block.block_id == id_
+
+    original_block_ids = [
+        1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010
+    ]
+    block_table = id_to_block_table(original_block_ids)
+    removed = manager.remove_skipped_blocks(block_table, 0)
+    assert_block_id(removed, [])
+    assert_block_id(block_table, original_block_ids)
+
+    # 4 tokens are computed. Only token 0 is out of the sliding window. As
+    # block 1000 also contains token 1 that is in the sliding window, block 1000
+    # cannot be removed.
+    removed = manager.remove_skipped_blocks(block_table, 4)
+    assert_block_id(removed, [])
+    assert_block_id(block_table, original_block_ids)
+
+    # 5 tokens are computed. Token 0 & 1 are out of the sliding window.
+    # Block 1000 can be removed.
+    removed = manager.remove_skipped_blocks(block_table, 5)
+    assert_block_id(removed, [original_block_ids[0]])
+    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
+
+    # 6 tokens are computed. Token 0-2 are out of the sliding window.
+    # Cannot remove new block as the block 1001 is still used by token 3.
+    removed = manager.remove_skipped_blocks(block_table, 6)
+    assert_block_id(removed, [])
+    assert_block_id(block_table, [null_block_id] + original_block_ids[1:])
+
+    # 7 tokens are computed. Token 0-3 are out of the sliding window.
+    # Block 1001 can be removed and block 1000 is already removed.
+    removed = manager.remove_skipped_blocks(block_table, 7)
+    assert_block_id(removed, [original_block_ids[1]])
+    assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:])
+
+    # 11 tokens are computed. Token 0-7 are out of the sliding window.
+    # Block 1002 & 1003 can be removed now. Block 1003 represents a longer
+    # sequence, and is expected to be evicted earlier than 1002, so the order
+    # of removed blocks should be [1003, 1002].
+    removed = manager.remove_skipped_blocks(block_table, 11)
+    assert_block_id(removed, [original_block_ids[3], original_block_ids[2]])
+    assert_block_id(block_table, [null_block_id] * 4 + original_block_ids[4:])
--- a/tests/v1/e2e/test_correctness_sliding_window.py
+++ b/tests/v1/e2e/test_correctness_sliding_window.py
+# SPDX-License-Identifier: Apache-2.0
+from dataclasses import dataclass
+
+import pytest
+
+from vllm import LLM, SamplingParams
+
+from ...core.block.e2e.test_correctness_sliding_window import (check_answers,
+                                                               prep_prompts)
+
+
+@dataclass
+class TestConfig:
+    sliding_window: int
+    ln_range: tuple[int, int]
+
+
+model_config = {
+    "bigcode/starcoder2-3b": TestConfig(4096, (800, 1100)),
+    "google/gemma-2-2b-it": TestConfig(4096, (400, 800)),
+}
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "bigcode/starcoder2-3b",  # sliding window only
+        "google/gemma-2-2b-it",  # sliding window + full attention
+    ])
+@pytest.mark.parametrize("batch_size", [5])
+@pytest.mark.parametrize("seed", [1])
+def test_sliding_window_retrival(monkeypatch, model, batch_size, seed):
+    """
+    The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then
+    asks for value of one of them (which is outside the sliding window).
+    If we tell it upfront which we are going to be looking for, then
+    it answers correctly (mostly).
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        test_config = model_config[model]
+
+        llm = LLM(model=model)
+        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+
+        prompts, answer, indices = prep_prompts(batch_size,
+                                                ln_range=test_config.ln_range)
+
+        check_length(prompts, llm, test_config.sliding_window)
+
+        # Fresh generation
+        responses = llm.generate(prompts, sampling_params)
+        check_answers(indices,
+                      answer,
+                      [response.outputs[0].text for response in responses],
+                      accept_rate=1.0)
+
+        # Re-generate with the same prompts to test prefix caching
+        responses = llm.generate(prompts, sampling_params)
+        check_answers(indices,
+                      answer,
+                      [response.outputs[0].text for response in responses],
+                      accept_rate=1.0)
+
+
+def check_length(prompts: list[str], llm: LLM, sliding_window: int):
+    """
+    Check if the prompt length is valid, i.e., longer than the sliding window 
+    size and shorter than the model's max length.
+
+    Args:
+        prompts: list of prompts
+        llm: LLM object
+        sliding_window: Sliding window size
+    """
+    tokenizer = llm.get_tokenizer()
+    max_model_len = llm.llm_engine.model_config.max_model_len
+    assert any(
+        len(tokenizer.encode(prompt)) > sliding_window
+        for prompt in prompts), "Prompt is too short for test"
+    assert all(
+        len(tokenizer.encode(prompt)) <= max_model_len
+        for prompt in prompts), "Prompt is too long for test"
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
 # SPDX-License-Identifier: Apache-2.0

 import os
+from argparse import ArgumentError
+
 import pytest

 from vllm import envs
@@ -34,6 +36,24 @@ def test_prefix_caching_from_cli():
    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
    assert vllm_config.cache_config.enable_prefix_caching

+    # default hash algorithm is "builtin"
+    assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin"
+
+    # set hash algorithm to sha256
+    args = parser.parse_args(["--prefix-caching-hash-algo", "sha256"])
+    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
+    assert vllm_config.cache_config.prefix_caching_hash_algo == "sha256"
+
+    # set hash algorithm to builtin
+    args = parser.parse_args(["--prefix-caching-hash-algo", "builtin"])
+    vllm_config = EngineArgs.from_cli_args(args=args).create_engine_config()
+    assert vllm_config.cache_config.prefix_caching_hash_algo == "builtin"
+
+    # an invalid hash algorithm raises an error
+    parser.exit_on_error = False
+    with pytest.raises(ArgumentError):
+        args = parser.parse_args(["--prefix-caching-hash-algo", "invalid"])
+

 def test_defaults_with_usage_context():
    engine_args = EngineArgs(model=os.path.join(models_path_prefix, "facebook/opt-125m"))

--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -233,8 +233,10 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
    Test that the engine can handle multiple concurrent batches.
    """

-    def make_request_with_max_tokens(max_tokens: int) -> EngineCoreRequest:
+    def make_request_with_max_tokens(req_id: int,
+                                     max_tokens: int) -> EngineCoreRequest:
        request = make_request()
+        request.request_id = req_id
        request.sampling_params.max_tokens = max_tokens
        return request

@@ -281,6 +283,8 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
            # Avoid all requests being scheduled once.
            enable_prefix_caching=False,
            max_num_batched_tokens=10,
+            # Reduce startup time.
+            enforce_eager=True,
        )
        vllm_config = engine_args.create_engine_config()
        engine_core = EngineCore(vllm_config=vllm_config,
@@ -288,13 +292,13 @@ def test_engine_core_concurrent_batches(monkeypatch: pytest.MonkeyPatch):
                                 executor_class=DummyExecutor)
        assert engine_core.batch_queue is not None

-        # Add two requests in a row.
-        req = make_request_with_max_tokens(5)
-        engine_core.add_request(req)
-        req = make_request_with_max_tokens(5)
-        engine_core.add_request(req)
+        # Add two requests in a row. Each request have 12 prompt tokens.
+        req0 = make_request_with_max_tokens(0, 5)
+        engine_core.add_request(req0)
+        req1 = make_request_with_max_tokens(1, 5)
+        engine_core.add_request(req1)

-        # First saturate the batch queue.
+        # Schedule Batch 1: (10, req0)
        assert engine_core.step_with_batch_queue() is None
        assert engine_core.batch_queue.qsize() == 1
        assert engine_core.step_with_batch_queue() is None

--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -169,11 +169,11 @@ def test_engine_core_client(monkeypatch: pytest.MonkeyPatch,

            core_client: SyncMPClient = client

-            result = core_client._call_utility("echo", "testarg")
+            result = core_client.call_utility("echo", "testarg")
            assert result == "testarg"

            with pytest.raises(Exception) as e_info:
-                core_client._call_utility("echo", None, "help!")
+                core_client.call_utility("echo", None, "help!")

            assert str(e_info.value) == "Call to echo method failed: help!"

@@ -240,10 +240,10 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):

        core_client: AsyncMPClient = client

-        result = await core_client._call_utility_async("echo", "testarg")
+        result = await core_client.call_utility_async("echo", "testarg")
        assert result == "testarg"

        with pytest.raises(Exception) as e_info:
-            await core_client._call_utility_async("echo", None, "help!")
+            await core_client.call_utility_async("echo", None, "help!")

        assert str(e_info.value) == "Call to echo method failed: help!"
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -4,78 +4,76 @@ from __future__ import annotations

 import json
 import re
+from enum import Enum
 from typing import Any

 import jsonschema
 import pytest
+from pydantic import BaseModel

 from vllm.entrypoints.llm import LLM
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams

-GUIDED_DECODING_BACKENDS_V1 = ["xgrammar", "guidance"]
-MODELS_TO_TEST = [
-    "Qwen/Qwen2.5-1.5B-Instruct", "mistralai/Ministral-8B-Instruct-2410"
+PARAMS_MODELS_BACKENDS_TOKENIZER_MODE = [
+    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
+     "auto"),
+    ("mistralai/Ministral-8B-Instruct-2410", "guidance:disable-any-whitespace",
+     "auto"),
+    ("mistralai/Ministral-8B-Instruct-2410", "xgrammar:disable-any-whitespace",
+     "mistral"),
+    ("Qwen/Qwen2.5-1.5B-Instruct", "xgrammar:disable-any-whitespace", "auto"),
+    #FIXME: This test is flaky on CI thus disabled
+    #("Qwen/Qwen2.5-1.5B-Instruct", "guidance:disable-any-whitespace", "auto"),
 ]

+PARAMS_MODELS_TOKENIZER_MODE = [
+    ("mistralai/Ministral-8B-Instruct-2410", "auto"),
+    ("Qwen/Qwen2.5-1.5B-Instruct", "auto"),
+]

-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1)
-@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
-def test_guided_json_completion(
-    monkeypatch: pytest.MonkeyPatch,
-    sample_json_schema: dict[str, Any],
-    guided_decoding_backend: str,
-    model_name: str,
-):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name,
-              max_model_len=1024,
-              guided_decoding_backend=guided_decoding_backend)
-    sampling_params = SamplingParams(
-        temperature=1.0,
-        max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
-    outputs = llm.generate(prompts=[
-        f"Give an example JSON for an employee profile "
-        f"that fits this schema: {sample_json_schema}"
-    ] * 2,
-                           sampling_params=sampling_params,
-                           use_tqdm=True)

-    assert outputs is not None
+class CarType(str, Enum):
+    sedan = "sedan"
+    suv = "SUV"
+    truck = "Truck"
+    coupe = "Coupe"

-    for output in outputs:
-        assert output is not None
-        assert isinstance(output, RequestOutput)
-        prompt = output.prompt

-        generated_text = output.outputs[0].text
-        assert generated_text is not None
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-        output_json = json.loads(generated_text)
-        jsonschema.validate(instance=output_json, schema=sample_json_schema)
+class CarDescription(BaseModel):
+    brand: str
+    model: str
+    car_type: CarType


 @pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1)
-@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
-def test_guided_json_completion_disable_any_whitespace(
+@pytest.mark.parametrize("model_name, guided_decoding_backend, tokenizer_mode",
+                         PARAMS_MODELS_BACKENDS_TOKENIZER_MODE)
+def test_structured_output(
    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
+    unsupported_json_schema: dict[str, Any],
+    sample_sql_ebnf: str,
+    sample_sql_lark: str,
+    sample_regex: str,
+    sample_guided_choice: str,
    guided_decoding_backend: str,
+    tokenizer_mode: str,
    model_name: str,
 ):
-    if guided_decoding_backend != "xgrammar":
-        pytest.skip("disable-any-whitespace is only supported for xgrammar.")
-    guided_decoding_backend = 'xgrammar:disable-any-whitespace'
-
    monkeypatch.setenv("VLLM_USE_V1", "1")
+
+    # Use a single LLM instance for several scenarios to
+    # speed up the test suite.
    llm = LLM(model=model_name,
+              enforce_eager=True,
              max_model_len=1024,
-              guided_decoding_backend=guided_decoding_backend)
+              guided_decoding_backend=guided_decoding_backend,
+              tokenizer_mode=tokenizer_mode)
+
+    #
+    # Test 1: Generate JSON output based on a provided schema
+    #
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
@@ -96,25 +94,15 @@ def test_guided_json_completion_disable_any_whitespace(

        generated_text = output.outputs[0].text
        assert generated_text is not None
-        assert "\n" not in generated_text
+        if 'disable-any-whitespace' in guided_decoding_backend:
+            assert "\n" not in generated_text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json, schema=sample_json_schema)

-
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1)
-@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
-def test_guided_json_object(
-    monkeypatch: pytest.MonkeyPatch,
-    guided_decoding_backend: str,
-    model_name: str,
-):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name,
-              max_model_len=1024,
-              guided_decoding_backend=guided_decoding_backend)
+    #
+    # Test 2: Generate JSON object without a schema
+    #
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=100,
@@ -137,38 +125,18 @@ def test_guided_json_object(
            print(generated_text)
            assert generated_text is not None

-            # Parse to verify it is valid JSON
+            # Parse to verify it is a valid JSON object
            parsed_json = json.loads(generated_text)
-            allowed_types: tuple[type, ...] = (dict, )
-            if guided_decoding_backend == "xgrammar":
-                # TODO - we are currently too permissive with xgrammar and
-                # allow # any valid json (typically comes back as a list or
-                # object).  We can fix this by specifying a jsonschema of
-                # {"type": "object"}, # but we need this fix in a release
-                # first: https://github.com/mlc-ai/xgrammar/pull/264
-                allowed_types = (dict, list)
-            assert isinstance(parsed_json, allowed_types)
-
+            assert isinstance(parsed_json, dict)

-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1 + ["auto"])
-@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
-def test_guided_json_unsupported_schema(
-    monkeypatch: pytest.MonkeyPatch,
-    unsupported_json_schema: dict[str, Any],
-    guided_decoding_backend: str,
-    model_name: str,
-):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name,
-              max_model_len=1024,
-              guided_decoding_backend=guided_decoding_backend)
+    #
+    # Test 3: test a jsonschema incompatible with xgrammar
+    #
    sampling_params = SamplingParams(
        temperature=1.0,
        max_tokens=1000,
        guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
-    if guided_decoding_backend == "xgrammar":
+    if guided_decoding_backend.startswith("xgrammar"):
        with pytest.raises(ValueError,
                           match="The provided JSON schema contains features "
                           "not supported by xgrammar."):
@@ -179,8 +147,6 @@ def test_guided_json_unsupported_schema(
                         sampling_params=sampling_params,
                         use_tqdm=True)
    else:
-        # This should work for both "guidance" and "auto".
-
        outputs = llm.generate(
            prompts=("Give an example JSON object for a grade "
                     "that fits this schema: "
@@ -199,21 +165,9 @@ def test_guided_json_unsupported_schema(
            parsed_json = json.loads(generated_text)
            assert isinstance(parsed_json, dict)

-
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1)
-@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
-def test_guided_grammar_ebnf(
-    monkeypatch: pytest.MonkeyPatch,
-    sample_sql_ebnf: str,
-    guided_decoding_backend: str,
-    model_name: str,
-):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name,
-              max_model_len=1024,
-              guided_decoding_backend=guided_decoding_backend)
+    #
+    # Test 4: Generate SQL statement using EBNF grammar
+    #
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
@@ -243,21 +197,9 @@ def test_guided_grammar_ebnf(

        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

-
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1)
-@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
-def test_guided_grammar_lark(
-    monkeypatch: pytest.MonkeyPatch,
-    sample_sql_lark: str,
-    guided_decoding_backend: str,
-    model_name: str,
-):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name,
-              max_model_len=1024,
-              guided_decoding_backend=guided_decoding_backend)
+    #
+    # Test 5: Generate SQL statement using Lark grammar
+    #
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
@@ -292,20 +234,9 @@ def test_guided_grammar_lark(

        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

-
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1)
-@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
-def test_guided_grammar_ebnf_invalid(
-    monkeypatch: pytest.MonkeyPatch,
-    guided_decoding_backend: str,
-    model_name: str,
-):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name,
-              max_model_len=1024,
-              guided_decoding_backend=guided_decoding_backend)
+    #
+    # Test 6: Test invalid grammar input
+    #
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
@@ -319,21 +250,9 @@ def test_guided_grammar_ebnf_invalid(
            use_tqdm=True,
        )

-
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1)
-@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
-def test_guided_regex(
-    monkeypatch: pytest.MonkeyPatch,
-    sample_regex: str,
-    guided_decoding_backend: str,
-    model_name: str,
-):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name,
-              max_model_len=1024,
-              guided_decoding_backend=guided_decoding_backend)
+    #
+    # Test 7: Generate text based on a regex pattern
+    #
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
@@ -357,21 +276,9 @@ def test_guided_regex(
        assert re.fullmatch(sample_regex, generated_text) is not None
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

-
-@pytest.mark.skip_global_cleanup
-@pytest.mark.parametrize("guided_decoding_backend",
-                         GUIDED_DECODING_BACKENDS_V1)
-@pytest.mark.parametrize("model_name", MODELS_TO_TEST)
-def test_guided_choice_completion(
-    monkeypatch: pytest.MonkeyPatch,
-    sample_guided_choice: str,
-    guided_decoding_backend: str,
-    model_name: str,
-):
-    monkeypatch.setenv("VLLM_USE_V1", "1")
-    llm = LLM(model=model_name,
-              max_model_len=1024,
-              guided_decoding_backend=guided_decoding_backend)
+    #
+    # Test 8: Generate text based on a choices
+    #
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
@@ -390,3 +297,71 @@ def test_guided_choice_completion(
        assert generated_text is not None
        assert generated_text in sample_guided_choice
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    #
+    # Test 9: Generate structured output using a Pydantic model with an enum
+    #
+    json_schema = CarDescription.model_json_schema()
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(json=json_schema))
+    outputs = llm.generate(
+        prompts="Generate a JSON with the brand, model and car_type of"
+        "the most iconic car from the 90's",
+        sampling_params=sampling_params,
+        use_tqdm=True)
+
+    assert outputs is not None
+
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        prompt = output.prompt
+
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        output_json = json.loads(generated_text)
+        jsonschema.validate(instance=output_json, schema=json_schema)
+
+
+@pytest.mark.skip_global_cleanup
+@pytest.mark.parametrize("model_name, tokenizer_mode",
+                         PARAMS_MODELS_TOKENIZER_MODE)
+def test_structured_output_auto_mode(
+    monkeypatch: pytest.MonkeyPatch,
+    unsupported_json_schema: dict[str, Any],
+    model_name: str,
+    tokenizer_mode: str,
+):
+    monkeypatch.setenv("VLLM_USE_V1", "1")
+
+    llm = LLM(model=model_name,
+              max_model_len=1024,
+              guided_decoding_backend="auto",
+              tokenizer_mode=tokenizer_mode)
+
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(json=unsupported_json_schema))
+
+    # This would fail with the default of "xgrammar", but in "auto"
+    # we will handle fallback automatically.
+    outputs = llm.generate(prompts=("Give an example JSON object for a grade "
+                                    "that fits this schema: "
+                                    f"{unsupported_json_schema}"),
+                           sampling_params=sampling_params,
+                           use_tqdm=True)
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        print(generated_text)
+
+        # Parse to verify it is valid JSON
+        parsed_json = json.loads(generated_text)
+        assert isinstance(parsed_json, dict)
--- a/tests/v1/sample/test_topk_topp_sampler.py
+++ b/tests/v1/sample/test_topk_topp_sampler.py
+# SPDX-License-Identifier: Apache-2.0
+import torch
+from torch import Generator
+
+from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
+
+DEVICE = "cuda"
+
+BATCH_SIZE = 1024
+VOCAB_SIZE = 128 * 1024
+
+
+def test_topk_impl_equivalance():
+
+    with torch.device(DEVICE):
+        generator = Generator(device=DEVICE).manual_seed(33)
+
+        logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
+
+        # Random top-k values between 1 and 9.
+        k = torch.randint(1, 10, (BATCH_SIZE, ), generator=generator)
+
+        # Set k=vocab_size for ~50% of requests in the batch (top-k disabled).
+        k.masked_fill_(
+            torch.randint(0,
+                          2, (BATCH_SIZE, ),
+                          generator=generator,
+                          dtype=bool), VOCAB_SIZE)
+
+        # Top-k only implementation
+        result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None)
+
+        # Top-p + top-k
+        no_op_top_p = torch.tensor([1.0])
+        result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p)
+
+        assert torch.allclose(result1, result2)
--- a/tests/v1/structured_output/test_utils.py
+++ b/tests/v1/structured_output/test_utils.py
@@ -13,10 +13,6 @@ def unsupported_string_schemas():
            "type": "string",
            "pattern": "^[a-zA-Z]+$"
        },
-        {
-            "type": "string",
-            "enum": ["active", "inactive", "pending"]
-        },
        {
            "type": "string",
            "minLength": 1
@@ -164,6 +160,10 @@ def supported_schema():
                    "type": "number"
                }
            },
+            "car_type": {
+                "type": "string",
+                "enum": ["sedan", "suv", "truck"]
+            },
            "address": {
                "type": "object",
                "properties": {

--- a/tests/v1/test_async_llm_dp.py
+++ b/tests/v1/test_async_llm_dp.py
+# SPDX-License-Identifier: Apache-2.0
+
+import asyncio
+import os
+from contextlib import ExitStack
+from typing import Optional
+
+import pytest
+
+from vllm import SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.inputs import PromptType
+from vllm.platforms import current_platform
+from vllm.sampling_params import RequestOutputKind
+from vllm.v1.engine.async_llm import AsyncLLM
+from vllm.v1.engine.core_client import DPAsyncMPClient
+
+engine_args = AsyncEngineArgs(
+    model="ibm-research/PowerMoE-3b",
+    enforce_eager=True,
+    disable_log_requests=True,
+    tensor_parallel_size=int(os.getenv("TP_SIZE", 1)),
+    data_parallel_size=int(os.getenv("DP_SIZE", 2)),
+)
+
+if not current_platform.supports_v1(engine_args.create_model_config()):
+    pytest.skip(reason="Requires V1-supporting platform.",
+                allow_module_level=True)
+
+
+async def generate(engine: AsyncLLM,
+                   request_id: str,
+                   prompt: PromptType,
+                   output_kind: RequestOutputKind,
+                   max_tokens: int,
+                   prompt_logprobs: Optional[int] = None) -> tuple[int, str]:
+    # Ensure generate doesn't complete too fast for cancellation test.
+    await asyncio.sleep(0.2)
+
+    count = 0
+    sampling_params = SamplingParams(max_tokens=max_tokens,
+                                     ignore_eos=True,
+                                     output_kind=output_kind,
+                                     temperature=0,
+                                     prompt_logprobs=prompt_logprobs)
+    async for out in engine.generate(request_id=request_id,
+                                     prompt=prompt,
+                                     sampling_params=sampling_params):
+
+        num_tokens = len(out.outputs[0].token_ids)
+        if output_kind == RequestOutputKind.DELTA:
+            count += num_tokens
+        else:
+            count = num_tokens
+
+        await asyncio.sleep(0.)
+
+    return count, request_id
+
+
+@pytest.mark.parametrize(
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+@pytest.mark.asyncio
+async def test_load(output_kind: RequestOutputKind):
+
+    with ExitStack() as after:
+
+        prompt = "This is a test of data parallel"
+
+        engine = AsyncLLM.from_engine_args(engine_args)
+        after.callback(engine.shutdown)
+
+        NUM_REQUESTS = 100
+        NUM_EXPECTED_TOKENS = 10
+
+        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
+
+        # Create concurrent requests.
+        tasks = []
+        for request_id in request_ids:
+            tasks.append(
+                asyncio.create_task(
+                    generate(engine, request_id, prompt, output_kind,
+                             NUM_EXPECTED_TOKENS)))
+
+        # Confirm that we got all the EXPECTED tokens from the requests.
+        done, pending = await asyncio.wait(tasks,
+                                           return_when=asyncio.FIRST_EXCEPTION)
+        for task in pending:
+            task.cancel()
+        for task in done:
+            num_generated_tokens, request_id = await task
+            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+                f"{request_id} generated {num_generated_tokens} but "
+                f"expected {NUM_EXPECTED_TOKENS}")
+
+        assert not engine.output_processor.has_unfinished_requests()
+
+        # testing internals here which may break
+        core_client: DPAsyncMPClient = engine.engine_core
+        # the engines only synchronize stopping every N steps so
+        # allow a small amount of time here.
+        for _ in range(10):
+            if core_client.num_engines_running == 0:
+                break
+            await asyncio.sleep(0.5)
+
+        assert core_client.num_engines_running == 0
+        assert not core_client.reqs_in_flight
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -49,7 +49,9 @@ def test_unsupported_configs(monkeypatch):
        with pytest.raises(NotImplementedError):
            AsyncEngineArgs(
                model=MODEL,
-                speculative_model=MODEL,
+                speculative_config={
+                    "model": MODEL,
+                },
            ).create_engine_config()

        with pytest.raises(NotImplementedError):
@@ -102,14 +104,6 @@ def test_enable_by_default_fallback(monkeypatch):
        assert envs.VLLM_USE_V1
        m.delenv("VLLM_USE_V1")

-        # Should fall back to V0 for experimental config.
-        _ = AsyncEngineArgs(
-            model=MODEL,
-            enable_lora=True,
-        ).create_engine_config()
-        assert not envs.VLLM_USE_V1
-        m.delenv("VLLM_USE_V1")
-
        # Should fall back to V0 for supported model.
        _ = AsyncEngineArgs(
            model=UNSUPPORTED_MODELS_V1[0]).create_engine_config()
@@ -123,7 +117,7 @@ def test_v1_llm_by_default(monkeypatch):
            m.delenv("VLLM_USE_V1")

        # Should default to V1 for supported config.
-        model = LLM(MODEL, enforce_eager=True)
+        model = LLM(MODEL, enforce_eager=True, enable_lora=True)
        print(model.generate("Hello my name is"))
        assert hasattr(model.llm_engine, "engine_core")
        m.delenv("VLLM_USE_V1")

--- a/tests/v1/tpu/test_basic.py
+++ b/tests/v1/tpu/test_basic.py
@@ -31,14 +31,12 @@ TENSOR_PARALLEL_SIZES = [1]
                    reason="This is a basic test for TPU only")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("tensor_parallel_size", TENSOR_PARALLEL_SIZES)
-def test_models(
+def test_basic(
    vllm_runner: type[VllmRunner],
    monkeypatch: pytest.MonkeyPatch,
    model: str,
    max_tokens: int,
-    enforce_eager: bool,
    tensor_parallel_size: int,
 ) -> None:
    prompt = "The next numbers of the sequence " + ", ".join(
@@ -50,12 +48,15 @@ def test_models(

        with vllm_runner(
                model,
-                max_model_len=8192,
-                enforce_eager=enforce_eager,
+                # Note: max_num_batched_tokens == 1024 is needed here to
+                # actually test chunked prompt
+                max_num_batched_tokens=1024,
+                max_model_len=8196,
                gpu_memory_utilization=0.7,
                max_num_seqs=16,
                tensor_parallel_size=tensor_parallel_size) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                      max_tokens)
        output = vllm_outputs[0][1]
-        assert "1024" in output
+
+        assert "1024" in output or "0, 1" in output
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
+# SPDX-License-Identifier: Apache-2.0
+from unittest.mock import ANY, patch
+
+import torch
+
+from vllm.attention.backends.abstract import AttentionType
+from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
+                                               NUM_QUERIES_PER_BLOCK,
+                                               PallasAttentionBackendImpl,
+                                               PallasMetadata)
+
+
+def test_ragged_paged_attention():
+    # We verify that the kernel inputs such as sliding_window, etc. are passed
+    # in from the model correctly.
+    # The correctness of the paged attention kernel is tested in the kernel
+    # library.
+    num_heads = 4
+    head_size = 128
+    scale = 1.0
+    num_kv_heads = 4
+    sliding_window = 128
+    logits_soft_cap = 50.0
+    attn_impl = PallasAttentionBackendImpl(
+        num_heads=num_heads,
+        head_size=head_size,
+        scale=scale,
+        num_kv_heads=num_kv_heads,
+        alibi_slopes=None,
+        sliding_window=sliding_window,
+        kv_cache_dtype="auto",
+        logits_soft_cap=logits_soft_cap,
+        attn_type=AttentionType.DECODER,
+    )
+    mock_vmem_limit_bytes = 1024
+    attn_impl.vmem_limit_bytes = mock_vmem_limit_bytes
+
+    class FakeAttentionLayer:
+        _k_scale_float: float
+        _v_scale_float: float
+
+    layer = FakeAttentionLayer()
+    layer._k_scale_float = 1.0
+    layer._v_scale_float = 1.0
+
+    num_tokens = 16
+    num_blocks = 1024
+    block_size = 16
+    query = torch.zeros(num_tokens, num_heads * head_size)
+    key = torch.zeros(num_tokens, num_kv_heads * head_size)
+    value = torch.zeros(num_tokens, num_kv_heads * head_size)
+    kv_cache = torch.zeros(num_blocks, block_size, num_kv_heads * 2, head_size)
+    slot_mapping = torch.zeros(num_tokens, dtype=torch.int64)
+    max_num_reqs = 8
+    max_num_blocks_per_req = 8
+    block_tables = torch.zeros((max_num_reqs, max_num_blocks_per_req),
+                               dtype=torch.int32)
+    context_lens = torch.ones((max_num_reqs, ), dtype=torch.int32)
+    query_lens = [1] * max_num_reqs
+    query_start_loc = torch.cumsum(torch.tensor([0] + query_lens,
+                                                dtype=torch.int32),
+                                   dim=0,
+                                   dtype=torch.int32)
+    num_seqs = torch.tensor([max_num_reqs], dtype=torch.int32)
+    attn_metadata = PallasMetadata(
+        slot_mapping=slot_mapping,
+        block_tables=block_tables,
+        context_lens=context_lens,
+        query_start_loc=query_start_loc,
+        num_seqs=num_seqs,
+    )
+
+    with patch("torch.ops.xla.ragged_paged_attention"
+               ) as mock_ragged_paged_attention:
+        attn_impl.forward(
+            layer=layer,
+            query=query,
+            key=key,
+            value=value,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+        )
+
+        mock_ragged_paged_attention.assert_called_once_with(
+            ANY,  # query
+            ANY,  # kv_cache
+            ANY,  # context_lens
+            ANY,  # block_tables
+            ANY,  # query_start_loc
+            ANY,  # num_seqs
+            num_kv_pages_per_block=NUM_KV_PAGES_PER_BLOCK,
+            num_queries_per_block=NUM_QUERIES_PER_BLOCK,
+            vmem_limit_bytes=mock_vmem_limit_bytes,
+            use_kernel=True,
+            sm_scale=scale,
+            sliding_window=sliding_window,
+            soft_cap=logits_soft_cap,
+        )
--- a/tests/v1/tpu/test_perf.py
+++ b/tests/v1/tpu/test_perf.py
+# SPDX-License-Identifier: Apache-2.0
+"""A basic performance regression test for TPUs
+
+Run `pytest tests/v1/tpu/test_perf.py`.
+"""
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pytest
+
+from vllm.platforms import current_platform
+from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+if TYPE_CHECKING:
+    from tests.conftest import VllmRunner
+
+
+@dataclass
+class TestParams:
+    model: str
+    num_prompts: int
+    prefix_len: int
+    decode_len: int
+    expected_avg_time: float
+    err_tol: float
+
+
+TEST_PARAMS = [
+    # TODO: Cannot run a series of tests because:
+    #   RuntimeError: Bad StatusOr access: UNKNOWN: TPU initialization failed:
+    #   open(/dev/vfio/0): Device or resource busy: Device or resource busy;
+    #   Couldn't open iommu group /dev/vfio/0
+    # => Investigate
+
+    # TestParams(
+    #     model="Qwen/Qwen2.5-1.5B-Instruct",
+    #     num_prompts=1,
+    #     prefix_len=10,
+    #     decode_len=5,
+    #     expected_avg_time=0.03,
+    #     err_tol=0.01,
+    # ),
+    # TestParams(
+    #     model="Qwen/Qwen2.5-1.5B-Instruct",
+    #     num_prompts=10,
+    #     prefix_len=100,
+    #     decode_len=50,
+    #     expected_avg_time=0.234,
+    #     err_tol=0.020,
+    # ),
+    TestParams(
+        model="Qwen/Qwen2.5-1.5B-Instruct",
+        num_prompts=64,
+        prefix_len=500,
+        decode_len=50,
+
+        # (This is the active CI/CD instance)
+        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
+        # tpu: v5lite (vllm CI/CD)
+        expected_avg_time=1.4,
+        err_tol=0.30,
+
+        # (TODO: There is no v6e in CI/CD currently)
+        # commit id: ccb246776d93ef105904a8ec015b3587240a1183
+        # tpu: v6e
+        # expected_avg_time=1.5,
+        # err_tol=0.20,
+    ),
+]
+
+NUM_WARMUPS = 5
+NUM_RUNS = 10
+
+MAX_MODEL_LEN = 1024
+MAX_NUM_SEQS = 32
+GPU_UTIL = 0.9
+
+
+@pytest.mark.skipif(not current_platform.is_tpu(),
+                    reason="This is a basic performance test for TPU only")
+@pytest.mark.parametrize("params", TEST_PARAMS)
+def test_perf(
+    vllm_runner: type[VllmRunner],
+    monkeypatch: pytest.MonkeyPatch,
+    params: TestParams,
+) -> None:
+    tokenizer = get_tokenizer(params.model,
+                              tokenizer_mode="auto",
+                              trust_remote_code=True)
+
+    prompts = []
+    for i in range(params.num_prompts):
+        prefix_token_ids = np.random.randint(0,
+                                             tokenizer.vocab_size,
+                                             size=params.prefix_len).tolist()
+        prompt = tokenizer.decode(prefix_token_ids)
+        prompts.append(prompt)
+
+    print(
+        "-- Running: num_prompts = {} prefix_len = {} decode_len = {}".format(
+            len(prompts), params.prefix_len, params.decode_len))
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        sampling_params = SamplingParams(max_tokens=params.decode_len,
+                                         temperature=1.0,
+                                         min_p=0.0)
+
+        with vllm_runner(params.model,
+                         max_num_batched_tokens=MAX_MODEL_LEN,
+                         max_model_len=MAX_MODEL_LEN,
+                         max_num_seqs=MAX_NUM_SEQS,
+                         gpu_memory_utilization=GPU_UTIL,
+                         enforce_eager=False,
+                         tensor_parallel_size=1) as vllm_model:
+            print("  -- Warmup / Compile")
+            for i in range(NUM_WARMUPS):
+                _ = vllm_model.generate(prompts, sampling_params)
+
+            print("  -- Benchmarking... ")
+            times = []
+            for i in range(NUM_RUNS):
+                start_time = time.time()
+                _ = vllm_model.generate(prompts, sampling_params)
+                times.append(time.time() - start_time)
+
+            avg_time = sum(times) / len(times)
+
+            print("  -- avg_time = {}".format(avg_time))
+            print("  -- expected_avg_time = {} with err_tol = {}".format(
+                params.expected_avg_time, params.err_tol))
+            diff = avg_time - params.expected_avg_time
+            ok = diff < params.err_tol
+            if diff < -params.err_tol:
+                print("  !! WARNING !! Performance has improved by {}, "
+                      "it may be necessary to fine-tune the "
+                      "expected_avg_time = {}".format(
+                          -diff, params.expected_avg_time))
+
+            assert ok, " !! ERROR !! Regression detected"