Merge tag 'v0.8.4' into v0.8.4-ori

9c4ecf15 · zhuwenwen · bfc2d6f7 · dc1b4a6f · 9c4ecf15 · 9c4ecf15
Commit 9c4ecf15 authored Apr 14, 2025 by zhuwenwen
20 changed files
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -47,12 +47,10 @@ def test_filter_subtensors():
 @pytest.fixture(scope="module")
 def llama_3p2_1b_files():
-    with TemporaryDirectory() as cache_dir:
+    input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
-        input_dir = snapshot_download("meta-llama/Llama-3.2-1B-Instruct",
+                                  ignore_patterns=["*.bin*", "original/*"])
-                                      cache_dir=cache_dir,
-                                      ignore_patterns=["*.bin*", "original/*"])
-        yield input_dir
+    yield input_dir
 def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
@@ -64,9 +62,9 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
    # Copy metadata files to output directory
    for file in os.listdir(input_dir):
-        if not any(
+        if os.path.isdir(os.path.join(input_dir, file)):
-                file.endswith(ext) and not os.path.isdir(file)
+            continue
-                for ext in weights_patterns):
+        if not any(file.endswith(ext) for ext in weights_patterns):
            shutil.copy(f"{input_dir}/{file}", output_dir)
@@ -81,7 +79,8 @@ def _run_generate(input_dir, queue: mp.Queue, **kwargs):
 @pytest.mark.parametrize("enable_lora", [False, True])
 @pytest.mark.parametrize("tp_size", [1, 2])
 def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
-                              llama_3p2_1b_files):
+                              llama_3p2_1b_files,
+                              monkeypatch: pytest.MonkeyPatch):
    if num_gpus_available < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
@@ -89,6 +88,8 @@ def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
    gpu_memory_utilization = 0.8
    input_dir = llama_3p2_1b_files
    ctx = mp.get_context("spawn")
+    # The interface changed in the V1 engine; running this test under V1 hangs.
+    monkeypatch.setenv("VLLM_USE_V1", "0")
    # Run in separate processes for memory & CUDA isolation
    with TemporaryDirectory() as output_dir:

--- a/tests/tool_use/conftest.py
+++ b/tests/tool_use/conftest.py
@@ -10,10 +10,33 @@ from vllm.platforms import current_platform
 from .utils import ARGS, CONFIGS, ServerConfig
+# select models to test based on command line arguments
+def pytest_addoption(parser):
+    parser.addoption("--models",
+                     nargs="+",
+                     help="Specify one or more models to test")
+    parser.addoption("--extended",
+                     action="store_true",
+                     default=False,
+                     help="invoke extended tests requiring large GPUs")
 # for each server config, download the model and return the config
 @pytest.fixture(scope="session", params=CONFIGS.keys())
 def server_config(request):
-    config = CONFIGS[request.param]
+    extended = request.config.getoption("--extended")
+    models = request.config.getoption("--models")
+    config_keys_to_test = [
+        key for key in CONFIGS if (models is None or key in models) and (
+            extended or not CONFIGS[key].get("extended", False))
+    ]
+    config_key = request.param
+    if config_key not in config_keys_to_test:
+        pytest.skip(f"Skipping config '{config_key}'")
+    config = CONFIGS[config_key]
    if current_platform.is_rocm() and not config.get("supports_rocm", True):
        pytest.skip("The {} model can't be tested on the ROCm platform".format(

--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -16,6 +16,7 @@ class ServerConfig(TypedDict, total=False):
    system_prompt: Optional[str]
    supports_parallel: Optional[bool]
    supports_rocm: Optional[bool]
+    extended: Optional[bool]  # tests do not run in CI automatically
 def patch_system_prompt(messages: list[dict[str, Any]],
@@ -82,6 +83,21 @@ CONFIGS: dict[str, ServerConfig] = {
        "supports_parallel":
        False,
    },
+    "llama4": {
+        "model":
+        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+        "arguments": [
+            "--enforce-eager", "--no-enable-prefix-caching",
+            "--tool-call-parser", "pythonic", "--chat-template",
+            str(VLLM_PATH /
+                "examples/tool_chat_template_llama4_pythonic.jinja"), "-tp",
+            "4"
+        ],
+        "supports_parallel":
+        False,
+        "extended":
+        True
+    },
    "mistral": {
        "model":
        "mistralai/Mistral-7B-Instruct-v0.3",

--- a/tests/tpu/test_compilation.py
+++ b/tests/tpu/test_compilation.py
@@ -44,7 +44,7 @@ def test_tpu_compilation():
            assert generated_text.startswith(answer)
    compiled_codes = sorted(
-        glob.glob(os.path.join(temp_dir, "__transformed_code*.py")))
+        glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py")))
    for i, compiled_code in enumerate(compiled_codes):
        print("{} file: {}".format(i + 1, compiled_code))
@@ -52,15 +52,21 @@ def test_tpu_compilation():
    # We should only trigger Dynamo compilation 2 times:
    # 1. Forward pass without kv_caches
    # 2. Forward pass with kv_caches
-    # Check we have 4 compiled codes
+    # Check we have 2 compiled codes
    assert len(compiled_codes) == 2
    kv_cache_prefix = "kv_cache"
    attn_prefix = "ragged_paged_attention"
+    def extract_compiled_index(s):
+        parts = s.replace(".", "_").split("_")
+        numbers = [int(part) for part in parts if part.isdigit()]
+        return numbers[0]
    # Check all the compilations are as expected
-    compiled_fns = sorted(
+    compiled_fns = sorted(glob.glob(
-        glob.glob(os.path.join(temp_dir, "__compiled_fn*Captured*.py")))
+        os.path.join(temp_dir, "__compiled_fn*Captured*.py")),
+                          key=lambda s: extract_compiled_index(s))
    for i, compiled_fn in enumerate(compiled_fns):
        print("{} file: {}".format(i + 1, compiled_fn))

--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
@@ -3,14 +3,17 @@
 import pytest
 import torch
-from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
+from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams
-from vllm.utils import sha256
+from vllm.utils import GiB_bytes, sha256
+from vllm.v1.core.kv_cache_manager import KVCacheManager
 # disable yapf here as it formats differently than isort such that both fail
 # yapf: disable
 from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType,
                                         FreeKVCacheBlockQueue, KVCacheBlock,
                                         PrefixCachingMetrics,
+                                         estimate_max_model_len,
                                         generate_block_hash_extra_keys,
                                         hash_block_tokens,
                                         hash_request_tokens,
@@ -46,6 +49,18 @@ def make_request(request_id,
    )
+def new_kv_cache_spec(block_size=16,
+                      num_kv_heads=2,
+                      head_size=64,
+                      dtype=torch.float32,
+                      use_mla=False):
+    return FullAttentionSpec(block_size=block_size,
+                             num_kv_heads=num_kv_heads,
+                             head_size=head_size,
+                             dtype=dtype,
+                             use_mla=use_mla)
 def test_none_hash():
    assert NONE_HASH is not None
    assert isinstance(NONE_HASH, int)
@@ -158,13 +173,10 @@ def test_generate_block_hash_extra_keys():
    request = make_request(
        request_id=0,
        prompt_token_ids=[_ for _ in range(20)],
-        mm_positions=[{
+        mm_positions=[
-            "offset": 0,
+            PlaceholderRange(offset=0, length=5),
-            "length": 5
+            PlaceholderRange(offset=10, length=5),
-        }, {
+        ],
-            "offset": 10,
-            "length": 5
-        }],
        mm_hashes=["hash1", "hash2"],
    )
@@ -222,13 +234,10 @@ def test_hash_request_tokens(hash_fn):
    request = make_request(
        request_id=0,
        prompt_token_ids=[_ for _ in range(6)],
-        mm_positions=[{
+        mm_positions=[
-            "offset": 0,
+            PlaceholderRange(offset=0, length=3),
-            "length": 3
+            PlaceholderRange(offset=3, length=3),
-        }, {
+        ],
-            "offset": 3,
-            "length": 3
-        }],
        mm_hashes=["hash1", "hash2"],
    )
@@ -253,25 +262,19 @@ def test_hash_tokens_different_mm_input(hash_fn):
    request1 = make_request(
        request_id=0,
        prompt_token_ids=[_ for _ in range(6)],
-        mm_positions=[{
+        mm_positions=[
-            "offset": 0,
+            PlaceholderRange(offset=0, length=3),
-            "length": 3
+            PlaceholderRange(offset=3, length=3),
-        }, {
+        ],
-            "offset": 3,
-            "length": 3
-        }],
        mm_hashes=["hash1", "hash2"],
    )
    request2 = make_request(
        request_id=1,
        prompt_token_ids=[_ for _ in range(6)],
-        mm_positions=[{
+        mm_positions=[
-            "offset": 0,
+            PlaceholderRange(offset=0, length=3),
-            "length": 3
+            PlaceholderRange(offset=3, length=3),
-        }, {
+        ],
-            "offset": 3,
-            "length": 3
-        }],
        mm_hashes=["hash3", "hash2"],
    )
    block_size = 3
@@ -337,18 +340,6 @@ def test_metrics():
 def test_unify_kv_cache_configs():
-    def new_kv_cache_spec(block_size=16,
-                          num_kv_heads=2,
-                          head_size=64,
-                          dtype=torch.float32,
-                          use_mla=False):
-        return FullAttentionSpec(block_size=block_size,
-                                 num_kv_heads=num_kv_heads,
-                                 head_size=head_size,
-                                 dtype=dtype,
-                                 use_mla=use_mla)
    same_kv_cache_config = [
        KVCacheConfig(
            num_blocks=10,
@@ -438,3 +429,106 @@ def test_unify_kv_cache_configs():
    ]
    with pytest.raises(AssertionError):
        unify_kv_cache_configs(diff_kv_cache_config)
+@pytest.mark.parametrize(
+    ("model_id", "max_model_len", "want_estimated_max_len"), [
+        ("Qwen/Qwen1.5-7B", 16385, 16384),
+        ("Qwen/Qwen1.5-7B", 16383, 16383),
+    ])
+def test_estimate_max_model_len(model_id, max_model_len,
+                                want_estimated_max_len):
+    # Create a VllmConfig
+    model_config = ModelConfig(
+        model_id,
+        task="generate",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+        max_model_len=max_model_len,
+    )
+    scheduler_config = SchedulerConfig(max_num_batched_tokens=32768)
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        scheduler_config=scheduler_config,
+    )
+    # Create KV cache specs
+    kv_cache_spec = {}
+    for i in range(32):
+        layer_name = f"layer_{i}"
+        kv_cache_spec[layer_name] = FullAttentionSpec(
+            block_size=16,
+            num_kv_heads=32,
+            head_size=128,
+            dtype=torch.float16,
+            use_mla=False,
+        )
+    # Estimate the maximum model length; a model_len of 16384 needs 8 GiB
+    estimated_max_len = estimate_max_model_len(vllm_config, kv_cache_spec,
+                                               8 * GiB_bytes)
+    assert estimated_max_len == want_estimated_max_len
+def test_allocate_with_lookahead():
+    """Verify that lookahead tokens correctly affect block allocation"""
+    block_size = 4
+    config = KVCacheConfig(
+        num_blocks=10,
+        tensors={
+            "layer1": KVCacheTensor(100),
+        },
+        kv_cache_groups=[
+            KVCacheGroupSpec(["layer1"],
+                             new_kv_cache_spec(block_size=block_size)),
+        ],
+    )
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[],
+        mm_positions=None,
+        mm_hashes=None,
+    )
+    # Test case 1: Requires additional lookahead tokens
+    kv_cache_manager = KVCacheManager(kv_cache_config=config,
+                                      max_model_len=100,
+                                      num_preallocate_tokens=0)
+    blocks = kv_cache_manager.allocate_slots(
+        request,
+        num_tokens=3,
+        num_lookahead_tokens=2,  # Total required: 3+2=5 tokens
+    )
+    assert len(blocks) == 2  # ceil(5/4)=2 blocks
+    # Test case 2: With preallocated tokens (num_preallocate_tokens=4)
+    kv_cache_manager = KVCacheManager(kv_cache_config=config,
+                                      max_model_len=100,
+                                      num_preallocate_tokens=4)
+    # num_preallocate_blocks = 4 // 4 - 2 // 4 = 1
+    # required_blocks = ceil((3 + 2) /4) = 2
+    # total_blocks = 1 + 2 = 3
+    blocks = kv_cache_manager.allocate_slots(
+        request,
+        num_tokens=3,
+        num_lookahead_tokens=2,
+    )
+    assert len(blocks) == 3
+    # Test case 3: With preallocated tokens and a larger lookahead
+    # num_preallocate_blocks = 4 // 4 - 4 // 4 = 0
+    # required_blocks = ceil((3 + 4) / 4) = 2
+    # total_blocks = 0 + 2 = 2
+    kv_cache_manager = KVCacheManager(kv_cache_config=config,
+                                      max_model_len=100,
+                                      num_preallocate_tokens=4)
+    blocks = kv_cache_manager.allocate_slots(
+        request,
+        num_tokens=3,
+        num_lookahead_tokens=4,
+    )
+    assert len(blocks) == 2
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
@@ -24,6 +24,7 @@ def create_scheduler(
    max_num_batched_tokens: int = 8192,
    enable_prefix_caching: Optional[bool] = None,
    long_prefill_token_threshold: int = 0,
+    disable_chunked_mm_input: bool = False,
 ) -> Scheduler:
    '''Create scheduler under test.
@@ -43,6 +44,7 @@ def create_scheduler(
        max_num_batched_tokens=max_num_batched_tokens,
        max_model_len=max_num_batched_tokens,
        long_prefill_token_threshold=long_prefill_token_threshold,
+        disable_chunked_mm_input=disable_chunked_mm_input,
    )
    model_config = ModelConfig(
        model=model,
@@ -278,6 +280,58 @@ def test_schedule_partial_requests():
    assert requests[2].request_id not in output.num_scheduled_tokens
+def test_no_mm_input_chunking():
+    # Disable multimodal input chunking.
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        max_num_batched_tokens=1024,
+        disable_chunked_mm_input=True,
+    )
+    mm_positions = [[PlaceholderRange(offset=400, length=800)]]
+    requests = create_requests(num_requests=1,
+                               num_tokens=1200,
+                               mm_positions=mm_positions)
+    for request in requests:
+        scheduler.add_request(request)
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == 1
+    assert len(output.scheduled_cached_reqs) == 0
+    assert len(output.finished_req_ids) == 0
+    # We want to only see the 400 text tokens at the start scheduled
+    assert output.num_scheduled_tokens[requests[0].request_id] == 400
+    req_to_index = {
+        request.request_id: i
+        for i, request in enumerate(requests)
+    }
+    model_runner_output = ModelRunnerOutput(
+        req_ids=[request.request_id for request in requests],
+        req_id_to_index=req_to_index,
+        sampled_token_ids=[[] for _ in range(len(requests))],
+        spec_token_ids=None,
+        logprobs=None,
+        prompt_logprobs_dict={},
+    )
+    scheduler.update_from_output(output, model_runner_output)
+    output = scheduler.schedule()
+    assert len(scheduler.running) == 1
+    assert len(output.scheduled_new_reqs) == 0
+    assert len(output.scheduled_cached_reqs) == 1
+    assert len(output.finished_req_ids) == 0
+    assert output.num_scheduled_tokens[requests[0].request_id] == 800
+    # Test that we fail if we disable chunked mm input and use too small
+    # of a max_num_batched_tokens for the mm input.
+    with pytest.raises(ValueError):
+        _ = create_scheduler(
+            model="llava-hf/llava-1.5-7b-hf",
+            max_num_batched_tokens=100,
+            disable_chunked_mm_input=True,
+        )
 @pytest.mark.parametrize("enable_prefix_caching", [True, False])
 def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool):
    """Test scheduling behavior with concurrent partial requests.

--- a/tests/v1/e2e/test_ngram_spec_decode.py
+++ b/tests/v1/e2e/test_ngram_spec_decode.py
@@ -53,6 +53,11 @@ def model_name():
    return "meta-llama/Meta-Llama-3-8B-Instruct"
+@pytest.fixture
+def eagle_model_name():
+    return "yuhuili/EAGLE-LLaMA3-Instruct-8B"
 def test_ngram_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_prompts: list[list[dict[str, Any]]],
@@ -95,3 +100,47 @@ def test_ngram_correctness(
        # Upon failure, inspect the outputs to check for inaccuracy.
        assert matches > int(0.7 * len(ref_outputs))
        del spec_llm
+def test_eagle_correctness(
+    monkeypatch: pytest.MonkeyPatch,
+    test_prompts: list[list[dict[str, Any]]],
+    sampling_config: SamplingParams,
+    model_name: str,
+    eagle_model_name: str,
+):
+    '''
+    Compare the outputs of an original LLM and a speculative LLM; they
+    should be the same when using eagle speculative decoding.
+    '''
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        ref_llm = LLM(model=model_name, max_model_len=1024)
+        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+        del ref_llm
+        spec_llm = LLM(
+            model=model_name,
+            speculative_config={
+                "method": "eagle",
+                "model": eagle_model_name,
+                "num_speculative_tokens": 3,
+            },
+            max_model_len=1024,
+        )
+        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+        matches = 0
+        misses = 0
+        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                misses += 1
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+        # Heuristic: expect at least 70% of the prompts to match exactly
+        # Upon failure, inspect the outputs to check for inaccuracy.
+        assert matches > int(0.7 * len(ref_outputs))
+        del spec_llm
--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
@@ -64,15 +64,17 @@ def test_defaults_with_usage_context():
        # For H100 and H200, we use larger default values.
        default_llm_tokens = 16384
        default_server_tokens = 8192
+        default_max_num_seqs = 1024
    else:
        default_llm_tokens = 8192
        default_server_tokens = 2048
+        default_max_num_seqs = 256
-    assert vllm_config.scheduler_config.max_num_seqs == 1024
+    assert vllm_config.scheduler_config.max_num_seqs == default_max_num_seqs
    assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens  # noqa: E501
    engine_args = EngineArgs(model="facebook/opt-125m")
    vllm_config = engine_args.create_engine_config(
        UsageContext.OPENAI_API_SERVER)
-    assert vllm_config.scheduler_config.max_num_seqs == 1024
+    assert vllm_config.scheduler_config.max_num_seqs == default_max_num_seqs
    assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens  # noqa: E501
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -3,8 +3,10 @@
 import asyncio
 import time
 import uuid
+from threading import Thread
 from typing import Optional
+import psutil
 import pytest
 from transformers import AutoTokenizer
@@ -245,3 +247,42 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch):
            await core_client.call_utility_async("echo", None, "help!")
        assert str(e_info.value) == "Call to echo method failed: help!"
+@pytest.mark.timeout(10)
+def test_startup_failure(monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m, pytest.raises(Exception) as e_info:
+        m.setenv("VLLM_USE_V1", "1")
+        engine_args = EngineArgs(model=MODEL_NAME)
+        vllm_config = engine_args.create_engine_config(
+            usage_context=UsageContext.UNKNOWN_CONTEXT)
+        executor_class = Executor.get_class(vllm_config)
+        # Start another thread to wait for engine core process to start
+        # and kill it - simulate fatal uncaught process exit.
+        this_proc = psutil.Process()
+        children_before = set(this_proc.children())
+        def kill_first_child():
+            while True:
+                time.sleep(0.5)
+                children = set(this_proc.children()) - children_before
+                if children:
+                    child = children.pop()
+                    print("Killing child core process", child.pid)
+                    child.kill()
+                    break
+        Thread(target=kill_first_child, daemon=True).start()
+        _core_client = EngineCoreClient.make_client(
+            multiprocess_mode=True,
+            asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
+            log_stats=True,
+        )
+    assert "Engine core initialization failed" in str(e_info.value)
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -325,6 +325,45 @@ def test_structured_output(
        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json, schema=json_schema)
+    #
+    # Test 10: Generate structured with minLength and maxLength
+    #
+    min_length = 50
+    max_length = 50
+    json_schema = {
+        "type": "object",
+        "properties": {
+            "description": {
+                "type": "string",
+                "maxLength": max_length,
+                "minLength": min_length
+            }
+        },
+        "required": ["description"]
+    }
+    sampling_params = SamplingParams(
+        temperature=1.0,
+        max_tokens=1000,
+        guided_decoding=GuidedDecodingParams(json=json_schema))
+    outputs = llm.generate(
+        prompts="Generate a description of a frog using 50 characters.",
+        sampling_params=sampling_params,
+        use_tqdm=True)
+    assert outputs is not None
+    for output in outputs:
+        assert output is not None
+        assert isinstance(output, RequestOutput)
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        assert generated_text is not None
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        output_json = json.loads(generated_text)
+        jsonschema.validate(instance=output_json, schema=json_schema)
 @pytest.mark.skip_global_cleanup
 @pytest.mark.parametrize("model_name, tokenizer_mode",

--- a/tests/v1/structured_output/test_utils.py
+++ b/tests/v1/structured_output/test_utils.py
@@ -13,14 +13,6 @@ def unsupported_string_schemas():
            "type": "string",
            "pattern": "^[a-zA-Z]+$"
        },
-        {
-            "type": "string",
-            "minLength": 1
-        },
-        {
-            "type": "string",
-            "maxLength": 100
-        },
        {
            "type": "string",
            "format": "email"
@@ -164,6 +156,14 @@ def supported_schema():
                "type": "string",
                "enum": ["sedan", "suv", "truck"]
            },
+            "short_description": {
+                "type": "string",
+                "maxLength": 50
+            },
+            "long_description": {
+                "type": "string",
+                "minLength": 50
+            },
            "address": {
                "type": "object",
                "properties": {

--- a/tests/v1/test_serial_utils.py
+++ b/tests/v1/test_serial_utils.py
+# SPDX-License-Identifier: Apache-2.0
+from collections import UserDict
+from dataclasses import dataclass
+import numpy as np
+import torch
+from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
+class UnrecognizedType(UserDict):
+    def __init__(self, an_int: int):
+        super().__init__()
+        self.an_int = an_int
+@dataclass
+class MyType:
+    tensor1: torch.Tensor
+    a_string: str
+    list_of_tensors: list[torch.Tensor]
+    numpy_array: np.ndarray
+    unrecognized: UnrecognizedType
+    small_f_contig_tensor: torch.Tensor
+    large_f_contig_tensor: torch.Tensor
+    small_non_contig_tensor: torch.Tensor
+    large_non_contig_tensor: torch.Tensor
+def test_encode_decode():
+    """Test encode/decode loop with zero-copy tensors."""
+    obj = MyType(
+        tensor1=torch.randint(low=0,
+                              high=100,
+                              size=(1024, ),
+                              dtype=torch.int32),
+        a_string="hello",
+        list_of_tensors=[
+            torch.rand((1, 10), dtype=torch.float32),
+            torch.rand((3, 5, 4000), dtype=torch.float64),
+            torch.tensor(1984),  # test scalar too
+        ],
+        numpy_array=np.arange(512),
+        unrecognized=UnrecognizedType(33),
+        small_f_contig_tensor=torch.rand(5, 4).t(),
+        large_f_contig_tensor=torch.rand(1024, 4).t(),
+        small_non_contig_tensor=torch.rand(2, 4)[:, 1:3],
+        large_non_contig_tensor=torch.rand(1024, 512)[:, 10:20],
+    )
+    encoder = MsgpackEncoder()
+    decoder = MsgpackDecoder(MyType)
+    encoded = encoder.encode(obj)
+    # There should be the main buffer + 4 large tensor buffers
+    # + 1 large numpy array. "large" is >= 512 bytes.
+    # The two small tensors are encoded inline.
+    assert len(encoded) == 6
+    decoded: MyType = decoder.decode(encoded)
+    assert_equal(decoded, obj)
+    # Test encode_into case
+    preallocated = bytearray()
+    encoded2 = encoder.encode_into(obj, preallocated)
+    assert len(encoded2) == 6
+    assert encoded2[0] is preallocated
+    decoded2: MyType = decoder.decode(encoded2)
+    assert_equal(decoded2, obj)
+def assert_equal(obj1: MyType, obj2: MyType):
+    assert torch.equal(obj1.tensor1, obj2.tensor1)
+    assert obj1.a_string == obj2.a_string
+    assert all(
+        torch.equal(a, b)
+        for a, b in zip(obj1.list_of_tensors, obj2.list_of_tensors))
+    assert np.array_equal(obj1.numpy_array, obj2.numpy_array)
+    assert obj1.unrecognized.an_int == obj2.unrecognized.an_int
+    assert torch.equal(obj1.small_f_contig_tensor, obj2.small_f_contig_tensor)
+    assert torch.equal(obj1.large_f_contig_tensor, obj2.large_f_contig_tensor)
+    assert torch.equal(obj1.small_non_contig_tensor,
+                       obj2.small_non_contig_tensor)
+    assert torch.equal(obj1.large_non_contig_tensor,
+                       obj2.large_non_contig_tensor)
--- a/tests/v1/tpu/test_pallas.py
+++ b/tests/v1/tpu/test_pallas.py
@@ -4,9 +4,7 @@ from unittest.mock import ANY, patch
 import torch
 from vllm.attention.backends.abstract import AttentionType
-from vllm.v1.attention.backends.pallas import (NUM_KV_PAGES_PER_BLOCK,
+from vllm.v1.attention.backends.pallas import (PallasAttentionBackendImpl,
-                                               NUM_QUERIES_PER_BLOCK,
-                                               PallasAttentionBackendImpl,
                                               PallasMetadata)
@@ -32,8 +30,6 @@ def test_ragged_paged_attention():
        logits_soft_cap=logits_soft_cap,
        attn_type=AttentionType.DECODER,
    )
-    mock_vmem_limit_bytes = 1024
-    attn_impl.vmem_limit_bytes = mock_vmem_limit_bytes
    class FakeAttentionLayer:
        _k_scale_float: float
@@ -88,9 +84,9 @@ def test_ragged_paged_attention():
            ANY,  # block_tables
            ANY,  # query_start_loc
            ANY,  # num_seqs
-            num_kv_pages_per_block=NUM_KV_PAGES_PER_BLOCK,
+            num_kv_pages_per_block=None,
-            num_queries_per_block=NUM_QUERIES_PER_BLOCK,
+            num_queries_per_block=None,
-            vmem_limit_bytes=mock_vmem_limit_bytes,
+            vmem_limit_bytes=None,
            use_kernel=True,
            sm_scale=scale,
            sliding_window=sliding_window,

--- a/tests/v1/tpu/test_sampler.py
+++ b/tests/v1/tpu/test_sampler.py
@@ -34,3 +34,8 @@ def test_sampler_different(model_name: str):
    sampling_params = SamplingParams(temperature=0.1, min_p=0.8, max_tokens=64)
    output2 = llm.generate(prompts, sampling_params)
    assert output[0].outputs[0].text != output2[0].outputs[0].text
+    with pytest.raises(ValueError):
+        # Unsupported `seed` param.
+        sampling_params = SamplingParams(temperature=0.3, seed=42)
+        output2 = llm.generate(prompts, sampling_params)
--- a/tests/v1/tpu/worker/test_tpu_model_runner.py
+++ b/tests/v1/tpu/worker/test_tpu_model_runner.py
@@ -7,9 +7,9 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig
 from vllm.sampling_params import SamplingParams
 from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData,
                                       SchedulerOutput)
-from vllm.v1.worker.tpu_model_runner import (TPUModelRunner,
+from vllm.v1.worker.tpu_model_runner import (
-                                             _get_padded_token_len,
+    TPUModelRunner, _get_padded_num_reqs_with_upper_limit,
-                                             _get_paddings)
+    _get_padded_token_len, _get_req_paddings, _get_token_paddings)
 # Mock torch_xla module since it may not be available in the test environments
 torch_xla_patcher = mock.patch.dict(
@@ -296,16 +296,29 @@ def test_update_states_request_unscheduled(model_runner):
 def test_get_paddings():
+    # Pins the padding list produced for min=16, max=512, gap=64: the expected
+    # values double from 16 up to 64 and then advance in steps of 64 until
+    # they reach 512.
    min_token_size, max_token_size, padding_gap = 16, 512, 64
    expected_paddings = [16, 32, 64, 128, 192, 256, 320, 384, 448, 512]
-    actual_paddings = _get_paddings(min_token_size, max_token_size,
+    actual_paddings = _get_token_paddings(min_token_size, max_token_size,
-                                    padding_gap)
+                                          padding_gap)
    assert actual_paddings == expected_paddings
 def test_get_padded_token_len():
+    # Builds the padding list for min=16, max=512, gap=64 and checks the
+    # padded length chosen for several token counts. The expectations are
+    # consistent with picking the smallest padding that is >= the input:
+    # values below the minimum map to 16, exact matches (16, 512) map to
+    # themselves, and in-between values round up (20 -> 32, 300 -> 320).
    min_token_size, max_token_size, padding_gap = 16, 512, 64
-    paddings = _get_paddings(min_token_size, max_token_size, padding_gap)
+    paddings = _get_token_paddings(min_token_size, max_token_size, padding_gap)
    assert _get_padded_token_len(paddings, 1) == 16
    assert _get_padded_token_len(paddings, 16) == 16
    assert _get_padded_token_len(paddings, 20) == 32
    assert _get_padded_token_len(paddings, 300) == 320
    assert _get_padded_token_len(paddings, 512) == 512
+def test_get_padded_num_reqs_with_upper_limit():
+    # Each case passes a request count and (presumably) an upper limit as the
+    # second argument. The expected results are consistent with rounding the
+    # count up to a power of two with a floor of 8 (3 -> 8, 9 -> 16,
+    # 19 -> 32) and then clamping to the limit (17 with limit 28 -> 28).
+    assert _get_padded_num_reqs_with_upper_limit(3, 32) == 8
+    assert _get_padded_num_reqs_with_upper_limit(9, 32) == 16
+    assert _get_padded_num_reqs_with_upper_limit(19, 32) == 32
+    assert _get_padded_num_reqs_with_upper_limit(17, 28) == 28
+def test_get_req_paddings():
+    # The expected lists start at 8 whether the first argument is 1 or 8,
+    # double from there, and end with the second argument: 32 is reached by
+    # doubling, while 36 (not a power of two) appears as an extra last entry.
+    assert _get_req_paddings(1, 32) == [8, 16, 32]
+    assert _get_req_paddings(8, 32) == [8, 16, 32]
+    assert _get_req_paddings(8, 36) == [8, 16, 32, 36]
--- a/tools/update-dockerfile-graph.sh
+++ b/tools/update-dockerfile-graph.sh
+#!/bin/bash
+# Update Dockerfile dependency graph when docker/Dockerfile changes.
+# This script is designed to be used as a pre-commit hook.
+#
+# Exit status:
+#   0 - docker/Dockerfile is not staged, Docker is missing or not running
+#       (the update is skipped with a warning), or the regenerated graph is
+#       identical to the existing one.
+#   1 - the graph was regenerated and differs from the previous file (the
+#       updated PNG must be staged), or the generated PNG could not be found.
+#   Any other failing command aborts the script because of `set -e`.
+set -euo pipefail
+# Check if docker/Dockerfile is staged for commit.
+# The file list is captured first instead of piping git straight into
+# `grep -q`: with `pipefail`, grep exiting on its first match can kill the
+# producer with SIGPIPE, which makes the pipeline report failure and would
+# silently skip the update. A failing git is treated as "nothing staged".
+STAGED_FILES=$(git diff --cached --name-only) || STAGED_FILES=""
+if grep -qxF "docker/Dockerfile" <<< "$STAGED_FILES"; then
+  echo "docker/Dockerfile has changed, attempting to update dependency graph..."
+  # Check if Docker is installed and running
+  if ! command -v docker &> /dev/null; then
+    echo "Warning: Docker command not found. Skipping Dockerfile graph update."
+    echo "Please install Docker to automatically update the graph: https://docs.docker.com/get-docker/"
+    exit 0
+  fi
+  if ! docker info &> /dev/null; then
+    echo "Warning: Docker daemon is not running. Skipping Dockerfile graph update."
+    echo "Please start Docker to automatically update the graph."
+    exit 0
+  fi
+  # Define the target file path
+  TARGET_GRAPH_FILE="docs/source/assets/contributing/dockerfile-stages-dependency.png"
+  # Ensure target directory exists
+  mkdir -p "$(dirname "$TARGET_GRAPH_FILE")"
+  # Store old image hash in a variable if the file exists
+  OLD_HASH=""
+  if [ -f "$TARGET_GRAPH_FILE" ]; then
+    OLD_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
+  fi
+  # Generate Dockerfile graph
+  echo "Running dockerfilegraph tool..."
+  docker run \
+    --rm \
+    --user "$(id -u):$(id -g)" \
+    --workdir /workspace \
+    --volume "$(pwd)":/workspace \
+    ghcr.io/patrickhoefler/dockerfilegraph:alpine \
+    --output png \
+    --dpi 200 \
+    --max-label-length 50 \
+    --filename docker/Dockerfile \
+    --legend
+  echo "Finding generated PNG file..."
+  # Check for Dockerfile.png in the root directory (most likely location)
+  if [ -f "./Dockerfile.png" ]; then
+    echo "Found generated file at: ./Dockerfile.png"
+    mv "./Dockerfile.png" "$TARGET_GRAPH_FILE"
+  else
+    # Try to find it elsewhere. `-print -quit` makes find stop at the first
+    # match by itself; the previous `| head -1` could abort the whole script
+    # under `set -eo pipefail` if find was killed by SIGPIPE.
+    DOCKERFILE_PNG=$(find . -name "Dockerfile.png" -type f -print -quit)
+    if [ -n "$DOCKERFILE_PNG" ]; then
+      echo "Found generated file at: $DOCKERFILE_PNG"
+      mv "$DOCKERFILE_PNG" "$TARGET_GRAPH_FILE"
+    else
+      echo "Error: Could not find the generated PNG file"
+      # List recently modified PNGs to help diagnose where the tool wrote to
+      find . -name "*.png" -type f -mmin -5
+      exit 1
+    fi
+  fi
+  # Check if the graph has changed
+  NEW_HASH=$(sha256sum "$TARGET_GRAPH_FILE")
+  if [ "$NEW_HASH" != "$OLD_HASH" ]; then
+    # Fail the hook so the regenerated image gets staged with the commit
+    echo "Graph has changed. Please stage the updated file: $TARGET_GRAPH_FILE"
+    exit 1
+  else
+    echo "No changes in graph detected."
+  fi
+fi
+exit 0
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -138,6 +138,17 @@ def mla_decode_kvcache_cpu(
                                        block_tables, seq_lens)
+# merge attn states ops
+def merge_attn_states(output: torch.Tensor,
+                      prefix_output: torch.Tensor,
+                      prefix_lse: torch.Tensor,
+                      suffix_output: torch.Tensor,
+                      suffix_lse: torch.Tensor,
+                      output_lse: Optional[torch.Tensor] = None) -> None:
+    """Invoke the ``_C.merge_attn_states`` custom op.
+
+    Thin wrapper that forwards all tensors to ``torch.ops._C`` and returns
+    nothing. NOTE(review): presumably the op combines the prefix and suffix
+    partial attention outputs using their log-sum-exp (``*_lse``) tensors and
+    writes the result into ``output`` (and ``output_lse`` when given) in
+    place -- confirm against the kernel implementation.
+
+    Args:
+        output: Tensor passed as the op's first argument.
+        prefix_output: Output tensor for the prefix part.
+        prefix_lse: LSE tensor paired with ``prefix_output``.
+        suffix_output: Output tensor for the suffix part.
+        suffix_lse: LSE tensor paired with ``suffix_output``.
+        output_lse: Optional tensor forwarded as-is; ``None`` by default.
+    """
+    # The custom op expects ``output_lse`` as its second positional argument,
+    # whereas this wrapper exposes it last so that it can default to None.
+    torch.ops._C.merge_attn_states(output, output_lse, prefix_output,
+                                   prefix_lse, suffix_output, suffix_lse)
 # pos encoding ops
 def rotary_embedding(
    positions: torch.Tensor,

--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -326,7 +326,7 @@ class FlashAttentionMetadata(AttentionMetadata):
            assert self.use_cuda_graph
        if turn_prefills_into_decodes:
-            # When Mutli-Step is enabled with Chunked-Prefill, prefills and
+            # When Multi-Step is enabled with Chunked-Prefill, prefills and
            # decodes are scheduled together. In the first step, all the
            # prefills turn into decodes. This update reflects that
            # conversion.
@@ -617,10 +617,15 @@ class FlashAttentionImpl(AttentionImpl):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
    ) -> None:
        if blocksparse_params is not None:
            raise ValueError(
                "FlashAttention does not support block-sparse attention.")
+        if use_irope:
+            logger.warning(
+                "Using irope in V0 is not supported yet, it will fall back "
+                "to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)

--- a/vllm/attention/backends/flashinfer.py
+++ b/vllm/attention/backends/flashinfer.py
@@ -38,9 +38,12 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping,
 from vllm.attention.layer import Attention
 from vllm.attention.ops.paged_attn import PagedAttention
 from vllm.config import VllmConfig, get_current_vllm_config
+from vllm.logger import init_logger
 from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype,
                        make_tensor_with_pad)
+logger = init_logger(__name__)
 if TYPE_CHECKING:
    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
                                          ModelInputForGPUWithSamplingMetadata)
@@ -907,7 +910,12 @@ class FlashInferImpl(AttentionImpl):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        logits_soft_cap: Optional[float] = None,
        attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
    ) -> None:
+        if use_irope:
+            logger.warning_once(
+                "Using irope in FlashInfer is not supported yet, it will fall"
+                " back to global attention for long context.")
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)

--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -108,8 +108,13 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
        blocksparse_params: Optional[Dict[str, Any]] = None,
        max_seq_len: int = 4096,
        attn_type: str = AttentionType.DECODER,
+        use_irope: bool = False,
    ) -> None:
        super(AttentionImpl, self).__init__()
+        if use_irope:
+            logger.warning_once(
+                "Using irope in HPU is not supported yet, it will fall back "
+                "to global attention for long context.")
        self.kv_cache_dtype = kv_cache_dtype
        self.num_heads = num_heads
        self.head_size = head_size
@@ -144,14 +149,14 @@ class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
                self.fused_scaled_dot_product_attention = ModuleFusedSDPA(
                    FusedSDPA)
            except ImportError:
-                logger().warning("Could not import HPU FusedSDPA kernel. "
+                logger.warning("Could not import HPU FusedSDPA kernel. "
-                                 "vLLM will use native implementation.")
+                               "vLLM will use native implementation.")
-        suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
+        supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
-        if head_size not in suppored_head_sizes:
+        if head_size not in supported_head_sizes:
            raise ValueError(
                f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {suppored_head_sizes}.")
+                f"Supported head sizes are: {supported_head_sizes}.")
        if attn_type != AttentionType.DECODER:
            raise NotImplementedError("Encoder self-attention and "