Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/tests/tool_use/test_chat_completions.py
+++ b/tests/tool_use/test_chat_completions.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List
 import openai

--- a/tests/tool_use/test_jamba_tool_parser.py
+++ b/tests/tool_use/test_jamba_tool_parser.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 from typing import Generator, List, Optional

--- a/tests/tool_use/test_parallel_tool_calls.py
+++ b/tests/tool_use/test_parallel_tool_calls.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 from typing import Dict, List, Optional

--- a/tests/tool_use/test_tool_calls.py
+++ b/tests/tool_use/test_tool_calls.py
+# SPDX-License-Identifier: Apache-2.0
 import json
 from typing import Dict, List, Optional

--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
+# SPDX-License-Identifier: Apache-2.0
 from copy import deepcopy
 from typing import Any, Dict, List, Optional
 import os

--- a/tests/tpu/test_quantization_accuracy.py
+++ b/tests/tpu/test_quantization_accuracy.py
+# SPDX-License-Identifier: Apache-2.0
 from dataclasses import dataclass
 import lm_eval

--- a/tests/tpu/untest_compilation.py
+++ b/tests/tpu/untest_compilation.py
+# SPDX-License-Identifier: Apache-2.0
 import glob
 import os
 import tempfile

--- a/tests/tpu/untest_custom_dispatcher.py
+++ b/tests/tpu/untest_custom_dispatcher.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 from vllm.config import CompilationLevel

--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import threading
 from concurrent import futures

--- a/tests/utils.py
+++ b/tests/utils.py
+# SPDX-License-Identifier: Apache-2.0
 import asyncio
 import copy
 import functools

--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
+# SPDX-License-Identifier: Apache-2.0
 import pytest
 from vllm.multimodal.inputs import MultiModalKwargs

--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
+# SPDX-License-Identifier: Apache-2.0
 """Compare the with and without prefix caching."""
 import pytest
@@ -164,7 +165,7 @@ def test_decode():
    req0.num_computed_tokens = 55
    for _ in range(4):
        req0.append_output_token_ids(8)
-    new_blocks = manager.append_slots(req0, 4)
+    new_blocks = manager.allocate_slots(req0, 4)
    assert new_blocks is not None and len(new_blocks) == 0
    assert manager.req_to_blocks[req0.request_id][-2].block_hash is None
@@ -175,7 +176,7 @@ def test_decode():
    # the preallocated block.
    for _ in range(5 + 10):
        req0.append_output_token_ids(7)
-    new_blocks = manager.append_slots(req0, 15)
+    new_blocks = manager.allocate_slots(req0, 15)
    assert new_blocks is not None and len(new_blocks) == 0
    assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None
@@ -185,7 +186,7 @@ def test_decode():
    # the preallocated block.
    for _ in range(6 + 11):
        req0.append_output_token_ids(12)
-    new_blocks = manager.append_slots(req0, 17)
+    new_blocks = manager.allocate_slots(req0, 17)
    # Plus one preallocated block.
    assert new_blocks is not None and len(new_blocks) == 2
@@ -395,12 +396,14 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
    req.num_computed_tokens = block_size
    assert len(blocks) == 1 + num_preallocated_blocks
-    # Assume all computed.
+    # Assume all computed, only when num_preallocate_tokens > 0, we need to
-    manager.append_slots(req, block_size * (len(blocks) - 1))
+    # consume the previously preallocated blocks.
-    req.num_computed_tokens = block_size * len(blocks)
+    if num_preallocated_blocks > 0:
+        manager.allocate_slots(req, block_size * (len(blocks) - 1))
+        req.num_computed_tokens = block_size * len(blocks)
    # Append 1 block.
-    blocks = manager.append_slots(req, block_size)
+    blocks = manager.allocate_slots(req, block_size)
    assert len(blocks) == 1 + num_preallocated_blocks
@@ -503,7 +506,7 @@ def test_mm_prefix_caching():
    # Append slots without allocating a new block.
    for _ in range(5):
        req0.append_output_token_ids(8)
-    new_blocks = manager.append_slots(req0, 5)
+    new_blocks = manager.allocate_slots(req0, 5)
    assert new_blocks is not None and len(new_blocks) == 0
    # The just completed block should have hashes with extra keys.
@@ -603,7 +606,7 @@ def test_reset_prefix_cache():
    unique_token_ids = [3] * 7
    all_token_ids = full_block_token_ids + unique_token_ids
    req0 = make_request("0", all_token_ids)
-    blocks = manager.allocate_slots(req0, 55, [])
+    blocks = manager.allocate_slots(req0, 55)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3]
    unique_token_ids = [4] * 7
@@ -626,33 +629,3 @@ def test_reset_prefix_cache():
    assert manager.reset_prefix_cache()
    assert not manager.cached_block_hash_to_block
    assert all([blk.block_hash is None for blk in manager.block_pool])
-def test_uncache_blocks():
-    manager = KVCacheManager(
-        block_size=16,
-        num_gpu_blocks=10,
-        max_model_len=8192,
-        sliding_window=None,
-        enable_caching=True,
-        num_preallocate_tokens=0,
-    )
-    req0 = make_request("0", list(range(30)))
-    blocks = manager.allocate_slots(req0, 30, [])
-    assert [b.block_id for b in blocks] == [0, 1]
-    assert len(manager.cached_block_hash_to_block) == 1
-    req0.num_computed_tokens = 30
-    # Simulate speculative tokens.
-    for _ in range(5):
-        req0.append_output_token_ids(8)
-    manager.append_slots(req0, 5)
-    assert len(manager.cached_block_hash_to_block) == 2
-    # After sampling, assuming only 1 token is accepted.
-    req0.num_computed_tokens = 31
-    num_uncached_blocks = manager.uncache_blocks(req0)
-    assert num_uncached_blocks == 1
-    assert len(manager.cached_block_hash_to_block) == 1
--- a/tests/v1/core/test_scheduler.py
+++ b/tests/v1/core/test_scheduler.py
+# SPDX-License-Identifier: Apache-2.0
+from typing import List, Optional
+from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
+from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
+from vllm.sampling_params import SamplingParams
+from vllm.v1.core.scheduler import Scheduler
+from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.request import Request, RequestStatus
+def create_scheduler(
+    model: str = "facebook/opt-125m",
+    max_num_seqs: int = 16,
+    max_num_batched_tokens: int = 8192,
+) -> Scheduler:
+    # Test helper: build a Scheduler from freshly constructed scheduler,
+    # model and cache configs.
+    #
+    # Args:
+    #     model: model name; also passed as the tokenizer name.
+    #     max_num_seqs: forwarded to SchedulerConfig.
+    #     max_num_batched_tokens: forwarded to SchedulerConfig and also used
+    #         as max_model_len.
+    #
+    # Returns:
+    #     A Scheduler built from the three configs with lora_config=None.
+    scheduler_config = SchedulerConfig(
+        max_num_seqs=max_num_seqs,
+        max_num_batched_tokens=max_num_batched_tokens,
+        max_model_len=max_num_batched_tokens,
+    )
+    model_config = ModelConfig(
+        model=model,
+        task="auto",
+        tokenizer=model,
+        tokenizer_mode="auto",
+        trust_remote_code=True,
+        dtype="float16",
+        seed=42,
+    )
+    cache_config = CacheConfig(
+        block_size=16,
+        gpu_memory_utilization=0.9,
+        swap_space=0,
+        cache_dtype="auto",
+    )
+    # num_gpu_blocks is assigned directly on the config after construction.
+    # NOTE(review): presumably this value is normally filled in at runtime
+    # rather than passed to the constructor -- confirm against CacheConfig.
+    cache_config.num_gpu_blocks = 10000
+    return Scheduler(scheduler_config,
+                     model_config,
+                     cache_config,
+                     lora_config=None)
+def create_requests(
+    num_requests: int,
+    num_tokens: int = 10,
+    mm_positions: Optional[List[PlaceholderRange]] = None,
+):
+    # Test helper: build `num_requests` Request objects with ids "0", "1", ...
+    #
+    # Args:
+    #     num_requests: number of requests to create.
+    #     num_tokens: prompt length; request i gets the prompt [i] * num_tokens.
+    #     mm_positions: optional sequence indexed by request number; entry i
+    #         is used as that request's multi-modal placeholders. The callers
+    #         in this file pass one list of PlaceholderRange per request.
+    #
+    # Returns:
+    #     The list of created requests, in creation order.
+    #
+    # All requests share the same default SamplingParams instance.
+    sampling_params = SamplingParams()
+    requests = []
+    for i in range(num_requests):
+        if mm_positions is not None:
+            mm_position = mm_positions[i]
+            # One (empty) MultiModalKwargs entry per placeholder; the list
+            # multiplication repeats the same object rather than copying it.
+            mm_inputs = [MultiModalKwargs({})] * len(mm_position)
+        else:
+            mm_position = None
+            mm_inputs = None
+        request = Request(
+            request_id=f"{i}",
+            prompt=None,
+            prompt_token_ids=[i] * num_tokens,
+            sampling_params=sampling_params,
+            multi_modal_inputs=mm_inputs,
+            multi_modal_placeholders=mm_position,
+            multi_modal_hashes=None,
+            eos_token_id=None,
+            arrival_time=0,
+        )
+        requests.append(request)
+    return requests
+def test_add_requests():
+    # Adding a request must register it in scheduler.requests and grow the
+    # waiting queue by exactly one entry.
+    scheduler = create_scheduler()
+    requests = create_requests(num_requests=10)
+    for i, request in enumerate(requests):
+        scheduler.add_request(request)
+        assert request.request_id in scheduler.requests
+        assert len(scheduler.waiting) == i + 1
+def test_finish_request():
+    # Finishing a request as FINISHED_ABORTED must drop it from
+    # scheduler.requests and shrink the waiting queue by one.
+    scheduler = create_scheduler()
+    requests = create_requests(num_requests=10)
+    for request in requests:
+        scheduler.add_request(request)
+    for i, request in enumerate(requests):
+        scheduler.finish_requests(request.request_id,
+                                  RequestStatus.FINISHED_ABORTED)
+        assert request.request_id not in scheduler.requests
+        # 10 requests were added, so 9 - i remain after finishing i + 1.
+        assert len(scheduler.waiting) == 9 - i
+def test_get_num_unfinished_requests():
+    # get_num_unfinished_requests() must drop by one each time a request is
+    # finished with FINISHED_STOPPED.
+    scheduler = create_scheduler()
+    requests = create_requests(num_requests=10)
+    for request in requests:
+        scheduler.add_request(request)
+    for i, request in enumerate(requests):
+        scheduler.finish_requests(request.request_id,
+                                  RequestStatus.FINISHED_STOPPED)
+        assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1
+def test_schedule():
+    # A single schedule() call over 10 short text-only requests (10 tokens
+    # each, well under the helper's 8192-token default budget) is expected to
+    # schedule every request in full and move them all from waiting to
+    # running in order.
+    scheduler = create_scheduler()
+    requests = create_requests(num_requests=10)
+    for request in requests:
+        scheduler.add_request(request)
+    # Test initial scheduling
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == len(requests)
+    assert len(output.scheduled_cached_reqs) == 0
+    assert len(output.finished_req_ids) == 0
+    # Verify all requests are scheduled.
+    # Request ids are the stringified list indices, so int(req_id) maps back
+    # to the originating request.
+    for req_id, num_tokens in output.num_scheduled_tokens.items():
+        assert num_tokens == len(requests[int(req_id)].prompt_token_ids)
+    # Verify requests moved from waiting to running
+    assert len(scheduler.waiting) == 0
+    assert len(scheduler.running) == len(requests)
+    for i, request in enumerate(requests):
+        assert scheduler.running[i] == request
+def test_schedule_multimodal_requests():
+    # Ten 200-token requests, each carrying one 100-token multi-modal
+    # placeholder, are expected to be scheduled in full in one step, with
+    # exactly one encoder input scheduled per request.
+    scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf")
+    # Request i gets a single placeholder starting at offset i.
+    mm_positions = [[PlaceholderRange(offset=i, length=100)]
+                    for i in range(10)]
+    requests = create_requests(
+        num_requests=10,
+        num_tokens=200,
+        mm_positions=mm_positions,
+    )
+    for request in requests:
+        scheduler.add_request(request)
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == len(requests)
+    assert len(output.scheduled_cached_reqs) == 0
+    assert len(output.finished_req_ids) == 0
+    # Request ids are the stringified list indices.
+    for req_id, num_tokens in output.num_scheduled_tokens.items():
+        assert num_tokens == len(requests[int(req_id)].prompt_token_ids)
+    assert len(output.scheduled_encoder_inputs) == 10
+    for req_id, encoder_input in output.scheduled_encoder_inputs.items():
+        assert len(encoder_input) == 1
+def test_schedule_partial_requests():
+    """Test scheduling behavior with partial requests.
+    This test verifies that:
+    1. The scheduler can handle multiple partial requests in a single step when
+       constrained by encoder budget.
+    2. A request in RUNNING state may be unscheduled in subsequent steps if
+       there is insufficient encoder budget.
+    """
+    scheduler = create_scheduler(
+        model="llava-hf/llava-1.5-7b-hf",
+        max_num_batched_tokens=1024,
+    )
+    # Each 800-token prompt has one 600-token placeholder starting at
+    # offset 100, so only the first 100 tokens precede the placeholder.
+    mm_positions = [[PlaceholderRange(offset=100, length=600)]
+                    for _ in range(3)]
+    requests = create_requests(
+        num_requests=3,
+        num_tokens=800,
+        mm_positions=mm_positions,
+    )
+    for request in requests:
+        scheduler.add_request(request)
+    output = scheduler.schedule()
+    assert len(output.scheduled_new_reqs) == 3
+    assert len(output.scheduled_cached_reqs) == 0
+    assert len(output.finished_req_ids) == 0
+    assert scheduler.max_num_encoder_input_tokens == 1024
+    # The first request is scheduled fully.
+    assert output.num_scheduled_tokens[requests[0].request_id] == 800
+    # The second request is scheduled partially.
+    # The <img> tokens are not scheduled because of the encoder budget.
+    assert output.num_scheduled_tokens[requests[1].request_id] == 100
+    # The third request is also scheduled partially.
+    # The <img> tokens are not scheduled because of the encoder budget.
+    assert output.num_scheduled_tokens[requests[2].request_id] == 100
+    req_to_index = {
+        request.request_id: i
+        for i, request in enumerate(requests)
+    }
+    # Fake the model runner result for the step: token id 0 is reported for
+    # every request and no logprobs are provided.
+    model_runner_output = ModelRunnerOutput(
+        req_ids=[request.request_id for request in requests],
+        req_id_to_index=req_to_index,
+        sampled_token_ids=[0] * len(requests),
+        logprob_token_ids_cpu=None,
+        logprobs_cpu=None,
+    )
+    scheduler.update_from_output(output, model_runner_output)
+    # Schedule the next step.
+    # Only the first and second requests are scheduled.
+    # The third request is in the RUNNING state but not scheduled in this step
+    # because of the encoder budget.
+    output = scheduler.schedule()
+    assert len(scheduler.running) == 3
+    assert len(output.scheduled_new_reqs) == 0
+    assert len(output.scheduled_cached_reqs) == 2
+    assert len(output.finished_req_ids) == 0
+    # NOTE(review): 1 token presumably corresponds to a single decode step for
+    # the fully prefilled first request -- confirm against the scheduler.
+    assert output.num_scheduled_tokens[requests[0].request_id] == 1
+    # 700 = 800 prompt tokens minus the 100 scheduled in the first step.
+    assert output.num_scheduled_tokens[requests[1].request_id] == 700
+    assert requests[2].request_id not in output.num_scheduled_tokens
--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
+# SPDX-License-Identifier: Apache-2.0
 from vllm import LLM, SamplingParams

--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
+# SPDX-License-Identifier: Apache-2.0
 import asyncio
 from contextlib import ExitStack
 from typing import List, Tuple

--- a/tests/v1/engine/test_engine_args.py
+++ b/tests/v1/engine/test_engine_args.py
+# SPDX-License-Identifier: Apache-2.0
 import os
 import pytest

--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
+# SPDX-License-Identifier: Apache-2.0
 import time
 import uuid

--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
+# SPDX-License-Identifier: Apache-2.0
 import asyncio
 import time
 import uuid

--- a/tests/v1/engine/test_output_processor.py
+++ b/tests/v1/engine/test_output_processor.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List
 import os

--- a/tests/v1/sample/test_sampler.py
+++ b/tests/v1/sample/test_sampler.py
+# SPDX-License-Identifier: Apache-2.0
 from typing import List, Set, Tuple
 import numpy as np