"lib/memory/vscode:/vscode.git/clone" did not exist on "8245633a490d1d9beebb983d5c3caa32ae188277"
Commit 66b809cc authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.2' into v0.7.2-dev

parents 37b63c24 0408efc6
# SPDX-License-Identifier: Apache-2.0
from typing import List from typing import List
import openai import openai
......
# SPDX-License-Identifier: Apache-2.0
import json import json
from typing import Generator, List, Optional from typing import Generator, List, Optional
......
# SPDX-License-Identifier: Apache-2.0
import json import json
from typing import Dict, List, Optional from typing import Dict, List, Optional
......
# SPDX-License-Identifier: Apache-2.0
import json import json
from typing import Dict, List, Optional from typing import Dict, List, Optional
......
# SPDX-License-Identifier: Apache-2.0
from copy import deepcopy from copy import deepcopy
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import os import os
......
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass from dataclasses import dataclass
import lm_eval import lm_eval
......
# SPDX-License-Identifier: Apache-2.0
import glob import glob
import os import os
import tempfile import tempfile
......
# SPDX-License-Identifier: Apache-2.0
import os import os
from vllm.config import CompilationLevel from vllm.config import CompilationLevel
......
# SPDX-License-Identifier: Apache-2.0
import os import os
import threading import threading
from concurrent import futures from concurrent import futures
......
# SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import copy import copy
import functools import functools
......
# SPDX-License-Identifier: Apache-2.0
import pytest import pytest
from vllm.multimodal.inputs import MultiModalKwargs from vllm.multimodal.inputs import MultiModalKwargs
......
# SPDX-License-Identifier: Apache-2.0
"""Compare the with and without prefix caching.""" """Compare the with and without prefix caching."""
import pytest import pytest
...@@ -164,7 +165,7 @@ def test_decode(): ...@@ -164,7 +165,7 @@ def test_decode():
req0.num_computed_tokens = 55 req0.num_computed_tokens = 55
for _ in range(4): for _ in range(4):
req0.append_output_token_ids(8) req0.append_output_token_ids(8)
new_blocks = manager.append_slots(req0, 4) new_blocks = manager.allocate_slots(req0, 4)
assert new_blocks is not None and len(new_blocks) == 0 assert new_blocks is not None and len(new_blocks) == 0
assert manager.req_to_blocks[req0.request_id][-2].block_hash is None assert manager.req_to_blocks[req0.request_id][-2].block_hash is None
...@@ -175,7 +176,7 @@ def test_decode(): ...@@ -175,7 +176,7 @@ def test_decode():
# the preallocated block. # the preallocated block.
for _ in range(5 + 10): for _ in range(5 + 10):
req0.append_output_token_ids(7) req0.append_output_token_ids(7)
new_blocks = manager.append_slots(req0, 15) new_blocks = manager.allocate_slots(req0, 15)
assert new_blocks is not None and len(new_blocks) == 0 assert new_blocks is not None and len(new_blocks) == 0
assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None
...@@ -185,7 +186,7 @@ def test_decode(): ...@@ -185,7 +186,7 @@ def test_decode():
# the preallocated block. # the preallocated block.
for _ in range(6 + 11): for _ in range(6 + 11):
req0.append_output_token_ids(12) req0.append_output_token_ids(12)
new_blocks = manager.append_slots(req0, 17) new_blocks = manager.allocate_slots(req0, 17)
# Plus one preallocated block. # Plus one preallocated block.
assert new_blocks is not None and len(new_blocks) == 2 assert new_blocks is not None and len(new_blocks) == 2
...@@ -395,12 +396,14 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): ...@@ -395,12 +396,14 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
req.num_computed_tokens = block_size req.num_computed_tokens = block_size
assert len(blocks) == 1 + num_preallocated_blocks assert len(blocks) == 1 + num_preallocated_blocks
# Assume all computed. # Assume all computed, only when num_preallocate_tokens > 0, we need to
manager.append_slots(req, block_size * (len(blocks) - 1)) # consume the previously preallocated blocks.
req.num_computed_tokens = block_size * len(blocks) if num_preallocated_blocks > 0:
manager.allocate_slots(req, block_size * (len(blocks) - 1))
req.num_computed_tokens = block_size * len(blocks)
# Append 1 block. # Append 1 block.
blocks = manager.append_slots(req, block_size) blocks = manager.allocate_slots(req, block_size)
assert len(blocks) == 1 + num_preallocated_blocks assert len(blocks) == 1 + num_preallocated_blocks
...@@ -503,7 +506,7 @@ def test_mm_prefix_caching(): ...@@ -503,7 +506,7 @@ def test_mm_prefix_caching():
# Append slots without allocating a new block. # Append slots without allocating a new block.
for _ in range(5): for _ in range(5):
req0.append_output_token_ids(8) req0.append_output_token_ids(8)
new_blocks = manager.append_slots(req0, 5) new_blocks = manager.allocate_slots(req0, 5)
assert new_blocks is not None and len(new_blocks) == 0 assert new_blocks is not None and len(new_blocks) == 0
# The just completed block should have hashes with extra keys. # The just completed block should have hashes with extra keys.
...@@ -603,7 +606,7 @@ def test_reset_prefix_cache(): ...@@ -603,7 +606,7 @@ def test_reset_prefix_cache():
unique_token_ids = [3] * 7 unique_token_ids = [3] * 7
all_token_ids = full_block_token_ids + unique_token_ids all_token_ids = full_block_token_ids + unique_token_ids
req0 = make_request("0", all_token_ids) req0 = make_request("0", all_token_ids)
blocks = manager.allocate_slots(req0, 55, []) blocks = manager.allocate_slots(req0, 55)
assert [b.block_id for b in blocks] == [0, 1, 2, 3] assert [b.block_id for b in blocks] == [0, 1, 2, 3]
unique_token_ids = [4] * 7 unique_token_ids = [4] * 7
...@@ -626,33 +629,3 @@ def test_reset_prefix_cache(): ...@@ -626,33 +629,3 @@ def test_reset_prefix_cache():
assert manager.reset_prefix_cache() assert manager.reset_prefix_cache()
assert not manager.cached_block_hash_to_block assert not manager.cached_block_hash_to_block
assert all([blk.block_hash is None for blk in manager.block_pool]) assert all([blk.block_hash is None for blk in manager.block_pool])
def test_uncache_blocks():
    """Exercise ``KVCacheManager.uncache_blocks`` after speculative appends.

    A 30-token request is allocated, five extra output tokens are appended as
    simulated speculative tokens, and then only one of them is treated as
    accepted. The test checks the size of ``cached_block_hash_to_block`` at
    each stage and the count returned by ``uncache_blocks``.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    prompt_token_ids = list(range(30))
    req0 = make_request("0", prompt_token_ids)
    allocated = manager.allocate_slots(req0, 30, [])
    assert [blk.block_id for blk in allocated] == [0, 1]
    assert len(manager.cached_block_hash_to_block) == 1
    req0.num_computed_tokens = 30

    # Simulate speculative tokens.
    for _ in range(5):
        req0.append_output_token_ids(8)
    manager.append_slots(req0, 5)
    assert len(manager.cached_block_hash_to_block) == 2

    # After sampling, assuming only 1 token is accepted.
    req0.num_computed_tokens = 31
    uncached_count = manager.uncache_blocks(req0)
    assert uncached_count == 1
    assert len(manager.cached_block_hash_to_block) == 1
# SPDX-License-Identifier: Apache-2.0
from typing import List, Optional
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.v1.core.scheduler import Scheduler
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
def create_scheduler(
    model: str = "facebook/opt-125m",
    max_num_seqs: int = 16,
    max_num_batched_tokens: int = 8192,
) -> Scheduler:
    """Build a ``Scheduler`` wired to small, fixed test configurations.

    Args:
        model: Model name, used both as the model and as the tokenizer.
        max_num_seqs: Forwarded to ``SchedulerConfig.max_num_seqs``.
        max_num_batched_tokens: Forwarded to ``SchedulerConfig`` as both
            ``max_num_batched_tokens`` and ``max_model_len``.

    Returns:
        A ``Scheduler`` built from the three configs, with no LoRA config and
        ``num_gpu_blocks`` set to 10000 on the cache config.
    """
    sched_cfg = SchedulerConfig(
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
        max_model_len=max_num_batched_tokens,
    )
    model_cfg = ModelConfig(
        model=model,
        task="auto",
        tokenizer=model,
        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="float16",
        seed=42,
    )
    cache_cfg = CacheConfig(
        block_size=16,
        gpu_memory_utilization=0.9,
        swap_space=0,
        cache_dtype="auto",
    )
    # The block count is assigned after construction rather than passed in.
    cache_cfg.num_gpu_blocks = 10000

    return Scheduler(
        sched_cfg,
        model_cfg,
        cache_cfg,
        lora_config=None,
    )
def create_requests(
    num_requests: int,
    num_tokens: int = 10,
    mm_positions: Optional[List[PlaceholderRange]] = None,
):
    """Create ``num_requests`` ``Request`` objects with synthetic prompts.

    Request ``i`` gets the request id ``str(i)`` and a prompt made of
    ``num_tokens`` copies of the token id ``i``. All requests share a single
    default ``SamplingParams`` instance.

    Args:
        num_requests: Number of requests to build.
        num_tokens: Prompt length of every request.
        mm_positions: Optional per-request placeholder data, indexed by
            request number. When given, request ``i`` also receives one empty
            ``MultiModalKwargs`` per entry of ``mm_positions[i]``.

    Returns:
        The list of created requests, in index order.
    """
    shared_params = SamplingParams()
    has_mm = mm_positions is not None

    created = []
    for idx in range(num_requests):
        placeholders = mm_positions[idx] if has_mm else None
        mm_kwargs = ([MultiModalKwargs({})] * len(placeholders)
                     if has_mm else None)
        created.append(
            Request(
                request_id=f"{idx}",
                prompt=None,
                prompt_token_ids=[idx] * num_tokens,
                sampling_params=shared_params,
                multi_modal_inputs=mm_kwargs,
                multi_modal_placeholders=placeholders,
                multi_modal_hashes=None,
                eos_token_id=None,
                arrival_time=0,
            ))
    return created
def test_add_requests():
    """Each added request is registered and grows the waiting queue by one."""
    scheduler = create_scheduler()
    requests = create_requests(num_requests=10)

    for expected_waiting, request in enumerate(requests, start=1):
        scheduler.add_request(request)
        assert request.request_id in scheduler.requests
        assert len(scheduler.waiting) == expected_waiting
def test_finish_request():
    """Aborting requests one by one removes them from the scheduler.

    After each ``finish_requests`` call the request id must no longer be in
    ``scheduler.requests`` and the waiting queue must shrink by one.
    """
    scheduler = create_scheduler()
    requests = create_requests(num_requests=10)
    for request in requests:
        scheduler.add_request(request)

    for i, request in enumerate(requests):
        scheduler.finish_requests(request.request_id,
                                  RequestStatus.FINISHED_ABORTED)
        assert request.request_id not in scheduler.requests
        # Derive the remaining count from the request list so the assertion
        # stays correct if the number of requests above is changed.
        assert len(scheduler.waiting) == len(requests) - i - 1
def test_get_num_unfinished_requests():
    """The unfinished-request count drops by one per stopped request."""
    scheduler = create_scheduler()
    requests = create_requests(num_requests=10)
    for request in requests:
        scheduler.add_request(request)

    total = len(requests)
    for finished_so_far, request in enumerate(requests, start=1):
        scheduler.finish_requests(request.request_id,
                                  RequestStatus.FINISHED_STOPPED)
        remaining = total - finished_so_far
        assert scheduler.get_num_unfinished_requests() == remaining
def test_schedule():
    """A single ``schedule()`` call schedules every waiting request in full."""
    scheduler = create_scheduler()
    requests = create_requests(num_requests=10)
    for request in requests:
        scheduler.add_request(request)

    # Test initial scheduling
    step_output = scheduler.schedule()
    assert len(step_output.scheduled_new_reqs) == len(requests)
    assert len(step_output.scheduled_cached_reqs) == 0
    assert len(step_output.finished_req_ids) == 0

    # Verify all requests are scheduled.
    for req_id, scheduled in step_output.num_scheduled_tokens.items():
        prompt_len = len(requests[int(req_id)].prompt_token_ids)
        assert scheduled == prompt_len

    # Verify requests moved from waiting to running
    assert len(scheduler.waiting) == 0
    assert len(scheduler.running) == len(requests)
    for position in range(len(requests)):
        assert scheduler.running[position] == requests[position]
def test_schedule_multimodal_requests():
    """Multimodal requests are scheduled in full, one encoder input each.

    Ten 200-token requests are created, each with a single 100-token
    placeholder range. One ``schedule()`` call must schedule all of them as
    new requests, with their full prompt length and exactly one encoder
    input per request.
    """
    scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf")
    mm_positions = [[PlaceholderRange(offset=i, length=100)]
                    for i in range(10)]
    requests = create_requests(
        num_requests=10,
        num_tokens=200,
        mm_positions=mm_positions,
    )
    for request in requests:
        scheduler.add_request(request)

    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == len(requests)
    assert len(output.scheduled_cached_reqs) == 0
    assert len(output.finished_req_ids) == 0
    for req_id, num_tokens in output.num_scheduled_tokens.items():
        assert num_tokens == len(requests[int(req_id)].prompt_token_ids)

    # Compare against the request list instead of repeating the literal
    # count, so the check follows the number of requests created above.
    assert len(output.scheduled_encoder_inputs) == len(requests)
    # Only the per-request encoder input lists matter here, not the ids.
    for encoder_input in output.scheduled_encoder_inputs.values():
        assert len(encoder_input) == 1
def test_schedule_partial_requests():
    """Check scheduling of partially scheduled multimodal requests.

    The scenario asserts two things:
    1. Several requests can be partially scheduled within one step when the
       encoder budget is the limiting factor.
    2. A request that is already RUNNING can be left out of a later step
       when the encoder budget is insufficient.
    """
    scheduler = create_scheduler(
        model="llava-hf/llava-1.5-7b-hf",
        max_num_batched_tokens=1024,
    )
    placeholder_ranges = [[PlaceholderRange(offset=100, length=600)]
                          for _ in range(3)]
    requests = create_requests(
        num_requests=3,
        num_tokens=800,
        mm_positions=placeholder_ranges,
    )
    for request in requests:
        scheduler.add_request(request)

    first_id = requests[0].request_id
    second_id = requests[1].request_id
    third_id = requests[2].request_id

    first_step = scheduler.schedule()
    assert len(first_step.scheduled_new_reqs) == 3
    assert len(first_step.scheduled_cached_reqs) == 0
    assert len(first_step.finished_req_ids) == 0

    assert scheduler.max_num_encoder_input_tokens == 1024
    # The first request is scheduled fully.
    assert first_step.num_scheduled_tokens[first_id] == 800
    # The second request is scheduled partially.
    # The <img> tokens are not scheduled because of the encoder budget.
    assert first_step.num_scheduled_tokens[second_id] == 100
    # The third request is also scheduled partially.
    # The <img> tokens are not scheduled because of the encoder budget.
    assert first_step.num_scheduled_tokens[third_id] == 100

    # Map every request id to its position in the request list.
    req_to_index = {}
    for position, request in enumerate(requests):
        req_to_index[request.request_id] = position
    model_runner_output = ModelRunnerOutput(
        req_ids=[request.request_id for request in requests],
        req_id_to_index=req_to_index,
        sampled_token_ids=[0] * len(requests),
        logprob_token_ids_cpu=None,
        logprobs_cpu=None,
    )
    scheduler.update_from_output(first_step, model_runner_output)

    # Schedule the next step.
    # Only the first and second requests are scheduled.
    # The third request is in the RUNNING state but not scheduled in this step
    # because of the encoder budget.
    second_step = scheduler.schedule()
    assert len(scheduler.running) == 3
    assert len(second_step.scheduled_new_reqs) == 0
    assert len(second_step.scheduled_cached_reqs) == 2
    assert len(second_step.finished_req_ids) == 0
    assert second_step.num_scheduled_tokens[first_id] == 1
    assert second_step.num_scheduled_tokens[second_id] == 700
    assert third_id not in second_step.num_scheduled_tokens
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
......
# SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
from contextlib import ExitStack from contextlib import ExitStack
from typing import List, Tuple from typing import List, Tuple
......
# SPDX-License-Identifier: Apache-2.0
import os import os
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0
import time import time
import uuid import uuid
......
# SPDX-License-Identifier: Apache-2.0
import asyncio import asyncio
import time import time
import uuid import uuid
......
# SPDX-License-Identifier: Apache-2.0
from typing import List from typing import List
import os import os
......
# SPDX-License-Identifier: Apache-2.0
from typing import List, Set, Tuple from typing import List, Set, Tuple
import numpy as np import numpy as np
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment