Commit afd0da21 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.1' into v0.7.1-dev

parents 1a11f127 4f4d427a
......@@ -101,32 +101,32 @@ def test_traces(trace_service):
attributes = decode_attributes(
request.resource_spans[0].scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
assert attributes.get(
SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
) == sampling_params.temperature
assert attributes.get(
SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(
SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(
SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids)
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
assert attributes.get(
SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
metrics = outputs[0].metrics
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
ttft = metrics.first_token_time - metrics.arrival_time
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
e2e_time = metrics.finished_time - metrics.arrival_time
assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
assert metrics.scheduler_time > 0
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
) == metrics.scheduler_time
# Model forward and model execute should be none, since detailed traces is
# not enabled.
assert metrics.model_forward_time is None
......@@ -167,37 +167,37 @@ def test_traces_with_detailed_steps(trace_service):
attributes = decode_attributes(
request.resource_spans[0].scope_spans[0].spans[0].attributes)
assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
assert attributes.get(
SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
) == sampling_params.temperature
assert attributes.get(
SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(
SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
assert attributes.get(
SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
outputs[0].prompt_token_ids)
completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
assert attributes.get(
SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
metrics = outputs[0].metrics
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
ttft = metrics.first_token_time - metrics.arrival_time
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
e2e_time = metrics.finished_time - metrics.arrival_time
assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
assert metrics.scheduler_time > 0
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
) == metrics.scheduler_time
assert metrics.model_forward_time > 0
assert attributes.get(
SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
metrics.model_forward_time / 1000)
assert metrics.model_execute_time > 0
assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE
assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
) == metrics.model_execute_time
assert metrics.model_forward_time < 1000 * metrics.model_execute_time
......@@ -163,13 +163,19 @@ class RemoteOpenAIServer:
def url_for(self, *parts: str) -> str:
    """Return ``self.url_root`` followed by ``/`` and the ``/``-joined parts.

    With no parts the result is the root with a trailing slash.
    """
    path = "/".join(parts)
    return self.url_root + "/" + path
def get_client(self):
def get_client(self, **kwargs):
    """Create a synchronous OpenAI client pointed at this server's ``v1`` URL.

    Extra keyword arguments are forwarded to ``openai.OpenAI``. A
    ``timeout`` of 600 is supplied unless the caller passes one. Retries
    are disabled (``max_retries=0``).
    """
    # Only fills in the timeout when the caller did not provide the key.
    kwargs.setdefault("timeout", 600)
    client = openai.OpenAI(
        base_url=self.url_for("v1"),
        api_key=self.DUMMY_API_KEY,
        max_retries=0,
        **kwargs,
    )
    return client
def get_async_client(self, **kwargs):
if "timeout" not in kwargs:
kwargs["timeout"] = 600
return openai.AsyncOpenAI(base_url=self.url_for("v1"),
api_key=self.DUMMY_API_KEY,
max_retries=0,
......@@ -816,7 +822,6 @@ async def completions_with_server_args(
assert len(max_tokens) == len(prompts)
outputs = None
max_wait_seconds = 240 * 3 # 240 is default
with RemoteOpenAIServer(model_name,
server_cli_args,
max_wait_seconds=max_wait_seconds) as server:
......
import pytest
from vllm.multimodal.inputs import MultiModalKwargs
from vllm.sampling_params import SamplingParams
from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
KVCacheBlock,
generate_block_hash_extra_keys,
hash_block_tokens,
hash_request_tokens)
from vllm.v1.request import Request
def make_request(request_id,
                 prompt_token_ids,
                 mm_positions=None,
                 mm_hashes=None):
    """Build a minimal ``Request`` for the KV-cache utility tests.

    Args:
        request_id: Identifier forwarded to ``Request``.
        prompt_token_ids: Prompt token ids forwarded to ``Request``.
        mm_positions: Optional multimodal placeholder positions. When
            given, one empty ``MultiModalKwargs`` is created per position
            and the positions are passed as ``multi_modal_placeholders``.
        mm_hashes: Optional hashes passed as ``multi_modal_hashes``.

    Returns:
        A ``Request`` with fixed sampling params (``max_tokens=17``),
        ``eos_token_id=100``, ``arrival_time=0`` and no LoRA request.
    """
    if mm_positions is None:
        multi_modal_inputs = None
    else:
        # One distinct object per position: ``[obj] * n`` would put the
        # same instance in every slot, so mutating one entry would
        # silently change all of them.
        multi_modal_inputs = [MultiModalKwargs({}) for _ in mm_positions]

    return Request(
        request_id=request_id,
        prompt=None,
        prompt_token_ids=prompt_token_ids,
        multi_modal_inputs=multi_modal_inputs,
        multi_modal_hashes=mm_hashes,
        multi_modal_placeholders=mm_positions,
        sampling_params=SamplingParams(max_tokens=17),
        eos_token_id=100,
        arrival_time=0,
        lora_request=None,
    )
def test_kv_cache_block():
    """Check KVCacheBlock defaults, ref counting, and hash set/reset."""
    blk = KVCacheBlock(block_id=0)

    # A freshly built block carries its id, no references and no hash.
    assert blk.block_id == 0
    assert blk.ref_cnt == 0
    assert blk.block_hash is None

    # Incrementing then decrementing returns the count to zero.
    blk.incr_ref()
    assert blk.ref_cnt == 1
    blk.decr_ref()
    assert blk.ref_cnt == 0

    # A hash can be assigned and then cleared with reset_hash().
    expected_hash = BlockHashType(hash_value=123, token_ids=(1, 2, 3))
    blk.block_hash = expected_hash
    assert blk.block_hash == expected_hash
    blk.reset_hash()
    assert blk.block_hash is None
def test_free_kv_cache_block_queue_initialization():
    """A queue built from one block has that block as both head and tail."""
    only_block = KVCacheBlock(block_id=0)
    free_queue = FreeKVCacheBlockQueue([only_block])

    assert free_queue.num_free_blocks == 1
    assert free_queue.free_list_head == only_block
    assert free_queue.free_list_tail == only_block
def test_free_kv_cache_block_queue_operations():
    """Exercise popleft, remove and append on a five-block free queue."""
    blocks = [KVCacheBlock(block_id=i) for i in range(5)]
    queue = FreeKVCacheBlockQueue(blocks)

    # Initially all five blocks are free, in construction order.
    assert queue.num_free_blocks == 5
    assert queue.free_list_head == blocks[0]
    assert queue.free_list_tail == blocks[4]

    # popleft() hands out the head and advances it.
    popped = queue.popleft()
    assert popped == blocks[0]
    assert queue.num_free_blocks == 4
    assert queue.free_list_head == blocks[1]
    assert queue.free_list_tail == blocks[4]

    # Removing a middle block links its neighbours to each other.
    middle = blocks[2]
    queue.remove(middle)
    assert queue.num_free_blocks == 3
    assert blocks[1].next_free_block == blocks[3]
    assert blocks[3].prev_free_block == blocks[1]

    # Appending puts the block at the tail, after the previous tail.
    queue.append(middle)
    assert queue.num_free_blocks == 4
    assert queue.free_list_tail == middle
    assert middle.prev_free_block == blocks[4]
    assert middle.next_free_block is None

    # Drain the four remaining blocks.
    for _ in range(4):
        queue.popleft()
    assert queue.num_free_blocks == 0
    assert queue.free_list_head is None
    assert queue.free_list_tail is None

    # Popping from the now-empty queue must raise with a fixed message.
    with pytest.raises(ValueError) as e:
        queue.popleft()
    assert str(e.value) == "No free blocks available"
def test_free_kv_cache_block_queue_get_all_free_blocks():
    """get_all_free_blocks() tracks the queue order across mutations."""
    blocks = [KVCacheBlock(block_id=i) for i in range(5)]
    queue = FreeKVCacheBlockQueue(blocks)

    # Untouched queue: every block, in order.
    assert queue.get_all_free_blocks() == blocks

    # After one popleft() the first block is gone.
    queue.popleft()
    assert queue.get_all_free_blocks() == blocks[1:]

    # After removing blocks[2] only blocks 1, 3 and 4 remain.
    removed = blocks[2]
    queue.remove(removed)
    survivors = blocks[1:2] + blocks[3:]
    assert queue.get_all_free_blocks() == survivors

    # Re-appending places the removed block at the end.
    queue.append(removed)
    assert queue.get_all_free_blocks() == survivors + [removed]
def test_generate_block_hash_extra_keys():
    """Check which multimodal hashes are returned for several token ranges.

    The request has 20 prompt tokens and two placeholders: offset 0 and
    offset 10, each of length 5, hashed as "hash1" and "hash2".
    """
    placeholders = [
        {"offset": 0, "length": 5},
        {"offset": 10, "length": 5},
    ]
    request = make_request(
        request_id=0,
        prompt_token_ids=list(range(20)),
        mm_positions=placeholders,
        mm_hashes=["hash1", "hash2"],
    )

    # Arguments (0, 5): only the first placeholder's hash is expected.
    keys, mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0)
    assert keys == ("hash1", )
    assert mm_idx == 1

    # Arguments (3, 8): partial overlap with the first placeholder.
    keys, mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0)
    assert keys == ("hash1", )
    assert mm_idx == 1

    # Arguments (6, 10): no placeholder hash is expected.
    keys, mm_idx = generate_block_hash_extra_keys(request, 6, 10, 0)
    assert keys == ()
    assert mm_idx == 1

    # Arguments (0, 15): both placeholder hashes are expected.
    keys, mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0)
    assert keys == ('hash1', 'hash2')
    assert mm_idx == 2
def test_generate_block_hash_extra_keys_no_mm_inputs():
    """Without multimodal inputs the extra keys are None and the index is 0."""
    text_only_request = make_request(
        request_id=0,
        prompt_token_ids=list(range(6)),
        mm_positions=None,
        mm_hashes=None,
    )

    keys, mm_idx = generate_block_hash_extra_keys(text_only_request, 0, 5, 0)

    assert keys is None
    assert mm_idx == 0
def test_hash_block_tokens():
    """hash_block_tokens() returns a BlockHashType echoing its inputs.

    The hash value must equal the builtin ``hash`` of the
    (parent hash, token ids, extra keys) tuple.
    """
    parent = 123
    token_ids = (1, 2, 3)
    extras = ("key1", "key2")

    result = hash_block_tokens(parent, token_ids, extras)

    assert isinstance(result, BlockHashType)
    expected_value = hash((parent, token_ids, extras))
    assert result.hash_value == expected_value
    assert result.token_ids == token_ids
    assert result.extra_keys == extras
def test_hash_request_tokens():
    """Six tokens with block size 3 yield two hashes, one per placeholder."""
    placeholders = [
        {"offset": 0, "length": 3},
        {"offset": 3, "length": 3},
    ]
    request = make_request(
        request_id=0,
        prompt_token_ids=list(range(6)),
        mm_positions=placeholders,
        mm_hashes=["hash1", "hash2"],
    )

    hashes = hash_request_tokens(3, request)

    assert len(hashes) == 2
    first, second = hashes[0], hashes[1]
    assert isinstance(first, BlockHashType)
    assert isinstance(second, BlockHashType)

    # First block: tokens 0-2 with the first multimodal hash.
    assert first.token_ids == (0, 1, 2)
    assert first.extra_keys == ("hash1", )

    # Second block: tokens 3-5 with the second multimodal hash.
    assert second.token_ids == (3, 4, 5)
    assert second.extra_keys == ("hash2", )
def test_hash_tokens_different_mm_input():
    """Same tokens with a different first multimodal hash hash differently.

    The two requests differ only in the first multimodal hash
    ("hash1" vs "hash3"); both block hashes are expected to differ.
    """

    def fresh_positions():
        # A new list per request so the two requests share no objects.
        return [
            {"offset": 0, "length": 3},
            {"offset": 3, "length": 3},
        ]

    request1 = make_request(
        request_id=0,
        prompt_token_ids=list(range(6)),
        mm_positions=fresh_positions(),
        mm_hashes=["hash1", "hash2"],
    )
    request2 = make_request(
        request_id=1,
        prompt_token_ids=list(range(6)),
        mm_positions=fresh_positions(),
        mm_hashes=["hash3", "hash2"],
    )

    block_size = 3
    hashes1 = hash_request_tokens(block_size, request1)
    hashes2 = hash_request_tokens(block_size, request2)

    assert hashes1[0] != hashes2[0]
    assert hashes1[1] != hashes2[1]
def test_hash_request_tokens_no_mm_inputs():
    """A text-only request yields block hashes whose extra keys are None."""
    request = make_request(
        request_id=0,
        prompt_token_ids=list(range(6)),
        mm_positions=None,
        mm_hashes=None,
    )

    hashes = hash_request_tokens(3, request)

    assert len(hashes) == 2
    first, second = hashes[0], hashes[1]
    assert first.token_ids == (0, 1, 2)
    assert first.extra_keys is None
    assert second.token_ids == (3, 4, 5)
    assert second.extra_keys is None
"""Compare the with and without prefix caching."""
import pytest
from vllm.inputs import token_inputs
from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.utils import cdiv
from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
......@@ -13,12 +12,18 @@ def make_request(request_id,
prompt_token_ids,
mm_positions=None,
mm_hashes=None):
if mm_positions is None:
multi_modal_inputs = None
else:
multi_modal_inputs = [MultiModalKwargs({})] * len(mm_positions)
return Request(
request_id=request_id,
inputs=token_inputs(prompt_token_ids=prompt_token_ids,
multi_modal_placeholders={"image": mm_positions}
if mm_positions else None,
multi_modal_hashes=mm_hashes),
prompt=None,
prompt_token_ids=prompt_token_ids,
multi_modal_inputs=multi_modal_inputs,
multi_modal_hashes=mm_hashes,
multi_modal_placeholders=mm_positions,
sampling_params=SamplingParams(max_tokens=17),
eos_token_id=100,
arrival_time=0,
......@@ -44,9 +49,10 @@ def test_prefill():
unique_token_ids = [3] * 7
all_token_ids = common_token_ids + unique_token_ids
req0 = make_request("0", all_token_ids)
computed_blocks = manager.get_computed_blocks(req0)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert len(req0.kv_block_hashes) == 3
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 55, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
......@@ -68,9 +74,10 @@ def test_prefill():
# Incomplete 1 block (5 tokens)
unique_token_ids = [3] * 5
req1 = make_request("1", common_token_ids + unique_token_ids)
computed_blocks = manager.get_computed_blocks(req1)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert len(req1.kv_block_hashes) == 3
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
assert num_computed_tokens == 3 * 16
num_new_tokens = 53 - 3 * 16
blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
assert [b.block_id for b in blocks] == [5, 6]
......@@ -86,7 +93,7 @@ def test_prefill():
# All blocks should be available.
assert manager.free_block_queue.num_free_blocks == 10
# The order should be
# [unallocated (7, 8)]
# [unallocated (7, 8, 9)]
# [unique_req0 (4, 3)]
# [unique_req1 (6, 5)]
# [common (2, 1, 0)]
......@@ -98,9 +105,10 @@ def test_prefill():
# Incomplete 1 block (6 tokens)
unique_token_ids = [3] * 6
req2 = make_request("2", common_token_ids + unique_token_ids)
computed_block = manager.get_computed_blocks(req2)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert len(req2.kv_block_hashes) == 3
assert [b.block_id for b in computed_block] == [0, 1, 2]
assert [b.block_id for b in computed_blocks] == [0, 1, 2]
assert num_computed_tokens == 3 * 16
num_new_tokens = 53 - 3 * 16
blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
assert [b.block_id for b in blocks] == [7, 8]
......@@ -118,8 +126,9 @@ def test_prefill():
# Cache miss and eviction.
req3 = make_request("3", [99] * (16 * 9))
computed_blocks = manager.get_computed_blocks(req3)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
# This block ID order also checks the eviction order.
assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
......@@ -145,8 +154,9 @@ def test_decode():
# Incomplete 1 block (7 tokens)
unique_token_ids = [3] * 7
req0 = make_request("0", common_token_ids + unique_token_ids)
computed_blocks = manager.get_computed_blocks(req0)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 55, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
......@@ -192,16 +202,18 @@ def test_evict():
last_token_id = 5 * 16 + 7
req0 = make_request("0", list(range(last_token_id)))
computed_blocks = manager.get_computed_blocks(req0)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, 5 * 16 + 7, computed_blocks)
assert len(blocks) == 7 # 5 full + 1 partial + 1 preallocated
# 3 blocks.
req1 = make_request("1", list(range(last_token_id,
last_token_id + 3 * 16)))
computed_blocks = manager.get_computed_blocks(req1)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req1, 3 * 16, computed_blocks)
assert len(blocks) == 3 # 3 full blocks
last_token_id += 3 * 16
......@@ -217,8 +229,9 @@ def test_evict():
# Touch the first 2 blocks.
req2 = make_request("2", list(range(2 * 16 + 3)))
computed_blocks = manager.get_computed_blocks(req2)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert [b.block_id for b in computed_blocks] == [0, 1]
assert num_computed_tokens == 2 * 16
blocks = manager.allocate_slots(req2, 3, computed_blocks)
assert [b.block_id for b in blocks] == [6, 5]
assert manager.free_block_queue.num_free_blocks == 6
......@@ -242,8 +255,9 @@ def test_hash_block_correct_reuse():
# Allocate 1 block and cache it.
num_tokens = block_size * 1
req = make_request("0", list(range(num_tokens)))
computed_blocks = manager.get_computed_blocks(req)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req, num_tokens, computed_blocks)
assert len(blocks) == 1
......@@ -253,8 +267,9 @@ def test_hash_block_correct_reuse():
# Allocate a new block that's not full, make sure hash info on the
# block is cleared.
req = make_request("1", list(range(num_tokens - 1)))
computed_blocks = manager.get_computed_blocks(req)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req, num_tokens - 1, computed_blocks)
assert len(blocks) == 1
......@@ -279,16 +294,18 @@ def test_computed_blocks_not_evicted():
# Allocate a block and cache it.
num_tokens = block_size * 1
req0 = make_request("0", list(range(num_tokens)))
computed_blocks = manager.get_computed_blocks(req0)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req0, num_tokens, computed_blocks)
assert len(blocks) == 1
assert blocks[0].block_id == 0
# Allocate another block.
req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
computed_blocks = manager.get_computed_blocks(req1)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req1, num_tokens, computed_blocks)
assert len(blocks) == 1
assert blocks[0].block_id == 1
......@@ -300,9 +317,10 @@ def test_computed_blocks_not_evicted():
# Now if we have a cache hit on the first block, we should evict the second
# cached block rather than the first one.
req2 = make_request("2", list(range(num_tokens * 2)))
computed_blocks = manager.get_computed_blocks(req2)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert len(computed_blocks) == 1
assert computed_blocks[0].block_id == 0
assert num_computed_tokens == block_size
blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
computed_blocks)
......@@ -326,8 +344,9 @@ def test_basic_prefix_caching_disabled():
req1 = make_request("1", list(range(10))) # 2 blocks and some more
computed_blocks = manager.get_computed_blocks(req1)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req1, 10, computed_blocks)
assert len(blocks) == 3
......@@ -336,15 +355,17 @@ def test_basic_prefix_caching_disabled():
# No caching.
req2 = make_request("2", list(range(16))) # shared prefix
computed_blocks = manager.get_computed_blocks(req2)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req2, 16, computed_blocks)
assert len(blocks) == 4
# New requests should not have any blocks.
req3 = make_request("3", list(range(4)))
computed_blocks = manager.get_computed_blocks(req3)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
assert not computed_blocks
assert num_computed_tokens == 0
blocks = manager.allocate_slots(req3, 4, computed_blocks)
assert not blocks
......@@ -366,8 +387,9 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
num_preallocated_blocks = cdiv(num_preallocate_tokens, block_size)
req = make_request("0", list(range(block_size * 30)))
computed_blocks = manager.get_computed_blocks(req)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
assert not computed_blocks
assert num_computed_tokens == 0
# Just ask for 1 block.
blocks = manager.allocate_slots(req, block_size, computed_blocks)
req.num_computed_tokens = block_size
......@@ -464,14 +486,15 @@ def test_mm_prefix_caching():
all_token_ids,
mm_positions=mm_positions,
mm_hashes=mm_hashes)
computed_blocks = manager.get_computed_blocks(req0)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
# Completed block should have hashes with extra keys.
assert not computed_blocks
assert num_computed_tokens == 0
assert len(req0.kv_block_hashes) == 3
assert req0.kv_block_hashes[0].extra_keys == (("aaa", 0), )
assert req0.kv_block_hashes[1].extra_keys == (("aaa", 5), ("bbb", 0))
assert req0.kv_block_hashes[2].extra_keys == (("bbb", 2), )
assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
assert req0.kv_block_hashes[2].extra_keys == ("bbb", )
blocks = manager.allocate_slots(req0, 59, computed_blocks)
assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
......@@ -485,7 +508,7 @@ def test_mm_prefix_caching():
# The just completed block should have hashes with extra keys.
assert len(req0.kv_block_hashes) == 4
assert req0.kv_block_hashes[3].extra_keys == (("ccc", 0), )
assert req0.kv_block_hashes[3].extra_keys == ("ccc", )
# Cache hit.
unique_token_ids = [-1] * 7 + [200] * 5
......@@ -498,5 +521,138 @@ def test_mm_prefix_caching():
all_token_ids,
mm_positions=mm_positions,
mm_hashes=mm_hashes)
computed_blocks = manager.get_computed_blocks(req1)
computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
assert len(computed_blocks) == 3
assert num_computed_tokens == 3 * 16
def test_prefill_not_enough_free_blocks_with_computed_blocks():
    """
    Check allocate_slots when there are not enough free blocks.

    When a request has computed (cached) blocks but allocation fails,
    allocate_slots must return None and leave the reference counts of
    those computed blocks unchanged.
    """
    block_size = 16
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    # req0: three full blocks (48 tokens), nothing cached yet.
    # | Common-0 | Common-1 | Common-2 | ... |
    common_token_ids = [tok for tok in range(3) for _ in range(16)]
    req0 = make_request("0", common_token_ids)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks
    assert num_computed_tokens == 0
    manager.allocate_slots(req0, 48, computed_blocks)
    req0_blocks = manager.req_to_blocks[req0.request_id]

    # req1: the common prompt twice, so its first three blocks hit req0's.
    # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
    req1 = make_request("1", common_token_ids * 2)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert computed_blocks == req0_blocks
    assert num_computed_tokens == 3 * 16
    manager.allocate_slots(req1, 48, computed_blocks)
    req1_blocks = manager.req_to_blocks[req1.request_id]

    # Free req1 only; req0 is never freed in this test.
    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
    # | Req1-5(F)| ... |
    manager.free(req1)
    assert {block.ref_cnt for block in req1_blocks[:3]} == {1}
    assert {block.ref_cnt for block in req1_blocks[3:]} == {0}

    # req2: two blocks of unrelated tokens, no cache hit.
    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
    # | Req1-5(F)| Req2-0 | Req2-1 | ... |
    req2 = make_request("2", [7] * block_size * 2)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert not computed_blocks
    assert num_computed_tokens == 0
    manager.allocate_slots(req2, block_size * 2, computed_blocks)

    # req3: the common prompt three times, i.e. req1's prompt plus three
    # more blocks. Its first six blocks are already computed, but the
    # allocation of the remainder is expected to fail, and that failure
    # must not change the ref_cnt of the computed blocks.
    assert manager.free_block_queue.num_free_blocks == 5
    req3 = make_request("3", common_token_ids * 3)
    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
    assert computed_blocks == req1_blocks
    assert num_computed_tokens == 6 * 16

    # The allocation for req3 fails.
    assert manager.allocate_slots(req3, 48, computed_blocks) is None

    # The three shared blocks keep ref_cnt 1, same as before the attempt.
    assert {block.ref_cnt for block in req1_blocks[:3]} == {1}
    # The three blocks released by req1 stay at ref_cnt 0.
    assert {block.ref_cnt for block in req1_blocks[3:]} == {0}
def test_reset_prefix_cache():
    """reset_prefix_cache() fails while blocks are in use, then succeeds.

    After both requests are freed the reset must clear the hash-to-block
    map and the hash on every block in the pool.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    # Three full blocks shared by both requests, plus a 7-token tail each.
    shared_prefix = [tok for tok in range(3) for _ in range(16)]

    req0 = make_request("0", shared_prefix + [3] * 7)
    blocks = manager.allocate_slots(req0, 55, [])
    assert [b.block_id for b in blocks] == [0, 1, 2, 3]

    req1 = make_request("1", shared_prefix + [4] * 7)
    computed_blocks, _ = manager.get_computed_blocks(req1)
    assert len(req1.kv_block_hashes) == 3
    assert len(computed_blocks) == 3
    blocks = manager.allocate_slots(req1, 7, computed_blocks)
    assert [b.block_id for b in blocks] == [4]

    # Both requests still hold blocks, so the reset is refused.
    assert not manager.reset_prefix_cache()
    assert manager.cached_block_hash_to_block

    # Once both requests are freed the reset goes through.
    manager.free(req0)
    manager.free(req1)

    assert manager.reset_prefix_cache()
    assert not manager.cached_block_hash_to_block
    assert all(blk.block_hash is None for blk in manager.block_pool)
def test_uncache_blocks():
    """uncache_blocks() drops the cache entry added for rejected tokens.

    Five appended output tokens raise the cached-block count from 1 to 2;
    after setting num_computed_tokens to 31, one block is uncached.
    """
    manager = KVCacheManager(
        block_size=16,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )

    # 30 prompt tokens: two blocks allocated, one of them cached.
    req0 = make_request("0", list(range(30)))
    blocks = manager.allocate_slots(req0, 30, [])
    assert [b.block_id for b in blocks] == [0, 1]
    assert len(manager.cached_block_hash_to_block) == 1

    req0.num_computed_tokens = 30

    # Simulate five speculative tokens being appended.
    num_speculative = 5
    for _ in range(num_speculative):
        req0.append_output_token_ids(8)
    manager.append_slots(req0, 5)
    assert len(manager.cached_block_hash_to_block) == 2

    # After sampling, assume only one of the five tokens was accepted.
    req0.num_computed_tokens = 31
    num_uncached_blocks = manager.uncache_blocks(req0)
    assert num_uncached_blocks == 1
    assert len(manager.cached_block_hash_to_block) == 1
from vllm import LLM, SamplingParams
def test_cascade_attention(example_system_message, monkeypatch):
    """A batch of 64 identical prompts must match the single-prompt output.

    Generation is greedy (temperature 0.0) and runs with VLLM_USE_V1=1.
    """
    prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
    full_prompt = example_system_message + prompt

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)

        # Reference run with a single prompt (no cascade attention).
        reference = llm.generate([full_prompt], sampling_params)
        ref_output = reference[0].outputs[0].text

        # 64 copies of the prompt (probably uses cascade attention).
        batch = llm.generate([full_prompt] * 64, sampling_params)
        for response in batch:
            assert response.outputs[0].text == ref_output
import asyncio
from typing import Tuple
from contextlib import ExitStack
from typing import List, Tuple
import os
import pytest
......@@ -7,6 +8,7 @@ import pytest
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.platforms import current_platform
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine.async_llm import AsyncLLM
from ...utils import models_path_prefix
......@@ -15,32 +17,44 @@ if not current_platform.is_cuda():
allow_module_level=True)
ENGINE_ARGS = AsyncEngineArgs(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
enforce_eager=True,
disable_log_requests=True)
async def generate(engine: AsyncLLM, request_id: str,
output_kind: RequestOutputKind,
max_tokens: int) -> Tuple[int, str]:
count = 0
async for _ in engine.generate(request_id=request_id,
prompt="Hello my name is Robert and",
sampling_params=SamplingParams(
max_tokens=max_tokens, temperature=0)):
sampling_params = SamplingParams(max_tokens=max_tokens,
output_kind=output_kind,
temperature=0)
async for out in engine.generate(request_id=request_id,
prompt="Hello my name is Robert and",
sampling_params=sampling_params):
num_tokens = len(out.outputs[0].token_ids)
if output_kind == RequestOutputKind.DELTA:
count += num_tokens
else:
count = num_tokens
count += 1
await asyncio.sleep(0.)
return count, request_id
@pytest.mark.parametrize(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.asyncio
async def test_load(monkeypatch):
async def test_load(monkeypatch, output_kind: RequestOutputKind):
# TODO(rickyx): Remove monkeypatch once we have a better way to test V1
# so that in the future when we switch, we don't have to change all the
# tests.
with monkeypatch.context() as m:
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
after.callback(engine.shutdown)
NUM_REQUESTS = 10000
NUM_EXPECTED_TOKENS = 10
......@@ -52,20 +66,72 @@ async def test_load(monkeypatch):
for request_id in request_ids:
tasks.append(
asyncio.create_task(
generate(engine, request_id, NUM_EXPECTED_TOKENS)))
generate(engine, request_id, output_kind,
NUM_EXPECTED_TOKENS)))
# Confirm that we got all the EXPECTED tokens from the requests.
failed_request_id = None
tokens = None
for task in tasks:
done, pending = await asyncio.wait(tasks,
return_when=asyncio.FIRST_EXCEPTION)
for task in pending:
task.cancel()
for task in done:
num_generated_tokens, request_id = await task
if (num_generated_tokens != NUM_EXPECTED_TOKENS
and failed_request_id is None):
failed_request_id = request_id
tokens = num_generated_tokens
assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
f"{request_id} generated {num_generated_tokens} but "
f"expected {NUM_EXPECTED_TOKENS}")
assert failed_request_id is None, (
f"{failed_request_id} generated {tokens} but "
f"expected {NUM_EXPECTED_TOKENS}")
assert not engine.output_processor.has_unfinished_requests()
engine.shutdown()
# Checks that cancelling some in-flight generate() tasks leaves the remaining
# requests unaffected and the engine reusable. Runs once per output kind
# (DELTA and FINAL_ONLY).
@pytest.mark.parametrize(
"output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.asyncio
async def test_abort(monkeypatch, output_kind: RequestOutputKind):
# The ExitStack callback registered below calls engine.shutdown when the
# `with` block exits, including when an assertion fails.
with monkeypatch.context() as m, ExitStack() as after:
m.setenv("VLLM_USE_V1", "1")
engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
after.callback(engine.shutdown)
NUM_REQUESTS = 100
NUM_EXPECTED_TOKENS = 100
# Task indices 1, 11, 21, ..., 91 are the ones that get cancelled.
REQUEST_IDS_TO_ABORT = range(1, 100, 10)
request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
# Create concurrent requests.
tasks: List[asyncio.Task] = []
for request_id in request_ids:
tasks.append(
asyncio.create_task(
generate(engine, request_id, output_kind,
NUM_EXPECTED_TOKENS)))
# API server cancels requests when they disconnect.
for idx in REQUEST_IDS_TO_ABORT:
tasks[idx].cancel()
# Yield to the event loop so the cancellation(s) can be delivered.
# NOTE(review): indentation is lost in this view, so it is unclear whether
# this sleep runs once per cancelled task or once after the loop - confirm.
await asyncio.sleep(0.1)
# Confirm the other requests are okay.
for idx, task in enumerate(tasks):
# Confirm that it was actually canceled.
if idx in REQUEST_IDS_TO_ABORT:
with pytest.raises(asyncio.CancelledError):
await task
else:
# Otherwise, make sure the request was not impacted.
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
f"{request_id} generated {num_generated_tokens} but "
f"expected {NUM_EXPECTED_TOKENS}")
# Neither the cancelled nor the completed requests may remain tracked by
# the output processor.
assert not engine.output_processor.has_unfinished_requests()
# Confirm we can do another generation.
# Reuses the id of the first cancelled request ("request-1").
request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
task = asyncio.create_task(
generate(engine, request_id, output_kind, NUM_EXPECTED_TOKENS))
num_generated_tokens, request_id = await task
assert num_generated_tokens == NUM_EXPECTED_TOKENS
assert not engine.output_processor.has_unfinished_requests()
......@@ -5,13 +5,13 @@ import os
import pytest
from transformers import AutoTokenizer
from tests.utils import fork_new_process_for_each_test
from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core import EngineCore
from vllm.v1.executor.abstract import Executor
from ...utils import models_path_prefix
if not current_platform.is_cuda():
......@@ -39,19 +39,18 @@ def make_request() -> EngineCoreRequest:
)
@fork_new_process_for_each_test
def test_engine_core(monkeypatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
"""Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config(
usage_context=UsageContext.UNKNOWN_CONTEXT)
executor_class = AsyncLLM._get_executor_cls(vllm_config)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
engine_core = EngineCore(vllm_config=vllm_config,
executor_class=executor_class,
usage_context=UsageContext.UNKNOWN_CONTEXT)
executor_class=executor_class)
"""Test basic request lifecycle."""
# First request.
......@@ -83,7 +82,7 @@ def test_engine_core(monkeypatch):
assert len(engine_core.scheduler.running) == 4
# Loop through until they are all done.
while len(engine_core.step()) > 0:
while len(engine_core.step().outputs) > 0:
pass
assert len(engine_core.scheduler.waiting) == 0
......@@ -143,23 +142,22 @@ def test_engine_core(monkeypatch):
assert len(engine_core.scheduler.running) == 0
@fork_new_process_for_each_test
def test_engine_core_advanced_sampling(monkeypatch):
"""
A basic end-to-end test to verify that the engine functions correctly
when additional sampling parameters, such as min_tokens and
when additional sampling parameters, such as top_p, min_tokens, and
presence_penalty, are set.
"""
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
"""Setup the EngineCore."""
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config(
usage_context=UsageContext.UNKNOWN_CONTEXT)
executor_class = AsyncLLM._get_executor_cls(vllm_config)
vllm_config = engine_args.create_engine_config()
executor_class = Executor.get_class(vllm_config)
engine_core = EngineCore(vllm_config=vllm_config,
executor_class=executor_class,
usage_context=UsageContext.UNKNOWN_CONTEXT)
executor_class=executor_class)
"""Test basic request lifecycle."""
# First request.
request: EngineCoreRequest = make_request()
......@@ -171,11 +169,23 @@ def test_engine_core_advanced_sampling(monkeypatch):
stop_token_ids=[1001, 1002],
)
engine_core.add_request(request)
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
# Loop through until they are all done.
while len(engine_core.step()) > 0:
pass
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
def _check_engine_state():
assert len(engine_core.scheduler.waiting) == 1
assert len(engine_core.scheduler.running) == 0
# Loop through until they are all done.
while len(engine_core.step().outputs) > 0:
pass
assert len(engine_core.scheduler.waiting) == 0
assert len(engine_core.scheduler.running) == 0
_check_engine_state()
# Second request.
request2 = make_request()
request2.sampling_params = SamplingParams(
top_p=0.99,
top_k=50,
)
engine_core.add_request(request2)
_check_engine_state()
......@@ -7,13 +7,14 @@ import os
import pytest
from transformers import AutoTokenizer
from tests.utils import fork_new_process_for_each_test
from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.platforms import current_platform
from vllm.usage.usage_lib import UsageContext
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.async_llm import AsyncLLM
from vllm.v1.engine.core_client import EngineCoreClient
from vllm.v1.executor.abstract import Executor
from ...utils import models_path_prefix
if not current_platform.is_cuda():
......@@ -44,7 +45,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
def loop_until_done(client: EngineCoreClient, outputs: Dict):
while True:
engine_core_outputs = client.get_output()
engine_core_outputs = client.get_output().outputs
if len(engine_core_outputs) == 0:
break
......@@ -62,7 +63,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict):
async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
while True:
engine_core_outputs = await client.get_output_async()
engine_core_outputs = await client.get_output_async().outputs
if len(engine_core_outputs) == 0:
break
......@@ -77,6 +78,7 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
break
@fork_new_process_for_each_test
@pytest.mark.parametrize("multiprocessing_mode", [True, False])
def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
......@@ -86,13 +88,12 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3)
vllm_config = engine_args.create_engine_config(
UsageContext.UNKNOWN_CONTEXT)
executor_class = AsyncLLM._get_executor_cls(vllm_config)
executor_class = Executor.get_class(vllm_config)
client = EngineCoreClient.make_client(
vllm_config,
executor_class,
UsageContext.UNKNOWN_CONTEXT,
multiprocess_mode=multiprocessing_mode,
asyncio_mode=False,
vllm_config=vllm_config,
executor_class=executor_class,
)
MAX_TOKENS = 20
......@@ -145,10 +146,8 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
client.abort_requests([request.request_id])
# Shutdown the client.
client.shutdown()
@fork_new_process_for_each_test
@pytest.mark.asyncio
async def test_engine_core_client_asyncio(monkeypatch):
......@@ -158,13 +157,12 @@ async def test_engine_core_client_asyncio(monkeypatch):
engine_args = EngineArgs(model=MODEL_NAME)
vllm_config = engine_args.create_engine_config(
usage_context=UsageContext.UNKNOWN_CONTEXT)
executor_class = AsyncLLM._get_executor_cls(vllm_config)
executor_class = Executor.get_class(vllm_config)
client = EngineCoreClient.make_client(
vllm_config,
executor_class,
UsageContext.UNKNOWN_CONTEXT,
multiprocess_mode=True,
asyncio_mode=True,
vllm_config=vllm_config,
executor_class=executor_class,
)
MAX_TOKENS = 20
......@@ -204,6 +202,3 @@ async def test_engine_core_client_asyncio(monkeypatch):
else:
assert len(outputs[req_id]) == MAX_TOKENS, (
f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
# Shutdown the client.
client.shutdown()
from typing import List
import os
import pytest
from transformers import AutoTokenizer
import os
from vllm.sampling_params import RequestOutputKind
from vllm.v1.engine import EngineCoreOutput
from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest
from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import RequestOutputKind, SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
from vllm.v1.engine.output_processor import OutputProcessor
from ...utils import models_path_prefix
TOKENIZER_NAME = os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3")
VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config()
TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config,
VLLM_CONFIG.scheduler_config,
VLLM_CONFIG.parallel_config,
VLLM_CONFIG.lora_config)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
FULL_STRINGS = [
......@@ -68,28 +76,34 @@ class MockEngineCore:
"request_output_kind",
[RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
def test_incremental_detokenization(request_output_kind: RequestOutputKind):
detokenizer = Detokenizer(TOKENIZER_NAME)
output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
engine_core = MockEngineCore(GENERATION_TOKENS)
# Make N requests.
requests = [
DetokenizerRequest(
request_id=f"request-{idx}",
prompt=prompt,
prompt_token_ids=prompt_tokens,
skip_special_tokens=False,
spaces_between_special_tokens=False,
output_kind=request_output_kind,
stop=[],
include_stop_str_in_output=False,
) for idx, (
EngineCoreRequest(request_id=f"request-{idx}",
prompt=prompt,
prompt_token_ids=prompt_tokens,
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
eos_token_id=None,
lora_request=None,
sampling_params=SamplingParams(
skip_special_tokens=False,
spaces_between_special_tokens=False,
output_kind=request_output_kind,
stop=[],
include_stop_str_in_output=False))
for idx, (
prompt,
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
]
# Add requests to the detokenizer.
for request in requests:
detokenizer.add_request(request)
output_processor.add_request(request)
gen_strings = {}
gen_tokens = {}
......@@ -100,7 +114,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
break
# Step the Detokenizer.
request_outputs, requests_to_abort = detokenizer.step(outputs)
processed_outputs = output_processor.process_outputs(outputs, )
request_outputs = processed_outputs.request_outputs
requests_to_abort = processed_outputs.reqs_to_abort
assert len(requests_to_abort) == 0
# Update tracking.
......@@ -124,34 +140,41 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"
assert detokenizer.get_num_unfinished_requests() == 0
assert not detokenizer.has_unfinished_requests()
assert output_processor.get_num_unfinished_requests() == 0
assert not output_processor.has_unfinished_requests()
@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
def test_stop_string(include_stop_str_in_output: bool):
detokenizer = Detokenizer(TOKENIZER_NAME)
output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
engine_core = MockEngineCore(GENERATION_TOKENS)
# Make N requests.
requests = [
DetokenizerRequest(
EngineCoreRequest(
request_id=f"request-{idx}",
prompt=prompt,
prompt_token_ids=prompt_tokens,
skip_special_tokens=False,
spaces_between_special_tokens=False,
output_kind=RequestOutputKind.DELTA,
stop=STOP_STRINGS,
include_stop_str_in_output=include_stop_str_in_output,
) for idx, (
prompt,
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
arrival_time=0,
mm_inputs=None,
mm_hashes=None,
mm_placeholders=None,
eos_token_id=None,
lora_request=None,
sampling_params=SamplingParams(
skip_special_tokens=False,
spaces_between_special_tokens=False,
output_kind=RequestOutputKind.DELTA,
stop=STOP_STRINGS,
include_stop_str_in_output=include_stop_str_in_output,
)) for idx, (
prompt,
prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
]
# Add requests to the detokenizer.
for request in requests:
detokenizer.add_request(request)
output_processor.add_request(request)
gen_strings = {}
aborted = []
......@@ -162,7 +185,9 @@ def test_stop_string(include_stop_str_in_output: bool):
break
# Step the Detokenizer.
request_outputs, requests_to_abort = detokenizer.step(outputs)
processed_outputs = output_processor.process_outputs(outputs)
request_outputs = processed_outputs.request_outputs
requests_to_abort = processed_outputs.reqs_to_abort
for request_output in request_outputs:
# If aborted, we should not get a request output.
assert request_output.request_id not in aborted
......@@ -203,5 +228,71 @@ def test_stop_string(include_stop_str_in_output: bool):
assert gen_str == ref_str_exc_stop, (
f"{gen_str=}, {ref_str_exc_stop=}")
assert detokenizer.get_num_unfinished_requests() == 0
assert not detokenizer.has_unfinished_requests()
assert output_processor.get_num_unfinished_requests() == 0
assert not output_processor.has_unfinished_requests()
def test_iteration_stats():
    """Check the per-iteration token counters reported by OutputProcessor.

    All requests but one are registered up front. Each engine step is fed
    through the processor, and the resulting ``iteration_stats`` must report
    prompt tokens only on the step where a request is first seen, and one
    generation token per active request on every step.
    """
    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True)
    engine_core = MockEngineCore(GENERATION_TOKENS)

    def _step_stats(active_count):
        # Pull the next batch from the mock engine, keep only the outputs of
        # the currently registered requests, and return that step's stats.
        batch = engine_core.get_outputs()[:active_count]
        return output_processor.process_outputs(batch).iteration_stats

    # Make N requests.
    requests = [
        EngineCoreRequest(
            request_id=f"request-{idx}",
            prompt=prompt,
            prompt_token_ids=prompt_tokens,
            arrival_time=0,
            mm_inputs=None,
            mm_hashes=None,
            mm_placeholders=None,
            eos_token_id=None,
            lora_request=None,
            sampling_params=SamplingParams(),
        ) for idx, (prompt,
                    prompt_tokens) in enumerate(
                        zip(PROMPT_STRINGS, PROMPT_TOKENS))
    ]

    # Register every request except the last one; it joins later.
    num_active = len(GENERATION_TOKENS) - 1
    for request in requests[:num_active]:
        output_processor.add_request(request)
    inactive_request = requests[num_active]

    # First step: every registered request contributes its prompt tokens.
    stats = _step_stats(num_active)
    expected_prompt_tokens = sum(
        len(prompt_tokens) for prompt_tokens in PROMPT_TOKENS[:num_active])
    assert stats.num_prompt_tokens == expected_prompt_tokens
    assert stats.num_generation_tokens == num_active

    # Second step: decodes only, so no prompt tokens are counted.
    stats = _step_stats(num_active)
    assert stats.num_prompt_tokens == 0
    assert stats.num_generation_tokens == num_active

    # Register the held-back request: this step counts its prompt plus one
    # generation token for every active request.
    output_processor.add_request(inactive_request)
    num_active += 1
    stats = _step_stats(num_active)
    expected_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
    assert stats.num_prompt_tokens == expected_prompt_tokens
    assert stats.num_generation_tokens == num_active

    # Final step: decodes only again.
    stats = _step_stats(num_active)
    assert stats.num_prompt_tokens == 0
    assert stats.num_generation_tokens == num_active
import pytest
from vllm.sampling_params import SamplingParams
from vllm.v1.stats.common import RequestStats, RequestStatsUpdate
def make_update(
    request_id: str,
    update_type: RequestStatsUpdate.Type,
    monotonic_ts_s: float,
    **kwargs,
):
    """Build a ``RequestStatsUpdate`` with placeholder payload fields.

    For the update types handled below, every payload field the caller did
    not supply through ``kwargs`` receives a fixed placeholder value. Other
    update types are constructed from the given arguments alone.
    """
    kinds = RequestStatsUpdate.Type
    if update_type == kinds.INPUT_PROCESSED:
        placeholders = {
            "sampling_params": SamplingParams(n=1),
            "num_prompt_tokens": 10,
        }
    elif update_type == kinds.PREFILLING:
        placeholders = {
            "num_computed_tokens": 10,
            "num_cached_tokens": 10,
        }
    elif update_type == kinds.DETOKENIZED:
        placeholders = {"num_new_tokens": 10}
    elif update_type == kinds.FINISHED:
        placeholders = {"finish_reason": "test_reason"}
    else:
        placeholders = {}

    # Caller-provided values always win over the placeholders.
    for field_name, value in placeholders.items():
        kwargs.setdefault(field_name, value)

    return RequestStatsUpdate(
        request_id=request_id,
        type=update_type,
        monotonic_ts_s=monotonic_ts_s,
        **kwargs,
    )
def test_invalid_request_update():
    """Constructing an update without a required field raises ValueError.

    For each update type that has type-specific required fields, every field
    is omitted in turn while the others are filled with opaque placeholder
    objects. Update types with no entry in the table are not exercised.
    """
    request_id = "test_request"
    kinds = RequestStatsUpdate.Type
    required_by_type = {
        kinds.INPUT_PROCESSED: [
            "sampling_params",
            "num_prompt_tokens",
        ],
        kinds.PREFILLING: [
            "num_computed_tokens",
            "num_cached_tokens",
        ],
        kinds.DETOKENIZED: ["num_new_tokens"],
        kinds.FINISHED: ["finish_reason"],
    }

    for update_type in kinds:
        required_fields = required_by_type.get(update_type, [])
        placeholders = {name: object() for name in required_fields}
        for omitted in required_fields:
            # Everything except the one field under test is supplied.
            partial_kwargs = {
                name: value
                for name, value in placeholders.items() if name != omitted
            }
            with pytest.raises(ValueError):
                RequestStatsUpdate(
                    request_id=request_id,
                    type=update_type,
                    **partial_kwargs,
                )
def test_invalid_request_update_transition():
    """Validate transition and timestamp checks of ``check_valid_update``.

    Every (previous type, new type) pair is tried: pairs listed in
    ``RequestStatsUpdate._VALID_TRANSITIONS`` must be accepted, all others
    must trip an ``AssertionError``. Finally, an update whose timestamp is
    older than the last recorded one must be rejected as well.
    """

    def _check(new_type, previous_type, last_ts):
        # Validate an update stamped at t=1 against the given prior state.
        RequestStatsUpdate.check_valid_update(
            make_update(
                request_id="test_request",
                update_type=new_type,
                monotonic_ts_s=1,
            ),
            last_update_type=previous_type,
            last_updated_ts_s=last_ts,
        )

    kinds = RequestStatsUpdate.Type

    # Test invalid transition type.
    for src in kinds:
        for dst in kinds:
            if dst in RequestStatsUpdate._VALID_TRANSITIONS[src]:
                _check(dst, src, 0)
            else:
                with pytest.raises(AssertionError):
                    _check(dst, src, 0)

    # Test invalid timestamp: the update (t=1) predates the last one (t=2).
    with pytest.raises(AssertionError):
        _check(kinds.ARRIVED, None, 2)
def test_lifecycle_updates():
    """Drive one request through a full lifecycle and check its stats.

    The scenario is: arrive, input-process, queue, prefill, two
    decode/detokenize rounds, preemption, a resumed prefill, one more
    decode/detokenize round, and finish. After each update the fields of
    ``RequestStats`` that the update is expected to touch are asserted.
    """
    request_id = "test_request"
    stats = RequestStats(request_id=request_id)

    # Test the below scenario:
    arrived_ts = 0
    input_processed_ts = 1
    queued_ts = 2
    prefilling_ts = 3
    decoded_ts = 5
    detokenized_ts = 6
    decoded_2_ts = 7
    detokenized_2_ts = 8
    preempted_ts = 9
    resumed_ts = 10
    decoded_3_ts = 11
    detokenized_3_ts = 12
    finished_ts = 13

    # Test ARRIVED
    arrived_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.ARRIVED,
        monotonic_ts_s=arrived_ts,
    )
    stats.update_from(arrived_update)
    assert stats.arrival_ts_s == arrived_ts
    assert stats.last_updated_ts_s == arrived_ts

    # Test INPUT_PROCESSED
    sampling_params = SamplingParams(n=1)
    input_processed_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.INPUT_PROCESSED,
        monotonic_ts_s=input_processed_ts,
        sampling_params=sampling_params,
        num_prompt_tokens=6,
    )
    stats.update_from(input_processed_update)
    assert stats.input_processor_end_ts_s == input_processed_ts
    assert stats.last_updated_ts_s == input_processed_ts
    assert stats.num_prompt_tokens == 6
    assert stats.sampling_params == sampling_params
    # Nothing has been scheduled or generated yet.
    assert stats.first_token_ts_s is None
    assert stats.prefill_ts_s is None

    # Test QUEUED
    queued_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.QUEUED,
        monotonic_ts_s=queued_ts,
    )
    stats.update_from(queued_update)
    assert stats.queued_ts_s == queued_ts
    assert stats.last_updated_ts_s == queued_ts

    # Test PREFILLING
    prefilling_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.PREFILLING,
        monotonic_ts_s=prefilling_ts,
        num_computed_tokens=3,
        num_cached_tokens=1,
    )
    stats.update_from(prefilling_update)
    assert stats.prefill_ts_s == prefilling_ts
    assert stats.num_computed_tokens == 3
    assert stats.num_cached_tokens == 1
    assert stats.queue_duration_s == prefilling_ts - queued_ts

    # Test DECODING
    decoded_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.DECODING,
        monotonic_ts_s=decoded_ts,
    )
    stats.update_from(decoded_update)
    assert stats.last_updated_ts_s == decoded_ts

    # Test DETOKENIZED
    detokenized_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.DETOKENIZED,
        monotonic_ts_s=detokenized_ts,
        num_new_tokens=1,
    )
    stats.update_from(detokenized_update)
    assert stats.last_updated_ts_s == detokenized_ts
    assert stats.num_output_tokens == 1
    # Since arrival
    assert stats.first_token_latency_s == detokenized_ts - arrived_ts
    # Since first scheduled
    assert stats.prefill_latency_s == detokenized_ts - prefilling_ts

    # Test another DECODING and DETOKENIZED should
    # yield correct inter token latency
    decoded_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.DECODING,
        monotonic_ts_s=decoded_2_ts,
    )
    stats.update_from(decoded_update)
    detokenized_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.DETOKENIZED,
        monotonic_ts_s=detokenized_2_ts,
        num_new_tokens=1,
    )
    stats.update_from(detokenized_update)
    assert stats.output_token_latency_s_lst == [
        detokenized_2_ts - detokenized_ts,
    ]
    assert stats.num_output_tokens == 2

    # Test PREEMPTED
    preempted_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.PREEMPTED,
        monotonic_ts_s=preempted_ts,
    )
    stats.update_from(preempted_update)
    assert stats.last_updated_ts_s == preempted_ts
    assert stats.preempted_ts_s_lst == [preempted_ts]
    # States should be reset
    assert stats.num_computed_tokens == 0
    assert stats.num_cached_tokens == 0
    # These states should not be reset
    assert stats.num_output_tokens == 2
    assert stats.output_token_latency_s_lst == [
        detokenized_2_ts - detokenized_ts,
    ]
    # Same definition as the checks after the first DETOKENIZED update and
    # after FINISHED: first scheduling -> first token. The previous
    # expression (prefilling_ts - arrived_ts) only matched because both
    # differences happen to be 3 with the timestamps chosen above.
    assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
    assert stats.num_prompt_tokens == 6
    assert stats.prefill_start_ts_s_lst == [prefilling_ts]

    # Test resumed
    resumed_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.PREFILLING,
        monotonic_ts_s=resumed_ts,
        num_computed_tokens=6,
        num_cached_tokens=2,
    )
    stats.update_from(resumed_update)
    # prefill timestamp should not be updated since it's a resumed prefill
    assert stats.prefill_ts_s == prefilling_ts
    assert stats.num_computed_tokens == 6
    assert stats.num_cached_tokens == 2
    assert stats.prefill_start_ts_s_lst == [
        prefilling_ts,
        resumed_ts,
    ]
    assert stats.last_updated_ts_s == resumed_ts

    # Test another DECODED/DETOKENIZED should yield correct first token latency.
    decoded_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.DECODING,
        monotonic_ts_s=decoded_3_ts,
    )
    detokenized_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.DETOKENIZED,
        monotonic_ts_s=detokenized_3_ts,
        num_new_tokens=1,
    )
    stats.update_from(decoded_update)
    stats.update_from(detokenized_update)
    # The first-token timestamp must still be the one recorded before the
    # preemption. It was previously compared against a latency expression
    # (detokenized_ts - arrived_ts), which only held because arrived_ts is 0.
    assert stats.first_token_ts_s == detokenized_ts
    # The latency derived from it is likewise unchanged (same value as the
    # checks after the first DETOKENIZED update and after FINISHED).
    assert stats.first_token_latency_s == detokenized_ts - arrived_ts
    assert stats.num_output_tokens == 3
    assert stats.output_token_latency_s_lst == [
        detokenized_2_ts - detokenized_ts,
        detokenized_3_ts - detokenized_2_ts,
    ]

    # Test FINISHED
    finished_update = RequestStatsUpdate(
        request_id=request_id,
        type=RequestStatsUpdate.Type.FINISHED,
        monotonic_ts_s=finished_ts,
        finish_reason="test_reason",
    )
    stats.update_from(finished_update)
    assert stats.last_updated_ts_s == finished_ts
    assert stats.e2e_latency_s == finished_ts - arrived_ts
    assert stats.inference_latency_s == finished_ts - prefilling_ts
    assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
    assert stats.decode_latency_s == finished_ts - detokenized_ts
    assert stats.first_token_latency_s == detokenized_ts - arrived_ts
    assert stats.queue_duration_s == prefilling_ts - queued_ts
    assert stats.is_finished
    assert stats.finish_reason == "test_reason"

    # TODO(rickyx): Add model forward/execute time.
    assert stats.model_forward_duration_s == 0.0
    assert stats.model_execute_duration_s == 0.0
from typing import List
import torch
from vllm.v1.utils import bind_kv_cache
def test_bind_kv_cache():
    """bind_kv_cache wires each layer's tensor into the layer and the runner.

    Four consecutively numbered attention layers are bound. Afterwards each
    layer's ``kv_cache[0]`` and the runner list entry at the layer's position
    must be the very tensor object supplied for that layer.
    """
    from vllm.attention import Attention

    layer_names = [f'layers.{i}.self_attn' for i in range(4)]
    ctx = {name: Attention(32, 128, 0.1) for name in layer_names}
    kv_cache = {name: torch.zeros((1, )) for name in layer_names}
    runner_kv_caches: List[torch.Tensor] = []

    bind_kv_cache(kv_cache, ctx, runner_kv_caches)

    # Identity (not equality) checks: the same tensor objects must be shared.
    for name in layer_names:
        assert ctx[name].kv_cache[0] is kv_cache[name]
    for position, name in enumerate(layer_names):
        assert runner_kv_caches[position] is kv_cache[name]
def test_bind_kv_cache_non_attention():
    """bind_kv_cache handles attention layers with non-contiguous indices.

    Only layers 20 and 28 are attention layers here, yet the runner list
    must hold their tensors at positions 0 and 1 respectively.
    """
    from vllm.attention import Attention

    # example from Jamba PP=2
    layer_names = ['model.layers.20.attn', 'model.layers.28.attn']
    ctx = {name: Attention(32, 128, 0.1) for name in layer_names}
    kv_cache = {name: torch.zeros((1, )) for name in layer_names}
    runner_kv_caches: List[torch.Tensor] = []

    bind_kv_cache(kv_cache, ctx, runner_kv_caches)

    # Identity (not equality) checks: the same tensor objects must be shared.
    for name in layer_names:
        assert ctx[name].kv_cache[0] is kv_cache[name]
    for position, name in enumerate(layer_names):
        assert runner_kv_caches[position] is kv_cache[name]
......@@ -4,5 +4,6 @@ It does not import any vLLM modules.
"""
from .blame import BlameResult, blame
from .monitor import MonitoredValues, monitor
__all__ = ["blame", "BlameResult"]
__all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"]
import contextlib
import dataclasses
import sys
import traceback
from typing import Callable, Generator, Generic, TypeVar
_T = TypeVar("_T")


@dataclasses.dataclass
class MonitoredValues(Generic[_T]):
    """History recorded by :func:`monitor`; the two lists grow in lockstep."""

    # Successive distinct measurements, in the order they were observed.
    values: list[_T] = dataclasses.field(default_factory=list)
    # Formatted call stack captured when the matching entry of ``values``
    # was recorded.
    trace_stacks: list[str] = dataclasses.field(default_factory=list)


@contextlib.contextmanager
def monitor(
    measure_func: Callable[[],
                           _T]) -> Generator[MonitoredValues[_T], None, None]:
    """
    Trace the function calls to continuously monitor the change of
    a value.

    Usage:

    ```python

    def measure_func():
        ... # measure the current value
        return current_value

    with monitor(measure_func) as monitored_values:
        # do something

        monitored_values.values # all changes of the values
        monitored_values.trace_stacks # trace stacks of every change
    ```

    ``measure_func`` is called on every traced ``line`` event. A measurement
    is recorded, together with the current stack, when it is the first one
    or when it differs from the previously recorded one. On exit the trace
    function is cleared with ``sys.settrace(None)``; any trace function that
    was installed before entering is not restored.
    """
    monitored_values = MonitoredValues[_T]()

    def _trace_calls(frame, event, arg=None):
        if event == "line":
            # triggered by every line of Python code.
            # only Python functions will trigger it,
            # c/cpp functions will not trigger it.
            try:
                # Temporarily disable the trace function so that the
                # measurement itself is not traced.
                sys.settrace(None)
                # do a measurement
                current_value = measure_func()
                history = monitored_values.values
                if not history or current_value != history[-1]:
                    history.append(current_value)
                    monitored_values.trace_stacks.append("".join(
                        traceback.format_stack()))
                # Re-enable the trace function
                sys.settrace(_trace_calls)
            except NameError:
                # modules are deleted during shutdown; tracing is then left
                # disabled.
                pass
        # Returned for every event so that the frame keeps being traced.
        return _trace_calls

    try:
        sys.settrace(_trace_calls)
        yield monitored_values
    finally:
        sys.settrace(None)
......@@ -20,7 +20,7 @@ compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
#compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-FP8-Dynamic-testing, main, 90
compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing, main, 90
awq, casperhansen/mixtral-instruct-awq, main
......@@ -30,4 +30,5 @@ marlin, nm-testing/zephyr-beta-7b-marlin-g128, main
marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main
qqq, HandH1998/QQQ-Llama-3-8b-g128, main
qqq, HandH1998/QQQ-Llama-3-8b, main
hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
\ No newline at end of file
hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
None, mgleize/fairseq2-dummy-Llama-3.2-1B, main
\ No newline at end of file
......@@ -3,7 +3,7 @@ SUCCESS=0
while getopts "c:" OPT; do
case ${OPT} in
c )
c )
CONFIG="$OPTARG"
;;
\? )
......@@ -18,9 +18,14 @@ IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
if [[ $MODEL_CONFIG == \#* ]]; then
echo "=== SKIPPING MODEL: $MODEL_CONFIG ==="
continue
fi
LOCAL_SUCCESS=0
IFS=', ' read -r -a array <<< "$MODEL_CONFIG"
echo "=== RUNNING MODEL: $MODEL_CONFIG ==="
export QUANTIZATION=${array[0]}
......
......@@ -21,12 +21,13 @@ def test_weight_loading(vllm_runner):
"""
Test parameter weight loading with tp>1.
"""
with vllm_runner(model_name=MODEL_NAME,
# revision=REVISION,
dtype=torch.half if QUANTIZATION == "gptq" else "auto",
quantization=QUANTIZATION,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=2) as model:
with vllm_runner(
model_name=MODEL_NAME,
# revision=REVISION,
dtype=torch.half if QUANTIZATION == "gptq" else "auto",
quantization=None if QUANTIZATION == "None" else QUANTIZATION,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=2) as model:
output = model.generate_greedy("Hello world!", max_tokens=20)
print(output)
......
......@@ -74,6 +74,7 @@ def test_model_runner_input():
num_decode_tokens=3,
slot_mapping=torch.zeros(1),
multi_modal_placeholder_index_maps=None,
enable_kv_scales_calculation=True,
)
model_input = ModelInputForGPUWithSamplingMetadata(
input_tokens=torch.ones(10),
......@@ -126,6 +127,7 @@ def test_embedding_model_runner_input():
num_decode_tokens=3,
slot_mapping=torch.zeros(1),
multi_modal_placeholder_index_maps=None,
enable_kv_scales_calculation=True,
)
model_input = ModelInputForGPUWithPoolingMetadata(
input_tokens=torch.ones(10),
......@@ -177,6 +179,7 @@ def test_multi_step_model_runner_input():
num_decode_tokens=3,
slot_mapping=torch.zeros(1),
multi_modal_placeholder_index_maps=None,
enable_kv_scales_calculation=True,
)
frozen_model_input = ModelInputForGPUWithSamplingMetadata(
input_tokens=torch.ones(10),
......
#!/bin/bash
# Run actionlint with the arguments given to this script, using the first
# available of: an `actionlint` on PATH, an executable ./actionlint in the
# current directory, or a freshly downloaded binary.
if command -v actionlint &> /dev/null; then
# Found on PATH.
actionlint "$@"
# NOTE(review): this branch exits 0 regardless of actionlint's own status,
# whereas the download path at the bottom propagates it - confirm intended.
exit 0
elif [ -x ./actionlint ]; then
# Previously downloaded (or otherwise local) binary.
./actionlint "$@"
# NOTE(review): same unconditional success as above.
exit 0
fi
# download a binary to the current directory - v1.7.3
# The download script is fetched from a pinned commit of rhysd/actionlint.
bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
# Last command of the script, so its exit status becomes the script's.
./actionlint "$@"
#!/bin/bash
CI=${1:-0}
PYTHON_VERSION=${2:-3.9}
PYTHON_VERSION=${2:-local}
if [ "$CI" -eq 1 ]; then
set -e
fi
if [ $PYTHON_VERSION == "local" ]; then
PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
fi
run_mypy() {
echo "Running mypy on $1"
if [ "$CI" -eq 1 ] && [ -z "$1" ]; then
......@@ -23,6 +27,7 @@ run_mypy vllm/compilation
run_mypy vllm/distributed
run_mypy vllm/engine
run_mypy vllm/executor
run_mypy vllm/inputs
run_mypy vllm/lora
run_mypy vllm/model_executor
run_mypy vllm/plugins
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment