Merge tag 'v0.7.1' into v0.7.1-dev

afd0da21 · zhuwenwen · 1a11f127 · 4f4d427a · afd0da21 · afd0da21
Commit afd0da21 authored Feb 03, 2025 by zhuwenwen
20 changed files
--- a/tests/tracing/test_tracing.py
+++ b/tests/tracing/test_tracing.py
@@ -101,32 +101,32 @@ def test_traces(trace_service):

    attributes = decode_attributes(
        request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
        outputs[0].prompt_token_ids)
    completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
    assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
    metrics = outputs[0].metrics
    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
    ttft = metrics.first_token_time - metrics.arrival_time
    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
    e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
    assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
    # Model forward and model execute should be none, since detailed traces is
    # not enabled.
    assert metrics.model_forward_time is None
@@ -167,37 +167,37 @@ def test_traces_with_detailed_steps(trace_service):

    attributes = decode_attributes(
        request.resource_spans[0].scope_spans[0].spans[0].attributes)
-    assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model
+    assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id
+        SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE
+                          ) == sampling_params.temperature
    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature
+        SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p
    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p
-    assert attributes.get(
-        SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
-    assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n
-    assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len(
+        SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens
+    assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n
+    assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len(
        outputs[0].prompt_token_ids)
    completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs)
    assert attributes.get(
-        SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens
+        SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens
    metrics = outputs[0].metrics
    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue
    ttft = metrics.first_token_time - metrics.arrival_time
    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
+        SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft
    e2e_time = metrics.finished_time - metrics.arrival_time
-    assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time
    assert metrics.scheduler_time > 0
-    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER
+                          ) == metrics.scheduler_time
    assert metrics.model_forward_time > 0
    assert attributes.get(
-        SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
+        SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx(
            metrics.model_forward_time / 1000)
    assert metrics.model_execute_time > 0
-    assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE
+    assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE
                          ) == metrics.model_execute_time
    assert metrics.model_forward_time < 1000 * metrics.model_execute_time
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -163,13 +163,19 @@ class RemoteOpenAIServer:
    def url_for(self, *parts: str) -> str:
        return self.url_root + "/" + "/".join(parts)

-    def get_client(self):
+    def get_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
        return openai.OpenAI(
            base_url=self.url_for("v1"),
            api_key=self.DUMMY_API_KEY,
+            max_retries=0,
+            **kwargs,
        )

    def get_async_client(self, **kwargs):
+        if "timeout" not in kwargs:
+            kwargs["timeout"] = 600
        return openai.AsyncOpenAI(base_url=self.url_for("v1"),
                                  api_key=self.DUMMY_API_KEY,
                                  max_retries=0,
@@ -816,7 +822,6 @@ async def completions_with_server_args(
    assert len(max_tokens) == len(prompts)

    outputs = None
-    max_wait_seconds = 240 * 3  # 240 is default
    with RemoteOpenAIServer(model_name,
                            server_cli_args,
                            max_wait_seconds=max_wait_seconds) as server:

--- a/tests/v1/core/test_kv_cache_utils.py
+++ b/tests/v1/core/test_kv_cache_utils.py
+import pytest
+
+from vllm.multimodal.inputs import MultiModalKwargs
+from vllm.sampling_params import SamplingParams
+from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue,
+                                         KVCacheBlock,
+                                         generate_block_hash_extra_keys,
+                                         hash_block_tokens,
+                                         hash_request_tokens)
+from vllm.v1.request import Request
+
+
+def make_request(request_id,
+                 prompt_token_ids,
+                 mm_positions=None,
+                 mm_hashes=None):
+    if mm_positions is None:
+        multi_modal_inputs = None
+    else:
+        multi_modal_inputs = [MultiModalKwargs({})] * len(mm_positions)
+
+    return Request(
+        request_id=request_id,
+        prompt=None,
+        prompt_token_ids=prompt_token_ids,
+        multi_modal_inputs=multi_modal_inputs,
+        multi_modal_hashes=mm_hashes,
+        multi_modal_placeholders=mm_positions,
+        sampling_params=SamplingParams(max_tokens=17),
+        eos_token_id=100,
+        arrival_time=0,
+        lora_request=None,
+    )
+
+
+def test_kv_cache_block():
+    # Test KVCacheBlock initialization
+    block = KVCacheBlock(block_id=0)
+    assert block.block_id == 0
+    assert block.ref_cnt == 0
+    assert block.block_hash is None
+
+    # Test reference count manipulation
+    block.incr_ref()
+    assert block.ref_cnt == 1
+    block.decr_ref()
+    assert block.ref_cnt == 0
+
+    # Test block hash setting and resetting
+    block_hash = BlockHashType(hash_value=123, token_ids=(1, 2, 3))
+    block.block_hash = block_hash
+    assert block.block_hash == block_hash
+
+    block.reset_hash()
+    assert block.block_hash is None
+
+
+def test_free_kv_cache_block_queue_initialization():
+    # Test with a single block
+    block = KVCacheBlock(block_id=0)
+    queue = FreeKVCacheBlockQueue([block])
+    assert queue.num_free_blocks == 1
+    assert queue.free_list_head == block
+    assert queue.free_list_tail == block
+
+
+def test_free_kv_cache_block_queue_operations():
+    # Create a list of KVCacheBlock objects
+    blocks = [KVCacheBlock(block_id=i) for i in range(5)]
+
+    # Create a FreeKVCacheBlockQueue with these blocks
+    queue = FreeKVCacheBlockQueue(blocks)
+
+    # Check initial state
+    assert queue.num_free_blocks == 5
+    assert queue.free_list_head == blocks[0]
+    assert queue.free_list_tail == blocks[4]
+
+    # Pop the first block
+    block1 = queue.popleft()
+    assert block1 == blocks[0]
+    assert queue.num_free_blocks == 4
+    assert queue.free_list_head == blocks[1]
+    assert queue.free_list_tail == blocks[4]
+
+    # Remove a block from the middle
+    block_to_remove = blocks[2]
+    queue.remove(block_to_remove)
+    assert queue.num_free_blocks == 3
+    assert blocks[1].next_free_block == blocks[3]
+    assert blocks[3].prev_free_block == blocks[1]
+
+    # Append a block back
+    queue.append(block_to_remove)
+    assert queue.num_free_blocks == 4
+    assert queue.free_list_tail == block_to_remove
+    assert block_to_remove.prev_free_block == blocks[4]
+    assert block_to_remove.next_free_block is None
+
+    # Pop blocks until empty
+    for _ in range(4):
+        queue.popleft()
+    assert queue.num_free_blocks == 0
+    assert queue.free_list_head is None
+    assert queue.free_list_tail is None
+
+    # Attempt to pop from an empty queue
+    with pytest.raises(ValueError) as e:
+        queue.popleft()
+    assert str(e.value) == "No free blocks available"
+
+
+def test_free_kv_cache_block_queue_get_all_free_blocks():
+    # Create a list of KVCacheBlock objects
+    blocks = [KVCacheBlock(block_id=i) for i in range(5)]
+
+    # Create a FreeKVCacheBlockQueue with these blocks
+    queue = FreeKVCacheBlockQueue(blocks)
+
+    # Check all blocks are correctly retrieved
+    assert queue.get_all_free_blocks() == blocks
+
+    # Pop a block and check again
+    queue.popleft()
+    assert queue.get_all_free_blocks() == blocks[1:]
+
+    # Remove a block and check again
+    block_to_remove = blocks[2]
+    queue.remove(block_to_remove)
+    assert queue.get_all_free_blocks() == blocks[1:2] + blocks[3:]
+
+    # Append a block back and check again
+    queue.append(block_to_remove)
+    assert queue.get_all_free_blocks() == \
+        blocks[1:2] + blocks[3:] + [block_to_remove]
+
+
+def test_generate_block_hash_extra_keys():
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(20)],
+        mm_positions=[{
+            "offset": 0,
+            "length": 5
+        }, {
+            "offset": 10,
+            "length": 5
+        }],
+        mm_hashes=["hash1", "hash2"],
+    )
+
+    # Test a token range covering only the first mm input (one extra key)
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0)
+    assert extra_keys == ("hash1", )
+    assert next_mm_idx == 1
+
+    # Test with partial overlap
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0)
+    assert extra_keys == ("hash1", )
+    assert next_mm_idx == 1
+
+    # Test with no overlap
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 6, 10, 0)
+    assert extra_keys == ()
+    assert next_mm_idx == 1
+
+    # Test with multiple extra keys
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0)
+    assert extra_keys == ('hash1', 'hash2')
+    assert next_mm_idx == 2
+
+
+def test_generate_block_hash_extra_keys_no_mm_inputs():
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=None,
+        mm_hashes=None,
+    )
+
+    extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0)
+    assert extra_keys is None
+    assert next_mm_idx == 0
+
+
+def test_hash_block_tokens():
+    parent_block_hash = 123
+    curr_block_token_ids = (1, 2, 3)
+    extra_keys = ("key1", "key2")
+
+    block_hash = hash_block_tokens(parent_block_hash, curr_block_token_ids,
+                                   extra_keys)
+    assert isinstance(block_hash, BlockHashType)
+    assert block_hash.hash_value == hash(
+        (parent_block_hash, curr_block_token_ids, extra_keys))
+    assert block_hash.token_ids == curr_block_token_ids
+    assert block_hash.extra_keys == extra_keys
+
+
+def test_hash_request_tokens():
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=[{
+            "offset": 0,
+            "length": 3
+        }, {
+            "offset": 3,
+            "length": 3
+        }],
+        mm_hashes=["hash1", "hash2"],
+    )
+
+    block_size = 3
+    block_hashes = hash_request_tokens(block_size, request)
+
+    assert len(block_hashes) == 2
+    assert isinstance(block_hashes[0], BlockHashType)
+    assert isinstance(block_hashes[1], BlockHashType)
+
+    # Check the first block
+    assert block_hashes[0].token_ids == (0, 1, 2)
+    assert block_hashes[0].extra_keys == ("hash1", )
+
+    # Check the second block
+    assert block_hashes[1].token_ids == (3, 4, 5)
+    assert block_hashes[1].extra_keys == ("hash2", )
+
+
+def test_hash_tokens_different_mm_input():
+    request1 = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=[{
+            "offset": 0,
+            "length": 3
+        }, {
+            "offset": 3,
+            "length": 3
+        }],
+        mm_hashes=["hash1", "hash2"],
+    )
+    request2 = make_request(
+        request_id=1,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=[{
+            "offset": 0,
+            "length": 3
+        }, {
+            "offset": 3,
+            "length": 3
+        }],
+        mm_hashes=["hash3", "hash2"],
+    )
+    block_size = 3
+    block_hashes1 = hash_request_tokens(block_size, request1)
+    block_hashes2 = hash_request_tokens(block_size, request2)
+    assert block_hashes1[0] != block_hashes2[0]
+    assert block_hashes1[1] != block_hashes2[1]
+
+
+def test_hash_request_tokens_no_mm_inputs():
+    request = make_request(
+        request_id=0,
+        prompt_token_ids=[_ for _ in range(6)],
+        mm_positions=None,
+        mm_hashes=None,
+    )
+
+    block_size = 3
+    block_hashes = hash_request_tokens(block_size, request)
+
+    assert len(block_hashes) == 2
+    assert block_hashes[0].token_ids == (0, 1, 2)
+    assert block_hashes[0].extra_keys is None
+    assert block_hashes[1].token_ids == (3, 4, 5)
+    assert block_hashes[1].extra_keys is None
--- a/tests/v1/core/test_prefix_caching.py
+++ b/tests/v1/core/test_prefix_caching.py
 """Compare the with and without prefix caching."""
 import pytest

-from vllm.inputs import token_inputs
-from vllm.multimodal.inputs import PlaceholderRange
+from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams
 from vllm.utils import cdiv
 from vllm.v1.core.kv_cache_manager import KVCacheManager, Request
@@ -13,12 +12,18 @@ def make_request(request_id,
                 prompt_token_ids,
                 mm_positions=None,
                 mm_hashes=None):
+    if mm_positions is None:
+        multi_modal_inputs = None
+    else:
+        multi_modal_inputs = [MultiModalKwargs({})] * len(mm_positions)
+
    return Request(
        request_id=request_id,
-        inputs=token_inputs(prompt_token_ids=prompt_token_ids,
-                            multi_modal_placeholders={"image": mm_positions}
-                            if mm_positions else None,
-                            multi_modal_hashes=mm_hashes),
+        prompt=None,
+        prompt_token_ids=prompt_token_ids,
+        multi_modal_inputs=multi_modal_inputs,
+        multi_modal_hashes=mm_hashes,
+        multi_modal_placeholders=mm_positions,
        sampling_params=SamplingParams(max_tokens=17),
        eos_token_id=100,
        arrival_time=0,
@@ -44,9 +49,10 @@ def test_prefill():
    unique_token_ids = [3] * 7
    all_token_ids = common_token_ids + unique_token_ids
    req0 = make_request("0", all_token_ids)
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert len(req0.kv_block_hashes) == 3
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, 55, computed_blocks)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]

@@ -68,9 +74,10 @@ def test_prefill():
    # Incomplete 1 block (5 tokens)
    unique_token_ids = [3] * 5
    req1 = make_request("1", common_token_ids + unique_token_ids)
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert len(req1.kv_block_hashes) == 3
    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
+    assert num_computed_tokens == 3 * 16
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks)
    assert [b.block_id for b in blocks] == [5, 6]
@@ -86,7 +93,7 @@ def test_prefill():
    # All blocks should be available.
    assert manager.free_block_queue.num_free_blocks == 10
    # The order should be
-    # [unallocated (7, 8)]
+    # [unallocated (7, 8, 9)]
    # [unique_req0 (4, 3)]
    # [unique_req1 (6, 5)]
    # [common (2, 1, 0)]
@@ -98,9 +105,10 @@ def test_prefill():
    # Incomplete 1 block (6 tokens)
    unique_token_ids = [3] * 6
    req2 = make_request("2", common_token_ids + unique_token_ids)
-    computed_block = manager.get_computed_blocks(req2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(req2.kv_block_hashes) == 3
-    assert [b.block_id for b in computed_block] == [0, 1, 2]
+    assert [b.block_id for b in computed_blocks] == [0, 1, 2]
+    assert num_computed_tokens == 3 * 16
    num_new_tokens = 53 - 3 * 16
    blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks)
    assert [b.block_id for b in blocks] == [7, 8]
@@ -118,8 +126,9 @@ def test_prefill():

    # Cache miss and eviction.
    req3 = make_request("3", [99] * (16 * 9))
-    computed_blocks = manager.get_computed_blocks(req3)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
    # This block ID order also checks the eviction order.
    assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
@@ -145,8 +154,9 @@ def test_decode():
    # Incomplete 1 block (7 tokens)
    unique_token_ids = [3] * 7
    req0 = make_request("0", common_token_ids + unique_token_ids)
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, 55, computed_blocks)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]

@@ -192,16 +202,18 @@ def test_evict():

    last_token_id = 5 * 16 + 7
    req0 = make_request("0", list(range(last_token_id)))
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, 5 * 16 + 7, computed_blocks)
    assert len(blocks) == 7  # 5 full + 1 partial + 1 preallocated

    # 3 blocks.
    req1 = make_request("1", list(range(last_token_id,
                                        last_token_id + 3 * 16)))
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req1, 3 * 16, computed_blocks)
    assert len(blocks) == 3  # 3 full blocks
    last_token_id += 3 * 16
@@ -217,8 +229,9 @@ def test_evict():

    # Touch the first 2 blocks.
    req2 = make_request("2", list(range(2 * 16 + 3)))
-    computed_blocks = manager.get_computed_blocks(req2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert [b.block_id for b in computed_blocks] == [0, 1]
+    assert num_computed_tokens == 2 * 16
    blocks = manager.allocate_slots(req2, 3, computed_blocks)
    assert [b.block_id for b in blocks] == [6, 5]
    assert manager.free_block_queue.num_free_blocks == 6
@@ -242,8 +255,9 @@ def test_hash_block_correct_reuse():
    # Allocate 1 block and cache it.
    num_tokens = block_size * 1
    req = make_request("0", list(range(num_tokens)))
-    computed_blocks = manager.get_computed_blocks(req)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req, num_tokens, computed_blocks)
    assert len(blocks) == 1

@@ -253,8 +267,9 @@ def test_hash_block_correct_reuse():
    # Allocate a new block that's not full, make sure hash info on the
    # block is cleared.
    req = make_request("1", list(range(num_tokens - 1)))
-    computed_blocks = manager.get_computed_blocks(req)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req, num_tokens - 1, computed_blocks)
    assert len(blocks) == 1

@@ -279,16 +294,18 @@ def test_computed_blocks_not_evicted():
    # Allocate a block and cache it.
    num_tokens = block_size * 1
    req0 = make_request("0", list(range(num_tokens)))
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req0, num_tokens, computed_blocks)
    assert len(blocks) == 1
    assert blocks[0].block_id == 0

    # Allocate another block.
    req1 = make_request("1", list(range(num_tokens, num_tokens * 2)))
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req1, num_tokens, computed_blocks)
    assert len(blocks) == 1
    assert blocks[0].block_id == 1
@@ -300,9 +317,10 @@ def test_computed_blocks_not_evicted():
    # Now if we have a cache hit on the first block, we should evict the second
    # cached block rather than the first one.
    req2 = make_request("2", list(range(num_tokens * 2)))
-    computed_blocks = manager.get_computed_blocks(req2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert len(computed_blocks) == 1
    assert computed_blocks[0].block_id == 0
+    assert num_computed_tokens == block_size

    blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens,
                                    computed_blocks)
@@ -326,8 +344,9 @@ def test_basic_prefix_caching_disabled():

    req1 = make_request("1", list(range(10)))  # 2 blocks and some more

-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req1, 10, computed_blocks)
    assert len(blocks) == 3

@@ -336,15 +355,17 @@ def test_basic_prefix_caching_disabled():

    # No caching.
    req2 = make_request("2", list(range(16)))  # shared prefix
-    computed_blocks = manager.get_computed_blocks(req2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req2, 16, computed_blocks)
    assert len(blocks) == 4

    # New requests should not have any blocks.
    req3 = make_request("3", list(range(4)))
-    computed_blocks = manager.get_computed_blocks(req3)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    blocks = manager.allocate_slots(req3, 4, computed_blocks)
    assert not blocks

@@ -366,8 +387,9 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int):
    num_preallocated_blocks = cdiv(num_preallocate_tokens, block_size)

    req = make_request("0", list(range(block_size * 30)))
-    computed_blocks = manager.get_computed_blocks(req)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req)
    assert not computed_blocks
+    assert num_computed_tokens == 0
    # Just ask for 1 block.
    blocks = manager.allocate_slots(req, block_size, computed_blocks)
    req.num_computed_tokens = block_size
@@ -464,14 +486,15 @@ def test_mm_prefix_caching():
                        all_token_ids,
                        mm_positions=mm_positions,
                        mm_hashes=mm_hashes)
-    computed_blocks = manager.get_computed_blocks(req0)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)

    # Completed block should have hashes with extra keys.
    assert not computed_blocks
+    assert num_computed_tokens == 0
    assert len(req0.kv_block_hashes) == 3
-    assert req0.kv_block_hashes[0].extra_keys == (("aaa", 0), )
-    assert req0.kv_block_hashes[1].extra_keys == (("aaa", 5), ("bbb", 0))
-    assert req0.kv_block_hashes[2].extra_keys == (("bbb", 2), )
+    assert req0.kv_block_hashes[0].extra_keys == ("aaa", )
+    assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb")
+    assert req0.kv_block_hashes[2].extra_keys == ("bbb", )

    blocks = manager.allocate_slots(req0, 59, computed_blocks)
    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
@@ -485,7 +508,7 @@ def test_mm_prefix_caching():

    # The just completed block should have hashes with extra keys.
    assert len(req0.kv_block_hashes) == 4
-    assert req0.kv_block_hashes[3].extra_keys == (("ccc", 0), )
+    assert req0.kv_block_hashes[3].extra_keys == ("ccc", )

    # Cache hit.
    unique_token_ids = [-1] * 7 + [200] * 5
@@ -498,5 +521,138 @@ def test_mm_prefix_caching():
                        all_token_ids,
                        mm_positions=mm_positions,
                        mm_hashes=mm_hashes)
-    computed_blocks = manager.get_computed_blocks(req1)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
+    assert len(computed_blocks) == 3
+    assert num_computed_tokens == 3 * 16
+
+
+def test_prefill_not_enough_free_blocks_with_computed_blocks():
+    """
+    This is a unit test that checks the correctness of allocate_slots
+    when there are not enough free blocks. Specifically, when a request
+    has computed blocks but cannot be allocated due to not enough free blocks,
+    the computed blocks should not be touched.
+    """
+    block_size = 16
+    manager = KVCacheManager(
+        block_size=block_size,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=0,
+    )
+    # Complete 3 blocks (48 tokens)
+    # | Common-0 | Common-1 | Common-2 | ... |
+    common_token_ids = [i for i in range(3) for _ in range(16)]
+    req0 = make_request("0", common_token_ids)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req0)
+    assert not computed_blocks
+    assert num_computed_tokens == 0
+    manager.allocate_slots(req0, 48, computed_blocks)
+    block_part0 = manager.req_to_blocks[req0.request_id]
+
+    # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
+    req1 = make_request("1", common_token_ids * 2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1)
+    assert computed_blocks == block_part0
+    assert num_computed_tokens == 3 * 16
+    manager.allocate_slots(req1, 48, computed_blocks)
+    block_part1 = manager.req_to_blocks[req1.request_id]
+    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
+    # | Req1-5(F)| ... |
+    manager.free(req1)
+    assert {block.ref_cnt for block in block_part1[:3]} == {1}
+    assert {block.ref_cnt for block in block_part1[3:]} == {0}
+
+    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
+    # | Req1-5(F)| Req2-0   | Req2-1   | ... |
+    req2 = make_request("2", [7] * block_size * 2)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2)
+    assert not computed_blocks
+    assert num_computed_tokens == 0
+    manager.allocate_slots(req2, block_size * 2, computed_blocks)
+
+    # Req3 is Req1 + 3 new blocks, so the first 6 blocks are computed,
+    # but it cannot be allocated due to insufficient free blocks (2).
+    # In this case, the ref_cnt of the computed blocks should not be changed.
+    assert manager.free_block_queue.num_free_blocks == 5
+    req3 = make_request("3", common_token_ids * 3)
+    computed_blocks, num_computed_tokens = manager.get_computed_blocks(req3)
+    assert computed_blocks == block_part1
+    assert num_computed_tokens == 6 * 16
+    # Req3 cannot be allocated.
+    assert manager.allocate_slots(req3, 48, computed_blocks) is None
+    # Blocks 0-2 are still held by Req0 (Req1 was freed earlier).
+    assert {block.ref_cnt for block in block_part1[:3]} == {1}
+    # Block 3-5 are free.
+    assert {block.ref_cnt for block in block_part1[3:]} == {0}
+
+
+def test_reset_prefix_cache():
+    manager = KVCacheManager(
+        block_size=16,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=0,
+    )
+
+    full_block_token_ids = [i for i in range(3) for _ in range(16)]
+    unique_token_ids = [3] * 7
+    all_token_ids = full_block_token_ids + unique_token_ids
+    req0 = make_request("0", all_token_ids)
+    blocks = manager.allocate_slots(req0, 55, [])
+    assert [b.block_id for b in blocks] == [0, 1, 2, 3]
+
+    unique_token_ids = [4] * 7
+    all_token_ids = full_block_token_ids + unique_token_ids
+    req1 = make_request("1", all_token_ids)
+    computed_blocks, _ = manager.get_computed_blocks(req1)
+    assert len(req1.kv_block_hashes) == 3
    assert len(computed_blocks) == 3
+    blocks = manager.allocate_slots(req1, 7, computed_blocks)
+    assert [b.block_id for b in blocks] == [4]
+
+    # Failed to reset prefix cache because some blocks are not freed yet.
+    assert not manager.reset_prefix_cache()
+    assert manager.cached_block_hash_to_block
+
+    # Free the blocks.
+    manager.free(req0)
+    manager.free(req1)
+
+    assert manager.reset_prefix_cache()
+    assert not manager.cached_block_hash_to_block
+    assert all([blk.block_hash is None for blk in manager.block_pool])
+
+
+def test_uncache_blocks():
+    manager = KVCacheManager(
+        block_size=16,
+        num_gpu_blocks=10,
+        max_model_len=8192,
+        sliding_window=None,
+        enable_caching=True,
+        num_preallocate_tokens=0,
+    )
+
+    req0 = make_request("0", list(range(30)))
+    blocks = manager.allocate_slots(req0, 30, [])
+    assert [b.block_id for b in blocks] == [0, 1]
+    assert len(manager.cached_block_hash_to_block) == 1
+
+    req0.num_computed_tokens = 30
+
+    # Simulate speculative tokens.
+    for _ in range(5):
+        req0.append_output_token_ids(8)
+    manager.append_slots(req0, 5)
+    assert len(manager.cached_block_hash_to_block) == 2
+
+    # After sampling, assuming only 1 token is accepted.
+    req0.num_computed_tokens = 31
+    num_uncached_blocks = manager.uncache_blocks(req0)
+    assert num_uncached_blocks == 1
+    assert len(manager.cached_block_hash_to_block) == 1
--- a/tests/v1/e2e/__init__.py
+++ b/tests/v1/e2e/__init__.py
--- a/tests/v1/e2e/test_cascade_attention.py
+++ b/tests/v1/e2e/test_cascade_attention.py
+from vllm import LLM, SamplingParams
+
+
+def test_cascade_attention(example_system_message, monkeypatch):
+    prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct")
+        sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
+
+        # No cascade attention.
+        single_prompt = [example_system_message + prompt]
+        responses = llm.generate(single_prompt, sampling_params)
+        ref_output = responses[0].outputs[0].text
+
+        # (Probably) Use cascade attention.
+        prompts = [example_system_message + prompt] * 64
+        responses = llm.generate(prompts, sampling_params)
+        for response in responses:
+            assert response.outputs[0].text == ref_output
--- a/tests/v1/engine/test_async_llm.py
+++ b/tests/v1/engine/test_async_llm.py
 import asyncio
-from typing import Tuple
+from contextlib import ExitStack
+from typing import List, Tuple

 import os
 import pytest
@@ -7,6 +8,7 @@ import pytest
 from vllm import SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.platforms import current_platform
+from vllm.sampling_params import RequestOutputKind
 from vllm.v1.engine.async_llm import AsyncLLM
 from ...utils import models_path_prefix

@@ -15,32 +17,44 @@ if not current_platform.is_cuda():
                allow_module_level=True)

 ENGINE_ARGS = AsyncEngineArgs(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"),
+                              enforce_eager=True,
                              disable_log_requests=True)


 async def generate(engine: AsyncLLM, request_id: str,
+                   output_kind: RequestOutputKind,
                    max_tokens: int) -> Tuple[int, str]:
     count = 0
-    async for _ in engine.generate(request_id=request_id,
-                                   prompt="Hello my name is Robert and",
-                                   sampling_params=SamplingParams(
-                                       max_tokens=max_tokens, temperature=0)):
+    sampling_params = SamplingParams(max_tokens=max_tokens,
+                                     output_kind=output_kind,
+                                     temperature=0)
+    async for out in engine.generate(request_id=request_id,
+                                     prompt="Hello my name is Robert and",
+                                     sampling_params=sampling_params):
+        # DELTA outputs are summed; any other kind overwrites the count.
+        num_tokens = len(out.outputs[0].token_ids)
+        if output_kind == RequestOutputKind.DELTA:
+            count += num_tokens
+        else:
+            count = num_tokens

-        count += 1
         await asyncio.sleep(0.)

     return count, request_id


+@pytest.mark.parametrize(
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
 @pytest.mark.asyncio
-async def test_load(monkeypatch):
+async def test_load(monkeypatch, output_kind: RequestOutputKind):
    # TODO(rickyx): Remove monkeypatch once we have a better way to test V1
    # so that in the future when we switch, we don't have to change all the
    # tests.
-    with monkeypatch.context() as m:
+    with monkeypatch.context() as m, ExitStack() as after:
        m.setenv("VLLM_USE_V1", "1")

        engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+        after.callback(engine.shutdown)

        NUM_REQUESTS = 10000
        NUM_EXPECTED_TOKENS = 10
@@ -52,20 +66,72 @@ async def test_load(monkeypatch):
        for request_id in request_ids:
            tasks.append(
                asyncio.create_task(
-                    generate(engine, request_id, NUM_EXPECTED_TOKENS)))
+                    generate(engine, request_id, output_kind,
+                             NUM_EXPECTED_TOKENS)))

        # Confirm that we got all the EXPECTED tokens from the requests.
-        failed_request_id = None
-        tokens = None
-        for task in tasks:
+        done, pending = await asyncio.wait(tasks,
+                                           return_when=asyncio.FIRST_EXCEPTION)
+        for task in pending:
+            task.cancel()
+        for task in done:
            num_generated_tokens, request_id = await task
-            if (num_generated_tokens != NUM_EXPECTED_TOKENS
-                    and failed_request_id is None):
-                failed_request_id = request_id
-                tokens = num_generated_tokens
+            assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+                f"{request_id} generated {num_generated_tokens} but "
+                f"expected {NUM_EXPECTED_TOKENS}")

-        assert failed_request_id is None, (
-            f"{failed_request_id} generated {tokens} but "
-            f"expected {NUM_EXPECTED_TOKENS}")
+        assert not engine.output_processor.has_unfinished_requests()

-        engine.shutdown()
+
+@pytest.mark.parametrize(
+    "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
+@pytest.mark.asyncio
+async def test_abort(monkeypatch, output_kind: RequestOutputKind):
+    """Cancel every tenth request task and check the others still finish."""
+    with monkeypatch.context() as m, ExitStack() as after:
+        m.setenv("VLLM_USE_V1", "1")
+
+        engine = AsyncLLM.from_engine_args(ENGINE_ARGS)
+        after.callback(engine.shutdown)
+
+        NUM_REQUESTS = 100
+        NUM_EXPECTED_TOKENS = 100
+        REQUEST_IDS_TO_ABORT = range(1, 100, 10)
+
+        request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)]
+
+        # Create concurrent requests.
+        tasks: List[asyncio.Task] = []
+        for request_id in request_ids:
+            tasks.append(
+                asyncio.create_task(
+                    generate(engine, request_id, output_kind,
+                             NUM_EXPECTED_TOKENS)))
+
+        # Cancel a subset of tasks, as the API server does on disconnect.
+        for idx in REQUEST_IDS_TO_ABORT:
+            tasks[idx].cancel()
+            await asyncio.sleep(0.1)
+
+        # Confirm the other requests are okay.
+        for idx, task in enumerate(tasks):
+            # Confirm that it was actually canceled.
+            if idx in REQUEST_IDS_TO_ABORT:
+                with pytest.raises(asyncio.CancelledError):
+                    await task
+            else:
+                # Otherwise, make sure the request was not impacted.
+                num_generated_tokens, request_id = await task
+                assert num_generated_tokens == NUM_EXPECTED_TOKENS, (
+                    f"{request_id} generated {num_generated_tokens} but "
+                    f"expected {NUM_EXPECTED_TOKENS}")
+
+        assert not engine.output_processor.has_unfinished_requests()
+
+        # Confirm a previously aborted request id can be generated again.
+        request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}"
+        task = asyncio.create_task(
+            generate(engine, request_id, output_kind, NUM_EXPECTED_TOKENS))
+        num_generated_tokens, request_id = await task
+        assert num_generated_tokens == NUM_EXPECTED_TOKENS
+        assert not engine.output_processor.has_unfinished_requests()
--- a/tests/v1/engine/test_engine_core.py
+++ b/tests/v1/engine/test_engine_core.py
@@ -5,13 +5,13 @@ import os
 import pytest
 from transformers import AutoTokenizer

+from tests.utils import fork_new_process_for_each_test
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
-from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core import EngineCore
+from vllm.v1.executor.abstract import Executor
 from ...utils import models_path_prefix

 if not current_platform.is_cuda():
@@ -39,19 +39,18 @@ def make_request() -> EngineCoreRequest:
    )


+@fork_new_process_for_each_test
 def test_engine_core(monkeypatch):

    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        """Setup the EngineCore."""
        engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
-        executor_class = AsyncLLM._get_executor_cls(vllm_config)
+        vllm_config = engine_args.create_engine_config()
+        executor_class = Executor.get_class(vllm_config)

        engine_core = EngineCore(vllm_config=vllm_config,
-                                 executor_class=executor_class,
-                                 usage_context=UsageContext.UNKNOWN_CONTEXT)
+                                 executor_class=executor_class)
        """Test basic request lifecycle."""

        # First request.
@@ -83,7 +82,7 @@ def test_engine_core(monkeypatch):
        assert len(engine_core.scheduler.running) == 4

        # Loop through until they are all done.
-        while len(engine_core.step()) > 0:
+        while len(engine_core.step().outputs) > 0:
            pass

        assert len(engine_core.scheduler.waiting) == 0
@@ -143,23 +142,22 @@ def test_engine_core(monkeypatch):
        assert len(engine_core.scheduler.running) == 0


+@fork_new_process_for_each_test
 def test_engine_core_advanced_sampling(monkeypatch):
    """
    A basic end-to-end test to verify that the engine functions correctly 
-    when additional sampling parameters, such as min_tokens and 
+    when additional sampling parameters, such as top_p, min_tokens, and 
    presence_penalty, are set.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
        """Setup the EngineCore."""
        engine_args = EngineArgs(model=MODEL_NAME)
-        vllm_config = engine_args.create_engine_config(
-            usage_context=UsageContext.UNKNOWN_CONTEXT)
-        executor_class = AsyncLLM._get_executor_cls(vllm_config)
+        vllm_config = engine_args.create_engine_config()
+        executor_class = Executor.get_class(vllm_config)

        engine_core = EngineCore(vllm_config=vllm_config,
-                                 executor_class=executor_class,
-                                 usage_context=UsageContext.UNKNOWN_CONTEXT)
+                                 executor_class=executor_class)
        """Test basic request lifecycle."""
        # First request.
        request: EngineCoreRequest = make_request()
@@ -171,11 +169,23 @@ def test_engine_core_advanced_sampling(monkeypatch):
            stop_token_ids=[1001, 1002],
        )
        engine_core.add_request(request)
-        assert len(engine_core.scheduler.waiting) == 1
-        assert len(engine_core.scheduler.running) == 0
-        # Loop through until they are all done.
-        while len(engine_core.step()) > 0:
-            pass

-        assert len(engine_core.scheduler.waiting) == 0
-        assert len(engine_core.scheduler.running) == 0
+        def _check_engine_state():
+            assert len(engine_core.scheduler.waiting) == 1
+            assert len(engine_core.scheduler.running) == 0
+            # Loop through until they are all done.
+            while len(engine_core.step().outputs) > 0:
+                pass
+            assert len(engine_core.scheduler.waiting) == 0
+            assert len(engine_core.scheduler.running) == 0
+
+        _check_engine_state()
+
+        # Second request.
+        request2 = make_request()
+        request2.sampling_params = SamplingParams(
+            top_p=0.99,
+            top_k=50,
+        )
+        engine_core.add_request(request2)
+        _check_engine_state()
--- a/tests/v1/engine/test_engine_core_client.py
+++ b/tests/v1/engine/test_engine_core_client.py
@@ -7,13 +7,14 @@ import os
 import pytest
 from transformers import AutoTokenizer

+from tests.utils import fork_new_process_for_each_test
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.platforms import current_platform
 from vllm.usage.usage_lib import UsageContext
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.async_llm import AsyncLLM
 from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.executor.abstract import Executor
 from ...utils import models_path_prefix

 if not current_platform.is_cuda():
@@ -44,7 +45,7 @@ def make_request(params: SamplingParams) -> EngineCoreRequest:
 def loop_until_done(client: EngineCoreClient, outputs: Dict):

    while True:
-        engine_core_outputs = client.get_output()
+        engine_core_outputs = client.get_output().outputs

        if len(engine_core_outputs) == 0:
            break
@@ -62,7 +63,7 @@ def loop_until_done(client: EngineCoreClient, outputs: Dict):
 async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):

     while True:
-        engine_core_outputs = await client.get_output_async()
+        engine_core_outputs = (await client.get_output_async()).outputs

         if len(engine_core_outputs) == 0:
             break
@@ -77,6 +78,7 @@ async def loop_until_done_async(client: EngineCoreClient, outputs: Dict):
            break


+@fork_new_process_for_each_test
 @pytest.mark.parametrize("multiprocessing_mode", [True, False])
 def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):

@@ -86,13 +88,12 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):
        engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3)
        vllm_config = engine_args.create_engine_config(
            UsageContext.UNKNOWN_CONTEXT)
-        executor_class = AsyncLLM._get_executor_cls(vllm_config)
+        executor_class = Executor.get_class(vllm_config)
        client = EngineCoreClient.make_client(
-            vllm_config,
-            executor_class,
-            UsageContext.UNKNOWN_CONTEXT,
            multiprocess_mode=multiprocessing_mode,
            asyncio_mode=False,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
        )

        MAX_TOKENS = 20
@@ -145,10 +146,8 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool):

        client.abort_requests([request.request_id])

-        # Shutdown the client.
-        client.shutdown()
-

+@fork_new_process_for_each_test
 @pytest.mark.asyncio
 async def test_engine_core_client_asyncio(monkeypatch):

@@ -158,13 +157,12 @@ async def test_engine_core_client_asyncio(monkeypatch):
        engine_args = EngineArgs(model=MODEL_NAME)
        vllm_config = engine_args.create_engine_config(
            usage_context=UsageContext.UNKNOWN_CONTEXT)
-        executor_class = AsyncLLM._get_executor_cls(vllm_config)
+        executor_class = Executor.get_class(vllm_config)
        client = EngineCoreClient.make_client(
-            vllm_config,
-            executor_class,
-            UsageContext.UNKNOWN_CONTEXT,
            multiprocess_mode=True,
            asyncio_mode=True,
+            vllm_config=vllm_config,
+            executor_class=executor_class,
        )

        MAX_TOKENS = 20
@@ -204,6 +202,3 @@ async def test_engine_core_client_asyncio(monkeypatch):
            else:
                assert len(outputs[req_id]) == MAX_TOKENS, (
                    f"{len(outputs[req_id])=}, {MAX_TOKENS=}")
-
-        # Shutdown the client.
-        client.shutdown()
--- a/tests/v1/engine/test_detokenizer.py
+++ b/tests/v1/engine/test_detokenizer.py
 from typing import List

+import os
 import pytest
 from transformers import AutoTokenizer

-import os
-from vllm.sampling_params import RequestOutputKind
-from vllm.v1.engine import EngineCoreOutput
-from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest
+from vllm.engine.arg_utils import EngineArgs
+from vllm.sampling_params import RequestOutputKind, SamplingParams
+from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
+from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest
+from vllm.v1.engine.output_processor import OutputProcessor
 from ...utils import models_path_prefix

 TOKENIZER_NAME = os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3")
+VLLM_CONFIG = EngineArgs(model=TOKENIZER_NAME).create_engine_config()
+TOKENIZER_GROUP = init_tokenizer_from_configs(VLLM_CONFIG.model_config,
+                                              VLLM_CONFIG.scheduler_config,
+                                              VLLM_CONFIG.parallel_config,
+                                              VLLM_CONFIG.lora_config)
+
 tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

 FULL_STRINGS = [
@@ -68,28 +76,34 @@ class MockEngineCore:
    "request_output_kind",
    [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
 def test_incremental_detokenization(request_output_kind: RequestOutputKind):
-    detokenizer = Detokenizer(TOKENIZER_NAME)
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
    engine_core = MockEngineCore(GENERATION_TOKENS)

    # Make N requests.
    requests = [
-        DetokenizerRequest(
-            request_id=f"request-{idx}",
-            prompt=prompt,
-            prompt_token_ids=prompt_tokens,
-            skip_special_tokens=False,
-            spaces_between_special_tokens=False,
-            output_kind=request_output_kind,
-            stop=[],
-            include_stop_str_in_output=False,
-        ) for idx, (
+        EngineCoreRequest(request_id=f"request-{idx}",
+                          prompt=prompt,
+                          prompt_token_ids=prompt_tokens,
+                          arrival_time=0,
+                          mm_inputs=None,
+                          mm_hashes=None,
+                          mm_placeholders=None,
+                          eos_token_id=None,
+                          lora_request=None,
+                          sampling_params=SamplingParams(
+                              skip_special_tokens=False,
+                              spaces_between_special_tokens=False,
+                              output_kind=request_output_kind,
+                              stop=[],
+                              include_stop_str_in_output=False))
+        for idx, (
            prompt,
            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
    ]

    # Add requests to the detokenizer.
    for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)

    gen_strings = {}
    gen_tokens = {}
@@ -100,7 +114,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
            break

        # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs, )
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
        assert len(requests_to_abort) == 0

        # Update tracking.
@@ -124,34 +140,41 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind):
        assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
        assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"

-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()


 @pytest.mark.parametrize("include_stop_str_in_output", [True, False])
 def test_stop_string(include_stop_str_in_output: bool):
-    detokenizer = Detokenizer(TOKENIZER_NAME)
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=False)
    engine_core = MockEngineCore(GENERATION_TOKENS)

    # Make N requests.
    requests = [
-        DetokenizerRequest(
+        EngineCoreRequest(
            request_id=f"request-{idx}",
            prompt=prompt,
            prompt_token_ids=prompt_tokens,
-            skip_special_tokens=False,
-            spaces_between_special_tokens=False,
-            output_kind=RequestOutputKind.DELTA,
-            stop=STOP_STRINGS,
-            include_stop_str_in_output=include_stop_str_in_output,
-        ) for idx, (
-            prompt,
-            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
+            arrival_time=0,
+            mm_inputs=None,
+            mm_hashes=None,
+            mm_placeholders=None,
+            eos_token_id=None,
+            lora_request=None,
+            sampling_params=SamplingParams(
+                skip_special_tokens=False,
+                spaces_between_special_tokens=False,
+                output_kind=RequestOutputKind.DELTA,
+                stop=STOP_STRINGS,
+                include_stop_str_in_output=include_stop_str_in_output,
+            )) for idx, (
+                prompt,
+                prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
    ]

    # Add requests to the detokenizer.
    for request in requests:
-        detokenizer.add_request(request)
+        output_processor.add_request(request)

    gen_strings = {}
    aborted = []
@@ -162,7 +185,9 @@ def test_stop_string(include_stop_str_in_output: bool):
            break

        # Step the Detokenizer.
-        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        processed_outputs = output_processor.process_outputs(outputs)
+        request_outputs = processed_outputs.request_outputs
+        requests_to_abort = processed_outputs.reqs_to_abort
        for request_output in request_outputs:
            # If aborted, we should not get a request output.
            assert request_output.request_id not in aborted
@@ -203,5 +228,71 @@ def test_stop_string(include_stop_str_in_output: bool):
            assert gen_str == ref_str_exc_stop, (
                f"{gen_str=}, {ref_str_exc_stop=}")

-    assert detokenizer.get_num_unfinished_requests() == 0
-    assert not detokenizer.has_unfinished_requests()
+    assert output_processor.get_num_unfinished_requests() == 0
+    assert not output_processor.has_unfinished_requests()
+
+
+def test_iteration_stats():
+    """Check per-step prompt/generation token counts in iteration stats."""
+    output_processor = OutputProcessor(TOKENIZER_GROUP, log_stats=True)
+    engine_core = MockEngineCore(GENERATION_TOKENS)
+
+    # Make one request per (prompt, prompt_tokens) pair.
+    requests = [
+        EngineCoreRequest(
+            request_id=f"request-{idx}",
+            prompt=prompt,
+            prompt_token_ids=prompt_tokens,
+            arrival_time=0,
+            mm_inputs=None,
+            mm_hashes=None,
+            mm_placeholders=None,
+            eos_token_id=None,
+            lora_request=None,
+            sampling_params=SamplingParams(),
+        ) for idx, (
+            prompt,
+            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
+    ]
+
+    # Add all requests except one to the OutputProcessor.
+    num_active = len(GENERATION_TOKENS) - 1
+    for request in requests[:num_active]:
+        output_processor.add_request(request)
+    inactive_request = requests[num_active]
+
+    # First iteration: every active request contributes its prompt tokens.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = sum(len(toks) for toks in PROMPT_TOKENS[:num_active])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Add the held-back request: one prefill plus decodes for the others.
+    output_processor.add_request(inactive_request)
+    num_active += 1
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+    total_prompt_tokens = len(PROMPT_TOKENS[num_active - 1])
+
+    assert iteration_stats.num_prompt_tokens == total_prompt_tokens
+    assert iteration_stats.num_generation_tokens == num_active
+
+    # Just decodes in this step.
+    outputs = engine_core.get_outputs()[:num_active]
+    processed_outputs = output_processor.process_outputs(outputs)
+    iteration_stats = processed_outputs.iteration_stats
+
+    assert iteration_stats.num_prompt_tokens == 0
+    assert iteration_stats.num_generation_tokens == num_active
--- a/tests/v1/test_stats.py
+++ b/tests/v1/test_stats.py
+import pytest
+
+from vllm.sampling_params import SamplingParams
+from vllm.v1.stats.common import RequestStats, RequestStatsUpdate
+
+
+def make_update(
+    request_id: str,
+    update_type: RequestStatsUpdate.Type,
+    monotonic_ts_s: float,
+    **kwargs,
+):
+    """Build a ``RequestStatsUpdate``, filling in type-specific fields.
+
+    For update types that carry extra fields, placeholder values are
+    supplied for any field the caller did not pass through ``kwargs``;
+    values given by the caller always take precedence.
+    """
+    update_types = RequestStatsUpdate.Type
+    if update_type == update_types.INPUT_PROCESSED:
+        placeholders = {
+            "sampling_params": SamplingParams(n=1),
+            "num_prompt_tokens": 10,
+        }
+    elif update_type == update_types.PREFILLING:
+        placeholders = {"num_computed_tokens": 10, "num_cached_tokens": 10}
+    elif update_type == update_types.DETOKENIZED:
+        placeholders = {"num_new_tokens": 10}
+    elif update_type == update_types.FINISHED:
+        placeholders = {"finish_reason": "test_reason"}
+    else:
+        placeholders = {}
+
+    # Later keys win in a dict merge, so explicit kwargs override placeholders.
+    fields = {**placeholders, **kwargs}
+    return RequestStatsUpdate(
+        request_id=request_id,
+        type=update_type,
+        monotonic_ts_s=monotonic_ts_s,
+        **fields,
+    )
+
+
+def test_invalid_request_update():
+    """Omitting any type-specific required field must raise ``ValueError``."""
+    update_types = RequestStatsUpdate.Type
+    required_by_type = {
+        update_types.INPUT_PROCESSED: [
+            "sampling_params",
+            "num_prompt_tokens",
+        ],
+        update_types.PREFILLING: [
+            "num_computed_tokens",
+            "num_cached_tokens",
+        ],
+        update_types.DETOKENIZED: ["num_new_tokens"],
+        update_types.FINISHED: ["finish_reason"],
+    }
+
+    for update_type in update_types:
+        # Types missing from the table have no extra required fields, so
+        # the inner loop does nothing for them.
+        field_names = required_by_type.get(update_type, [])
+
+        for omitted in field_names:
+            # Provide a placeholder for every required field except one.
+            partial_kwargs = {
+                name: object()
+                for name in field_names if name != omitted
+            }
+            with pytest.raises(ValueError):
+                RequestStatsUpdate(
+                    request_id="test_request",
+                    type=update_type,
+                    **partial_kwargs,
+                )
+
+
+def test_invalid_request_update_transition():
+    """Exercise ``check_valid_update`` over all type pairs and a stale clock.
+
+    Every (source, destination) pair of update types is tried: pairs listed
+    in ``_VALID_TRANSITIONS`` must be accepted, all others must raise
+    ``AssertionError``.  An update timestamped before the last update must
+    also raise ``AssertionError``.
+    """
+    update_types = RequestStatsUpdate.Type
+
+    for src in update_types:
+        allowed = RequestStatsUpdate._VALID_TRANSITIONS[src]
+        for dst in update_types:
+            if dst in allowed:
+                # Legal transition: the check must pass without raising.
+                RequestStatsUpdate.check_valid_update(
+                    make_update(
+                        request_id="test_request",
+                        update_type=dst,
+                        monotonic_ts_s=1,
+                    ),
+                    last_update_type=src,
+                    last_updated_ts_s=0,
+                )
+                continue
+
+            # Illegal transition: the check must reject it.
+            with pytest.raises(AssertionError):
+                RequestStatsUpdate.check_valid_update(
+                    make_update(
+                        update_type=dst,
+                        request_id="test_request",
+                        monotonic_ts_s=1,
+                    ),
+                    last_update_type=src,
+                    last_updated_ts_s=0,
+                )
+
+    # An update at ts=1 following a last update at ts=2 goes back in time.
+    with pytest.raises(AssertionError):
+        RequestStatsUpdate.check_valid_update(
+            make_update(
+                request_id="test_request",
+                update_type=RequestStatsUpdate.Type.ARRIVED,
+                monotonic_ts_s=1,
+            ),
+            last_update_type=None,
+            last_updated_ts_s=2,
+        )
+
+
+def test_lifecycle_updates():
+    """Drive one ``RequestStats`` through a complete request lifecycle.
+
+    The request arrives, has its input processed, is queued, prefilled,
+    decoded/detokenized twice, preempted, resumed, decoded/detokenized once
+    more and finally finished.  After each update the recorded timestamps,
+    counters and derived latencies are checked.
+    """
+    request_id = "test_request"
+    stats = RequestStats(request_id=request_id)
+
+    # Test the below scenario:
+    arrived_ts = 0
+    input_processed_ts = 1
+    queued_ts = 2
+    prefilling_ts = 3
+    decoded_ts = 5
+    detokenized_ts = 6
+    decoded_2_ts = 7
+    detokenized_2_ts = 8
+    preempted_ts = 9
+    resumed_ts = 10
+    decoded_3_ts = 11
+    detokenized_3_ts = 12
+    finished_ts = 13
+
+    # Test ARRIVED
+    arrived_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.ARRIVED,
+        monotonic_ts_s=arrived_ts,
+    )
+    stats.update_from(arrived_update)
+    assert stats.arrival_ts_s == arrived_ts
+    assert stats.last_updated_ts_s == arrived_ts
+
+    # Test INPUT_PROCESSED
+    sampling_params = SamplingParams(n=1)
+    input_processed_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.INPUT_PROCESSED,
+        monotonic_ts_s=input_processed_ts,
+        sampling_params=sampling_params,
+        num_prompt_tokens=6,
+    )
+    stats.update_from(input_processed_update)
+    assert stats.input_processor_end_ts_s == input_processed_ts
+    assert stats.last_updated_ts_s == input_processed_ts
+    assert stats.num_prompt_tokens == 6
+    assert stats.sampling_params == sampling_params
+
+    # Nothing has been scheduled or generated yet.
+    assert stats.first_token_ts_s is None
+    assert stats.prefill_ts_s is None
+
+    # Test QUEUED
+    queued_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.QUEUED,
+        monotonic_ts_s=queued_ts,
+    )
+    stats.update_from(queued_update)
+    assert stats.queued_ts_s == queued_ts
+    assert stats.last_updated_ts_s == queued_ts
+
+    # Test PREFILLING
+    prefilling_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.PREFILLING,
+        monotonic_ts_s=prefilling_ts,
+        num_computed_tokens=3,
+        num_cached_tokens=1,
+    )
+    stats.update_from(prefilling_update)
+    assert stats.prefill_ts_s == prefilling_ts
+    assert stats.num_computed_tokens == 3
+    assert stats.num_cached_tokens == 1
+    assert stats.queue_duration_s == prefilling_ts - queued_ts
+
+    # Test DECODING
+    decoded_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DECODING,
+        monotonic_ts_s=decoded_ts,
+    )
+    stats.update_from(decoded_update)
+    assert stats.last_updated_ts_s == decoded_ts
+
+    # Test DETOKENIZED
+    detokenized_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DETOKENIZED,
+        monotonic_ts_s=detokenized_ts,
+        num_new_tokens=1,
+    )
+    stats.update_from(detokenized_update)
+    assert stats.last_updated_ts_s == detokenized_ts
+    assert stats.num_output_tokens == 1
+    # Since arrival
+    assert stats.first_token_latency_s == detokenized_ts - arrived_ts
+    # Since first scheduled
+    assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
+
+    # Test another DECODING and DETOKENIZED should
+    # yield correct inter token latency
+    decoded_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DECODING,
+        monotonic_ts_s=decoded_2_ts,
+    )
+    stats.update_from(decoded_update)
+
+    detokenized_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DETOKENIZED,
+        monotonic_ts_s=detokenized_2_ts,
+        num_new_tokens=1,
+    )
+    stats.update_from(detokenized_update)
+    assert stats.output_token_latency_s_lst == [
+        detokenized_2_ts - detokenized_ts,
+    ]
+    assert stats.num_output_tokens == 2
+
+    # Test PREEMPTED
+    preempted_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.PREEMPTED,
+        monotonic_ts_s=preempted_ts,
+    )
+    stats.update_from(preempted_update)
+    assert stats.last_updated_ts_s == preempted_ts
+    assert stats.preempted_ts_s_lst == [preempted_ts]
+    # States should be reset
+    assert stats.num_computed_tokens == 0
+    assert stats.num_cached_tokens == 0
+    # These states should not be reset
+    assert stats.num_output_tokens == 2
+    assert stats.output_token_latency_s_lst == [
+        detokenized_2_ts - detokenized_ts,
+    ]
+    # Same definition as asserted after the first DETOKENIZED update; the
+    # expected value is spelled the same way so the check does not rely on
+    # two unrelated differences happening to be equal.
+    assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
+    assert stats.num_prompt_tokens == 6
+    assert stats.prefill_start_ts_s_lst == [prefilling_ts]
+
+    # Test resumed
+    resumed_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.PREFILLING,
+        monotonic_ts_s=resumed_ts,
+        num_computed_tokens=6,
+        num_cached_tokens=2,
+    )
+    stats.update_from(resumed_update)
+    # prefill timestamp should not be updated since it's a resumed prefill
+    assert stats.prefill_ts_s == prefilling_ts
+    assert stats.num_computed_tokens == 6
+    assert stats.num_cached_tokens == 2
+    assert stats.prefill_start_ts_s_lst == [
+        prefilling_ts,
+        resumed_ts,
+    ]
+    assert stats.last_updated_ts_s == resumed_ts
+
+    # Test another DECODED/DETOKENIZED should yield correct first token latency.
+    decoded_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DECODING,
+        monotonic_ts_s=decoded_3_ts,
+    )
+    detokenized_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.DETOKENIZED,
+        monotonic_ts_s=detokenized_3_ts,
+        num_new_tokens=1,
+    )
+    stats.update_from(decoded_update)
+    stats.update_from(detokenized_update)
+    # The first-token timestamp stays at the first DETOKENIZED update, and
+    # the latency is that timestamp measured from arrival.  Checking them
+    # separately keeps the test valid even if arrived_ts were non-zero.
+    assert stats.first_token_ts_s == detokenized_ts
+    assert stats.first_token_latency_s == detokenized_ts - arrived_ts
+    assert stats.num_output_tokens == 3
+    assert stats.output_token_latency_s_lst == [
+        detokenized_2_ts - detokenized_ts,
+        detokenized_3_ts - detokenized_2_ts,
+    ]
+
+    # Test FINISHED
+    finished_update = RequestStatsUpdate(
+        request_id=request_id,
+        type=RequestStatsUpdate.Type.FINISHED,
+        monotonic_ts_s=finished_ts,
+        finish_reason="test_reason",
+    )
+    stats.update_from(finished_update)
+    assert stats.last_updated_ts_s == finished_ts
+    assert stats.e2e_latency_s == finished_ts - arrived_ts
+    assert stats.inference_latency_s == finished_ts - prefilling_ts
+    assert stats.prefill_latency_s == detokenized_ts - prefilling_ts
+    assert stats.decode_latency_s == finished_ts - detokenized_ts
+    assert stats.first_token_latency_s == detokenized_ts - arrived_ts
+    assert stats.queue_duration_s == prefilling_ts - queued_ts
+    assert stats.is_finished
+    assert stats.finish_reason == "test_reason"
+
+    # TODO(rickyx): Add model forward/execute time.
+    assert stats.model_forward_duration_s == 0.0
+    assert stats.model_execute_duration_s == 0.0
--- a/tests/v1/test_utils.py
+++ b/tests/v1/test_utils.py
+from typing import List
+
+import torch
+
+from vllm.v1.utils import bind_kv_cache
+
+
+def test_bind_kv_cache():
+    """Check `bind_kv_cache` on four consecutively numbered layers.
+
+    After binding, every attention layer must expose the very tensor
+    object registered under its name as `kv_cache[0]`, and the runner
+    list must hold the same tensor objects in layer order.
+    """
+    from vllm.attention import Attention
+
+    layer_names = [
+        'layers.0.self_attn',
+        'layers.1.self_attn',
+        'layers.2.self_attn',
+        'layers.3.self_attn',
+    ]
+    ctx = {name: Attention(32, 128, 0.1) for name in layer_names}
+    kv_cache = {name: torch.zeros((1, )) for name in layer_names}
+    runner_kv_caches: List[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+
+    # Identity (not equality) matters: the layer must reference the
+    # exact tensor that was handed to bind_kv_cache.
+    for name in layer_names:
+        assert ctx[name].kv_cache[0] is kv_cache[name]
+
+    # The runner-side list is indexed by position rather than by name.
+    for position, name in enumerate(layer_names):
+        assert runner_kv_caches[position] is kv_cache[name]
+
+
+def test_bind_kv_cache_non_attention():
+    """Check `bind_kv_cache` when layer indices are sparse.
+
+    Only layers 20 and 28 carry attention here, so the runner list is
+    expected to be filled densely (positions 0 and 1) in layer order.
+    """
+    from vllm.attention import Attention
+
+    # example from Jamba PP=2
+    attn_layers = ['model.layers.20.attn', 'model.layers.28.attn']
+    ctx = {name: Attention(32, 128, 0.1) for name in attn_layers}
+    kv_cache = {name: torch.zeros((1, )) for name in attn_layers}
+
+    runner_kv_caches: List[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+
+    # Each layer must reference the exact tensor registered for it.
+    for name in attn_layers:
+        assert ctx[name].kv_cache[0] is kv_cache[name]
+
+    # Runner positions follow layer order, not the raw layer indices.
+    for position, name in enumerate(attn_layers):
+        assert runner_kv_caches[position] is kv_cache[name]
--- a/tests/vllm_test_utils/vllm_test_utils/__init__.py
+++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py
@@ -4,5 +4,6 @@ It does not import any vLLM modules.
 """

 from .blame import BlameResult, blame
+from .monitor import MonitoredValues, monitor

-__all__ = ["blame", "BlameResult"]
+__all__ = ["blame", "BlameResult", "monitor", "MonitoredValues"]
--- a/tests/vllm_test_utils/vllm_test_utils/monitor.py
+++ b/tests/vllm_test_utils/vllm_test_utils/monitor.py
+import contextlib
+import dataclasses
+import sys
+import traceback
+from typing import Callable, Generator, Generic, TypeVar
+
+_T = TypeVar("_T")
+
+
+@dataclasses.dataclass
+class MonitoredValues(Generic[_T]):
+    """Record of the value changes observed by `monitor`.
+
+    `monitor` appends to both lists at the same time, so
+    `trace_stacks[i]` is the stack captured when `values[i]` was
+    recorded.
+    """
+    # Measurements in the order they were recorded.
+    values: list[_T] = dataclasses.field(default_factory=list)
+    # Formatted stack trace captured alongside each recorded value.
+    trace_stacks: list[str] = dataclasses.field(default_factory=list)
+
+
+@contextlib.contextmanager
+def monitor(
+    measure_func: Callable[[],
+                           _T]) -> Generator[MonitoredValues[_T], None, None]:
+    """
+    Trace the function calls to continuously monitor the change of
+    a value.
+
+    While the context is active, `measure_func` is called on every
+    traced line of Python code. The first measurement, and every later
+    measurement that differs from the most recently recorded one, is
+    appended to the yielded `MonitoredValues` together with the
+    formatted call stack at that point.
+
+    Whatever trace function was installed before entering the context
+    (e.g. by a debugger or a coverage tool) is reinstated on exit
+    instead of being discarded.
+
+    Args:
+        measure_func: zero-argument callable returning the current
+            value; successive results are compared with `!=`.
+
+    Yields:
+        The `MonitoredValues` instance that is filled in while the
+        body of the `with` statement runs.
+
+    Usage:
+
+    ```python
+
+    def measure_func():
+        ... # measure the current value
+        return current_value
+
+    with monitor(measure_func) as monitored_values:
+        # do something
+
+        monitored_values.values # all changes of the values
+        monitored_values.trace_stacks # trace stacks of every change
+    ```
+    """
+    monitored_values = MonitoredValues[_T]()
+    # Remember the tracer that was active on entry so that it can be
+    # put back on exit rather than being silently removed.
+    previous_trace = sys.gettrace()
+
+    def _trace_calls(frame, event, arg=None):
+        if event == 'line':
+            # triggered by every line of Python code.
+            # only Python functions will trigger it,
+            # c/cpp functions will not trigger it.
+            try:
+                # Temporarily disable the trace function so that the
+                # measurement itself is not traced.
+                sys.settrace(None)
+                # do a measurement
+                current_value = measure_func()
+                recorded = monitored_values.values
+                # Record only the first value and actual changes.
+                if not recorded or current_value != recorded[-1]:
+                    recorded.append(current_value)
+                    monitored_values.trace_stacks.append("".join(
+                        traceback.format_stack()))
+                # Re-enable the trace function
+                sys.settrace(_trace_calls)
+            except NameError:
+                # modules are deleted during shutdown
+                pass
+        # Returning the tracer keeps line events coming for this frame.
+        return _trace_calls
+
+    try:
+        sys.settrace(_trace_calls)
+        yield monitored_values
+    finally:
+        # Restore the pre-existing tracer (None when there was none).
+        sys.settrace(previous_trace)
--- a/tests/weight_loading/models.txt
+++ b/tests/weight_loading/models.txt
@@ -20,7 +20,7 @@ compressed-tensors, nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test, main
 compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/TinyLlama-1.1B-Chat-v1.0-actorder-group, main
-compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
+#compressed-tensors, mgoin/DeepSeek-Coder-V2-Lite-Instruct-FP8, main
 compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-FP8-Dynamic-testing, main, 90
 compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing, main, 90
 awq, casperhansen/mixtral-instruct-awq, main
@@ -30,4 +30,5 @@ marlin, nm-testing/zephyr-beta-7b-marlin-g128, main
 marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main
 qqq, HandH1998/QQQ-Llama-3-8b-g128, main
 qqq, HandH1998/QQQ-Llama-3-8b, main
-hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
\ No newline at end of file
+hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
+None, mgleize/fairseq2-dummy-Llama-3.2-1B, main
\ No newline at end of file
--- a/tests/weight_loading/run_model_weight_loading_test.sh
+++ b/tests/weight_loading/run_model_weight_loading_test.sh
@@ -3,7 +3,7 @@ SUCCESS=0

 while getopts "c:" OPT; do
  case ${OPT} in
-    c ) 
+    c )
        CONFIG="$OPTARG"
        ;;
    \? )
@@ -18,9 +18,14 @@ IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"

 for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
 do
+    if [[ $MODEL_CONFIG == \#* ]]; then
+        echo "=== SKIPPING MODEL: $MODEL_CONFIG ==="
+        continue
+    fi
+
    LOCAL_SUCCESS=0
    IFS=', ' read -r -a array <<< "$MODEL_CONFIG"
-    
+
    echo "=== RUNNING MODEL: $MODEL_CONFIG ==="

    export QUANTIZATION=${array[0]}

--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
@@ -21,12 +21,13 @@ def test_weight_loading(vllm_runner):
    """
    Test parameter weight loading with tp>1.
    """
-    with vllm_runner(model_name=MODEL_NAME,
-                    #  revision=REVISION,
-                     dtype=torch.half if QUANTIZATION == "gptq" else "auto",
-                     quantization=QUANTIZATION,
-                     max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=2) as model:
+    with vllm_runner(
+            model_name=MODEL_NAME,
+            # revision=REVISION,
+            dtype=torch.half if QUANTIZATION == "gptq" else "auto",
+            quantization=None if QUANTIZATION == "None" else QUANTIZATION,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=2) as model:

        output = model.generate_greedy("Hello world!", max_tokens=20)
        print(output)

--- a/tests/worker/test_model_input.py
+++ b/tests/worker/test_model_input.py
@@ -74,6 +74,7 @@ def test_model_runner_input():
        num_decode_tokens=3,
        slot_mapping=torch.zeros(1),
        multi_modal_placeholder_index_maps=None,
+        enable_kv_scales_calculation=True,
    )
    model_input = ModelInputForGPUWithSamplingMetadata(
        input_tokens=torch.ones(10),
@@ -126,6 +127,7 @@ def test_embedding_model_runner_input():
        num_decode_tokens=3,
        slot_mapping=torch.zeros(1),
        multi_modal_placeholder_index_maps=None,
+        enable_kv_scales_calculation=True,
    )
    model_input = ModelInputForGPUWithPoolingMetadata(
        input_tokens=torch.ones(10),
@@ -177,6 +179,7 @@ def test_multi_step_model_runner_input():
        num_decode_tokens=3,
        slot_mapping=torch.zeros(1),
        multi_modal_placeholder_index_maps=None,
+        enable_kv_scales_calculation=True,
    )
    frozen_model_input = ModelInputForGPUWithSamplingMetadata(
        input_tokens=torch.ones(10),

--- a/tools/actionlint.sh
+++ b/tools/actionlint.sh
-#!/bin/bash
-
-if command -v actionlint &> /dev/null; then
-    actionlint "$@"
-    exit 0
-elif [ -x ./actionlint ]; then
-    ./actionlint "$@"
-    exit 0
-fi
-
-# download a binary to the current directory - v1.7.3
-bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
-./actionlint "$@"
--- a/tools/mypy.sh
+++ b/tools/mypy.sh
 #!/bin/bash

 CI=${1:-0}
-PYTHON_VERSION=${2:-3.9}
+PYTHON_VERSION=${2:-local}

 if [ "$CI" -eq 1 ]; then
    set -e
 fi

+if [ $PYTHON_VERSION == "local" ]; then
+    PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
+fi
+
 run_mypy() {
    echo "Running mypy on $1"
    if [ "$CI" -eq 1 ] && [ -z "$1" ]; then
@@ -23,6 +27,7 @@ run_mypy vllm/compilation
 run_mypy vllm/distributed
 run_mypy vllm/engine
 run_mypy vllm/executor
+run_mypy vllm/inputs
 run_mypy vllm/lora
 run_mypy vllm/model_executor
 run_mypy vllm/plugins