[Core] Don't count preempted tokens in prefix cache hit rate (#25787)

Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>

[Core] Don't count preempted tokens in prefix cache hit rate (#25787)
Signed-off-by: Zhuohan Li <zhuohan123@gmail.com>
8bf8f458 · Zhuohan Li · GitHub · 6f5c0931 · 8bf8f458 · 8bf8f458
Unverified Commit 8bf8f458 authored Sep 26, 2025 by Zhuohan Li Committed by GitHub Sep 27, 2025
4 changed files
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -184,6 +184,14 @@ class KVCacheManager:

        if self.log_stats:
            assert self.prefix_cache_stats is not None
+            if request.num_preemptions > 0:
+                # Previously preempted request
+                self.prefix_cache_stats.preempted_requests += 1
+                self.prefix_cache_stats.preempted_queries += request.num_tokens
+                self.prefix_cache_stats.preempted_hits += (
+                    num_new_computed_tokens)
+            else:
+                # New request
                self.prefix_cache_stats.requests += 1
                self.prefix_cache_stats.queries += request.num_tokens
                self.prefix_cache_stats.hits += num_new_computed_tokens

--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -251,12 +251,17 @@ class Scheduler(SchedulerInterface):
                req_index += 1
                continue

+            # Schedule newly needed KV blocks for the request.
            while True:
                new_blocks = self.kv_cache_manager.allocate_slots(
                    request,
                    num_new_tokens,
                    num_lookahead_tokens=self.num_lookahead_tokens)
-                if new_blocks is None:
+
+                if new_blocks is not None:
+                    # The request can be scheduled.
+                    break
+
                # The request cannot be scheduled.
                # Preempt the lowest-priority request.
                if self.policy == SchedulingPolicy.PRIORITY:
@@ -274,23 +279,20 @@ class Scheduler(SchedulerInterface):
                self.encoder_cache_manager.free(preempted_req)
                preempted_req.status = RequestStatus.PREEMPTED
                preempted_req.num_computed_tokens = 0
+                preempted_req.num_preemptions += 1
                if self.log_stats:
-                        preempted_req.record_event(
-                            EngineCoreEventType.PREEMPTED, scheduled_timestamp)
+                    preempted_req.record_event(EngineCoreEventType.PREEMPTED,
+                                               scheduled_timestamp)

                self.waiting.prepend_request(preempted_req)
                preempted_reqs.append(preempted_req)
                if preempted_req == request:
-                        # No more request to preempt.
-                        can_schedule = False
+                    # No more requests to preempt. Cannot schedule this one.
                    break
-                else:
-                    # The request can be scheduled.
-                    can_schedule = True
-                    break
-            if not can_schedule:
+
+            if new_blocks is None:
+                # Cannot schedule this request.
                break
-            assert new_blocks is not None

            # Schedule the request.
            scheduled_running_reqs.append(request)

--- a/vllm/v1/metrics/stats.py
+++ b/vllm/v1/metrics/stats.py
@@ -17,13 +17,19 @@ class PrefixCacheStats:
    """Stores prefix cache hit statistics."""
    # Whether reset_prefix_cache was invoked.
    reset: bool = False
-    # The number of requests in this update.
+    # The number of new requests in this update.
    requests: int = 0
    # The number of queries in these requests. Note that "queries" here
    # means the number of tokens that were queried from the cache.
    queries: int = 0
    # The number of hits in these requests.
    hits: int = 0
+    # The number of previously preempted requests in this update.
+    preempted_requests: int = 0
+    # The number of queries (tokens looked up) for preempted requests.
+    preempted_queries: int = 0
+    # The number of hits (tokens found in cache) for preempted requests.
+    preempted_hits: int = 0


 @dataclass

--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -115,6 +115,9 @@ class Request:
        # indicates that the output is corrupted
        self.num_nans_in_logits = 0

+        # The number of times this request has been preempted by the scheduler
+        self.num_preemptions = 0
+
        self.block_hashes: list[BlockHash] = []
        self.get_hash_new_full_blocks: Optional[Callable[
            [], list[BlockHash]]] = None