"docs/vscode:/vscode.git/clone" did not exist on "bda3eda82d6c9883a86ea40ece43731714ed5695"
Unverified commit 72ff142c, authored by Mark McLoughlin, committed by GitHub
Browse files

[Core][Metrics] Remove `vllm:prompt_tokens_recomputed` metric (#38709)


Signed-off-by: Mark McLoughlin <markmc@redhat.com>
parent ee3c0c83
...@@ -190,8 +190,7 @@ def test_prompt_token_stats_full_local_cache_recompute(): ...@@ -190,8 +190,7 @@ def test_prompt_token_stats_full_local_cache_recompute():
) )
assert stats.computed == 1 assert stats.computed == 1
assert stats.local_cache_hit == 1000 assert stats.local_cache_hit == 999
assert stats.recomputed_tokens == 1
def test_prompt_token_stats_full_external_transfer_recompute(): def test_prompt_token_stats_full_external_transfer_recompute():
...@@ -201,11 +200,10 @@ def test_prompt_token_stats_full_external_transfer_recompute(): ...@@ -201,11 +200,10 @@ def test_prompt_token_stats_full_external_transfer_recompute():
# Case 6: Full external transfer (999 cached after reduction, 1 recomputed) # Case 6: Full external transfer (999 cached after reduction, 1 recomputed)
stats.update_from_output( stats.update_from_output(
num_cached_tokens=999, num_cached_tokens=999,
num_external_computed_tokens=1000, num_external_computed_tokens=999,
prompt_len=1000, prompt_len=1000,
) )
assert stats.computed == 1 assert stats.computed == 1
assert stats.local_cache_hit == 0 assert stats.local_cache_hit == 0
assert stats.external_kv_transfer == 1000 assert stats.external_kv_transfer == 999
assert stats.recomputed_tokens == 1
...@@ -622,16 +622,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase): ...@@ -622,16 +622,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
counter_prompt_tokens_cached, per_engine_labelvalues counter_prompt_tokens_cached, per_engine_labelvalues
) )
# Recomputed tokens (last token recomputed when entire prompt is cached)
counter_prompt_tokens_recomputed = self._counter_cls(
name="vllm:prompt_tokens_recomputed",
documentation="Number of cached tokens recomputed for forward pass.",
labelnames=labelnames,
)
self.counter_prompt_tokens_recomputed = create_metric_per_engine(
counter_prompt_tokens_recomputed, per_engine_labelvalues
)
counter_generation_tokens = self._counter_cls( counter_generation_tokens = self._counter_cls(
name="vllm:generation_tokens", name="vllm:generation_tokens",
documentation="Number of generation tokens processed.", documentation="Number of generation tokens processed.",
...@@ -1122,7 +1112,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase): ...@@ -1122,7 +1112,6 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
pts.get_by_source(source) pts.get_by_source(source)
) )
self.counter_prompt_tokens_cached[engine_idx].inc(pts.cached_tokens) self.counter_prompt_tokens_cached[engine_idx].inc(pts.cached_tokens)
self.counter_prompt_tokens_recomputed[engine_idx].inc(pts.recomputed_tokens)
self.counter_generation_tokens[engine_idx].inc( self.counter_generation_tokens[engine_idx].inc(
iteration_stats.num_generation_tokens iteration_stats.num_generation_tokens
) )
......
...@@ -246,12 +246,11 @@ class PromptTokenStats: ...@@ -246,12 +246,11 @@ class PromptTokenStats:
local_cache_hit: Tokens from local prefix cache. local_cache_hit: Tokens from local prefix cache.
external_kv_transfer: Tokens from external KV transfer. external_kv_transfer: Tokens from external KV transfer.
cached_tokens: Tokens skipped during prefill (from scheduler). cached_tokens: Tokens skipped during prefill (from scheduler).
recomputed_tokens: Cached tokens that were recomputed (see below).
total: Total prompt tokens. total: Total prompt tokens.
Invariants: Invariants:
computed + local_cache_hit + external_kv_transfer - recomputed_tokens = total computed + local_cache_hit + external_kv_transfer = total
local_cache_hit + external_kv_transfer - recomputed_tokens = cached_tokens local_cache_hit + external_kv_transfer = cached_tokens
""" """
ALL_SOURCES: tuple[str, ...] = ( ALL_SOURCES: tuple[str, ...] = (
...@@ -264,7 +263,6 @@ class PromptTokenStats: ...@@ -264,7 +263,6 @@ class PromptTokenStats:
local_cache_hit: int = 0 local_cache_hit: int = 0
external_kv_transfer: int = 0 external_kv_transfer: int = 0
cached_tokens: int = 0 cached_tokens: int = 0
recomputed_tokens: int = 0
total: int = 0 total: int = 0
def update_from_output( def update_from_output(
...@@ -274,11 +272,6 @@ class PromptTokenStats: ...@@ -274,11 +272,6 @@ class PromptTokenStats:
prompt_len: int, prompt_len: int,
) -> None: ) -> None:
"""Update stats from a prefill output.""" """Update stats from a prefill output."""
# When all tokens are cached, the scheduler reduces num_cached_tokens
# by 1 to force the model to recompute the last token, since the model
# needs at least one input token to run a forward pass.
recomputed = 1 if (num_cached_tokens + 1 == prompt_len) else 0
self.computed += prompt_len - num_cached_tokens self.computed += prompt_len - num_cached_tokens
self.external_kv_transfer += num_external_computed_tokens self.external_kv_transfer += num_external_computed_tokens
# FIXME(yifan): local_cache_hit can go negative after preemption. # FIXME(yifan): local_cache_hit can go negative after preemption.
...@@ -290,10 +283,9 @@ class PromptTokenStats: ...@@ -290,10 +283,9 @@ class PromptTokenStats:
# as a separate metric rather than reusing num_external_computed_tokens # as a separate metric rather than reusing num_external_computed_tokens
# for metric directly. # for metric directly.
self.local_cache_hit += max( self.local_cache_hit += max(
0, (num_cached_tokens + recomputed - num_external_computed_tokens) 0, (num_cached_tokens - num_external_computed_tokens)
) )
self.cached_tokens += num_cached_tokens self.cached_tokens += num_cached_tokens
self.recomputed_tokens += recomputed
self.total += prompt_len self.total += prompt_len
def get_by_source(self, source: str) -> int: def get_by_source(self, source: str) -> int:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment