"vllm/vscode:/vscode.git/clone" did not exist on "3f42b83df72c50b113953ed8b157e480131232ed"
Unverified commit f17f1d46, authored by Mark McLoughlin, committed by GitHub
Browse files

[V1][Metrics] Add GPU cache usage % gauge (#12561)


Signed-off-by: Mark McLoughlin <markmc@redhat.com>
parent 1c1bb0bb
...@@ -200,6 +200,7 @@ EXPECTED_METRICS = [ ...@@ -200,6 +200,7 @@ EXPECTED_METRICS = [
EXPECTED_METRICS_V1 = [ EXPECTED_METRICS_V1 = [
"vllm:num_requests_running", "vllm:num_requests_running",
"vllm:num_requests_waiting", "vllm:num_requests_waiting",
"vllm:gpu_cache_usage_perc",
"vllm:prompt_tokens_total", "vllm:prompt_tokens_total",
"vllm:generation_tokens_total", "vllm:generation_tokens_total",
"vllm:request_prompt_tokens_sum", "vllm:request_prompt_tokens_sum",
......
...@@ -69,6 +69,11 @@ class KVCacheManager: ...@@ -69,6 +69,11 @@ class KVCacheManager:
# is finished. # is finished.
self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {} self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {}
@property
def usage(self) -> float:
    """Return the fraction of GPU KV-cache blocks currently in use.

    Computed as ``1.0 - num_free_blocks / num_gpu_blocks``, so 0.0 means
    every block is free and 1.0 means 100 percent usage.

    Returns:
        The usage fraction, or 0.0 when ``num_gpu_blocks`` is zero.
    """
    # With zero blocks there is nothing to occupy; report 0.0 rather than
    # letting the division below raise ZeroDivisionError.
    if not self.num_gpu_blocks:
        return 0.0
    return 1.0 - (self.free_block_queue.num_free_blocks /
                  self.num_gpu_blocks)
def get_computed_blocks( def get_computed_blocks(
self, request: Request) -> Tuple[List[KVCacheBlock], int]: self, request: Request) -> Tuple[List[KVCacheBlock], int]:
"""Get the computed (cached) blocks for the request. """Get the computed (cached) blocks for the request.
......
...@@ -544,6 +544,7 @@ class Scheduler: ...@@ -544,6 +544,7 @@ class Scheduler:
return SchedulerStats( return SchedulerStats(
num_running_reqs=len(self.running), num_running_reqs=len(self.running),
num_waiting_reqs=len(self.waiting), num_waiting_reqs=len(self.waiting),
gpu_cache_usage=self.kv_cache_manager.usage,
) )
......
...@@ -69,11 +69,13 @@ class LoggingStatLogger(StatLoggerBase): ...@@ -69,11 +69,13 @@ class LoggingStatLogger(StatLoggerBase):
logger.info( logger.info(
"Avg prompt throughput: %.1f tokens/s, " "Avg prompt throughput: %.1f tokens/s, "
"Avg generation throughput: %.1f tokens/s, " "Avg generation throughput: %.1f tokens/s, "
"Running: %d reqs, Waiting: %d reqs ", "Running: %d reqs, Waiting: %d reqs "
"GPU KV cache usage: %.1f%%.",
prompt_throughput, prompt_throughput,
generation_throughput, generation_throughput,
scheduler_stats.num_running_reqs, scheduler_stats.num_running_reqs,
scheduler_stats.num_waiting_reqs, scheduler_stats.num_waiting_reqs,
scheduler_stats.gpu_cache_usage * 100,
) )
...@@ -97,6 +99,11 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -97,6 +99,11 @@ class PrometheusStatLogger(StatLoggerBase):
documentation="Number of requests waiting to be processed.", documentation="Number of requests waiting to be processed.",
labelnames=labelnames).labels(*labelvalues) labelnames=labelnames).labels(*labelvalues)
self.gauge_gpu_cache_usage = prometheus_client.Gauge(
name="vllm:gpu_cache_usage_perc",
documentation="GPU KV-cache usage. 1 means 100 percent usage.",
labelnames=labelnames).labels(*labelvalues)
self.counter_prompt_tokens = prometheus_client.Counter( self.counter_prompt_tokens = prometheus_client.Counter(
name="vllm:prompt_tokens_total", name="vllm:prompt_tokens_total",
documentation="Number of prefill tokens processed.", documentation="Number of prefill tokens processed.",
...@@ -147,6 +154,8 @@ class PrometheusStatLogger(StatLoggerBase): ...@@ -147,6 +154,8 @@ class PrometheusStatLogger(StatLoggerBase):
self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs)
self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs)
self.gauge_gpu_cache_usage.set(scheduler_stats.gpu_cache_usage)
self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens)
self.counter_generation_tokens.inc( self.counter_generation_tokens.inc(
iteration_stats.num_generation_tokens) iteration_stats.num_generation_tokens)
......
...@@ -14,7 +14,7 @@ class SchedulerStats: ...@@ -14,7 +14,7 @@ class SchedulerStats:
num_running_reqs: int = 0 num_running_reqs: int = 0
num_waiting_reqs: int = 0 num_waiting_reqs: int = 0
# gpu_cache_usage: float = 0.0 gpu_cache_usage: float = 0.0
# gpu_prefix_cache_hit_rate: float = 0.0 # gpu_prefix_cache_hit_rate: float = 0.0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment