Unverified commit 39c8d125, authored by jain-ria and committed by GitHub
Browse files

fix: metrics publishing in vllm v0 worker (#1845)

parent 0f2fa928
...@@ -34,7 +34,14 @@ from vllm.inputs import TokensPrompt ...@@ -34,7 +34,14 @@ from vllm.inputs import TokensPrompt
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from dynamo.llm import ModelType, WorkerMetricsPublisher, register_llm from dynamo.llm import (
ForwardPassMetrics,
KvStats,
ModelType,
WorkerMetricsPublisher,
WorkerStats,
register_llm,
)
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -122,15 +129,25 @@ class VllmWorker: ...@@ -122,15 +129,25 @@ class VllmWorker:
self.engine_client.set_metrics_publisher(self.metrics_publisher) self.engine_client.set_metrics_publisher(self.metrics_publisher)
# Initially send dummy metrics to kick start, # Initially send dummy metrics to kick start,
# vLLM will not update stat until forward pass is triggered # vLLM will not update stat until forward pass is triggered
self.metrics_publisher.publish( worker_stats = WorkerStats(
0, # request_active_slots request_active_slots=0,
1024, # request_total_slots request_total_slots=1024,
0, # kv_active_blocks num_requests_waiting=0,
1024, # kv_total_blocks data_parallel_rank=None,
0, # num_requests_waiting )
0.0, # gpu_cache_usage_perc kv_stats = KvStats(
0.0, # gpu_prefix_cache_hit_rate kv_active_blocks=0,
kv_total_blocks=1024,
gpu_cache_usage_perc=0.0,
gpu_prefix_cache_hit_rate=0.0,
)
spec_dec_stats = None
metrics = ForwardPassMetrics(
worker_stats=worker_stats,
kv_stats=kv_stats,
spec_decode_stats=spec_dec_stats,
) )
self.metrics_publisher.publish(metrics)
task = asyncio.create_task(self.create_metrics_publisher_endpoint()) task = asyncio.create_task(self.create_metrics_publisher_endpoint())
task.add_done_callback( task.add_done_callback(
lambda _: logger.info("metrics publisher endpoint created") lambda _: logger.info("metrics publisher endpoint created")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment