"lib/bindings/python/vscode:/vscode.git/clone" did not exist on "2ba8bdb19a9ac2ec1ce568a07f740757f1420d74"
Unverified Commit 17abc9de, authored by Hongkuan Zhou, committed by GitHub
Browse files

chore: expose inc id and add version to forwardpassmetric (#7501)


Signed-off-by: hongkuanz <hongkuanz@nvidia.com>
parent da2c5e76
...@@ -36,8 +36,14 @@ TODO: planner consuming these metrics instead of frontend/router metrics ...@@ -36,8 +36,14 @@ TODO: planner consuming these metrics instead of frontend/router metrics
from __future__ import annotations from __future__ import annotations
import logging
import msgspec import msgspec
logger = logging.getLogger(__name__)
FPM_VERSION: int = 1
class WelfordAccumulator: class WelfordAccumulator:
"""Welford's online algorithm for count / sum / population-variance. """Welford's online algorithm for count / sum / population-variance.
...@@ -156,12 +162,21 @@ class ForwardPassMetrics( ...@@ -156,12 +162,21 @@ class ForwardPassMetrics(
engine transitions from active to idle. engine transitions from active to idle.
""" """
# Schema version. Consumers must check this before interpreting
# the remaining fields. Bump when the schema changes incompatibly.
version: int = FPM_VERSION
# Unique worker identifier (Dynamo runtime connection_id). # Unique worker identifier (Dynamo runtime connection_id).
worker_id: str = "" worker_id: str = ""
# Data parallel rank. Each DP rank has its own scheduler and ZMQ port. # Data parallel rank. Each DP rank has its own scheduler and ZMQ port.
dp_rank: int = 0 dp_rank: int = 0
# Monotonically increasing sequence number per (worker_id, dp_rank).
# Set by _FpmPublisherThread before encoding; 0 for messages that
# have not been stamped (e.g. unit-test fixtures).
counter_id: int = 0
# Wall-clock time of this iteration: from schedule() to update_from_output(). # Wall-clock time of this iteration: from schedule() to update_from_output().
# Covers scheduling + model forward pass + output processing. # Covers scheduling + model forward pass + output processing.
# 0.0 for idle heartbeat messages. # 0.0 for idle heartbeat messages.
...@@ -182,5 +197,27 @@ def encode(metrics: ForwardPassMetrics) -> bytes: ...@@ -182,5 +197,27 @@ def encode(metrics: ForwardPassMetrics) -> bytes:
return _encoder.encode(metrics) return _encoder.encode(metrics)
def decode(data: bytes) -> ForwardPassMetrics: class UnsupportedFpmVersionError(Exception):
return _decoder.decode(data) """Raised when a ForwardPassMetrics message has an unrecognised version."""
def decode(data: bytes) -> ForwardPassMetrics | None:
    """Decode a ForwardPassMetrics message, returning None when unusable.

    Returns None (and logs a warning) if the message cannot be decoded or
    carries a version this code does not understand, so callers can simply
    skip unsupported messages without crashing.

    Args:
        data: Encoded message bytes, passed to the module-level ``_decoder``.

    Returns:
        The decoded metrics when decoding succeeds and its ``version`` equals
        ``FPM_VERSION``; otherwise ``None``.
    """
    try:
        metrics = _decoder.decode(data)
    except Exception:
        # Deliberately best-effort: a bad message must not take down the
        # consumer. Attach the active exception so the cause of the failure
        # is visible in the logs instead of being silently discarded.
        logger.warning(
            "Failed to decode ForwardPassMetrics message, skipping",
            exc_info=True,
        )
        return None

    # Reject messages produced under a schema version this code does not
    # understand rather than misinterpreting their fields.
    if metrics.version != FPM_VERSION:
        logger.warning(
            "Unsupported ForwardPassMetrics version %d (expected %d), skipping",
            metrics.version,
            FPM_VERSION,
        )
        return None

    return metrics
...@@ -16,13 +16,18 @@ Usage: ...@@ -16,13 +16,18 @@ Usage:
import argparse import argparse
import asyncio import asyncio
import json import json
import logging
import os import os
import sys
import msgspec import msgspec
from dynamo.common.forward_pass_metrics import decode from dynamo.common.forward_pass_metrics import decode
from dynamo.llm import FpmEventSubscriber
from dynamo.runtime import DistributedRuntime from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging
configure_dynamo_logging()
logger = logging.getLogger(__name__)
def main() -> None: def main() -> None:
...@@ -54,8 +59,6 @@ def main() -> None: ...@@ -54,8 +59,6 @@ def main() -> None:
async def run(args: argparse.Namespace) -> None: async def run(args: argparse.Namespace) -> None:
from dynamo.llm import FpmEventSubscriber
loop = asyncio.get_running_loop() loop = asyncio.get_running_loop()
event_plane = os.environ.get("DYN_EVENT_PLANE", "nats") event_plane = os.environ.get("DYN_EVENT_PLANE", "nats")
enable_nats = args.request_plane == "nats" or event_plane == "nats" enable_nats = args.request_plane == "nats" or event_plane == "nats"
...@@ -67,26 +70,32 @@ async def run(args: argparse.Namespace) -> None: ...@@ -67,26 +70,32 @@ async def run(args: argparse.Namespace) -> None:
subscriber = FpmEventSubscriber(endpoint) subscriber = FpmEventSubscriber(endpoint)
json_encoder = msgspec.json.Encoder() json_encoder = msgspec.json.Encoder()
print( logger.info(
f"Subscribed to forward-pass-metrics via event plane " "Subscribed to forward-pass-metrics via event plane "
f"(namespace={args.namespace}, component={args.component}) " "(namespace=%s, component=%s) Ctrl+C to stop",
f"Ctrl+C to stop", args.namespace,
file=sys.stderr, args.component,
) )
seq = 0
try: try:
while True: while True:
data = await asyncio.to_thread(subscriber.recv) data = await asyncio.to_thread(subscriber.recv)
if data is None: if data is None:
print("Stream closed.", file=sys.stderr) logger.info("Stream closed.")
break break
metrics = decode(data) metrics = decode(data)
if metrics is None:
continue
pretty = json.loads(json_encoder.encode(metrics)) pretty = json.loads(json_encoder.encode(metrics))
print(f"[seq={seq}] {json.dumps(pretty, indent=2)}", flush=True) logger.info(
seq += 1 "[worker=%s dp=%d counter=%d] %s",
metrics.worker_id,
metrics.dp_rank,
metrics.counter_id,
json.dumps(pretty, indent=2),
)
except KeyboardInterrupt: except KeyboardInterrupt:
print("\nStopped.", file=sys.stderr) logger.info("Stopped.")
finally: finally:
subscriber.shutdown() subscriber.shutdown()
......
...@@ -25,6 +25,7 @@ import time ...@@ -25,6 +25,7 @@ import time
from itertools import count from itertools import count
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import msgspec.structs
import zmq import zmq
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.core.sched.scheduler import Scheduler from vllm.v1.core.sched.scheduler import Scheduler
...@@ -37,6 +38,7 @@ from dynamo.common.forward_pass_metrics import ( ...@@ -37,6 +38,7 @@ from dynamo.common.forward_pass_metrics import (
WelfordAccumulator, WelfordAccumulator,
encode, encode,
) )
from dynamo.runtime.logging import configure_dynamo_logging
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.config import VllmConfig from vllm.config import VllmConfig
...@@ -45,6 +47,7 @@ if TYPE_CHECKING: ...@@ -45,6 +47,7 @@ if TYPE_CHECKING:
from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.structured_output import StructuredOutputManager from vllm.v1.structured_output import StructuredOutputManager
configure_dynamo_logging()
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
DEFAULT_FPM_PORT = 20380 DEFAULT_FPM_PORT = 20380
...@@ -128,8 +131,10 @@ class _FpmPublisherThread: ...@@ -128,8 +131,10 @@ class _FpmPublisherThread:
continue continue
try: try:
seq = next(self._seq)
metrics = msgspec.structs.replace(metrics, counter_id=seq)
payload = encode(metrics) payload = encode(metrics)
seq_bytes = next(self._seq).to_bytes(8, "big") seq_bytes = seq.to_bytes(8, "big")
self._pub.send_multipart((topic, seq_bytes, payload), flags=zmq.NOBLOCK) self._pub.send_multipart((topic, seq_bytes, payload), flags=zmq.NOBLOCK)
last_publish = time.monotonic() last_publish = time.monotonic()
except zmq.Again: except zmq.Again:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment