Unverified Commit f597b75b authored by Keiven C's avatar Keiven C Committed by GitHub
Browse files

chore: remove NIM custom backend example (part 1, docs only) (#5891)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 373e76c1
# NIM Backend Metrics Mock Server
This directory contains a mock NIM (NVIDIA Inference Microservices) backend server for testing the frontend's on-demand metrics collection feature.
## Purpose
**NOTE: This is temporary code.** Once NIM starts using Dynamo backend components natively, this mock server and the associated NIM metrics polling code will be removed.
This example demonstrates:
- How the Dynamo frontend can poll external backends for metrics
- Dynamic metric generation and collection
- The `runtime_stats` endpoint pattern
- Integration between frontend metrics and backend services
## Running the Example
### 1. Start the Mock NIM Backend
**Static mode (default - NATS only, no etcd):**
```bash
python3 examples/custom_backend/nim/mock_nim_backend.py
```
**Dynamic mode (with etcd for service discovery):**
```bash
python3 examples/custom_backend/nim/mock_nim_backend.py --use-etcd
```
This starts a backend on `nim.backend.runtime_stats` (default) that returns incrementing metrics. You can customize with `--custom-backend-metrics-endpoint "namespace.component.endpoint"`.
### 2. Start the Frontend with Metrics Polling
```bash
python3 -m dynamo.frontend \
--model-name Qwen/Qwen2.5-0.5B-Instruct \
--custom-backend-metrics-endpoint nim.backend.runtime_stats \
--custom-backend-metrics-polling-interval 9.2
```
**Note:** The custom backend metrics polling works in both static (NATS-only) and dynamic (with etcd) modes. The frontend automatically detects and adapts to the backend's mode.
### 3. Query Metrics
```bash
curl http://localhost:8000/metrics
```
The frontend will periodically (every 9.2 seconds in this example):
1. Poll the mock NIM backend via the `runtime_stats` endpoint
2. Parse the returned metrics
3. Update Prometheus gauges
When you query the `/metrics` endpoint, you'll see the most recently polled metrics.
## Metrics Exposed
The mock server returns:
**Gauges:**
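- `kv_cache_usage_perc` - Cycles between 0.30 and 0.93
- `gpu_utilization_perc` - Cycles between 50 and 97.5
- `active_requests` - Cycles between 0 and 14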
**Note:** All metrics collected from custom backends are automatically prefixed with `dynamo_component_` when exposed via the frontend's `/metrics` endpoint. For example, the gauge `kv_cache_usage_perc` from the backend will appear as `dynamo_component_kv_cache_usage_perc` in Prometheus metrics.
## Implementation Details
The frontend's NIM metrics collection is implemented in:
- `lib/llm/src/http/service/custom_backend_metrics.rs` - Custom backend metrics collection (temporary)
- `lib/llm/src/http/service/metrics.rs` - Metrics router
- `components/src/dynamo/frontend/main.py` - `--custom-backend-metrics-polling-interval` flag
All NIM-specific code is marked with TODO comments for removal once NIM adopts Dynamo backend.
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Mock NIM Backend Server for Metrics Testing
This server mocks a NIM (NVIDIA Inference Microservices) backend that exposes
runtime statistics via the runtime_stats endpoint.
NOTE: This is temporary code for testing purposes only. Once NIM starts using
Dynamo backend components natively, this mock server and the associated NIM
metrics polling code in the frontend will be removed. The NIM-specific metrics
collection exists only as a bridge until NIM adopts the Dynamo runtime.
The server demonstrates:
- Dynamic metric generation (gauges only; no counters are emitted)
- Proper async generator pattern for Dynamo endpoints
- JSON-encoded metric responses compatible with the frontend metrics collector
"""
import asyncio
import json
import time
from typing import Any, AsyncGenerator
import uvloop
from dynamo.runtime import DistributedRuntime
# Number of stats requests served so far; every synthetic gauge is derived from it
request_count = 0


async def handle_stats_request(request: Any) -> AsyncGenerator[str, None]:
    """Produce one synthetic runtime-stats payload per incoming request.

    Each call bumps the module-level ``request_count`` and derives every
    gauge from it, so successive polls observe changing values.

    Args:
        request: Payload sent by the caller. It is only echoed in the log
            line and does not influence the generated metrics.

    Yields:
        str: A single JSON document with ``schema_version``, ``worker_id``,
        ``backend``, ``ts`` (integer Unix time) and ``metrics.gauges``.
    """
    global request_count
    request_count += 1
    print(f"Received stats request #{request_count}: {request!r}")

    tick = request_count
    gauges = {
        # 0.3 .. 0.93 in steps of 0.07, repeating every 10 requests
        "kv_cache_usage_perc": round(0.3 + (tick % 10) * 0.07, 2),
        # 50 .. 97.5 in steps of 2.5, repeating every 20 requests
        "gpu_utilization_perc": round(50 + (tick % 20) * 2.5, 2),
        # 0 .. 14, repeating every 15 requests
        "active_requests": tick % 15,
    }
    payload = {
        "schema_version": 1,
        "worker_id": "mock-worker-1",
        "backend": "vllm",
        "ts": int(time.time()),
        "metrics": {"gauges": gauges},
    }

    # Serialized to a string because the consumer expects a JSON-encoded payload
    yield json.dumps(payload)
async def worker(runtime: DistributedRuntime):
    """Serve the mock stats handler on the endpoint named on the command line.

    Parses ``sys.argv``, splits ``--custom-backend-metrics-endpoint`` into its
    namespace, component and endpoint parts, and awaits ``serve_endpoint``
    with ``handle_stats_request`` as the handler.

    Args:
        runtime: Runtime used to resolve the namespace, component and endpoint.

    Raises:
        ValueError: If the endpoint is not exactly three non-empty,
            dot-separated parts.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Mock NIM Backend Server")
    parser.add_argument(
        "--custom-backend-metrics-endpoint",
        type=str,
        default="nim.backend.runtime_stats",
        help="Custom backend metrics endpoint in format 'namespace.component.endpoint' (default: 'nim.backend.runtime_stats')",
    )
    # Declared here so parse_args() accepts the flag and --help lists it;
    # its value is not read inside this function.
    parser.add_argument(
        "--use-etcd",
        action="store_true",
        help="Use etcd for service discovery (dynamic mode). Default is static mode (no etcd).",
    )
    args = parser.parse_args()

    # Parse endpoint (namespace.component.endpoint); reject empty segments
    # such as "nim..runtime_stats", which would otherwise pass a length check.
    endpoint_path = args.custom_backend_metrics_endpoint
    parts = endpoint_path.split(".")
    if len(parts) != 3 or not all(parts):
        raise ValueError(
            f"Invalid endpoint format. Expected 'namespace.component.endpoint', got: {endpoint_path}"
        )
    namespace, comp_name, endpoint_name = parts

    component = runtime.namespace(namespace).component(comp_name)
    stats_endpoint = component.endpoint(endpoint_name)

    print(
        f"Mock NIM stats server started on {namespace}/{comp_name}/{endpoint_name} endpoint"
    )
    # List only the gauges the handler actually emits
    print(
        "Exposing incrementing metrics: kv_cache_usage_perc, gpu_utilization_perc, active_requests"
    )

    await stats_endpoint.serve_endpoint(handle_stats_request)  # type: ignore[arg-type]
async def main():
    """Build the runtime for the requested mode and run the worker on it.

    ``--use-etcd`` selects ``"etcd"`` as the second ``DistributedRuntime``
    argument; without it ``"file"`` is used (the static, no-etcd default).
    The runtime is always shut down when the worker returns or raises.
    """
    import argparse

    # Only --use-etcd matters at this point. worker() parses the full command
    # line afterwards, so unknown options are tolerated and help is disabled.
    pre_parser = argparse.ArgumentParser(
        description="Mock NIM Backend Server", add_help=False
    )
    pre_parser.add_argument("--use-etcd", action="store_true")
    known_args, _ = pre_parser.parse_known_args()

    loop = asyncio.get_running_loop()
    discovery_backend = "etcd" if known_args.use_etcd else "file"
    runtime = DistributedRuntime(loop, discovery_backend, "nats")

    try:
        await worker(runtime)  # type: ignore[arg-type]
    finally:
        runtime.shutdown()


if __name__ == "__main__":
    uvloop.run(main())
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Mock NIM Frontend - Polls the mock NIM backend for metrics
This script demonstrates how to poll a custom backend for metrics using
the Dynamo runtime in static mode (no etcd required, uses NATS only).
"""
import asyncio
import json
import signal
import uvloop
from dynamo.runtime import DistributedRuntime
def _print_polled_response(index, data):
    """Print one polled payload, pretty-printing it when it is a JSON string."""
    print(f"\nResponse #{index}:")
    if isinstance(data, str):
        try:
            print(json.dumps(json.loads(data), indent=2))
        except json.JSONDecodeError:
            # Not JSON after all: show the raw string
            print(data)
    else:
        print(data)


async def _poll_once(client):
    """Send a single request through ``client`` and print each non-empty response."""
    response_stream = await client.generate("")
    responses = []
    async for response in response_stream:
        # Read the payload once instead of calling data() twice per response
        data = response.data()
        if data:
            responses.append(data)

    print(f"Received {len(responses)} responses")
    for index, data in enumerate(responses, start=1):
        _print_polled_response(index, data)


async def poll_custom_backend_metrics(
    runtime, namespace_component_endpoint, interval_secs
):
    """Poll a custom backend endpoint forever and print what it returns.

    Args:
        runtime: Object exposing ``namespace(...).component(...).endpoint(...)``;
            the resolved endpoint's ``client()`` is awaited once up front.
        namespace_component_endpoint: Target in the form
            ``namespace.component.endpoint``; all three parts must be non-empty.
        interval_secs: Seconds to sleep before every poll, and once more as a
            back-off after a failed poll.

    Returns:
        None. Returns early, after printing an error, when the endpoint string
        is malformed or the client cannot be created. Otherwise it returns only
        once the surrounding task is cancelled; the cancellation is reported
        and absorbed rather than re-raised.
    """
    import traceback

    print(
        f"Starting custom backend metrics polling: endpoint={namespace_component_endpoint}, interval={interval_secs}s"
    )

    # Parse endpoint string (namespace.component.endpoint). Empty segments
    # (e.g. "nim..runtime_stats") are rejected here rather than surfacing
    # later as an opaque setup failure.
    parts = namespace_component_endpoint.split(".")
    if len(parts) != 3 or not all(parts):
        print(f"ERROR: Invalid endpoint format: {namespace_component_endpoint}")
        return

    namespace, component_name, endpoint_name = parts
    print(f"Polling {namespace}/{component_name}/{endpoint_name}")

    try:
        ns = runtime.namespace(namespace)
        component = ns.component(component_name)
        endpoint = component.endpoint(endpoint_name)
        client = await endpoint.client()
        print("Client created for static endpoint")
    except Exception as e:
        print(f"ERROR during polling setup: {e}")
        traceback.print_exc()
        return

    print(f"Starting polling loop (every {interval_secs}s)...")
    # Inside a coroutine the running loop is the right clock source
    loop = asyncio.get_running_loop()
    separator = "=" * 60

    # One outer handler covers every await in the loop, including the
    # back-off sleep, so cancellation is reported the same way everywhere.
    # CancelledError derives from BaseException, so the inner
    # `except Exception` does not intercept it.
    try:
        while True:
            await asyncio.sleep(interval_secs)
            try:
                print(f"\n{separator}")
                print(f"Polling tick at {loop.time():.2f}")
                await _poll_once(client)
                print(f"{separator}\n")
            except Exception as e:
                print(f"ERROR polling backend: {e}")
                traceback.print_exc()
                # Extra pause after a failure before the next regular tick
                await asyncio.sleep(interval_secs)
    except asyncio.CancelledError:
        print("Polling cancelled")
async def graceful_shutdown(runtime):
    """Announce the shutdown on stdout, then call ``runtime.shutdown()``.

    Defined as a coroutine so it can be scheduled as a task on the event loop.

    Args:
        runtime: Object whose ``shutdown()`` method is invoked.
    """
    # Leading newline keeps the message off the line of any echoed ^C
    print("\nShutting down...")
    runtime.shutdown()
async def async_main():
    """Parse the command line, create the runtime and run the polling loop.

    Installs SIGTERM/SIGINT handlers that schedule ``graceful_shutdown`` on
    the running loop, prints a startup banner, then awaits
    ``poll_custom_backend_metrics`` until it returns or is cancelled.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Mock NIM Frontend - Poll backend for metrics"
    )
    parser.add_argument(
        "--custom-backend-metrics-endpoint",
        type=str,
        default="nim.backend.runtime_stats",
        help="Custom backend metrics endpoint in format 'namespace.component.endpoint' (default: 'nim.backend.runtime_stats')",
    )
    parser.add_argument(
        "--polling-interval",
        type=float,
        default=3.0,
        help="Polling interval in seconds (default: 3.0)",
    )
    args = parser.parse_args()

    loop = asyncio.get_running_loop()

    # Same construction pattern as the real frontend's main.py; "file" + "nats"
    # is the static, no-etcd configuration announced in the banner below.
    runtime = DistributedRuntime(loop, "file", "nats")  # type: ignore[call-arg]

    # The event loop keeps only weak references to tasks. Hold the shutdown
    # task here until it completes so it cannot be garbage-collected mid-run.
    shutdown_tasks = set()

    def signal_handler():
        task = asyncio.create_task(graceful_shutdown(runtime))
        shutdown_tasks.add(task)
        task.add_done_callback(shutdown_tasks.discard)

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    print("Mock NIM Frontend starting...")
    print(f"Target endpoint: {args.custom_backend_metrics_endpoint}")
    print(f"Polling interval: {args.polling_interval}s")
    print("Static mode: No etcd required, using NATS only\n")

    try:
        await poll_custom_backend_metrics(
            runtime, args.custom_backend_metrics_endpoint, args.polling_interval
        )
    except asyncio.CancelledError:
        # Cancellation is the expected way out during shutdown; exit quietly
        pass
def main():
    """Synchronous entry point: run ``async_main`` to completion under uvloop."""
    uvloop.run(async_main())


# Run only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment