Commit c8737c1f authored by Ryan McCormick's avatar Ryan McCormick Committed by GitHub

fix: Fix KV Cache Hit Rate Metrics and Misc QoL Updates (#199)

parent b92834c8
# Metrics
The `metrics` component is a utility that can collect, aggregate, and publish
metrics from a Dynamo deployment for use in other applications or visualization
tools like Prometheus and Grafana.
## Quickstart
To start the `metrics` component, simply point it at the `namespace/component/endpoint` trio for the Dynamo workers whose metrics you're interested in monitoring.
This will:
1. Collect statistics from workers associated with that `namespace/component/endpoint`
2. Postprocess and aggregate those statistics across the workers
3. Publish them on a Prometheus-compatible metrics endpoint
For example:
```bash
# Default namespace is "dynamo", but can be configured with --namespace
# For more detailed output, try setting the env var: DYN_LOG=debug
metrics --component my_component --endpoint my_endpoint

# 2025-03-17T00:07:05.202558Z INFO metrics: Scraping endpoint dynamo/my_component/my_endpoint for stats
# 2025-03-17T00:07:05.202955Z INFO metrics: Prometheus metrics server started at 0.0.0.0:9091/metrics
# ...
```
With no matching endpoints running to collect stats from, you should see warnings in the logs:
```bash
2025-03-17T00:07:06.204756Z WARN metrics: No endpoints found matching dynamo/my_component/my_endpoint
```
After a worker with a matching endpoint gets started, the endpoint will get automatically discovered and the warnings will stop.
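The postprocessing/aggregation step boils down to reducing per-worker samples into summary gauges such as `llm_load_avg` and `llm_load_std`. A minimal Python sketch of that reduction (illustrative only — the real component does this in Rust, and `aggregate_loads` is a hypothetical helper):

```python
# Hypothetical sketch of the aggregation step; worker IDs and load
# values below are made up for illustration.
import statistics

def aggregate_loads(worker_loads: dict[int, float]) -> dict[str, float]:
    """Reduce per-worker load samples into the summary gauges that the
    metrics component exposes (llm_load_avg / llm_load_std)."""
    loads = list(worker_loads.values())
    return {
        "llm_load_avg": statistics.mean(loads),
        "llm_load_std": statistics.pstdev(loads) if len(loads) > 1 else 0.0,
    }

# Two workers reporting load values
print(aggregate_loads({7587884888253033398: 0.40, 7587884888253033401: 0.02}))
```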
## Workers
The `metrics` component needs running workers to gather metrics from, so below are some examples of workers and how they can be monitored.
### Mock Worker
For quick testing and debugging, there is a Rust-based
[mock worker](src/bin/mock_worker.rs) that registers a mock
`StatsHandler` under an endpoint named
`dynamo/my_component/my_endpoint` and publishes random data.
```bash
# Can run multiple workers in separate shells to see aggregation as well.
# Or to build/run from source: cargo run --bin mock_worker
mock_worker
# 2025-03-16T23:49:28.101668Z INFO mock_worker: Starting Mock Worker on Endpoint: dynamo/my_component/my_endpoint
```
To monitor the metrics of these mock workers, run:
```bash
metrics --component my_component --endpoint my_endpoint
```
### Real Worker
To run a more realistic deployment to gather metrics from,
see the examples in [deploy/examples/llm](deploy/examples/llm).
For example, for a VLLM + KV Routing based deployment that
exposes statistics on an endpoint labeled
`dynamo/VllmWorker/load_metrics`:
```bash
cd deploy/examples/llm
dynamo serve <vllm kv routing example args>
```
To monitor the metrics of these VllmWorkers, run:
```bash
metrics --component VllmWorker --endpoint load_metrics
```
**NOTE**: `load_metrics` is currently a
[hard-coded](https://github.com/ai-dynamo/dynamo/blob/d5220c7b1151372ba3d2a061c7d0a7ed72724789/lib/llm/src/kv_router/publisher.rs#L108)
endpoint name used for python-based workers that register a `KvMetricsPublisher`.
## Visualization
To visualize the metrics being exposed on the Prometheus endpoint,
see the Prometheus and Grafana configurations in
[deploy/metrics](deploy/metrics):
```bash
docker compose -f deploy/docker-compose.yml --profile metrics up -d
```
## Metrics Collection Modes
The metrics component supports two modes for exposing metrics in a Prometheus format:
### Pull Mode (Default)
When running in pull mode (the default), the metrics component will expose a Prometheus metrics endpoint on the specified host and port that a Prometheus server or curl client can pull from:
```bash
# Start metrics server on default host (0.0.0.0) and port (9091)
metrics --component my_component --endpoint my_endpoint

# Or specify a custom port
metrics --component my_component --endpoint my_endpoint --port 9092
```
In pull mode:
```bash
curl localhost:9091/metrics

# # HELP llm_kv_blocks_active Active KV cache blocks
# # TYPE llm_kv_blocks_active gauge
# llm_kv_blocks_active{component="my_component",endpoint="my_endpoint",worker_id="7587884888253033398"} 40
# llm_kv_blocks_active{component="my_component",endpoint="my_endpoint",worker_id="7587884888253033401"} 2
# # HELP llm_kv_blocks_total Total KV cache blocks
# # TYPE llm_kv_blocks_total gauge
# llm_kv_blocks_total{component="my_component",endpoint="my_endpoint",worker_id="7587884888253033398"} 100
# llm_kv_blocks_total{component="my_component",endpoint="my_endpoint",worker_id="7587884888253033401"} 100
```
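On the consumer side, the per-worker samples above can be aggregated by parsing the Prometheus text format directly. A small illustrative sketch (`sum_metric` is a hypothetical helper, not part of the metrics component; a real scraper would use a proper Prometheus client library):

```python
import re

# Sample copied from the curl output above
SAMPLE = """\
llm_kv_blocks_active{component="my_component",endpoint="my_endpoint",worker_id="7587884888253033398"} 40
llm_kv_blocks_active{component="my_component",endpoint="my_endpoint",worker_id="7587884888253033401"} 2
llm_kv_blocks_total{component="my_component",endpoint="my_endpoint",worker_id="7587884888253033398"} 100
llm_kv_blocks_total{component="my_component",endpoint="my_endpoint",worker_id="7587884888253033401"} 100
"""

def sum_metric(text: str, name: str) -> float:
    """Sum every sample of `name` across label sets (i.e. across worker_id)."""
    pattern = re.compile("^" + re.escape(name) + r"\{[^}]*\} ([0-9.eE+-]+)$", re.M)
    return sum(float(v) for v in pattern.findall(text))

active = sum_metric(SAMPLE, "llm_kv_blocks_active")  # 42.0
total = sum_metric(SAMPLE, "llm_kv_blocks_total")    # 200.0
print(f"cluster KV block utilization: {100 * active / total:.1f}%")  # prints 21.0%
```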
### Push Mode
For ephemeral or batch jobs, or when metrics need to be pushed through a firewall, you can use Push mode. In this mode, the metrics component will periodically push metrics to an externally hosted [Prometheus PushGateway](https://prometheus.io/docs/instrumenting/pushing/).
Start a Prometheus PushGateway service via Docker:
```bash
docker run --rm -d -p 9091:9091 --name pushgateway prom/pushgateway
```
Start the metrics component in `--push` mode, specifying the host and port of your PushGateway:
```bash
# Push metrics to a Prometheus PushGateway every --push-interval seconds
metrics \
  --component my_component \
  --endpoint my_endpoint \
  --host 127.0.0.1 \
  --port 9091 \
  --push
```
When using Push mode:
- The `--host` parameter must be a valid IPv4 or IPv6 address (e.g., "0.0.0.0", "127.0.0.1")
- The `--port` parameter specifies the port of the Prometheus PushGateway
- The push interval can be configured with `--push-interval` (default: 2 seconds)
- A default job name of "dynamo_metrics" is used for the Prometheus job label
- Metrics persist in the PushGateway until explicitly deleted
To view the metrics hosted on the PushGateway:
```bash
curl 127.0.0.1:9091/metrics
```
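At the HTTP level, a PushGateway push is just a PUT of text-format metrics to `/metrics/job/<job>`. A hypothetical Python sketch of constructing such a request (`build_push_request` and the sample gauge line are illustrative, not part of Dynamo):

```python
from urllib.request import Request

def build_push_request(host: str, port: int, job: str, body: str) -> Request:
    """Build (but do not send) a PushGateway push request."""
    url = f"http://{host}:{port}/metrics/job/{job}"
    return Request(
        url,
        data=body.encode(),
        method="PUT",
        headers={"Content-Type": "text/plain; version=0.0.4"},
    )

req = build_push_request(
    "127.0.0.1", 9091, "dynamo_metrics",
    'llm_load_avg{component="my_component",endpoint="my_endpoint"} 0.21\n',
)
print(req.method, req.full_url)  # PUT http://127.0.0.1:9091/metrics/job/dynamo_metrics
# To actually send it: urllib.request.urlopen(req)
```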
## Building/Running from Source
For easy iteration while making edits to the metrics component, you can use `cargo run` to build and run with your local changes:
```bash
cargo run --bin metrics -- --component my_component --endpoint my_endpoint
```
```diff
@@ -68,6 +68,7 @@ impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for MockRe
     }
 }
 
+// FIXME: These events are just for testing and may not currently be used.
 /// Spawns a background task that periodically publishes mock KV hit rate events
 async fn mock_event_publisher(namespace: Namespace) {
     // NOTE: These events are just for testing, and shouldn't be interpreted
@@ -97,7 +98,7 @@ async fn mock_event_publisher(namespace: Namespace) {
         if let Err(e) = namespace.publish(KV_HIT_RATE_SUBJECT, &event).await {
             tracing::warn!("Failed to publish KV hit rate event: {e}");
         } else {
-            tracing::info!(
+            tracing::debug!(
                 "Published KV hit rate event: worker_id={worker_id}, isl_blocks={isl_blocks}, overlap_blocks={overlap_blocks}, hit_rate={:.2}%",
                 (overlap_blocks as f64 / isl_blocks as f64) * 100.0
             );
@@ -107,7 +108,6 @@ async fn mock_event_publisher(namespace: Namespace) {
 /// Generates mock forward pass metrics for stats handler
 fn mock_stats_handler(_stats: Stats) -> serde_json::Value {
-    println!("stats in: {:?}", _stats);
     let request_total_slots = 100;
     let request_active_slots = rand::thread_rng().gen_range(0..=request_total_slots);
     let kv_total_blocks = 100;
@@ -124,12 +124,20 @@ fn mock_stats_handler(_stats: Stats) -> serde_json::Value {
         gpu_cache_usage_perc,
         gpu_prefix_cache_hit_rate,
     };
-    println!("stats out: {:?}", stats);
+    tracing::info!("Stats: {stats:?}");
     serde_json::to_value(stats).unwrap()
 }
 
 async fn backend(runtime: DistributedRuntime) -> Result<()> {
     let namespace = runtime.namespace("dynamo")?;
+    // we must first create a service, then we can attach one or more endpoints
+    let component = namespace
+        .component("my_component")?
+        .service_builder()
+        .create()
+        .await?;
+    let endpoint = component.endpoint("my_endpoint");
+    tracing::info!("Starting Mock Worker on Endpoint: {}", endpoint.path());
 
     // Spawn background task for publishing KV hit rate events
     let namespace_clone = namespace.clone();
@@ -137,17 +145,11 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> {
         mock_event_publisher(namespace_clone).await;
     });
 
-    // attach an ingress to an engine
+    // Attach an ingress to the engine
     let ingress = Ingress::for_engine(MockRequestHandler::new())?;
 
-    // make the ingress discoverable via a component service
-    // we must first create a service, then we can attach one more more endpoints
-    namespace
-        .component("backend")?
-        .service_builder()
-        .create()
-        .await?
-        .endpoint("generate")
+    // Make the ingress discoverable via a component service
+    endpoint
         .endpoint_builder()
         // Dummy stats handler to demonstrate how to attach a custom stats handler
         .stats_handler(mock_stats_handler)
```
```diff
@@ -362,6 +362,8 @@ pub struct PrometheusMetrics {
     load_avg: prometheus::GaugeVec,
     load_std: prometheus::GaugeVec,
     // KV hit rate metrics
+    kv_hit_rate_percent: prometheus::GaugeVec,
+    // FIXME: These are currently unused outside of mock_worker
     kv_hit_rate_isl_blocks: prometheus::CounterVec,
     kv_hit_rate_overlap_blocks: prometheus::CounterVec,
 }
@@ -400,9 +402,15 @@ impl PrometheusMetrics {
                 "Load standard deviation across workers",
                 &["component", "endpoint"]
             )?,
-            // TODO: The cumulative isl/overlap metrics are monotonically increasing
-            // and may overflow at some point, we may want to periodically reset them.
-            // KV hit rate metrics
+            // KV hit rate (ForwardPassMetrics)
+            kv_hit_rate_percent: register_gauge_vec!(
+                "llm_kv_hit_rate_percent",
+                "KV hit rate percentage per worker",
+                &["component", "endpoint", "worker_id"]
+            )?,
+            // FIXME: Cleanup/remove event based metrics after finalizing
+            // metrics collection approach with vllm/trtllm workers.
+            // Event-based KV hit rate metrics (not currently used outside mock worker)
             kv_hit_rate_isl_blocks: register_counter_vec!(
                 "llm_kv_hit_rate_isl_blocks",
                 "Cumulative count of ISL blocks in KV hit rate events",
@@ -485,6 +493,12 @@ impl PrometheusMetrics {
                 &worker_id,
                 metrics.request_total_slots as f64,
             );
+            self.set_worker_gauge(
+                &self.kv_hit_rate_percent,
+                config,
+                &worker_id,
+                metrics.gpu_prefix_cache_hit_rate as f64,
+            );
         }
 
         // Update aggregate metrics
@@ -541,7 +555,7 @@ impl PrometheusMetrics {
         if cumulative_isl > 0.0 {
             let cumulative_hit_rate = (cumulative_overlap / cumulative_isl) * 100.0;
-            tracing::info!(
+            tracing::debug!(
                 "Estimated Cumulative KV hit rate: {cumulative_hit_rate:.2}% (Overlap: {cumulative_overlap} / ISL: {cumulative_isl})"
             );
         }
@@ -563,11 +577,6 @@ pub async fn collect_endpoints(
         .filter(|e| e.subject.starts_with(subject))
         .collect::<Vec<_>>();
     tracing::debug!("Endpoints: {endpoints:?}");
-    if endpoints.is_empty() {
-        tracing::warn!("No endpoints found matching subject {subject}");
-    }
     Ok(endpoints)
 }
```
```diff
@@ -61,7 +61,7 @@ struct Args {
     endpoint: String,
 
     /// Polling interval in seconds for scraping dynamo endpoint stats (minimum 1 second)
-    #[arg(long, default_value = "2")]
+    #[arg(long, default_value = "1")]
     poll_interval: u64,
 
     /// Host for serving or pushing prometheus metrics (default: 0.0.0.0)
@@ -137,9 +137,9 @@ async fn app(runtime: Runtime) -> Result<()> {
     let target_component = namespace.component(&config.component_name)?;
     let target_endpoint = target_component.endpoint(&config.endpoint_name);
 
-    let service_name = target_component.service_name();
+    let service_path = target_endpoint.path();
     let service_subject = target_endpoint.subject();
-    tracing::info!("Scraping service {service_name} and filtering on subject {service_subject}");
+    tracing::info!("Scraping endpoint {service_path} for stats");
 
     let token = drt.primary_lease().child_token();
     let event_name = format!("l2c.{}.{}", config.component_name, config.endpoint_name);
@@ -165,9 +165,10 @@ async fn app(runtime: Runtime) -> Result<()> {
     metrics_collector.lock().await.start(metrics_mode)?;
 
+    // TODO: Consider removing event subscription until metrics are more standardized
     // Subscribe to KV hit rate events
     let kv_hit_rate_subject = KV_HIT_RATE_SUBJECT;
-    tracing::info!("Subscribing to KV hit rate events on subject: {kv_hit_rate_subject}");
+    tracing::debug!("Subscribing to KV hit rate events on subject: {kv_hit_rate_subject}");
 
     // Clone fields for the event subscription task
     let config_clone = config.clone();
@@ -186,7 +187,7 @@ async fn app(runtime: Runtime) -> Result<()> {
                     // TODO: Lower to debug
                     let cache_hit_pct =
                         (event.overlap_blocks as f64 / event.isl_blocks as f64) * 100.0;
-                    tracing::info!(
+                    tracing::debug!(
                         "Received KV hit rate event: worker_id={}, isl_blocks={}, overlap_blocks={}, cache_hit_pct={:.2}%",
                         event.worker_id,
                         event.isl_blocks,
@@ -226,7 +227,11 @@ async fn app(runtime: Runtime) -> Result<()> {
             collect_endpoints(&target_component, &service_subject, scrape_timeout).await?;
         let metrics = extract_metrics(&endpoints);
         let processed = postprocess_metrics(&metrics, &endpoints);
-        tracing::debug!("Aggregated metrics: {processed:?}");
+        if processed.endpoints.is_empty() {
+            tracing::warn!("No endpoints found matching {service_path}");
+        } else {
+            tracing::info!("Aggregated metrics: {processed:?}");
+        }
 
         // Update Prometheus metrics
         metrics_collector.lock().await.update(&config, &processed);
```
```diff
@@ -90,7 +90,8 @@ RUN cargo build --release --locked --features mistralrs,sglang,python && \
     cp target/release/dynamo-run /usr/local/bin && \
     cp target/release/http /usr/local/bin && \
     cp target/release/llmctl /usr/local/bin && \
-    cp target/release/metrics /usr/local/bin
+    cp target/release/metrics /usr/local/bin && \
+    cp target/release/mock_worker /usr/local/bin
 
 COPY deploy/dynamo/sdk /workspace/deploy/dynamo/sdk
```
```diff
@@ -266,7 +266,8 @@ RUN cargo build --release --locked --features mistralrs,sglang,vllm,python && \
    cp target/release/dynamo-run /usr/local/bin && \
    cp target/release/http /usr/local/bin && \
    cp target/release/llmctl /usr/local/bin && \
-   cp target/release/metrics /usr/local/bin
+   cp target/release/metrics /usr/local/bin && \
+   cp target/release/mock_worker /usr/local/bin
 
 COPY deploy/dynamo/sdk /workspace/deploy/dynamo/sdk
 
 # Build dynamo wheel
```
```diff
@@ -59,6 +59,8 @@ services:
       - ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
       - ./metrics/grafana-dashboard-providers.yml:/etc/grafana/provisioning/dashboards/dashboard-providers.yml
     environment:
+      # Port 3000 is used by "dynamo serve", so use 3001
+      - GF_SERVER_HTTP_PORT=3001
       - GF_SECURITY_ADMIN_USER=admin
       - GF_SECURITY_ADMIN_PASSWORD=admin
       - GF_USERS_ALLOW_SIGN_UP=false
@@ -68,7 +70,7 @@ services:
     restart: unless-stopped
     # TODO: Use more explicit networking setup when metrics is containerized
     #ports:
-    #  - "3000:3000"
+    #  - "3001:3001"
     #networks:
     #  - monitoring
     network_mode: "host"
```
````diff
@@ -25,7 +25,7 @@ This directory contains configuration for visualizing metrics from the metrics a
    ```
 5. Web servers started:
-   - Grafana: `http://localhost:3000` (default login: admin/admin) (started by docker compose)
+   - Grafana: `http://localhost:3001` (default login: admin/admin) (started by docker compose)
    - Prometheus Server: `http://localhost:9090` (started by docker compose)
    - Prometheus Metrics Endpoint: `http://localhost:9091/metrics` (started by `components/metrics` application)
@@ -54,15 +54,14 @@ The following configuration files should be present in this directory:
 ## Metrics
 
-The prometheus service exposes the following metrics:
+The prometheus metrics endpoint exposes the following metrics:
+- `llm_requests_active_slots`: Number of currently active request slots per worker
+- `llm_requests_total_slots`: Total available request slots per worker
+- `llm_kv_blocks_active`: Number of active KV blocks per worker
+- `llm_kv_blocks_total`: Total KV blocks available per worker
+- `llm_kv_hit_rate_percent`: Cumulative KV Cache hit percent per worker
 - `llm_load_avg`: Average load across workers
 - `llm_load_std`: Load standard deviation across workers
-- `llm_requests_active_slots`: Number of currently active request slots
-- `llm_requests_total_slots`: Total available request slots
-- `llm_kv_blocks_active`: Number of active KV blocks
-- `llm_kv_blocks_total`: Total KV blocks available
-- `llm_kv_hit_rate_isl_blocks`: Cumulative count of ISL blocks in KV hit rate events
-- `llm_kv_hit_rate_overlap_blocks`: Cumulative count of overlapping blocks in KV hit rate events
 
 ## Troubleshooting
````
```diff
@@ -380,14 +380,6 @@
               {
                 "color": "green",
                 "value": null
-              },
-              {
-                "color": "yellow",
-                "value": 50
-              },
-              {
-                "color": "red",
-                "value": 80
               }
             ]
           },
@@ -415,7 +407,7 @@
         "showThresholdMarkers": true
       },
       "pluginVersion": "10.0.0",
-      "title": "Cumulative KV Cache Hit Rate",
+      "title": "Average KV Cache Hit Rate",
       "type": "gauge",
       "targets": [
         {
@@ -424,7 +416,7 @@
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "100 * sum(llm_kv_hit_rate_overlap_blocks{component=\"$component\", endpoint=\"$endpoint\"}) / sum(llm_kv_hit_rate_isl_blocks{component=\"$component\", endpoint=\"$endpoint\"})",
+          "expr": "100 * avg(llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})",
           "legendFormat": "__auto",
           "range": true,
           "refId": "A"
@@ -620,7 +612,7 @@
             "uid": "prometheus"
           },
           "editorMode": "code",
-          "expr": "100 * llm_kv_hit_rate_overlap_blocks{component=\"$component\", endpoint=\"$endpoint\"} / llm_kv_hit_rate_isl_blocks{component=\"$component\", endpoint=\"$endpoint\"}",
+          "expr": "100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"}",
          "legendFormat": "Worker {{worker_id}}",
          "range": true,
          "refId": "A"
@@ -703,7 +695,7 @@
          "sort": "none"
        }
      },
-     "title": "Cumulative KV Cache Hit Rate",
+     "title": "Average KV Cache Hit Rate",
      "type": "timeseries",
      "targets": [
        {
@@ -712,8 +704,8 @@
          "uid": "prometheus"
        },
        "editorMode": "code",
-        "expr": "100 * sum(llm_kv_hit_rate_overlap_blocks{component=\"$component\", endpoint=\"$endpoint\"}) / sum(llm_kv_hit_rate_isl_blocks{component=\"$component\", endpoint=\"$endpoint\"})",
-        "legendFormat": "Overall Hit Rate",
+        "expr": "avg(100 * llm_kv_hit_rate_percent{component=\"$component\", endpoint=\"$endpoint\"})",
+        "legendFormat": "Average Hit Rate",
        "range": true,
        "refId": "A"
      }
```
```diff
@@ -59,6 +59,7 @@ dynamo-run = "dynamo.sdk.cli.run_executable:dynamo_run"
 llmctl = "dynamo.sdk.cli.run_executable:llmctl"
 http = "dynamo.sdk.cli.run_executable:http"
 metrics = "dynamo.sdk.cli.run_executable:metrics"
+mock_worker = "dynamo.sdk.cli.run_executable:mock_worker"
 
 [build-system]
 requires = ["hatchling"]
@@ -74,6 +75,7 @@ packages = ["deploy/dynamo/sdk/src/dynamo"]
 "target/release/llmctl" = "dynamo/sdk/cli/bin/llmctl"
 "target/release/http" = "dynamo/sdk/cli/bin/http"
 "target/release/metrics" = "dynamo/sdk/cli/bin/metrics"
+"target/release/mock_worker" = "dynamo/sdk/cli/bin/mock_worker"
 
 [tool.codespell]
 # note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
@@ -172,4 +174,4 @@ check_untyped_defs = true
 # Skip mypy analysis on internal dependencies of vllm
 module = ["vllm.*", "bentoml.*", "fs.*", "_bentoml_sdk.*"]
 follow_imports = "skip"
-ignore_missing_imports = true
\ No newline at end of file
+ignore_missing_imports = true
```