feat: Base metrics: add generic ingress handler metrics (#2090)

Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>

feat: Base metrics: add generic ingress handler metrics (#2090)
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
615580d8 · Keiven C · GitHub · e2a514b2 · 615580d8 · 615580d8
Unverified Commit 615580d8 authored Jul 28, 2025 by Keiven C Committed by GitHub Jul 28, 2025
19 changed files
--- a/deploy/metrics/README.md
+++ b/deploy/metrics/README.md
@@ -19,6 +19,8 @@ graph TD
        PROMETHEUS -->|:9401/metrics| DCGM_EXPORTER[dcgm-exporter :9401]
        PROMETHEUS -->|:7777/metrics| NATS_PROM_EXP
        PROMETHEUS -->|:8080/metrics| DYNAMOFE[Dynamo HTTP FE :8080]
+        PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081]
+        DYNAMOFE --> DYNAMOBACKEND
        GRAFANA -->|:9090/query API| PROMETHEUS
    end
 ```
@@ -34,12 +36,14 @@ As of Q2 2025, Dynamo HTTP Frontend metrics are exposed when you build container
 2. Start Dynamo dependencies. Assume you're at the root dynamo path:

   ```bash
-   docker compose -f deploy/docker-compose.yml up -d  # Minimum components for Dynamo: etcd/nats/dcgm-exporter
-   # or
-   docker compose -f deploy/docker-compose.yml --profile metrics up -d  # In addition to the above, start Prometheus & Grafana
+   # Start the basic services (etcd & natsd), along with Prometheus and Grafana
+   docker compose -f deploy/docker-compose.yml --profile metrics up -d
+
+   # Or, to start only the minimum components for Dynamo: etcd/nats/dcgm-exporter
+   docker compose -f deploy/docker-compose.yml up -d
   ```

-   To target specific GPU(s), export the variable below before running Docker Compose:
+   Optional: To target specific GPU(s), export the variable below before running Docker Compose:
   ```bash
   export CUDA_VISIBLE_DEVICES=0,2
   ```
@@ -63,9 +67,15 @@ As of Q2 2025, Dynamo HTTP Frontend metrics are exposed when you build container

 ### Prometheus

-The Prometheus configuration is defined in [prometheus.yml](./prometheus.yml). It is configured to scrape metrics from the metrics aggregation service endpoint.
+The Prometheus configuration is specified in [prometheus.yml](./prometheus.yml). This file is set up to collect metrics from the metrics aggregation service endpoint.
+
+Please be aware that you might need to modify the target settings to align with your specific host configuration and network environment.
+
+After changing `prometheus.yml`, you must reload the configuration by recreating the Prometheus container with the command below. Sending a `kill -HUP` signal alone is not sufficient, because the volume that contains `prometheus.yml` is cached.

-Note: You may need to adjust the target based on your host configuration and network setup.
+```bash
+docker compose -f deploy/docker-compose.yml up prometheus -d --force-recreate
+```

 ### Grafana

@@ -82,11 +92,13 @@ The following configuration files should be present in this directory:
 - [grafana-datasources.yml](./grafana-datasources.yml): Contains Grafana datasource configuration
 - [grafana_dashboards/grafana-dashboard-providers.yml](./grafana_dashboards/grafana-dashboard-providers.yml): Contains Grafana dashboard provider configuration
 - [grafana_dashboards/grafana-dynamo-dashboard.json](./grafana_dashboards/grafana-dynamo-dashboard.json): A general Dynamo Dashboard for both SW and HW metrics.
- [grafana_dashboards/grafana-llm-metrics.json](./grafana_dashboards/grafana-llm-metrics.json): Contains Grafana dashboard configuration for LLM specific metrics.
 - [grafana_dashboards/grafana-dcgm-metrics.json](./grafana_dashboards/grafana-dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics
+- [grafana_dashboards/grafana-llm-metrics.json](./grafana_dashboards/grafana-llm-metrics.json): This file, which is being phased out, contains the Grafana dashboard configuration for LLM-specific metrics. It requires an additional `metrics` component to operate concurrently. A new version is under development.

 ## Running the example `metrics` component

+IMPORTANT: This section is being phased out, and some metrics may not function as expected. A new solution is under development.
+
 When you run the example [components/metrics](../../components/metrics/README.md) component, it exposes a Prometheus /metrics endpoint with the following metrics (defined in [../../components/metrics/src/lib.rs](../../components/metrics/src/lib.rs)):
 - `llm_requests_active_slots`: Number of currently active request slots per worker
 - `llm_requests_total_slots`: Total available request slots per worker

--- a/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json
+++ b/deploy/metrics/grafana_dashboards/grafana-dynamo-dashboard.json
@@ -19,7 +19,7 @@
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
-  "id": 1,
+  "id": 4,
  "links": [],
  "panels": [
    {
@@ -112,7 +112,7 @@
          "refId": "A"
        }
      ],
-      "title": "Requests / Sec",
+      "title": "Frontend Requests / Sec",
      "type": "timeseries"
    },
    {
@@ -205,7 +205,7 @@
          "refId": "A"
        }
      ],
-      "title": "Avg Time to First Token",
+      "title": "Frontend Avg Time to First Token",
      "type": "timeseries"
    },
    {
@@ -298,7 +298,7 @@
          "refId": "A"
        }
      ],
-      "title": "Avg Inter-Token Latency",
+      "title": "Frontend Avg Inter-Token Latency",
      "type": "timeseries"
    },
    {
@@ -391,7 +391,7 @@
          "refId": "A"
        }
      ],
-      "title": "Avg Request Duration",
+      "title": "Frontend Avg Request Duration",
      "type": "timeseries"
    },
    {
@@ -497,7 +497,7 @@
          "refId": "B"
        }
      ],
-      "title": "Avg Input/Output Sequence Length",
+      "title": "Frontend Avg Input/Output Sequence Length",
      "type": "timeseries"
    },
    {
@@ -611,17 +611,406 @@
      ],
      "title": "DCGM GPU Utilization",
      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "P1809F7CD0C75ACF3"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 6,
+        "x": 0,
+        "y": 16
+      },
+      "id": 19,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "12.0.1",
+      "targets": [
+        {
+          "editorMode": "code",
+          "expr": "rate(dynamo_response_bytes_total{endpoint=\"generate\"}[1m])",
+          "legendFormat": "Response bytes",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "P1809F7CD0C75ACF3"
+          },
+          "editorMode": "code",
+          "expr": "rate(dynamo_request_bytes_total{endpoint=\"generate\"}[1m])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "Request bytes",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "dynamo.vllm bytes / sec",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "P1809F7CD0C75ACF3"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 6,
+        "x": 6,
+        "y": 16
+      },
+      "id": 18,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "12.0.1",
+      "targets": [
+        {
+          "editorMode": "code",
+          "expr": "rate(dynamo_requests_total{endpoint=\"generate\"}[1m])",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "P1809F7CD0C75ACF3"
+          },
+          "editorMode": "code",
+          "expr": "",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "B"
+        }
+      ],
+      "title": "dynamo.vllm requests / sec",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "P1809F7CD0C75ACF3"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 6,
+        "x": 12,
+        "y": 16
+      },
+      "id": 20,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "12.0.1",
+      "targets": [
+        {
+          "editorMode": "code",
+          "expr": "dynamo_request_duration_seconds_sum / dynamo_request_duration_seconds_count",
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "dynamo.vllm Avg Request Duration (seconds)",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "P1809F7CD0C75ACF3"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green"
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 6,
+        "x": 18,
+        "y": 16
+      },
+      "id": 21,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "hideZeros": false,
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "pluginVersion": "12.0.1",
+      "targets": [
+        {
+          "editorMode": "code",
+          "expr": "rate(dynamo_errors_total{endpoint=\"generate\"}[1m])",
+          "legendFormat": "{{error_type}}",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "dynamo.vllm Avg Errors / sec",
+      "type": "timeseries"
    }
  ],
  "preload": false,
  "refresh": "",
  "schemaVersion": 41,
-  "tags": [
-    "Dynamo",
-    "DCGM",
-    "etcd",
-    "NATS"
-  ],
+  "tags": [],
  "templating": {
    "list": []
  },
@@ -632,6 +1021,6 @@
  "timepicker": {},
  "timezone": "browser",
  "title": "Dynamo Dashboard",
-  "uid": "a7d3733f-f8e7-423a-ab4b-b18e3d7d0357",
-  "version": 5
+  "uid": "97ae8df9-138a-4f7a-9b0f-635b77d818fe",
+  "version": 1
 }
\ No newline at end of file
--- a/deploy/metrics/prometheus.yml
+++ b/deploy/metrics/prometheus.yml
@@ -34,11 +34,18 @@ scrape_configs:
      - targets: ['dcgm-exporter:9401']  # on the "monitoring" network

  # This is a demo service that needs to be launched manually. See components/metrics/README.md
-  # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 8000/tcp
-  - job_name: 'dynamo-backend'
+  # Note that you may need to open this port in your host firewall. On Ubuntu: sudo ufw allow 8080/tcp
+  # You can also force the port, if the default is different: python -m dynamo.frontend --http-port 8080
+  - job_name: 'dynamo-frontend'
    scrape_interval: 10s
    static_configs:
-      - targets: ['host.docker.internal:8000']  # on the "monitoring" network
+      - targets: ['host.docker.internal:8080']  # on the "monitoring" network
+
+  # Launch via: DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 dynamo.<backend> ...
+  - job_name: 'dynamo-backend'
+    scrape_interval: 6s
+    static_configs:
+      - targets: ['host.docker.internal:8081']

  # This is another demo aggregator that needs to be launched manually. See components/metrics/README.md
  # Note that you may need to disable the firewall on your host. On Ubuntu: sudo ufw allow 9091/tcp

--- a/examples/multimodal/README.md
+++ b/examples/multimodal/README.md
--- a/lib/runtime/examples/Cargo.lock
+++ b/lib/runtime/examples/Cargo.lock
@@ -995,8 +995,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
 dependencies = [
 "cfg-if 1.0.0",
+ "js-sys",
 "libc",
 "wasi 0.11.0+wasi-snapshot-preview1",
+ "wasm-bindgen",
 ]

 [[package]]
@@ -1006,9 +1008,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
 dependencies = [
 "cfg-if 1.0.0",
+ "js-sys",
 "libc",
 "r-efi",
 "wasi 0.14.2+wasi-0.2.4",
+ "wasm-bindgen",
 ]

 [[package]]
@@ -1134,6 +1138,23 @@ dependencies = [
 "want",
 ]

+[[package]]
+name = "hyper-rustls"
+version = "0.27.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
+dependencies = [
+ "http",
+ "hyper",
+ "hyper-util",
+ "rustls",
+ "rustls-pki-types",
+ "tokio",
+ "tokio-rustls",
+ "tower-service",
+ "webpki-roots 1.0.2",
+]
+
 [[package]]
 name = "hyper-timeout"
 version = "0.5.2"
@@ -1149,17 +1170,21 @@ dependencies = [

 [[package]]
 name = "hyper-util"
-version = "0.1.11"
+version = "0.1.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "497bbc33a26fdd4af9ed9c70d63f61cf56a938375fbb32df34db9b1cd6d643f2"
+checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e"
 dependencies = [
+ "base64",
 "bytes",
 "futures-channel",
+ "futures-core",
 "futures-util",
 "http",
 "http-body",
 "hyper",
+ "ipnet",
 "libc",
+ "percent-encoding",
 "pin-project-lite",
 "socket2",
 "tokio",
@@ -1371,6 +1396,22 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "ipnet"
+version = "2.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
+
+[[package]]
+name = "iri-string"
+version = "0.7.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2"
+dependencies = [
+ "memchr",
+ "serde",
+]
+
 [[package]]
 name = "itertools"
 version = "0.14.0"
@@ -1478,6 +1519,12 @@ version = "0.4.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"

+[[package]]
+name = "lru-slab"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
+
 [[package]]
 name = "matchers"
 version = "0.1.0"
@@ -1993,6 +2040,61 @@ dependencies = [
 "thiserror 1.0.69",
 ]

+[[package]]
+name = "quinn"
+version = "0.11.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "626214629cda6781b6dc1d316ba307189c85ba657213ce642d9c77670f8202c8"
+dependencies = [
+ "bytes",
+ "cfg_aliases",
+ "pin-project-lite",
+ "quinn-proto",
+ "quinn-udp",
+ "rustc-hash",
+ "rustls",
+ "socket2",
+ "thiserror 2.0.12",
+ "tokio",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-proto"
+version = "0.11.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49df843a9161c85bb8aae55f101bc0bac8bcafd637a620d9122fd7e0b2f7422e"
+dependencies = [
+ "bytes",
+ "getrandom 0.3.2",
+ "lru-slab",
+ "rand 0.9.1",
+ "ring",
+ "rustc-hash",
+ "rustls",
+ "rustls-pki-types",
+ "slab",
+ "thiserror 2.0.12",
+ "tinyvec",
+ "tracing",
+ "web-time",
+]
+
+[[package]]
+name = "quinn-udp"
+version = "0.5.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fcebb1209ee276352ef14ff8732e24cc2b02bbac986cd74a4c81bcb2f9881970"
+dependencies = [
+ "cfg_aliases",
+ "libc",
+ "once_cell",
+ "socket2",
+ "tracing",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.40"
@@ -2140,6 +2242,47 @@ version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"

+[[package]]
+name = "reqwest"
+version = "0.12.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cbc931937e6ca3a06e3b6c0aa7841849b160a90351d6ab467a8b9b9959767531"
+dependencies = [
+ "base64",
+ "bytes",
+ "futures-core",
+ "futures-util",
+ "http",
+ "http-body",
+ "http-body-util",
+ "hyper",
+ "hyper-rustls",
+ "hyper-util",
+ "js-sys",
+ "log",
+ "percent-encoding",
+ "pin-project-lite",
+ "quinn",
+ "rustls",
+ "rustls-pki-types",
+ "serde",
+ "serde_json",
+ "serde_urlencoded",
+ "sync_wrapper",
+ "tokio",
+ "tokio-rustls",
+ "tokio-util",
+ "tower 0.5.2",
+ "tower-http",
+ "tower-service",
+ "url",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "wasm-streams",
+ "web-sys",
+ "webpki-roots 1.0.2",
+]
+
 [[package]]
 name = "ring"
 version = "0.17.14"
@@ -2160,6 +2303,12 @@ version = "0.1.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"

+[[package]]
+name = "rustc-hash"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
+
 [[package]]
 name = "rustc_version"
 version = "0.4.1"
@@ -2224,6 +2373,9 @@ name = "rustls-pki-types"
 version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c"
+dependencies = [
+ "web-time",
+]

 [[package]]
 name = "rustls-webpki"
@@ -2542,6 +2694,9 @@ name = "sync_wrapper"
 version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
+dependencies = [
+ "futures-core",
+]

 [[package]]
 name = "synstructure"
@@ -2574,9 +2729,12 @@ dependencies = [
 "dynamo-runtime",
 "futures",
 "prometheus",
+ "rand 0.9.1",
+ "reqwest",
 "serde",
 "serde_json",
 "tokio",
+ "tokio-test",
 ]

 [[package]]
@@ -2691,6 +2849,21 @@ dependencies = [
 "zerovec",
 ]

+[[package]]
+name = "tinyvec"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
 [[package]]
 name = "tokio"
 version = "1.44.2"
@@ -2741,6 +2914,19 @@ dependencies = [
 "tokio",
 ]

+[[package]]
+name = "tokio-test"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2468baabc3311435b55dd935f702f42cd1b8abb7e754fb7dfb16bd36aa88f9f7"
+dependencies = [
+ "async-stream",
+ "bytes",
+ "futures-core",
+ "tokio",
+ "tokio-stream",
+]
+
 [[package]]
 name = "tokio-util"
 version = "0.7.15"
@@ -2772,7 +2958,7 @@ dependencies = [
 "tokio",
 "tokio-rustls",
 "tokio-util",
- "webpki-roots",
+ "webpki-roots 0.26.8",
 ]

 [[package]]
@@ -2898,6 +3084,24 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "tower-http"
+version = "0.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2"
+dependencies = [
+ "bitflags 2.9.0",
+ "bytes",
+ "futures-util",
+ "http",
+ "http-body",
+ "iri-string",
+ "pin-project-lite",
+ "tower 0.5.2",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "tower-layer"
 version = "0.3.3"
@@ -3174,6 +3378,19 @@ dependencies = [
 "wasm-bindgen-shared",
 ]

+[[package]]
+name = "wasm-bindgen-futures"
+version = "0.4.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61"
+dependencies = [
+ "cfg-if 1.0.0",
+ "js-sys",
+ "once_cell",
+ "wasm-bindgen",
+ "web-sys",
+]
+
 [[package]]
 name = "wasm-bindgen-macro"
 version = "0.2.100"
@@ -3206,6 +3423,39 @@ dependencies = [
 "unicode-ident",
 ]

+[[package]]
+name = "wasm-streams"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65"
+dependencies = [
+ "futures-util",
+ "js-sys",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "web-sys",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.77"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "web-time"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "webpki-roots"
 version = "0.26.8"
@@ -3215,6 +3465,15 @@ dependencies = [
 "rustls-pki-types",
 ]

+[[package]]
+name = "webpki-roots"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e8983c3ab33d6fb807cfcdad2491c4ea8cbc8ed839181c7dfd9c67c83e261b2"
+dependencies = [
+ "rustls-pki-types",
+]
+
 [[package]]
 name = "winapi"
 version = "0.2.8"

--- a/lib/runtime/examples/Cargo.toml
+++ b/lib/runtime/examples/Cargo.toml
@@ -33,4 +33,4 @@ repository = "https://github.com/ai-dynamo/dynamo.git"
 [workspace.dependencies]
 # local or crates.io
 dynamo-runtime = { path = "../" }
-prometheus = { workspace = true }
+prometheus = { version = "0.14" }
--- a/lib/runtime/examples/system_metrics/Cargo.toml
+++ b/lib/runtime/examples/system_metrics/Cargo.toml
@@ -22,6 +22,10 @@ license.workspace = true
 homepage.workspace = true
 repository.workspace = true

+[features]
+default = []
+integration = [] # Integration tests that require NATS
+
 [dependencies]
 dynamo-runtime = { workspace = true }

@@ -31,3 +35,13 @@ serde = { version = "1", features = ["derive"] }
 serde_json = { version = "1" }
 tokio = { version = "1", features = ["full"] }
 prometheus = { version = "0.14" }
+
+[dev-dependencies]
+rand = { version = "0.9.0" }
+reqwest = { version = "0.12.22", default-features = false, features = ["json", "stream", "rustls-tls"] }
+tokio-test = "0.4.4"
+
+[[test]]
+name = "integration_test"
+path = "tests/integration_test.rs"
+required-features = ["integration"]
--- a/lib/runtime/examples/system_metrics/README.md
+++ b/lib/runtime/examples/system_metrics/README.md
-# System Metrics Example
+# Generic Profiling for Work Handlers

-Demonstrates custom metrics and monitoring in Dynamo Runtime using Prometheus.
+This example demonstrates how to add automatic Prometheus metrics profiling to any work handler without modifying the handler code itself.

 ## Overview

- Automatic hierarchical labeling: Runtime automatically adds `namespace` → `component` → `endpoint` labels
- Uses existing Prometheus implementations
- HTTP metrics endpoint automatically added
+The `WorkHandlerMetrics` system provides profiling that is applied to every work handler automatically. It tracks:

-## Quick Start
+- **Request Count**: Total number of requests processed
+- **Request Duration**: Time spent processing each request
+- **Request/Response Bytes**: Total bytes received and sent
+- **Error Count**: Total number of errors encountered

-### Build
-```bash
-cd lib/runtime/examples/system_metrics
-cargo build
+Additionally, the example demonstrates how to add custom metrics with data bytes tracking in `MySystemStatsMetrics`.
+
+## How It Works
+
+**Automatic Metrics**: All work handlers automatically get profiling metrics without any code changes.
+
+**Custom Metrics**: If you want to add custom metrics IN ADDITION to the automatic ones, you can use the `add_metrics` method:
+
+```rust
+use dynamo_runtime::pipeline::network::Ingress;
+
+// Automatic profiling - no code changes needed!
+let ingress = Ingress::for_engine(my_handler)?;
+
+// Optional: Add custom metrics IN ADDITION to automatic ones
+ingress.add_metrics(&endpoint)?;
 ```

-### Run Server
-```bash
-export DYN_LOG=1 DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081
-cargo run --bin system_server
+The endpoint automatically provides proper labeling (namespace, component, endpoint) for all metrics.
+
+## Available Methods
+
+The `Ingress` struct provides methods for metrics:
+
+- **Automatic**: All handlers get profiling metrics automatically
+- `Ingress::add_metrics(&endpoint)` - Add custom metrics IN ADDITION to automatic ones (optional)
+
+## Metrics Generated
+
+### Automatic Metrics (No Code Changes Required)
+The following Prometheus metrics are automatically created for all work handlers:
+
+### Counters
+- `requests_total` - Total requests processed
+- `request_bytes_total` - Total bytes received in requests
+- `response_bytes_total` - Total bytes sent in responses
+- `errors_total` - Total errors encountered (with error_type labels)
+
+### Error Types
+The `errors_total` metric includes the following error types:
+- `deserialization` - Errors parsing request messages
+- `invalid_message` - Unexpected message format
+- `response_stream` - Errors creating response streams
+- `generate` - Errors in request processing
+- `publish_response` - Errors publishing response data
+- `publish_final` - Errors publishing final response
+
+### Histograms
+- `request_duration_seconds` - Request processing time
+
+### Gauges
+- `concurrent_requests` - Number of requests currently being processed
+
+### Custom Metrics (Optional)
+- `my_custom_bytes_processed_total` - Total data bytes processed by system handler (example)
+
+### Labels
+All metrics automatically include these labels from the endpoint:
+- `namespace` - The namespace name
+- `component` - The component name
+- `endpoint` - The endpoint name
+
+## Example Metrics Output
+
+When the system is running, you'll see metrics from the /metrics HTTP path like this:
+
+```prometheus
+# HELP concurrent_requests Number of requests currently being processed by work handler
+# TYPE concurrent_requests gauge
+concurrent_requests{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace"} 0
+
+# HELP my_custom_bytes_processed_total Example of a custom metric. Total number of data bytes processed by system handler
+# TYPE my_custom_bytes_processed_total counter
+my_custom_bytes_processed_total{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace"} 42
+
+# HELP request_bytes_total Total number of bytes received in requests by work handler
+# TYPE request_bytes_total counter
+request_bytes_total{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace"} 1098
+
+# HELP request_duration_seconds Time spent processing requests by work handler
+# TYPE request_duration_seconds histogram
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="0.005"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="0.01"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="0.025"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="0.05"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="0.1"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="0.25"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="0.5"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="1"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="2.5"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="5"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="10"} 3
+request_duration_seconds_bucket{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace",le="+Inf"} 3
+request_duration_seconds_sum{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace"} 0.00048793700000000003
+request_duration_seconds_count{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace"} 3
+
+# HELP requests_total Total number of requests processed by work handler
+# TYPE requests_total counter
+requests_total{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace"} 3
+
+# HELP response_bytes_total Total number of bytes sent in responses by work handler
+# TYPE response_bytes_total counter
+response_bytes_total{component="dyn_example_component",endpoint="dyn_example_endpoint9881",namespace="dyn_example_namespace"} 1917
+
+# HELP uptime_seconds Total uptime of the DistributedRuntime in seconds
+# TYPE uptime_seconds gauge
+uptime_seconds{namespace="http_server"} 1.8226759879999999
 ```

-### Run Client
-```bash
-cargo run --bin system_client
+## Examples
+
+### Example 1: Simple Handler with Automatic Profiling
+
+```rust
+struct SimpleHandler;
+
+#[async_trait]
+impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for SimpleHandler {
+    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
+        // Your business logic here
+        // No need to add any metrics code!
+        Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
+    }
+}
+
+// Automatic profiling - no additional code needed!
+let ingress = Ingress::for_engine(SimpleHandler::new())?;
 ```

-Note: Running the client will increment `service_requests_total`.
+### Example 2: Custom Handler with Data Bytes Tracking

-### View Metrics
-```bash
-curl http://localhost:8081/metrics
+```rust
+struct RequestHandler {
+    metrics: Option<Arc<MySystemStatsMetrics>>,
+}
+
+#[async_trait]
+impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for RequestHandler {
+    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
+        let (data, ctx) = input.into_parts();
+
+        // Track data bytes processed (custom metric)
+        if let Some(metrics) = &self.metrics {
+            metrics.data_bytes_processed.inc_by(data.len() as u64);
+        }
+
+        // Your business logic here...
+
+        Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
+    }
+}
+
+// Create custom metrics and handler
+let system_metrics = MySystemStatsMetrics::from_endpoint(&endpoint)?;
+let handler = RequestHandler::with_metrics(system_metrics);
+let ingress = Ingress::for_engine(handler)?;
+
+// Add custom metrics IN ADDITION to automatic ones
+// You'll get both: automatic metrics (requests_total, request_duration_seconds, etc.)
+// AND custom metrics (my_custom_bytes_processed_total)
+ingress.add_metrics(&endpoint)?;
 ```

-Example output:
+## Benefits
+
+1. **Zero Code Changes**: Existing handlers automatically get profiling metrics
+2. **Simple API**: Just create an Ingress and you get metrics automatically
+3. **Optional Custom Metrics**: Add custom metrics when needed
+4. **Automatic Profiling**: Request count, duration, and error tracking out of the box
+5. **Automatic Labeling**: Endpoint provides proper namespace/component/endpoint labels
+6. **Performance**: Minimal overhead; metrics are only recorded when provided
+
+## Running the Example
+
+**Important**: You must set the `DYN_SYSTEM_PORT` environment variable to specify which port the HTTP server will run on.
+
+```bash
+# Run the system metrics example
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 cargo run --bin system_server
 ```
-# HELP service_request_duration_seconds Time spent processing requests
-# TYPE service_request_duration_seconds histogram
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.005"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.01"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.025"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.05"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.1"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.25"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="0.5"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="1"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="2.5"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="5"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="10"} 2
-service_request_duration_seconds_bucket{component="component",endpoint="endpoint",namespace="system",service="backend",le="+Inf"} 2
-service_request_duration_seconds_sum{component="component",endpoint="endpoint",namespace="system",service="backend"} 0.000022239000000000002
-service_request_duration_seconds_count{component="component",endpoint="endpoint",namespace="system",service="backend"} 2
-# HELP service_requests_total Total number of requests processed
-# TYPE service_requests_total counter
-service_requests_total{component="component",endpoint="endpoint",namespace="system",service="backend"} 2
-# HELP uptime_seconds Total uptime of the DistributedRuntime in seconds
-# TYPE uptime_seconds gauge
-uptime_seconds{namespace="http_server"} 725.997013676
+The server will start an HTTP server on the specified port (8081 in this example) that exposes the Prometheus metrics endpoint at `/metrics`.
+
+
+To run an actual LLM frontend + server (aggregated example), launch both of them. By default, the frontend listens on port 8080.
 ```
+python -m dynamo.frontend &

-## Configuration
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 python -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --no-enable-prefix-caching &
+```
+Then make curl requests to the frontend (see the [main README](../../../../README.md)).

-| Variable | Description | Default |
-|----------|-------------|---------|
-| `DYN_LOG` | Enable logging | `0` |
-| `DYN_SYSTEM_ENABLED` | Enable system metrics | `false` |
-| `DYN_SYSTEM_PORT` | HTTP server port | `8081` |
+## Querying Metrics

-## Metrics
+Once running, you can query the metrics:

- `service_requests_total`: Request counter
- `service_request_duration_seconds`: Request duration histogram
- `uptime_seconds`: Server uptime gauge
+```bash
+# Get all work handler metrics
+curl http://localhost:8081/metrics | grep -E "(requests_total|request_bytes_total|response_bytes_total|errors_total|request_duration_seconds|concurrent_requests)"

-This provides automatic context and grouping for all metrics without manual configuration.
+# Get request count for specific endpoint
+curl http://localhost:8081/metrics | grep 'requests_total{' | grep 'endpoint="dyn_example_endpoint"'

-## Troubleshooting
+# Get request duration histogram
+curl http://localhost:8081/metrics | grep 'request_duration_seconds'

- **Port in use**: Change `DYN_SYSTEM_PORT`
- **Connection refused**: Ensure server is running first
- **No metrics**: Verify `DYN_SYSTEM_ENABLED=true`
\ No newline at end of file
+# Get custom system metrics
+curl http://localhost:8081/metrics | grep 'my_custom_bytes_processed_total'
+```
\ No newline at end of file
--- a/lib/runtime/examples/system_metrics/src/bin/system_client.rs
+++ b/lib/runtime/examples/system_metrics/src/bin/system_client.rs
@@ -14,7 +14,7 @@
 // limitations under the License.

 use futures::StreamExt;
-use system_metrics::DEFAULT_NAMESPACE;
+use system_metrics::{DEFAULT_COMPONENT, DEFAULT_ENDPOINT, DEFAULT_NAMESPACE};

 use dynamo_runtime::{
    logging, pipeline::PushRouter, protocols::annotated::Annotated, utils::Duration,
@@ -31,9 +31,9 @@ async fn app(runtime: Runtime) -> Result<()> {
    let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;

    let namespace = distributed.namespace(DEFAULT_NAMESPACE)?;
-    let component = namespace.component("component")?;
+    let component = namespace.component(DEFAULT_COMPONENT)?;

-    let client = component.endpoint("endpoint").client().await?;
+    let client = component.endpoint(DEFAULT_ENDPOINT).client().await?;

    client.wait_for_instances().await?;
    let router =

--- a/lib/runtime/examples/system_metrics/src/bin/system_server.rs
+++ b/lib/runtime/examples/system_metrics/src/bin/system_server.rs
@@ -13,50 +13,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use system_metrics::{MyStats, DEFAULT_NAMESPACE};
-
-use dynamo_runtime::{
-    logging,
-    metrics::MetricsRegistry,
-    pipeline::{
-        async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
-        ResponseStream, SingleIn,
-    },
-    protocols::annotated::Annotated,
-    stream, DistributedRuntime, Result, Runtime, Worker,
-};
-
-use prometheus::{Counter, Histogram};
-use std::sync::Arc;
-
-/// Service metrics struct using the metric classes from metrics.rs
-pub struct MySystemStatsMetrics {
-    pub request_counter: Arc<Counter>,
-    pub request_duration: Arc<Histogram>,
-}
-
-impl MySystemStatsMetrics {
-    /// Create a new ServiceMetrics instance using the metric backend
-    pub fn new<R: MetricsRegistry>(
-        metrics_registry: Arc<R>,
-    ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
-        let request_counter = metrics_registry.create_counter(
-            "service_requests_total",
-            "Total number of requests processed",
-            &[("service", "backend")],
-        )?;
-        let request_duration = metrics_registry.create_histogram(
-            "service_request_duration_seconds",
-            "Time spent processing requests",
-            &[("service", "backend")],
-            None,
-        )?;
-        Ok(Self {
-            request_counter,
-            request_duration,
-        })
-    }
-}
+use dynamo_runtime::{logging, DistributedRuntime, Result, Runtime, Worker};
+use system_metrics::backend;

 fn main() -> Result<()> {
    logging::init();
@@ -66,74 +24,5 @@ fn main() -> Result<()> {

 async fn app(runtime: Runtime) -> Result<()> {
    let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
-    backend(distributed).await
-}
-
-struct RequestHandler {
-    metrics: Arc<MySystemStatsMetrics>,
-}
-
-impl RequestHandler {
-    fn new(metrics: Arc<MySystemStatsMetrics>) -> Arc<Self> {
-        Arc::new(Self { metrics })
-    }
-}
-
-#[async_trait]
-impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for RequestHandler {
-    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
-        let start_time = std::time::Instant::now();
-
-        // Record request start
-        self.metrics.request_counter.inc();
-
-        let (data, ctx) = input.into_parts();
-
-        let chars = data
-            .chars()
-            .map(|c| Annotated::from_data(c.to_string()))
-            .collect::<Vec<_>>();
-
-        let stream = stream::iter(chars);
-
-        // Record request duration
-        let duration = start_time.elapsed();
-        self.metrics
-            .request_duration
-            .observe(duration.as_secs_f64());
-
-        Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
-    }
-}
-
-async fn backend(drt: DistributedRuntime) -> Result<()> {
-    let endpoint = drt
-        .namespace(DEFAULT_NAMESPACE)?
-        .component("component")?
-        .service_builder()
-        .create()
-        .await?
-        .endpoint("endpoint");
-
-    // make the ingress discoverable via a component service
-    // we must first create a service, then we can attach one more more endpoints
-    // attach an ingress to an engine, with the RequestHandler using the metrics struct
-    let endpoint_metrics = Arc::new(
-        MySystemStatsMetrics::new(Arc::new(endpoint.clone()))
-            .map_err(|e| Error::msg(e.to_string()))?,
-    );
-    let ingress = Ingress::for_engine(RequestHandler::new(endpoint_metrics.clone()))?;
-
-    endpoint
-        .endpoint_builder()
-        .stats_handler(|_stats| {
-            println!("Stats handler called with stats: {:?}", _stats);
-            let stats = MyStats { val: 10 };
-            serde_json::to_value(stats).unwrap()
-        })
-        .handler(ingress)
-        .start()
-        .await?;
-
-    Ok(())
+    backend(distributed, None).await
 }
--- a/lib/runtime/examples/system_metrics/src/lib.rs
+++ b/lib/runtime/examples/system_metrics/src/lib.rs
@@ -13,12 +13,120 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use serde::{Deserialize, Serialize};
+use dynamo_runtime::{
+    metrics::MetricsRegistry,
+    pipeline::{
+        async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut,
+        ResponseStream, SingleIn,
+    },
+    protocols::annotated::Annotated,
+    stream, DistributedRuntime, Result,
+};
+use prometheus::IntCounter;
+use std::sync::Arc;

-pub const DEFAULT_NAMESPACE: &str = "system";
+pub const DEFAULT_NAMESPACE: &str = "dyn_example_namespace";
+pub const DEFAULT_COMPONENT: &str = "dyn_example_component";
+pub const DEFAULT_ENDPOINT: &str = "dyn_example_endpoint";

-#[derive(Serialize, Deserialize)]
-// Dummy Stats object to demonstrate how to attach a custom stats handler
+/// Stats structure returned by the endpoint's stats handler
+#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
 pub struct MyStats {
-    pub val: u32,
+    // Example value for demonstration purposes
+    pub val: i32,
+}
+
+/// Custom metrics for system stats with data bytes tracking
+#[derive(Clone, Debug)]
+pub struct MySystemStatsMetrics {
+    pub data_bytes_processed: Arc<IntCounter>,
+}
+
+impl MySystemStatsMetrics {
+    pub fn from_endpoint(
+        endpoint: &dynamo_runtime::component::Endpoint,
+    ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
+        let data_bytes_processed = endpoint.create_intcounter(
+            "my_custom_bytes_processed_total",
+            "Example of a custom metric. Total number of data bytes processed by system handler",
+            &[],
+        )?;
+
+        Ok(Self {
+            data_bytes_processed,
+        })
+    }
+}
+
+#[derive(Clone)]
+pub struct RequestHandler {
+    metrics: Option<Arc<MySystemStatsMetrics>>,
+}
+
+impl RequestHandler {
+    pub fn new() -> Arc<Self> {
+        Arc::new(Self { metrics: None })
+    }
+
+    pub fn with_metrics(metrics: MySystemStatsMetrics) -> Arc<Self> {
+        Arc::new(Self {
+            metrics: Some(Arc::new(metrics)),
+        })
+    }
+}
+
+#[async_trait]
+impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for RequestHandler {
+    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
+        let (data, ctx) = input.into_parts();
+
+        // Add the input's length in bytes to the custom counter, but only when metrics were supplied
+        if let Some(metrics) = &self.metrics {
+            metrics.data_bytes_processed.inc_by(data.len() as u64);
+        }
+
+        let chars = data
+            .chars()
+            .map(|c| Annotated::from_data(c.to_string()))
+            .collect::<Vec<_>>();
+
+        let stream = stream::iter(chars);
+
+        Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
+    }
+}
+
+/// Creates the example service and starts an endpoint served by a `RequestHandler` with custom metrics.
+/// `endpoint_name` overrides `DEFAULT_ENDPOINT` when `Some`; integration tests reuse this exact setup.
+pub async fn backend(drt: DistributedRuntime, endpoint_name: Option<&str>) -> Result<()> {
+    let endpoint_name = endpoint_name.unwrap_or(DEFAULT_ENDPOINT);
+
+    let endpoint = drt
+        .namespace(DEFAULT_NAMESPACE)?
+        .component(DEFAULT_COMPONENT)?
+        .service_builder()
+        .create()
+        .await?
+        .endpoint(endpoint_name);
+
+    // Create the custom byte counter from this endpoint; panics if creation fails
+    let system_metrics =
+        MySystemStatsMetrics::from_endpoint(&endpoint).expect("Failed to create system metrics");
+
+    // Wrap a RequestHandler that carries the custom metrics in an Ingress
+    let ingress = Ingress::for_engine(RequestHandler::with_metrics(system_metrics))?;
+
+    endpoint
+        .endpoint_builder()
+        .stats_handler(|_stats| {
+            println!("Stats handler called with stats: {:?}", _stats);
+            // TODO(keivenc): return a real stats object
+            let stats = MyStats { val: 10 };
+            serde_json::to_value(stats).unwrap()
+        })
+        .handler(ingress)
+        .start()
+        .await?;
+
+    Ok(())
 }
--- a/lib/runtime/examples/system_metrics/tests/integration_test.rs
+++ b/lib/runtime/examples/system_metrics/tests/integration_test.rs
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#![cfg(feature = "integration")]
+
+use dynamo_runtime::{
+    pipeline::PushRouter, protocols::annotated::Annotated, DistributedRuntime, Result, Runtime,
+};
+use futures::StreamExt;
+use rand::Rng;
+use reqwest;
+use std::env;
+use system_metrics::{backend, DEFAULT_COMPONENT, DEFAULT_ENDPOINT, DEFAULT_NAMESPACE};
+use tokio::time::{sleep, Duration};
+
+#[tokio::test]
+async fn test_backend_with_metrics() -> Result<()> {
+    // Enable the system HTTP server; port "0" presumably requests a free port (the real port is read back below)
+    env::set_var("DYN_SYSTEM_ENABLED", "true");
+    env::set_var("DYN_SYSTEM_PORT", "0");
+
+    // Append a random number from 1000..9999 to the endpoint name to avoid collisions between runs
+    let random_suffix = rand::rng().random_range(1000..9999);
+    let test_endpoint = format!("{}{}", DEFAULT_ENDPOINT, random_suffix);
+
+    // Initialize logging
+    dynamo_runtime::logging::init();
+
+    // Create a runtime and distributed runtime for the backend
+    let runtime = Runtime::from_current()?;
+    let distributed = DistributedRuntime::from_settings(runtime.clone()).await?;
+
+    // Read the port the HTTP server is actually using; panic if the server was not started
+    let http_server_info = distributed.http_server_info();
+    let metrics_port = match http_server_info {
+        Some(info) => {
+            println!("HTTP server running on: {}", info.address());
+            info.port()
+        }
+        None => {
+            panic!("HTTP server not started - check DYN_SYSTEM_ENABLED environment variable");
+        }
+    };
+
+    // Start the backend in a separate task, serving the randomized endpoint name
+    let test_endpoint_clone = test_endpoint.clone();
+    let backend_handle =
+        tokio::spawn(async move { backend(distributed, Some(&test_endpoint_clone)).await });
+
+    // Fixed 1s pause to let the backend start; the client also waits for instances below
+    sleep(Duration::from_millis(1000)).await;
+
+    // Create a client runtime to connect to the backend
+    let client_runtime = Runtime::from_current()?;
+    let client_distributed = DistributedRuntime::from_settings(client_runtime.clone()).await?;
+
+    // Connect to the backend similar to system_client.rs
+    let namespace = client_distributed.namespace(DEFAULT_NAMESPACE)?;
+    let component = namespace.component(DEFAULT_COMPONENT)?;
+    let client = component.endpoint(&test_endpoint).client().await?;
+
+    // Wait for backend instances to be available
+    client.wait_for_instances().await?;
+
+    // Create a router and send some requests to generate metrics
+    let router =
+        PushRouter::<String, Annotated<String>>::from_client(client, Default::default()).await?;
+
+    // Send three test requests so the request/response counters become non-zero
+    for i in 0..3 {
+        let test_message = format!("test message {}", i);
+        let mut stream = router.random(test_message.clone().into()).await?;
+
+        // Drain the response stream, printing each item
+        while let Some(resp) = stream.next().await {
+            println!("Response {}: {:?}", i, resp);
+        }
+
+        // Small delay between requests
+        sleep(Duration::from_millis(100)).await;
+    }
+
+    // Give some time for metrics to be updated
+    sleep(Duration::from_millis(500)).await;
+
+    // Now fetch the HTTP metrics endpoint using the dynamic port
+    let metrics_url = format!("http://localhost:{}/metrics", metrics_port);
+
+    println!("Fetching metrics from: {}", metrics_url);
+
+    // Make HTTP request to get metrics
+    let client = reqwest::Client::new();
+    let response = client.get(&metrics_url).send().await;
+
+    match response {
+        Ok(response) => {
+            if response.status().is_success() {
+                let metrics_content = response
+                    .text()
+                    .await
+                    .unwrap_or_else(|_| "Failed to read response body".to_string());
+
+                println!("=== METRICS CONTENT ===");
+                println!("{}", metrics_content);
+                println!("=== END METRICS CONTENT ===");
+
+                // Parse and verify ingress metrics are greater than 0 (except concurrent_requests)
+                verify_ingress_metrics_greater_than_0(&metrics_content);
+
+                println!("Successfully retrieved and verified metrics!");
+            } else {
+                println!("HTTP request failed with status: {}", response.status());
+                panic!("Failed to get metrics: HTTP {}", response.status());
+            }
+        }
+        Err(e) => {
+            println!("Failed to connect to metrics endpoint: {}", e);
+            panic!("Failed to connect to metrics endpoint: {}", e);
+        }
+    }
+
+    // Shut down the client runtime
+    client_runtime.shutdown();
+
+    // Abort the spawned backend task
+    backend_handle.abort();
+
+    Ok(())
+}
+
+fn verify_ingress_metrics_greater_than_0(metrics_content: &str) {
+    // Metrics that must be > 0 (concurrent_requests is excluded since it can be 0); each name is matched as a substring of the first line without '#'
+    let metrics_to_verify = [
+        "my_custom_bytes_processed_total",
+        "requests_total",
+        "request_bytes_total",
+        "response_bytes_total",
+        "request_duration_seconds_count",
+        "request_duration_seconds_sum",
+    ];
+
+    for metric_name in &metrics_to_verify {
+        let line = metrics_content
+            .lines()
+            .find(|l| l.contains(metric_name) && !l.contains("#"))
+            .unwrap_or_else(|| panic!("{} metric not found", metric_name));
+
+        let value = extract_metric_value(line);
+        assert!(
+            value > 0.0,
+            "{} should be greater than 0, got: {}",
+            metric_name,
+            value
+        );
+        println!("{}: {}", metric_name, value);
+    }
+
+    println!("All work handler metrics verified successfully!");
+}
+
+fn extract_metric_value(line: &str) -> f64 {
+    // Parse the last whitespace-separated token of a Prometheus metric line as f64
+    // Expected format: metric_name{labels} value; panics if the line has no token or the token is not a number
+    line.split_whitespace()
+        .last()
+        .expect("Metric line should have a value")
+        .parse::<f64>()
+        .expect("Metric value should be a valid number")
+}
--- a/lib/runtime/src/component/endpoint.rs
+++ b/lib/runtime/src/component/endpoint.rs
@@ -69,6 +69,9 @@ impl EndpointConfigBuilder {
        // acquire the registry lock
        let registry = endpoint.drt().component_registry.inner.lock().await;

+        // Add metrics to the handler. The endpoint provides additional information to the handler.
+        handler.add_metrics(&endpoint)?;
+
        // get the group
        let group = registry
            .services

--- a/lib/runtime/src/distributed.rs
+++ b/lib/runtime/src/distributed.rs
@@ -24,6 +24,7 @@ use crate::{
 };

 use super::{error, Arc, DistributedRuntime, OnceCell, Result, Runtime, SystemHealth, Weak, OK};
+use std::sync::OnceLock;

 use derive_getters::Dissolve;
 use figment::error;
@@ -97,6 +98,7 @@ impl DistributedRuntime {
            etcd_client,
            nats_client,
            tcp_server: Arc::new(OnceCell::new()),
+            http_server: Arc::new(OnceLock::new()),
            component_registry: component::Registry::new(),
            is_static,
            instance_sources: Arc::new(Mutex::new(HashMap::new())),
@@ -121,8 +123,18 @@ impl DistributedRuntime {
            )
            .await
            {
-                Ok((addr, _)) => {
+                Ok((addr, handle)) => {
                    tracing::info!("HTTP server started successfully on {}", addr);
+
+                    // Store HTTP server information
+                    let http_server_info =
+                        crate::http_server::HttpServerInfo::new(addr, Some(handle));
+
+                    // Initialize the http_server field
+                    distributed_runtime
+                        .http_server
+                        .set(Arc::new(http_server_info))
+                        .expect("HTTP server info should only be set once");
                }
                Err(e) => {
                    tracing::error!("HTTP server startup failed: {}", e);
@@ -210,6 +222,11 @@ impl DistributedRuntime {
        self.nats_client.clone()
    }

+    /// Returns a shared handle to the stored HTTP server information, or `None`
+    /// when nothing has been stored in the `http_server` cell.
+    pub fn http_server_info(&self) -> Option<Arc<crate::http_server::HttpServerInfo>> {
+        let info = self.http_server.get()?;
+        Some(Arc::clone(info))
+    }
+
    // todo(ryan): deprecate this as we move to Discovery traits and Component Identifiers
    pub fn etcd_client(&self) -> Option<etcd::Client> {
        self.etcd_client.clone()

--- a/lib/runtime/src/http_server.rs
+++ b/lib/runtime/src/http_server.rs
@@ -22,10 +22,47 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::sync::OnceLock;
 use std::time::Instant;
-use tokio::net::TcpListener;
+use tokio::{net::TcpListener, task::JoinHandle};
 use tokio_util::sync::CancellationToken;
 use tracing;

+/// Socket address and task handle of the HTTP server.
+///
+/// `Clone` is derived: the address is copied and the optional handle is shared
+/// through its `Arc`, which is exactly what a field-by-field clone does.
+#[derive(Debug, Clone)]
+pub struct HttpServerInfo {
+    /// Address of the HTTP server.
+    pub socket_addr: std::net::SocketAddr,
+    /// Join handle shared between clones; `None` if the creator did not supply one.
+    pub handle: Option<Arc<JoinHandle<()>>>,
+}
+
+impl HttpServerInfo {
+    /// Creates the record, moving `handle` (if any) into an `Arc`.
+    pub fn new(socket_addr: std::net::SocketAddr, handle: Option<JoinHandle<()>>) -> Self {
+        Self {
+            socket_addr,
+            handle: handle.map(Arc::new),
+        }
+    }
+
+    /// Returns the full socket address (IP and port) formatted as a string.
+    pub fn address(&self) -> String {
+        self.socket_addr.to_string()
+    }
+
+    /// Returns the IP part of the socket address as a string.
+    ///
+    /// Despite the name, this is the textual IP address, not a resolved host name.
+    pub fn hostname(&self) -> String {
+        self.socket_addr.ip().to_string()
+    }
+
+    /// Returns the port part of the socket address.
+    pub fn port(&self) -> u16 {
+        self.socket_addr.port()
+    }
+}
+
 pub struct HttpMetricsRegistry {
    pub drt: Arc<crate::DistributedRuntime>,
 }
@@ -58,8 +95,10 @@ impl HttpServerState {
    /// Create new HTTP server state with the provided metrics registry
    pub fn new(drt: Arc<crate::DistributedRuntime>) -> anyhow::Result<Self> {
        let http_metrics_registry = Arc::new(HttpMetricsRegistry { drt: drt.clone() });
+        // Note: This metric is created at the DRT level (no namespace), so we manually add "dynamo_" prefix
+        // to maintain consistency with the project's metric naming convention
        let uptime_gauge = http_metrics_registry.as_ref().create_gauge(
-            "uptime_seconds",
+            "dynamo_uptime_seconds",
            "Total uptime of the DistributedRuntime in seconds",
            &[],
        )?;
@@ -293,9 +332,9 @@ mod tests {
        println!("Full metrics response:\n{}", response);

        let expected = "\
-# HELP uptime_seconds Total uptime of the DistributedRuntime in seconds
-# TYPE uptime_seconds gauge
-uptime_seconds{namespace=\"http_server\"} 42
+# HELP dynamo_uptime_seconds Total uptime of the DistributedRuntime in seconds
+# TYPE dynamo_uptime_seconds gauge
+dynamo_uptime_seconds{namespace=\"http_server\"} 42
 ";
        assert_eq!(response, expected);
    }
@@ -405,6 +444,9 @@ uptime_seconds{namespace=\"http_server\"} 42
    #[tokio::test]
    async fn test_spawn_http_server_endpoints() {
        // use reqwest for HTTP requests
+        temp_env::async_with_vars(
+            [("DYN_SYSTEM_STARTING_HEALTH_STATUS", Some("ready"))],
+            async {
                let cancel_token = CancellationToken::new();
                let drt = create_test_drt_async().await;
                let (addr, server_handle) =
@@ -416,8 +458,8 @@ uptime_seconds{namespace=\"http_server\"} 42
                println!("[test] Server should be up, starting requests...");
                let client = reqwest::Client::new();
                for (path, expect_200, expect_body) in [
-            ("/health", true, "OK"),
-            ("/live", true, "OK"),
+                    ("/health", true, "ready"),
+                    ("/live", true, "ready"),
                    ("/someRandomPathNotFoundHere", false, "Route not found"),
                ] {
                    println!("[test] Sending request to {}", path);
@@ -452,6 +494,9 @@ uptime_seconds{namespace=\"http_server\"} 42
                        }
                    }
                }
+            },
+        )
+        .await;
    }

    #[cfg(feature = "integration")]

--- a/lib/runtime/src/lib.rs
+++ b/lib/runtime/src/lib.rs
@@ -20,7 +20,7 @@

 use std::{
    collections::HashMap,
-    sync::{Arc, Weak},
+    sync::{Arc, OnceLock, Weak},
 };
 use tokio::sync::Mutex;

@@ -37,6 +37,7 @@ pub mod component;
 pub mod discovery;
 pub mod engine;
 pub mod http_server;
+pub use http_server::HttpServerInfo;
 pub mod logging;
 pub mod metrics;
 pub mod pipeline;
@@ -150,6 +151,7 @@ pub struct DistributedRuntime {
    etcd_client: Option<transports::etcd::Client>,
    nats_client: transports::nats::Client,
    tcp_server: Arc<OnceCell<Arc<transports::tcp::server::TcpStreamServer>>>,
+    http_server: Arc<OnceLock<Arc<http_server::HttpServerInfo>>>,

    // local registry for components
    // the registry allows us to use share runtime resources across instances of the same component object.

--- a/lib/runtime/src/metrics.rs
+++ b/lib/runtime/src/metrics.rs
--- a/lib/runtime/src/pipeline/network.rs
+++ b/lib/runtime/src/pipeline/network.rs
@@ -36,6 +36,11 @@ use super::{
    context, AsyncTransportEngine, Context, Data, Error, ManyOut, PipelineError, PipelineIO,
    SegmentSource, ServiceBackend, ServiceEngine, SingleIn, Source,
 };
+use ingress::push_handler::WorkHandlerMetrics;
+
+// Add Prometheus metrics types
+use crate::metrics::MetricsRegistry;
+use prometheus::{CounterVec, Histogram, IntCounter, IntCounterVec, IntGauge};

 pub trait Codable: PipelineIO + Serialize + for<'de> Deserialize<'de> {}
 impl<T: PipelineIO + Serialize + for<'de> Deserialize<'de>> Codable for T {}
@@ -278,12 +283,14 @@ struct RequestControlMessage {

+/// Ingress side of a pipeline: holds a write-once segment and write-once work
+/// handler metrics.
 pub struct Ingress<Req: PipelineIO, Resp: PipelineIO> {
+    /// Pipeline segment; can be set at most once.
    segment: OnceLock<Arc<SegmentSource<Req, Resp>>>,
+    /// Work handler metrics; empty until `add_metrics` stores them, then fixed.
+    metrics: OnceLock<Arc<WorkHandlerMetrics>>,
 }

 impl<Req: PipelineIO + Sync, Resp: PipelineIO> Ingress<Req, Resp> {
+    /// Creates an `Ingress` with neither the segment nor the metrics set yet.
    pub fn new() -> Arc<Self> {
        Arc::new(Self {
            segment: OnceLock::new(),
+            // Left empty here; populated later through `add_metrics`.
+            metrics: OnceLock::new(),
        })
    }

@@ -293,6 +300,15 @@ impl<Req: PipelineIO + Sync, Resp: PipelineIO> Ingress<Req, Resp> {
            .map_err(|_| anyhow::anyhow!("Segment already set"))
    }

+    /// Builds `WorkHandlerMetrics` for `endpoint` and stores them on this ingress.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the metrics cannot be created from the endpoint, or if
+    /// metrics were already stored by an earlier call.
+    pub fn add_metrics(&self, endpoint: &crate::component::Endpoint) -> Result<()> {
+        let created = match WorkHandlerMetrics::from_endpoint(endpoint) {
+            Ok(created) => Arc::new(created),
+            Err(e) => {
+                return Err(anyhow::anyhow!(
+                    "Failed to create work handler metrics: {}",
+                    e
+                ))
+            }
+        };
+
+        // `OnceLock::set` hands the rejected value back on failure; it is not needed.
+        if self.metrics.set(created).is_err() {
+            return Err(anyhow::anyhow!("Metrics already set"));
+        }
+        Ok(())
+    }
+
    pub fn link(segment: Arc<SegmentSource<Req, Resp>>) -> Result<Arc<Self>> {
        let ingress = Ingress::new();
        ingress.attach(segment)?;
@@ -317,11 +333,19 @@ impl<Req: PipelineIO + Sync, Resp: PipelineIO> Ingress<Req, Resp> {

        Ok(ingress)
    }
+
+    /// Returns the work handler metrics, or `None` when `add_metrics` has not
+    /// stored any yet.
+    fn metrics(&self) -> Option<&Arc<WorkHandlerMetrics>> {
+        OnceLock::get(&self.metrics)
+    }
 }

+/// Handler that consumes raw payloads and can have endpoint metrics attached.
 #[async_trait]
 pub trait PushWorkHandler: Send + Sync {
+    /// Processes one raw payload, returning a `PipelineError` on failure.
    async fn handle_payload(&self, payload: Bytes) -> Result<(), PipelineError>;
+
+    /// Add metrics to the handler
+    ///
+    /// The endpoint supplies the context the metrics are created from. This is a
+    /// required method, so every implementor of the trait must provide it.
+    fn add_metrics(&self, endpoint: &crate::component::Endpoint) -> Result<()>;
 }

 /*

--- a/lib/runtime/src/pipeline/network/ingress/push_handler.rs
+++ b/lib/runtime/src/pipeline/network/ingress/push_handler.rs
@@ -14,7 +14,92 @@
 // limitations under the License.

 use super::*;
+use prometheus::{Histogram, IntCounter, IntCounterVec, IntGauge};
 use serde::{Deserialize, Serialize};
+use std::sync::Arc;
+
+/// Metrics configuration for profiling work handlers
+///
+/// Every field is held in an `Arc`, so cloning this struct shares the same
+/// underlying metrics rather than creating new ones. The metric names quoted
+/// below are the ones used when the struct is built by `from_endpoint`.
+#[derive(Clone, Debug)]
+pub struct WorkHandlerMetrics {
+    /// Total number of requests processed (`requests_total`).
+    pub request_counter: Arc<IntCounter>,
+    /// Time spent processing requests (`request_duration_seconds`).
+    pub request_duration: Arc<Histogram>,
+    /// Number of requests currently being processed (`concurrent_requests`).
+    pub concurrent_requests: Arc<IntGauge>,
+    /// Total number of bytes received in requests (`request_bytes_total`).
+    pub request_bytes: Arc<IntCounter>,
+    /// Total number of bytes sent in responses (`response_bytes_total`).
+    pub response_bytes: Arc<IntCounter>,
+    /// Total number of processing errors, labelled by `error_type` (`errors_total`).
+    pub error_counter: Arc<IntCounterVec>,
+}
+
+impl WorkHandlerMetrics {
+    /// Assembles a `WorkHandlerMetrics` from metrics the caller already created.
+    pub fn new(
+        request_counter: Arc<IntCounter>,
+        request_duration: Arc<Histogram>,
+        concurrent_requests: Arc<IntGauge>,
+        request_bytes: Arc<IntCounter>,
+        response_bytes: Arc<IntCounter>,
+        error_counter: Arc<IntCounterVec>,
+    ) -> Self {
+        Self {
+            request_counter,
+            request_duration,
+            concurrent_requests,
+            request_bytes,
+            response_bytes,
+            error_counter,
+        }
+    }
+
+    /// Creates all work handler metrics through the endpoint's `create_*` helpers.
+    ///
+    /// The metrics are created in field order; the first failure is returned and
+    /// the remaining metrics are not created.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error of whichever `create_*` call fails.
+    pub fn from_endpoint(
+        endpoint: &crate::component::Endpoint,
+    ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
+        // Struct-literal fields are evaluated top to bottom, so the creation
+        // order matches the field order.
+        Ok(Self {
+            request_counter: endpoint.create_intcounter(
+                "requests_total",
+                "Total number of requests processed by work handler",
+                &[],
+            )?,
+            request_duration: endpoint.create_histogram(
+                "request_duration_seconds",
+                "Time spent processing requests by work handler",
+                &[],
+                None,
+            )?,
+            concurrent_requests: endpoint.create_intgauge(
+                "concurrent_requests",
+                "Number of requests currently being processed by work handler",
+                &[],
+            )?,
+            request_bytes: endpoint.create_intcounter(
+                "request_bytes_total",
+                "Total number of bytes received in requests by work handler",
+                &[],
+            )?,
+            response_bytes: endpoint.create_intcounter(
+                "response_bytes_total",
+                "Total number of bytes sent in responses by work handler",
+                &[],
+            )?,
+            error_counter: endpoint.create_intcountervec(
+                "errors_total",
+                "Total number of errors in work handler processing",
+                &["error_type"],
+                &[],
+            )?,
+        })
+    }
+}

 #[async_trait]
 impl<T: Data, U: Data> PushWorkHandler for Ingress<SingleIn<T>, ManyOut<U>>
@@ -22,7 +107,21 @@ where
    T: Data + for<'de> Deserialize<'de> + std::fmt::Debug,
    U: Data + Serialize + std::fmt::Debug,
 {
+    /// Forwards to the `add_metrics` method defined on `Ingress` itself.
+    fn add_metrics(&self, endpoint: &crate::component::Endpoint) -> Result<()> {
+        // Type-qualified path on purpose: it names `Ingress`'s own `add_metrics`
+        // rather than dispatching back into this trait method.
+        crate::pipeline::network::Ingress::add_metrics(self, endpoint)
+    }
+
    async fn handle_payload(&self, payload: Bytes) -> Result<(), PipelineError> {
+        let start_time = std::time::Instant::now();
+
+        if let Some(m) = self.metrics() {
+            m.request_counter.inc();
+            m.concurrent_requests.inc();
+            m.request_bytes.inc_by(payload.len() as u64);
+        }
+
        // decode the control message and the request
        let msg = TwoPartCodec::default()
            .decode_message(payload)?
@@ -41,6 +140,11 @@ where
                    Ok(cm) => cm,
                    Err(err) => {
                        let json_str = String::from_utf8_lossy(&header);
+                        if let Some(m) = self.metrics() {
+                            m.error_counter
+                                .with_label_values(&["deserialization"])
+                                .inc();
+                        }
                        return Err(PipelineError::DeserializationError(
                            format!("Failed deserializing to RequestControlMessage. err={err}, json_str={json_str}"),
                        ));
@@ -50,6 +154,11 @@ where
                (control_msg, request)
            }
            _ => {
+                if let Some(m) = self.metrics() {
+                    m.error_counter
+                        .with_label_values(&["invalid_message"])
+                        .inc();
+                }
                return Err(PipelineError::Generic(String::from("Unexpected message from work queue; unable extract a TwoPartMessage with a header and data")));
            }
        };
@@ -68,6 +177,11 @@ where
        )
        .await
        .map_err(|e| {
+            if let Some(m) = self.metrics() {
+                m.error_counter
+                    .with_label_values(&["response_stream"])
+                    .inc();
+            }
            PipelineError::Generic(format!("Failed to create response stream: {:?}", e,))
        })?;

@@ -78,7 +192,12 @@ where
            .expect("segment not set")
            .generate(request)
            .await
-            .map_err(PipelineError::GenerateError);
+            .map_err(|e| {
+                if let Some(m) = self.metrics() {
+                    m.error_counter.with_label_values(&["generate"]).inc();
+                }
+                PipelineError::GenerateError(e)
+            });

        // the prolouge is sent to the client to indicate that the stream is ready to receive data
        // or if the generate call failed, the error is sent to the client
@@ -107,10 +226,18 @@ where
            };
            let resp_bytes = serde_json::to_vec(&resp_wrapper)
                .expect("fatal error: invalid response object - this should never happen");
+            if let Some(m) = self.metrics() {
+                m.response_bytes.inc_by(resp_bytes.len() as u64);
+            }
            if (publisher.send(resp_bytes.into()).await).is_err() {
                tracing::error!("Failed to publish response for stream {}", context.id());
                context.stop_generating();
                send_complete_final = false;
+                if let Some(m) = self.metrics() {
+                    m.error_counter
+                        .with_label_values(&["publish_response"])
+                        .inc();
+                }
                break;
            }
        }
@@ -121,13 +248,25 @@ where
            };
            let resp_bytes = serde_json::to_vec(&resp_wrapper)
                .expect("fatal error: invalid response object - this should never happen");
+            if let Some(m) = self.metrics() {
+                m.response_bytes.inc_by(resp_bytes.len() as u64);
+            }
            if (publisher.send(resp_bytes.into()).await).is_err() {
                tracing::error!(
                    "Failed to publish complete final for stream {}",
                    context.id()
                );
+                if let Some(m) = self.metrics() {
+                    m.error_counter.with_label_values(&["publish_final"]).inc();
                }
            }
+        }
+
+        if let Some(m) = self.metrics() {
+            let duration = start_time.elapsed();
+            m.request_duration.observe(duration.as_secs_f64());
+            m.concurrent_requests.dec();
+        }

        Ok(())
    }