Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d38325c2
Commit
d38325c2
authored
Feb 28, 2025
by
Ryan McCormick
Committed by
GitHub
Feb 28, 2025
Browse files
feat: Add initial prometheus/grafana support for count (#303)
parent
6e0cfbd9
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1685 additions
and
174 deletions
+1685
-174
applications/llm/count/Cargo.lock
applications/llm/count/Cargo.lock
+361
-43
applications/llm/count/Cargo.toml
applications/llm/count/Cargo.toml
+8
-0
applications/llm/count/README.md
applications/llm/count/README.md
+27
-8
applications/llm/count/src/bin/mock_worker.rs
applications/llm/count/src/bin/mock_worker.rs
+98
-0
applications/llm/count/src/lib.rs
applications/llm/count/src/lib.rs
+288
-0
applications/llm/count/src/main.rs
applications/llm/count/src/main.rs
+32
-119
applications/llm/count/visualization/README.md
applications/llm/count/visualization/README.md
+71
-0
applications/llm/count/visualization/docker-compose.yml
applications/llm/count/visualization/docker-compose.yml
+67
-0
applications/llm/count/visualization/grafana-dashboard-providers.yml
...s/llm/count/visualization/grafana-dashboard-providers.yml
+28
-0
applications/llm/count/visualization/grafana-datasources.yml
applications/llm/count/visualization/grafana-datasources.yml
+25
-0
applications/llm/count/visualization/grafana.json
applications/llm/count/visualization/grafana.json
+650
-0
applications/llm/count/visualization/prometheus.yml
applications/llm/count/visualization/prometheus.yml
+25
-0
examples/rust/service_metrics/Cargo.toml
examples/rust/service_metrics/Cargo.toml
+1
-1
lib/llm/src/kv_router.rs
lib/llm/src/kv_router.rs
+2
-2
lib/llm/src/kv_router/scoring.rs
lib/llm/src/kv_router/scoring.rs
+2
-1
No files found.
applications/llm/count/Cargo.lock
View file @
d38325c2
...
...
@@ -128,7 +128,7 @@ dependencies = [
"regex",
"ring",
"rustls-native-certs 0.7.3",
"rustls-pemfile",
"rustls-pemfile
2.2.0
",
"rustls-webpki",
"serde",
"serde_json",
...
...
@@ -164,7 +164,7 @@ dependencies = [
"eventsource-stream",
"futures",
"rand",
"reqwest",
"reqwest
0.12.12
",
"reqwest-eventsource",
"secrecy",
"serde",
...
...
@@ -244,6 +244,38 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "axum"
version = "0.6.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf"
dependencies = [
"async-trait",
"axum-core 0.3.4",
"bitflags 1.3.2",
"bytes",
"futures-util",
"http 0.2.12",
"http-body 0.4.6",
"hyper 0.14.32",
"itoa",
"matchit 0.7.3",
"memchr",
"mime",
"percent-encoding",
"pin-project-lite",
"rustversion",
"serde",
"serde_json",
"serde_path_to_error",
"serde_urlencoded",
"sync_wrapper 0.1.2",
"tokio",
"tower 0.4.13",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum"
version = "0.7.9"
...
...
@@ -254,8 +286,8 @@ dependencies = [
"axum-core 0.4.5",
"bytes",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"itoa",
"matchit 0.7.3",
...
...
@@ -265,7 +297,7 @@ dependencies = [
"pin-project-lite",
"rustversion",
"serde",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tower 0.5.2",
"tower-layer",
"tower-service",
...
...
@@ -281,10 +313,10 @@ dependencies = [
"bytes",
"form_urlencoded",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"hyper",
"hyper
1.6.0
",
"hyper-util",
"itoa",
"matchit 0.8.4",
...
...
@@ -297,7 +329,7 @@ dependencies = [
"serde_json",
"serde_path_to_error",
"serde_urlencoded",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tokio",
"tower 0.5.2",
"tower-layer",
...
...
@@ -305,6 +337,23 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum-core"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c"
dependencies = [
"async-trait",
"bytes",
"futures-util",
"http 0.2.12",
"http-body 0.4.6",
"mime",
"rustversion",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum-core"
version = "0.4.5"
...
...
@@ -314,13 +363,13 @@ dependencies = [
"async-trait",
"bytes",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"mime",
"pin-project-lite",
"rustversion",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tower-layer",
"tower-service",
]
...
...
@@ -333,13 +382,13 @@ checksum = "df1362f362fd16024ae199c1970ce98f9661bf5ef94b9808fee734bc3698b733"
dependencies = [
"bytes",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"mime",
"pin-project-lite",
"rustversion",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tower-layer",
"tower-service",
"tracing",
...
...
@@ -380,6 +429,12 @@ version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
[[package]]
name = "base64"
version = "0.21.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
[[package]]
name = "base64"
version = "0.22.1"
...
...
@@ -631,7 +686,13 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
name = "count"
version = "0.1.0"
dependencies = [
"axum 0.6.20",
"clap",
"opentelemetry",
"opentelemetry-prometheus",
"prometheus",
"rand",
"reqwest 0.11.27",
"serde",
"serde_json",
"thiserror 1.0.69",
...
...
@@ -963,6 +1024,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "enum-ordinalize"
version = "4.3.0"
...
...
@@ -1024,7 +1094,7 @@ version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc0452bcc559431b16f472b7ab86e2f9ccd5f3c2da3795afbd6b773665e047fe"
dependencies = [
"http",
"http
1.2.0
",
"prost",
"tokio",
"tokio-stream",
...
...
@@ -1282,6 +1352,25 @@ version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "h2"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
dependencies = [
"bytes",
"fnv",
"futures-core",
"futures-sink",
"futures-util",
"http 0.2.12",
"indexmap 2.7.1",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "h2"
version = "0.4.8"
...
...
@@ -1293,7 +1382,7 @@ dependencies = [
"fnv",
"futures-core",
"futures-sink",
"http",
"http
1.2.0
",
"indexmap 2.7.1",
"slab",
"tokio",
...
...
@@ -1336,6 +1425,17 @@ dependencies = [
"ureq",
]
[[package]]
name = "http"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1"
dependencies = [
"bytes",
"fnv",
"itoa",
]
[[package]]
name = "http"
version = "1.2.0"
...
...
@@ -1347,6 +1447,17 @@ dependencies = [
"itoa",
]
[[package]]
name = "http-body"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2"
dependencies = [
"bytes",
"http 0.2.12",
"pin-project-lite",
]
[[package]]
name = "http-body"
version = "1.0.1"
...
...
@@ -1354,7 +1465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
dependencies = [
"bytes",
"http",
"http
1.2.0
",
]
[[package]]
...
...
@@ -1365,8 +1476,8 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f"
dependencies = [
"bytes",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"pin-project-lite",
]
...
...
@@ -1388,6 +1499,30 @@ version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "hyper"
version = "0.14.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7"
dependencies = [
"bytes",
"futures-channel",
"futures-core",
"futures-util",
"h2 0.3.26",
"http 0.2.12",
"http-body 0.4.6",
"httparse",
"httpdate",
"itoa",
"pin-project-lite",
"socket2",
"tokio",
"tower-service",
"tracing",
"want",
]
[[package]]
name = "hyper"
version = "1.6.0"
...
...
@@ -1397,9 +1532,9 @@ dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2",
"http",
"http-body",
"h2
0.4.8
",
"http
1.2.0
",
"http-body
1.0.1
",
"httparse",
"httpdate",
"itoa",
...
...
@@ -1416,8 +1551,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2"
dependencies = [
"futures-util",
"http",
"hyper",
"http
1.2.0
",
"hyper
1.6.0
",
"hyper-util",
"rustls",
"rustls-native-certs 0.8.1",
...
...
@@ -1433,13 +1568,26 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
dependencies = [
"hyper",
"hyper
1.6.0
",
"hyper-util",
"pin-project-lite",
"tokio",
"tower-service",
]
[[package]]
name = "hyper-tls"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper 0.14.32",
"native-tls",
"tokio",
"tokio-native-tls",
]
[[package]]
name = "hyper-util"
version = "0.1.10"
...
...
@@ -1449,9 +1597,9 @@ dependencies = [
"bytes",
"futures-channel",
"futures-util",
"http",
"http-body",
"hyper",
"http
1.2.0
",
"http-body
1.0.1
",
"hyper
1.6.0
",
"pin-project-lite",
"socket2",
"tokio",
...
...
@@ -2280,12 +2428,80 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "opentelemetry"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
dependencies = [
"opentelemetry_api",
"opentelemetry_sdk",
]
[[package]]
name = "opentelemetry-prometheus"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7d81bc254e2d572120363a2b16cdb0d715d301b5789be0cfc26ad87e4e10e53"
dependencies = [
"once_cell",
"opentelemetry_api",
"opentelemetry_sdk",
"prometheus",
"protobuf",
]
[[package]]
name = "opentelemetry_api"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
dependencies = [
"futures-channel",
"futures-util",
"indexmap 1.9.3",
"js-sys",
"once_cell",
"pin-project-lite",
"thiserror 1.0.69",
"urlencoding",
]
[[package]]
name = "opentelemetry_sdk"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
dependencies = [
"async-trait",
"crossbeam-channel",
"futures-channel",
"futures-executor",
"futures-util",
"once_cell",
"opentelemetry_api",
"ordered-float",
"percent-encoding",
"rand",
"regex",
"thiserror 1.0.69",
]
[[package]]
name = "option-ext"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
[[package]]
name = "ordered-float"
version = "3.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
dependencies = [
"num-traits",
]
[[package]]
name = "overload"
version = "0.1.1"
...
...
@@ -2677,7 +2893,7 @@ dependencies = [
"once_cell",
"socket2",
"tracing",
"windows-sys 0.5
2
.0",
"windows-sys 0.5
9
.0",
]
[[package]]
...
...
@@ -2814,6 +3030,46 @@ version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "reqwest"
version = "0.11.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62"
dependencies = [
"base64 0.21.7",
"bytes",
"encoding_rs",
"futures-core",
"futures-util",
"h2 0.3.26",
"http 0.2.12",
"http-body 0.4.6",
"hyper 0.14.32",
"hyper-tls",
"ipnet",
"js-sys",
"log",
"mime",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
"rustls-pemfile 1.0.4",
"serde",
"serde_json",
"serde_urlencoded",
"sync_wrapper 0.1.2",
"system-configuration",
"tokio",
"tokio-native-tls",
"tower-service",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
"winreg",
]
[[package]]
name = "reqwest"
version = "0.12.12"
...
...
@@ -2824,10 +3080,10 @@ dependencies = [
"bytes",
"futures-core",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"hyper",
"hyper
1.6.0
",
"hyper-rustls",
"hyper-util",
"ipnet",
...
...
@@ -2841,12 +3097,12 @@ dependencies = [
"quinn",
"rustls",
"rustls-native-certs 0.8.1",
"rustls-pemfile",
"rustls-pemfile
2.2.0
",
"rustls-pki-types",
"serde",
"serde_json",
"serde_urlencoded",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tokio",
"tokio-rustls",
"tokio-util",
...
...
@@ -2872,7 +3128,7 @@ dependencies = [
"mime",
"nom",
"pin-project-lite",
"reqwest",
"reqwest
0.12.12
",
"thiserror 1.0.69",
]
...
...
@@ -2946,7 +3202,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5"
dependencies = [
"openssl-probe",
"rustls-pemfile",
"rustls-pemfile
2.2.0
",
"rustls-pki-types",
"schannel",
"security-framework 2.11.1",
...
...
@@ -2964,6 +3220,15 @@ dependencies = [
"security-framework 3.2.0",
]
[[package]]
name = "rustls-pemfile"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c"
dependencies = [
"base64 0.21.7",
]
[[package]]
name = "rustls-pemfile"
version = "2.2.0"
...
...
@@ -3352,6 +3617,12 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "sync_wrapper"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
[[package]]
name = "sync_wrapper"
version = "1.0.2"
...
...
@@ -3372,6 +3643,27 @@ dependencies = [
"syn 2.0.98",
]
[[package]]
name = "system-configuration"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7"
dependencies = [
"bitflags 1.3.2",
"core-foundation 0.9.4",
"system-configuration-sys",
]
[[package]]
name = "system-configuration-sys"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "system-deps"
version = "6.2.2"
...
...
@@ -3575,6 +3867,16 @@ dependencies = [
"syn 2.0.98",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.26.1"
...
...
@@ -3619,7 +3921,7 @@ dependencies = [
"bytes",
"futures-core",
"futures-sink",
"http",
"http
1.2.0
",
"httparse",
"rand",
"ring",
...
...
@@ -3702,11 +4004,11 @@ dependencies = [
"axum 0.7.9",
"base64 0.22.1",
"bytes",
"h2",
"http",
"http-body",
"h2
0.4.8
",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"hyper",
"hyper
1.6.0
",
"hyper-timeout",
"hyper-util",
"percent-encoding",
...
...
@@ -3764,7 +4066,7 @@ dependencies = [
"futures-core",
"futures-util",
"pin-project-lite",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tokio",
"tower-layer",
"tower-service",
...
...
@@ -4074,6 +4376,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "urlencoding"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]]
name = "utf16_iter"
version = "1.0.5"
...
...
@@ -4542,6 +4850,16 @@ dependencies = [
"memchr",
]
[[package]]
name = "winreg"
version = "0.50.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
dependencies = [
"cfg-if 1.0.0",
"windows-sys 0.48.0",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.33.0"
...
...
applications/llm/count/Cargo.toml
View file @
d38325c2
...
...
@@ -33,3 +33,11 @@ serde_json = { version = "1" }
tokio
=
{
version
=
"1"
,
features
=
["full"]
}
tracing
=
{
version
=
"0.1"
}
thiserror
=
"1.0"
opentelemetry
=
"0.20"
opentelemetry-prometheus
=
"0.13"
prometheus
=
"0.13"
rand
=
"0.8"
axum
=
"0.6"
[dev-dependencies]
reqwest
=
{
version
=
"0.11"
,
features
=
["blocking"]
}
\ No newline at end of file
applications/llm/count/README.md
View file @
d38325c2
...
...
@@ -9,7 +9,7 @@ and then publish an event with the postprocessed data.
```
bash
# For more details, try TRD_LOG=debug
TRD_LOG
=
info cargo run
--
--namespace
triton-init
--component
backend
--endpoint
generate
TRD_LOG
=
info cargo run
--bin
count
--
--namespace
triton-init
--component
backend
--endpoint
generate
# 2025-02-26T18:45:05.467026Z INFO count: Creating unique instance of Count at triton-init/components/count/instance
# 2025-02-26T18:45:05.472146Z INFO count: Scraping service triton_init_backend_720278f8 and filtering on subject triton_init_backend_720278f8.generate
...
...
@@ -21,14 +21,33 @@ With no matching endpoints running, you should see warnings in the logs:
2025-02-26T18:45:06.474161Z WARN count: No endpoints found matching subject triton_init_backend_720278f8.generate
```
But after starting a matching endpoint, such as the
[
service_metrics example
](
examples/rust/service_metrics/src/bin/server.rs
)
,
you should see these warnings go away since the endpoint will automatically
get discovered.
To see metrics published to a matching endpoint, you can use the
[
mock_worker example
](
src/bin/mock_worker.rs
)
in this directory to launch
1 or more workers that publish LLM Metrics:
```
bash
# Can run multiple workers in separate shells
cargo run
--bin
mock_worker
```
After a matching endpoint gets started, you should see the warnings go away
since the endpoint will automatically get discovered.
Whether there are matching endpoints found or not,
`count`
will publish events, for example:
When stats are found from the target endpoints being listened to, `count` will
aggregate them and publish the resulting metrics both as an event and to a Prometheus web server:
```
2025-02-2
6T18:45:46.501874Z INFO count: Publishing event l2c.backend.generate on Namespace { name
: "triton
-
init
" } with ProcessedEndpoints { capacity_with_ids: [
], load_avg:
NaN
, load_std:
NaN, address: "backend.generate"
}
2025-02-2
8T04:05:58.077901Z INFO count: Aggregated metrics: ProcessedEndpoints { endpoints: [Endpoint { name: "worker-7587884888253033398", subject: "triton_init_backend_720278f8.generate-694d951a80e06bb6", data: ForwardPassMetrics { request_active_slots: 58, request_total_slots: 100, kv_active_blocks: 77, kv_total_blocks: 100 } }, Endpoint { name: "worker-7587884888253033401", subject
: "triton
_
init
_backend_720278f8.generate-694d951a80e06bb9", data: ForwardPassMetrics { request_active_slots: 71, request_total_slots: 100, kv_active_blocks: 29, kv_total_blocks: 100 } }], worker_ids: [7587884888253033398, 7587884888253033401
], load_avg:
53.0
, load_std:
24.0
}
```
However, the events may not be very useful until there are corresponding stats found from endpoints for processing.
To see the metrics being published in prometheus format, you can run:
```
bash
curl localhost:9091/metrics
# # HELP llm_kv_blocks_active Active KV cache blocks
# # TYPE llm_kv_blocks_active gauge
# llm_kv_blocks_active{component="backend",endpoint="generate",worker_id="7587884888253033398"} 40
# llm_kv_blocks_active{component="backend",endpoint="generate",worker_id="7587884888253033401"} 2
# # HELP llm_kv_blocks_total Total KV cache blocks
# # TYPE llm_kv_blocks_total gauge
# llm_kv_blocks_total{component="backend",endpoint="generate",worker_id="7587884888253033398"} 100
# llm_kv_blocks_total{component="backend",endpoint="generate",worker_id="7587884888253033401"} 100
```
applications/llm/count/src/bin/mock_worker.rs
0 → 100644
View file @
d38325c2
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
rand
::
Rng
;
use
std
::
sync
::
Arc
;
use
triton_distributed_llm
::
kv_router
::
protocols
::
ForwardPassMetrics
;
use
triton_distributed_runtime
::{
logging
,
pipeline
::{
async_trait
,
network
::
Ingress
,
AsyncEngine
,
AsyncEngineContextProvider
,
Error
,
ManyOut
,
ResponseStream
,
SingleIn
,
},
protocols
::
annotated
::
Annotated
,
stream
,
DistributedRuntime
,
Result
,
Runtime
,
Worker
,
};
/// Entry point for the mock worker: initialize logging, build a `Worker`
/// from environment settings, and hand control to the async `app`.
fn main() -> Result<()> {
    logging::init();
    Worker::from_settings()?.execute(app)
}
/// Build the distributed runtime from environment settings and run the
/// mock backend on it.
async fn app(runtime: Runtime) -> Result<()> {
    // `runtime` is not used again after this call, so it can be moved in
    // directly — the original `.clone()` was redundant.
    let distributed = DistributedRuntime::from_settings(runtime).await?;
    backend(distributed).await
}
/// Trivial, stateless request handler used by the mock worker.
struct RequestHandler {}

impl RequestHandler {
    /// Construct behind an `Arc` so the handler can be shared with the ingress.
    fn new() -> Arc<Self> {
        Arc::new(Self {})
    }
}
#[async_trait]
impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for RequestHandler {
    /// Echo each character of the input string back as a separate annotated
    /// item, demonstrating a streaming (many-out) response.
    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
        // Split the request into its payload and its context; the context ties
        // the response stream back to the originating request.
        let (data, ctx) = input.into_parts();
        // One Annotated item per character of the input payload.
        let chars = data
            .chars()
            .map(|c| Annotated::from_data(c.to_string()))
            .collect::<Vec<_>>();
        let stream = stream::iter(chars);
        Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
    }
}
/// Register a mock "backend/generate" endpoint on the `triton-init`
/// namespace that echoes requests and reports randomized load stats,
/// giving metric scrapers (e.g. the `count` application) data to aggregate.
async fn backend(runtime: DistributedRuntime) -> Result<()> {
    // attach an ingress to an engine
    let ingress = Ingress::for_engine(RequestHandler::new())?;

    // make the ingress discoverable via a component service
    // we must first create a service, then we can attach one or more endpoints
    runtime
        .namespace("triton-init")?
        .component("backend")?
        .service_builder()
        .create()
        .await?
        .endpoint("generate")
        .endpoint_builder()
        // Dummy stats handler to demonstrate how to attach a custom stats
        // handler: it fabricates randomized ForwardPassMetrics each scrape.
        .stats_handler(|_stats| {
            println!("stats in: {:?}", _stats);
            // Active counts are random in [0, total); totals are fixed at 100.
            let request_total_slots = 100;
            let request_active_slots = rand::thread_rng().gen_range(0..request_total_slots);
            let kv_total_blocks = 100;
            let kv_active_blocks = rand::thread_rng().gen_range(0..kv_total_blocks);
            let stats = ForwardPassMetrics {
                request_active_slots,
                request_total_slots,
                kv_active_blocks,
                kv_total_blocks,
            };
            println!("stats out: {:?}", stats);
            // The handler returns arbitrary JSON; presumably it lands in the
            // `data` field of the service's NATS stats — confirm against the
            // StatsWithData consumer in the count library.
            serde_json::to_value(stats).unwrap()
        })
        .handler(ingress)
        .start()
        .await
}
applications/llm/count/src/lib.rs
0 → 100644
View file @
d38325c2
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Library functions for the count application.
use
axum
::{
routing
::
get
,
Router
};
use
prometheus
::
register_gauge_vec
;
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
net
::
SocketAddr
;
use
triton_distributed_llm
::
kv_router
::
protocols
::
ForwardPassMetrics
;
use
triton_distributed_llm
::
kv_router
::
scheduler
::
Endpoint
;
use
triton_distributed_llm
::
kv_router
::
scoring
::
ProcessedEndpoints
;
use
triton_distributed_runtime
::{
distributed
::
Component
,
service
::
EndpointInfo
,
utils
::
Duration
,
Result
,
};
/// Configuration for LLM worker load capacity metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMWorkerLoadCapacityConfig {
    // Name of the component (service) whose endpoints are scraped
    pub component_name: String,
    // Name of the endpoint whose stats are aggregated
    pub endpoint_name: String,
}
// TODO: This is _really_ close to the async_nats::service::Stats object,
// but it's missing a few fields like "name", so use a temporary struct
// for easy deserialization. Ideally, this type already exists or can
// be exposed in the library somewhere.
/// Stats structure returned from NATS service API
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatsWithData {
    // Standard NATS Service API fields
    pub average_processing_time: f64,
    pub last_error: String,
    pub num_errors: u64,
    pub num_requests: u64,
    pub processing_time: u64,
    pub queue_group: String,
    // Field containing custom stats handler data — arbitrary JSON emitted by
    // the endpoint's stats handler; decoded to ForwardPassMetrics downstream
    // in extract_metrics.
    pub data: serde_json::Value,
}
/// Prometheus metrics server for exposing metrics
pub struct PrometheusMetricsServer {
    // Gauges registered in the default prometheus registry.
    metrics: PrometheusMetrics,
}

impl PrometheusMetricsServer {
    /// Initialize the metrics server, registering all gauges in the default
    /// (global) prometheus registry. Fails if a same-named metric is already
    /// registered.
    pub fn new() -> Result<Self> {
        Ok(Self {
            metrics: PrometheusMetrics::new()?,
        })
    }

    /// Start the metrics server on the specified port.
    ///
    /// Serves `GET /metrics` on 0.0.0.0:{port} from a background tokio task;
    /// returns immediately.
    pub fn start(&mut self, port: u16) {
        // Create an axum router with a metrics endpoint
        let app = Router::new().route(
            "/metrics",
            get(|| async {
                // Gather and encode metrics from the global registry in the
                // prometheus text exposition format.
                use prometheus::Encoder;
                let encoder = prometheus::TextEncoder::new();
                let mut buffer = Vec::new();
                // NOTE(review): these unwraps panic the request task on
                // encode/UTF-8 failure; consider returning a 500 instead —
                // confirm desired behavior.
                encoder.encode(&prometheus::gather(), &mut buffer).unwrap();
                String::from_utf8(buffer).unwrap()
            }),
        );
        // Create a socket address to listen on
        let addr = SocketAddr::from(([0, 0, 0, 0], port));
        // Spawn the server in a background task.
        // NOTE(review): a bind/serve failure panics only this detached task;
        // the main application keeps running without a metrics endpoint —
        // confirm this is acceptable.
        tokio::spawn(async move {
            axum::Server::bind(&addr)
                .serve(app.into_make_service())
                .await
                .unwrap();
        });
        tracing::info!("Prometheus metrics server started at {addr:?}/metrics");
    }

    /// Update metrics with current values
    pub fn update(&mut self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
        self.metrics.update(config, processed);
    }
}
/// Prometheus metrics collection
///
/// Per-worker gauges are labeled (component, endpoint, worker_id); the
/// aggregate load gauges are labeled (component, endpoint) only.
pub struct PrometheusMetrics {
    // Active / total KV cache blocks per worker
    kv_blocks_active: prometheus::GaugeVec,
    kv_blocks_total: prometheus::GaugeVec,
    // Active / total request slots per worker
    requests_active: prometheus::GaugeVec,
    requests_total: prometheus::GaugeVec,
    // Aggregate load average / standard deviation across workers
    load_avg: prometheus::GaugeVec,
    load_std: prometheus::GaugeVec,
}
impl PrometheusMetrics {
    /// Initialize all metrics, registering each gauge in the default (global)
    /// prometheus registry. Fails if a same-named metric already exists.
    fn new() -> Result<Self> {
        Ok(Self {
            kv_blocks_active: register_gauge_vec!(
                "llm_kv_blocks_active",
                "Active KV cache blocks",
                &["component", "endpoint", "worker_id"]
            )?,
            kv_blocks_total: register_gauge_vec!(
                "llm_kv_blocks_total",
                "Total KV cache blocks",
                &["component", "endpoint", "worker_id"]
            )?,
            requests_active: register_gauge_vec!(
                "llm_requests_active_slots",
                "Active request slots",
                &["component", "endpoint", "worker_id"]
            )?,
            requests_total: register_gauge_vec!(
                "llm_requests_total_slots",
                "Total request slots",
                &["component", "endpoint", "worker_id"]
            )?,
            load_avg: register_gauge_vec!(
                "llm_load_avg",
                "Average load across workers",
                &["component", "endpoint"]
            )?,
            load_std: register_gauge_vec!(
                "llm_load_std",
                "Load standard deviation across workers",
                &["component", "endpoint"]
            )?,
        })
    }

    /// Helper method to set a gauge with worker-specific labels (3 labels)
    fn set_worker_gauge(
        &self,
        gauge: &prometheus::GaugeVec,
        config: &LLMWorkerLoadCapacityConfig,
        worker_id: &str,
        value: f64,
    ) {
        // Label order must match the order used at registration time.
        gauge
            .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id])
            .set(value);
    }

    /// Helper method to set a gauge with component/endpoint labels only (2 labels)
    fn set_endpoint_gauge(
        &self,
        gauge: &prometheus::GaugeVec,
        config: &LLMWorkerLoadCapacityConfig,
        value: f64,
    ) {
        gauge
            .with_label_values(&[&config.component_name, &config.endpoint_name])
            .set(value);
    }

    /// Update metrics with current values
    ///
    /// NOTE(review): label sets for workers that disappear are never removed,
    /// so stale worker_id series keep their last values — confirm acceptable.
    fn update(&self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
        // Update per-worker metrics
        for endpoint in processed.endpoints.iter() {
            let worker_id = endpoint.worker_id().to_string();
            let metrics = endpoint.data.clone();
            self.set_worker_gauge(
                &self.kv_blocks_active,
                config,
                &worker_id,
                metrics.kv_active_blocks as f64,
            );
            self.set_worker_gauge(
                &self.kv_blocks_total,
                config,
                &worker_id,
                metrics.kv_total_blocks as f64,
            );
            self.set_worker_gauge(
                &self.requests_active,
                config,
                &worker_id,
                metrics.request_active_slots as f64,
            );
            self.set_worker_gauge(
                &self.requests_total,
                config,
                &worker_id,
                metrics.request_total_slots as f64,
            );
        }

        // Update aggregate metrics
        self.set_endpoint_gauge(&self.load_avg, config, processed.load_avg);
        self.set_endpoint_gauge(&self.load_std, config, processed.load_std);
    }
}
/// Collect endpoints from a component
///
/// Scrapes the component's service stats (bounded by `timeout`) and keeps
/// only the endpoints whose subject begins with `subject`. Returns an empty
/// vector (after logging a warning) when nothing matches.
pub async fn collect_endpoints(
    component: &Component,
    subject: &str,
    timeout: Duration,
) -> Result<Vec<EndpointInfo>> {
    // Scrape stats from every backend of this component.
    let scraped = component.scrape_stats(timeout).await?;

    // Keep only the endpoints that belong to the requested service subject.
    let matching: Vec<_> = scraped
        .into_endpoints()
        .filter(|endpoint| endpoint.subject.starts_with(subject))
        .collect();
    tracing::debug!("Endpoints: {matching:?}");

    if matching.is_empty() {
        tracing::warn!("No endpoints found matching subject {subject}");
    }
    Ok(matching)
}
/// Extract metrics from endpoints
pub
fn
extract_metrics
(
endpoints
:
&
[
EndpointInfo
])
->
Vec
<
ForwardPassMetrics
>
{
let
endpoint_data
=
endpoints
.iter
()
.map
(|
e
|
e
.data
.clone
())
.collect
::
<
Vec
<
_
>>
();
// Extract StatsWithData objects from endpoint services
let
stats
:
Vec
<
StatsWithData
>
=
endpoint_data
.iter
()
.filter_map
(|
e
|
{
let
metrics_data
=
e
.as_ref
()
?
;
metrics_data
.clone
()
.decode
::
<
StatsWithData
>
()
.ok
()
})
.collect
();
tracing
::
debug!
(
"Stats: {stats:?}"
);
// Extract ForwardPassMetrics nested within Stats object
let
metrics
:
Vec
<
ForwardPassMetrics
>
=
stats
.iter
()
.filter_map
(
|
s
|
match
serde_json
::
from_value
::
<
ForwardPassMetrics
>
(
s
.data
.clone
())
{
Ok
(
metrics
)
=>
Some
(
metrics
),
Err
(
err
)
=>
{
tracing
::
warn!
(
"Error decoding metrics: {err}"
);
None
}
},
)
.collect
();
tracing
::
debug!
(
"Metrics: {metrics:?}"
);
metrics
}
/// Pair each metric with its endpoint and build a `ProcessedEndpoints`.
///
/// `metrics` and `endpoints` are zipped positionally, so they are expected
/// to be parallel slices; pairs whose endpoint id cannot be parsed are
/// silently dropped.
pub fn postprocess_metrics(
    metrics: &[ForwardPassMetrics],
    endpoints: &[EndpointInfo],
) -> ProcessedEndpoints {
    let mut processed_endpoints: Vec<Endpoint> = Vec::with_capacity(metrics.len());
    for (metric, endpoint_info) in metrics.iter().zip(endpoints.iter()) {
        // Skip endpoints whose id cannot be extracted from the subject.
        if let Ok(id) = endpoint_info.id() {
            processed_endpoints.push(Endpoint {
                name: format!("worker-{id}"),
                subject: endpoint_info.subject.clone(),
                data: metric.clone(),
            });
        }
    }
    ProcessedEndpoints::new(processed_endpoints)
}
applications/llm/count/src/main.rs
View file @
d38325c2
...
...
@@ -24,8 +24,6 @@
//! - KV Cache Blocks: [Active, Total]
use
clap
::
Parser
;
use
serde
::{
Deserialize
,
Serialize
};
use
triton_distributed_runtime
::{
error
,
logging
,
traits
::
events
::
EventPublisher
,
...
...
@@ -33,6 +31,12 @@ use triton_distributed_runtime::{
DistributedRuntime
,
ErrorContext
,
Result
,
Runtime
,
Worker
,
};
// Import from our library
use
count
::{
collect_endpoints
,
extract_metrics
,
postprocess_metrics
,
LLMWorkerLoadCapacityConfig
,
PrometheusMetricsServer
,
};
/// CLI arguments for the count application
#[derive(Parser,
Debug)]
#[command(author,
version,
about,
long_about
=
None)]
...
...
@@ -73,34 +77,8 @@ fn get_config(args: &Args) -> Result<LLMWorkerLoadCapacityConfig> {
})
}
// we will scrape the service_name and extract the endpoint_name metrics
// we will bcast them as {namespace}.events.l2c.{service_name}.{endpoint_name}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMWorkerLoadCapacityConfig {
    /// Name of the component whose stats are scraped.
    component_name: String,
    /// Name of the endpoint within the component to filter stats on.
    endpoint_name: String,
}
/// LLM Worker Load Capacity Metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMWorkerLoadCapacity {
    /// Number of request slots currently in use.
    pub requests_active_slots: u32,
    /// Total number of request slots available.
    pub requests_total_slots: u32,
    /// Number of KV cache blocks currently in use.
    pub kv_blocks_active: u32,
    /// Total number of KV cache blocks available.
    pub kv_blocks_total: u32,
}
/// Binary entry point: initialize logging, construct a `Worker` from
/// environment settings, and run the async `app` on it.
fn main() -> Result<()> {
    logging::init();
    let worker = Worker::from_settings()?;
    worker.execute(app)
}
// TODO - refactor much of this back into the library
async
fn
app
(
runtime
:
Runtime
)
->
Result
<
()
>
{
let
args
=
Args
::
parse
();
// we will start by assuming that there is no oscar and no planner
// to that end, we will use CLI args to get a singular config for scraping a single backend
let
config
=
get_config
(
&
args
)
?
;
tracing
::
info!
(
"Config: {config:?}"
);
...
...
@@ -109,8 +87,7 @@ async fn app(runtime: Runtime) -> Result<()> {
let
namespace
=
drt
.namespace
(
args
.namespace
)
?
;
let
component
=
namespace
.component
(
"count"
)
?
;
// there should only be one count
// check {component.etcd_path()}/instance for existing instances
// Create unique instance of Count
let
key
=
format!
(
"{}/instance"
,
component
.etcd_path
());
tracing
::
info!
(
"Creating unique instance of Count at {key}"
);
drt
.etcd_client
()
...
...
@@ -122,113 +99,53 @@ async fn app(runtime: Runtime) -> Result<()> {
.await
.context
(
"Unable to create unique instance of Count; possibly one already exists"
)
?
;
let
target
=
namespace
.component
(
&
config
.component_name
)
?
;
let
target_endpoint
=
target
.endpoint
(
&
config
.endpoint_name
);
let
target
_component
=
namespace
.component
(
&
config
.component_name
)
?
;
let
target_endpoint
=
target
_component
.endpoint
(
&
config
.endpoint_name
);
let
service_name
=
target
.service_name
();
let
service_name
=
target
_component
.service_name
();
let
service_subject
=
target_endpoint
.subject
();
tracing
::
info!
(
"Scraping service {service_name} and filtering on subject {service_subject}"
);
let
token
=
drt
.primary_lease
()
.child_token
();
let
event_name
=
format!
(
"l2c.{}.{}"
,
config
.component_name
,
config
.endpoint_name
);
let
address
=
format!
(
"{}.{}"
,
config
.component_name
,
config
.endpoint_name
,);
let
event_name
=
format!
(
"l2c.{}"
,
address
);
// TODO: Make metrics host/port configurable
// Initialize Prometheus metrics and start server
let
mut
metrics_server
=
PrometheusMetricsServer
::
new
()
?
;
metrics_server
.start
(
9091
);
loop
{
let
next
=
Instant
::
now
()
+
Duration
::
from_secs
(
args
.poll_interval
);
// collect stats from each backend
let
stream
=
target
.scrape_stats
(
Duration
::
from_secs
(
1
))
.await
?
;
tracing
::
debug!
(
"Scraped Stats Stream: {stream:?}"
);
// filter the stats by the service subject
let
endpoints
=
stream
.into_endpoints
()
.filter
(|
e
|
e
.subject
.starts_with
(
&
service_subject
))
.collect
::
<
Vec
<
_
>>
();
// Collect and process metrics
let
scrape_timeout
=
Duration
::
from_secs
(
1
);
let
endpoints
=
collect_endpoints
(
&
target_component
,
&
service_subject
,
scrape_timeout
)
.await
?
;
let
metrics
=
extract_metrics
(
&
endpoints
);
let
processed
=
postprocess_metrics
(
&
metrics
,
&
endpoints
);
tracing
::
info!
(
"Aggregated metrics: {processed:?}"
);
tracing
::
debug!
(
"Endpoints: {endpoints:?}"
);
if
endpoints
.is_empty
()
{
tracing
::
warn!
(
"No endpoints found matching subject {}"
,
service_subject
);
}
// Update Prometheus metrics
metrics_server
.update
(
&
config
,
&
processed
);
// extract the custom data from the stats and try to decode it as LLMWorkerLoadCapacity
let
metrics
=
endpoints
.iter
()
.filter_map
(|
e
|
match
e
.data
.clone
()
{
Some
(
metrics
)
=>
metrics
.decode
::
<
LLMWorkerLoadCapacity
>
()
.ok
(),
None
=>
None
,
})
.collect
::
<
Vec
<
_
>>
();
tracing
::
debug!
(
"Metrics: {metrics:?}"
);
// parse the endpoint ids
// the ids are the last part of the subject in hexadecimal
// form a list of tuples (kv_blocks_total - kv_blocks_active, requests_total_slots - requests_active_slots, id)
// this tuple represent the remaining capacity of each endpoint
let
capacity_with_ids
=
metrics
.iter
()
.zip
(
endpoints
.iter
())
.filter_map
(|(
m
,
e
)|
{
e
.id
()
.ok
()
.map
(|
id
|
{
(
m
.kv_blocks_total
-
m
.kv_blocks_active
,
m
.requests_total_slots
-
m
.requests_active_slots
,
id
,
)
})
})
.collect
::
<
Vec
<
_
>>
();
// compute mean / std of LLMWorkerLoadCapacity
let
load_values
:
Vec
<
f64
>
=
metrics
.iter
()
.map
(|
x
|
x
.kv_blocks_active
as
f64
)
.collect
();
let
load_avg
=
load_values
.iter
()
.sum
::
<
f64
>
()
/
load_values
.len
()
as
f64
;
let
variance
=
load_values
.iter
()
.map
(|
&
x
|
(
x
-
load_avg
)
.powi
(
2
))
.sum
::
<
f64
>
()
/
load_values
.len
()
as
f64
;
let
load_std
=
variance
.sqrt
();
let
processed
=
ProcessedEndpoints
{
capacity_with_ids
,
load_avg
,
load_std
,
address
:
address
.clone
(),
};
// publish using the namespace event plane
tracing
::
info!
(
"Publishing event {event_name} on namespace {namespace:?} with {processed:?}"
);
// TODO: Who needs to consume these events?
// Publish metrics event
namespace
.publish
(
&
event_name
,
&
processed
)
.await
?
;
//
w
ait until cancelled or the next tick
//
W
ait until cancelled or the next tick
match
tokio
::
time
::
timeout_at
(
next
,
token
.cancelled
())
.await
{
Ok
(
_
)
=>
break
,
Err
(
_
)
=>
{
// timeout, we continue
continue
;
}
Err
(
_
)
=>
continue
,
}
}
Ok
(())
}
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
pub
struct
ProcessedEndpoints
{
/// (kv_blocks_total - kv_blocks_active, requests_total_slots - requests_active_slots, id)
pub
capacity_with_ids
:
Vec
<
(
u32
,
u32
,
i64
)
>
,
/// kv_blocks_active average
pub
load_avg
:
f64
,
/// kv_blocks_active standard deviation
pub
load_std
:
f64
,
/// {component}.{endpoint}
pub
address
:
String
,
fn
main
()
->
Result
<
()
>
{
logging
::
init
();
let
worker
=
Worker
::
from_settings
()
?
;
worker
.execute
(
app
)
}
#[cfg(test)]
...
...
@@ -239,11 +156,7 @@ mod tests {
#[test]
fn
test_namespace_from_env
()
{
env
::
set_var
(
"TRD_NAMESPACE"
,
"test-namespace"
);
// Parse args with no explicit namespace
let
args
=
Args
::
parse_from
([
"count"
,
"--component"
,
"comp"
,
"--endpoint"
,
"end"
]);
// Verify namespace was taken from environment variable
assert_eq!
(
args
.namespace
,
"test-namespace"
);
}
}
applications/llm/count/visualization/README.md
0 → 100644
View file @
d38325c2
# Metrics Visualization with Prometheus and Grafana
This directory contains configuration for visualizing metrics from the metrics aggregation service using Prometheus and Grafana.
## Components
-
**Prometheus**
: Collects and stores metrics from the service
-
**Grafana**
: Provides visualization dashboards for the metrics
## Getting Started
1.
Make sure Docker and Docker Compose are installed on your system
2.
Start
`count`
and the corresponding
`examples/rust/service_metrics/bin/server.rs`
that populates dummy KV Cache metrics.
3.
Start the visualization stack:
```
bash
docker compose up
-d
```
4.
Web servers started:
-
Grafana: http://localhost:3000 (default login: admin/admin)
-
Prometheus: http://localhost:9090
## Configuration
### Prometheus
The Prometheus configuration is defined in
`prometheus.yml`
. It is configured to scrape metrics from the metrics aggregation service endpoint.
Note: You may need to adjust the target based on your host configuration and network setup.
### Grafana
Grafana is pre-configured with:
-
Prometheus datasource
-
Sample dashboard for visualizing service metrics
## Required Files
The following configuration files should be present in this directory:
-
`docker-compose.yml`
: Defines the Prometheus and Grafana services
-
`prometheus.yml`
: Contains Prometheus scraping configuration
-
`grafana.json`
: Contains Grafana dashboard configuration
-
`grafana-datasources.yml`
: Contains Grafana datasource configuration
-
`grafana-dashboard-providers.yml`
: Contains Grafana dashboard provider configuration
## Metrics
The metrics aggregation (`count`) service exposes the following metrics for Prometheus to scrape:
-
`llm_load_avg`
: Average load across workers
-
`llm_load_std`
: Load standard deviation across workers
-
`llm_requests_active_slots`
: Number of currently active request slots
-
`llm_requests_total_slots`
: Total available request slots
-
`llm_kv_blocks_active`
: Number of active KV blocks
-
`llm_kv_blocks_total`
: Total KV blocks available
## Troubleshooting
1.
Verify services are running:
```
bash
docker compose ps
```
2.
Check logs:
```
bash
docker compose logs prometheus
docker compose logs grafana
```
applications/llm/count/visualization/docker-compose.yml
0 → 100644
View file @
d38325c2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
services
:
prometheus
:
image
:
prom/prometheus:latest
container_name
:
prometheus
volumes
:
-
./prometheus.yml:/etc/prometheus/prometheus.yml
-
prometheus_data:/prometheus
command
:
-
'
--config.file=/etc/prometheus/prometheus.yml'
-
'
--storage.tsdb.path=/prometheus'
# These provide the web console functionality
-
'
--web.console.libraries=/etc/prometheus/console_libraries'
-
'
--web.console.templates=/etc/prometheus/consoles'
-
'
--web.enable-lifecycle'
restart
:
unless-stopped
# TODO: Use more explicit networking setup when count is containerized
#ports:
# - "9090:9090"
#networks:
# - monitoring
network_mode
:
"
host"
grafana
:
image
:
grafana/grafana-enterprise:latest
container_name
:
grafana
volumes
:
-
./grafana.json:/etc/grafana/provisioning/dashboards/llm-worker-dashboard.json
-
./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
-
./grafana-dashboard-providers.yml:/etc/grafana/provisioning/dashboards/dashboard-providers.yml
-
grafana_data:/var/lib/grafana
environment
:
-
GF_SECURITY_ADMIN_USER=admin
-
GF_SECURITY_ADMIN_PASSWORD=admin
-
GF_USERS_ALLOW_SIGN_UP=false
-
GF_INSTALL_PLUGINS=grafana-piechart-panel
restart
:
unless-stopped
# TODO: Use more explicit networking setup when count is containerized
#ports:
# - "3000:3000"
#networks:
# - monitoring
network_mode
:
"
host"
depends_on
:
-
prometheus
networks
:
monitoring
:
driver
:
bridge
volumes
:
prometheus_data
:
grafana_data
:
applications/llm/count/visualization/grafana-dashboard-providers.yml
0 → 100644
View file @
d38325c2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion
:
1
providers
:
-
name
:
'
default'
orgId
:
1
folder
:
'
'
type
:
file
disableDeletion
:
false
updateIntervalSeconds
:
10
allowUiUpdates
:
true
options
:
path
:
/etc/grafana/provisioning/dashboards
foldersFromFilesStructure
:
true
applications/llm/count/visualization/grafana-datasources.yml
0 → 100644
View file @
d38325c2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion
:
1
datasources
:
-
name
:
prometheus
type
:
prometheus
access
:
proxy
# TODO: Use proper docker networking
# url: http://prometheus:9090
url
:
http://localhost:9090
isDefault
:
true
applications/llm/count/visualization/grafana.json
0 → 100644
View file @
d38325c2
{
"annotations"
:
{
"list"
:
[
{
"builtIn"
:
1
,
"datasource"
:
{
"type"
:
"grafana"
,
"uid"
:
"-- Grafana --"
},
"enable"
:
true
,
"hide"
:
true
,
"iconColor"
:
"rgba(0, 211, 255, 1)"
,
"name"
:
"Annotations & Alerts"
,
"type"
:
"dashboard"
}
]
},
"copyright"
:
[
"SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
,
"SPDX-License-Identifier: Apache-2.0"
,
"Licensed under the Apache License, Version 2.0 (the
\"
License
\"
);"
,
"you may not use this file except in compliance with the License."
,
"You may obtain a copy of the License at"
,
"http://www.apache.org/licenses/LICENSE-2.0"
,
"Unless required by applicable law or agreed to in writing, software"
,
"distributed under the License is distributed on an
\"
AS IS
\"
BASIS,"
,
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
,
"See the License for the specific language governing permissions and"
,
"limitations under the License."
],
"editable"
:
true
,
"fiscalYearStartMonth"
:
0
,
"graphTooltip"
:
0
,
"id"
:
1
,
"links"
:
[],
"liveNow"
:
false
,
"panels"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
0
,
"y"
:
0
},
"id"
:
1
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"KV Cache Utilization by Worker"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"100 * llm_kv_blocks_active{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
} / llm_kv_blocks_total{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}"
,
"legendFormat"
:
"Worker {{worker_id}}"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
12
,
"y"
:
0
},
"id"
:
2
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"Request Slot Utilization by Worker"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"100 * llm_requests_active_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
} / llm_requests_total_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}"
,
"legendFormat"
:
"Worker {{worker_id}}"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"thresholds"
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"yellow"
,
"value"
:
50
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
6
,
"x"
:
0
,
"y"
:
8
},
"id"
:
3
,
"options"
:
{
"orientation"
:
"auto"
,
"reduceOptions"
:
{
"calcs"
:
[
"lastNotNull"
],
"fields"
:
""
,
"values"
:
false
},
"showThresholdLabels"
:
false
,
"showThresholdMarkers"
:
true
},
"pluginVersion"
:
"10.0.0"
,
"title"
:
"Average KV Cache Utilization"
,
"type"
:
"gauge"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"100 * avg(llm_kv_blocks_active{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}) / avg(llm_kv_blocks_total{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
})"
,
"legendFormat"
:
"__auto"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"thresholds"
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"yellow"
,
"value"
:
50
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
6
,
"x"
:
6
,
"y"
:
8
},
"id"
:
4
,
"options"
:
{
"orientation"
:
"auto"
,
"reduceOptions"
:
{
"calcs"
:
[
"lastNotNull"
],
"fields"
:
""
,
"values"
:
false
},
"showThresholdLabels"
:
false
,
"showThresholdMarkers"
:
true
},
"pluginVersion"
:
"10.0.0"
,
"title"
:
"Average Request Slot Utilization"
,
"type"
:
"gauge"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"100 * avg(llm_requests_active_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}) / avg(llm_requests_total_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
})"
,
"legendFormat"
:
"__auto"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
}
]
},
"unit"
:
"none"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
12
,
"y"
:
8
},
"id"
:
5
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"Load Average & Standard Deviation"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"llm_load_avg{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}"
,
"legendFormat"
:
"Average"
,
"range"
:
true
,
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"llm_load_std{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}"
,
"hide"
:
false
,
"legendFormat"
:
"StdDev"
,
"range"
:
true
,
"refId"
:
"B"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
}
]
},
"unit"
:
"none"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
24
,
"x"
:
0
,
"y"
:
16
},
"id"
:
6
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"Available Resources"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"sum(llm_kv_blocks_total{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
} - llm_kv_blocks_active{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
})"
,
"legendFormat"
:
"Available KV Blocks"
,
"range"
:
true
,
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"sum(llm_requests_total_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
} - llm_requests_active_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
})"
,
"hide"
:
false
,
"legendFormat"
:
"Available Request Slots"
,
"range"
:
true
,
"refId"
:
"B"
}
]
}
],
"refresh"
:
"5s"
,
"schemaVersion"
:
38
,
"style"
:
"dark"
,
"tags"
:
[
"llm"
,
"metrics"
],
"templating"
:
{
"list"
:
[
{
"current"
:
{
"selected"
:
false
,
"text"
:
"backend"
,
"value"
:
"backend"
},
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"definition"
:
"label_values(llm_kv_blocks_active, component)"
,
"hide"
:
0
,
"includeAll"
:
false
,
"label"
:
"Component"
,
"multi"
:
false
,
"name"
:
"component"
,
"options"
:
[],
"query"
:
{
"query"
:
"label_values(llm_kv_blocks_active, component)"
,
"refId"
:
"StandardVariableQuery"
},
"refresh"
:
1
,
"regex"
:
""
,
"skipUrlSync"
:
false
,
"sort"
:
0
,
"type"
:
"query"
},
{
"current"
:
{
"selected"
:
false
,
"text"
:
"generate"
,
"value"
:
"generate"
},
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"definition"
:
"label_values(llm_kv_blocks_active{component=
\"
$component
\"
}, endpoint)"
,
"hide"
:
0
,
"includeAll"
:
false
,
"label"
:
"Endpoint"
,
"multi"
:
false
,
"name"
:
"endpoint"
,
"options"
:
[],
"query"
:
{
"query"
:
"label_values(llm_kv_blocks_active{component=
\"
$component
\"
}, endpoint)"
,
"refId"
:
"StandardVariableQuery"
},
"refresh"
:
1
,
"regex"
:
""
,
"skipUrlSync"
:
false
,
"sort"
:
0
,
"type"
:
"query"
}
]
},
"time"
:
{
"from"
:
"now-15m"
,
"to"
:
"now"
},
"timepicker"
:
{},
"timezone"
:
""
,
"title"
:
"LLM Worker Metrics"
,
"uid"
:
"llm-worker-metrics"
,
"version"
:
1
,
"weekStart"
:
""
}
applications/llm/count/visualization/prometheus.yml
0 → 100644
View file @
d38325c2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
global
:
scrape_interval
:
1s
evaluation_interval
:
1s
scrape_configs
:
-
job_name
:
'
count'
static_configs
:
# TODO: Use proper docker networking
# - targets: ['host.docker.internal:9091']
-
targets
:
[
'
localhost:9091'
]
examples/rust/service_metrics/Cargo.toml
View file @
d38325c2
...
...
@@ -29,4 +29,4 @@ triton-distributed-runtime = { workspace = true }
futures
=
{
workspace
=
true
}
serde
=
{
workspace
=
true
}
serde_json
=
{
workspace
=
true
}
tokio
=
{
workspace
=
true
}
tokio
=
{
workspace
=
true
}
\ No newline at end of file
lib/llm/src/kv_router.rs
View file @
d38325c2
...
...
@@ -23,8 +23,8 @@ use triton_distributed_runtime::{component::Component, DistributedRuntime};
pub
mod
indexer
;
pub
mod
protocols
;
pub
mod
publisher
;
mod
scheduler
;
mod
scoring
;
pub
mod
scheduler
;
pub
mod
scoring
;
use
crate
::
kv_router
::{
indexer
::{
KvIndexer
,
KvIndexerInterface
,
RouterEvent
},
...
...
lib/llm/src/kv_router/scoring.rs
View file @
d38325c2
...
...
@@ -15,11 +15,12 @@
//! Scoring functions for the KV router.
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
collections
::
HashSet
;
use
crate
::
kv_router
::
scheduler
::
Endpoint
;
#[derive(Debug,
Default)]
#[derive(Debug,
Default
,
Serialize,
Deserialize
)]
pub
struct
ProcessedEndpoints
{
pub
endpoints
:
Vec
<
Endpoint
>
,
pub
worker_ids
:
Vec
<
i64
>
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment