Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
d38325c2
Commit
d38325c2
authored
Feb 28, 2025
by
Ryan McCormick
Committed by
GitHub
Feb 28, 2025
Browse files
feat: Add initial prometheus/grafana support for count (#303)
parent
6e0cfbd9
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1685 additions
and
174 deletions
+1685
-174
applications/llm/count/Cargo.lock
applications/llm/count/Cargo.lock
+361
-43
applications/llm/count/Cargo.toml
applications/llm/count/Cargo.toml
+8
-0
applications/llm/count/README.md
applications/llm/count/README.md
+27
-8
applications/llm/count/src/bin/mock_worker.rs
applications/llm/count/src/bin/mock_worker.rs
+98
-0
applications/llm/count/src/lib.rs
applications/llm/count/src/lib.rs
+288
-0
applications/llm/count/src/main.rs
applications/llm/count/src/main.rs
+32
-119
applications/llm/count/visualization/README.md
applications/llm/count/visualization/README.md
+71
-0
applications/llm/count/visualization/docker-compose.yml
applications/llm/count/visualization/docker-compose.yml
+67
-0
applications/llm/count/visualization/grafana-dashboard-providers.yml
...s/llm/count/visualization/grafana-dashboard-providers.yml
+28
-0
applications/llm/count/visualization/grafana-datasources.yml
applications/llm/count/visualization/grafana-datasources.yml
+25
-0
applications/llm/count/visualization/grafana.json
applications/llm/count/visualization/grafana.json
+650
-0
applications/llm/count/visualization/prometheus.yml
applications/llm/count/visualization/prometheus.yml
+25
-0
examples/rust/service_metrics/Cargo.toml
examples/rust/service_metrics/Cargo.toml
+1
-1
lib/llm/src/kv_router.rs
lib/llm/src/kv_router.rs
+2
-2
lib/llm/src/kv_router/scoring.rs
lib/llm/src/kv_router/scoring.rs
+2
-1
No files found.
applications/llm/count/Cargo.lock
View file @
d38325c2
...
...
@@ -128,7 +128,7 @@ dependencies = [
"regex",
"ring",
"rustls-native-certs 0.7.3",
"rustls-pemfile",
"rustls-pemfile
2.2.0
",
"rustls-webpki",
"serde",
"serde_json",
...
...
@@ -164,7 +164,7 @@ dependencies = [
"eventsource-stream",
"futures",
"rand",
"reqwest",
"reqwest
0.12.12
",
"reqwest-eventsource",
"secrecy",
"serde",
...
...
@@ -244,6 +244,38 @@ version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "axum"
version = "0.6.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf"
dependencies = [
"async-trait",
"axum-core 0.3.4",
"bitflags 1.3.2",
"bytes",
"futures-util",
"http 0.2.12",
"http-body 0.4.6",
"hyper 0.14.32",
"itoa",
"matchit 0.7.3",
"memchr",
"mime",
"percent-encoding",
"pin-project-lite",
"rustversion",
"serde",
"serde_json",
"serde_path_to_error",
"serde_urlencoded",
"sync_wrapper 0.1.2",
"tokio",
"tower 0.4.13",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum"
version = "0.7.9"
...
...
@@ -254,8 +286,8 @@ dependencies = [
"axum-core 0.4.5",
"bytes",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"itoa",
"matchit 0.7.3",
...
...
@@ -265,7 +297,7 @@ dependencies = [
"pin-project-lite",
"rustversion",
"serde",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tower 0.5.2",
"tower-layer",
"tower-service",
...
...
@@ -281,10 +313,10 @@ dependencies = [
"bytes",
"form_urlencoded",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"hyper",
"hyper
1.6.0
",
"hyper-util",
"itoa",
"matchit 0.8.4",
...
...
@@ -297,7 +329,7 @@ dependencies = [
"serde_json",
"serde_path_to_error",
"serde_urlencoded",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tokio",
"tower 0.5.2",
"tower-layer",
...
...
@@ -305,6 +337,23 @@ dependencies = [
"tracing",
]
[[package]]
name = "axum-core"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c"
dependencies = [
"async-trait",
"bytes",
"futures-util",
"http 0.2.12",
"http-body 0.4.6",
"mime",
"rustversion",
"tower-layer",
"tower-service",
]
[[package]]
name = "axum-core"
version = "0.4.5"
...
...
@@ -314,13 +363,13 @@ dependencies = [
"async-trait",
"bytes",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"mime",
"pin-project-lite",
"rustversion",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tower-layer",
"tower-service",
]
...
...
@@ -333,13 +382,13 @@ checksum = "df1362f362fd16024ae199c1970ce98f9661bf5ef94b9808fee734bc3698b733"
dependencies = [
"bytes",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"mime",
"pin-project-lite",
"rustversion",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tower-layer",
"tower-service",
"tracing",
...
...
@@ -380,6 +429,12 @@ version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
[[package]]
name = "base64"
version = "0.21.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
[[package]]
name = "base64"
version = "0.22.1"
...
...
@@ -631,7 +686,13 @@ checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
name = "count"
version = "0.1.0"
dependencies = [
"axum 0.6.20",
"clap",
"opentelemetry",
"opentelemetry-prometheus",
"prometheus",
"rand",
"reqwest 0.11.27",
"serde",
"serde_json",
"thiserror 1.0.69",
...
...
@@ -963,6 +1024,15 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if 1.0.0",
]
[[package]]
name = "enum-ordinalize"
version = "4.3.0"
...
...
@@ -1024,7 +1094,7 @@ version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fc0452bcc559431b16f472b7ab86e2f9ccd5f3c2da3795afbd6b773665e047fe"
dependencies = [
"http",
"http
1.2.0
",
"prost",
"tokio",
"tokio-stream",
...
...
@@ -1282,6 +1352,25 @@ version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "h2"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8"
dependencies = [
"bytes",
"fnv",
"futures-core",
"futures-sink",
"futures-util",
"http 0.2.12",
"indexmap 2.7.1",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "h2"
version = "0.4.8"
...
...
@@ -1293,7 +1382,7 @@ dependencies = [
"fnv",
"futures-core",
"futures-sink",
"http",
"http
1.2.0
",
"indexmap 2.7.1",
"slab",
"tokio",
...
...
@@ -1336,6 +1425,17 @@ dependencies = [
"ureq",
]
[[package]]
name = "http"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1"
dependencies = [
"bytes",
"fnv",
"itoa",
]
[[package]]
name = "http"
version = "1.2.0"
...
...
@@ -1347,6 +1447,17 @@ dependencies = [
"itoa",
]
[[package]]
name = "http-body"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2"
dependencies = [
"bytes",
"http 0.2.12",
"pin-project-lite",
]
[[package]]
name = "http-body"
version = "1.0.1"
...
...
@@ -1354,7 +1465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
dependencies = [
"bytes",
"http",
"http
1.2.0
",
]
[[package]]
...
...
@@ -1365,8 +1476,8 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f"
dependencies = [
"bytes",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"pin-project-lite",
]
...
...
@@ -1388,6 +1499,30 @@ version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]]
name = "hyper"
version = "0.14.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7"
dependencies = [
"bytes",
"futures-channel",
"futures-core",
"futures-util",
"h2 0.3.26",
"http 0.2.12",
"http-body 0.4.6",
"httparse",
"httpdate",
"itoa",
"pin-project-lite",
"socket2",
"tokio",
"tower-service",
"tracing",
"want",
]
[[package]]
name = "hyper"
version = "1.6.0"
...
...
@@ -1397,9 +1532,9 @@ dependencies = [
"bytes",
"futures-channel",
"futures-util",
"h2",
"http",
"http-body",
"h2
0.4.8
",
"http
1.2.0
",
"http-body
1.0.1
",
"httparse",
"httpdate",
"itoa",
...
...
@@ -1416,8 +1551,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2"
dependencies = [
"futures-util",
"http",
"hyper",
"http
1.2.0
",
"hyper
1.6.0
",
"hyper-util",
"rustls",
"rustls-native-certs 0.8.1",
...
...
@@ -1433,13 +1568,26 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0"
dependencies = [
"hyper",
"hyper
1.6.0
",
"hyper-util",
"pin-project-lite",
"tokio",
"tower-service",
]
[[package]]
name = "hyper-tls"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905"
dependencies = [
"bytes",
"hyper 0.14.32",
"native-tls",
"tokio",
"tokio-native-tls",
]
[[package]]
name = "hyper-util"
version = "0.1.10"
...
...
@@ -1449,9 +1597,9 @@ dependencies = [
"bytes",
"futures-channel",
"futures-util",
"http",
"http-body",
"hyper",
"http
1.2.0
",
"http-body
1.0.1
",
"hyper
1.6.0
",
"pin-project-lite",
"socket2",
"tokio",
...
...
@@ -2280,12 +2428,80 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "opentelemetry"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
dependencies = [
"opentelemetry_api",
"opentelemetry_sdk",
]
[[package]]
name = "opentelemetry-prometheus"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7d81bc254e2d572120363a2b16cdb0d715d301b5789be0cfc26ad87e4e10e53"
dependencies = [
"once_cell",
"opentelemetry_api",
"opentelemetry_sdk",
"prometheus",
"protobuf",
]
[[package]]
name = "opentelemetry_api"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
dependencies = [
"futures-channel",
"futures-util",
"indexmap 1.9.3",
"js-sys",
"once_cell",
"pin-project-lite",
"thiserror 1.0.69",
"urlencoding",
]
[[package]]
name = "opentelemetry_sdk"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
dependencies = [
"async-trait",
"crossbeam-channel",
"futures-channel",
"futures-executor",
"futures-util",
"once_cell",
"opentelemetry_api",
"ordered-float",
"percent-encoding",
"rand",
"regex",
"thiserror 1.0.69",
]
[[package]]
name = "option-ext"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
[[package]]
name = "ordered-float"
version = "3.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
dependencies = [
"num-traits",
]
[[package]]
name = "overload"
version = "0.1.1"
...
...
@@ -2677,7 +2893,7 @@ dependencies = [
"once_cell",
"socket2",
"tracing",
"windows-sys 0.5
2
.0",
"windows-sys 0.5
9
.0",
]
[[package]]
...
...
@@ -2814,6 +3030,46 @@ version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "reqwest"
version = "0.11.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd67538700a17451e7cba03ac727fb961abb7607553461627b97de0b89cf4a62"
dependencies = [
"base64 0.21.7",
"bytes",
"encoding_rs",
"futures-core",
"futures-util",
"h2 0.3.26",
"http 0.2.12",
"http-body 0.4.6",
"hyper 0.14.32",
"hyper-tls",
"ipnet",
"js-sys",
"log",
"mime",
"native-tls",
"once_cell",
"percent-encoding",
"pin-project-lite",
"rustls-pemfile 1.0.4",
"serde",
"serde_json",
"serde_urlencoded",
"sync_wrapper 0.1.2",
"system-configuration",
"tokio",
"tokio-native-tls",
"tower-service",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
"winreg",
]
[[package]]
name = "reqwest"
version = "0.12.12"
...
...
@@ -2824,10 +3080,10 @@ dependencies = [
"bytes",
"futures-core",
"futures-util",
"http",
"http-body",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"hyper",
"hyper
1.6.0
",
"hyper-rustls",
"hyper-util",
"ipnet",
...
...
@@ -2841,12 +3097,12 @@ dependencies = [
"quinn",
"rustls",
"rustls-native-certs 0.8.1",
"rustls-pemfile",
"rustls-pemfile
2.2.0
",
"rustls-pki-types",
"serde",
"serde_json",
"serde_urlencoded",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tokio",
"tokio-rustls",
"tokio-util",
...
...
@@ -2872,7 +3128,7 @@ dependencies = [
"mime",
"nom",
"pin-project-lite",
"reqwest",
"reqwest
0.12.12
",
"thiserror 1.0.69",
]
...
...
@@ -2946,7 +3202,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5"
dependencies = [
"openssl-probe",
"rustls-pemfile",
"rustls-pemfile
2.2.0
",
"rustls-pki-types",
"schannel",
"security-framework 2.11.1",
...
...
@@ -2964,6 +3220,15 @@ dependencies = [
"security-framework 3.2.0",
]
[[package]]
name = "rustls-pemfile"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c"
dependencies = [
"base64 0.21.7",
]
[[package]]
name = "rustls-pemfile"
version = "2.2.0"
...
...
@@ -3352,6 +3617,12 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "sync_wrapper"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160"
[[package]]
name = "sync_wrapper"
version = "1.0.2"
...
...
@@ -3372,6 +3643,27 @@ dependencies = [
"syn 2.0.98",
]
[[package]]
name = "system-configuration"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7"
dependencies = [
"bitflags 1.3.2",
"core-foundation 0.9.4",
"system-configuration-sys",
]
[[package]]
name = "system-configuration-sys"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "system-deps"
version = "6.2.2"
...
...
@@ -3575,6 +3867,16 @@ dependencies = [
"syn 2.0.98",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.26.1"
...
...
@@ -3619,7 +3921,7 @@ dependencies = [
"bytes",
"futures-core",
"futures-sink",
"http",
"http
1.2.0
",
"httparse",
"rand",
"ring",
...
...
@@ -3702,11 +4004,11 @@ dependencies = [
"axum 0.7.9",
"base64 0.22.1",
"bytes",
"h2",
"http",
"http-body",
"h2
0.4.8
",
"http
1.2.0
",
"http-body
1.0.1
",
"http-body-util",
"hyper",
"hyper
1.6.0
",
"hyper-timeout",
"hyper-util",
"percent-encoding",
...
...
@@ -3764,7 +4066,7 @@ dependencies = [
"futures-core",
"futures-util",
"pin-project-lite",
"sync_wrapper",
"sync_wrapper
1.0.2
",
"tokio",
"tower-layer",
"tower-service",
...
...
@@ -4074,6 +4376,12 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "urlencoding"
version = "2.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da"
[[package]]
name = "utf16_iter"
version = "1.0.5"
...
...
@@ -4542,6 +4850,16 @@ dependencies = [
"memchr",
]
[[package]]
name = "winreg"
version = "0.50.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1"
dependencies = [
"cfg-if 1.0.0",
"windows-sys 0.48.0",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.33.0"
...
...
applications/llm/count/Cargo.toml
View file @
d38325c2
...
...
@@ -33,3 +33,11 @@ serde_json = { version = "1" }
tokio
=
{
version
=
"1"
,
features
=
["full"]
}
tracing
=
{
version
=
"0.1"
}
thiserror
=
"1.0"
opentelemetry
=
"0.20"
opentelemetry-prometheus
=
"0.13"
prometheus
=
"0.13"
rand
=
"0.8"
axum
=
"0.6"
[dev-dependencies]
reqwest
=
{
version
=
"0.11"
,
features
=
["blocking"]
}
\ No newline at end of file
applications/llm/count/README.md
View file @
d38325c2
...
...
@@ -9,7 +9,7 @@ and then publish an event with the postprocessed data.
```
bash
# For more details, try TRD_LOG=debug
TRD_LOG
=
info cargo run
--
--namespace
triton-init
--component
backend
--endpoint
generate
TRD_LOG
=
info cargo run
--bin
count
--
--namespace
triton-init
--component
backend
--endpoint
generate
# 2025-02-26T18:45:05.467026Z INFO count: Creating unique instance of Count at triton-init/components/count/instance
# 2025-02-26T18:45:05.472146Z INFO count: Scraping service triton_init_backend_720278f8 and filtering on subject triton_init_backend_720278f8.generate
...
...
@@ -21,14 +21,33 @@ With no matching endpoints running, you should see warnings in the logs:
2025-02-26T18:45:06.474161Z WARN count: No endpoints found matching subject triton_init_backend_720278f8.generate
```
But after starting a matching endpoint, such as the
[
service_metrics example
](
examples/rust/service_metrics/src/bin/server.rs
)
,
you should see these warnings go away since the endpoint will automatically
get discovered.
To see metrics published to a matching endpoint, you can use the
[
mock_worker example
](
src/bin/mock_worker.rs
)
in this directory to launch
1 or more workers that publish LLM Metrics:
```
bash
# Can run multiple workers in separate shells
cargo run
--bin
mock_worker
```
After a matching endpoint gets started, you should see the warnings go away
since the endpoint will automatically get discovered.
Whether there are matching endpoints found or not,
`count`
will publish events, for example:
When stats are found from the target endpoints being listened to, `count` will
aggregate them and publish the resulting metrics both as an event and to a Prometheus web server:
```
2025-02-2
6T18:45:46.501874Z INFO count: Publishing event l2c.backend.generate on Namespace { name
: "triton
-
init
" } with ProcessedEndpoints { capacity_with_ids: [
], load_avg:
NaN
, load_std:
NaN, address: "backend.generate"
}
2025-02-2
8T04:05:58.077901Z INFO count: Aggregated metrics: ProcessedEndpoints { endpoints: [Endpoint { name: "worker-7587884888253033398", subject: "triton_init_backend_720278f8.generate-694d951a80e06bb6", data: ForwardPassMetrics { request_active_slots: 58, request_total_slots: 100, kv_active_blocks: 77, kv_total_blocks: 100 } }, Endpoint { name: "worker-7587884888253033401", subject
: "triton
_
init
_backend_720278f8.generate-694d951a80e06bb9", data: ForwardPassMetrics { request_active_slots: 71, request_total_slots: 100, kv_active_blocks: 29, kv_total_blocks: 100 } }], worker_ids: [7587884888253033398, 7587884888253033401
], load_avg:
53.0
, load_std:
24.0
}
```
However, the events may not be very useful until there are corresponding stats found from endpoints for processing.
To see the metrics being published in prometheus format, you can run:
```
bash
curl localhost:9091/metrics
# # HELP llm_kv_blocks_active Active KV cache blocks
# # TYPE llm_kv_blocks_active gauge
# llm_kv_blocks_active{component="backend",endpoint="generate",worker_id="7587884888253033398"} 40
# llm_kv_blocks_active{component="backend",endpoint="generate",worker_id="7587884888253033401"} 2
# # HELP llm_kv_blocks_total Total KV cache blocks
# # TYPE llm_kv_blocks_total gauge
# llm_kv_blocks_total{component="backend",endpoint="generate",worker_id="7587884888253033398"} 100
# llm_kv_blocks_total{component="backend",endpoint="generate",worker_id="7587884888253033401"} 100
```
applications/llm/count/src/bin/mock_worker.rs
0 → 100644
View file @
d38325c2
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use
rand
::
Rng
;
use
std
::
sync
::
Arc
;
use
triton_distributed_llm
::
kv_router
::
protocols
::
ForwardPassMetrics
;
use
triton_distributed_runtime
::{
logging
,
pipeline
::{
async_trait
,
network
::
Ingress
,
AsyncEngine
,
AsyncEngineContextProvider
,
Error
,
ManyOut
,
ResponseStream
,
SingleIn
,
},
protocols
::
annotated
::
Annotated
,
stream
,
DistributedRuntime
,
Result
,
Runtime
,
Worker
,
};
/// Entry point for the mock worker: initialize logging, build a `Worker`
/// from environment settings, and hand control to the async `app`.
fn main() -> Result<()> {
    logging::init();
    Worker::from_settings()?.execute(app)
}
/// Build the distributed runtime from environment settings and run the
/// mock backend on it.
async fn app(runtime: Runtime) -> Result<()> {
    // `runtime` is not used again after this call, so it can be moved in
    // directly — the original `.clone()` was redundant.
    let distributed = DistributedRuntime::from_settings(runtime).await?;
    backend(distributed).await
}
/// Trivial, stateless request handler used by the mock worker.
struct RequestHandler {}

impl RequestHandler {
    /// Construct behind an `Arc` so the handler can be shared with the ingress.
    fn new() -> Arc<Self> {
        Arc::new(Self {})
    }
}
#[async_trait]
impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for RequestHandler {
    /// Echo each character of the input string back as a separate annotated
    /// item, demonstrating a streaming (many-out) response.
    async fn generate(&self, input: SingleIn<String>) -> Result<ManyOut<Annotated<String>>> {
        // Split the request into its payload and its context; the context ties
        // the response stream back to the originating request.
        let (data, ctx) = input.into_parts();
        // One Annotated item per character of the input payload.
        let chars = data
            .chars()
            .map(|c| Annotated::from_data(c.to_string()))
            .collect::<Vec<_>>();
        let stream = stream::iter(chars);
        Ok(ResponseStream::new(Box::pin(stream), ctx.context()))
    }
}
/// Register a mock "backend/generate" endpoint on the `triton-init`
/// namespace that echoes requests and reports randomized load stats,
/// giving metric scrapers (e.g. the `count` application) data to aggregate.
async fn backend(runtime: DistributedRuntime) -> Result<()> {
    // attach an ingress to an engine
    let ingress = Ingress::for_engine(RequestHandler::new())?;

    // make the ingress discoverable via a component service
    // we must first create a service, then we can attach one or more endpoints
    runtime
        .namespace("triton-init")?
        .component("backend")?
        .service_builder()
        .create()
        .await?
        .endpoint("generate")
        .endpoint_builder()
        // Dummy stats handler to demonstrate how to attach a custom stats
        // handler: it fabricates randomized ForwardPassMetrics each scrape.
        .stats_handler(|_stats| {
            println!("stats in: {:?}", _stats);
            // Active counts are random in [0, total); totals are fixed at 100.
            let request_total_slots = 100;
            let request_active_slots = rand::thread_rng().gen_range(0..request_total_slots);
            let kv_total_blocks = 100;
            let kv_active_blocks = rand::thread_rng().gen_range(0..kv_total_blocks);
            let stats = ForwardPassMetrics {
                request_active_slots,
                request_total_slots,
                kv_active_blocks,
                kv_total_blocks,
            };
            println!("stats out: {:?}", stats);
            // The handler returns arbitrary JSON; presumably it lands in the
            // `data` field of the service's NATS stats — confirm against the
            // StatsWithData consumer in the count library.
            serde_json::to_value(stats).unwrap()
        })
        .handler(ingress)
        .start()
        .await
}
applications/llm/count/src/lib.rs
0 → 100644
View file @
d38325c2
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Library functions for the count application.
use
axum
::{
routing
::
get
,
Router
};
use
prometheus
::
register_gauge_vec
;
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
net
::
SocketAddr
;
use
triton_distributed_llm
::
kv_router
::
protocols
::
ForwardPassMetrics
;
use
triton_distributed_llm
::
kv_router
::
scheduler
::
Endpoint
;
use
triton_distributed_llm
::
kv_router
::
scoring
::
ProcessedEndpoints
;
use
triton_distributed_runtime
::{
distributed
::
Component
,
service
::
EndpointInfo
,
utils
::
Duration
,
Result
,
};
/// Configuration for LLM worker load capacity metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMWorkerLoadCapacityConfig {
    // Name of the component (service) whose endpoints are scraped
    pub component_name: String,
    // Name of the endpoint whose stats are aggregated
    pub endpoint_name: String,
}
// TODO: This is _really_ close to the async_nats::service::Stats object,
// but it's missing a few fields like "name", so use a temporary struct
// for easy deserialization. Ideally, this type already exists or can
// be exposed in the library somewhere.
/// Stats structure returned from NATS service API
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StatsWithData {
    // Standard NATS Service API fields
    pub average_processing_time: f64,
    pub last_error: String,
    pub num_errors: u64,
    pub num_requests: u64,
    pub processing_time: u64,
    pub queue_group: String,
    // Field containing custom stats handler data — arbitrary JSON emitted by
    // the endpoint's stats handler; decoded to ForwardPassMetrics downstream
    // in extract_metrics.
    pub data: serde_json::Value,
}
/// Prometheus metrics server for exposing metrics
pub struct PrometheusMetricsServer {
    // Gauges registered in the default prometheus registry.
    metrics: PrometheusMetrics,
}

impl PrometheusMetricsServer {
    /// Initialize the metrics server, registering all gauges in the default
    /// (global) prometheus registry. Fails if a same-named metric is already
    /// registered.
    pub fn new() -> Result<Self> {
        Ok(Self {
            metrics: PrometheusMetrics::new()?,
        })
    }

    /// Start the metrics server on the specified port.
    ///
    /// Serves `GET /metrics` on 0.0.0.0:{port} from a background tokio task;
    /// returns immediately.
    pub fn start(&mut self, port: u16) {
        // Create an axum router with a metrics endpoint
        let app = Router::new().route(
            "/metrics",
            get(|| async {
                // Gather and encode metrics from the global registry in the
                // prometheus text exposition format.
                use prometheus::Encoder;
                let encoder = prometheus::TextEncoder::new();
                let mut buffer = Vec::new();
                // NOTE(review): these unwraps panic the request task on
                // encode/UTF-8 failure; consider returning a 500 instead —
                // confirm desired behavior.
                encoder.encode(&prometheus::gather(), &mut buffer).unwrap();
                String::from_utf8(buffer).unwrap()
            }),
        );
        // Create a socket address to listen on
        let addr = SocketAddr::from(([0, 0, 0, 0], port));
        // Spawn the server in a background task.
        // NOTE(review): a bind/serve failure panics only this detached task;
        // the main application keeps running without a metrics endpoint —
        // confirm this is acceptable.
        tokio::spawn(async move {
            axum::Server::bind(&addr)
                .serve(app.into_make_service())
                .await
                .unwrap();
        });
        tracing::info!("Prometheus metrics server started at {addr:?}/metrics");
    }

    /// Update metrics with current values
    pub fn update(&mut self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
        self.metrics.update(config, processed);
    }
}
/// Prometheus metrics collection
///
/// Per-worker gauges are labeled (component, endpoint, worker_id); the
/// aggregate load gauges are labeled (component, endpoint) only.
pub struct PrometheusMetrics {
    // Active / total KV cache blocks per worker
    kv_blocks_active: prometheus::GaugeVec,
    kv_blocks_total: prometheus::GaugeVec,
    // Active / total request slots per worker
    requests_active: prometheus::GaugeVec,
    requests_total: prometheus::GaugeVec,
    // Aggregate load average / standard deviation across workers
    load_avg: prometheus::GaugeVec,
    load_std: prometheus::GaugeVec,
}
impl PrometheusMetrics {
    /// Initialize all metrics, registering each gauge in the default (global)
    /// prometheus registry. Fails if a same-named metric already exists.
    fn new() -> Result<Self> {
        Ok(Self {
            kv_blocks_active: register_gauge_vec!(
                "llm_kv_blocks_active",
                "Active KV cache blocks",
                &["component", "endpoint", "worker_id"]
            )?,
            kv_blocks_total: register_gauge_vec!(
                "llm_kv_blocks_total",
                "Total KV cache blocks",
                &["component", "endpoint", "worker_id"]
            )?,
            requests_active: register_gauge_vec!(
                "llm_requests_active_slots",
                "Active request slots",
                &["component", "endpoint", "worker_id"]
            )?,
            requests_total: register_gauge_vec!(
                "llm_requests_total_slots",
                "Total request slots",
                &["component", "endpoint", "worker_id"]
            )?,
            load_avg: register_gauge_vec!(
                "llm_load_avg",
                "Average load across workers",
                &["component", "endpoint"]
            )?,
            load_std: register_gauge_vec!(
                "llm_load_std",
                "Load standard deviation across workers",
                &["component", "endpoint"]
            )?,
        })
    }

    /// Helper method to set a gauge with worker-specific labels (3 labels)
    fn set_worker_gauge(
        &self,
        gauge: &prometheus::GaugeVec,
        config: &LLMWorkerLoadCapacityConfig,
        worker_id: &str,
        value: f64,
    ) {
        // Label order must match the order used at registration time.
        gauge
            .with_label_values(&[&config.component_name, &config.endpoint_name, worker_id])
            .set(value);
    }

    /// Helper method to set a gauge with component/endpoint labels only (2 labels)
    fn set_endpoint_gauge(
        &self,
        gauge: &prometheus::GaugeVec,
        config: &LLMWorkerLoadCapacityConfig,
        value: f64,
    ) {
        gauge
            .with_label_values(&[&config.component_name, &config.endpoint_name])
            .set(value);
    }

    /// Update metrics with current values
    ///
    /// NOTE(review): label sets for workers that disappear are never removed,
    /// so stale worker_id series keep their last values — confirm acceptable.
    fn update(&self, config: &LLMWorkerLoadCapacityConfig, processed: &ProcessedEndpoints) {
        // Update per-worker metrics
        for endpoint in processed.endpoints.iter() {
            let worker_id = endpoint.worker_id().to_string();
            let metrics = endpoint.data.clone();
            self.set_worker_gauge(
                &self.kv_blocks_active,
                config,
                &worker_id,
                metrics.kv_active_blocks as f64,
            );
            self.set_worker_gauge(
                &self.kv_blocks_total,
                config,
                &worker_id,
                metrics.kv_total_blocks as f64,
            );
            self.set_worker_gauge(
                &self.requests_active,
                config,
                &worker_id,
                metrics.request_active_slots as f64,
            );
            self.set_worker_gauge(
                &self.requests_total,
                config,
                &worker_id,
                metrics.request_total_slots as f64,
            );
        }

        // Update aggregate metrics
        self.set_endpoint_gauge(&self.load_avg, config, processed.load_avg);
        self.set_endpoint_gauge(&self.load_std, config, processed.load_std);
    }
}
/// Collect endpoints from a component
///
/// Scrapes the component's service stats (bounded by `timeout`) and keeps
/// only the endpoints whose subject begins with `subject`. Returns an empty
/// vector (after logging a warning) when nothing matches.
pub async fn collect_endpoints(
    component: &Component,
    subject: &str,
    timeout: Duration,
) -> Result<Vec<EndpointInfo>> {
    // Scrape stats from every backend of this component.
    let scraped = component.scrape_stats(timeout).await?;

    // Keep only the endpoints that belong to the requested service subject.
    let matching: Vec<_> = scraped
        .into_endpoints()
        .filter(|endpoint| endpoint.subject.starts_with(subject))
        .collect();
    tracing::debug!("Endpoints: {matching:?}");

    if matching.is_empty() {
        tracing::warn!("No endpoints found matching subject {subject}");
    }
    Ok(matching)
}
/// Extract metrics from endpoints
pub
fn
extract_metrics
(
endpoints
:
&
[
EndpointInfo
])
->
Vec
<
ForwardPassMetrics
>
{
let
endpoint_data
=
endpoints
.iter
()
.map
(|
e
|
e
.data
.clone
())
.collect
::
<
Vec
<
_
>>
();
// Extract StatsWithData objects from endpoint services
let
stats
:
Vec
<
StatsWithData
>
=
endpoint_data
.iter
()
.filter_map
(|
e
|
{
let
metrics_data
=
e
.as_ref
()
?
;
metrics_data
.clone
()
.decode
::
<
StatsWithData
>
()
.ok
()
})
.collect
();
tracing
::
debug!
(
"Stats: {stats:?}"
);
// Extract ForwardPassMetrics nested within Stats object
let
metrics
:
Vec
<
ForwardPassMetrics
>
=
stats
.iter
()
.filter_map
(
|
s
|
match
serde_json
::
from_value
::
<
ForwardPassMetrics
>
(
s
.data
.clone
())
{
Ok
(
metrics
)
=>
Some
(
metrics
),
Err
(
err
)
=>
{
tracing
::
warn!
(
"Error decoding metrics: {err}"
);
None
}
},
)
.collect
();
tracing
::
debug!
(
"Metrics: {metrics:?}"
);
metrics
}
/// Pair each metric with its endpoint and build a `ProcessedEndpoints`.
///
/// `metrics` and `endpoints` are zipped positionally, so they are expected
/// to be parallel slices; pairs whose endpoint id cannot be parsed are
/// silently dropped.
pub fn postprocess_metrics(
    metrics: &[ForwardPassMetrics],
    endpoints: &[EndpointInfo],
) -> ProcessedEndpoints {
    let mut processed_endpoints: Vec<Endpoint> = Vec::with_capacity(metrics.len());
    for (metric, endpoint_info) in metrics.iter().zip(endpoints.iter()) {
        // Skip endpoints whose id cannot be extracted from the subject.
        if let Ok(id) = endpoint_info.id() {
            processed_endpoints.push(Endpoint {
                name: format!("worker-{id}"),
                subject: endpoint_info.subject.clone(),
                data: metric.clone(),
            });
        }
    }
    ProcessedEndpoints::new(processed_endpoints)
}
applications/llm/count/src/main.rs
View file @
d38325c2
...
...
@@ -24,8 +24,6 @@
//! - KV Cache Blocks: [Active, Total]
use
clap
::
Parser
;
use
serde
::{
Deserialize
,
Serialize
};
use
triton_distributed_runtime
::{
error
,
logging
,
traits
::
events
::
EventPublisher
,
...
...
@@ -33,6 +31,12 @@ use triton_distributed_runtime::{
DistributedRuntime
,
ErrorContext
,
Result
,
Runtime
,
Worker
,
};
// Import from our library
use
count
::{
collect_endpoints
,
extract_metrics
,
postprocess_metrics
,
LLMWorkerLoadCapacityConfig
,
PrometheusMetricsServer
,
};
/// CLI arguments for the count application
#[derive(Parser,
Debug)]
#[command(author,
version,
about,
long_about
=
None)]
...
...
@@ -73,34 +77,8 @@ fn get_config(args: &Args) -> Result<LLMWorkerLoadCapacityConfig> {
})
}
// we will scrape the service_name and extract the endpoint_name metrics
// we will bcast them as {namespace}.events.l2c.{service_name}.{endpoint_name}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMWorkerLoadCapacityConfig {
    /// Name of the component whose stats are scraped.
    component_name: String,
    /// Name of the endpoint within the component to filter stats on.
    endpoint_name: String,
}
/// LLM Worker Load Capacity Metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMWorkerLoadCapacity {
    /// Number of request slots currently in use.
    pub requests_active_slots: u32,
    /// Total number of request slots available.
    pub requests_total_slots: u32,
    /// Number of KV cache blocks currently in use.
    pub kv_blocks_active: u32,
    /// Total number of KV cache blocks available.
    pub kv_blocks_total: u32,
}
/// Binary entry point: initialize logging, construct a `Worker` from
/// environment settings, and run the async `app` on it.
fn main() -> Result<()> {
    logging::init();
    let worker = Worker::from_settings()?;
    worker.execute(app)
}
// TODO - refactor much of this back into the library
async
fn
app
(
runtime
:
Runtime
)
->
Result
<
()
>
{
let
args
=
Args
::
parse
();
// we will start by assuming that there is no oscar and no planner
// to that end, we will use CLI args to get a singular config for scraping a single backend
let
config
=
get_config
(
&
args
)
?
;
tracing
::
info!
(
"Config: {config:?}"
);
...
...
@@ -109,8 +87,7 @@ async fn app(runtime: Runtime) -> Result<()> {
let
namespace
=
drt
.namespace
(
args
.namespace
)
?
;
let
component
=
namespace
.component
(
"count"
)
?
;
// there should only be one count
// check {component.etcd_path()}/instance for existing instances
// Create unique instance of Count
let
key
=
format!
(
"{}/instance"
,
component
.etcd_path
());
tracing
::
info!
(
"Creating unique instance of Count at {key}"
);
drt
.etcd_client
()
...
...
@@ -122,113 +99,53 @@ async fn app(runtime: Runtime) -> Result<()> {
.await
.context
(
"Unable to create unique instance of Count; possibly one already exists"
)
?
;
let
target
=
namespace
.component
(
&
config
.component_name
)
?
;
let
target_endpoint
=
target
.endpoint
(
&
config
.endpoint_name
);
let
target
_component
=
namespace
.component
(
&
config
.component_name
)
?
;
let
target_endpoint
=
target
_component
.endpoint
(
&
config
.endpoint_name
);
let
service_name
=
target
.service_name
();
let
service_name
=
target
_component
.service_name
();
let
service_subject
=
target_endpoint
.subject
();
tracing
::
info!
(
"Scraping service {service_name} and filtering on subject {service_subject}"
);
let
token
=
drt
.primary_lease
()
.child_token
();
let
event_name
=
format!
(
"l2c.{}.{}"
,
config
.component_name
,
config
.endpoint_name
);
let
address
=
format!
(
"{}.{}"
,
config
.component_name
,
config
.endpoint_name
,);
let
event_name
=
format!
(
"l2c.{}"
,
address
);
// TODO: Make metrics host/port configurable
// Initialize Prometheus metrics and start server
let
mut
metrics_server
=
PrometheusMetricsServer
::
new
()
?
;
metrics_server
.start
(
9091
);
loop
{
let
next
=
Instant
::
now
()
+
Duration
::
from_secs
(
args
.poll_interval
);
// collect stats from each backend
let
stream
=
target
.scrape_stats
(
Duration
::
from_secs
(
1
))
.await
?
;
tracing
::
debug!
(
"Scraped Stats Stream: {stream:?}"
);
// filter the stats by the service subject
let
endpoints
=
stream
.into_endpoints
()
.filter
(|
e
|
e
.subject
.starts_with
(
&
service_subject
))
.collect
::
<
Vec
<
_
>>
();
// Collect and process metrics
let
scrape_timeout
=
Duration
::
from_secs
(
1
);
let
endpoints
=
collect_endpoints
(
&
target_component
,
&
service_subject
,
scrape_timeout
)
.await
?
;
let
metrics
=
extract_metrics
(
&
endpoints
);
let
processed
=
postprocess_metrics
(
&
metrics
,
&
endpoints
);
tracing
::
info!
(
"Aggregated metrics: {processed:?}"
);
tracing
::
debug!
(
"Endpoints: {endpoints:?}"
);
if
endpoints
.is_empty
()
{
tracing
::
warn!
(
"No endpoints found matching subject {}"
,
service_subject
);
}
// Update Prometheus metrics
metrics_server
.update
(
&
config
,
&
processed
);
// extract the custom data from the stats and try to decode it as LLMWorkerLoadCapacity
let
metrics
=
endpoints
.iter
()
.filter_map
(|
e
|
match
e
.data
.clone
()
{
Some
(
metrics
)
=>
metrics
.decode
::
<
LLMWorkerLoadCapacity
>
()
.ok
(),
None
=>
None
,
})
.collect
::
<
Vec
<
_
>>
();
tracing
::
debug!
(
"Metrics: {metrics:?}"
);
// parse the endpoint ids
// the ids are the last part of the subject in hexadecimal
// form a list of tuples (kv_blocks_total - kv_blocks_active, requests_total_slots - requests_active_slots, id)
// this tuple represent the remaining capacity of each endpoint
let
capacity_with_ids
=
metrics
.iter
()
.zip
(
endpoints
.iter
())
.filter_map
(|(
m
,
e
)|
{
e
.id
()
.ok
()
.map
(|
id
|
{
(
m
.kv_blocks_total
-
m
.kv_blocks_active
,
m
.requests_total_slots
-
m
.requests_active_slots
,
id
,
)
})
})
.collect
::
<
Vec
<
_
>>
();
// compute mean / std of LLMWorkerLoadCapacity
let
load_values
:
Vec
<
f64
>
=
metrics
.iter
()
.map
(|
x
|
x
.kv_blocks_active
as
f64
)
.collect
();
let
load_avg
=
load_values
.iter
()
.sum
::
<
f64
>
()
/
load_values
.len
()
as
f64
;
let
variance
=
load_values
.iter
()
.map
(|
&
x
|
(
x
-
load_avg
)
.powi
(
2
))
.sum
::
<
f64
>
()
/
load_values
.len
()
as
f64
;
let
load_std
=
variance
.sqrt
();
let
processed
=
ProcessedEndpoints
{
capacity_with_ids
,
load_avg
,
load_std
,
address
:
address
.clone
(),
};
// publish using the namespace event plane
tracing
::
info!
(
"Publishing event {event_name} on namespace {namespace:?} with {processed:?}"
);
// TODO: Who needs to consume these events?
// Publish metrics event
namespace
.publish
(
&
event_name
,
&
processed
)
.await
?
;
//
w
ait until cancelled or the next tick
//
W
ait until cancelled or the next tick
match
tokio
::
time
::
timeout_at
(
next
,
token
.cancelled
())
.await
{
Ok
(
_
)
=>
break
,
Err
(
_
)
=>
{
// timeout, we continue
continue
;
}
Err
(
_
)
=>
continue
,
}
}
Ok
(())
}
#[derive(Debug,
Clone,
Serialize,
Deserialize)]
pub
struct
ProcessedEndpoints
{
/// (kv_blocks_total - kv_blocks_active, requests_total_slots - requests_active_slots, id)
pub
capacity_with_ids
:
Vec
<
(
u32
,
u32
,
i64
)
>
,
/// kv_blocks_active average
pub
load_avg
:
f64
,
/// kv_blocks_active standard deviation
pub
load_std
:
f64
,
/// {component}.{endpoint}
pub
address
:
String
,
fn
main
()
->
Result
<
()
>
{
logging
::
init
();
let
worker
=
Worker
::
from_settings
()
?
;
worker
.execute
(
app
)
}
#[cfg(test)]
...
...
@@ -239,11 +156,7 @@ mod tests {
#[test]
fn
test_namespace_from_env
()
{
env
::
set_var
(
"TRD_NAMESPACE"
,
"test-namespace"
);
// Parse args with no explicit namespace
let
args
=
Args
::
parse_from
([
"count"
,
"--component"
,
"comp"
,
"--endpoint"
,
"end"
]);
// Verify namespace was taken from environment variable
assert_eq!
(
args
.namespace
,
"test-namespace"
);
}
}
applications/llm/count/visualization/README.md
0 → 100644
View file @
d38325c2
# Metrics Visualization with Prometheus and Grafana
This directory contains configuration for visualizing metrics from the metrics aggregation service using Prometheus and Grafana.
## Components
-
**Prometheus**
: Collects and stores metrics from the service
-
**Grafana**
: Provides visualization dashboards for the metrics
## Getting Started
1.
Make sure Docker and Docker Compose are installed on your system
2.
Start
`count`
and the corresponding
`examples/rust/service_metrics/bin/server.rs`
that populates dummy KV Cache metrics.
3.
Start the visualization stack:
```
bash
docker compose up
-d
```
4.
Web servers started:
-
Grafana: http://localhost:3000 (default login: admin/admin)
-
Prometheus: http://localhost:9090
## Configuration
### Prometheus
The Prometheus configuration is defined in
`prometheus.yml`
. It is configured to scrape metrics from the metrics aggregation service endpoint.
Note: You may need to adjust the target based on your host configuration and network setup.
### Grafana
Grafana is pre-configured with:
-
Prometheus datasource
-
Sample dashboard for visualizing service metrics
## Required Files
The following configuration files should be present in this directory:
-
`docker-compose.yml`
: Defines the Prometheus and Grafana services
-
`prometheus.yml`
: Contains Prometheus scraping configuration
-
`grafana.json`
: Contains Grafana dashboard configuration
-
`grafana-datasources.yml`
: Contains Grafana datasource configuration
-
`grafana-dashboard-providers.yml`
: Contains Grafana dashboard provider configuration
## Metrics
The metrics aggregation (`count`) service exposes the following metrics for Prometheus to scrape:
-
`llm_load_avg`
: Average load across workers
-
`llm_load_std`
: Load standard deviation across workers
-
`llm_requests_active_slots`
: Number of currently active request slots
-
`llm_requests_total_slots`
: Total available request slots
-
`llm_kv_blocks_active`
: Number of active KV blocks
-
`llm_kv_blocks_total`
: Total KV blocks available
## Troubleshooting
1.
Verify services are running:
```
bash
docker compose ps
```
2.
Check logs:
```
bash
docker compose logs prometheus
docker compose logs grafana
```
applications/llm/count/visualization/docker-compose.yml
0 → 100644
View file @
d38325c2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
services
:
prometheus
:
image
:
prom/prometheus:latest
container_name
:
prometheus
volumes
:
-
./prometheus.yml:/etc/prometheus/prometheus.yml
-
prometheus_data:/prometheus
command
:
-
'
--config.file=/etc/prometheus/prometheus.yml'
-
'
--storage.tsdb.path=/prometheus'
# These provide the web console functionality
-
'
--web.console.libraries=/etc/prometheus/console_libraries'
-
'
--web.console.templates=/etc/prometheus/consoles'
-
'
--web.enable-lifecycle'
restart
:
unless-stopped
# TODO: Use more explicit networking setup when count is containerized
#ports:
# - "9090:9090"
#networks:
# - monitoring
network_mode
:
"
host"
grafana
:
image
:
grafana/grafana-enterprise:latest
container_name
:
grafana
volumes
:
-
./grafana.json:/etc/grafana/provisioning/dashboards/llm-worker-dashboard.json
-
./grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
-
./grafana-dashboard-providers.yml:/etc/grafana/provisioning/dashboards/dashboard-providers.yml
-
grafana_data:/var/lib/grafana
environment
:
-
GF_SECURITY_ADMIN_USER=admin
-
GF_SECURITY_ADMIN_PASSWORD=admin
-
GF_USERS_ALLOW_SIGN_UP=false
-
GF_INSTALL_PLUGINS=grafana-piechart-panel
restart
:
unless-stopped
# TODO: Use more explicit networking setup when count is containerized
#ports:
# - "3000:3000"
#networks:
# - monitoring
network_mode
:
"
host"
depends_on
:
-
prometheus
networks
:
monitoring
:
driver
:
bridge
volumes
:
prometheus_data
:
grafana_data
:
applications/llm/count/visualization/grafana-dashboard-providers.yml
0 → 100644
View file @
d38325c2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion
:
1
providers
:
-
name
:
'
default'
orgId
:
1
folder
:
'
'
type
:
file
disableDeletion
:
false
updateIntervalSeconds
:
10
allowUiUpdates
:
true
options
:
path
:
/etc/grafana/provisioning/dashboards
foldersFromFilesStructure
:
true
applications/llm/count/visualization/grafana-datasources.yml
0 → 100644
View file @
d38325c2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion
:
1
datasources
:
-
name
:
prometheus
type
:
prometheus
access
:
proxy
# TODO: Use proper docker networking
# url: http://prometheus:9090
url
:
http://localhost:9090
isDefault
:
true
applications/llm/count/visualization/grafana.json
0 → 100644
View file @
d38325c2
{
"annotations"
:
{
"list"
:
[
{
"builtIn"
:
1
,
"datasource"
:
{
"type"
:
"grafana"
,
"uid"
:
"-- Grafana --"
},
"enable"
:
true
,
"hide"
:
true
,
"iconColor"
:
"rgba(0, 211, 255, 1)"
,
"name"
:
"Annotations & Alerts"
,
"type"
:
"dashboard"
}
]
},
"copyright"
:
[
"SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
,
"SPDX-License-Identifier: Apache-2.0"
,
"Licensed under the Apache License, Version 2.0 (the
\"
License
\"
);"
,
"you may not use this file except in compliance with the License."
,
"You may obtain a copy of the License at"
,
"http://www.apache.org/licenses/LICENSE-2.0"
,
"Unless required by applicable law or agreed to in writing, software"
,
"distributed under the License is distributed on an
\"
AS IS
\"
BASIS,"
,
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied."
,
"See the License for the specific language governing permissions and"
,
"limitations under the License."
],
"editable"
:
true
,
"fiscalYearStartMonth"
:
0
,
"graphTooltip"
:
0
,
"id"
:
1
,
"links"
:
[],
"liveNow"
:
false
,
"panels"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
0
,
"y"
:
0
},
"id"
:
1
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"KV Cache Utilization by Worker"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"100 * llm_kv_blocks_active{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
} / llm_kv_blocks_total{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}"
,
"legendFormat"
:
"Worker {{worker_id}}"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
12
,
"y"
:
0
},
"id"
:
2
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"Request Slot Utilization by Worker"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"100 * llm_requests_active_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
} / llm_requests_total_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}"
,
"legendFormat"
:
"Worker {{worker_id}}"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"thresholds"
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"yellow"
,
"value"
:
50
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
6
,
"x"
:
0
,
"y"
:
8
},
"id"
:
3
,
"options"
:
{
"orientation"
:
"auto"
,
"reduceOptions"
:
{
"calcs"
:
[
"lastNotNull"
],
"fields"
:
""
,
"values"
:
false
},
"showThresholdLabels"
:
false
,
"showThresholdMarkers"
:
true
},
"pluginVersion"
:
"10.0.0"
,
"title"
:
"Average KV Cache Utilization"
,
"type"
:
"gauge"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"100 * avg(llm_kv_blocks_active{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}) / avg(llm_kv_blocks_total{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
})"
,
"legendFormat"
:
"__auto"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"thresholds"
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
},
{
"color"
:
"yellow"
,
"value"
:
50
},
{
"color"
:
"red"
,
"value"
:
80
}
]
},
"unit"
:
"percent"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
6
,
"x"
:
6
,
"y"
:
8
},
"id"
:
4
,
"options"
:
{
"orientation"
:
"auto"
,
"reduceOptions"
:
{
"calcs"
:
[
"lastNotNull"
],
"fields"
:
""
,
"values"
:
false
},
"showThresholdLabels"
:
false
,
"showThresholdMarkers"
:
true
},
"pluginVersion"
:
"10.0.0"
,
"title"
:
"Average Request Slot Utilization"
,
"type"
:
"gauge"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"100 * avg(llm_requests_active_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}) / avg(llm_requests_total_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
})"
,
"legendFormat"
:
"__auto"
,
"range"
:
true
,
"refId"
:
"A"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
}
]
},
"unit"
:
"none"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
12
,
"x"
:
12
,
"y"
:
8
},
"id"
:
5
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"Load Average & Standard Deviation"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"llm_load_avg{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}"
,
"legendFormat"
:
"Average"
,
"range"
:
true
,
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"llm_load_std{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
}"
,
"hide"
:
false
,
"legendFormat"
:
"StdDev"
,
"range"
:
true
,
"refId"
:
"B"
}
]
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"fieldConfig"
:
{
"defaults"
:
{
"color"
:
{
"mode"
:
"palette-classic"
},
"custom"
:
{
"axisCenteredZero"
:
false
,
"axisColorMode"
:
"text"
,
"axisLabel"
:
""
,
"axisPlacement"
:
"auto"
,
"barAlignment"
:
0
,
"drawStyle"
:
"line"
,
"fillOpacity"
:
20
,
"gradientMode"
:
"none"
,
"hideFrom"
:
{
"legend"
:
false
,
"tooltip"
:
false
,
"viz"
:
false
},
"lineInterpolation"
:
"smooth"
,
"lineWidth"
:
2
,
"pointSize"
:
5
,
"scaleDistribution"
:
{
"type"
:
"linear"
},
"showPoints"
:
"never"
,
"spanNulls"
:
false
,
"stacking"
:
{
"group"
:
"A"
,
"mode"
:
"none"
},
"thresholdsStyle"
:
{
"mode"
:
"off"
}
},
"mappings"
:
[],
"thresholds"
:
{
"mode"
:
"absolute"
,
"steps"
:
[
{
"color"
:
"green"
,
"value"
:
null
}
]
},
"unit"
:
"none"
},
"overrides"
:
[]
},
"gridPos"
:
{
"h"
:
8
,
"w"
:
24
,
"x"
:
0
,
"y"
:
16
},
"id"
:
6
,
"options"
:
{
"legend"
:
{
"calcs"
:
[
"mean"
,
"max"
],
"displayMode"
:
"table"
,
"placement"
:
"right"
,
"showLegend"
:
true
},
"tooltip"
:
{
"mode"
:
"multi"
,
"sort"
:
"none"
}
},
"title"
:
"Available Resources"
,
"type"
:
"timeseries"
,
"targets"
:
[
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"sum(llm_kv_blocks_total{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
} - llm_kv_blocks_active{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
})"
,
"legendFormat"
:
"Available KV Blocks"
,
"range"
:
true
,
"refId"
:
"A"
},
{
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"editorMode"
:
"code"
,
"expr"
:
"sum(llm_requests_total_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
} - llm_requests_active_slots{component=
\"
$component
\"
, endpoint=
\"
$endpoint
\"
})"
,
"hide"
:
false
,
"legendFormat"
:
"Available Request Slots"
,
"range"
:
true
,
"refId"
:
"B"
}
]
}
],
"refresh"
:
"5s"
,
"schemaVersion"
:
38
,
"style"
:
"dark"
,
"tags"
:
[
"llm"
,
"metrics"
],
"templating"
:
{
"list"
:
[
{
"current"
:
{
"selected"
:
false
,
"text"
:
"backend"
,
"value"
:
"backend"
},
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"definition"
:
"label_values(llm_kv_blocks_active, component)"
,
"hide"
:
0
,
"includeAll"
:
false
,
"label"
:
"Component"
,
"multi"
:
false
,
"name"
:
"component"
,
"options"
:
[],
"query"
:
{
"query"
:
"label_values(llm_kv_blocks_active, component)"
,
"refId"
:
"StandardVariableQuery"
},
"refresh"
:
1
,
"regex"
:
""
,
"skipUrlSync"
:
false
,
"sort"
:
0
,
"type"
:
"query"
},
{
"current"
:
{
"selected"
:
false
,
"text"
:
"generate"
,
"value"
:
"generate"
},
"datasource"
:
{
"type"
:
"prometheus"
,
"uid"
:
"prometheus"
},
"definition"
:
"label_values(llm_kv_blocks_active{component=
\"
$component
\"
}, endpoint)"
,
"hide"
:
0
,
"includeAll"
:
false
,
"label"
:
"Endpoint"
,
"multi"
:
false
,
"name"
:
"endpoint"
,
"options"
:
[],
"query"
:
{
"query"
:
"label_values(llm_kv_blocks_active{component=
\"
$component
\"
}, endpoint)"
,
"refId"
:
"StandardVariableQuery"
},
"refresh"
:
1
,
"regex"
:
""
,
"skipUrlSync"
:
false
,
"sort"
:
0
,
"type"
:
"query"
}
]
},
"time"
:
{
"from"
:
"now-15m"
,
"to"
:
"now"
},
"timepicker"
:
{},
"timezone"
:
""
,
"title"
:
"LLM Worker Metrics"
,
"uid"
:
"llm-worker-metrics"
,
"version"
:
1
,
"weekStart"
:
""
}
applications/llm/count/visualization/prometheus.yml
0 → 100644
View file @
d38325c2
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
global
:
scrape_interval
:
1s
evaluation_interval
:
1s
scrape_configs
:
-
job_name
:
'
count'
static_configs
:
# TODO: Use proper docker networking
# - targets: ['host.docker.internal:9091']
-
targets
:
[
'
localhost:9091'
]
examples/rust/service_metrics/Cargo.toml
View file @
d38325c2
...
...
@@ -29,4 +29,4 @@ triton-distributed-runtime = { workspace = true }
futures
=
{
workspace
=
true
}
serde
=
{
workspace
=
true
}
serde_json
=
{
workspace
=
true
}
tokio
=
{
workspace
=
true
}
tokio
=
{
workspace
=
true
}
\ No newline at end of file
lib/llm/src/kv_router.rs
View file @
d38325c2
...
...
@@ -23,8 +23,8 @@ use triton_distributed_runtime::{component::Component, DistributedRuntime};
pub
mod
indexer
;
pub
mod
protocols
;
pub
mod
publisher
;
mod
scheduler
;
mod
scoring
;
pub
mod
scheduler
;
pub
mod
scoring
;
use
crate
::
kv_router
::{
indexer
::{
KvIndexer
,
KvIndexerInterface
,
RouterEvent
},
...
...
lib/llm/src/kv_router/scoring.rs
View file @
d38325c2
...
...
@@ -15,11 +15,12 @@
//! Scoring functions for the KV router.
use
serde
::{
Deserialize
,
Serialize
};
use
std
::
collections
::
HashSet
;
use
crate
::
kv_router
::
scheduler
::
Endpoint
;
#[derive(Debug,
Default)]
#[derive(Debug,
Default
,
Serialize,
Deserialize
)]
pub
struct
ProcessedEndpoints
{
pub
endpoints
:
Vec
<
Endpoint
>
,
pub
worker_ids
:
Vec
<
i64
>
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment