"deploy/vscode:/vscode.git/clone" did not exist on "7f136e29c1c697676226ccbefbcbafd7c70dbb58"
Commit 3b7a462d authored by Ryan Olson's avatar Ryan Olson Committed by GitHub
Browse files

feat: event plane + count


Signed-off-by: default avatarRyan Olson <ryanolson@users.noreply.github.com>
Co-authored-by: default avatarRyan McCormick <rmccormick@nvidia.com>
parent 6e0ccccb
......@@ -28,7 +28,7 @@ on:
paths:
- 'runtime/rust/**'
- 'llm/rust/**'
- 'applications/llm/bin/tio/**'
- 'applications/llm/tio/**'
- '**.rs'
- 'Cargo.toml'
- 'Cargo.lock'
......@@ -68,7 +68,7 @@ jobs:
working-directory: runtime/rust
run: cargo check --locked
- name: Run Cargo Check on tio
working-directory: applications/llm/bin/tio
working-directory: applications/llm/tio
run: cargo check --locked
- name: Verify Code Formatting
working-directory: runtime/rust
......@@ -77,7 +77,7 @@ jobs:
working-directory: runtime/rust
run: cargo clippy --no-deps --all-targets -- -D warnings
- name: Run Clippy Checks on tio
working-directory: applications/llm/bin/tio
working-directory: applications/llm/tio
run: cargo clippy --no-deps --all-targets -- -D warnings
- name: Install and Run cargo-deny
working-directory: runtime/rust
......
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "count"
version = "0.1.0"
edition = "2021"
[dependencies]
# local
triton-distributed = { path = "../../../runtime/rust" }
triton-llm = { path = "../../../llm/rust/triton-llm" }
# workspace - todo
# crates.io
serde = { version = "1", features = ["derive"] }
serde_json = { version = "1" }
tokio = { version = "1", features = ["full"] }
tracing = { version = "0.1" }
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Count is a metrics aggregator designed to operate within a namespace and collect
//! metrics from all workers.
//!
//! Metrics collected for now:
//!
//! - LLM Worker Load:Capacity
//! - These metrics will be scraped by the LLM NATS Service API's stats request
//! - Request Slots: [Active, Total]
//! - KV Cache Blocks: [Active, Total]
use serde::{Deserialize, Serialize};
use triton_distributed::{
error, logging,
traits::events::EventPublisher,
utils::{Duration, Instant},
DistributedRuntime, ErrorContext, Result, Runtime, Worker,
};
use tracing as log;
// enum MetricTypes {
// LLMWorkerLoadCapacity(LLMWorkerLoadCapacityConfig),
// }
/// Builds the scrape configuration from the process environment.
///
/// Reads `TRD_COUNT_SCRAPE_COMPONENT` first, then `TRD_COUNT_SCRAPE_ENDPOINT`.
///
/// # Errors
///
/// Propagates the `std::env::var` error when a variable cannot be read, and
/// returns an explicit error when a variable is present but empty.
fn get_config() -> Result<LLMWorkerLoadCapacityConfig> {
    let component_name = match std::env::var("TRD_COUNT_SCRAPE_COMPONENT")? {
        value if value.is_empty() => {
            return Err(error!("TRD_COUNT_SCRAPE_COMPONENT is not set"));
        }
        value => value,
    };

    let endpoint_name = match std::env::var("TRD_COUNT_SCRAPE_ENDPOINT")? {
        value if value.is_empty() => {
            return Err(error!("TRD_COUNT_SCRAPE_ENDPOINT is not set"));
        }
        value => value,
    };

    Ok(LLMWorkerLoadCapacityConfig {
        component_name,
        endpoint_name,
    })
}
// we will scrape the service_name and extract the endpoint_name metrics
// we will broadcast them as {namespace}.events.l2c.{service_name}.{endpoint_name}
/// Identifies the single component/endpoint pair whose stats Count scrapes.
///
/// Built from the environment by `get_config`; `app` also serializes it to JSON
/// and stores it as the value of the unique Count instance key in etcd.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMWorkerLoadCapacityConfig {
/// Name of the component to scrape (read from `TRD_COUNT_SCRAPE_COMPONENT`).
component_name: String,
/// Name of the endpoint whose metrics are extracted (read from `TRD_COUNT_SCRAPE_ENDPOINT`).
endpoint_name: String,
}
/// LLM Worker Load Capacity Metrics
///
/// `app` attempts to decode this from the custom data attached to each scraped
/// endpoint; endpoints whose data is missing or does not decode are ignored.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LLMWorkerLoadCapacity {
/// Request slots currently in use.
pub requests_active_slots: u32,
/// Total request slots. NOTE(review): presumably an upper bound on
/// `requests_active_slots`, but nothing here enforces it - confirm with the producer.
pub requests_total_slots: u32,
/// KV cache blocks currently in use; `app` computes its load mean/std from this field.
pub kv_blocks_active: u32,
/// Total KV cache blocks. NOTE(review): presumably an upper bound on
/// `kv_blocks_active`, but nothing here enforces it - confirm with the producer.
pub kv_blocks_total: u32,
}
/// Process entry point: sets up logging, builds a [`Worker`] from settings and
/// hands it the async [`app`] to execute.
fn main() -> Result<()> {
    logging::init();
    Worker::from_settings()?.execute(app)
}
// TODO - refactor much of this back into the library
/// Runs the Count aggregation loop until the primary lease's child token is cancelled.
///
/// Steps:
/// 1. Load the scrape target from the environment (`get_config`) and resolve the
///    namespace from `TRD_NAMESPACE` (falling back to `"default"`).
/// 2. Claim the single-instance key `{component.etcd_path()}/instance` in etcd,
///    bound to the primary lease, so that only one Count runs per namespace.
/// 3. Every two seconds: scrape the target component's stats, keep the endpoints
///    whose subject starts with the target endpoint's subject, decode their custom
///    data as [`LLMWorkerLoadCapacity`], and publish a [`ProcessedEndpoints`]
///    summary on the namespace event plane as `l2c.{component}.{endpoint}`.
///
/// # Errors
///
/// Returns an error if configuration or runtime setup fails, if the unique
/// instance key cannot be created, or if a scrape or publish call fails.
async fn app(runtime: Runtime) -> Result<()> {
    // we will start by assuming that there is no oscar and no planner
    // to that end, we will use an env to get a singular config for scraping a single backend
    let config = get_config()?;
    let drt = DistributedRuntime::from_settings(runtime.clone()).await?;

    // todo move to distributed and standardize and move into file/env/cli config
    let namespace = std::env::var("TRD_NAMESPACE").unwrap_or_else(|_| "default".to_string());
    let namespace = drt.namespace(namespace)?;
    let component = namespace.component("count")?;

    // there should only be one count
    // check {component.etcd_path()}/instance for existing instances
    let key = format!("{}/instance", component.etcd_path());
    drt.etcd_client()
        .kv_create(
            key,
            serde_json::to_vec_pretty(&config)?,
            Some(drt.primary_lease().id()),
        )
        .await
        .context("Unable to create unique instance of Count; possibly one already exists")?;

    let target = namespace.component(&config.component_name)?;
    let target_endpoint = target.endpoint(&config.endpoint_name);
    let service_name = target.service_name();
    let service_subject = target_endpoint.subject();
    log::debug!("Scraping service {service_name} and filtering on subject {service_subject}");

    let token = drt.primary_lease().child_token();
    let address = format!("{}.{}", config.component_name, config.endpoint_name,);
    let event_name = format!("l2c.{}", address);

    loop {
        // TODO - make this configurable
        let next = Instant::now() + Duration::from_secs(2);

        // collect stats from each backend
        let stream = target.scrape_stats(Duration::from_secs(1)).await?;

        // filter the stats by the service subject
        let endpoints = stream
            .into_endpoints()
            .filter(|e| e.subject.starts_with(&service_subject))
            .collect::<Vec<_>>();

        // Decode the custom data of each endpoint as LLMWorkerLoadCapacity and keep
        // the metrics paired with the endpoint they came from. Decoding into a
        // separate list and zipping it back against `endpoints` would shift the
        // pairing whenever an endpoint has no (or undecodable) data, attributing
        // capacity to the wrong id.
        let decoded = endpoints
            .iter()
            .filter_map(|e| {
                let metrics = e.data.clone()?.decode::<LLMWorkerLoadCapacity>().ok()?;
                Some((e, metrics))
            })
            .collect::<Vec<_>>();

        // parse the endpoint ids
        // the ids are the last part of the subject in hexadecimal
        // form a list of tuples (kv_blocks_total - kv_blocks_active, requests_total_slots - requests_active_slots, id)
        // this tuple represent the remaining capacity of each endpoint
        // Saturating subtraction: a worker reporting active > total yields 0 remaining
        // capacity instead of a debug-build panic or a wrapped, enormous value.
        let capacity_with_ids = decoded
            .iter()
            .filter_map(|(e, m)| {
                e.id().ok().map(|id| {
                    (
                        m.kv_blocks_total.saturating_sub(m.kv_blocks_active),
                        m.requests_total_slots
                            .saturating_sub(m.requests_active_slots),
                        id,
                    )
                })
            })
            .collect::<Vec<_>>();

        // compute mean / std of the active KV blocks over every decoded worker
        let load_values: Vec<f64> = decoded
            .iter()
            .map(|(_, m)| f64::from(m.kv_blocks_active))
            .collect();
        let (load_avg, load_std) = mean_and_std(&load_values);

        let processed = ProcessedEndpoints {
            capacity_with_ids,
            load_avg,
            load_std,
            address: address.clone(),
        };

        // publish using the namespace event plane
        namespace.publish(&event_name, &processed).await?;

        // wait until cancelled or the next tick; a timeout means "next tick"
        if tokio::time::timeout_at(next, token.cancelled())
            .await
            .is_ok()
        {
            break;
        }
    }

    Ok(())
}

/// Returns the arithmetic mean and population standard deviation of `values`.
///
/// An empty slice yields `(0.0, 0.0)` rather than the NaN that `0.0 / 0.0` would
/// produce, so that a scrape cycle with no decodable workers still publishes
/// finite numbers.
fn mean_and_std(values: &[f64]) -> (f64, f64) {
    if values.is_empty() {
        return (0.0, 0.0);
    }
    let count = values.len() as f64;
    let mean = values.iter().sum::<f64>() / count;
    let variance = values.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / count;
    (mean, variance.sqrt())
}
/// Aggregated view of the scraped workers that `app` publishes on the namespace
/// event plane once per scrape cycle, under the event name `l2c.{component}.{endpoint}`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessedEndpoints {
/// (kv_blocks_total - kv_blocks_active, requests_total_slots - requests_active_slots, id)
pub capacity_with_ids: Vec<(u32, u32, i64)>,
/// kv_blocks_active average
pub load_avg: f64,
/// kv_blocks_active standard deviation
pub load_std: f64,
/// {component}.{endpoint}
pub address: String,
}
......@@ -445,15 +445,16 @@ checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
[[package]]
name = "blake3"
version = "1.5.5"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e"
checksum = "1230237285e3e10cde447185e8975408ae24deaa67205ce684805c25bc0c7937"
dependencies = [
"arrayref",
"arrayvec",
"cc",
"cfg-if 1.0.0",
"constant_time_eq",
"memmap2",
]
[[package]]
......@@ -626,9 +627,9 @@ dependencies = [
[[package]]
name = "cc"
version = "1.2.13"
version = "1.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7777341816418c02e033934a09f20dc0ccaf65a5201ef8a450ae0105a573fda"
checksum = "0c3d1b2e905a3a7b00a6141adb0e4c0bb941d11caf55349d863942a1cc44e3c9"
dependencies = [
"jobserver",
"libc",
......@@ -694,9 +695,9 @@ dependencies = [
[[package]]
name = "clap"
version = "4.5.29"
version = "4.5.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8acebd8ad879283633b343856142139f2da2317c96b05b4dd6181c61e2480184"
checksum = "92b7b18d71fad5313a1e320fa9897994228ce274b60faa4d694fe0ea89cd9e6d"
dependencies = [
"clap_builder",
"clap_derive",
......@@ -704,9 +705,9 @@ dependencies = [
[[package]]
name = "clap_builder"
version = "4.5.29"
version = "4.5.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6ba32cbda51c7e1dfd49acc1457ba1a7dec5b64fe360e828acb13ca8dc9c2f9"
checksum = "a35db2071778a7344791a4fb4f95308b5673d219dee3ae348b86642574ecc90c"
dependencies = [
"anstream",
"anstyle",
......@@ -945,9 +946,9 @@ dependencies = [
[[package]]
name = "cudarc"
version = "0.13.4"
version = "0.13.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b68d7c284d40d96a4251330ab583c2718b412f4fc53239d295b3a1f8735f426"
checksum = "cf16a4eaf3c5c36c9a7e4096bf8611cd963aa71d6b67162d538d7ea13befeeea"
dependencies = [
"half",
"libloading",
......@@ -1349,9 +1350,9 @@ dependencies = [
[[package]]
name = "equivalent"
version = "1.0.1"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "errno"
......@@ -1828,9 +1829,9 @@ checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
[[package]]
name = "h2"
version = "0.4.7"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e"
checksum = "5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2"
dependencies = [
"atomic-waker",
"bytes",
......@@ -2787,7 +2788,7 @@ dependencies = [
"tqdm",
"tracing",
"tracing-subscriber",
"uuid 1.13.1",
"uuid 1.13.2",
"variantly",
"vob",
]
......@@ -3212,9 +3213,9 @@ dependencies = [
[[package]]
name = "openssl"
version = "0.10.70"
version = "0.10.71"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6"
checksum = "5e14130c6a98cd258fdcb0fb6d744152343ff729cbfcb28c656a9d12b999fbcd"
dependencies = [
"bitflags 2.8.0",
"cfg-if 1.0.0",
......@@ -3244,9 +3245,9 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
[[package]]
name = "openssl-sys"
version = "0.9.105"
version = "0.9.106"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc"
checksum = "8bb61ea9811cc39e3c2069f40b8b8e2e70d8569b361f879786cc7ed48b777cdd"
dependencies = [
"cc",
"libc",
......@@ -3856,15 +3857,14 @@ dependencies = [
[[package]]
name = "ring"
version = "0.17.8"
version = "0.17.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d"
checksum = "e75ec5e92c4d8aede845126adc388046234541629e76029599ed35a003c7ed24"
dependencies = [
"cc",
"cfg-if 1.0.0",
"getrandom 0.2.15",
"libc",
"spin",
"untrusted",
"windows-sys 0.52.0",
]
......@@ -4323,9 +4323,9 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.13.2"
version = "1.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd"
[[package]]
name = "socket2"
......@@ -4360,12 +4360,6 @@ dependencies = [
"vob",
]
[[package]]
name = "spin"
version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
[[package]]
name = "spki"
version = "0.7.3"
......@@ -4547,9 +4541,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1"
[[package]]
name = "tempfile"
version = "3.16.0"
version = "3.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91"
checksum = "22e5a0acb1f3f55f65cc4a866c361b2fb2a0ff6366785ae6fbb5f85df07ba230"
dependencies = [
"cfg-if 1.0.0",
"fastrand",
......@@ -5113,7 +5107,7 @@ dependencies = [
"tokio-util",
"tracing",
"tracing-subscriber",
"uuid 1.13.1",
"uuid 1.13.2",
"validator",
"xxhash-rust",
]
......@@ -5145,7 +5139,7 @@ dependencies = [
"tracing",
"triton-distributed",
"unicode-segmentation",
"uuid 1.13.1",
"uuid 1.13.2",
"validator",
"xxhash-rust",
]
......@@ -5169,9 +5163,9 @@ dependencies = [
[[package]]
name = "typenum"
version = "1.17.0"
version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
[[package]]
name = "uncased"
......@@ -5293,9 +5287,9 @@ dependencies = [
[[package]]
name = "uuid"
version = "1.13.1"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ced87ca4be083373936a67f8de945faa23b6b42384bd5b64434850802c6dccd0"
checksum = "8c1f41ffb7cf259f1ecc2876861a17e7142e63ead296f671f81f6ae85903e0d6"
dependencies = [
"getrandom 0.3.1",
"serde",
......
......@@ -42,5 +42,5 @@ tokio = { version = "1", features = ["full"] }
tokio-util = { version = "0.7", features = ["codec", "net"] }
tracing = { version = "0.1" }
tracing-subscriber = { version = "0.3", features = ["env-filter", "local-time", "json"] }
triton-distributed = { path = "../../../../runtime/rust" }
triton-llm = { path = "../../../../llm/rust/triton-llm" }
triton-distributed = { path = "../../../runtime/rust" }
triton-llm = { path = "../../../llm/rust/triton-llm" }
......@@ -2470,6 +2470,15 @@ dependencies = [
"serde",
]
[[package]]
name = "service_metrics"
version = "0.2.0"
dependencies = [
"futures",
"tokio",
"triton-distributed",
]
[[package]]
name = "sha2"
version = "0.10.8"
......
......@@ -18,6 +18,7 @@ members = [
"hello_world",
"http",
"llmctl",
"service_metrics",
]
resolver = "2"
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
[package]
name = "service_metrics"
version.workspace = true
edition.workspace = true
authors.workspace = true
license.workspace = true
homepage.workspace = true
repository.workspace = true
[dependencies]
triton-distributed = { workspace = true }
# third-party
futures = { workspace = true }
tokio = { workspace = true }
# Service Metrics
This example extends the hello_world example by calling the `scrape_service` method
with the name of the service that handled the request the client just issued.
The client can now observe some basic statistics about each instance of the service
being hosted.
If you start two copies of the server, you will see two entries being emitted.
## Example Output
```
Annotated { data: Some("h"), id: None, event: None, comment: None }
Annotated { data: Some("e"), id: None, event: None, comment: None }
Annotated { data: Some("l"), id: None, event: None, comment: None }
Annotated { data: Some("l"), id: None, event: None, comment: None }
Annotated { data: Some("o"), id: None, event: None, comment: None }
Annotated { data: Some(" "), id: None, event: None, comment: None }
Annotated { data: Some("w"), id: None, event: None, comment: None }
Annotated { data: Some("o"), id: None, event: None, comment: None }
Annotated { data: Some("r"), id: None, event: None, comment: None }
Annotated { data: Some("l"), id: None, event: None, comment: None }
Annotated { data: Some("d"), id: None, event: None, comment: None }
ServiceSet { services: [ServiceInfo { name: "triton_init_backend_720278f8", id: "j6n37goJog3df2PMkQK1Ry", version: "0.0.1", started: "2025-02-18T20:51:01.40830026Z", endpoints: [EndpointInfo { name: "triton_init_backend_720278f8-generate-694d94fc30dbb562", subject: "triton_init_backend_720278f8.generate-694d94fc30dbb562", data: Some(Metrics(Object {"average_processing_time": Number(67387), "last_error": String(""), "num_errors": Number(0), "num_requests": Number(1), "processing_time": Number(67387), "queue_group": String("q")})) }] }] }
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment