Unverified Commit f05f7629 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

chore: Make nats_client private at crate level, various tidy up (#4513)


Signed-off-by: default avatarGraham King <grahamk@nvidia.com>
parent 27904535
...@@ -2,22 +2,22 @@ ...@@ -2,22 +2,22 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use super::handle::AuditRecord; use super::handle::AuditRecord;
use std::sync::{Arc, OnceLock}; use std::sync::OnceLock;
use tokio::sync::broadcast; use tokio::sync::broadcast;
static BUS: OnceLock<broadcast::Sender<Arc<AuditRecord>>> = OnceLock::new(); static BUS: OnceLock<broadcast::Sender<AuditRecord>> = OnceLock::new();
pub fn init(capacity: usize) { pub fn init(capacity: usize) {
let (tx, _rx) = broadcast::channel::<Arc<AuditRecord>>(capacity); let (tx, _rx) = broadcast::channel::<AuditRecord>(capacity);
let _ = BUS.set(tx); let _ = BUS.set(tx);
} }
pub fn subscribe() -> broadcast::Receiver<Arc<AuditRecord>> { pub fn subscribe() -> broadcast::Receiver<AuditRecord> {
BUS.get().expect("audit bus not initialized").subscribe() BUS.get().expect("audit bus not initialized").subscribe()
} }
pub fn publish(rec: AuditRecord) { pub fn publish(rec: AuditRecord) {
if let Some(tx) = BUS.get() { if let Some(tx) = BUS.get() {
let _ = tx.send(Arc::new(rec)); let _ = tx.send(rec);
} }
} }
...@@ -10,11 +10,11 @@ pub struct AuditPolicy { ...@@ -10,11 +10,11 @@ pub struct AuditPolicy {
static POLICY: OnceLock<AuditPolicy> = OnceLock::new(); static POLICY: OnceLock<AuditPolicy> = OnceLock::new();
/// Audit is enabled if we have at least one sink
pub fn init_from_env() -> AuditPolicy { pub fn init_from_env() -> AuditPolicy {
let enabled = std::env::var("DYN_AUDIT_ENABLED") AuditPolicy {
.map(|v| v == "1" || v.eq_ignore_ascii_case("true")) enabled: std::env::var("DYN_AUDIT_SINKS").is_ok(),
.unwrap_or(false); }
AuditPolicy { enabled }
} }
pub fn policy() -> AuditPolicy { pub fn policy() -> AuditPolicy {
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use anyhow::Context as _;
use async_nats::jetstream; use async_nats::jetstream;
use async_trait::async_trait; use async_trait::async_trait;
use dynamo_runtime::transports::nats;
use std::sync::Arc; use std::sync::Arc;
use tokio::sync::broadcast; use tokio::sync::broadcast;
...@@ -36,7 +38,7 @@ pub struct NatsSink { ...@@ -36,7 +38,7 @@ pub struct NatsSink {
} }
impl NatsSink { impl NatsSink {
pub fn new(nats_client: &dynamo_runtime::transports::nats::Client) -> Self { pub fn new(nats_client: dynamo_runtime::transports::nats::Client) -> Self {
let subject = std::env::var("DYN_AUDIT_NATS_SUBJECT") let subject = std::env::var("DYN_AUDIT_NATS_SUBJECT")
.unwrap_or_else(|_| "dynamo.audit.v1".to_string()); .unwrap_or_else(|_| "dynamo.audit.v1".to_string());
Self { Self {
...@@ -64,37 +66,32 @@ impl AuditSink for NatsSink { ...@@ -64,37 +66,32 @@ impl AuditSink for NatsSink {
} }
} }
fn parse_sinks_from_env( async fn parse_sinks_from_env() -> anyhow::Result<Vec<Arc<dyn AuditSink>>> {
nats_client: Option<&dynamo_runtime::transports::nats::Client>,
) -> Vec<Arc<dyn AuditSink>> {
let cfg = std::env::var("DYN_AUDIT_SINKS").unwrap_or_else(|_| "stderr".into()); let cfg = std::env::var("DYN_AUDIT_SINKS").unwrap_or_else(|_| "stderr".into());
let mut out: Vec<Arc<dyn AuditSink>> = Vec::new(); let mut out: Vec<Arc<dyn AuditSink>> = Vec::new();
for name in cfg.split(',').map(|s| s.trim().to_lowercase()) { for name in cfg.split(',').map(|s| s.trim().to_lowercase()) {
match name.as_str() { match name.as_str() {
"stderr" | "" => out.push(Arc::new(StderrSink)), "stderr" | "" => out.push(Arc::new(StderrSink)),
"nats" => { "nats" => {
if let Some(client) = nats_client { let nats_client = nats::ClientOptions::default()
out.push(Arc::new(NatsSink::new(client))); .connect()
} else { .await
tracing::warn!( .context("Attempting to connect NATS sink from env var DYN_AUDIT_SINKS")?;
"NATS sink requested but no DistributedRuntime NATS client available; skipping" out.push(Arc::new(NatsSink::new(nats_client)));
);
}
} }
// "pg" => out.push(Arc::new(PostgresSink::from_env())), // "pg" => out.push(Arc::new(PostgresSink::from_env())),
other => tracing::warn!(%other, "audit: unknown sink ignored"), other => tracing::warn!(%other, "audit: unknown sink ignored"),
} }
} }
out Ok(out)
} }
/// spawn one worker per sink; each subscribes to the bus (off hot path) /// spawn one worker per sink; each subscribes to the bus (off hot path)
pub fn spawn_workers_from_env(drt: &dynamo_runtime::DistributedRuntime) { pub async fn spawn_workers_from_env() -> anyhow::Result<()> {
let nats_client = drt.nats_client(); let sinks = parse_sinks_from_env().await?;
let sinks = parse_sinks_from_env(nats_client);
for sink in sinks { for sink in sinks {
let name = sink.name(); let name = sink.name();
let mut rx: broadcast::Receiver<Arc<AuditRecord>> = bus::subscribe(); let mut rx: broadcast::Receiver<AuditRecord> = bus::subscribe();
tokio::spawn(async move { tokio::spawn(async move {
loop { loop {
match rx.recv().await { match rx.recv().await {
...@@ -110,4 +107,5 @@ pub fn spawn_workers_from_env(drt: &dynamo_runtime::DistributedRuntime) { ...@@ -110,4 +107,5 @@ pub fn spawn_workers_from_env(drt: &dynamo_runtime::DistributedRuntime) {
}); });
} }
tracing::info!("Audit sinks ready."); tracing::info!("Audit sinks ready.");
Ok(())
} }
...@@ -39,10 +39,8 @@ pub struct Controller<Locality: LocalityProvider, Metadata: BlockMetadata> { ...@@ -39,10 +39,8 @@ pub struct Controller<Locality: LocalityProvider, Metadata: BlockMetadata> {
impl<Locality: LocalityProvider, Metadata: BlockMetadata> Controller<Locality, Metadata> { impl<Locality: LocalityProvider, Metadata: BlockMetadata> Controller<Locality, Metadata> {
pub async fn new( pub async fn new(
block_manager: KvBlockManager<Locality, Metadata>, block_manager: KvBlockManager<Locality, Metadata>,
mut component: dynamo_runtime::component::Component, component: dynamo_runtime::component::Component,
) -> anyhow::Result<Self> { ) -> anyhow::Result<Self> {
component.add_stats_service().await?;
let handler = ControllerHandler::new(block_manager.clone()); let handler = ControllerHandler::new(block_manager.clone());
let engine = Ingress::for_engine(handler.clone())?; let engine = Ingress::for_engine(handler.clone())?;
......
...@@ -117,7 +117,7 @@ pub async fn run_input( ...@@ -117,7 +117,7 @@ pub async fn run_input(
.and_then(|v| v.parse().ok()) .and_then(|v| v.parse().ok())
.unwrap_or(1024); .unwrap_or(1024);
crate::audit::bus::init(cap); crate::audit::bus::init(cap);
crate::audit::sink::spawn_workers_from_env(&drt); crate::audit::sink::spawn_workers_from_env().await?;
tracing::info!(cap, "Audit initialized"); tracing::info!(cap, "Audit initialized");
} }
......
...@@ -32,14 +32,9 @@ pub async fn run( ...@@ -32,14 +32,9 @@ pub async fn run(
let cancel_token = distributed_runtime.primary_token().clone(); let cancel_token = distributed_runtime.primary_token().clone();
let endpoint_id: EndpointId = path.parse()?; let endpoint_id: EndpointId = path.parse()?;
let mut component = distributed_runtime let component = distributed_runtime
.namespace(&endpoint_id.namespace)? .namespace(&endpoint_id.namespace)?
.component(&endpoint_id.component)?; .component(&endpoint_id.component)?;
// We can only make the NATS service if we have NATS
if distributed_runtime.nats_client().is_some() {
component.add_stats_service().await?;
}
let endpoint = component.endpoint(&endpoint_id.name); let endpoint = component.endpoint(&endpoint_id.name);
let rt_fut: Pin<Box<dyn Future<Output = _> + Send + 'static>> = match engine_config { let rt_fut: Pin<Box<dyn Future<Output = _> + Send + 'static>> = match engine_config {
......
...@@ -1002,8 +1002,7 @@ mod tests { ...@@ -1002,8 +1002,7 @@ mod tests {
// Create namespace and shared component for both seq_managers // Create namespace and shared component for both seq_managers
let namespace = distributed.namespace("test_cross_instance_sync")?; let namespace = distributed.namespace("test_cross_instance_sync")?;
let mut component = namespace.component("sequences")?; let component = namespace.component("sequences")?;
component.add_stats_service().await?;
// Create multi-worker sequence managers with: // Create multi-worker sequence managers with:
// - Worker 0 with dp_size=2 (dp_ranks 0 and 1) // - Worker 0 with dp_size=2 (dp_ranks 0 and 1)
...@@ -1168,8 +1167,7 @@ mod tests { ...@@ -1168,8 +1167,7 @@ mod tests {
// Create namespace and shared component for both seq_managers // Create namespace and shared component for both seq_managers
let namespace = distributed.namespace("test_no_token_seq_sync")?; let namespace = distributed.namespace("test_no_token_seq_sync")?;
let mut component = namespace.component("sequences")?; let component = namespace.component("sequences")?;
component.add_stats_service().await?;
// Create multi-worker sequence managers with ALL workers [0, 1, 2] // Create multi-worker sequence managers with ALL workers [0, 1, 2]
// Both use the same component to ensure event synchronization works // Both use the same component to ensure event synchronization works
......
...@@ -463,8 +463,7 @@ mod integration_tests { ...@@ -463,8 +463,7 @@ mod integration_tests {
tracing::info!("✓ Runtime and distributed runtime created"); tracing::info!("✓ Runtime and distributed runtime created");
// Create component for MockVllmEngine (needed for publishers) // Create component for MockVllmEngine (needed for publishers)
let mut test_component = distributed.namespace("test")?.component(MOCKER_COMPONENT)?; let test_component = distributed.namespace("test")?.component(MOCKER_COMPONENT)?;
test_component.add_stats_service().await?;
tracing::info!("✓ Test component created"); tracing::info!("✓ Test component created");
// Create MockVllmEngine WITH component (enables publishers) // Create MockVllmEngine WITH component (enables publishers)
......
...@@ -856,7 +856,7 @@ impl ...@@ -856,7 +856,7 @@ impl
let request_id = context.id().to_string(); let request_id = context.id().to_string();
let original_stream_flag = request.inner.stream.unwrap_or(false); let original_stream_flag = request.inner.stream.unwrap_or(false);
// Build audit handle (None if DYN_AUDIT_ENABLED=0) // Build audit handle (None if no DYN_AUDIT_SINKS)
let mut audit_handle = crate::audit::handle::create_handle(&request, &request_id); let mut audit_handle = crate::audit::handle::create_handle(&request, &request_id);
if let Some(ref mut h) = audit_handle { if let Some(ref mut h) = audit_handle {
......
...@@ -27,7 +27,6 @@ mod tests { ...@@ -27,7 +27,6 @@ mod tests {
use dynamo_llm::protocols::openai::chat_completions::{ use dynamo_llm::protocols::openai::chat_completions::{
NvCreateChatCompletionRequest, NvCreateChatCompletionResponse, NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
}; };
use dynamo_runtime::Runtime;
use dynamo_runtime::transports::nats; use dynamo_runtime::transports::nats;
use futures::StreamExt; use futures::StreamExt;
use serde_json::Value; use serde_json::Value;
...@@ -48,15 +47,6 @@ mod tests { ...@@ -48,15 +47,6 @@ mod tests {
.expect("Failed to connect to NATS server") .expect("Failed to connect to NATS server")
} }
/// Helper to create a test DistributedRuntime with NATS
async fn create_test_drt() -> dynamo_runtime::DistributedRuntime {
let rt = Runtime::from_current().unwrap();
let config = dynamo_runtime::distributed::DistributedConfig::from_settings();
dynamo_runtime::DistributedRuntime::new(rt, config)
.await
.expect("Failed to create DistributedRuntime")
}
/// Helper to create a minimal test request /// Helper to create a minimal test request
fn create_test_request(model: &str, store: bool) -> NvCreateChatCompletionRequest { fn create_test_request(model: &str, store: bool) -> NvCreateChatCompletionRequest {
let json = serde_json::json!({ let json = serde_json::json!({
...@@ -155,7 +145,6 @@ mod tests { ...@@ -155,7 +145,6 @@ mod tests {
// Core test: audit records are published to NATS with correct structure // Core test: audit records are published to NATS with correct structure
async_with_vars( async_with_vars(
[ [
("DYN_AUDIT_ENABLED", Some("1")),
("DYN_AUDIT_SINKS", Some("nats")), ("DYN_AUDIT_SINKS", Some("nats")),
("DYN_AUDIT_NATS_SUBJECT", Some(TEST_SUBJECT)), ("DYN_AUDIT_NATS_SUBJECT", Some(TEST_SUBJECT)),
], ],
...@@ -166,8 +155,7 @@ mod tests { ...@@ -166,8 +155,7 @@ mod tests {
setup_test_stream(&client, &stream_name, TEST_SUBJECT).await; setup_test_stream(&client, &stream_name, TEST_SUBJECT).await;
bus::init(100); bus::init(100);
let drt = create_test_drt().await; sink::spawn_workers_from_env().await.unwrap();
sink::spawn_workers_from_env(&drt);
time::sleep(Duration::from_millis(100)).await; time::sleep(Duration::from_millis(100)).await;
// Emit audit record // Emit audit record
...@@ -212,7 +200,6 @@ mod tests { ...@@ -212,7 +200,6 @@ mod tests {
async_with_vars( async_with_vars(
[ [
("DYN_AUDIT_ENABLED", Some("1")),
("DYN_AUDIT_SINKS", Some("nats")), ("DYN_AUDIT_SINKS", Some("nats")),
("DYN_AUDIT_NATS_SUBJECT", Some(TEST_SUBJECT)), ("DYN_AUDIT_NATS_SUBJECT", Some(TEST_SUBJECT)),
], ],
...@@ -223,8 +210,7 @@ mod tests { ...@@ -223,8 +210,7 @@ mod tests {
setup_test_stream(&client, &stream_name, TEST_SUBJECT).await; setup_test_stream(&client, &stream_name, TEST_SUBJECT).await;
bus::init(100); bus::init(100);
let drt = create_test_drt().await; sink::spawn_workers_from_env().await.unwrap();
sink::spawn_workers_from_env(&drt);
time::sleep(Duration::from_millis(100)).await; time::sleep(Duration::from_millis(100)).await;
// Request with store=true (should be audited) // Request with store=true (should be audited)
......
This diff is collapsed.
...@@ -55,10 +55,7 @@ async fn backend(runtime: DistributedRuntime) -> anyhow::Result<()> { ...@@ -55,10 +55,7 @@ async fn backend(runtime: DistributedRuntime) -> anyhow::Result<()> {
// attach an ingress to an engine // attach an ingress to an engine
let ingress = Ingress::for_engine(RequestHandler::new())?; let ingress = Ingress::for_engine(RequestHandler::new())?;
// // make the ingress discoverable via a component service let component = runtime.namespace(DEFAULT_NAMESPACE)?.component("backend")?;
// // we must first create a service, then we can attach one more more endpoints
let mut component = runtime.namespace(DEFAULT_NAMESPACE)?.component("backend")?;
component.add_stats_service().await?;
component component
.endpoint("generate") .endpoint("generate")
.endpoint_builder() .endpoint_builder()
......
...@@ -6,7 +6,7 @@ use service_metrics::DEFAULT_NAMESPACE; ...@@ -6,7 +6,7 @@ use service_metrics::DEFAULT_NAMESPACE;
use dynamo_runtime::{ use dynamo_runtime::{
DistributedRuntime, Runtime, Worker, logging, pipeline::PushRouter, DistributedRuntime, Runtime, Worker, logging, pipeline::PushRouter,
protocols::annotated::Annotated, utils::Duration, protocols::annotated::Annotated,
}; };
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
...@@ -33,13 +33,6 @@ async fn app(runtime: Runtime) -> anyhow::Result<()> { ...@@ -33,13 +33,6 @@ async fn app(runtime: Runtime) -> anyhow::Result<()> {
println!("{:?}", resp); println!("{:?}", resp);
} }
// This is just an illustration to invoke the server's stats_registry(<action>), where
// the action currently increments the `service_requests_total` metric. You can validate
// the result by running `curl http://localhost:8000/metrics`
let service_set = component.scrape_stats(Duration::from_millis(100)).await?;
println!("{:?}", service_set);
runtime.shutdown(); runtime.shutdown();
Ok(()) Ok(())
} }
...@@ -59,8 +59,7 @@ async fn backend(runtime: DistributedRuntime) -> anyhow::Result<()> { ...@@ -59,8 +59,7 @@ async fn backend(runtime: DistributedRuntime) -> anyhow::Result<()> {
// make the ingress discoverable via a component service // make the ingress discoverable via a component service
// we must first create a service, then we can attach one more more endpoints // we must first create a service, then we can attach one more more endpoints
let mut component = runtime.namespace(DEFAULT_NAMESPACE)?.component("backend")?; let component = runtime.namespace(DEFAULT_NAMESPACE)?.component("backend")?;
component.add_stats_service().await?;
component component
.endpoint("generate") .endpoint("generate")
.endpoint_builder() .endpoint_builder()
......
...@@ -6,7 +6,7 @@ use system_metrics::{DEFAULT_COMPONENT, DEFAULT_ENDPOINT, DEFAULT_NAMESPACE}; ...@@ -6,7 +6,7 @@ use system_metrics::{DEFAULT_COMPONENT, DEFAULT_ENDPOINT, DEFAULT_NAMESPACE};
use dynamo_runtime::{ use dynamo_runtime::{
DistributedRuntime, Runtime, Worker, logging, pipeline::PushRouter, DistributedRuntime, Runtime, Worker, logging, pipeline::PushRouter,
protocols::annotated::Annotated, utils::Duration, protocols::annotated::Annotated,
}; };
fn main() -> anyhow::Result<()> { fn main() -> anyhow::Result<()> {
...@@ -33,9 +33,6 @@ async fn app(runtime: Runtime) -> anyhow::Result<()> { ...@@ -33,9 +33,6 @@ async fn app(runtime: Runtime) -> anyhow::Result<()> {
println!("{:?}", resp); println!("{:?}", resp);
} }
let service_set = component.scrape_stats(Duration::from_millis(100)).await?;
println!("{:?}", service_set);
runtime.shutdown(); runtime.shutdown();
Ok(()) Ok(())
......
...@@ -91,10 +91,9 @@ impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for Reques ...@@ -91,10 +91,9 @@ impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for Reques
pub async fn backend(drt: DistributedRuntime, endpoint_name: Option<&str>) -> anyhow::Result<()> { pub async fn backend(drt: DistributedRuntime, endpoint_name: Option<&str>) -> anyhow::Result<()> {
let endpoint_name = endpoint_name.unwrap_or(DEFAULT_ENDPOINT); let endpoint_name = endpoint_name.unwrap_or(DEFAULT_ENDPOINT);
let mut component = drt let component = drt
.namespace(DEFAULT_NAMESPACE)? .namespace(DEFAULT_NAMESPACE)?
.component(DEFAULT_COMPONENT)?; .component(DEFAULT_COMPONENT)?;
component.add_stats_service().await?;
let endpoint = component.endpoint(endpoint_name); let endpoint = component.endpoint(endpoint_name);
// Create custom metrics for system stats // Create custom metrics for system stats
......
...@@ -35,6 +35,7 @@ use crate::{ ...@@ -35,6 +35,7 @@ use crate::{
config::HealthStatus, config::HealthStatus,
distributed::RequestPlaneMode, distributed::RequestPlaneMode,
metrics::{MetricsHierarchy, MetricsRegistry, prometheus_names}, metrics::{MetricsHierarchy, MetricsRegistry, prometheus_names},
service::ServiceClient,
service::ServiceSet, service::ServiceSet,
transports::etcd::{ETCD_ROOT_PATH, EtcdPath}, transports::etcd::{ETCD_ROOT_PATH, EtcdPath},
}; };
...@@ -150,13 +151,12 @@ impl PartialOrd for Instance { ...@@ -150,13 +151,12 @@ impl PartialOrd for Instance {
/// You can also issue a request to a [Component]'s [Endpoint] by creating a [Client]. /// You can also issue a request to a [Component]'s [Endpoint] by creating a [Client].
#[derive(Educe, Builder, Clone, Validate)] #[derive(Educe, Builder, Clone, Validate)]
#[educe(Debug)] #[educe(Debug)]
#[builder(pattern = "owned")] #[builder(pattern = "owned", build_fn(private, name = "build_internal"))]
pub struct Component { pub struct Component {
#[builder(private)] #[builder(private)]
#[educe(Debug(ignore))] #[educe(Debug(ignore))]
drt: Arc<DistributedRuntime>, drt: Arc<DistributedRuntime>,
// todo - restrict the namespace to a-z0-9-_A-Z
/// Name of the component /// Name of the component
#[builder(setter(into))] #[builder(setter(into))]
#[validate(custom(function = "validate_allowed_chars"))] #[validate(custom(function = "validate_allowed_chars"))]
...@@ -299,10 +299,14 @@ impl Component { ...@@ -299,10 +299,14 @@ impl Component {
/// Scrape ServiceSet, which contains NATS stats as well as user defined stats /// Scrape ServiceSet, which contains NATS stats as well as user defined stats
/// embedded in data field of ServiceInfo. /// embedded in data field of ServiceInfo.
pub async fn scrape_stats(&self, timeout: Duration) -> anyhow::Result<ServiceSet> { async fn scrape_stats(&self, timeout: Duration) -> anyhow::Result<ServiceSet> {
// Debug: scraping stats for component // Debug: scraping stats for component
let service_name = self.service_name(); let service_name = self.service_name();
let Some(service_client) = self.drt().service_client() else { let Some(service_client) = self
.drt()
.nats_client()
.map(|nc| ServiceClient::new(nc.clone()))
else {
anyhow::bail!("ServiceSet is gathered via NATS, do not call this in non-NATS setups."); anyhow::bail!("ServiceSet is gathered via NATS, do not call this in non-NATS setups.");
}; };
service_client service_client
...@@ -317,7 +321,7 @@ impl Component { ...@@ -317,7 +321,7 @@ impl Component {
/// then subsequent scrapes occur at a fixed interval of 9.8 seconds (MAX_WAIT_MS), /// then subsequent scrapes occur at a fixed interval of 9.8 seconds (MAX_WAIT_MS),
/// which should be near or smaller than typical Prometheus scraping intervals to ensure /// which should be near or smaller than typical Prometheus scraping intervals to ensure
/// metrics are fresh when Prometheus collects them. /// metrics are fresh when Prometheus collects them.
pub fn start_scraping_nats_service_component_metrics(&self) -> anyhow::Result<()> { fn start_scraping_nats_service_component_metrics(&self) -> anyhow::Result<()> {
const MAX_WAIT_MS: std::time::Duration = std::time::Duration::from_millis(9800); // Should be <= Prometheus scrape interval const MAX_WAIT_MS: std::time::Duration = std::time::Duration::from_millis(9800); // Should be <= Prometheus scrape interval
// If there is another component with the same service name, this will fail. // If there is another component with the same service name, this will fail.
...@@ -364,18 +368,8 @@ impl Component { ...@@ -364,18 +368,8 @@ impl Component {
Ok(()) Ok(())
} }
/// TODO
///
/// This method will scrape the stats for all available services
/// Returns a stream of `ServiceInfo` objects.
/// This should be consumed by a `[tokio::time::timeout_at`] because each services
/// will only respond once, but there is no way to know when all services have responded.
pub async fn stats_stream(&self) -> anyhow::Result<()> {
unimplemented!("collect_stats")
}
// Gather NATS metrics // Gather NATS metrics
pub async fn add_stats_service(&mut self) -> anyhow::Result<()> { async fn add_stats_service(&mut self) -> anyhow::Result<()> {
let service_name = self.service_name(); let service_name = self.service_name();
// Pre-check to save cost of creating the service, but don't hold the lock // Pre-check to save cost of creating the service, but don't hold the lock
...@@ -433,6 +427,21 @@ impl ComponentBuilder { ...@@ -433,6 +427,21 @@ impl ComponentBuilder {
pub fn from_runtime(drt: Arc<DistributedRuntime>) -> Self { pub fn from_runtime(drt: Arc<DistributedRuntime>) -> Self {
Self::default().drt(drt) Self::default().drt(drt)
} }
pub fn build(self) -> Result<Component, anyhow::Error> {
let component = self.build_internal()?;
// If this component is using NATS, gather it's metrics
if component.drt().request_plane().is_nats() {
let mut c = component.clone();
// Start in the background to isolate the async, and because we don't need it yet
component.drt().runtime().secondary().spawn(async move {
if let Err(err) = c.add_stats_service().await {
tracing::error!(error = %err, component = c.service_name(), "Failed starting stats service");
}
});
}
Ok(component)
}
} }
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
...@@ -652,10 +661,10 @@ impl Namespace { ...@@ -652,10 +661,10 @@ impl Namespace {
/// Create a [`Component`] in the namespace who's endpoints can be discovered with etcd /// Create a [`Component`] in the namespace who's endpoints can be discovered with etcd
pub fn component(&self, name: impl Into<String>) -> anyhow::Result<Component> { pub fn component(&self, name: impl Into<String>) -> anyhow::Result<Component> {
Ok(ComponentBuilder::from_runtime(self.runtime.clone()) ComponentBuilder::from_runtime(self.runtime.clone())
.name(name) .name(name)
.namespace(self.clone()) .namespace(self.clone())
.build()?) .build()
} }
/// Create a [`Namespace`] in the parent namespace /// Create a [`Namespace`] in the parent namespace
...@@ -690,106 +699,3 @@ fn validate_allowed_chars(input: &str) -> Result<(), ValidationError> { ...@@ -690,106 +699,3 @@ fn validate_allowed_chars(input: &str) -> Result<(), ValidationError> {
Err(ValidationError::new("invalid_characters")) Err(ValidationError::new("invalid_characters"))
} }
} }
// TODO - enable restrictions to the character sets allowed for namespaces,
// components, and endpoints.
//
// Put Validate traits on the struct and use the `validate_allowed_chars` method
// to validate the fields.
// #[cfg(test)]
// mod tests {
// use super::*;
// use validator::Validate;
// #[test]
// fn test_valid_names() {
// // Valid strings
// let valid_inputs = vec![
// "abc", // Lowercase letters
// "abc123", // Letters and numbers
// "a-b-c", // Letters with hyphens
// "a_b_c", // Letters with underscores
// "a-b_c-123", // Mixed valid characters
// "a", // Single character
// "a_b", // Short valid pattern
// "123456", // Only numbers
// "a---b_c123", // Repeated hyphens/underscores
// ];
// for input in valid_inputs {
// let result = validate_allowed_chars(input);
// assert!(result.is_ok(), "Expected '{}' to be valid", input);
// }
// }
// #[test]
// fn test_invalid_names() {
// // Invalid strings
// let invalid_inputs = vec![
// "abc!", // Invalid character `!`
// "abc@", // Invalid character `@`
// "123$", // Invalid character `$`
// "foo.bar", // Invalid character `.`
// "foo/bar", // Invalid character `/`
// "foo\\bar", // Invalid character `\`
// "abc#", // Invalid character `#`
// "abc def", // Spaces are not allowed
// "foo,", // Invalid character `,`
// "", // Empty string
// ];
// for input in invalid_inputs {
// let result = validate_allowed_chars(input);
// assert!(result.is_err(), "Expected '{}' to be invalid", input);
// }
// }
// // #[test]
// // fn test_struct_validation_valid() {
// // // Struct with valid data
// // let valid_data = InputData {
// // name: "valid-name_123".to_string(),
// // };
// // assert!(valid_data.validate().is_ok());
// // }
// // #[test]
// // fn test_struct_validation_invalid() {
// // // Struct with invalid data
// // let invalid_data = InputData {
// // name: "invalid!name".to_string(),
// // };
// // let result = invalid_data.validate();
// // assert!(result.is_err());
// // if let Err(errors) = result {
// // let error_map = errors.field_errors();
// // assert!(error_map.contains_key("name"));
// // let name_errors = &error_map["name"];
// // assert_eq!(name_errors[0].code, "invalid_characters");
// // }
// // }
// #[test]
// fn test_edge_cases() {
// // Edge cases
// let edge_inputs = vec![
// ("-", true), // Single hyphen
// ("_", true), // Single underscore
// ("a-", true), // Letter with hyphen
// ("-", false), // Repeated hyphens
// ("-a", false), // Hyphen at the beginning
// ("a-", false), // Hyphen at the end
// ];
// for (input, expected_validity) in edge_inputs {
// let result = validate_allowed_chars(input);
// if expected_validity {
// assert!(result.is_ok(), "Expected '{}' to be valid", input);
// } else {
// assert!(result.is_err(), "Expected '{}' to be invalid", input);
// }
// }
// }
// }
...@@ -64,7 +64,7 @@ impl EndpointConfigBuilder { ...@@ -64,7 +64,7 @@ impl EndpointConfigBuilder {
pub async fn start(self) -> Result<()> { pub async fn start(self) -> Result<()> {
let ( let (
mut endpoint, endpoint,
handler, handler,
stats_handler, stats_handler,
metrics_labels, metrics_labels,
...@@ -86,19 +86,6 @@ impl EndpointConfigBuilder { ...@@ -86,19 +86,6 @@ impl EndpointConfigBuilder {
// Add metrics to the handler. The endpoint provides additional information to the handler. // Add metrics to the handler. The endpoint provides additional information to the handler.
handler.add_metrics(&endpoint, metrics_labels.as_deref())?; handler.add_metrics(&endpoint, metrics_labels.as_deref())?;
// Determine request plane mode
let request_plane_mode = endpoint.drt().request_plane();
if request_plane_mode.is_nats() {
// We only need the service if we want NATS metrics.
// TODO: This is called for every endpoint of a component. Ideally we only call it once
// on the component.
endpoint.component.add_stats_service().await?;
}
tracing::info!(
"Endpoint starting with request plane mode: {:?}",
request_plane_mode
);
// Insert the stats handler. depends on NATS. // Insert the stats handler. depends on NATS.
if let Some(stats_handler) = stats_handler { if let Some(stats_handler) = stats_handler {
let registry = endpoint.drt().component_registry().inner.lock().await; let registry = endpoint.drt().component_registry().inner.lock().await;
...@@ -123,6 +110,9 @@ impl EndpointConfigBuilder { ...@@ -123,6 +110,9 @@ impl EndpointConfigBuilder {
let system_health = endpoint.drt().system_health(); let system_health = endpoint.drt().system_health();
let subject = endpoint.subject_to(connection_id); let subject = endpoint.subject_to(connection_id);
let request_plane_mode = endpoint.drt().request_plane();
tracing::info!("Endpoint starting with request plane mode: {request_plane_mode}",);
// Register health check target in SystemHealth if provided // Register health check target in SystemHealth if provided
if let Some(health_check_payload) = &health_check_payload { if let Some(health_check_payload) = &health_check_payload {
// Build transport based on request plane mode // Build transport based on request plane mode
......
...@@ -13,7 +13,6 @@ use crate::{ ...@@ -13,7 +13,6 @@ use crate::{
discovery::Discovery, discovery::Discovery,
metrics::PrometheusUpdateCallback, metrics::PrometheusUpdateCallback,
metrics::{MetricsHierarchy, MetricsRegistry}, metrics::{MetricsHierarchy, MetricsRegistry},
service::ServiceClient,
transports::{etcd, nats, tcp}, transports::{etcd, nats, tcp},
}; };
use crate::{discovery, system_status_server, transports}; use crate::{discovery, system_status_server, transports};
...@@ -136,8 +135,6 @@ impl DistributedRuntime { ...@@ -136,8 +135,6 @@ impl DistributedRuntime {
live_endpoint_path, live_endpoint_path,
))); )));
let nats_client_for_metrics = nats_client.clone();
// Initialize discovery client based on backend configuration // Initialize discovery client based on backend configuration
let discovery_backend = let discovery_backend =
std::env::var("DYN_DISCOVERY_BACKEND").unwrap_or_else(|_| "kv_store".to_string()); std::env::var("DYN_DISCOVERY_BACKEND").unwrap_or_else(|_| "kv_store".to_string());
...@@ -171,6 +168,8 @@ impl DistributedRuntime { ...@@ -171,6 +168,8 @@ impl DistributedRuntime {
} }
}; };
let nats_client_for_metrics = nats_client.clone();
let distributed_runtime = Self { let distributed_runtime = Self {
runtime, runtime,
store, store,
...@@ -325,10 +324,6 @@ impl DistributedRuntime { ...@@ -325,10 +324,6 @@ impl DistributedRuntime {
self.discovery_client.clone() self.discovery_client.clone()
} }
pub(crate) fn service_client(&self) -> Option<ServiceClient> {
self.nats_client().map(|nc| ServiceClient::new(nc.clone()))
}
pub async fn tcp_server(&self) -> Result<Arc<tcp::server::TcpStreamServer>> { pub async fn tcp_server(&self) -> Result<Arc<tcp::server::TcpStreamServer>> {
Ok(self Ok(self
.tcp_server .tcp_server
...@@ -380,35 +375,7 @@ impl DistributedRuntime { ...@@ -380,35 +375,7 @@ impl DistributedRuntime {
manager.server().await manager.server().await
} }
/// DEPRECATED: Use network_manager().server() instead pub(crate) fn nats_client(&self) -> Option<&nats::Client> {
#[deprecated(note = "Use request_plane_server() or network_manager().server() instead")]
pub async fn http_server(
&self,
) -> Result<Arc<crate::pipeline::network::ingress::http_endpoint::SharedHttpServer>> {
// For backward compatibility, try to downcast
let _server = self.request_plane_server().await?;
// This will only work if we're actually in HTTP mode
// For now, just return an error suggesting the new API
anyhow::bail!(
"http_server() is deprecated. Use request_plane_server() instead, which returns a trait object that works with all transport types."
)
}
/// DEPRECATED: Use network_manager().server() instead
#[deprecated(note = "Use request_plane_server() or network_manager().server() instead")]
pub async fn shared_tcp_server(
&self,
) -> Result<Arc<crate::pipeline::network::ingress::shared_tcp_endpoint::SharedTcpServer>> {
// For backward compatibility, try to downcast
let _server = self.request_plane_server().await?;
// This will only work if we're actually in TCP mode
// For now, just return an error suggesting the new API
anyhow::bail!(
"shared_tcp_server() is deprecated. Use request_plane_server() instead, which returns a trait object that works with all transport types."
)
}
pub fn nats_client(&self) -> Option<&nats::Client> {
self.nats_client.as_ref() self.nats_client.as_ref()
} }
......
...@@ -1553,10 +1553,7 @@ mod test_metricsregistry_nats { ...@@ -1553,10 +1553,7 @@ mod test_metricsregistry_nats {
// Create a namespace and component from the DRT // Create a namespace and component from the DRT
let namespace = drt.namespace("ns789").unwrap(); let namespace = drt.namespace("ns789").unwrap();
let mut component = namespace.component("comp789").unwrap(); let component = namespace.component("comp789").unwrap();
// Create a service to trigger metrics callback registration
component.add_stats_service().await.unwrap();
// Get component output which should include NATS client metrics // Get component output which should include NATS client metrics
// Additional checks for NATS client metrics (without checking specific values) // Additional checks for NATS client metrics (without checking specific values)
...@@ -1662,11 +1659,10 @@ mod test_metricsregistry_nats { ...@@ -1662,11 +1659,10 @@ mod test_metricsregistry_nats {
let runtime = Runtime::from_current()?; let runtime = Runtime::from_current()?;
let drt = DistributedRuntime::from_settings(runtime.clone()).await?; let drt = DistributedRuntime::from_settings(runtime.clone()).await?;
let namespace = drt.namespace("ns123").unwrap(); let namespace = drt.namespace("ns123").unwrap();
let mut component = namespace.component("comp123").unwrap(); let component = namespace.component("comp123").unwrap();
let ingress = Ingress::for_engine(MessageHandler::new()).unwrap(); let ingress = Ingress::for_engine(MessageHandler::new()).unwrap();
let _backend_handle = tokio::spawn(async move { let _backend_handle = tokio::spawn(async move {
component.add_stats_service().await.unwrap();
let endpoint = component let endpoint = component
.endpoint("echo") .endpoint("echo")
.endpoint_builder() .endpoint_builder()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment