Unverified Commit cf630bf7 authored by Graham King's avatar Graham King Committed by GitHub
Browse files

refactor: Make the Runtime and DistributedRuntime fields private (#4193)


Signed-off-by: default avatarGraham King <grahamk@nvidia.com>
parent 0e623146
......@@ -8,15 +8,16 @@
// component's "service state"
use crate::{
DistributedRuntime, Result,
DistributedRuntime,
component::Component,
error,
metrics::{MetricsHierarchy, prometheus_names, prometheus_names::nats_service},
traits::*,
transports::nats,
utils::stream,
};
use anyhow::Result;
use anyhow::anyhow as error;
use async_nats::Message;
use async_stream::try_stream;
use bytes::Bytes;
......
......@@ -81,13 +81,13 @@ pub async fn spawn_system_status_server(
let server_state = Arc::new(SystemStatusState::new(drt)?);
let health_path = server_state
.drt()
.system_health
.system_health()
.lock()
.health_path()
.to_string();
let live_path = server_state
.drt()
.system_health
.system_health()
.lock()
.live_path()
.to_string();
......@@ -158,9 +158,11 @@ pub async fn spawn_system_status_server(
#[tracing::instrument(skip_all, level = "trace")]
async fn health_handler(state: Arc<SystemStatusState>) -> impl IntoResponse {
// Get basic health status
let system_health = state.drt().system_health.lock();
let (healthy, endpoints) = system_health.get_health_status();
let uptime = Some(system_health.uptime());
let system_health = state.drt().system_health();
let system_health_lock = system_health.lock();
let (healthy, endpoints) = system_health_lock.get_health_status();
let uptime = Some(system_health_lock.uptime());
drop(system_health_lock);
let healthy_string = if healthy { "ready" } else { "notready" };
let status_code = if healthy {
......@@ -184,7 +186,7 @@ async fn health_handler(state: Arc<SystemStatusState>) -> impl IntoResponse {
#[tracing::instrument(skip_all, level = "trace")]
async fn metrics_handler(state: Arc<SystemStatusState>) -> impl IntoResponse {
// Update the uptime gauge with current value
state.drt().system_health.lock().update_uptime_gauge();
state.drt().system_health().lock().update_uptime_gauge();
// Get all metrics from DistributedRuntime
// Note: In the new hierarchy-based architecture, metrics are automatically registered
......@@ -260,13 +262,13 @@ mod integration_tests {
let drt = create_test_drt_async().await;
// Get uptime from SystemHealth
let uptime = drt.system_health.lock().uptime();
let uptime = drt.system_health().lock().uptime();
// Uptime should exist (even if close to zero)
assert!(uptime.as_nanos() > 0 || uptime.is_zero());
// Sleep briefly and check uptime increases
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
let uptime_after = drt.system_health.lock().uptime();
let uptime_after = drt.system_health().lock().uptime();
assert!(uptime_after > uptime);
})
.await;
......@@ -317,19 +319,19 @@ mod integration_tests {
let drt = create_test_drt_async().await;
// Get initial uptime
let initial_uptime = drt.system_health.lock().uptime();
let initial_uptime = drt.system_health().lock().uptime();
// Update the gauge with initial value
drt.system_health.lock().update_uptime_gauge();
drt.system_health().lock().update_uptime_gauge();
// Sleep for 100ms
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
// Get uptime after sleep
let uptime_after_sleep = drt.system_health.lock().uptime();
let uptime_after_sleep = drt.system_health().lock().uptime();
// Update the gauge again
drt.system_health.lock().update_uptime_gauge();
drt.system_health().lock().update_uptime_gauge();
// Verify uptime increased by at least 100ms
let elapsed = uptime_after_sleep - initial_uptime;
......@@ -582,8 +584,8 @@ mod integration_tests {
struct TestHandler;
#[async_trait]
impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, Error> for TestHandler {
async fn generate(&self, input: SingleIn<String>) -> crate::Result<ManyOut<Annotated<String>>> {
impl AsyncEngine<SingleIn<String>, ManyOut<Annotated<String>>, anyhow::Error> for TestHandler {
async fn generate(&self, input: SingleIn<String>) -> anyhow::Result<ManyOut<Annotated<String>>> {
let (data, ctx) = input.into_parts();
let response = Annotated::from_data(format!("You responded: {}", data));
Ok(crate::pipeline::ResponseStream::new(
......@@ -733,8 +735,9 @@ mod integration_tests {
// Register the endpoint and its health check payload
{
let system_health = drt.system_health.lock();
system_health.register_health_check_target(
let system_health = drt.system_health();
let system_health_lock = system_health.lock();
system_health_lock.register_health_check_target(
endpoint,
crate::component::Instance {
component: "test_component".to_string(),
......@@ -760,7 +763,7 @@ mod integration_tests {
);
// Set endpoint to healthy state
drt.system_health
drt.system_health()
.lock()
.set_endpoint_health_status(endpoint, HealthStatus::Ready);
......@@ -777,7 +780,7 @@ mod integration_tests {
// Verify the endpoint status in SystemHealth directly
let endpoint_status = drt
.system_health
.system_health()
.lock()
.get_endpoint_health_status(endpoint);
assert_eq!(
......
......@@ -16,7 +16,7 @@ pub trait DistributedRuntimeProvider {
impl RuntimeProvider for DistributedRuntime {
fn rt(&self) -> &Runtime {
&self.runtime
self.runtime()
}
}
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::fmt::Debug;
use crate::Result;
use anyhow::Result;
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
// #[async_trait]
// pub trait Publisher: Debug + Clone + Send + Sync {
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use crate::{CancellationToken, ErrorContext, Result, Runtime, error};
use crate::runtime::Runtime;
use anyhow::{Context, Result};
use async_nats::jetstream::kv;
use derive_builder::Builder;
......@@ -19,6 +20,7 @@ use etcd_client::{
};
pub use etcd_client::{ConnectOptions, KeyValue, LeaseClient};
use tokio::time::{Duration, interval};
use tokio_util::sync::CancellationToken;
mod connector;
mod lease;
......@@ -139,7 +141,7 @@ impl Client {
for resp in result.op_responses() {
tracing::warn!(response = ?resp, "kv_create etcd op response");
}
Err(error!("Unable to create key. Check etcd server status"))
anyhow::bail!("Unable to create key. Check etcd server status")
}
}
......@@ -180,17 +182,15 @@ impl Client {
Some(response) => match response {
TxnOpResponse::Txn(response) => match response.succeeded() {
true => Ok(()),
false => Err(error!(
false => anyhow::bail!(
"Unable to create or validate key. Check etcd server status"
)),
),
},
_ => Err(error!(
"Unable to validate key operation. Check etcd server status"
)),
_ => {
anyhow::bail!("Unable to validate key operation. Check etcd server status")
}
},
None => Err(error!(
"Unable to create or validate key. Check etcd server status"
)),
None => anyhow::bail!("Unable to create or validate key. Check etcd server status"),
}
}
}
......@@ -372,7 +372,7 @@ impl Client {
// Get the start revision
let mut start_revision = get_response
.header()
.ok_or(error!("missing header; unable to get revision"))?
.ok_or(anyhow::anyhow!("missing header; unable to get revision"))?
.revision();
tracing::trace!("{prefix}: start_revision: {start_revision}");
start_revision += 1;
......
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use crate::{ErrorContext, Result, error};
use anyhow::{Context, Result};
use etcd_client::ConnectOptions;
use parking_lot::RwLock;
use std::{sync::Arc, time::Duration};
......@@ -76,9 +76,7 @@ impl Connector {
loop {
backoff_state.apply_backoff(deadline).await;
if std::time::Instant::now() >= deadline {
return Err(error!(
"Unable to reconnect to ETCD cluster: deadline exceeded"
));
anyhow::bail!("Unable to reconnect to ETCD cluster: deadline exceeded");
}
match Self::connect(&self.etcd_urls, &self.connect_options).await {
......
......@@ -7,7 +7,7 @@ use std::time::Duration;
use etcd_client::{Compare, CompareOp, PutOptions, Txn, TxnOp};
use crate::Result;
use anyhow::Result;
use super::Client;
......
......@@ -16,9 +16,10 @@
//! - `NATS_AUTH_CREDENTIALS_FILE`: the path to the credentials file
//!
//! Note: `NATS_AUTH_USERNAME` and `NATS_AUTH_PASSWORD` must be used together.
use crate::metrics::MetricsHierarchy;
use crate::traits::events::EventPublisher;
use crate::{Result, metrics::MetricsHierarchy};
use anyhow::Result;
use async_nats::connection::State;
use async_nats::{Subscriber, client, jetstream};
use async_trait::async_trait;
......
......@@ -28,7 +28,6 @@ use tokio::{
task::{JoinError, JoinHandle},
};
use tokio_util::sync::CancellationToken;
use tracing as log;
// Core message types
#[derive(Debug, Clone, Serialize, Deserialize)]
......@@ -116,11 +115,11 @@ impl Server {
// but we also propagate the error to the caller's cancellation token
let watch_task = tokio::spawn(async move {
let result = primary_task.await.inspect_err(|e| {
log::error!("zmq server/router task failed: {}", e);
tracing::error!("zmq server/router task failed: {}", e);
cancel_token.cancel();
})?;
result.inspect_err(|e| {
log::error!("zmq server/router task failed: {}", e);
tracing::error!("zmq server/router task failed: {}", e);
cancel_token.cancel();
})
});
......@@ -156,7 +155,7 @@ impl Server {
// let port = addr.as_socket().map(|s| s.port());
// if let Some(port) = port {
// log::info!("Server listening on port {}", port);
// tracing::info!("Server listening on port {}", port);
// }
loop {
......@@ -169,7 +168,7 @@ impl Server {
frames
},
Some(Err(e)) => {
log::warn!("Error receiving message: {}", e);
tracing::warn!("Error receiving message: {}", e);
continue;
}
None => break,
......@@ -177,7 +176,7 @@ impl Server {
}
_ = token.cancelled() => {
log::info!("Server shutting down");
tracing::info!("Server shutting down");
break;
}
};
......@@ -203,7 +202,7 @@ impl Server {
// first we try to send the data eagerly without blocking
let action = match tx.try_send(message.into()) {
Ok(_) => {
log::trace!(
tracing::trace!(
request_id,
"response data sent eagerly to stream: {} bytes",
message_size
......@@ -212,11 +211,14 @@ impl Server {
}
Err(e) => match e {
mpsc::error::TrySendError::Closed(_) => {
log::info!(request_id, "response stream was closed");
tracing::info!(request_id, "response stream was closed");
StreamAction::Close
}
mpsc::error::TrySendError::Full(data) => {
log::warn!(request_id, "response stream is full; backpressue alert");
tracing::warn!(
request_id,
"response stream is full; backpressure alert"
);
// todo - add timeout - we are blocking all other streams
if (tx.send(data).await).is_err() {
StreamAction::Close
......@@ -245,7 +247,7 @@ impl Server {
} else {
// increment bytes_dropped
// increment messages_dropped
log::trace!(request_id, "no active stream for request_id");
tracing::trace!(request_id, "no active stream for request_id");
}
}
......
......@@ -6,8 +6,8 @@
//! This module provides reusable patterns for watching etcd prefixes and maintaining
//! HashMap-based state that automatically updates based on etcd events.
use crate::Result;
use crate::transports::etcd::{Client as EtcdClient, WatchEvent};
use anyhow::Result;
use etcd_client::KeyValue;
use serde::de::DeserializeOwned;
use std::collections::HashMap;
......
......@@ -20,7 +20,7 @@
//! and release builds. In development, the default is [DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_DEBUG] and
//! in release, the default is [DEFAULT_GRACEFUL_SHUTDOWN_TIMEOUT_RELEASE].
use super::{CancellationToken, Result, Runtime, RuntimeConfig, error};
use super::{CancellationToken, Runtime, RuntimeConfig};
use futures::Future;
use once_cell::sync::OnceCell;
......@@ -30,7 +30,7 @@ use tokio::{signal, task::JoinHandle};
static RT: OnceCell<tokio::runtime::Runtime> = OnceCell::new();
static RTHANDLE: OnceCell<tokio::runtime::Handle> = OnceCell::new();
static INIT: OnceCell<Mutex<Option<tokio::task::JoinHandle<Result<()>>>>> = OnceCell::new();
static INIT: OnceCell<Mutex<Option<tokio::task::JoinHandle<anyhow::Result<()>>>>> = OnceCell::new();
const SHUTDOWN_MESSAGE: &str =
"Application received shutdown signal; attempting to gracefully shutdown";
......@@ -54,30 +54,30 @@ pub struct Worker {
impl Worker {
/// Create a new [`Worker`] instance from [`RuntimeConfig`] settings which is sourced from the environment
pub fn from_settings() -> Result<Worker> {
pub fn from_settings() -> anyhow::Result<Worker> {
let config = RuntimeConfig::from_settings()?;
Worker::from_config(config)
}
/// Create a new [`Worker`] instance from a provided [`RuntimeConfig`]
pub fn from_config(config: RuntimeConfig) -> Result<Worker> {
pub fn from_config(config: RuntimeConfig) -> anyhow::Result<Worker> {
// if the runtime is already initialized, return an error
if RT.get().is_some() || RTHANDLE.get().is_some() {
return Err(error!("Worker already initialized"));
return Err(anyhow::anyhow!("Worker already initialized"));
}
// create a new runtime and insert it into the OnceCell
// there is still a potential race-condition here, two threads cou have passed the first check
// but only one will succeed in inserting the runtime
let rt = RT.try_insert(config.create_runtime()?).map_err(|_| {
error!("Failed to create worker; Only a single Worker should ever be created")
anyhow::anyhow!("Failed to create worker; Only a single Worker should ever be created")
})?;
let runtime = Runtime::from_handle(rt.handle().clone())?;
Ok(Worker { runtime, config })
}
pub fn runtime_from_existing() -> Result<Runtime> {
pub fn runtime_from_existing() -> anyhow::Result<Runtime> {
if let Some(rt) = RT.get() {
Ok(Runtime::from_handle(rt.handle().clone())?)
} else if let Some(rt) = RTHANDLE.get() {
......@@ -87,18 +87,19 @@ impl Worker {
}
}
pub fn tokio_runtime(&self) -> Result<&'static tokio::runtime::Runtime> {
RT.get().ok_or_else(|| error!("Worker not initialized"))
pub fn tokio_runtime(&self) -> anyhow::Result<&'static tokio::runtime::Runtime> {
RT.get()
.ok_or_else(|| anyhow::anyhow!("Worker not initialized"))
}
pub fn runtime(&self) -> &Runtime {
&self.runtime
}
pub fn execute<F, Fut>(self, f: F) -> Result<()>
pub fn execute<F, Fut>(self, f: F) -> anyhow::Result<()>
where
F: FnOnce(Runtime) -> Fut + Send + 'static,
Fut: Future<Output = Result<()>> + Send + 'static,
Fut: Future<Output = anyhow::Result<()>> + Send + 'static,
{
let runtime = self.runtime.clone();
runtime.secondary().block_on(self.execute_internal(f))??;
......@@ -106,10 +107,10 @@ impl Worker {
Ok(())
}
pub async fn execute_async<F, Fut>(self, f: F) -> Result<()>
pub async fn execute_async<F, Fut>(self, f: F) -> anyhow::Result<()>
where
F: FnOnce(Runtime) -> Fut + Send + 'static,
Fut: Future<Output = Result<()>> + Send + 'static,
Fut: Future<Output = anyhow::Result<()>> + Send + 'static,
{
let runtime = self.runtime.clone();
let task = self.execute_internal(f);
......@@ -120,10 +121,10 @@ impl Worker {
/// Executes the provided application/closure on the [`Runtime`].
/// This is designed to be called once from main and will block the calling thread until the application completes.
fn execute_internal<F, Fut>(self, f: F) -> JoinHandle<Result<()>>
fn execute_internal<F, Fut>(self, f: F) -> JoinHandle<anyhow::Result<()>>
where
F: FnOnce(Runtime) -> Fut + Send + 'static,
Fut: Future<Output = Result<()>> + Send + 'static,
Fut: Future<Output = anyhow::Result<()>> + Send + 'static,
{
let runtime = self.runtime.clone();
let primary = runtime.primary();
......@@ -142,13 +143,13 @@ impl Worker {
INIT.set(Mutex::new(Some(secondary.spawn(async move {
// start signal handler
tokio::spawn(signal_handler(runtime.cancellation_token.clone()));
tokio::spawn(signal_handler(runtime.primary_token().clone()));
let cancel_token = runtime.child_token();
let (mut app_tx, app_rx) = tokio::sync::oneshot::channel::<()>();
// spawn a task to run the application
let task: JoinHandle<Result<()>> = primary.spawn(async move {
let task: JoinHandle<anyhow::Result<()>> = primary.spawn(async move {
let _rx = app_rx;
f(runtime).await
});
......@@ -195,9 +196,9 @@ impl Worker {
.expect("Application initialized; but another thread is awaiting it; Worker.execute() can only be called once")
}
pub fn from_current() -> Result<Worker> {
pub fn from_current() -> anyhow::Result<Worker> {
if RT.get().is_some() || RTHANDLE.get().is_some() {
return Err(error!("Worker already initialized"));
return Err(anyhow::anyhow!("Worker already initialized"));
}
let runtime = Runtime::from_current()?;
let config = RuntimeConfig::from_settings()?;
......@@ -206,7 +207,7 @@ impl Worker {
}
/// Catch signals and trigger a shutdown
async fn signal_handler(cancel_token: CancellationToken) -> Result<()> {
async fn signal_handler(cancel_token: CancellationToken) -> anyhow::Result<()> {
let ctrl_c = async {
signal::ctrl_c().await?;
anyhow::Ok(())
......
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use dynamo_runtime::{Result, Runtime, worker::Worker};
use anyhow::Result;
use dynamo_runtime::{Runtime, worker::Worker};
async fn hello_world(_runtime: Runtime) -> Result<()> {
Ok(())
......@@ -12,50 +14,3 @@ fn test_lifecycle() {
let worker = Worker::from_settings().unwrap();
worker.execute(hello_world).unwrap();
}
// async fn discoverable(runtime: Runtime) -> Result<()> {
// let config = DiscoveryConfig {
// etcd_url: vec!["http://localhost:2379".to_string()],
// etcd_connect_options: None,
// };
// let client = DiscoveryClient::new(config, runtime.clone()).await?;
// println!("Primary lease id: {:x}", client.lease_id());
// let lease = client.create_lease(60).await?;
// // Keys and values
// let lock_key = "lock_key"; // Key for the lock
// let object_key = "object_key"; // Key for the object
// let object_value = "This is the object value"; // Value for the object
// let lock_value = "locked"; // Value indicating a lock
// let put_options = Some(PutOptions::new().with_lease(lease.id()));
// // Build the transaction
// let txn = Txn::new()
// .when(vec![Compare::version(lock_key, CompareOp::Equal, 0)]) // Ensure the lock does not exist
// .and_then(vec![
// TxnOp::put(object_key, object_value, put_options.clone()), // Create the object
// TxnOp::put(lock_key, lock_value, put_options), // Set the lock
// ]);
// // Execute the transaction
// let txn_response = client.etc_client().kv_client().txn(txn).await?;
// tokio::spawn(async move {
// println!("custom lease id: {:x}", lease.id());
// lease.cancel_token().cancelled().await;
// println!("custom lease revoked");
// });
// runtime.child_token().cancelled().await;
// Ok(())
// }
// #[test]
// fn test_discovery_client() {
// let runtime = Runtime::new(RuntimeConfig::default()).unwrap();
// runtime.execute(discoverable).unwrap();
// }
......@@ -3,17 +3,23 @@
#![allow(dead_code)]
use anyhow::Error;
use futures::{StreamExt, stream};
use serde::{Deserialize, Serialize};
use std::{sync::Arc, time::Duration};
use dynamo_runtime::engine::ResponseStream;
use dynamo_runtime::{
Error,
pipeline::{
AsyncEngine, Data, Event, ManyOut, Operator, ServiceBackend, ServiceEngine,
ServiceFrontend, SingleIn, async_trait, *,
},
use dynamo_runtime::pipeline::{
AsyncEngine,
Data,
Event,
ManyOut,
Operator,
ServiceBackend,
ServiceEngine,
ServiceFrontend,
SingleIn,
*, // TODO remove the star
};
mod common;
......@@ -46,7 +52,7 @@ pub enum Annotated<T: Data> {
/// to the output stream
struct PreprocesOperator {}
#[async_trait]
#[async_trait::async_trait]
impl
Operator<
SingleIn<String>,
......
......@@ -18,7 +18,7 @@ mod integration {
pub const DEFAULT_NAMESPACE: &str = "dynamo";
use dynamo_runtime::{
DistributedRuntime, ErrorContext, Result, Runtime, Worker, logging,
DistributedRuntime, Runtime, Worker, logging,
pipeline::{
AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, PushRouter, ResponseStream,
SingleIn, async_trait, network::Ingress,
......@@ -26,6 +26,8 @@ mod integration {
protocols::annotated::Annotated,
stream,
};
use anyhow::{Context, Result};
use futures::StreamExt;
use std::{
sync::Arc,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment