lib.rs 3.46 KB
Newer Older
1
2
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
Ryan Olson's avatar
Ryan Olson committed
3

Neelay Shah's avatar
Neelay Shah committed
4
//! Dynamo
Ryan Olson's avatar
Ryan Olson committed
5
6
7
8

#![allow(dead_code)]
#![allow(unused_imports)]

9
10
use std::{
    collections::HashMap,
11
    sync::{Arc, OnceLock, Weak},
12
};
Ryan Olson's avatar
Ryan Olson committed
13

Ryan Olson's avatar
Ryan Olson committed
14
pub use anyhow::{
15
    Context as ErrorContext, Error, Ok as OK, Result, anyhow as error, bail as raise,
Ryan Olson's avatar
Ryan Olson committed
16
};
Ryan Olson's avatar
Ryan Olson committed
17
18
19
20
21
22
23

use async_once_cell::OnceCell;

mod config;
pub use config::RuntimeConfig;

pub mod component;
24
pub mod compute;
Ryan Olson's avatar
Ryan Olson committed
25
pub mod engine;
26
pub mod health_check;
27
28
pub mod system_status_server;
pub use system_status_server::SystemStatusServerInfo;
29
pub mod instances;
30
pub mod logging;
31
pub mod metrics;
Ryan Olson's avatar
Ryan Olson committed
32
pub mod pipeline;
33
pub mod prelude;
Ryan Olson's avatar
Ryan Olson committed
34
pub mod protocols;
Ryan Olson's avatar
Ryan Olson committed
35
pub mod runnable;
Ryan Olson's avatar
Ryan Olson committed
36
37
pub mod runtime;
pub mod service;
38
pub mod slug;
39
pub mod storage;
40
pub mod system_health;
Ryan Olson's avatar
Ryan Olson committed
41
pub mod traits;
Ryan Olson's avatar
Ryan Olson committed
42
pub mod transports;
Ryan Olson's avatar
Ryan Olson committed
43
pub mod utils;
Ryan Olson's avatar
Ryan Olson committed
44
45
46
pub mod worker;

pub mod distributed;
47
pub use distributed::distributed_test_utils;
Ryan Olson's avatar
Ryan Olson committed
48
pub use futures::stream;
49
pub use metrics::MetricsRegistry;
50
pub use system_health::{HealthCheckTarget, SystemHealth};
Ryan Olson's avatar
Ryan Olson committed
51
52
53
pub use tokio_util::sync::CancellationToken;
pub use worker::Worker;

54
use crate::{
55
56
    metrics::prometheus_names::distributed_runtime,
    storage::key_value_store::{KeyValueStore, KeyValueStoreManager},
57
};
58

59
use component::{Endpoint, InstanceSource};
60
use utils::GracefulShutdownTracker;
61

62
63
use config::HealthStatus;

Neelay Shah's avatar
Neelay Shah committed
64
/// Types of Tokio runtimes that can be used to construct a Dynamo [Runtime].
Ryan Olson's avatar
Ryan Olson committed
65
66
67
68
69
70
71
72
73
74
75
#[derive(Clone)]
enum RuntimeType {
    // A Tokio runtime held behind an `Arc`, so clones of this value refer to the same runtime.
    Shared(Arc<tokio::runtime::Runtime>),
    // A handle to a Tokio runtime rather than the runtime itself.
    // NOTE(review): presumably the runtime is created and owned by the embedding application — confirm.
    External(tokio::runtime::Handle),
}

/// Local [Runtime] which provides access to shared resources local to the physical node/machine.
///
/// The type derives `Clone`; the `Arc`-wrapped fields are shared between clones rather than copied.
#[derive(Debug, Clone)]
pub struct Runtime {
    // Identifier string for this runtime, shared between clones via `Arc`.
    id: Arc<String>,
    // First of the two Tokio runtimes/handles carried by this `Runtime`.
    // NOTE(review): presumably used for the main application workload — confirm against the constructor.
    primary: RuntimeType,
76
    // Second Tokio runtime/handle.
    // NOTE(review): presumably reserved for background/internal tasks — confirm.
    secondary: RuntimeType,
Ryan Olson's avatar
Ryan Olson committed
77
    // Cancellation token for the runtime.
    // NOTE(review): presumably the root token signalling overall shutdown — confirm.
    cancellation_token: CancellationToken,
78
79
    // Separate cancellation token for endpoints.
    // NOTE(review): its relationship to `cancellation_token` (child vs. independent) is not visible here — confirm.
    endpoint_shutdown_token: CancellationToken,
    // Shared tracker used for graceful shutdown (see `utils::GracefulShutdownTracker`).
    graceful_shutdown_tracker: Arc<GracefulShutdownTracker>,
80
81
    // Optional shared compute pool; `None` when no pool is configured.
    compute_pool: Option<Arc<compute::ComputePool>>,
    // Optional shared semaphore.
    // NOTE(review): presumably bounds concurrent `block_in_place` calls — confirm.
    block_in_place_permits: Option<Arc<tokio::sync::Semaphore>>,
Ryan Olson's avatar
Ryan Olson committed
82
83
84
85
86
87
88
89
90
91
}

/// Distributed [Runtime] which provides access to shared resources across the cluster; this includes
/// communication protocols and transports.
#[derive(Clone)]
pub struct DistributedRuntime {
    // local runtime
    runtime: Runtime,

    // we might consider a unified transport manager here
92
    // Optional etcd client.
    // NOTE(review): presumably `None` when `is_static` is set, since etcd is not started then — confirm.
    etcd_client: Option<transports::etcd::Client>,
93
    // Optional NATS client.
    nats_client: Option<transports::nats::Client>,
94
    // Manager for the key-value store backend.
    store: KeyValueStoreManager,
Ryan Olson's avatar
Ryan Olson committed
95
    // TCP stream server stored in a shared async once-cell.
    // NOTE(review): presumably created lazily on first use — confirm.
    tcp_server: Arc<OnceCell<Arc<transports::tcp::server::TcpStreamServer>>>,
96
    // System status server info stored in a shared `OnceLock`, so it can be set at most once.
    system_status_server: Arc<OnceLock<Arc<system_status_server::SystemStatusServerInfo>>>,
Ryan Olson's avatar
Ryan Olson committed
97
98
99

    // local registry for components
    // the registry allows us to share runtime resources across instances of the same component object.
100
    // take for example two instances of a client to the same remote component. The registry allows us to use
Ryan Olson's avatar
Ryan Olson committed
101
102
103
    // a single endpoint watcher for both clients; this keeps the number of background tasks watching specific
    // paths in etcd to a minimum.
    component_registry: component::Registry,
104
105
106
107

    // Will only have static components that are not discoverable via etcd; they must be known at
    // startup. Will not start etcd.
    is_static: bool,
108

109
    // Map from endpoint to its instance source, guarded by an async (Tokio) mutex.
    // Entries are `Weak`, so the map itself does not keep an `InstanceSource` alive.
    instance_sources: Arc<tokio::sync::Mutex<HashMap<Endpoint, Weak<InstanceSource>>>>,
110

111
    // Health status, shared behind a `parking_lot` mutex.
112
    system_health: Arc<parking_lot::Mutex<SystemHealth>>,
113

114
115
    // This hierarchy's own metrics registry
    metrics_registry: MetricsRegistry,
Ryan Olson's avatar
Ryan Olson committed
116
}