entrypoint.rs 3.17 KB
Newer Older
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//! The entrypoint module provides tools to build a Dynamo runner.
//! - Create an EngineConfig of the engine (potentially auto-discovered) to execute
//! - Connect it to an Input

pub mod input;
9
pub use input::{build_routed_pipeline, build_routed_pipeline_with_preprocessor};
10

11
12
use std::future::Future;
use std::pin::Pin;
13
14
use std::sync::Arc;

15
use dynamo_kv_router::config::KvRouterConfig;
16
use dynamo_runtime::{discovery::ModelCardInstanceId, pipeline::RouterMode};
17
18

use crate::{
19
    backend::ExecutionContext, discovery::LoadThresholdConfig, engines::StreamingEngine,
20
    local_model::LocalModel, model_card::ModelDeploymentCard,
21
    types::openai::chat_completions::OpenAIChatCompletionsStreamingEngine,
22
23
};

24
25
/// Callback type for chat engine factory (async)
pub type ChatEngineFactoryCallback = Arc<
26
    dyn Fn(
27
            ModelCardInstanceId,
28
29
30
31
32
33
34
            ModelDeploymentCard,
        ) -> Pin<
            Box<dyn Future<Output = anyhow::Result<OpenAIChatCompletionsStreamingEngine>> + Send>,
        > + Send
        + Sync,
>;

35
36
37
38
#[derive(Debug, Clone, Default)]
pub struct RouterConfig {
    pub router_mode: RouterMode,
    pub kv_router_config: KvRouterConfig,
39
40
    /// Load threshold configuration for busy detection
    pub load_threshold_config: LoadThresholdConfig,
41
    pub enforce_disagg: bool,
42
43
44
45
46
47
48
}

impl RouterConfig {
    pub fn new(router_mode: RouterMode, kv_router_config: KvRouterConfig) -> Self {
        Self {
            router_mode,
            kv_router_config,
49
            load_threshold_config: LoadThresholdConfig::default(),
50
            enforce_disagg: false,
51
52
        }
    }
53

54
55
    pub fn with_load_threshold_config(mut self, config: LoadThresholdConfig) -> Self {
        self.load_threshold_config = config;
56
57
        self
    }
58

59
60
    pub fn with_enforce_disagg(mut self, enforce_disagg: bool) -> Self {
        self.enforce_disagg = enforce_disagg;
61
62
        self
    }
63
64
}

65
#[derive(Clone)]
66
pub enum EngineConfig {
67
    /// Remote networked engines that we discover via etcd
68
69
    Dynamic {
        model: Box<LocalModel>,
70
        chat_engine_factory: Option<ChatEngineFactoryCallback>,
71
    },
72

73
74
    /// A Text engine receives text, does it's own tokenization and prompt formatting.
    InProcessText {
75
76
77
78
        engine: Arc<dyn StreamingEngine>,
        model: Box<LocalModel>,
    },

79
80
    /// A Tokens engine receives tokens, expects to be wrapped with pre/post processors that handle tokenization.
    InProcessTokens {
81
82
        engine: ExecutionContext,
        model: Box<LocalModel>,
83
        is_prefill: bool,
84
85
86
87
    },
}

impl EngineConfig {
88
    pub fn local_model(&self) -> &LocalModel {
89
90
        use EngineConfig::*;
        match self {
91
            Dynamic { model, .. } => model,
92
93
            InProcessText { model, .. } => model,
            InProcessTokens { model, .. } => model,
94
95
        }
    }
96

97
    pub fn chat_engine_factory(&self) -> Option<&ChatEngineFactoryCallback> {
98
        match self {
99
100
101
102
            EngineConfig::Dynamic {
                chat_engine_factory,
                ..
            } => chat_engine_factory.as_ref(),
103
104
105
            _ => None,
        }
    }
106
}