flags.rs 10.3 KB
Newer Older
1
2
3
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

4
use std::collections::HashMap;
5
6
use std::path::PathBuf;

7
use clap::ValueEnum;
8
use dynamo_llm::entrypoint::RouterConfig;
9
use dynamo_llm::entrypoint::input::Input;
10
use dynamo_llm::kv_router::KvRouterConfig;
11
use dynamo_llm::local_model::LocalModel;
12
use dynamo_llm::mocker::protocols::MockEngineArgs;
13
use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;
14

15
16
use crate::Output;

17
18
19
20
/// Required options depend on the in and out choices
#[derive(clap::Parser, Debug, Clone)]
#[command(version, about, long_about = None)]
pub struct Flags {
21
22
23
24
25
26
27
    /// The model. The options depend on the engine.
    ///
    /// The full list - only mistralrs supports all three currently:
    /// - Full path to a GGUF file
    /// - Full path of a checked out Hugging Face repository containing safetensor files
    /// - Name of a Hugging Face repository, e.g 'google/flan-t5-small'. The model will be
    ///   downloaded and cached.
28
29
30
    #[arg(index = 1)]
    pub model_path_pos: Option<PathBuf>,

31
    // `--model-path`. The one above is `dynamo-run <positional-model-path>`
32
33
34
35
    #[arg(long = "model-path")]
    pub model_path_flag: Option<PathBuf>,

    /// HTTP port. `in=http` only
Graham King's avatar
Graham King committed
36
    /// If tls_cert_path and tls_key_path are provided, this will be TLS/HTTPS.
37
38
39
    #[arg(long, default_value = "8080")]
    pub http_port: u16,

Graham King's avatar
Graham King committed
40
41
42
43
44
45
46
47
    /// TLS certificate file
    #[arg(long, requires = "tls_key_path")]
    pub tls_cert_path: Option<PathBuf>,

    /// TLS certificate key file
    #[arg(long, requires = "tls_cert_path")]
    pub tls_key_path: Option<PathBuf>,

48
49
50
51
    /// The name of the model we are serving
    #[arg(long)]
    pub model_name: Option<String>,

52
53
54
55
    /// Verbose output (-v for debug, -vv for trace)
    #[arg(short = 'v', action = clap::ArgAction::Count, default_value_t = 0)]
    pub verbosity: u8,

56
57
58
59
60
61
62
63
64
    /// llamacpp only
    ///
    /// The path to the tokenizer and model config because:
    /// - llama_cpp only runs GGUF files
    /// - our engine is a 'core' engine in that we do the tokenization, so we need the vocab
    /// - TODO: we don't yet extract that from the GGUF. Once we do we can remove this flag.
    #[arg(long)]
    pub model_config: Option<PathBuf>,

65
    /// If using `out=dyn` with multiple instances, this says how to route the requests.
66
67
    ///
    /// Mostly interesting for KV-aware routing.
68
69
    /// Defaults to RouterMode::RoundRobin
    #[arg(long, default_value = "round-robin")]
70
71
    pub router_mode: RouterMode,

72
73
74
75
76
77
    /// Maximum number of batched tokens for KV routing
    /// Needed for informing the KV router
    /// NOTE: this is not actually used for now
    #[arg(long, default_value = "8192")]
    pub max_num_batched_tokens: Option<u32>,

78
    /// KV Router: Weight for overlap score in worker selection.
79
    /// Higher values prioritize KV cache reuse. Default: 1.0
80
81
82
    #[arg(long)]
    pub kv_overlap_score_weight: Option<f64>,

83
84
    /// KV Router: Temperature for worker sampling via softmax.
    /// Higher values promote more randomness, and 0 fallbacks to deterministic.
85
    /// Default: 0.0
86
    #[arg(long)]
87
    pub router_temperature: Option<f64>,
88

89
90
91
92
93
94
95
    /// KV Router: Whether to use KV events to maintain the view of cached blocks
    /// If false, would use ApproxKvRouter for predicting block creation / deletion
    /// based only on incoming requests at a timer.
    /// Default: true
    #[arg(long)]
    pub use_kv_events: Option<bool>,

96
97
98
99
100
101
    /// KV Router: Whether to enable replica synchronization across multiple router instances.
    /// When true, routers will publish and subscribe to events to maintain consistent state.
    /// Default: false
    #[arg(long)]
    pub router_replica_sync: Option<bool>,

102
103
104
105
106
107
108
    /// KV Router: Whether to track active blocks in the router for memory management.
    /// When false, the router will not maintain state about which blocks are active,
    /// reducing memory overhead but potentially affecting scheduling decisions.
    /// Default: true
    #[arg(long)]
    pub router_track_active_blocks: Option<bool>,

109
110
111
112
    /// Max model context length. Reduce this if you don't have enough VRAM for the full model
    /// context length (e.g. Llama 4).
    /// Defaults to the model's max, which is usually model_max_length in tokenizer_config.json.
    #[arg(long)]
113
    pub context_length: Option<u32>,
114

115
    /// KV cache block size (is this used? Maybe by Python vllm worker?)
116
    #[arg(long)]
117
    pub kv_cache_block_size: Option<u32>,
118

119
    /// Mocker engine only.
120
121
122
123
124
    /// Additional engine-specific arguments from a JSON file.
    /// Contains a mapping of parameter names to values.
    #[arg(long)]
    pub extra_engine_args: Option<PathBuf>,

125
126
127
128
129
130
131
132
133
134
135
    /// Path to a JSON file containing default request fields.
    /// These fields will be merged with each request, but can be overridden by the request.
    /// Example file contents:
    /// {
    ///     "model": "Qwen2.5-3B-Instruct",
    ///     "temperature": 0.7,
    ///     "max_completion_tokens": 4096
    /// }
    #[arg(long)]
    pub request_template: Option<PathBuf>,

136
137
138
139
140
    /// How many times a request can be migrated to another worker if the HTTP server lost
    /// connection to the current worker.
    #[arg(long, value_parser = clap::value_parser!(u32).range(0..1024))]
    pub migration_limit: Option<u32>,

141
142
143
144
145
146
    /// Make this a static worker.
    /// Do not connect to or advertise self on etcd.
    /// in=dyn://x.y.z only
    #[arg(long, default_value = "false")]
    pub static_worker: bool,

147
148
149
150
151
152
153
    /// Everything after a `--`.
    /// These are the command line arguments to the python engine when using `pystr` or `pytok`.
    #[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)]
    pub last: Vec<String>,
}

impl Flags {
154
155
    /// For each Output variant, check if it would be able to run.
    /// This takes validation out of the main engine creation path.
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
    pub fn validate(
        &self,
        local_model: &LocalModel,
        in_opt: &Input,
        out_opt: &Output,
    ) -> anyhow::Result<()> {
        match in_opt {
            Input::Endpoint(_) => {}
            _ => {
                if self.static_worker {
                    anyhow::bail!("'--static-worker true' only applies to in=dyn://x.y.z");
                }
            }
        }

171
        match out_opt {
172
            Output::Auto => {
173
                if self.context_length.is_some() {
174
175
176
                    anyhow::bail!(
                        "'--context-length' flag should only be used on the worker node, not on the ingress"
                    );
177
178
                }
                if self.kv_cache_block_size.is_some() {
179
180
181
                    anyhow::bail!(
                        "'--kv-cache-block-size' flag should only be used on the worker node, not on the ingress"
                    );
182
                }
183
                if self.migration_limit.is_some() {
184
185
186
                    anyhow::bail!(
                        "'--migration-limit' flag should only be used on the worker node, not on the ingress"
                    );
187
                }
188
            }
189
190
191
192
193
194
195
196
197
198
199
200
201
            Output::Static(_) => {
                if self.model_name.is_none()
                    || self
                        .model_path_pos
                        .as_ref()
                        .or(self.model_path_flag.as_ref())
                        .is_none()
                {
                    anyhow::bail!(
                        "out=dyn://<path> requires --model-name and --model-path, which are the name and path on disk of the model we expect to serve."
                    );
                }
            }
202
            Output::Echo => {}
203
204
205
206
207
            #[cfg(feature = "mistralrs")]
            Output::MistralRs => {}
            #[cfg(feature = "llamacpp")]
            Output::LlamaCpp => {
                if !local_model.path().is_file() {
208
209
210
                    anyhow::bail!(
                        "--model-path should refer to a GGUF file. llama_cpp does not support safetensors."
                    );
211
212
                }
            }
213
214
215
            Output::Mocker => {
                // nothing to check here
            }
216
        }
217
218
219
220
221
222
223
224
225
226

        match out_opt {
            Output::Mocker => {}
            _ => {
                if self.extra_engine_args.is_some() {
                    anyhow::bail!("`--extra-engine-args` is only for the mocker engine");
                }
            }
        }

227
        Ok(())
228
229
    }

230
231
232
233
234
    pub fn router_config(&self) -> RouterConfig {
        RouterConfig::new(
            self.router_mode.into(),
            KvRouterConfig::new(
                self.kv_overlap_score_weight,
235
                self.router_temperature,
236
                self.use_kv_events,
237
                self.router_replica_sync,
238
                self.router_track_active_blocks,
239
                self.max_num_batched_tokens,
240
241
242
                // defaulting below args (no longer maintaining new flags for dynamo-run)
                None,
                None,
243
244
            ),
        )
245
    }
246
247
248
249
250
251
252
253
254
255
256
257
258
259

    /// Load extra engine arguments from a JSON file
    /// Returns a HashMap of parameter names to values
    pub fn load_extra_engine_args(
        &self,
    ) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
        if let Some(path) = &self.extra_engine_args {
            let file_content = std::fs::read_to_string(path)?;
            let args: HashMap<String, serde_json::Value> = serde_json::from_str(&file_content)?;
            Ok(Some(args))
        } else {
            Ok(None)
        }
    }
260
261
262
263
264
265
266
267
268

    pub fn mocker_config(&self) -> MockEngineArgs {
        let Some(path) = &self.extra_engine_args else {
            tracing::warn!("Did not specify extra engine args. Using default mocker args.");
            return MockEngineArgs::default();
        };
        MockEngineArgs::from_json_file(path)
            .unwrap_or_else(|e| panic!("Failed to build mocker engine args from {path:?}: {e}"))
    }
269
270
}

271
#[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)]
272
273
274
275
pub enum RouterMode {
    #[default]
    #[value(name = "round-robin")]
    RoundRobin,
276
    Random,
277
278
279
280
    #[value(name = "kv")]
    KV,
}

281
282
283
284
285
286
/// Maps each command line [`RouterMode`] onto the runtime variant of the same name.
impl From<RouterMode> for RuntimeRouterMode {
    fn from(r: RouterMode) -> Self {
        // Exhaustive on purpose: a new `RouterMode` variant must be mapped explicitly.
        match r {
            RouterMode::RoundRobin => RuntimeRouterMode::RoundRobin,
            RouterMode::Random => RuntimeRouterMode::Random,
            RouterMode::KV => RuntimeRouterMode::KV,
        }
    }
}