local_model.rs 15.7 KB
Newer Older
1
2
3
4
5
6
7
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;

8
use anyhow::Context as _;
9
use dynamo_runtime::protocols::EndpointId;
10
use dynamo_runtime::slug::Slug;
11
use dynamo_runtime::storage::key_value_store::Key;
12
use dynamo_runtime::traits::DistributedRuntimeProvider;
13
use dynamo_runtime::{
14
    component::Endpoint,
15
    storage::key_value_store::{EtcdStore, KeyValueStore, KeyValueStoreManager},
16
};
17

18
use crate::entrypoint::RouterConfig;
19
use crate::mocker::protocols::MockEngineArgs;
20
use crate::model_card::{self, ModelDeploymentCard};
21
use crate::model_type::{ModelInput, ModelType};
22
use crate::request_template::RequestTemplate;
23

24
25
26
pub mod runtime_config;

use runtime_config::ModelRuntimeConfig;
27

28
29
30
31
32
33
34
/// URI scheme that explicitly marks a model path as a Hugging Face repository,
/// for example "hf://Qwen/Qwen3-0.6B". It is removed before the repository is resolved.
const HF_SCHEME: &str = "hf://";

/// What we call a model if the user provided neither a model path nor a name. Usually this
/// means the name is invisible, for example in a text chat.
const DEFAULT_NAME: &str = "dynamo";

35
36
37
38
/// Engines don't usually provide a default KV cache block size, so we do.
/// The builder starts with this value and falls back to it when given `None`.
const DEFAULT_KV_CACHE_BLOCK_SIZE: u32 = 16;

/// Default HTTP port. We can't have it default to 0, so pick something.
/// `pub` because the bindings use it for consistency.
pub const DEFAULT_HTTP_PORT: u16 = 8080;
41
42
43
44
45
46
47
48
49

pub struct LocalModelBuilder {
    model_path: Option<PathBuf>,
    model_name: Option<String>,
    endpoint_id: Option<EndpointId>,
    context_length: Option<u32>,
    template_file: Option<PathBuf>,
    router_config: Option<RouterConfig>,
    kv_cache_block_size: u32,
50
    http_host: Option<String>,
51
    http_port: u16,
Graham King's avatar
Graham King committed
52
53
    tls_cert_path: Option<PathBuf>,
    tls_key_path: Option<PathBuf>,
54
    migration_limit: u32,
55
    is_mocker: bool,
56
57
    extra_engine_args: Option<PathBuf>,
    runtime_config: ModelRuntimeConfig,
58
    user_data: Option<serde_json::Value>,
59
    custom_template_path: Option<PathBuf>,
60
    namespace: Option<String>,
61
62
    custom_backend_metrics_endpoint: Option<String>,
    custom_backend_metrics_polling_interval: Option<f64>,
63
64
}

65
impl Default for LocalModelBuilder {
66
    fn default() -> Self {
67
68
        LocalModelBuilder {
            kv_cache_block_size: DEFAULT_KV_CACHE_BLOCK_SIZE,
69
            http_host: Default::default(),
70
            http_port: DEFAULT_HTTP_PORT,
Graham King's avatar
Graham King committed
71
72
            tls_cert_path: Default::default(),
            tls_key_path: Default::default(),
73
74
75
76
77
78
            model_path: Default::default(),
            model_name: Default::default(),
            endpoint_id: Default::default(),
            context_length: Default::default(),
            template_file: Default::default(),
            router_config: Default::default(),
79
            migration_limit: Default::default(),
80
            is_mocker: Default::default(),
81
82
            extra_engine_args: Default::default(),
            runtime_config: Default::default(),
83
            user_data: Default::default(),
84
            custom_template_path: Default::default(),
85
            namespace: Default::default(),
86
87
            custom_backend_metrics_endpoint: Default::default(),
            custom_backend_metrics_polling_interval: Default::default(),
88
89
90
91
        }
    }
}

92
93
94
95
impl LocalModelBuilder {
    pub fn model_path(&mut self, model_path: Option<PathBuf>) -> &mut Self {
        self.model_path = model_path;
        self
96
97
    }

98
99
100
    pub fn model_name(&mut self, model_name: Option<String>) -> &mut Self {
        self.model_name = model_name;
        self
101
102
    }

103
104
    pub fn endpoint_id(&mut self, endpoint_id: Option<EndpointId>) -> &mut Self {
        self.endpoint_id = endpoint_id;
105
        self
106
107
    }

108
109
110
    pub fn context_length(&mut self, context_length: Option<u32>) -> &mut Self {
        self.context_length = context_length;
        self
111
112
    }

113
114
115
116
    /// Passing None resets it to default
    pub fn kv_cache_block_size(&mut self, kv_cache_block_size: Option<u32>) -> &mut Self {
        self.kv_cache_block_size = kv_cache_block_size.unwrap_or(DEFAULT_KV_CACHE_BLOCK_SIZE);
        self
117
118
    }

119
120
121
122
123
    pub fn http_host(&mut self, host: Option<String>) -> &mut Self {
        self.http_host = host;
        self
    }

Graham King's avatar
Graham King committed
124
125
126
127
128
129
130
131
132
133
134
135
    pub fn http_port(&mut self, port: u16) -> &mut Self {
        self.http_port = port;
        self
    }

    pub fn tls_cert_path(&mut self, p: Option<PathBuf>) -> &mut Self {
        self.tls_cert_path = p;
        self
    }

    pub fn tls_key_path(&mut self, p: Option<PathBuf>) -> &mut Self {
        self.tls_key_path = p;
136
        self
137
138
    }

139
140
    pub fn router_config(&mut self, router_config: Option<RouterConfig>) -> &mut Self {
        self.router_config = router_config;
141
142
143
        self
    }

144
145
146
147
148
    pub fn namespace(&mut self, namespace: Option<String>) -> &mut Self {
        self.namespace = namespace;
        self
    }

149
150
151
    pub fn request_template(&mut self, template_file: Option<PathBuf>) -> &mut Self {
        self.template_file = template_file;
        self
152
153
    }

154
155
156
157
158
    pub fn custom_template_path(&mut self, custom_template_path: Option<PathBuf>) -> &mut Self {
        self.custom_template_path = custom_template_path;
        self
    }

159
160
161
162
163
    pub fn migration_limit(&mut self, migration_limit: Option<u32>) -> &mut Self {
        self.migration_limit = migration_limit.unwrap_or(0);
        self
    }

164
165
166
167
168
    pub fn is_mocker(&mut self, is_mocker: bool) -> &mut Self {
        self.is_mocker = is_mocker;
        self
    }

169
170
171
172
173
174
175
176
177
178
    pub fn extra_engine_args(&mut self, extra_engine_args: Option<PathBuf>) -> &mut Self {
        self.extra_engine_args = extra_engine_args;
        self
    }

    pub fn runtime_config(&mut self, runtime_config: ModelRuntimeConfig) -> &mut Self {
        self.runtime_config = runtime_config;
        self
    }

179
180
181
182
183
    pub fn user_data(&mut self, user_data: Option<serde_json::Value>) -> &mut Self {
        self.user_data = user_data;
        self
    }

184
185
186
187
188
189
190
191
192
193
    pub fn custom_backend_metrics_endpoint(&mut self, endpoint: Option<String>) -> &mut Self {
        self.custom_backend_metrics_endpoint = endpoint;
        self
    }

    pub fn custom_backend_metrics_polling_interval(&mut self, interval: Option<f64>) -> &mut Self {
        self.custom_backend_metrics_polling_interval = interval;
        self
    }

194
195
196
197
198
199
200
201
    /// Make an LLM ready for use:
    /// - Download it from Hugging Face (and NGC in future) if necessary
    /// - Resolve the path
    /// - Load it's ModelDeploymentCard card
    /// - Name it correctly
    ///
    /// The model name will depend on what "model_path" is:
    /// - A folder: The last part of the folder name: "/data/llms/Qwen2.5-3B-Instruct" -> "Qwen2.5-3B-Instruct"
202
203
204
205
206
207
208
209
    /// - An HF repo: The HF repo name: "Qwen/Qwen3-0.6B" stays the same
    pub async fn build(&mut self) -> anyhow::Result<LocalModel> {
        // Generate an endpoint ID for this model if the user didn't provide one.
        // The user only provides one if exposing the model.
        let endpoint_id = self
            .endpoint_id
            .take()
            .unwrap_or_else(|| internal_endpoint("local_model"));
210

211
212
213
214
215
216
        let template = self
            .template_file
            .as_deref()
            .map(RequestTemplate::load)
            .transpose()?;

217
        // echo engine doesn't need a path. It's an edge case, move it out of the way.
218
        if self.model_path.is_none() {
219
220
221
222
            let mut card = ModelDeploymentCard::with_name_only(
                self.model_name.as_deref().unwrap_or(DEFAULT_NAME),
            );
            card.migration_limit = self.migration_limit;
223
            card.user_data = self.user_data.take();
224
            card.runtime_config = self.runtime_config.clone();
225

226
            return Ok(LocalModel {
227
                card,
228
229
230
                full_path: PathBuf::new(),
                endpoint_id,
                template,
231
                http_host: self.http_host.take(),
232
                http_port: self.http_port,
Graham King's avatar
Graham King committed
233
234
                tls_cert_path: self.tls_cert_path.take(),
                tls_key_path: self.tls_key_path.take(),
235
                router_config: self.router_config.take().unwrap_or_default(),
236
                runtime_config: self.runtime_config.clone(),
237
                namespace: self.namespace.clone(),
238
239
240
                custom_backend_metrics_endpoint: self.custom_backend_metrics_endpoint.clone(),
                custom_backend_metrics_polling_interval: self
                    .custom_backend_metrics_polling_interval,
241
242
243
244
245
246
            });
        }

        // Main logic. We are running a model.
        let model_path = self.model_path.take().unwrap();
        let model_path = model_path.to_str().context("Invalid UTF-8 in model path")?;
247
248
249
250
251
252
253
254

        // Check for hf:// prefix first, in case we really want an HF repo but it conflicts
        // with a relative path.
        let is_hf_repo =
            model_path.starts_with(HF_SCHEME) || !fs::exists(model_path).unwrap_or(false);
        let relative_path = model_path.trim_start_matches(HF_SCHEME);
        let full_path = if is_hf_repo {
            // HF download if necessary
255
            super::hub::from_hf(relative_path, self.is_mocker).await?
256
257
258
259
        } else {
            fs::canonicalize(relative_path)?
        };

260
261
        let mut card =
            ModelDeploymentCard::load_from_disk(&full_path, self.custom_template_path.as_deref())?;
262
263
264

        // Usually we infer from the path, self.model_name is user override
        let model_name = self.model_name.take().unwrap_or_else(|| {
265
266
267
268
269
270
271
272
273
274
275
276
            if is_hf_repo {
                // HF repos use their full name ("org/name") not the folder name
                relative_path.to_string()
            } else {
                full_path
                    .iter()
                    .next_back()
                    .map(|n| n.to_string_lossy().into_owned())
                    .unwrap_or_else(|| {
                        // Panic because we can't do anything without a model
                        panic!("Invalid model path, too short: '{}'", full_path.display())
                    })
277
            }
278
        });
279
        card.set_name(&model_name);
280

281
        card.kv_cache_block_size = self.kv_cache_block_size;
282

283
284
285
286
        // Override max number of tokens in context. We usually only do this to limit kv cache allocation.
        if let Some(context_length) = self.context_length {
            card.context_length = context_length;
        }
287

288
        // Override runtime configs with mocker engine args
289
290
291
292
293
294
295
296
297
        if self.is_mocker
            && let Some(path) = &self.extra_engine_args
        {
            let mocker_engine_args = MockEngineArgs::from_json_file(path)
                .expect("Failed to load mocker engine args for runtime config overriding.");
            self.runtime_config.total_kv_blocks = Some(mocker_engine_args.num_gpu_blocks as u64);
            self.runtime_config.max_num_seqs = mocker_engine_args.max_num_seqs.map(|v| v as u64);
            self.runtime_config.max_num_batched_tokens =
                mocker_engine_args.max_num_batched_tokens.map(|v| v as u64);
298
299
        }

300
        card.migration_limit = self.migration_limit;
301
        card.user_data = self.user_data.take();
302
        card.runtime_config = self.runtime_config.clone();
303

304
305
306
307
308
        Ok(LocalModel {
            card,
            full_path,
            endpoint_id,
            template,
309
            http_host: self.http_host.take(),
310
            http_port: self.http_port,
Graham King's avatar
Graham King committed
311
312
            tls_cert_path: self.tls_cert_path.take(),
            tls_key_path: self.tls_key_path.take(),
313
            router_config: self.router_config.take().unwrap_or_default(),
314
            runtime_config: self.runtime_config.clone(),
315
            namespace: self.namespace.clone(),
316
317
            custom_backend_metrics_endpoint: self.custom_backend_metrics_endpoint.clone(),
            custom_backend_metrics_polling_interval: self.custom_backend_metrics_polling_interval,
318
319
320
321
322
323
324
325
326
327
        })
    }
}

#[derive(Debug, Clone)]
pub struct LocalModel {
    full_path: PathBuf,
    card: ModelDeploymentCard,
    endpoint_id: EndpointId,
    template: Option<RequestTemplate>,
328
    http_host: Option<String>,
Graham King's avatar
Graham King committed
329
330
331
    http_port: u16,
    tls_cert_path: Option<PathBuf>,
    tls_key_path: Option<PathBuf>,
332
    router_config: RouterConfig,
333
    runtime_config: ModelRuntimeConfig,
334
    namespace: Option<String>,
335
336
    custom_backend_metrics_endpoint: Option<String>,
    custom_backend_metrics_polling_interval: Option<f64>,
337
338
339
340
341
342
343
344
345
346
347
}

impl LocalModel {
    pub fn card(&self) -> &ModelDeploymentCard {
        &self.card
    }

    pub fn path(&self) -> &Path {
        &self.full_path
    }

348
    /// Human friendly model name. This is the correct name.
349
350
351
352
    pub fn display_name(&self) -> &str {
        &self.card.display_name
    }

353
354
    /// The name under which we make this model available over HTTP.
    /// A slugified version of the model's name, for use in NATS, etcd, etc.
355
    pub fn service_name(&self) -> &str {
356
        self.card.slug().as_ref()
357
358
359
360
361
362
    }

    pub fn request_template(&self) -> Option<RequestTemplate> {
        self.template.clone()
    }

363
364
365
366
    pub fn http_host(&self) -> Option<String> {
        self.http_host.clone()
    }

367
368
369
370
    pub fn http_port(&self) -> u16 {
        self.http_port
    }

Graham King's avatar
Graham King committed
371
372
373
374
375
376
377
378
    pub fn tls_cert_path(&self) -> Option<&Path> {
        self.tls_cert_path.as_deref()
    }

    pub fn tls_key_path(&self) -> Option<&Path> {
        self.tls_key_path.as_deref()
    }

379
380
381
382
    pub fn router_config(&self) -> &RouterConfig {
        &self.router_config
    }

383
384
385
386
    pub fn runtime_config(&self) -> &ModelRuntimeConfig {
        &self.runtime_config
    }

387
388
389
390
    pub fn namespace(&self) -> Option<&str> {
        self.namespace.as_deref()
    }

391
392
393
394
395
396
397
398
399
400
401
402
403
404
    pub fn custom_backend_metrics_endpoint(&self) -> Option<&str> {
        self.custom_backend_metrics_endpoint.as_deref()
    }

    pub fn custom_backend_metrics_polling_interval(&self) -> Option<f64> {
        self.custom_backend_metrics_polling_interval
    }

    pub fn is_gguf(&self) -> bool {
        // GGUF is the only file (not-folder) we accept, so we don't need to check the extension
        // We will error when we come to parse it
        self.full_path.is_file()
    }

405
406
407
408
409
410
411
412
413
    /// An endpoint to identify this model by.
    pub fn endpoint_id(&self) -> &EndpointId {
        &self.endpoint_id
    }

    /// Drop the LocalModel returning it's ModelDeploymentCard.
    /// For the case where we only need the card and don't want to clone it.
    pub fn into_card(self) -> ModelDeploymentCard {
        self.card
414
415
416
417
418
419
420
421
    }

    /// Attach this model the endpoint. This registers it on the network
    /// allowing ingress to discover it.
    pub async fn attach(
        &mut self,
        endpoint: &Endpoint,
        model_type: ModelType,
422
        model_input: ModelInput,
423
424
425
426
427
    ) -> anyhow::Result<()> {
        // A static component doesn't have an etcd_client because it doesn't need to register
        let Some(etcd_client) = endpoint.drt().etcd_client() else {
            anyhow::bail!("Cannot attach to static endpoint");
        };
428
429
        self.card.model_type = model_type;
        self.card.model_input = model_input;
430

431
432
433
434
        // Store model config files in NATS object store
        let nats_client = endpoint.drt().nats_client();
        self.card.move_to_nats(nats_client.clone()).await?;

435
        // Publish the Model Deployment Card to KV store
436
        let kvstore: Box<dyn KeyValueStore> = Box::new(EtcdStore::new(etcd_client.clone()));
437
        let card_store = Arc::new(KeyValueStoreManager::new(kvstore));
438
439
        let lease_id = endpoint.drt().primary_lease().map(|l| l.id()).unwrap_or(0);
        let key = Key::from_raw(endpoint.unique_path(lease_id));
440

441
442
443
444
        let _outcome = card_store
            .publish(model_card::ROOT_PATH, None, &key, &mut self.card)
            .await?;
        Ok(())
445
446
    }
}
447
448
449
450
451
452
453
454
455
456

/// Build a throwaway endpoint ID for internal communication.
///
/// The namespace is a slugified random UUID rather than a hard-coded value, because we
/// may be running several engines on the same machine (GPUs 0-3 and 4-7). The component
/// is `engine` and the endpoint name is always "generate".
fn internal_endpoint(engine: &str) -> EndpointId {
    let unique = uuid::Uuid::new_v4().to_string();
    let namespace = Slug::slugify(&unique).to_string();
    let component = engine.to_string();
    let name = "generate".to_string();

    EndpointId {
        namespace,
        component,
        name,
    }
}