nvext.rs 16.2 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
// SPDX-License-Identifier: Apache-2.0

4
use axum::http::HeaderMap;
5
6
use derive_builder::Builder;
use serde::{Deserialize, Serialize};
7
use utoipa::ToSchema;
8
9
use validator::{Validate, ValidationError};

10
11
pub use crate::protocols::common::timing::TimingInfo;

12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
pub const HEADER_WORKER_INSTANCE_ID: &str = "x-worker-instance-id";
pub const HEADER_PREFILL_INSTANCE_ID: &str = "x-prefill-instance-id";

/// Apply routing overrides from HTTP headers to nvext.
///
/// Header mappings:
/// - `x-worker-instance-id` -> `backend_instance_id` and `decode_worker_id`
/// - `x-prefill-instance-id` -> `prefill_worker_id`
///
/// A header is only honoured when its value can be read as a string and parses
/// as a `u64`; a missing or malformed header is treated as absent.
///
/// Headers take priority over existing nvext values when present.
/// If neither header yields a usable ID, the original `nvext` is returned
/// unchanged (including `None`). Otherwise a missing `nvext` is replaced by
/// `NvExt::default()` before the overrides are applied.
pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap) -> Option<NvExt> {
    // Shared lookup: read one header and interpret it as a numeric instance ID.
    let instance_id = |name: &str| -> Option<u64> {
        headers.get(name)?.to_str().ok()?.parse::<u64>().ok()
    };

    let worker_id = instance_id(HEADER_WORKER_INSTANCE_ID);
    let prefill_id = instance_id(HEADER_PREFILL_INSTANCE_ID);

    // Nothing to override: hand back the caller's value untouched.
    if worker_id.is_none() && prefill_id.is_none() {
        return nvext;
    }

    let mut ext = nvext.unwrap_or_default();
    if let Some(id) = worker_id {
        // The worker header pins both the generic backend target and the decode worker.
        ext.backend_instance_id = Some(id);
        ext.decode_worker_id = Some(id);
    }
    if let Some(id) = prefill_id {
        ext.prefill_worker_id = Some(id);
    }
    Some(ext)
}

49
50
51
pub trait NvExtProvider {
    fn nvext(&self) -> Option<&NvExt>;
    fn raw_prompt(&self) -> Option<String>;
52
53
54
55
56
57
58

    /// Return the effective cache control for this request.
    /// Default: delegates to `nvext.cache_control`. Implementations may override
    /// to also check a top-level `cache_control` field (see `NvCreateChatCompletionRequest`).
    fn effective_cache_control(&self) -> Option<&CacheControl> {
        self.nvext().and_then(|ext| ext.cache_control.as_ref())
    }
59
60
}

61
/// Worker ID information for disaggregated serving
62
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, PartialEq)]
63
64
65
66
67
pub struct WorkerIdInfo {
    /// The prefill worker ID that processed this request
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prefill_worker_id: Option<u64>,

68
69
70
71
    /// The prefill worker's data parallel rank
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prefill_dp_rank: Option<u32>,

72
73
74
    /// The decode worker ID that processed this request
    #[serde(skip_serializing_if = "Option::is_none")]
    pub decode_worker_id: Option<u64>,
75
76
77
78

    /// The decode worker's data parallel rank
    #[serde(skip_serializing_if = "Option::is_none")]
    pub decode_dp_rank: Option<u32>,
79
80
81
}

/// NVIDIA LLM response extensions
82
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone)]
83
84
85
86
pub struct NvExtResponse {
    /// Worker ID information (prefill and decode worker IDs)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub worker_id: Option<WorkerIdInfo>,
87
88
89
90
91

    /// Per-request timing information
    /// Populated when client requests `extra_fields: ["timing"]`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timing: Option<TimingInfo>,
92
93
94
95
96

    /// Token IDs for GAIE Stage 1 query-only mode
    /// Contains the tokenized prompt for reuse in Stage 2
    #[serde(skip_serializing_if = "Option::is_none")]
    pub token_ids: Option<Vec<u32>>,
97
98
99
100

    /// Routed expert capture payload (SGLang-specific)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub routed_experts: Option<serde_json::Value>,
101
102
}

103
/// NVIDIA LLM extensions to the OpenAI API
104
#[derive(ToSchema, Serialize, Deserialize, Builder, Validate, Debug, Clone)]
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#[validate(schema(function = "validate_nv_ext"))]
pub struct NvExt {
    /// If true, sampling will be forced to be greedy.
    /// The backend is responsible for selecting the correct backend-specific options to
    /// implement this.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub greed_sampling: Option<bool>,

    /// If true, the preproessor will try to bypass the prompt template and pass the prompt directly to
    /// to the tokenizer.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub use_raw_prompt: Option<bool>,

    /// Annotations
    /// User requests triggers which result in the request issue back out-of-band information in the SSE
    /// stream using the `event:` field.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub annotations: Option<Vec<String>>,
126

127
128
129
130
131
    /// Targeted backend instance ID for the request
    /// If set, the request will be routed to backend instance with the given ID.
    /// If not set, the request will be routed to the best matching instance.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
132
    pub backend_instance_id: Option<u64>,
133
134
135
136
137
138
139

    /// Pre-tokenized data to use instead of tokenizing the prompt
    /// If provided along with backend_instance_id, these tokens will be used directly
    /// and tokenization will be skipped.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub token_data: Option<Vec<u32>>,
140

141
142
143
144
145
    /// Maximum number of thinking tokens allowed
    /// NOTE: Currently passed through to backends as a no-op for future implementation
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub max_thinking_tokens: Option<u32>,
146
147
148

    /// Extra fields to be included in the response's nvext
    /// This is a list of field names that should be populated in the response
149
150
    /// Supported fields include "worker_id", "timing", "routed_experts",
    /// which map to fields in NvExtResponse.
151
152
153
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub extra_fields: Option<Vec<String>>,
154
155
156
157
158
159
160
161
162
163
164
165

    /// Targeted prefill worker ID for disaggregated serving (GAIE Stage 2)
    /// When set, the request will be routed to this specific prefill worker.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prefill_worker_id: Option<u64>,

    /// Targeted decode worker ID for disaggregated serving (GAIE Stage 2)
    /// When set, the request will be routed to this specific decode worker.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub decode_worker_id: Option<u64>,
166

167
    /// Agent-provided hints for request handling.
168
169
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
170
    pub agent_hints: Option<AgentHints>,
171
172
173
174
175
176

    /// Cache control hint (Anthropic-style). When present, the router pins
    /// the prefix on the selected worker with the given TTL.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cache_control: Option<CacheControl>,
177
178
179
}

/// Hints from the agent/caller about request characteristics.
180
#[derive(ToSchema, Serialize, Deserialize, Builder, Debug, Clone, Default, PartialEq)]
181
pub struct AgentHints {
182
183
184
185
    /// Unified request priority.
    /// Higher values mean "more important" at the Dynamo API level.
    /// Dynamo uses this for router queue ordering and normalizes it per backend
    /// before forwarding engine priority values.
186
187
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
188
    pub priority: Option<i32>,
189
190
191
192
193
194
195

    /// Expected output sequence length (number of output tokens).
    /// Used as a hint for routing decisions to estimate resource requirements
    /// and for output block tracking decay.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub osl: Option<u32>,
Yan Ru Pei's avatar
Yan Ru Pei committed
196
197
198
199
200
201
202

    /// When true, after the assistant turn completes, the system will speculatively
    /// prefill the predicted next-turn prefix (conversation history with thinking
    /// content stripped) on a worker to warm the KV cache for the next request.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub speculative_prefill: Option<bool>,
203

204
205
    /// Deprecated alias for router-only priority.
    /// Kept as an undocumented fallback while callers migrate to `priority`.
206
207
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
208
209
    #[schema(ignore)]
    pub latency_sensitivity: Option<f64>,
210
211
}

212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
/// Anthropic-style cache control hint for prefix pinning with TTL.
///
/// Use [`CacheControl::ttl_seconds`] to obtain the effective TTL; it applies
/// the shorthand mapping, the default, and the clamping described below.
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
pub struct CacheControl {
    /// Kind of cache control requested. Appears under the key `type` in the
    /// serialized form.
    #[serde(rename = "type")]
    pub control_type: CacheControlType,
    /// TTL as seconds (integer) or shorthand ("5m" = 300s, "1h" = 3600s). Clamped to [300, 3600].
    /// Held as a string, so integer seconds are written like `"600"`.
    /// Optional on input and omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub ttl: Option<String>,
}

/// Kind of cache control carried by [`CacheControl`].
///
/// Variant names are (de)serialized in lowercase, e.g. `"ephemeral"`.
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, Default, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum CacheControlType {
    /// The default variant, written as `"ephemeral"`.
    #[default]
    Ephemeral,
    /// Catch-all used when deserializing any other type string, so an
    /// unrecognized value does not fail deserialization.
    #[serde(other)]
    Unknown,
}

/// Lower clamp bound (in seconds) for `CacheControl::ttl_seconds`; also the
/// value it returns when no TTL is given or the TTL string cannot be parsed.
const MIN_TTL_SECONDS: u64 = 300;
/// Upper clamp bound (in seconds) for `CacheControl::ttl_seconds`.
const MAX_TTL_SECONDS: u64 = 3600;

impl CacheControl {
    /// Resolve the `ttl` field to a number of seconds within [300, 3600].
    ///
    /// - No TTL: returns 300.
    /// - Shorthand `"5m"` / `"1h"`: 300 / 3600.
    /// - Decimal integer string (e.g. `"120"`, `"600"`): that many seconds,
    ///   raised to 300 or lowered to 3600 when outside the range.
    /// - Anything else: logs a warning and returns 300.
    pub fn ttl_seconds(&self) -> u64 {
        let Some(spec) = self.ttl.as_deref() else {
            return MIN_TTL_SECONDS;
        };

        let seconds = match spec {
            "5m" => 300,
            "1h" => 3600,
            numeric => {
                let Ok(parsed) = numeric.parse::<u64>() else {
                    tracing::warn!("Unrecognized TTL '{}', defaulting to 300s", numeric);
                    return MIN_TTL_SECONDS;
                };
                parsed
            }
        };

        seconds.clamp(MIN_TTL_SECONDS, MAX_TTL_SECONDS)
    }
}

257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
impl Default for NvExt {
    fn default() -> Self {
        NvExt::builder().build().unwrap()
    }
}

impl NvExt {
    pub fn builder() -> NvExtBuilder {
        NvExtBuilder::default()
    }
}

/// Schema-level validation hook referenced by `NvExt`'s `#[validate(schema(...))]`
/// attribute. It currently accepts every value.
fn validate_nv_ext(_ext: &NvExt) -> Result<(), ValidationError> {
    Ok(())
}

impl NvExtBuilder {
    /// Append one annotation to the builder, creating the list on first use.
    ///
    /// Returns `&mut Self` so calls can be chained with the generated setters.
    pub fn add_annotation(&mut self, annotation: impl Into<String>) -> &mut Self {
        // The builder slot is an `Option<Option<Vec<String>>>`: the outer layer
        // records whether the field was set, the inner one is the field value.
        // Filling in whichever layer is missing avoids any panic path.
        self.annotations
            .get_or_insert_with(|| Some(Vec::new()))
            .get_or_insert_with(Vec::new)
            .push(annotation.into());
        self
    }
}

#[cfg(test)]
mod tests {
    use validator::Validate;

    use super::*;

    /// Build an ephemeral `CacheControl` carrying the given TTL string.
    fn ephemeral_with_ttl(ttl: &str) -> CacheControl {
        CacheControl {
            control_type: CacheControlType::Ephemeral,
            ttl: Some(ttl.to_string()),
        }
    }

    // Test default builder configuration
    #[test]
    fn test_nv_ext_builder_default() {
        let nv_ext = NvExt::builder().build().unwrap();
        assert_eq!(nv_ext.greed_sampling, None);
        assert_eq!(nv_ext.use_raw_prompt, None);
        assert_eq!(nv_ext.annotations, None);
        assert_eq!(nv_ext.backend_instance_id, None);
        assert_eq!(nv_ext.token_data, None);
        assert_eq!(nv_ext.max_thinking_tokens, None);
        assert_eq!(nv_ext.extra_fields, None);
        assert_eq!(nv_ext.prefill_worker_id, None);
        assert_eq!(nv_ext.decode_worker_id, None);
        assert_eq!(nv_ext.agent_hints, None);
        assert_eq!(nv_ext.cache_control, None);
    }

    // Test CacheControl serde roundtrip and TTL parsing
    #[test]
    fn test_cache_control_serde_and_ttl() {
        // Default (ephemeral, no TTL)
        let cc = CacheControl::default();
        assert_eq!(cc.control_type, CacheControlType::Ephemeral);
        assert_eq!(cc.ttl, None);
        assert_eq!(cc.ttl_seconds(), 300);

        // Shorthand values
        let cc_5m = ephemeral_with_ttl("5m");
        assert_eq!(cc_5m.ttl_seconds(), 300);
        assert_eq!(ephemeral_with_ttl("1h").ttl_seconds(), 3600);

        // Integer seconds -- within range
        assert_eq!(ephemeral_with_ttl("600").ttl_seconds(), 600);

        // Integer seconds -- clamped to min (300)
        assert_eq!(ephemeral_with_ttl("10").ttl_seconds(), 300);

        // Integer seconds -- clamped to max (3600)
        assert_eq!(ephemeral_with_ttl("7200").ttl_seconds(), 3600);

        // Unrecognized string defaults to 300
        assert_eq!(ephemeral_with_ttl("forever").ttl_seconds(), 300);

        // Serde roundtrip
        let json = serde_json::to_string(&cc_5m).unwrap();
        let deser: CacheControl = serde_json::from_str(&json).unwrap();
        assert_eq!(deser, cc_5m);

        // Deserialize from API-style JSON
        let api_json = r#"{"type": "ephemeral", "ttl": "1h"}"#;
        let from_api: CacheControl = serde_json::from_str(api_json).unwrap();
        assert_eq!(from_api.ttl_seconds(), 3600);

        // NvExt with cache_control
        let nvext_json = r#"{"cache_control": {"type": "ephemeral", "ttl": "5m"}}"#;
        let nvext: NvExt = serde_json::from_str(nvext_json).unwrap();
        assert!(nvext.cache_control.is_some());
        assert_eq!(nvext.cache_control.unwrap().ttl_seconds(), 300);
    }

    // Test valid builder configurations
    #[test]
    fn test_nv_ext_builder_custom() {
        let nv_ext = NvExt::builder()
            .greed_sampling(true)
            .use_raw_prompt(true)
            .backend_instance_id(42)
            .token_data(vec![1, 2, 3, 4])
            .max_thinking_tokens(1024)
            .extra_fields(vec!["worker_id".to_string()])
            .build()
            .unwrap();

        assert_eq!(nv_ext.greed_sampling, Some(true));
        assert_eq!(nv_ext.use_raw_prompt, Some(true));
        assert_eq!(nv_ext.backend_instance_id, Some(42));
        assert_eq!(nv_ext.token_data, Some(vec![1, 2, 3, 4]));
        assert_eq!(nv_ext.max_thinking_tokens, Some(1024));
        assert_eq!(nv_ext.extra_fields, Some(vec!["worker_id".to_string()]));
        // Validate the built struct
        assert!(nv_ext.validate().is_ok());
    }

    // Test GAIE Stage 2 disaggregated worker IDs
    #[test]
    fn test_nv_ext_disagg_worker_ids() {
        let nv_ext = NvExt::builder()
            .prefill_worker_id(100)
            .decode_worker_id(200)
            .build()
            .unwrap();

        assert_eq!(nv_ext.prefill_worker_id, Some(100));
        assert_eq!(nv_ext.decode_worker_id, Some(200));
        assert!(nv_ext.validate().is_ok());
    }

    // Test apply_header_routing_overrides - worker header present, prefill header absent
    #[test]
    fn test_apply_header_routing_overrides() {
        use axum::http::HeaderMap;

        // Only HEADER_WORKER_INSTANCE_ID is in the header
        let mut headers = HeaderMap::new();
        headers.insert(HEADER_WORKER_INSTANCE_ID, "123".parse().unwrap());
        // Note: HEADER_PREFILL_INSTANCE_ID is NOT in the header

        let nvext = NvExt::builder()
            .backend_instance_id(999)
            .decode_worker_id(888)
            .prefill_worker_id(777)
            .build()
            .unwrap();

        let result = apply_header_routing_overrides(Some(nvext), &headers).unwrap();

        // Header should override backend_instance_id and decode_worker_id
        assert_eq!(result.backend_instance_id, Some(123));
        assert_eq!(result.decode_worker_id, Some(123));
        // prefill_worker_id should remain from original nvext (not overwritten by header)
        assert_eq!(result.prefill_worker_id, Some(777));
    }
}