"vllm/vscode:/vscode.git/clone" did not exist on "133765760b21fde228bf1ca19fa4b35b5c206359"
nvext.rs 16.4 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
// SPDX-License-Identifier: Apache-2.0

4
use axum::http::HeaderMap;
5
6
use derive_builder::Builder;
use serde::{Deserialize, Serialize};
7
use utoipa::ToSchema;
8
9
use validator::{Validate, ValidationError};

10
11
pub use crate::protocols::common::timing::TimingInfo;

12
13
pub const HEADER_WORKER_INSTANCE_ID: &str = "x-worker-instance-id";
pub const HEADER_PREFILL_INSTANCE_ID: &str = "x-prefill-instance-id";
atchernych's avatar
atchernych committed
14
15
pub const HEADER_DP_RANK: &str = "x-dp-rank";
pub const HEADER_PREFILL_DP_RANK: &str = "x-prefill-dp-rank";
16
const UNSET_DP_RANK_SENTINEL: u32 = u32::MAX;
17
18
19
20
21
22

/// Apply routing overrides from HTTP headers to nvext.
///
/// Header mappings:
/// - `x-worker-instance-id` -> `backend_instance_id` and `decode_worker_id`
/// - `x-prefill-instance-id` -> `prefill_worker_id`
atchernych's avatar
atchernych committed
23
24
/// - `x-dp-rank` -> `dp_rank` (decode worker's DP rank)
/// - `x-prefill-dp-rank` -> `prefill_dp_rank`
25
26
27
28
29
30
31
32
33
34
35
36
37
38
///
/// Headers take priority over existing nvext values when present.
/// If no headers are present, returns the original nvext unchanged.
pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap) -> Option<NvExt> {
    let worker_id = headers
        .get(HEADER_WORKER_INSTANCE_ID)
        .and_then(|v| v.to_str().ok())
        .and_then(|s| s.parse::<u64>().ok());

    let prefill_id = headers
        .get(HEADER_PREFILL_INSTANCE_ID)
        .and_then(|v| v.to_str().ok())
        .and_then(|s| s.parse::<u64>().ok());

atchernych's avatar
atchernych committed
39
40
41
42
43
44
45
46
47
    let dp_rank = headers
        .get(HEADER_DP_RANK)
        .and_then(|v| v.to_str().ok())
        .and_then(|s| s.parse::<u32>().ok());

    let prefill_dp_rank = headers
        .get(HEADER_PREFILL_DP_RANK)
        .and_then(|v| v.to_str().ok())
        .and_then(|s| s.parse::<u32>().ok());
48
    let prefill_dp_rank = prefill_dp_rank.filter(|rank| *rank != UNSET_DP_RANK_SENTINEL);
atchernych's avatar
atchernych committed
49
50
51

    if worker_id.is_none() && prefill_id.is_none() && dp_rank.is_none() && prefill_dp_rank.is_none()
    {
52
53
54
55
56
57
58
59
60
61
62
        return nvext;
    }

    let mut ext = nvext.unwrap_or_default();
    if let Some(id) = worker_id {
        ext.backend_instance_id = Some(id);
        ext.decode_worker_id = Some(id);
    }
    if let Some(id) = prefill_id {
        ext.prefill_worker_id = Some(id);
    }
atchernych's avatar
atchernych committed
63
64
65
66
67
68
    if let Some(rank) = dp_rank {
        ext.dp_rank = Some(rank);
    }
    if let Some(rank) = prefill_dp_rank {
        ext.prefill_dp_rank = Some(rank);
    }
69
70
71
    Some(ext)
}

72
73
74
75
76
pub trait NvExtProvider {
    fn nvext(&self) -> Option<&NvExt>;
    fn raw_prompt(&self) -> Option<String>;
}

77
/// Worker ID information for disaggregated serving
78
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, PartialEq)]
79
80
81
82
83
pub struct WorkerIdInfo {
    /// The prefill worker ID that processed this request
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prefill_worker_id: Option<u64>,

84
85
86
87
    /// The prefill worker's data parallel rank
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prefill_dp_rank: Option<u32>,

88
89
90
    /// The decode worker ID that processed this request
    #[serde(skip_serializing_if = "Option::is_none")]
    pub decode_worker_id: Option<u64>,
91
92
93
94

    /// The decode worker's data parallel rank
    #[serde(skip_serializing_if = "Option::is_none")]
    pub decode_dp_rank: Option<u32>,
95
96
97
}

/// NVIDIA LLM response extensions
98
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone)]
99
100
101
102
pub struct NvExtResponse {
    /// Worker ID information (prefill and decode worker IDs)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub worker_id: Option<WorkerIdInfo>,
103
104
105
106
107

    /// Per-request timing information
    /// Populated when client requests `extra_fields: ["timing"]`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timing: Option<TimingInfo>,
108
109
110
111
112

    /// Token IDs for GAIE Stage 1 query-only mode
    /// Contains the tokenized prompt for reuse in Stage 2
    #[serde(skip_serializing_if = "Option::is_none")]
    pub token_ids: Option<Vec<u32>>,
113
114
115
116

    /// Routed expert capture payload (SGLang-specific)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub routed_experts: Option<serde_json::Value>,
117
118
}

119
/// NVIDIA LLM extensions to the OpenAI API
120
#[derive(ToSchema, Serialize, Deserialize, Builder, Validate, Debug, Clone)]
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#[validate(schema(function = "validate_nv_ext"))]
pub struct NvExt {
    /// If true, sampling will be forced to be greedy.
    /// The backend is responsible for selecting the correct backend-specific options to
    /// implement this.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub greed_sampling: Option<bool>,

    /// If true, the preproessor will try to bypass the prompt template and pass the prompt directly to
    /// to the tokenizer.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub use_raw_prompt: Option<bool>,

    /// Annotations
    /// User requests triggers which result in the request issue back out-of-band information in the SSE
    /// stream using the `event:` field.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub annotations: Option<Vec<String>>,
142

143
144
145
146
147
    /// Targeted backend instance ID for the request
    /// If set, the request will be routed to backend instance with the given ID.
    /// If not set, the request will be routed to the best matching instance.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
148
    pub backend_instance_id: Option<u64>,
149
150
151
152
153
154
155

    /// Pre-tokenized data to use instead of tokenizing the prompt
    /// If provided along with backend_instance_id, these tokens will be used directly
    /// and tokenization will be skipped.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub token_data: Option<Vec<u32>>,
156

157
158
159
160
161
    /// Maximum number of thinking tokens allowed
    /// NOTE: Currently passed through to backends as a no-op for future implementation
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub max_thinking_tokens: Option<u32>,
162
163
164

    /// Extra fields to be included in the response's nvext
    /// This is a list of field names that should be populated in the response
165
166
    /// Supported fields include "worker_id", "timing", "routed_experts",
    /// which map to fields in NvExtResponse.
167
168
169
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub extra_fields: Option<Vec<String>>,
170
171
172
173
174
175
176
177
178
179
180
181

    /// Targeted prefill worker ID for disaggregated serving (GAIE Stage 2)
    /// When set, the request will be routed to this specific prefill worker.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prefill_worker_id: Option<u64>,

    /// Targeted decode worker ID for disaggregated serving (GAIE Stage 2)
    /// When set, the request will be routed to this specific decode worker.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub decode_worker_id: Option<u64>,
182

atchernych's avatar
atchernych committed
183
184
185
186
187
188
189
190
191
192
193
194
195
    /// Data parallel rank for the decode worker, set by the EPP via the
    /// `x-dp-rank` header. When a worker hosts multiple DP engines,
    /// this steers the request to the correct engine instance.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub dp_rank: Option<u32>,

    /// Data parallel rank for the prefill worker in disaggregated serving,
    /// set by the EPP via the `x-prefill-dp-rank` header.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prefill_dp_rank: Option<u32>,

196
    /// Agent-provided hints for request handling.
197
198
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
199
    pub agent_hints: Option<AgentHints>,
200

201
202
203
204
    /// Optional request timestamp in milliseconds for trace replay / virtual-time simulation.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_timestamp_ms: Option<f64>,
205
206
207
208
209
210
211
212

    /// Session control for subagent KV isolation and sticky routing.
    /// When present, the router uses `session_id` for worker affinity.
    /// When `action` is set to `open` or `close`, the router also fires
    /// session lifecycle RPCs to the worker.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub session_control: Option<SessionControl>,
213
214
215
}

/// Hints from the agent/caller about request characteristics.
216
#[derive(ToSchema, Serialize, Deserialize, Builder, Debug, Clone, Default, PartialEq)]
217
pub struct AgentHints {
218
219
220
221
    /// Unified request priority.
    /// Higher values mean "more important" at the Dynamo API level.
    /// Dynamo uses this for router queue ordering and normalizes it per backend
    /// before forwarding engine priority values.
222
223
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
224
    pub priority: Option<i32>,
225
226
227
228
229
230
231

    /// Expected output sequence length (number of output tokens).
    /// Used as a hint for routing decisions to estimate resource requirements
    /// and for output block tracking decay.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub osl: Option<u32>,
Yan Ru Pei's avatar
Yan Ru Pei committed
232
233
234
235
236
237
238

    /// When true, after the assistant turn completes, the system will speculatively
    /// prefill the predicted next-turn prefix (conversation history with thinking
    /// content stripped) on a worker to warm the KV cache for the next request.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub speculative_prefill: Option<bool>,
239

240
241
    /// Deprecated alias for router-only priority.
    /// Kept as an undocumented fallback while callers migrate to `priority`.
242
243
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
244
245
    #[schema(ignore)]
    pub latency_sensitivity: Option<f64>,
246
247
}

248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
fn default_session_timeout() -> u64 {
    300
}

/// Session control for subagent KV isolation and sticky routing.
///
/// Always requires `session_id`. The `action` field is optional:
/// - `action: "open"` on the first turn creates a streaming session on the worker
/// - `action: "close"` on the last turn frees session KV after generation
/// - No `action` on intermediate turns -- just provides `session_id` for sticky routing
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct SessionControl {
    /// Unique session identifier. Present on every turn for sticky routing.
    pub session_id: String,
    /// Lifecycle action: `"open"` or `"close"`. Omit on intermediate turns.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub action: Option<SessionAction>,
    /// Inactivity timeout in seconds (default 300, only used with `action: "open"`).
    #[serde(default = "default_session_timeout")]
    pub timeout: u64,
}

/// Session lifecycle actions.
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum SessionAction {
    Open,
    Close,
}

278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
impl Default for NvExt {
    fn default() -> Self {
        NvExt::builder().build().unwrap()
    }
}

impl NvExt {
    pub fn builder() -> NvExtBuilder {
        NvExtBuilder::default()
    }
}

fn validate_nv_ext(_nv_ext: &NvExt) -> Result<(), ValidationError> {
    Ok(())
}

impl NvExtBuilder {
    pub fn add_annotation(&mut self, annotation: impl Into<String>) -> &mut Self {
        self.annotations
            .get_or_insert_with(|| Some(vec![]))
            .as_mut()
            .expect("stop should always be Some(Vec)")
            .push(annotation.into());
        self
    }
}

#[cfg(test)]
mod tests {
    use validator::Validate;

    use super::*;

    // Test default builder configuration
    #[test]
    fn test_nv_ext_builder_default() {
        let nv_ext = NvExt::builder().build().unwrap();
        assert_eq!(nv_ext.greed_sampling, None);
316
317
318
319
        assert_eq!(nv_ext.use_raw_prompt, None);
        assert_eq!(nv_ext.annotations, None);
        assert_eq!(nv_ext.backend_instance_id, None);
        assert_eq!(nv_ext.token_data, None);
320
        assert_eq!(nv_ext.max_thinking_tokens, None);
321
        assert_eq!(nv_ext.extra_fields, None);
322
323
        assert_eq!(nv_ext.prefill_worker_id, None);
        assert_eq!(nv_ext.decode_worker_id, None);
324
        assert_eq!(nv_ext.agent_hints, None);
325
        assert_eq!(nv_ext.request_timestamp_ms, None);
326
        assert_eq!(nv_ext.session_control, None);
327
328
329
330
331
332
333
    }

    // Test valid builder configurations
    #[test]
    fn test_nv_ext_builder_custom() {
        let nv_ext = NvExt::builder()
            .greed_sampling(true)
334
335
336
            .use_raw_prompt(true)
            .backend_instance_id(42)
            .token_data(vec![1, 2, 3, 4])
337
            .max_thinking_tokens(1024)
338
            .extra_fields(vec!["worker_id".to_string()])
339
340
341
342
            .build()
            .unwrap();

        assert_eq!(nv_ext.greed_sampling, Some(true));
343
344
345
        assert_eq!(nv_ext.use_raw_prompt, Some(true));
        assert_eq!(nv_ext.backend_instance_id, Some(42));
        assert_eq!(nv_ext.token_data, Some(vec![1, 2, 3, 4]));
346
        assert_eq!(nv_ext.max_thinking_tokens, Some(1024));
347
        assert_eq!(nv_ext.extra_fields, Some(vec!["worker_id".to_string()]));
348
349
350
        // Validate the built struct
        assert!(nv_ext.validate().is_ok());
    }
351
352
353
354
355
356
357
358
359
360
361
362
363
364

    // Test GAIE Stage 2 disaggregated worker IDs
    #[test]
    fn test_nv_ext_disagg_worker_ids() {
        let nv_ext = NvExt::builder()
            .prefill_worker_id(100)
            .decode_worker_id(200)
            .build()
            .unwrap();

        assert_eq!(nv_ext.prefill_worker_id, Some(100));
        assert_eq!(nv_ext.decode_worker_id, Some(200));
        assert!(nv_ext.validate().is_ok());
    }
365

366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
    #[test]
    fn test_session_control_serde() {
        // Open action with timeout
        let sc_json = r#"{"session_id": "sub-1", "action": "open", "timeout": 60}"#;
        let sc: SessionControl = serde_json::from_str(sc_json).unwrap();
        assert_eq!(sc.action, Some(SessionAction::Open));
        assert_eq!(sc.session_id, "sub-1");
        assert_eq!(sc.timeout, 60);

        // Close action (timeout defaults to 300)
        let sc_close = r#"{"session_id": "sub-1", "action": "close"}"#;
        let sc: SessionControl = serde_json::from_str(sc_close).unwrap();
        assert_eq!(sc.action, Some(SessionAction::Close));
        assert_eq!(sc.timeout, 300);

        // Continue (no action, just session_id for sticky routing)
        let sc_continue = r#"{"session_id": "sub-1"}"#;
        let sc: SessionControl = serde_json::from_str(sc_continue).unwrap();
        assert_eq!(sc.action, None);
        assert_eq!(sc.session_id, "sub-1");

        // NvExt with session_control
        let nvext_json =
            r#"{"session_control": {"session_id": "sub-2", "action": "open", "timeout": 300}}"#;
        let nvext: NvExt = serde_json::from_str(nvext_json).unwrap();
        assert!(nvext.session_control.is_some());
        let sc = nvext.session_control.unwrap();
        assert_eq!(sc.action, Some(SessionAction::Open));
        assert_eq!(sc.session_id, "sub-2");

        // Roundtrip
        let original = SessionControl {
            session_id: "test-session".to_string(),
            action: Some(SessionAction::Close),
            timeout: 90,
        };
        let json = serde_json::to_string(&original).unwrap();
        let deser: SessionControl = serde_json::from_str(&json).unwrap();
        assert_eq!(deser, original);
    }

407
408
409
410
411
412
    #[test]
    fn test_apply_header_routing_overrides() {
        use axum::http::HeaderMap;

        let mut headers = HeaderMap::new();
        headers.insert(HEADER_WORKER_INSTANCE_ID, "123".parse().unwrap());
atchernych's avatar
atchernych committed
413
414
415
        headers.insert(HEADER_PREFILL_INSTANCE_ID, "456".parse().unwrap());
        headers.insert(HEADER_DP_RANK, "3".parse().unwrap());
        headers.insert(HEADER_PREFILL_DP_RANK, "5".parse().unwrap());
416

atchernych's avatar
atchernych committed
417
        let result = apply_header_routing_overrides(None, &headers).unwrap();
418
419
420

        assert_eq!(result.backend_instance_id, Some(123));
        assert_eq!(result.decode_worker_id, Some(123));
atchernych's avatar
atchernych committed
421
422
423
        assert_eq!(result.prefill_worker_id, Some(456));
        assert_eq!(result.dp_rank, Some(3));
        assert_eq!(result.prefill_dp_rank, Some(5));
424
    }
425
}