nvext.rs 12 KB
Newer Older
1
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
3
// SPDX-License-Identifier: Apache-2.0

4
use axum::http::HeaderMap;
5
6
use derive_builder::Builder;
use serde::{Deserialize, Serialize};
7
use utoipa::ToSchema;
8
9
use validator::{Validate, ValidationError};

10
11
pub use crate::protocols::common::timing::TimingInfo;

12
13
/// Request header carrying the target worker instance ID.
///
/// [`apply_header_routing_overrides`] parses the value as a `u64` and copies it into both
/// `backend_instance_id` and `decode_worker_id`.
pub const HEADER_WORKER_INSTANCE_ID: &str = "x-worker-instance-id";

/// Request header carrying the target prefill worker instance ID.
///
/// [`apply_header_routing_overrides`] parses the value as a `u64` and copies it into
/// `prefill_worker_id`.
pub const HEADER_PREFILL_INSTANCE_ID: &str = "x-prefill-instance-id";

/// Header to disable local bookkeeping updates (for GAIE Stage 2)
/// When set to "false", the router skips add_request, mark_prefill_completed, and free calls.
pub const HEADER_ENABLE_LOCAL_UPDATES: &str = "x-enable-local-updates";
17
18
19
20
21
22

/// Apply routing overrides from HTTP headers to nvext.
///
/// Header mappings:
/// - `x-worker-instance-id` -> `backend_instance_id` and `decode_worker_id`
/// - `x-prefill-instance-id` -> `prefill_worker_id`
23
/// - `x-enable-local-updates` -> `enable_local_updates` (set to false to disable router bookkeeping)
24
25
26
27
28
29
30
31
32
33
34
35
36
37
///
/// Headers take priority over existing nvext values when present.
/// If no headers are present, returns the original nvext unchanged.
pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap) -> Option<NvExt> {
    let worker_id = headers
        .get(HEADER_WORKER_INSTANCE_ID)
        .and_then(|v| v.to_str().ok())
        .and_then(|s| s.parse::<u64>().ok());

    let prefill_id = headers
        .get(HEADER_PREFILL_INSTANCE_ID)
        .and_then(|v| v.to_str().ok())
        .and_then(|s| s.parse::<u64>().ok());

38
39
40
41
42
43
44
45
46
47
48
    // Parse enable_local_updates header: "true" or "false"
    let enable_local_updates = headers
        .get(HEADER_ENABLE_LOCAL_UPDATES)
        .and_then(|v| v.to_str().ok())
        .and_then(|s| match s.to_lowercase().as_str() {
            "true" | "1" => Some(true),
            "false" | "0" => Some(false),
            _ => None,
        });

    if worker_id.is_none() && prefill_id.is_none() && enable_local_updates.is_none() {
49
50
51
52
53
54
55
56
57
58
59
        return nvext;
    }

    let mut ext = nvext.unwrap_or_default();
    if let Some(id) = worker_id {
        ext.backend_instance_id = Some(id);
        ext.decode_worker_id = Some(id);
    }
    if let Some(id) = prefill_id {
        ext.prefill_worker_id = Some(id);
    }
60
61
62
    if let Some(enabled) = enable_local_updates {
        ext.enable_local_updates = Some(enabled);
    }
63
64
65
    Some(ext)
}

66
67
68
69
70
/// Accessors for the NVIDIA extension data associated with a value.
pub trait NvExtProvider {
    /// Returns a reference to the [`NvExt`] carried by the implementor, or `None` if it has none.
    fn nvext(&self) -> Option<&NvExt>;
    /// Returns an owned prompt string, or `None` if the implementor has none to offer.
    ///
    /// NOTE(review): presumably the unrendered prompt used when `use_raw_prompt` is set —
    /// confirm against the implementors.
    fn raw_prompt(&self) -> Option<String>;
}

71
/// Worker ID information for disaggregated serving
72
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone, PartialEq)]
73
74
75
76
77
pub struct WorkerIdInfo {
    /// The prefill worker ID that processed this request
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prefill_worker_id: Option<u64>,

78
79
80
81
    /// The prefill worker's data parallel rank
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prefill_dp_rank: Option<u32>,

82
83
84
    /// The decode worker ID that processed this request
    #[serde(skip_serializing_if = "Option::is_none")]
    pub decode_worker_id: Option<u64>,
85
86
87
88

    /// The decode worker's data parallel rank
    #[serde(skip_serializing_if = "Option::is_none")]
    pub decode_dp_rank: Option<u32>,
89
90
91
}

/// NVIDIA LLM response extensions
92
#[derive(ToSchema, Serialize, Deserialize, Debug, Clone)]
93
94
95
96
pub struct NvExtResponse {
    /// Worker ID information (prefill and decode worker IDs)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub worker_id: Option<WorkerIdInfo>,
97
98
99
100
101

    /// Per-request timing information
    /// Populated when client requests `extra_fields: ["timing"]`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub timing: Option<TimingInfo>,
102
103
104
105
106

    /// Token IDs for GAIE Stage 1 query-only mode
    /// Contains the tokenized prompt for reuse in Stage 2
    #[serde(skip_serializing_if = "Option::is_none")]
    pub token_ids: Option<Vec<u32>>,
107
108
}

109
/// NVIDIA LLM extensions to the OpenAI API
110
#[derive(ToSchema, Serialize, Deserialize, Builder, Validate, Debug, Clone)]
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#[validate(schema(function = "validate_nv_ext"))]
pub struct NvExt {
    /// If true, sampling will be forced to be greedy.
    /// The backend is responsible for selecting the correct backend-specific options to
    /// implement this.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub greed_sampling: Option<bool>,

    /// If true, the preproessor will try to bypass the prompt template and pass the prompt directly to
    /// to the tokenizer.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub use_raw_prompt: Option<bool>,

    /// Annotations
    /// User requests triggers which result in the request issue back out-of-band information in the SSE
    /// stream using the `event:` field.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub annotations: Option<Vec<String>>,
132

133
134
135
136
137
    /// Targeted backend instance ID for the request
    /// If set, the request will be routed to backend instance with the given ID.
    /// If not set, the request will be routed to the best matching instance.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
138
    pub backend_instance_id: Option<u64>,
139
140
141
142
143
144
145

    /// Pre-tokenized data to use instead of tokenizing the prompt
    /// If provided along with backend_instance_id, these tokens will be used directly
    /// and tokenization will be skipped.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub token_data: Option<Vec<u32>>,
146

147
148
149
150
151
    /// Maximum number of thinking tokens allowed
    /// NOTE: Currently passed through to backends as a no-op for future implementation
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub max_thinking_tokens: Option<u32>,
152
153
154

    /// Extra fields to be included in the response's nvext
    /// This is a list of field names that should be populated in the response
155
    /// Supported fields: "worker_id", "timing", which has a 1:1 mapping with the NvExtResponse names
156
157
158
    #[serde(default, skip_serializing_if = "Option::is_none")]
    #[builder(default, setter(strip_option))]
    pub extra_fields: Option<Vec<String>>,
159
160
161
162
163
164
165
166
167
168
169
170

    /// Targeted prefill worker ID for disaggregated serving (GAIE Stage 2)
    /// When set, the request will be routed to this specific prefill worker.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prefill_worker_id: Option<u64>,

    /// Targeted decode worker ID for disaggregated serving (GAIE Stage 2)
    /// When set, the request will be routed to this specific decode worker.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub decode_worker_id: Option<u64>,
171
172
173
174
175
176
177
178
179
180
181

    /// Controls whether the router should manage local bookkeeping (add_request,
    /// mark_prefill_completed, free) for this request.
    ///
    /// - `None` or `true`: Router handles bookkeeping locally (default behavior)
    /// - `false`: External caller (e.g., GAIE sidecar) handles bookkeeping via C FFI
    ///
    /// Set to `false` for GAIE Stage 2 when the EPP/sidecar manages request lifecycle.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub enable_local_updates: Option<bool>,
182

183
    /// Agent-provided hints for request handling.
184
185
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
    pub agent_hints: Option<AgentHints>,
}

/// Hints from the agent/caller about request characteristics.
///
/// Both hints are optional: each defaults to `None` when absent from the input and is left
/// out of the serialized output when it is `None`.
#[derive(ToSchema, Serialize, Deserialize, Builder, Debug, Clone, Default)]
pub struct AgentHints {
    /// Latency sensitivity in seconds for queue ordering.
    /// Higher values cause the request to be scheduled sooner when the router queue is enabled.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub latency_sensitivity: Option<f64>,

    /// Expected output sequence length (number of output tokens).
    /// Used as a hint for routing decisions to estimate resource requirements
    /// and for output block tracking decay.
    #[builder(default, setter(strip_option))]
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub osl: Option<u32>,
}

impl Default for NvExt {
    /// Returns an `NvExt` with every field unset, i.e. the result of building an untouched
    /// builder.
    fn default() -> Self {
        // Every field of `NvExt` is declared with `#[builder(default, ...)]`, so the builder
        // never has a required field left uninitialized.
        NvExt::builder()
            .build()
            .expect("every NvExt field has a builder default, so build() cannot fail")
    }
}

impl NvExt {
    pub fn builder() -> NvExtBuilder {
        NvExtBuilder::default()
    }
}

/// Schema-level validation hook named by the `#[validate(schema(function = "validate_nv_ext"))]`
/// attribute on [`NvExt`].
///
/// Currently a no-op: the argument is not inspected and `Ok(())` is returned for every value.
fn validate_nv_ext(_nv_ext: &NvExt) -> Result<(), ValidationError> {
    Ok(())
}

impl NvExtBuilder {
    /// Appends a single annotation to the builder, creating the annotation list on first use.
    ///
    /// Annotations already added are kept; the new one goes at the end. Returns `&mut Self`
    /// so calls can be chained with the other builder methods.
    pub fn add_annotation(&mut self, annotation: impl Into<String>) -> &mut Self {
        // The builder holds this field as `Option<Option<Vec<String>>>`: the outer layer is the
        // builder's own wrapper, the inner one is the field's `Option`. Fill in whichever layer
        // is still empty rather than panicking when the inner value is `None`.
        self.annotations
            .get_or_insert_with(|| Some(Vec::new()))
            .get_or_insert_with(Vec::new)
            .push(annotation.into());
        self
    }
}

#[cfg(test)]
mod tests {
    use axum::http::HeaderMap;
    use validator::Validate;

    use super::*;

    // Test default builder configuration: nothing set means every field is `None`.
    #[test]
    fn test_nv_ext_builder_default() {
        let nv_ext = NvExt::builder().build().unwrap();
        assert_eq!(nv_ext.greed_sampling, None);
        assert_eq!(nv_ext.use_raw_prompt, None);
        assert_eq!(nv_ext.annotations, None);
        assert_eq!(nv_ext.backend_instance_id, None);
        assert_eq!(nv_ext.token_data, None);
        assert_eq!(nv_ext.max_thinking_tokens, None);
        assert_eq!(nv_ext.extra_fields, None);
        assert_eq!(nv_ext.prefill_worker_id, None);
        assert_eq!(nv_ext.decode_worker_id, None);
        assert_eq!(nv_ext.enable_local_updates, None);
        // `AgentHints` does not implement `PartialEq`, so check for absence directly.
        assert!(nv_ext.agent_hints.is_none());
    }

    // Test valid builder configurations
    #[test]
    fn test_nv_ext_builder_custom() {
        let nv_ext = NvExt::builder()
            .greed_sampling(true)
            .use_raw_prompt(true)
            .backend_instance_id(42)
            .token_data(vec![1, 2, 3, 4])
            .max_thinking_tokens(1024)
            .extra_fields(vec!["worker_id".to_string()])
            .build()
            .unwrap();

        assert_eq!(nv_ext.greed_sampling, Some(true));
        assert_eq!(nv_ext.use_raw_prompt, Some(true));
        assert_eq!(nv_ext.backend_instance_id, Some(42));
        assert_eq!(nv_ext.token_data, Some(vec![1, 2, 3, 4]));
        assert_eq!(nv_ext.max_thinking_tokens, Some(1024));
        assert_eq!(nv_ext.extra_fields, Some(vec!["worker_id".to_string()]));
        // Validate the built struct
        assert!(nv_ext.validate().is_ok());
    }

    // Test GAIE Stage 2 disaggregated worker IDs
    #[test]
    fn test_nv_ext_disagg_worker_ids() {
        let nv_ext = NvExt::builder()
            .prefill_worker_id(100)
            .decode_worker_id(200)
            .build()
            .unwrap();

        assert_eq!(nv_ext.prefill_worker_id, Some(100));
        assert_eq!(nv_ext.decode_worker_id, Some(200));
        assert!(nv_ext.validate().is_ok());
    }

    // Test apply_header_routing_overrides - worker header present, prefill header absent
    #[test]
    fn test_apply_header_routing_overrides() {
        // Only HEADER_WORKER_INSTANCE_ID is in the header
        let mut headers = HeaderMap::new();
        headers.insert(HEADER_WORKER_INSTANCE_ID, "123".parse().unwrap());
        // Note: HEADER_PREFILL_INSTANCE_ID is NOT in the header

        let nvext = NvExt::builder()
            .backend_instance_id(999)
            .decode_worker_id(888)
            .prefill_worker_id(777)
            .build()
            .unwrap();

        let result = apply_header_routing_overrides(Some(nvext), &headers).unwrap();

        // Header should override backend_instance_id and decode_worker_id
        assert_eq!(result.backend_instance_id, Some(123));
        assert_eq!(result.decode_worker_id, Some(123));
        // prefill_worker_id should remain from original nvext (not overwritten by header)
        assert_eq!(result.prefill_worker_id, Some(777));
    }

    // Test apply_header_routing_overrides - no routing headers at all
    #[test]
    fn test_apply_header_routing_overrides_without_headers() {
        let headers = HeaderMap::new();

        // A missing nvext stays missing.
        assert!(apply_header_routing_overrides(None, &headers).is_none());

        // An existing nvext is handed back with its values intact.
        let nvext = NvExt::builder().backend_instance_id(7).build().unwrap();
        let result = apply_header_routing_overrides(Some(nvext), &headers).unwrap();
        assert_eq!(result.backend_instance_id, Some(7));
        assert_eq!(result.decode_worker_id, None);
        assert_eq!(result.prefill_worker_id, None);
        assert_eq!(result.enable_local_updates, None);
    }

    // Test apply_header_routing_overrides - prefill and local-updates headers, no nvext given
    #[test]
    fn test_apply_header_routing_overrides_prefill_and_local_updates() {
        let mut headers = HeaderMap::new();
        headers.insert(HEADER_PREFILL_INSTANCE_ID, "55".parse().unwrap());
        headers.insert(HEADER_ENABLE_LOCAL_UPDATES, "false".parse().unwrap());

        let result = apply_header_routing_overrides(None, &headers).unwrap();

        assert_eq!(result.prefill_worker_id, Some(55));
        assert_eq!(result.enable_local_updates, Some(false));
        // The worker header is absent, so these stay at their defaults.
        assert_eq!(result.backend_instance_id, None);
        assert_eq!(result.decode_worker_id, None);
    }

    // Test apply_header_routing_overrides - unparseable header values are ignored
    #[test]
    fn test_apply_header_routing_overrides_ignores_invalid_values() {
        let mut headers = HeaderMap::new();
        headers.insert(HEADER_WORKER_INSTANCE_ID, "not-a-number".parse().unwrap());
        headers.insert(HEADER_ENABLE_LOCAL_UPDATES, "maybe".parse().unwrap());

        // With no usable header the input is returned as-is, so `None` stays `None`.
        assert!(apply_header_routing_overrides(None, &headers).is_none());
    }
}