feat: Full Anthropic Messages API cache_control support (top-level, per-block,...

feat: Full Anthropic Messages API cache_control support (top-level, per-block, system block arrays) (#6629) Signed-off-by: Matej Kosec <mkosec@nvidia.com>

feat: Full Anthropic Messages API cache_control support (top-level, per-block,...
feat: Full Anthropic Messages API cache_control support (top-level, per-block, system block arrays) (#6629) Signed-off-by: Matej Kosec <mkosec@nvidia.com>
4d3e1ae3 · MatejKosec · GitHub · a3cf35c3 · 4d3e1ae3 · 4d3e1ae3
Unverified Commit 4d3e1ae3 authored Mar 02, 2026 by MatejKosec Committed by GitHub Mar 02, 2026
5 changed files
--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -272,6 +272,9 @@ impl OpenAIPreprocessor {
        builder.mdc_sum(Some(self.mdcsum.clone()));
        let lora_name = self.lora_name.clone();

+        // Extract cache_control TTL from either nvext or top-level field
+        let cache_control_ttl = request.effective_cache_control().map(|cc| cc.ttl_seconds());
+
        // Extract routing hints from nvext if present
        if let Some(nvext) = request.nvext() {
            // Build routing hints from nvext fields
@@ -289,10 +292,12 @@ impl OpenAIPreprocessor {
                allowed_worker_ids: None,
            };
            builder.routing(Some(routing));
-        } else if lora_name.is_some() {
-            // Ensure LoRA-aware routing still gets hints even when nvext is absent.
+        } else if lora_name.is_some() || cache_control_ttl.is_some() {
+            // Ensure routing hints exist when we have LoRA or cache_control,
+            // even when nvext is absent (e.g. Anthropic endpoint requests).
            builder.routing(Some(RoutingHints {
                lora_name,
+                cache_control_ttl,
                ..Default::default()
            }));
        }

--- a/lib/llm/src/protocols/anthropic/stream_converter.rs
+++ b/lib/llm/src/protocols/anthropic/stream_converter.rs
@@ -30,6 +30,7 @@ pub struct AnthropicStreamConverter {
    // Token usage (from engine)
    input_token_count: u32,
    output_token_count: u32,
+    cached_token_count: Option<u32>,
    // Tool call tracking
    tool_call_states: Vec<ToolCallState>,
    tool_calls_sent: HashSet<String>,
@@ -57,6 +58,7 @@ impl AnthropicStreamConverter {
            text_block_index: 0,
            input_token_count: 0,
            output_token_count: 0,
+            cached_token_count: None,
            tool_call_states: Vec::new(),
            tool_calls_sent: HashSet::new(),
            next_block_index: 0,
@@ -77,6 +79,8 @@ impl AnthropicStreamConverter {
            usage: AnthropicUsage {
                input_tokens: 0,
                output_tokens: 0,
+                cache_creation_input_tokens: None,
+                cache_read_input_tokens: None,
            },
        };

@@ -95,6 +99,10 @@ impl AnthropicStreamConverter {
        if let Some(usage) = &chunk.usage {
            self.input_token_count = usage.prompt_tokens;
            self.output_token_count = usage.completion_tokens;
+            self.cached_token_count = usage
+                .prompt_tokens_details
+                .as_ref()
+                .and_then(|d| d.cached_tokens);
        }

        for choice in &chunk.choices {
@@ -138,6 +146,7 @@ impl AnthropicStreamConverter {
                        index: self.text_block_index,
                        content_block: AnthropicResponseContentBlock::Text {
                            text: String::new(),
+                            citations: None,
                        },
                    };
                    events.push(make_sse_event("content_block_start", &block_start));
@@ -271,6 +280,8 @@ impl AnthropicStreamConverter {
            usage: AnthropicUsage {
                input_tokens: self.input_token_count,
                output_tokens: self.output_token_count,
+                cache_creation_input_tokens: None,
+                cache_read_input_tokens: self.cached_token_count,
            },
        };
        events.push(make_sse_event("message_delta", &message_delta));
@@ -329,6 +340,10 @@ impl AnthropicStreamConverter {
        if let Some(usage) = &chunk.usage {
            self.input_token_count = usage.prompt_tokens;
            self.output_token_count = usage.completion_tokens;
+            self.cached_token_count = usage
+                .prompt_tokens_details
+                .as_ref()
+                .and_then(|d| d.cached_tokens);
        }

        for choice in &chunk.choices {
@@ -369,6 +384,7 @@ impl AnthropicStreamConverter {
                        index: self.text_block_index,
                        content_block: AnthropicResponseContentBlock::Text {
                            text: String::new(),
+                            citations: None,
                        },
                    };
                    events.push(make_tagged_event("content_block_start", &ev));
@@ -483,6 +499,8 @@ impl AnthropicStreamConverter {
            usage: AnthropicUsage {
                input_tokens: self.input_token_count,
                output_tokens: self.output_token_count,
+                cache_creation_input_tokens: None,
+                cache_read_input_tokens: self.cached_token_count,
            },
        };
        events.push(make_tagged_event("message_delta", &ev));

--- a/lib/llm/src/protocols/anthropic/types.rs
+++ b/lib/llm/src/protocols/anthropic/types.rs
--- a/lib/llm/src/protocols/openai/chat_completions.rs
+++ b/lib/llm/src/protocols/openai/chat_completions.rs
@@ -92,6 +92,10 @@ impl NvExtProvider for NvCreateChatCompletionRequest {
    fn raw_prompt(&self) -> Option<String> {
        None
    }
+
+    fn effective_cache_control(&self) -> Option<&crate::protocols::openai::nvext::CacheControl> {
+        NvExtProvider::nvext(self).and_then(|ext| ext.cache_control.as_ref())
+    }
 }

 /// Implements `AnnotationsProvider` for `NvCreateChatCompletionRequest`,

--- a/lib/llm/src/protocols/openai/nvext.rs
+++ b/lib/llm/src/protocols/openai/nvext.rs
@@ -49,6 +49,13 @@ pub fn apply_header_routing_overrides(nvext: Option<NvExt>, headers: &HeaderMap)
 pub trait NvExtProvider {
    fn nvext(&self) -> Option<&NvExt>;
    fn raw_prompt(&self) -> Option<String>;
+
+    /// Return the effective cache control for this request.
+    /// Default: delegates to `nvext.cache_control`. Implementations may override
+    /// to also check a top-level `cache_control` field (see `NvCreateChatCompletionRequest`).
+    fn effective_cache_control(&self) -> Option<&CacheControl> {
+        self.nvext().and_then(|ext| ext.cache_control.as_ref())
+    }
 }

 /// Worker ID information for disaggregated serving