fix: respect per-request chat_template_kwargs for DeepSeek V3.2 thinking mode (#7286)

Co-authored-by: kangclzjc <kangz@nvidia.com>

fix: respect per-request chat_template_kwargs for DeepSeek V3.2 thinking mode (#7286)
Co-authored-by: kangclzjc <kangz@nvidia.com>
745d7e4c · brluo · GitHub · 387100c8 · 745d7e4c · 745d7e4c
Unverified Commit 745d7e4c authored Mar 12, 2026 by brluo Committed by GitHub Mar 12, 2026
Hide whitespace changes
Inline Side-by-side

Showing with 338 additions and 2 deletions

lib/llm/src/preprocessor.rs lib/llm/src/preprocessor.rs +61 -1

lib/llm/src/preprocessor/prompt/deepseek_v32.rs lib/llm/src/preprocessor/prompt/deepseek_v32.rs +277 -1

No files found.
--- a/lib/llm/src/preprocessor.rs
+++ b/lib/llm/src/preprocessor.rs
@@ -1090,6 +1090,8 @@ impl OpenAIPreprocessor {
    /// For kimi_k25: disabled when chat_template_args contains "thinking": false.
    /// For nemotron_nano: disabled when chat_template_args contains "enable_thinking": false
    ///   or "force_nonempty_content": true.
+    /// For deepseek_r1: disabled when chat_template_args contains "thinking": false
+    ///   or "thinking_mode": "chat".
    fn is_reasoning_disabled_by_request(
        reasoning_parser: Option<&str>,
        chat_template_args: Option<&std::collections::HashMap<String, serde_json::Value>>,
@@ -1118,6 +1120,17 @@ impl OpenAIPreprocessor {
                }
                false
            }
+            Some("deepseek_r1") => {
+                if let Some(args) = chat_template_args {
+                    // Only a boolean "thinking" value is decisive. A non-boolean value is
+                    // skipped so that "thinking_mode" is still consulted, mirroring how the
+                    // DeepSeek V3.2 formatter resolves the same arguments.
+                    if let Some(thinking) = args.get("thinking").and_then(|v| v.as_bool()) {
+                        return !thinking;
+                    }
+                    if let Some(mode) = args.get("thinking_mode").and_then(|v| v.as_str()) {
+                        return mode == "chat";
+                    }
+                }
+                false
+            }
            _ => false,
        }
    }
@@ -1500,6 +1513,22 @@ mod tests {
            );
            m
        };
+        let thinking_mode_chat = {
+            let mut m = std::collections::HashMap::new();
+            m.insert(
+                "thinking_mode".to_string(),
+                serde_json::Value::String("chat".to_string()),
+            );
+            m
+        };
+        let thinking_mode_thinking = {
+            let mut m = std::collections::HashMap::new();
+            m.insert(
+                "thinking_mode".to_string(),
+                serde_json::Value::String("thinking".to_string()),
+            );
+            m
+        };
        let empty_args = std::collections::HashMap::new();

        // (parser, args, expected_disabled, description)
@@ -1528,11 +1557,42 @@ mod tests {
                false,
                "kimi_k25 + empty args → enabled",
            ),
+            // deepseek_r1 uses "thinking" bool or "thinking_mode" string
            (
                Some("deepseek_r1"),
                Some(&thinking_false),
+                true,
+                "deepseek_r1 + thinking=false → disabled",
+            ),
+            (
+                Some("deepseek_r1"),
+                Some(&thinking_true),
+                false,
+                "deepseek_r1 + thinking=true → enabled",
+            ),
+            (
+                Some("deepseek_r1"),
+                Some(&thinking_mode_chat),
+                true,
+                "deepseek_r1 + thinking_mode=chat → disabled",
+            ),
+            (
+                Some("deepseek_r1"),
+                Some(&thinking_mode_thinking),
+                false,
+                "deepseek_r1 + thinking_mode=thinking → enabled",
+            ),
+            (
+                Some("deepseek_r1"),
+                None,
+                false,
+                "deepseek_r1 + no args → enabled",
+            ),
+            (
+                Some("deepseek_r1"),
+                Some(&empty_args),
                false,
-                "deepseek_r1 → never disabled",
+                "deepseek_r1 + empty args → enabled",
            ),
            (
                Some("basic"),

--- a/lib/llm/src/preprocessor/prompt/deepseek_v32.rs
+++ b/lib/llm/src/preprocessor/prompt/deepseek_v32.rs
@@ -459,6 +459,33 @@ impl DeepSeekV32Formatter {
    pub fn new_chat() -> Self {
        Self::new(ThinkingMode::Chat)
    }
+
+    /// Pick the thinking mode for a single request.
+    ///
+    /// Per-request `chat_template_args` override the formatter's configured mode. Keys are
+    /// consulted in this order, and the first one that yields a usable value wins:
+    ///   1. `{"thinking": bool}` — `true` selects thinking mode, `false` selects chat mode
+    ///   2. `{"thinking_mode": "chat"|"thinking"}` — matches the DSV3.2 Jinja template parameter
+    ///
+    /// A `thinking` value that is not a boolean, or a `thinking_mode` value that is not one of
+    /// the two strings above, is ignored. With no usable override the formatter's own
+    /// `thinking_mode` is returned.
+    fn resolve_thinking_mode(
+        &self,
+        args: Option<&std::collections::HashMap<String, serde_json::Value>>,
+    ) -> ThinkingMode {
+        let requested = args.and_then(|overrides| {
+            let from_flag = overrides
+                .get("thinking")
+                .and_then(serde_json::Value::as_bool)
+                .map(|enabled| {
+                    if enabled {
+                        ThinkingMode::Thinking
+                    } else {
+                        ThinkingMode::Chat
+                    }
+                });
+            // The string form is only looked at when the boolean form gave no answer.
+            from_flag.or_else(|| {
+                let mode = overrides
+                    .get("thinking_mode")
+                    .and_then(serde_json::Value::as_str);
+                match mode {
+                    Some("chat") => Some(ThinkingMode::Chat),
+                    Some("thinking") => Some(ThinkingMode::Thinking),
+                    _ => None,
+                }
+            })
+        });
+        requested.unwrap_or(self.thinking_mode)
+    }
 }

 impl super::OAIPromptFormatter for DeepSeekV32Formatter {
@@ -467,6 +494,8 @@ impl super::OAIPromptFormatter for DeepSeekV32Formatter {
    }

    fn render(&self, req: &dyn super::OAIChatLikeRequest) -> Result<String> {
+        let thinking_mode = self.resolve_thinking_mode(req.chat_template_args());
+
        // Get messages from request
        let messages_value = req.messages();

@@ -532,7 +561,7 @@ impl super::OAIPromptFormatter for DeepSeekV32Formatter {
        // Encode with native implementation
        encode_messages(
            &messages_array,
-            self.thinking_mode,
+            thinking_mode,
            true, // always add BOS token
        )
    }
@@ -597,6 +626,7 @@ mod tests {
        messages: JsonValue,
        tools: Option<JsonValue>,
        response_format: Option<JsonValue>,
+        chat_template_args: Option<std::collections::HashMap<String, JsonValue>>,
    }

    impl MockRequest {
@@ -605,6 +635,7 @@ mod tests {
                messages,
                tools: None,
                response_format: None,
+                chat_template_args: None,
            }
        }

@@ -617,6 +648,14 @@ mod tests {
            self.response_format = Some(response_format);
            self
        }
+
+        /// Builder-style setter: returns the request with `chat_template_args` set to `args`.
+        fn with_chat_template_args(
+            self,
+            args: std::collections::HashMap<String, JsonValue>,
+        ) -> Self {
+            Self {
+                chat_template_args: Some(args),
+                ..self
+            }
+        }
    }

    impl super::super::OAIChatLikeRequest for MockRequest {
@@ -643,6 +682,12 @@ mod tests {
        fn should_add_generation_prompt(&self) -> bool {
            true
        }
+
+        /// Exposes the per-request template arguments stored on the mock, if any were set.
+        fn chat_template_args(
+            &self,
+        ) -> Option<&std::collections::HashMap<String, serde_json::Value>> {
+            Option::as_ref(&self.chat_template_args)
+        }
    }

    #[test]
@@ -990,4 +1035,235 @@ mod tests {
            "Should not contain Response Format section when not provided"
        );
    }
+
+    // ==================== Thinking Mode Override Tests ====================
+
+    #[test]
+    fn test_chat_mode_via_thinking_false() {
+        use super::super::OAIPromptFormatter;
+
+        let args = std::collections::HashMap::from([("thinking".to_string(), json!(false))]);
+
+        let request = MockRequest::new(json!([
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]))
+        .with_chat_template_args(args);
+
+        let formatter = DeepSeekV32Formatter::new_thinking();
+        let result = formatter.render(&request).unwrap();
+
+        let chat_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_END);
+        let thinking_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_START);
+
+        // In chat mode, the last user message should be followed by </think> (closing tag)
+        // rather than <think> (opening tag)
+        assert!(
+            result.ends_with(&chat_suffix),
+            "Chat mode should end with </think> after Assistant token, got: ...{}",
+            // Take the tail by chars, not bytes: a byte offset can split a multi-byte
+            // character and make the slice itself panic, hiding this message.
+            result
+                .chars()
+                .skip(result.chars().count().saturating_sub(80))
+                .collect::<String>(),
+        );
+        assert!(
+            !result.ends_with(&thinking_suffix),
+            "Chat mode should NOT end with <think>",
+        );
+    }
+
+    #[test]
+    fn test_explicit_thinking_true_via_args() {
+        use super::super::OAIPromptFormatter;
+
+        // Explicit `"thinking": true` sent to a formatter built with new_thinking().
+        let overrides = std::collections::HashMap::from([("thinking".to_string(), json!(true))]);
+        let conversation = json!([
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]);
+        let request = MockRequest::new(conversation).with_chat_template_args(overrides);
+
+        let prompt = DeepSeekV32Formatter::new_thinking()
+            .render(&request)
+            .unwrap();
+
+        // The prompt must stop on the Assistant token followed by the opening think tag.
+        let expected_tail = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_START);
+        assert!(
+            prompt.ends_with(&expected_tail),
+            "Thinking mode should end with <think> after Assistant token",
+        );
+    }
+
+    #[test]
+    fn test_chat_mode_via_thinking_mode_string() {
+        use super::super::OAIPromptFormatter;
+
+        // String-valued override sent to a formatter built with new_thinking().
+        let overrides =
+            std::collections::HashMap::from([("thinking_mode".to_string(), json!("chat"))]);
+        let conversation = json!([
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]);
+        let request = MockRequest::new(conversation).with_chat_template_args(overrides);
+
+        let prompt = DeepSeekV32Formatter::new_thinking()
+            .render(&request)
+            .unwrap();
+
+        // Chat mode: the Assistant token is followed by the closing think tag.
+        let expected_tail = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_END);
+        assert!(
+            prompt.ends_with(&expected_tail),
+            "thinking_mode='chat' should produce chat mode (ends with </think>)",
+        );
+    }
+
+    #[test]
+    fn test_thinking_mode_string_thinking() {
+        use super::super::OAIPromptFormatter;
+
+        // String-valued override that asks for thinking mode explicitly.
+        let overrides =
+            std::collections::HashMap::from([("thinking_mode".to_string(), json!("thinking"))]);
+        let conversation = json!([
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]);
+        let request = MockRequest::new(conversation).with_chat_template_args(overrides);
+
+        let prompt = DeepSeekV32Formatter::new_thinking()
+            .render(&request)
+            .unwrap();
+
+        // Thinking mode: the Assistant token is followed by the opening think tag.
+        let expected_tail = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_START);
+        assert!(
+            prompt.ends_with(&expected_tail),
+            "thinking_mode='thinking' should produce thinking mode (ends with <think>)",
+        );
+    }
+
+    #[test]
+    fn test_default_thinking_mode_without_args() {
+        use super::super::OAIPromptFormatter;
+
+        // No chat_template_args — each formatter should fall back to its own default mode
+        let request = MockRequest::new(json!([
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]));
+        let thinking_tail = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_START);
+        let chat_tail = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_END);
+
+        let thinking_prompt = DeepSeekV32Formatter::new_thinking()
+            .render(&request)
+            .unwrap();
+        assert!(
+            thinking_prompt.ends_with(&thinking_tail),
+            "Default (new_thinking) should produce thinking mode",
+        );
+
+        // Verify new_chat() default also works
+        let chat_prompt = DeepSeekV32Formatter::new_chat().render(&request).unwrap();
+        assert!(
+            chat_prompt.ends_with(&chat_tail),
+            "Default (new_chat) should produce chat mode",
+        );
+    }
+
+    #[test]
+    fn test_thinking_false_overrides_default_thinking() {
+        use super::super::OAIPromptFormatter;
+
+        let overrides = std::collections::HashMap::from([("thinking".to_string(), json!(false))]);
+        let conversation = json!([
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]);
+        let request = MockRequest::new(conversation).with_chat_template_args(overrides);
+
+        // Formatter defaults to thinking, but request overrides to chat
+        let prompt = DeepSeekV32Formatter::new_thinking()
+            .render(&request)
+            .unwrap();
+
+        let expected_tail = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_END);
+        assert!(
+            prompt.ends_with(&expected_tail),
+            "Per-request thinking=false should override new_thinking() default",
+        );
+    }
+
+    #[test]
+    fn test_thinking_true_overrides_default_chat() {
+        use super::super::OAIPromptFormatter;
+
+        let overrides = std::collections::HashMap::from([("thinking".to_string(), json!(true))]);
+        let conversation = json!([
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]);
+        let request = MockRequest::new(conversation).with_chat_template_args(overrides);
+
+        // Formatter defaults to chat, but request overrides to thinking
+        let prompt = DeepSeekV32Formatter::new_chat().render(&request).unwrap();
+
+        let expected_tail = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_START);
+        assert!(
+            prompt.ends_with(&expected_tail),
+            "Per-request thinking=true should override new_chat() default",
+        );
+    }
+
+    #[test]
+    fn test_thinking_bool_takes_precedence_over_thinking_mode_string() {
+        use super::super::OAIPromptFormatter;
+
+        // Both keys are present and disagree with each other.
+        let overrides = std::collections::HashMap::from([
+            ("thinking".to_string(), json!(false)),
+            ("thinking_mode".to_string(), json!("thinking")),
+        ]);
+        let conversation = json!([
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]);
+        let request = MockRequest::new(conversation).with_chat_template_args(overrides);
+
+        let prompt = DeepSeekV32Formatter::new_thinking()
+            .render(&request)
+            .unwrap();
+
+        // "thinking": false should win over "thinking_mode": "thinking"
+        let expected_tail = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_END);
+        assert!(
+            prompt.ends_with(&expected_tail),
+            "Boolean 'thinking' key should take precedence over 'thinking_mode' string",
+        );
+    }
 }