"examples/backends/vllm/vscode:/vscode.git/clone" did not exist on "80e7bafd37a0bc5970bea955a63e746ba5adac5a"
Unverified Commit 745d7e4c authored by brluo, committed by GitHub
Browse files

fix: respect per-request chat_template_kwargs for DeepSeek V3.2 thinking mode (#7286)


Co-authored-by: kangclzjc <kangz@nvidia.com>
parent 387100c8
......@@ -1090,6 +1090,8 @@ impl OpenAIPreprocessor {
/// For kimi_k25: disabled when chat_template_args contains "thinking": false.
/// For nemotron_nano: disabled when chat_template_args contains "enable_thinking": false
/// or "force_nonempty_content": true.
/// For deepseek_r1: disabled when chat_template_args contains "thinking": false
/// or "thinking_mode": "chat".
fn is_reasoning_disabled_by_request(
reasoning_parser: Option<&str>,
chat_template_args: Option<&std::collections::HashMap<String, serde_json::Value>>,
......@@ -1118,6 +1120,17 @@ impl OpenAIPreprocessor {
}
false
}
Some("deepseek_r1") => {
if let Some(args) = chat_template_args {
if let Some(thinking) = args.get("thinking") {
return thinking == &serde_json::Value::Bool(false);
}
if let Some(mode) = args.get("thinking_mode").and_then(|v| v.as_str()) {
return mode == "chat";
}
}
false
}
_ => false,
}
}
......@@ -1500,6 +1513,22 @@ mod tests {
);
m
};
let thinking_mode_chat = {
let mut m = std::collections::HashMap::new();
m.insert(
"thinking_mode".to_string(),
serde_json::Value::String("chat".to_string()),
);
m
};
let thinking_mode_thinking = {
let mut m = std::collections::HashMap::new();
m.insert(
"thinking_mode".to_string(),
serde_json::Value::String("thinking".to_string()),
);
m
};
let empty_args = std::collections::HashMap::new();
// (parser, args, expected_disabled, description)
......@@ -1528,11 +1557,42 @@ mod tests {
false,
"kimi_k25 + empty args → enabled",
),
// deepseek_r1 uses "thinking" bool or "thinking_mode" string
(
Some("deepseek_r1"),
Some(&thinking_false),
true,
"deepseek_r1 + thinking=false → disabled",
),
(
Some("deepseek_r1"),
Some(&thinking_true),
false,
"deepseek_r1 + thinking=true → enabled",
),
(
Some("deepseek_r1"),
Some(&thinking_mode_chat),
true,
"deepseek_r1 + thinking_mode=chat → disabled",
),
(
Some("deepseek_r1"),
Some(&thinking_mode_thinking),
false,
"deepseek_r1 + thinking_mode=thinking → enabled",
),
(
Some("deepseek_r1"),
None,
false,
"deepseek_r1 + no args → enabled",
),
(
Some("deepseek_r1"),
Some(&empty_args),
false,
"deepseek_r1 → never disabled",
"deepseek_r1 + empty args → enabled",
),
(
Some("basic"),
......
......@@ -459,6 +459,33 @@ impl DeepSeekV32Formatter {
pub fn new_chat() -> Self {
// Delegates to `new` with `ThinkingMode::Chat` as this formatter's default mode.
Self::new(ThinkingMode::Chat)
}
/// Pick the thinking mode to use for a single request.
///
/// Per-request `chat_template_args` win over the formatter's configured default.
/// Lookup order:
/// 1. `{"thinking": bool}` — `true` selects [`ThinkingMode::Thinking`], `false`
///    selects [`ThinkingMode::Chat`]. A non-boolean value is ignored.
/// 2. `{"thinking_mode": "chat" | "thinking"}` — the string form used by the
///    DSV3.2 Jinja template parameter. Any other value is ignored.
///
/// If neither key yields a mode (or `args` is `None`), the formatter's own
/// `thinking_mode` is returned.
fn resolve_thinking_mode(
    &self,
    args: Option<&std::collections::HashMap<String, serde_json::Value>>,
) -> ThinkingMode {
    args.and_then(|overrides| {
        // Boolean flag is consulted first, so it takes precedence when both keys exist.
        let from_flag = overrides
            .get("thinking")
            .and_then(serde_json::Value::as_bool)
            .map(|enabled| {
                if enabled {
                    ThinkingMode::Thinking
                } else {
                    ThinkingMode::Chat
                }
            });

        // String form is only inspected when the flag did not decide.
        from_flag.or_else(|| {
            match overrides
                .get("thinking_mode")
                .and_then(serde_json::Value::as_str)
            {
                Some("chat") => Some(ThinkingMode::Chat),
                Some("thinking") => Some(ThinkingMode::Thinking),
                _ => None,
            }
        })
    })
    .unwrap_or(self.thinking_mode)
}
}
impl super::OAIPromptFormatter for DeepSeekV32Formatter {
......@@ -467,6 +494,8 @@ impl super::OAIPromptFormatter for DeepSeekV32Formatter {
}
fn render(&self, req: &dyn super::OAIChatLikeRequest) -> Result<String> {
let thinking_mode = self.resolve_thinking_mode(req.chat_template_args());
// Get messages from request
let messages_value = req.messages();
......@@ -532,7 +561,7 @@ impl super::OAIPromptFormatter for DeepSeekV32Formatter {
// Encode with native implementation
encode_messages(
&messages_array,
self.thinking_mode,
thinking_mode,
true, // always add BOS token
)
}
......@@ -597,6 +626,7 @@ mod tests {
messages: JsonValue,
tools: Option<JsonValue>,
response_format: Option<JsonValue>,
chat_template_args: Option<std::collections::HashMap<String, JsonValue>>,
}
impl MockRequest {
......@@ -605,6 +635,7 @@ mod tests {
messages,
tools: None,
response_format: None,
chat_template_args: None,
}
}
......@@ -617,6 +648,14 @@ mod tests {
self.response_format = Some(response_format);
self
}
fn with_chat_template_args(
mut self,
args: std::collections::HashMap<String, JsonValue>,
) -> Self {
self.chat_template_args = Some(args);
self
}
}
impl super::super::OAIChatLikeRequest for MockRequest {
......@@ -643,6 +682,12 @@ mod tests {
fn should_add_generation_prompt(&self) -> bool {
// The mock unconditionally asks for a generation prompt.
true
}
/// Returns a borrow of the mock's per-request chat template arguments, or `None`
/// when none were set (the state `MockRequest::new` leaves it in).
fn chat_template_args(
&self,
) -> Option<&std::collections::HashMap<String, serde_json::Value>> {
self.chat_template_args.as_ref()
}
}
#[test]
......@@ -990,4 +1035,235 @@ mod tests {
"Should not contain Response Format section when not provided"
);
}
// ==================== Thinking Mode Override Tests ====================
/// `{"thinking": false}` on the request must switch a thinking-default formatter
/// into chat mode: the prompt ends with the assistant token followed by the
/// thinking *end* token rather than the thinking *start* token.
#[test]
fn test_chat_mode_via_thinking_false() {
    use super::super::OAIPromptFormatter;

    let args = std::collections::HashMap::from([("thinking".to_string(), json!(false))]);

    let request = MockRequest::new(json!([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ]))
    .with_chat_template_args(args);

    let formatter = DeepSeekV32Formatter::new_thinking();
    let result = formatter.render(&request).unwrap();

    // Tail of the prompt for the failure message. Slicing a `str` at an arbitrary
    // byte offset panics when the offset falls inside a multi-byte character, which
    // would replace the real assertion message with an unrelated slicing panic.
    // Back off to the nearest char boundary first (offset 0 is always a boundary).
    let mut tail_start = result.len().saturating_sub(80);
    while !result.is_char_boundary(tail_start) {
        tail_start -= 1;
    }
    let tail = &result[tail_start..];

    // In chat mode, the last user message should be followed by </think> (closing tag)
    // rather than <think> (opening tag)
    assert!(
        result.ends_with(&format!(
            "{}{}",
            tokens::ASSISTANT_START,
            tokens::THINKING_END
        )),
        "Chat mode should end with </think> after Assistant token, got: ...{}",
        tail,
    );
    assert!(
        !result.ends_with(&format!(
            "{}{}",
            tokens::ASSISTANT_START,
            tokens::THINKING_START
        )),
        "Chat mode should NOT end with <think>",
    );
}
/// An explicit `{"thinking": true}` on a thinking-default formatter keeps thinking
/// mode: the prompt ends with the assistant token plus the thinking start token.
#[test]
fn test_explicit_thinking_true_via_args() {
    use super::super::OAIPromptFormatter;

    let mut overrides = std::collections::HashMap::new();
    overrides.insert("thinking".to_string(), json!(true));

    let conversation = json!([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ]);
    let request = MockRequest::new(conversation).with_chat_template_args(overrides);

    let rendered = DeepSeekV32Formatter::new_thinking()
        .render(&request)
        .unwrap();

    let expected_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_START);
    assert!(
        rendered.ends_with(&expected_suffix),
        "Thinking mode should end with <think> after Assistant token",
    );
}
/// The string form `{"thinking_mode": "chat"}` must select chat mode even when the
/// formatter itself defaults to thinking.
#[test]
fn test_chat_mode_via_thinking_mode_string() {
    use super::super::OAIPromptFormatter;

    let mut overrides = std::collections::HashMap::new();
    overrides.insert("thinking_mode".to_string(), json!("chat"));

    let conversation = json!([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ]);
    let request = MockRequest::new(conversation).with_chat_template_args(overrides);

    let rendered = DeepSeekV32Formatter::new_thinking()
        .render(&request)
        .unwrap();

    let expected_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_END);
    assert!(
        rendered.ends_with(&expected_suffix),
        "thinking_mode='chat' should produce chat mode (ends with </think>)",
    );
}
/// The string form `{"thinking_mode": "thinking"}` must yield a prompt that ends
/// with the assistant token followed by the thinking start token.
#[test]
fn test_thinking_mode_string_thinking() {
    use super::super::OAIPromptFormatter;

    let mut overrides = std::collections::HashMap::new();
    overrides.insert("thinking_mode".to_string(), json!("thinking"));

    let conversation = json!([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ]);
    let request = MockRequest::new(conversation).with_chat_template_args(overrides);

    let rendered = DeepSeekV32Formatter::new_thinking()
        .render(&request)
        .unwrap();

    let expected_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_START);
    assert!(
        rendered.ends_with(&expected_suffix),
        "thinking_mode='thinking' should produce thinking mode (ends with <think>)",
    );
}
/// Without any `chat_template_args`, each constructor's own default decides the
/// mode: `new_thinking()` ends the prompt with the thinking start token and
/// `new_chat()` ends it with the thinking end token.
#[test]
fn test_default_thinking_mode_without_args() {
    use super::super::OAIPromptFormatter;

    // The request carries no per-request overrides at all.
    let conversation = json!([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ]);
    let request = MockRequest::new(conversation);

    // Thinking-default formatter.
    let thinking_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_START);
    let thinking_prompt = DeepSeekV32Formatter::new_thinking()
        .render(&request)
        .unwrap();
    assert!(
        thinking_prompt.ends_with(&thinking_suffix),
        "Default (new_thinking) should produce thinking mode",
    );

    // Chat-default formatter, same request.
    let chat_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_END);
    let chat_prompt = DeepSeekV32Formatter::new_chat().render(&request).unwrap();
    assert!(
        chat_prompt.ends_with(&chat_suffix),
        "Default (new_chat) should produce chat mode",
    );
}
/// A per-request `{"thinking": false}` must beat the `new_thinking()` default and
/// produce a chat-mode prompt.
#[test]
fn test_thinking_false_overrides_default_thinking() {
    use super::super::OAIPromptFormatter;

    let overrides: std::collections::HashMap<_, _> =
        [("thinking".to_string(), json!(false))].into_iter().collect();

    let conversation = json!([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ]);
    let request = MockRequest::new(conversation).with_chat_template_args(overrides);

    // The formatter's default is thinking; the request asks for chat.
    let rendered = DeepSeekV32Formatter::new_thinking()
        .render(&request)
        .unwrap();

    let chat_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_END);
    assert!(
        rendered.ends_with(&chat_suffix),
        "Per-request thinking=false should override new_thinking() default",
    );
}
/// A per-request `{"thinking": true}` must beat the `new_chat()` default and
/// produce a thinking-mode prompt.
#[test]
fn test_thinking_true_overrides_default_chat() {
    use super::super::OAIPromptFormatter;

    let overrides: std::collections::HashMap<_, _> =
        [("thinking".to_string(), json!(true))].into_iter().collect();

    let conversation = json!([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ]);
    let request = MockRequest::new(conversation).with_chat_template_args(overrides);

    // The formatter's default is chat; the request asks for thinking.
    let rendered = DeepSeekV32Formatter::new_chat().render(&request).unwrap();

    let thinking_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_START);
    assert!(
        rendered.ends_with(&thinking_suffix),
        "Per-request thinking=true should override new_chat() default",
    );
}
/// When both keys are supplied and disagree, the boolean `thinking` key decides:
/// `thinking: false` together with `thinking_mode: "thinking"` yields chat mode.
#[test]
fn test_thinking_bool_takes_precedence_over_thinking_mode_string() {
    use super::super::OAIPromptFormatter;

    // Two conflicting overrides on the same request.
    let mut overrides = std::collections::HashMap::new();
    overrides.insert("thinking".to_string(), json!(false));
    overrides.insert("thinking_mode".to_string(), json!("thinking"));

    let conversation = json!([
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ]);
    let request = MockRequest::new(conversation).with_chat_template_args(overrides);

    let rendered = DeepSeekV32Formatter::new_thinking()
        .render(&request)
        .unwrap();

    // Chat-mode suffix expected: the boolean wins over the string.
    let chat_suffix = format!("{}{}", tokens::ASSISTANT_START, tokens::THINKING_END);
    assert!(
        rendered.ends_with(&chat_suffix),
        "Boolean 'thinking' key should take precedence over 'thinking_mode' string",
    );
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment