Unverified Commit 256c4c25 authored by mlmz's avatar mlmz Committed by GitHub
Browse files

fix: correct stream response when enable_thinking is set to false (#5881)

parent 9f21e754
......@@ -899,6 +899,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
return response
def _get_enable_thinking_from_request(request_obj):
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.
Args:
request_obj: The request object (or an item from a list of requests).
Returns:
The boolean value of 'enable_thinking' if found and not True, otherwise True.
"""
if (
hasattr(request_obj, "chat_template_kwargs")
and request_obj.chat_template_kwargs
and request_obj.chat_template_kwargs.get("enable_thinking") is not None
):
return request_obj.chat_template_kwargs.get("enable_thinking")
return True
def v1_chat_generate_request(
all_requests: List[ChatCompletionRequest],
tokenizer_manager,
......@@ -1263,31 +1281,16 @@ def v1_chat_generate_response(
tool_calls = None
text = ret_item["text"]
enable_thinking = True
if isinstance(request, list):
tool_choice = request[idx].tool_choice
tools = request[idx].tools
separate_reasoning = request[idx].separate_reasoning
if (
request[idx].chat_template_kwargs
and request[idx].chat_template_kwargs.get("enable_thinking") is not None
):
enable_thinking = request[idx].chat_template_kwargs.get(
"enable_thinking", True
)
enable_thinking = _get_enable_thinking_from_request(request[idx])
else:
tool_choice = request.tool_choice
tools = request.tools
separate_reasoning = request.separate_reasoning
if (
request.chat_template_kwargs
and request.chat_template_kwargs.get("enable_thinking") is not None
):
enable_thinking = request.chat_template_kwargs.get(
"enable_thinking", True
)
enable_thinking = _get_enable_thinking_from_request(request)
reasoning_text = None
if reasoning_parser and separate_reasoning and enable_thinking:
......@@ -1526,9 +1529,12 @@ async def v1_chat_completions(
delta = text[len(stream_buffer) :]
new_stream_buffer = stream_buffer + delta
enable_thinking = _get_enable_thinking_from_request(request)
if (
tokenizer_manager.server_args.reasoning_parser
and request.separate_reasoning
and enable_thinking
):
if index not in reasoning_parser_dict:
reasoning_parser_dict[index] = ReasoningParser(
......
......@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
"hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
)
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST = "Qwen/Qwen3-30B-A3B"
# Nightly tests
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
......
......@@ -59,6 +59,7 @@ suites = {
TestFile("test_pytorch_sampling_backend.py", 66),
TestFile("test_radix_attention.py", 167),
TestFile("test_reasoning_content.py", 89),
TestFile("test_enable_thinking.py", 70),
TestFile("test_regex_constrained.py", 64),
TestFile("test_release_memory_occupation.py", 44),
TestFile("test_request_length_validation.py", 31),
......
"""
Usage:
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning
"""
import asyncio
import json
import os
import sys
import time
import unittest
import requests
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
popen_launch_server,
)
class TestEnableThinking(CustomTestCase):
@classmethod
def setUpClass(cls):
cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
cls.base_url = DEFAULT_URL_FOR_TEST
cls.api_key = "sk-1234"
cls.process = popen_launch_server(
cls.model,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
api_key=cls.api_key,
other_args=[
"--reasoning-parser",
"qwen3",
],
)
@classmethod
def tearDownClass(cls):
kill_process_tree(cls.process.pid)
def test_chat_completion_with_reasoning(self):
# Test non-streaming with "enable_thinking": True, reasoning_content should not be empty
client = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Authorization": f"Bearer {self.api_key}"},
json={
"model": self.model,
"messages": [{"role": "user", "content": "Hello"}],
"temperature": 0,
"separate_reasoning": True,
"chat_template_kwargs": {"enable_thinking": True},
},
)
self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
data = client.json()
self.assertIn("choices", data)
self.assertTrue(len(data["choices"]) > 0)
self.assertIn("message", data["choices"][0])
self.assertIn("reasoning_content", data["choices"][0]["message"])
self.assertIsNotNone(data["choices"][0]["message"]["reasoning_content"])
def test_chat_completion_without_reasoning(self):
# Test non-streaming with "enable_thinking": False, reasoning_content should be empty
client = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Authorization": f"Bearer {self.api_key}"},
json={
"model": self.model,
"messages": [{"role": "user", "content": "Hello"}],
"temperature": 0,
"separate_reasoning": True,
"chat_template_kwargs": {"enable_thinking": False},
},
)
self.assertEqual(client.status_code, 200, f"Failed with: {client.text}")
data = client.json()
self.assertIn("choices", data)
self.assertTrue(len(data["choices"]) > 0)
self.assertIn("message", data["choices"][0])
if "reasoning_content" in data["choices"][0]["message"]:
self.assertIsNone(data["choices"][0]["message"]["reasoning_content"])
def test_stream_chat_completion_with_reasoning(self):
# Test streaming with "enable_thinking": True, reasoning_content should not be empty
response = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Authorization": f"Bearer {self.api_key}"},
json={
"model": self.model,
"messages": [{"role": "user", "content": "Hello"}],
"temperature": 0,
"separate_reasoning": True,
"stream": True,
"chat_template_kwargs": {"enable_thinking": True},
},
stream=True,
)
self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")
has_reasoning = False
has_content = False
print("\n=== Stream With Reasoning ===")
for line in response.iter_lines():
if line:
line = line.decode("utf-8")
if line.startswith("data:") and not line.startswith("data: [DONE]"):
data = json.loads(line[6:])
if "choices" in data and len(data["choices"]) > 0:
delta = data["choices"][0].get("delta", {})
if "reasoning_content" in delta and delta["reasoning_content"]:
has_reasoning = True
if "content" in delta and delta["content"]:
has_content = True
self.assertTrue(
has_reasoning,
"The reasoning content is not included in the stream response",
)
self.assertTrue(
has_content, "The stream response does not contain normal content"
)
def test_stream_chat_completion_without_reasoning(self):
# Test streaming with "enable_thinking": False, reasoning_content should be empty
response = requests.post(
f"{self.base_url}/v1/chat/completions",
headers={"Authorization": f"Bearer {self.api_key}"},
json={
"model": self.model,
"messages": [{"role": "user", "content": "Hello"}],
"temperature": 0,
"separate_reasoning": True,
"stream": True,
"chat_template_kwargs": {"enable_thinking": False},
},
stream=True,
)
self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")
has_reasoning = False
has_content = False
print("\n=== Stream Without Reasoning ===")
for line in response.iter_lines():
if line:
line = line.decode("utf-8")
if line.startswith("data:") and not line.startswith("data: [DONE]"):
data = json.loads(line[6:])
if "choices" in data and len(data["choices"]) > 0:
delta = data["choices"][0].get("delta", {})
if "reasoning_content" in delta and delta["reasoning_content"]:
has_reasoning = True
if "content" in delta and delta["content"]:
has_content = True
self.assertFalse(
has_reasoning,
"The reasoning content should not be included in the stream response",
)
self.assertTrue(
has_content, "The stream response does not contain normal content"
)
if __name__ == "__main__":
unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment