Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
256c4c25
Unverified
Commit
256c4c25
authored
May 01, 2025
by
mlmz
Committed by
GitHub
Apr 30, 2025
Browse files
fix: correct stream response when enable_thinking is set to false (#5881)
parent
9f21e754
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
211 additions
and
17 deletions
+211
-17
python/sglang/srt/openai_api/adapter.py
python/sglang/srt/openai_api/adapter.py
+23
-17
python/sglang/test/test_utils.py
python/sglang/test/test_utils.py
+1
-0
test/srt/run_suite.py
test/srt/run_suite.py
+1
-0
test/srt/test_enable_thinking.py
test/srt/test_enable_thinking.py
+186
-0
No files found.
python/sglang/srt/openai_api/adapter.py
View file @
256c4c25
...
...
@@ -899,6 +899,24 @@ async def v1_completions(tokenizer_manager, raw_request: Request):
return
response
def
_get_enable_thinking_from_request
(
request_obj
):
"""Extracts the 'enable_thinking' flag from request chat_template_kwargs.
Args:
request_obj: The request object (or an item from a list of requests).
Returns:
The boolean value of 'enable_thinking' if found and not True, otherwise True.
"""
if
(
hasattr
(
request_obj
,
"chat_template_kwargs"
)
and
request_obj
.
chat_template_kwargs
and
request_obj
.
chat_template_kwargs
.
get
(
"enable_thinking"
)
is
not
None
):
return
request_obj
.
chat_template_kwargs
.
get
(
"enable_thinking"
)
return
True
def
v1_chat_generate_request
(
all_requests
:
List
[
ChatCompletionRequest
],
tokenizer_manager
,
...
...
@@ -1263,31 +1281,16 @@ def v1_chat_generate_response(
tool_calls
=
None
text
=
ret_item
[
"text"
]
enable_thinking
=
True
if
isinstance
(
request
,
list
):
tool_choice
=
request
[
idx
].
tool_choice
tools
=
request
[
idx
].
tools
separate_reasoning
=
request
[
idx
].
separate_reasoning
if
(
request
[
idx
].
chat_template_kwargs
and
request
[
idx
].
chat_template_kwargs
.
get
(
"enable_thinking"
)
is
not
None
):
enable_thinking
=
request
[
idx
].
chat_template_kwargs
.
get
(
"enable_thinking"
,
True
)
enable_thinking
=
_get_enable_thinking_from_request
(
request
[
idx
])
else
:
tool_choice
=
request
.
tool_choice
tools
=
request
.
tools
separate_reasoning
=
request
.
separate_reasoning
if
(
request
.
chat_template_kwargs
and
request
.
chat_template_kwargs
.
get
(
"enable_thinking"
)
is
not
None
):
enable_thinking
=
request
.
chat_template_kwargs
.
get
(
"enable_thinking"
,
True
)
enable_thinking
=
_get_enable_thinking_from_request
(
request
)
reasoning_text
=
None
if
reasoning_parser
and
separate_reasoning
and
enable_thinking
:
...
...
@@ -1526,9 +1529,12 @@ async def v1_chat_completions(
delta
=
text
[
len
(
stream_buffer
)
:]
new_stream_buffer
=
stream_buffer
+
delta
enable_thinking
=
_get_enable_thinking_from_request
(
request
)
if
(
tokenizer_manager
.
server_args
.
reasoning_parser
and
request
.
separate_reasoning
and
enable_thinking
):
if
index
not
in
reasoning_parser_dict
:
reasoning_parser_dict
[
index
]
=
ReasoningParser
(
...
...
python/sglang/test/test_utils.py
View file @
256c4c25
...
...
@@ -69,6 +69,7 @@ DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST
=
(
"hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
)
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
=
"Qwen/Qwen3-30B-A3B"
# Nightly tests
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
=
"meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
...
...
test/srt/run_suite.py
View file @
256c4c25
...
...
@@ -59,6 +59,7 @@ suites = {
TestFile
(
"test_pytorch_sampling_backend.py"
,
66
),
TestFile
(
"test_radix_attention.py"
,
167
),
TestFile
(
"test_reasoning_content.py"
,
89
),
TestFile
(
"test_enable_thinking.py"
,
70
),
TestFile
(
"test_regex_constrained.py"
,
64
),
TestFile
(
"test_release_memory_occupation.py"
,
44
),
TestFile
(
"test_request_length_validation.py"
,
31
),
...
...
test/srt/test_enable_thinking.py
0 → 100644
View file @
256c4c25
"""
Usage:
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_chat_completion_without_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_with_reasoning
python3 -m unittest test_enable_thinking.TestEnableThinking.test_stream_chat_completion_without_reasoning
"""
import
asyncio
import
json
import
os
import
sys
import
time
import
unittest
import
requests
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.test_utils
import
(
DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
DEFAULT_URL_FOR_TEST
,
CustomTestCase
,
popen_launch_server
,
)
class TestEnableThinking(CustomTestCase):
    """End-to-end tests for the ``enable_thinking`` chat_template_kwargs flag.

    Launches a server with the Qwen3 reasoning parser and verifies that
    ``reasoning_content`` is emitted when ``enable_thinking`` is True and
    suppressed when it is False, for both non-streaming and streaming
    chat-completion responses.
    """

    @classmethod
    def setUpClass(cls):
        # Launch one shared server for all tests in this class.
        cls.model = DEFAULT_ENABLE_THINKING_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-1234"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=[
                "--reasoning-parser",
                "qwen3",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        # Tear down the whole server process tree, not just the parent.
        kill_process_tree(cls.process.pid)

    def _post_chat(self, enable_thinking, stream):
        """POST a chat completion request and assert it returned HTTP 200.

        Args:
            enable_thinking: Value to place in chat_template_kwargs.
            stream: Whether to request (and consume) a streaming response.

        Returns:
            The ``requests.Response`` object.
        """
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": "Hello"}],
            "temperature": 0,
            "separate_reasoning": True,
            "chat_template_kwargs": {"enable_thinking": enable_thinking},
        }
        if stream:
            payload["stream"] = True
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            headers={"Authorization": f"Bearer {self.api_key}"},
            json=payload,
            stream=stream,
        )
        self.assertEqual(response.status_code, 200, f"Failed with: {response.text}")
        return response

    def _scan_stream(self, response):
        """Scan an SSE stream and report which delta fields appeared.

        Returns:
            Tuple ``(has_reasoning, has_content)`` — True when a non-empty
            ``reasoning_content`` / ``content`` delta was seen, respectively.
        """
        has_reasoning = False
        has_content = False
        for line in response.iter_lines():
            if not line:
                continue
            line = line.decode("utf-8")
            # Each SSE event is "data: <json>"; the terminator is "data: [DONE]".
            if line.startswith("data:") and not line.startswith("data: [DONE]"):
                data = json.loads(line[6:])
                if "choices" in data and len(data["choices"]) > 0:
                    delta = data["choices"][0].get("delta", {})
                    if "reasoning_content" in delta and delta["reasoning_content"]:
                        has_reasoning = True
                    if "content" in delta and delta["content"]:
                        has_content = True
        return has_reasoning, has_content

    def test_chat_completion_with_reasoning(self):
        # Non-streaming with "enable_thinking": True — reasoning_content
        # should be present and non-empty.
        data = self._post_chat(enable_thinking=True, stream=False).json()
        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])
        self.assertIn("reasoning_content", data["choices"][0]["message"])
        self.assertIsNotNone(data["choices"][0]["message"]["reasoning_content"])

    def test_chat_completion_without_reasoning(self):
        # Non-streaming with "enable_thinking": False — reasoning_content
        # should be absent or None.
        data = self._post_chat(enable_thinking=False, stream=False).json()
        self.assertIn("choices", data)
        self.assertTrue(len(data["choices"]) > 0)
        self.assertIn("message", data["choices"][0])
        if "reasoning_content" in data["choices"][0]["message"]:
            self.assertIsNone(data["choices"][0]["message"]["reasoning_content"])

    def test_stream_chat_completion_with_reasoning(self):
        # Streaming with "enable_thinking": True — reasoning deltas and
        # normal content deltas must both appear.
        response = self._post_chat(enable_thinking=True, stream=True)
        print("\n=== Stream With Reasoning ===")
        has_reasoning, has_content = self._scan_stream(response)
        self.assertTrue(
            has_reasoning,
            "The reasoning content is not included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )

    def test_stream_chat_completion_without_reasoning(self):
        # Streaming with "enable_thinking": False — no reasoning deltas,
        # but normal content deltas must still appear.
        response = self._post_chat(enable_thinking=False, stream=True)
        print("\n=== Stream Without Reasoning ===")
        has_reasoning, has_content = self._scan_stream(response)
        self.assertFalse(
            has_reasoning,
            "The reasoning content should not be included in the stream response",
        )
        self.assertTrue(
            has_content, "The stream response does not contain normal content"
        )
# Allow running this test module directly:
#   python3 test_enable_thinking.py
if __name__ == "__main__":
    unittest.main()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment