Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
6299628d
Unverified
Commit
6299628d
authored
Dec 11, 2025
by
Rei.
Committed by
GitHub
Dec 11, 2025
Browse files
[bugfix] fix MiniMaxM2ReasoningParser streaming output not separating reasoning_content. (#29882)
Signed-off-by:
Rei
<
1477174254@qq.com
>
parent
fba89069
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
468 additions
and
0 deletions
+468
-0
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
+195
-0
tests/reasoning/test_minimax_m2_reasoning_parser.py
tests/reasoning/test_minimax_m2_reasoning_parser.py
+230
-0
vllm/reasoning/minimax_m2_reasoning_parser.py
vllm/reasoning/minimax_m2_reasoning_parser.py
+43
-0
No files found.
tests/reasoning/test_minimax_m2_append_reasoning_parser.py
0 → 100644
View file @
6299628d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
transformers
import
AutoTokenizer
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
# Name the parser is registered under, and the tag that closes a reasoning span.
parser_name, end_token = "minimax_m2_append_think", "</think>"

# MiniMax M2 model path
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
    """Load the tokenizer named by REASONING_MODEL_NAME once per test module."""
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
# =============================================================================
# Expected behavior of MiniMaxM2AppendThinkReasoningParser:
# - "<think>" is prepended to the start of the output
# - reasoning and content are NOT split apart
# - the whole text is returned as content (with "<think>" in front)
# - reasoning is always None
#
# Use this parser to keep the raw output, only with <think> added.
# =============================================================================

# Case: reasoning, end token, then a response
SIMPLE_OUTPUT = dict(
    output="This is reasoning</think>This is response",
    reasoning=None,
    content="<think>This is reasoning</think>This is response",
    is_reasoning_end=True,
)

# Case: no end token in the output (reasoning still in progress)
NO_END_TOKEN = dict(
    output="This is reasoning in progress",
    reasoning=None,
    content="<think>This is reasoning in progress",
    is_reasoning_end=False,
)

# Case: the output starts directly with the end token
ONLY_END_TOKEN = dict(
    output="</think>This is response",
    reasoning=None,
    content="<think></think>This is response",
    is_reasoning_end=True,
)

# Case: newlines on both sides of the end token
MULTIPLE_LINES = dict(
    output="Line 1\nLine 2</think>Response 1\nResponse 2",
    reasoning=None,
    content="<think>Line 1\nLine 2</think>Response 1\nResponse 2",
    is_reasoning_end=True,
)

# Case: empty output, non-streaming (<think> is still prepended)
EMPTY = dict(
    output="",
    reasoning=None,
    content="<think>",
    is_reasoning_end=False,
)

# Case: empty output, streaming (no tokens, so nothing is emitted)
EMPTY_STREAMING = dict(
    output="",
    reasoning=None,
    content=None,
    is_reasoning_end=False,
)

# Case: punctuation and arithmetic symbols
SPECIAL_CHARS = dict(
    output="Let me think... 1+1=2</think>Yes!",
    reasoning=None,
    content="<think>Let me think... 1+1=2</think>Yes!",
    is_reasoning_end=True,
)

# Case: fenced code block before the end token
CODE_OUTPUT = dict(
    output="```python\nprint('hi')\n```</think>Here's the code.",
    reasoning=None,
    content="<think>```python\nprint('hi')\n```</think>Here's the code.",
    is_reasoning_end=True,
)
# Every (id, non-streaming case, streaming case) row expands into two params:
# the non-streaming run first, then the streaming run with "_streaming"
# appended to its id. Only the empty-output row uses different expectations
# for the two modes.
TEST_CASES = [
    pytest.param(
        streaming,
        case,
        id=f"{case_id}_streaming" if streaming else case_id,
    )
    for case_id, plain_case, streamed_case in (
        ("simple_output", SIMPLE_OUTPUT, SIMPLE_OUTPUT),
        ("no_end_token", NO_END_TOKEN, NO_END_TOKEN),
        ("only_end_token", ONLY_END_TOKEN, ONLY_END_TOKEN),
        ("multiple_lines", MULTIPLE_LINES, MULTIPLE_LINES),
        ("empty", EMPTY, EMPTY_STREAMING),
        ("special_chars", SPECIAL_CHARS, SPECIAL_CHARS),
        ("code_output", CODE_OUTPUT, CODE_OUTPUT),
    )
    for streaming, case in ((False, plain_case), (True, streamed_case))
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    minimax_m2_tokenizer,
):
    """Check one case's reasoning, content and is_reasoning_end expectations.

    ``streaming`` is forwarded to ``run_reasoning_extraction``; ``param_dict``
    supplies the raw ``output`` text and the expected values.
    """
    raw_tokens = minimax_m2_tokenizer.tokenize(param_dict["output"])

    # Turn every token into its own text piece so the parser can be fed
    # one piece at a time.
    token_texts: list[str] = []
    for raw_token in raw_tokens:
        token_texts.append(minimax_m2_tokenizer.convert_tokens_to_string([raw_token]))

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(minimax_m2_tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser, token_texts, streaming=streaming
    )
    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

    # is_reasoning_end is checked against the ids of the full output.
    token_ids = minimax_m2_tokenizer.convert_tokens_to_ids(raw_tokens)
    assert parser.is_reasoning_end(token_ids) == param_dict["is_reasoning_end"]
tests/reasoning/test_minimax_m2_reasoning_parser.py
0 → 100644
View file @
6299628d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
transformers
import
AutoTokenizer
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
# Name the parser is registered under, and the tag that closes a reasoning span.
parser_name, end_token = "minimax_m2", "</think>"

# MiniMax M2 model path
REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2"
@pytest.fixture(scope="module")
def minimax_m2_tokenizer():
    """Load the tokenizer named by REASONING_MODEL_NAME once per test module."""
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
# =============================================================================
# Behavior specific to MiniMax M2:
# - the model does NOT emit the <think> start token
# - the model only emits the </think> end token
# - everything before </think> is reasoning
# - everything after </think> is the actual response (content)
# =============================================================================

# Case: reasoning, end token, then content (the typical shape)
SIMPLE_REASONING = dict(
    output="This is a reasoning section</think>This is the rest",
    reasoning="This is a reasoning section",
    content="This is the rest",
    is_reasoning_end=True,
)

# Case: reasoning closed by the end token, with nothing after it
COMPLETE_REASONING = dict(
    output="This is a reasoning section</think>",
    reasoning="This is a reasoning section",
    content=None,
    is_reasoning_end=True,
)

# Case: end token not seen yet (still streaming, so all text is reasoning)
NO_END_TOKEN = dict(
    output="This is reasoning in progress",
    reasoning="This is reasoning in progress",
    content=None,
    is_reasoning_end=False,
)

# Case: newlines inside both the reasoning and the response
MULTIPLE_LINES = dict(
    output="First line\nSecond line</think>Response first line\nResponse second",
    reasoning="First line\nSecond line",
    content="Response first line\nResponse second",
    is_reasoning_end=True,
)

# Case: output opens with the end token (empty reasoning, immediate response)
SHORTEST_REASONING_NO_STREAMING = dict(
    output="</think>This is the response",
    reasoning="",
    content="This is the response",
    is_reasoning_end=True,
)

# Case: same output when streaming (reasoning is None: only the token itself)
SHORTEST_REASONING_STREAMING = dict(
    output="</think>This is the response",
    reasoning=None,
    content="This is the response",
    is_reasoning_end=True,
)

# Case: empty output, non-streaming
EMPTY = dict(
    output="",
    reasoning="",
    content=None,
    is_reasoning_end=False,
)

# Case: empty output, streaming
EMPTY_STREAMING = dict(
    output="",
    reasoning=None,
    content=None,
    is_reasoning_end=False,
)

# Case: reasoning containing punctuation and arithmetic symbols
SPECIAL_CHARS = dict(
    output="Let me think... 1+1=2, right?</think>Yes, 1+1=2.",
    reasoning="Let me think... 1+1=2, right?",
    content="Yes, 1+1=2.",
    is_reasoning_end=True,
)

# Case: reasoning containing a fenced code block
CODE_IN_REASONING = dict(
    output="```python\nprint('hello')\n```</think>Here is the code.",
    reasoning="```python\nprint('hello')\n```",
    content="Here is the code.",
    is_reasoning_end=True,
)
# Every (id, non-streaming case, streaming case) row expands into two params:
# the non-streaming run first, then the streaming run with "_streaming"
# appended to its id. Rows whose two cases differ have mode-specific
# expectations.
TEST_CASES = [
    # Core cases: no start token (MiniMax M2 actual behavior)
    pytest.param(
        streaming,
        case,
        id=f"{case_id}_streaming" if streaming else case_id,
    )
    for case_id, plain_case, streamed_case in (
        ("simple_reasoning", SIMPLE_REASONING, SIMPLE_REASONING),
        ("complete_reasoning", COMPLETE_REASONING, COMPLETE_REASONING),
        ("no_end_token", NO_END_TOKEN, NO_END_TOKEN),
        ("multiple_lines", MULTIPLE_LINES, MULTIPLE_LINES),
        (
            "shortest_reasoning",
            SHORTEST_REASONING_NO_STREAMING,
            SHORTEST_REASONING_STREAMING,
        ),
        ("empty", EMPTY, EMPTY_STREAMING),
        ("special_chars", SPECIAL_CHARS, SPECIAL_CHARS),
        ("code_in_reasoning", CODE_IN_REASONING, CODE_IN_REASONING),
    )
    for streaming, case in ((False, plain_case), (True, streamed_case))
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    minimax_m2_tokenizer,
):
    """Check one case against the parser's text- and id-level extraction.

    ``streaming`` is forwarded to ``run_reasoning_extraction``; ``param_dict``
    supplies the raw ``output`` text and the expected ``reasoning``,
    ``content`` and ``is_reasoning_end`` values.
    """
    output = minimax_m2_tokenizer.tokenize(param_dict["output"])
    # decode everything to tokens
    output_tokens: list[str] = [
        minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
        parser_name
    )(minimax_m2_tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser, output_tokens, streaming=streaming
    )

    assert reasoning == param_dict["reasoning"]
    assert content == param_dict["content"]

    # Test is_reasoning_end
    output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output)
    is_reasoning_end = parser.is_reasoning_end(output_ids)
    assert is_reasoning_end == param_dict["is_reasoning_end"]

    # Test extract_content
    # Always pass token ids. The previous "no content" branch passed the token
    # strings, which an integer end-token id can never match, so that
    # assertion did not exercise the parser on real input.
    if param_dict["content"] is not None:
        expected_content_ids = minimax_m2_tokenizer.convert_tokens_to_ids(
            minimax_m2_tokenizer.tokenize(param_dict["content"])
        )
    else:
        expected_content_ids = []
    assert parser.extract_content_ids(output_ids) == expected_content_ids
vllm/reasoning/minimax_m2_reasoning_parser.py
View file @
6299628d
...
...
@@ -19,6 +19,10 @@ logger = init_logger(__name__)
class
MiniMaxM2ReasoningParser
(
BaseThinkingReasoningParser
):
"""
Reasoning parser for MiniMax M2 model.
MiniMax M2 models don't generate <think> start token, only </think> end
token. All content before </think> is reasoning, content after is the
actual response.
"""
@
property
...
...
@@ -31,6 +35,45 @@ class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
"""The token that ends reasoning content."""
return
"</think>"
def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    """
    Classify one streamed delta as reasoning, content, or both.

    MiniMax M2 models never emit the <think> start token, so every delta
    counts as reasoning until the </think> end token shows up; whatever
    follows the end token is content.

    Returns None when the delta consists of the end token alone, otherwise
    a DeltaMessage with ``reasoning`` and/or ``content`` set. Only
    ``delta_text``, ``previous_token_ids`` and ``delta_token_ids`` are read.
    """
    end_id = self.end_token_id

    # A delta holding nothing but the end token has no text to emit.
    if len(delta_token_ids) == 1 and delta_token_ids[0] == end_id:
        return None

    # The end token was seen in an earlier delta: this delta is all content.
    if end_id in previous_token_ids:
        return DeltaMessage(content=delta_text)

    # The end token has not appeared at all yet: this delta is all reasoning.
    if end_id not in delta_token_ids:
        return DeltaMessage(reasoning=delta_text)

    # The end token arrives inside this delta: text before the tag is
    # reasoning, text after it is content. Empty halves are reported as None.
    # NOTE(review): if the tag text is missing from delta_text, find()
    # returns -1 and the slices below are taken relative to that index.
    closing_tag = self.end_token
    split_at = delta_text.find(closing_tag)
    reasoning_part = delta_text[:split_at]
    content_part = delta_text[split_at + len(closing_tag) :]
    return DeltaMessage(
        reasoning=reasoning_part or None,
        content=content_part or None,
    )
class
MiniMaxM2AppendThinkReasoningParser
(
ReasoningParser
):
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment