Commit 688d8492 authored by chenych

Update minimax-m2.1 tool call

parent 9d2097be
......@@ -38,6 +38,12 @@ docker run -it --shm-size 60g --network=host --name minimax_m2 --privileged --de
More images are available for download at [光源](https://sourcefind.cn/#/service-list).
The special deep learning libraries required by DCU GPUs for this project can be downloaded and installed from the [光合](https://developer.sourcefind.cn/tool/) developer community.
Replace the vllm files:
```bash
# /path/of/vllm can be obtained from the "Location" field in the output of "pip show vllm"
cp codes/minimax_m2_tool_parser.py /path/of/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
cp codes/minimax_m2_reasoning_parser.py /path/of/vllm/reasoning/minimax_m2_reasoning_parser.py
```
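To locate the vllm installation directory programmatically instead of reading the `pip show vllm` output by hand, a minimal Python sketch (assuming vllm is importable in the current environment) is:
```python
# Minimal sketch: print the directory that "/path/of/vllm" above refers to.
# Assumes vllm is installed in the active Python environment.
import importlib.util
import pathlib

spec = importlib.util.find_spec("vllm")
if spec is None or spec.origin is None:
    raise RuntimeError("vllm is not installed in this environment")
print(pathlib.Path(spec.origin).parent)  # e.g. .../site-packages/vllm
```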
## Dataset
None at the moment.
......@@ -71,11 +77,10 @@ cp /path/of/MiniMax/MiniMax-M2/vocab.json /path/of/MiniMax/MiniMax-M2-bf16
### vllm
#### Single-node inference
- MiniMax-M2
```bash
## Start the server
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_MLA_DISABLE=0
export VLLM_USE_FLASH_MLA=1
vllm serve /path/of/MiniMax/MiniMax-M2-bf16/ \
--trust-remote-code \
......@@ -98,6 +103,38 @@ curl http://localhost:8000/v1/chat/completions \
}'
```
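The same request can also be sent through the OpenAI Python SDK instead of curl. A minimal sketch, assuming the `openai` package is installed and the MiniMax-M2 server above is listening on the default port 8000:
```python
# Minimal sketch: query the MiniMax-M2 server via its OpenAI-compatible API.
# Assumes `pip install openai` and a vllm server running on localhost:8000.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
model_id = client.models.list().data[0].id  # whatever name the server registered

response = client.chat.completions.create(
    model=model_id,
    messages=[{"role": "user", "content": "Briefly describe Newton's three laws of motion."}],
)
print(response.choices[0].message.content)
```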
- MiniMax-M2.1
```bash
## Start the server
vllm serve /path/of/MiniMax/MiniMax-M2.1-bf16 \
--trust-remote-code \
--served-model-name minimax-m2.1 \
--max-model-len 32768 \
--dtype bfloat16 \
-tp 8 \
--port 8001 \
--enable-auto-tool-choice \
--tool-call-parser minimax-m2 \
--enable-expert-parallel \
--reasoning-parser minimax_m2
## Client access
curl http://localhost:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "minimax-m2.1",
"messages": [
{
"role": "user",
"content": "牛顿提出了哪三大运动定律?请简要说明。"
}
]
}'
## Offline tool calling
python offline_tools.py
```
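Because the server is started with `--reasoning-parser minimax_m2`, the OpenAI-compatible responses split the model's thinking from the final answer. The sketch below shows how a client can read both parts; it assumes the `openai` package is installed, the M2.1 server above is on port 8001, and that vLLM exposes the split as a `reasoning_content` field alongside `content`:
```python
# Minimal sketch: read reasoning and answer separately from the M2.1 server.
# Assumes a vllm server started as above on localhost:8001 with
# --reasoning-parser minimax_m2; the reasoning_content field name follows
# vLLM's OpenAI-compatible API and is an assumption here.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8001/v1", api_key="dummy")

response = client.chat.completions.create(
    model="minimax-m2.1",
    messages=[{"role": "user", "content": "Briefly explain Newton's three laws of motion."}],
)

message = response.choices[0].message
print("Reasoning:", getattr(message, "reasoning_content", None))  # text before </think>
print("Answer:", message.content)                                 # text after </think>
```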
## Results
- MiniMax-M2 model results
<div align=center>
......@@ -105,11 +142,15 @@ curl http://localhost:8000/v1/chat/completions \
</div>
- MiniMax-M2.1 model results
1. Chat
<div align=center>
<img src="./doc/result-minimax-m2_1.png"/>
</div>
2. Offline tool calling
<div align=center>
<img src="./doc/results-minimax-m2_1-tool.png"/>
</div>
### Accuracy
DCU accuracy is consistent with GPU; inference framework: vllm.
......@@ -117,8 +158,8 @@ DCU与GPU精度一致,推理框架:vllm。
## Pretrained weights
| Model | Weight size | DCU model | Minimum number of cards | Download |
|:-----:|:----------:|:----------:|:---------------------:|:----------:|
| MiniMax-M2 | 230 B | K100AI | 8 | [Download](https://huggingface.co/MiniMaxAI/MiniMax-M2) |
| MiniMax-M2.1 | 230 B | K100AI | 8 | [Download](https://www.modelscope.cn/models/MiniMax/MiniMax-M2.1) |
| MiniMax-M2 | 230 B | K100AI,BW1000 | 8 | [Download](https://huggingface.co/MiniMaxAI/MiniMax-M2) |
| MiniMax-M2.1 | 230 B | K100AI,BW1000 | 8 | [Download](https://www.modelscope.cn/models/MiniMax/MiniMax-M2.1) |
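The weights can also be fetched programmatically. A minimal sketch, assuming the `huggingface_hub` and `modelscope` packages are installed and using the repository IDs from the download links above (target paths are placeholders):
```python
# Minimal sketch: download the weights listed in the table above.
# Assumes `pip install huggingface_hub modelscope`; paths are placeholders.
from huggingface_hub import snapshot_download as hf_snapshot_download
from modelscope import snapshot_download as ms_snapshot_download

# MiniMax-M2 from Hugging Face
hf_snapshot_download(repo_id="MiniMaxAI/MiniMax-M2", local_dir="/path/of/MiniMax/MiniMax-M2")

# MiniMax-M2.1 from ModelScope (stored under the given cache directory)
ms_snapshot_download("MiniMax/MiniMax-M2.1", cache_dir="/path/of/MiniMax")
```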
## Source code repository and issue feedback
- https://developer.sourcefind.cn/codes/modelzoo/minimax-m2_vllm
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence

from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    DeltaMessage,
    ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.transformers_utils.tokenizer import AnyTokenizer

logger = init_logger(__name__)


@ReasoningParserManager.register_module("minimax_m2")
class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for MiniMax M2 models.

    MiniMax M2 models don't generate the <think> start token, only the
    </think> end token. All content before </think> is reasoning; content
    after it is the actual response.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a delta message for streaming.

        MiniMax M2 models don't generate the <think> start token, so all
        content is treated as reasoning until the </think> end token appears.
        """
        # Skip a delta that consists solely of the end token
        if len(delta_token_ids) == 1 and delta_token_ids[0] == self.end_token_id:
            return None

        # If the end token already appeared in previous tokens,
        # we're past the reasoning phase: this delta is content
        if self.end_token_id in previous_token_ids:
            return DeltaMessage(content=delta_text)

        # End token appears inside this delta: split reasoning and content
        if self.end_token_id in delta_token_ids:
            end_index = delta_text.find(self.end_token)
            reasoning = delta_text[:end_index]
            content = delta_text[end_index + len(self.end_token) :]
            return DeltaMessage(
                reasoning=reasoning if reasoning else None,
                content=content if content else None,
            )

        # No end token yet, all content is reasoning
        return DeltaMessage(reasoning=delta_text)


class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
    """
    Alternative reasoning parser for MiniMax M2 models: everything the model
    emits is returned as regular content, with the missing "<think>" start
    tag prepended so downstream consumers see the raw thinking markup.
    """

    def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        self.end_token_id = self.vocab.get("</think>")

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        return self.end_token_id in input_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        return input_ids

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        if len(previous_token_ids) == 0:
            delta_text = "<think>" + delta_text
        return DeltaMessage(content=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        return None, "<think>" + model_output
\ No newline at end of file
(diff of codes/minimax_m2_tool_parser.py is collapsed and not shown)
{
"architectures": [
"MiniMaxM2ForCausalLM"
],
"attention_dropout": 0.0,
"attn_type_list": [
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1
],
"auto_map": {
"AutoConfig": "configuration_minimax_m2.MiniMaxM2Config",
"AutoModelForCausalLM": "modeling_minimax_m2.MiniMaxM2ForCausalLM"
},
"bos_token_id": null,
"eos_token_id": null,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 3072,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layernorm_full_attention_beta": 1.0,
"layernorm_linear_attention_beta": 1.0,
"layernorm_mlp_beta": 1.0,
"max_position_embeddings": 196608,
"mlp_intermediate_size": 8192,
"model_type": "minimax_m2",
"mtp_transformer_layers": 1,
"num_attention_heads": 48,
"num_experts_per_tok": 8,
"num_hidden_layers": 62,
"num_key_value_heads": 8,
"num_local_experts": 256,
"num_mtp_modules": 3,
"output_router_logits": false,
"qk_norm_type": "per_layer",
"rms_norm_eps": 1e-06,
"rope_theta": 5000000,
"rotary_dim": 64,
"router_aux_loss_coef": 0.001,
"router_jitter_noise": 0.0,
"scoring_func": "sigmoid",
"shared_intermediate_size": 0,
"shared_moe_mode": "sigmoid",
"sliding_window": null,
"tie_word_embeddings": false,
"transformers_version": "4.57.1",
"use_cache": true,
"use_mtp": true,
"use_qk_norm": true,
"use_routing_bias": true,
"vocab_size": 200064
}
doc/result-minimax-m2_1.png: image replaced (232 KB -> 257 KB)
from openai import OpenAI
import json

client = OpenAI(base_url="http://localhost:8001/v1", api_key="dummy")


def get_weather(location: str, unit: str):
    return f"Getting the weather for {location} in {unit}..."


tool_functions = {"get_weather": get_weather}

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
            },
            "required": ["location", "unit"]
        }
    }
}]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "What's the weather like in San Francisco? use celsius."}],
    tools=tools,
    tool_choice="auto"
)

print(response)

tool_call = response.choices[0].message.tool_calls[0].function
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
\ No newline at end of file