Commit 688d8492 authored by chenych

Update minimax-m2.1 tool call

parent 9d2097be
......@@ -38,6 +38,12 @@ docker run -it --shm-size 60g --network=host --name minimax_m2 --privileged --de
More images are available for download at [光源](https://sourcefind.cn/#/service-list).
The special deep learning libraries required by DCU GPUs for this project can be downloaded and installed from the [光合](https://developer.sourcefind.cn/tool/) developer community.
Replace the vllm files:
```bash
# /path/of/vllm can be obtained from the "Location" field in the output of "pip show vllm"
cp codes/minimax_m2_tool_parser.py /path/of/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py
cp codes/minimax_m2_reasoning_parser.py /path/of/vllm/reasoning/minimax_m2_reasoning_parser.py
```
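To locate the vllm installation directory programmatically instead of reading the `pip show vllm` output by hand, a minimal Python sketch (assuming vllm is importable in the current environment) is:
```python
# Minimal sketch: print the directory that "/path/of/vllm" above refers to.
# Assumes vllm is installed in the active Python environment.
import importlib.util
import pathlib

spec = importlib.util.find_spec("vllm")
if spec is None or spec.origin is None:
    raise RuntimeError("vllm is not installed in this environment")
print(pathlib.Path(spec.origin).parent)  # e.g. .../site-packages/vllm
```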
## Dataset
None at the moment.
......@@ -71,11 +77,10 @@ cp /path/of/MiniMax/MiniMax-M2/vocab.json /path/of/MiniMax/MiniMax-M2-bf16
### vllm
#### Single-node inference
- MiniMax-M2
```bash
## Start the server
export ALLREDUCE_STREAM_WITH_COMPUTE=1
export VLLM_MLA_DISABLE=0
export VLLM_USE_FLASH_MLA=1
vllm serve /path/of/MiniMax/MiniMax-M2-bf16/ \
--trust-remote-code \
......@@ -98,6 +103,38 @@ curl http://localhost:8000/v1/chat/completions \
}'
```
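The same request can also be sent through the OpenAI Python SDK instead of curl. A minimal sketch, assuming the `openai` package is installed and the MiniMax-M2 server above is listening on the default port 8000:
```python
# Minimal sketch: query the MiniMax-M2 server via its OpenAI-compatible API.
# Assumes `pip install openai` and a vllm server running on localhost:8000.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
model_id = client.models.list().data[0].id  # whatever name the server registered

response = client.chat.completions.create(
    model=model_id,
    messages=[{"role": "user", "content": "Briefly describe Newton's three laws of motion."}],
)
print(response.choices[0].message.content)
```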
- MiniMax-M2.1
```bash
## Start the server
vllm serve /path/of/MiniMax/MiniMax-M2.1-bf16 \
--trust-remote-code \
--served-model-name minimax-m2.1 \
--max-model-len 32768 \
--dtype bfloat16 \
-tp 8 \
--port 8001 \
--enable-auto-tool-choice \
--tool-call-parser minimax-m2 \
--enable-expert-parallel \
--reasoning-parser minimax_m2
## Client access
curl http://localhost:8001/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "minimax-m2.1",
"messages": [
{
"role": "user",
"content": "牛顿提出了哪三大运动定律?请简要说明。"
}
]
}'
## Offline tool calling
python offline_tools.py
```
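Because the server is started with `--reasoning-parser minimax_m2`, the OpenAI-compatible responses split the model's thinking from the final answer. The sketch below shows how a client can read both parts; it assumes the `openai` package is installed, the M2.1 server above is on port 8001, and that vLLM exposes the split as a `reasoning_content` field alongside `content`:
```python
# Minimal sketch: read reasoning and answer separately from the M2.1 server.
# Assumes a vllm server started as above on localhost:8001 with
# --reasoning-parser minimax_m2; the reasoning_content field name follows
# vLLM's OpenAI-compatible API and is an assumption here.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8001/v1", api_key="dummy")

response = client.chat.completions.create(
    model="minimax-m2.1",
    messages=[{"role": "user", "content": "Briefly explain Newton's three laws of motion."}],
)

message = response.choices[0].message
print("Reasoning:", getattr(message, "reasoning_content", None))  # text before </think>
print("Answer:", message.content)                                 # text after </think>
```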
## Results
- MiniMax-M2 model results
<div align=center>
......@@ -105,11 +142,15 @@ curl http://localhost:8000/v1/chat/completions \
</div>
- MiniMax-M2.1 model results
1. Chat
<div align=center>
<img src="./doc/result-minimax-m2_1.png"/>
</div>
2. Offline tool calling
<div align=center>
<img src="./doc/results-minimax-m2_1-tool.png"/>
</div>
### Accuracy
DCU accuracy is consistent with GPU; inference framework: vllm.
......@@ -117,8 +158,8 @@ DCU与GPU精度一致,推理框架:vllm。
## Pretrained weights
| Model | Weight size | DCU model | Minimum number of cards | Download |
|:-----:|:----------:|:----------:|:---------------------:|:----------:|
| MiniMax-M2 | 230 B | K100AI | 8 | [Download](https://huggingface.co/MiniMaxAI/MiniMax-M2) |
| MiniMax-M2.1 | 230 B | K100AI | 8 | [Download](https://www.modelscope.cn/models/MiniMax/MiniMax-M2.1) |
| MiniMax-M2 | 230 B | K100AI,BW1000 | 8 | [Download](https://huggingface.co/MiniMaxAI/MiniMax-M2) |
| MiniMax-M2.1 | 230 B | K100AI,BW1000 | 8 | [Download](https://www.modelscope.cn/models/MiniMax/MiniMax-M2.1) |
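The weights can also be fetched programmatically. A minimal sketch, assuming the `huggingface_hub` and `modelscope` packages are installed and using the repository IDs from the download links above (target paths are placeholders):
```python
# Minimal sketch: download the weights listed in the table above.
# Assumes `pip install huggingface_hub modelscope`; paths are placeholders.
from huggingface_hub import snapshot_download as hf_snapshot_download
from modelscope import snapshot_download as ms_snapshot_download

# MiniMax-M2 from Hugging Face
hf_snapshot_download(repo_id="MiniMaxAI/MiniMax-M2", local_dir="/path/of/MiniMax/MiniMax-M2")

# MiniMax-M2.1 from ModelScope (stored under the given cache directory)
ms_snapshot_download("MiniMax/MiniMax-M2.1", cache_dir="/path/of/MiniMax")
```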
## Source code repository and issue feedback
- https://developer.sourcefind.cn/codes/modelzoo/minimax-m2_vllm
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence

from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    DeltaMessage,
    ResponsesRequest,
)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.transformers_utils.tokenizer import AnyTokenizer

logger = init_logger(__name__)


@ReasoningParserManager.register_module("minimax_m2")
class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for MiniMax M2 models.

    MiniMax M2 models don't generate the <think> start token, only the
    </think> end token. All content before </think> is reasoning; content
    after it is the actual response.
    """

    @property
    def start_token(self) -> str:
        """The token that starts reasoning content."""
        return "<think>"

    @property
    def end_token(self) -> str:
        """The token that ends reasoning content."""
        return "</think>"

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """
        Extract reasoning content from a delta message for streaming.

        MiniMax M2 models don't generate the <think> start token, so all
        content is treated as reasoning until the </think> end token appears.
        """
        # Skip a delta that consists solely of the end token
        if len(delta_token_ids) == 1 and delta_token_ids[0] == self.end_token_id:
            return None

        # If the end token already appeared in previous tokens,
        # we're past the reasoning phase: this delta is content
        if self.end_token_id in previous_token_ids:
            return DeltaMessage(content=delta_text)

        # End token appears inside this delta: split reasoning and content
        if self.end_token_id in delta_token_ids:
            end_index = delta_text.find(self.end_token)
            reasoning = delta_text[:end_index]
            content = delta_text[end_index + len(self.end_token) :]
            return DeltaMessage(
                reasoning=reasoning if reasoning else None,
                content=content if content else None,
            )

        # No end token yet, all content is reasoning
        return DeltaMessage(reasoning=delta_text)


class MiniMaxM2AppendThinkReasoningParser(ReasoningParser):
    """
    Alternative reasoning parser for MiniMax M2 models: everything the model
    emits is returned as regular content, with the missing "<think>" start
    tag prepended so downstream consumers see the raw thinking markup.
    """

    def __init__(self, tokenizer: AnyTokenizer, *args, **kwargs):
        super().__init__(tokenizer, *args, **kwargs)
        self.end_token_id = self.vocab.get("</think>")

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        return self.end_token_id in input_ids

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        return input_ids

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        if len(previous_token_ids) == 0:
            delta_text = "<think>" + delta_text
        return DeltaMessage(content=delta_text)

    def extract_reasoning(
        self, model_output: str, request: ChatCompletionRequest | ResponsesRequest
    ) -> tuple[str | None, str | None]:
        return None, "<think>" + model_output
\ No newline at end of file
(diff of codes/minimax_m2_tool_parser.py is collapsed and not shown)
{
"architectures": [
"MiniMaxM2ForCausalLM"
],
"attention_dropout": 0.0,
"attn_type_list": [
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1,
1
],
"auto_map": {
"AutoConfig": "configuration_minimax_m2.MiniMaxM2Config",
"AutoModelForCausalLM": "modeling_minimax_m2.MiniMaxM2ForCausalLM"
},
"bos_token_id": null,
"eos_token_id": null,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 3072,
"initializer_range": 0.02,
"intermediate_size": 1536,
"layernorm_full_attention_beta": 1.0,
"layernorm_linear_attention_beta": 1.0,
"layernorm_mlp_beta": 1.0,
"max_position_embeddings": 196608,
"mlp_intermediate_size": 8192,
"model_type": "minimax_m2",
"mtp_transformer_layers": 1,
"num_attention_heads": 48,
"num_experts_per_tok": 8,
"num_hidden_layers": 62,
"num_key_value_heads": 8,
"num_local_experts": 256,
"num_mtp_modules": 3,
"output_router_logits": false,
"qk_norm_type": "per_layer",
"rms_norm_eps": 1e-06,
"rope_theta": 5000000,
"rotary_dim": 64,
"router_aux_loss_coef": 0.001,
"router_jitter_noise": 0.0,
"scoring_func": "sigmoid",
"shared_intermediate_size": 0,
"shared_moe_mode": "sigmoid",
"sliding_window": null,
"tie_word_embeddings": false,
"transformers_version": "4.57.1",
"use_cache": true,
"use_mtp": true,
"use_qk_norm": true,
"use_routing_bias": true,
"vocab_size": 200064
}
doc/result-minimax-m2_1.png: image replaced (232 KB -> 257 KB)
from openai import OpenAI
import json

client = OpenAI(base_url="http://localhost:8001/v1", api_key="dummy")


def get_weather(location: str, unit: str):
    return f"Getting the weather for {location} in {unit}..."


tool_functions = {"get_weather": get_weather}

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather in a given location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
            },
            "required": ["location", "unit"]
        }
    }
}]

response = client.chat.completions.create(
    model=client.models.list().data[0].id,
    messages=[{"role": "user", "content": "What's the weather like in San Francisco? use celsius."}],
    tools=tools,
    tool_choice="auto"
)

print(response)

tool_call = response.choices[0].message.tool_calls[0].function
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
print(f"Result: {get_weather(**json.loads(tool_call.arguments))}")
\ No newline at end of file