"...git@developer.sourcefind.cn:jerrrrry/infinicore.git" did not exist on "1d95ddf31e32b8f57fc312f2b5ba1befcceb5df7"
Commit 20e943f8 authored by shihm's avatar shihm
Browse files

add inference.py

parent 4bc377fc
...@@ -59,37 +59,7 @@ docker run -it \ ...@@ -59,37 +59,7 @@ docker run -it \
### transformers
#### 单机推理
```bash
python inference.py
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import torch
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['MODELSCOPE_OFFLINE'] = '1'
model_path = "/baichuan-inc/Baichuan-M3-235B"
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
device_map="auto",
torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
messages = [{"role": "user", "content": "I've been having headaches lately, especially worse in the afternoon. What should I do?"}]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
thinking_mode='on'
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(
**model_inputs,
max_new_tokens=32768,
temperature=0.6
)
response = tokenizer.decode(generated_ids[0][len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(response)
```
### vllm
...@@ -136,6 +106,7 @@ curl http://localhost:8000/v1/chat/completions \ ...@@ -136,6 +106,7 @@ curl http://localhost:8000/v1/chat/completions \
| 模型名称 | 权重大小 | DCU型号 | 最低卡数需求 |下载地址|
|:-----:|:----------:|:----------:|:---------------------:|:----------:|
| Baichuan-M3-235B | 235B | BW1000 | 8 | [Modelscope](https://modelscope.cn/models/baichuan-inc/Baichuan-M3-235B) |
| Baichuan-M3-235B-GPTQ-INT4 | 235B | BW1000 | 4 | [Modelscope](https://modelscope.cn/models/baichuan-inc/Baichuan-M3-235B-GPTQ-INT4) |
## 源码仓库及问题反馈
- https://developer.sourcefind.cn/codes/modelzoo/baichuan-m3-235b_vllm
......
"""Minimal offline inference script for Baichuan-M3-235B with transformers.

Loads the model from a local checkpoint directory in offline mode, builds a
chat prompt via the tokenizer's chat template, generates a completion, and
prints only the newly generated text.
"""
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import torch

# Local checkpoint path; adjust to wherever the weights were downloaded.
model_path = "/home/download/baichuan-inc/Baichuan-M3-235B"

# Force offline mode so transformers / modelscope never try to hit the network.
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['MODELSCOPE_OFFLINE'] = '1'

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map="auto",     # shard the 235B model across all visible accelerators
    # NOTE(review): the `dtype` kwarg is only accepted by recent transformers
    # releases; on older versions use `torch_dtype=torch.bfloat16` instead
    # (the pre-commit version of this snippet used `torch_dtype`).
    dtype=torch.bfloat16
)
# BUG FIX: the original line read `enizer = AutoTokenizer...` (truncated name),
# but every later line references `tokenizer`, which raised a NameError.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

messages = [{"role": "user", "content": "I've been having headaches lately, especially worse in the afternoon. What should I do?"}]
# Render the chat messages into a single prompt string; `thinking_mode='on'`
# is a model-specific template flag handled by the remote tokenizer code.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    thinking_mode='on'
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768,
    temperature=0.6
)
# Slice off the prompt tokens so only the generated continuation is decoded.
response = tokenizer.decode(generated_ids[0][len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(response)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment