Commit 20e943f8 authored by shihm's avatar shihm
Browse files

add inference.py

parent 4bc377fc
......@@ -59,37 +59,7 @@ docker run -it \
### transformers
#### 单机推理
```bash
python
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import torch
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['MODELSCOPE_OFFLINE'] = '1'
model_path = "/baichuan-inc/Baichuan-M3-235B"
model = AutoModelForCausalLM.from_pretrained(
model_path,
trust_remote_code=True,
device_map="auto",
torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
messages = [{"role": "user", "content": "I've been having headaches lately, especially worse in the afternoon. What should I do?"}]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
thinking_mode='on'
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(
**model_inputs,
max_new_tokens=32768,
temperature=0.6
)
response = tokenizer.decode(generated_ids[0][len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(response)
python inference.py
```
### vllm
......@@ -136,6 +106,7 @@ curl http://localhost:8000/v1/chat/completions \
| 模型名称 | 权重大小 | DCU型号 | 最低卡数需求 |下载地址|
|:-----:|:----------:|:----------:|:---------------------:|:----------:|
| Baichuan-M3-235B | 235B | BW1000 | 8 | [Modelscope](https://modelscope.cn/models/baichuan-inc/Baichuan-M3-235B) |
| Baichuan-M3-235B-GPTQ-INT4 | 235B | BW1000 | 4 | [Modelscope](https://modelscope.cn/models/baichuan-inc/Baichuan-M3-235B-GPTQ-INT4) |
## 源码仓库及问题反馈
- https://developer.sourcefind.cn/codes/modelzoo/baichuan-m3-235b_vllm
......
"""Single-node inference example for Baichuan-M3-235B with Hugging Face transformers.

Loads local model weights in bfloat16, applies the chat template with the
model-specific "thinking" mode enabled, generates a reply, and prints only
the newly generated text (prompt tokens stripped).
"""
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
import torch

# Path to locally downloaded weights; the offline flags below prevent any
# network access to the HF Hub / ModelScope during loading.
model_path = "/home/download/baichuan-inc/Baichuan-M3-235B"
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['MODELSCOPE_OFFLINE'] = '1'

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,   # model repo ships custom modeling code
    device_map="auto",        # shard weights across all visible devices
    dtype=torch.bfloat16
)
# BUG FIX: this was assigned to a truncated name `enizer`, leaving
# `tokenizer` undefined and causing a NameError on the next use.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

messages = [{"role": "user", "content": "I've been having headaches lately, especially worse in the afternoon. What should I do?"}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    thinking_mode='on'  # model-specific kwarg; presumably enables reasoning output — confirm against model card
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768,
    temperature=0.6
)
# Slice off the prompt tokens so only the generated continuation is decoded.
response = tokenizer.decode(generated_ids[0][len(model_inputs.input_ids[0]):], skip_special_tokens=True)
print(response)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment