"""Offline chat demo: load a local Baichuan-M3 checkpoint and print one reply."""

import os

# Force offline mode BEFORE importing transformers: some transformers /
# modelscope versions read these flags at import time, so setting them after
# the import (as the original did) may be too late — TODO confirm for the
# pinned version.
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['MODELSCOPE_OFFLINE'] = '1'

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local checkpoint directory (offline load — no hub download).
MODEL_PATH = "/home/download/baichuan-inc/Baichuan-M3-235B"


def main() -> None:
    """Load model and tokenizer from MODEL_PATH, generate one answer, print it."""
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        trust_remote_code=True,   # custom modeling code ships with the checkpoint
        device_map="auto",        # shard across available devices
        dtype=torch.bfloat16,
    )
    # BUG FIX: the original assigned this to `enizer` (truncated name) while
    # all later uses referenced `tokenizer`, raising NameError at runtime.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

    messages = [
        {
            "role": "user",
            "content": "I've been having headaches lately, especially worse in the afternoon. What should I do?",
        }
    ]
    # Render the chat template to a plain prompt string. `thinking_mode` is a
    # model-specific template kwarg (presumably toggles the model's reasoning
    # trace — verify against the checkpoint's chat template).
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        thinking_mode='on',
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=32768,
        temperature=0.6,
    )
    # Slice off the prompt tokens so only the newly generated text is decoded.
    response = tokenizer.decode(
        generated_ids[0][len(model_inputs.input_ids[0]):],
        skip_special_tokens=True,
    )
    print(response)


if __name__ == "__main__":
    main()