from transformers import AutoModelForCausalLM, AutoTokenizer
import os
import re

model_name_or_path = os.environ['MODEL_PATH']
# model_name_or_path = "tencent/Hunyuan-A13B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    trust_remote_code=True,
)  # You may want to use bfloat16 and/or move to GPU here

messages = [
    {"role": "user", "content": "Write a short summary of the benefits of regular exercise"},
]

# Build the prompt string; enable_thinking toggles the model's reasoning mode.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,
)

model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
model_inputs.pop("token_type_ids", None)  # generate() does not accept token_type_ids

outputs = model.generate(**model_inputs, max_new_tokens=4096)
output_text = tokenizer.decode(outputs[0])

# The model wraps its reasoning in <think>...</think> and the final reply
# in <answer>...</answer>; extract both sections.
think_pattern = r'<think>(.*?)</think>'
think_matches = re.findall(think_pattern, output_text, re.DOTALL)

answer_pattern = r'<answer>(.*?)</answer>'
answer_matches = re.findall(answer_pattern, output_text, re.DOTALL)

think_content = [match.strip() for match in think_matches][0]
answer_content = [match.strip() for match in answer_matches][0]

print(f"thinking_content:{think_content}\n\n")
print(f"answer_content:{answer_content}\n\n")
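
# Note: the `[0]` indexing above raises an IndexError if the model omits a
# tag. Below is a minimal defensive sketch, assuming the same <think>/<answer>
# tag format as above; `extract_tag` is an illustrative helper, not part of
# the model's or library's API.
def extract_tag(text, tag):
    """Return the first <tag>...</tag> section of `text`, or "" if absent."""
    matches = re.findall(rf'<{tag}>(.*?)</{tag}>', text, re.DOTALL)
    return matches[0].strip() if matches else ""

# Example usage:
# think_content = extract_tag(output_text, "think")
# answer_content = extract_tag(output_text, "answer")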