import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

model_name_or_path = "/home/wanglch/projects/XuanYuan/XuanYuan-13B-Chat"

# Load the tokenizer and model; device_map="auto" places the weights
# across the available GPUs automatically.
tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path, use_fast=False, legacy=True, trust_remote_code=True)
model = LlamaForCausalLM.from_pretrained(model_name_or_path, device_map="auto", trust_remote_code=True)
model.eval()

# Chat convention: turns are labeled "Human"/"Assistant"; seps[0] follows a
# human turn, seps[1] follows an assistant turn.
seps = [" ", ""]
roles = ["Human", "Assistant"]

# Query (Chinese): "How can an online lending institution confirm that this
# loan application was submitted by the borrower themselves?"
content = "互联网金融机构如何确认该笔贷款是由本人申请的?"
# Single-turn prompt: "Human: <content> Assistant:" — the trailing
# "Assistant:" cues the model to generate a reply.
prompt = roles[0] + ": " + content + seps[0] + roles[1] + ":"
print(f"Input: {content}")

# Move the tokenized prompt to the model's device rather than hard-coding "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.95)
# Decode only the newly generated tokens, slicing off the prompt.
response = tokenizer.decode(outputs.cpu()[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
print(f"Output: {response}")
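
# A minimal multi-turn sketch (an assumption, not confirmed by the official
# docs): it presumes the chat format simply concatenates turns, appending
# seps[0] after each human message and seps[1] after each assistant message.
# The build_prompt helper, the history list, and follow_up are hypothetical
# names introduced here for illustration.
def build_prompt(history, question):
    # history: list of (human_msg, assistant_msg) pairs from earlier turns.
    prompt = ""
    for human_msg, assistant_msg in history:
        prompt += roles[0] + ": " + human_msg + seps[0]
        prompt += roles[1] + ": " + assistant_msg + seps[1]
    # End with a bare "Assistant:" to cue the next reply.
    prompt += roles[0] + ": " + question + seps[0] + roles[1] + ":"
    return prompt

# Usage example: carry the first exchange into a follow-up question
# (Chinese: "Please give a concrete example of a verification workflow.").
history = [(content, response)]
follow_up = "请举一个具体的核验流程示例。"
inputs = tokenizer(build_prompt(history, follow_up), return_tensors="pt").to(model.device)
with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.95)
print(tokenizer.decode(outputs.cpu()[0][len(inputs.input_ids[0]):], skip_special_tokens=True))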