# Load model directly
import argparse

import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer

## params
parser = argparse.ArgumentParser()
parser.add_argument('--model_path_or_name', default="IEITYuan/Yuan2-M32-hf", help='model path')
args = parser.parse_args()

model_path_or_name = args.model_path_or_name
device = "cuda"

# Build the Yuan2 tokenizer; '<eod>' is the model's end-of-document token.
tokenizer = LlamaTokenizer.from_pretrained(model_path_or_name, add_eos_token=False, add_bos_token=False, eos_token='<eod>')
# Register the Yuan2 special tokens (separator/pad/mask, fill-in-the-middle, commit, and Jupyter markers).
tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>', '<commit_before>', '<commit_msg>', '<commit_after>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<empty_output>'], special_tokens=True)

# Load the model with its custom remote code, sharded across available GPUs in fp16.
model = AutoModelForCausalLM.from_pretrained(model_path_or_name, trust_remote_code=True, device_map='auto', torch_dtype=torch.float16)

prompt = "写一篇春游作文"  # "Write an essay about a spring outing"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)

# Greedy decoding; max_length counts the prompt tokens plus generated tokens.
outputs = model.generate(input_ids, do_sample=False, max_length=100)
result = tokenizer.decode(outputs[0])
print("***", result)
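
# Usage sketch: the script reads the model location from the --model_path_or_name
# flag defined above (defaulting to the Hub id "IEITYuan/Yuan2-M32-hf").
# The filename run_yuan2.py below is hypothetical; the local checkpoint path is a placeholder.
#
#   python run_yuan2.py --model_path_or_name IEITYuan/Yuan2-M32-hf
#   python run_yuan2.py --model_path_or_name /path/to/local/checkpoint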