from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F
import time

torch.manual_seed(0)

# path = "output/AdvertiseGenLoRA/xxx/checkpoint-3000"  # xxx: run directory named by system time
num_labels = 2  # for an n-way classifier, set num_labels = n
content = "简约而不简单的牛仔外套,白色的衣身十分百搭。衣身多处有做旧破洞设计,打破单调乏味,增加一丝造型看点。衣身后背处有趣味刺绣装饰,丰富层次感,彰显别样时尚。"
path = "checkpoint/miniCPM-bf16"

tokenizer = AutoTokenizer.from_pretrained(path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    num_labels=num_labels,
    device_map=device,
    trust_remote_code=True,
)

# start_time = time.time()
# responds, history = model.chat(tokenizer, "山东省最高的山是哪座山, 它比黄山高还是矮?差距多少?", temperature=0.5, top_p=0.8, repetition_penalty=1.02)

# Build the input sequence: <bos> + user-role marker tokens + content,
# truncated to model_max_length and right-padded with <eos>.
input_ids = [tokenizer.bos_token_id]
content_ids = tokenizer.encode(content, add_special_tokens=False)
user_tokens = [1786, 4194, 95388]  # hard-coded ids of the user-role marker in the MiniCPM chat template
model_max_length = 4096

input_ids += user_tokens + content_ids
input_ids = input_ids[:model_max_length]
input_ids += [tokenizer.eos_token_id] * (model_max_length - len(input_ids))
input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(device)  # shape: (1, model_max_length)
# Attend to every non-padding position. input_ids is already batched, so no
# further unsqueeze is needed: the mask must stay 2-D with shape (batch, seq_len).
attention_mask = input_ids.ne(tokenizer.eos_token_id)

with torch.no_grad():
    output = model(
        input_ids,
        attention_mask=attention_mask,
        position_ids=None,
        past_key_values=None,
        inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=True,
    )

probs = F.softmax(output.logits.view(-1, num_labels), dim=1)
# print("infer time:", time.time() - start_time, "s")
pred = probs.argmax(1)
print(pred)
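
# Optional: map the predicted class index back to a human-readable label.
# A minimal sketch, assuming the checkpoint's config carries an id2label
# mapping (Hugging Face configs fall back to LABEL_0/LABEL_1 when none was
# set during fine-tuning); any label names beyond that are hypothetical.
pred_id = pred.item()
label = model.config.id2label.get(pred_id, str(pred_id))
print(f"predicted class: {pred_id} ({label})  p={probs[0, pred_id].item():.4f}")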