infer.py 1.87 KB
Newer Older
chenzk's avatar
v1.0  
chenzk committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
import torch
import torch.nn.functional as F
import time
# Fix the RNG seed before model construction so any randomly initialized
# weights (e.g. the freshly added classification head) are reproducible.
torch.manual_seed(0)

# Alternative path: a LoRA fine-tuned checkpoint; "xxx" is a timestamped run directory.
# path = "output/AdvertiseGenLoRA/xxx/checkpoint-3000" # xxx: system-time run directory
num_labels = 2 # if cls=n: num_labels = n  (n-way classification)
# Example input text to classify (ad-copy style product description).
content = "简约而不简单的牛仔外套,白色的衣身十分百搭。衣身多处有做旧破洞设计,打破单调乏味,增加一丝造型看点。衣身后背处有趣味刺绣装饰,丰富层次感,彰显别样时尚。"
path = 'checkpoint/miniCPM-bf16'  # base model checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load as a bfloat16 sequence-classification model. trust_remote_code=True is
# needed for miniCPM's custom model class — NOTE: this executes Python code
# shipped with the checkpoint, so only use trusted checkpoints.
model = AutoModelForSequenceClassification.from_pretrained(path, torch_dtype=torch.bfloat16, num_labels=num_labels, device_map=device, trust_remote_code=True)

# Build the model input ids: <bos> + user-role marker tokens + encoded content,
# truncated to model_max_length and right-padded with <eos>.
# NOTE(review): `input` shadows the builtin; the name is kept because the
# inference code below reads it.
input = [tokenizer.bos_token_id]
content = tokenizer.encode(content, add_special_tokens=False)
# Hard-coded token ids for the chat "user" role prefix — TODO confirm these
# match the tokenizer's chat template for this checkpoint.
user_tokens = [1786, 4194, 95388]
model_max_length = 4096
input += user_tokens + content
input = input[: model_max_length]
# Right-pad with <eos>; padded positions are masked out below.
input += [tokenizer.eos_token_id] * (model_max_length - len(input))
input = torch.LongTensor(input).unsqueeze(0).to(device)  # shape (1, model_max_length)
# Attend to every non-<eos> position. `input` is already (1, seq_len) and on
# `device`, so the mask needs no extra unsqueeze/to(device): the original
# `.unsqueeze(0)` here produced a nonstandard (1, 1, seq_len) mask.
attention_mask = input.ne(tokenizer.eos_token_id)

# Single forward pass (no gradients) to obtain classification logits.
with torch.no_grad():
  result = model(
    input,
    attention_mask=attention_mask,
    position_ids=None,
    past_key_values=None,
    inputs_embeds=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=True,
  )

# result[0] holds the raw logits; normalize per row and take the best class.
probs = torch.softmax(result[0].view(-1, num_labels), dim=1)
prediction = probs.argmax(dim=1)
print(prediction)