# hf_causal_model.py
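"""Evaluate Hugging Face causal language models on a multiple-choice benchmark.

Loads a base model (LLaMA-family via the Llama classes, ChatGLM via AutoModel,
anything else via AutoModelForCausalLM), optionally applies LoRA adapter
weights, builds few-shot prompts from a dev split, and grades the model's
answers against the last column of the test split via run_eval from utils.
"""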
import torch
import numpy as np
import argparse
import time
from tqdm import tqdm
from utils import choices, format_example, gen_prompt, run_eval

from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer, AutoModel
from transformers.generation.utils import GenerationConfig

def eval(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot):
    # NOTE: relies on the module-level flags is_chat_history / is_chat_model set in __main__
    cors = []
    all_preds = []

    for i in tqdm(range(test_df.shape[0])):
        prompt_end = format_example(test_df, i, subject, include_answer=False, cot=cot)
        prompt = gen_prompt(dev_df=dev_df,
                            subject=subject,
                            prompt_end=prompt_end,
                            num_few_shot=num_few_shot,
                            tokenizer=tokenizer,
                            max_length=max_length)
        label = test_df.iloc[i, -1]  # the gold answer sits in the last column

        with torch.no_grad():
            if is_chat_history:
                # Chat interface that returns (reply, history), e.g. ChatGLM and InternLM-Chat
                pred, history = model.chat(tokenizer, prompt, history=[])
            elif is_chat_model:
                # Chat interface that takes a message list, e.g. Baichuan-13B-Chat
                messages = [{"role": "user", "content": prompt}]
                pred = model.chat(tokenizer, messages)
            else:
                # Plain causal LM: generate, then decode only the newly generated tokens
                inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
                outputs = model.generate(**inputs, max_new_tokens=512, repetition_penalty=1.1)
                pred = tokenizer.decode(outputs.cpu()[0][len(inputs.input_ids[0]):], skip_special_tokens=True)

        all_preds.append(pred.replace("\n", ""))
        # Only grade predictions whose first character is a valid choice letter
        if pred and pred[0] in choices:
            cors.append(pred[0] == label)

    acc = np.mean(cors) if cors else 0.0  # guard against np.mean([]) returning NaN
    print(f"Average accuracy {acc:.4f} - {subject}")
    print(f"{len(cors)} graded answers, {len(all_preds) - len(cors)} improperly formatted answers.")
    return all_preds
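
# The prompt built by gen_prompt/format_example (defined in utils) is assumed to
# follow the usual few-shot multiple-choice layout; a rough sketch (the exact
# template lives in utils.py):
#
#   <subject header>
#   <dev question 1>  A. ...  B. ...  C. ...  D. ...  Answer: <label>
#   ... up to num_few_shot examples ...
#   <test question>   A. ...  B. ...  C. ...  D. ...  Answer:
#
# which is why eval() grades only predictions whose first character is a choice letter.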


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name_or_path", type=str, default="")
    parser.add_argument("--lora_weights", type=str, default="")
    parser.add_argument("--data_dir", type=str, default="../data")
    parser.add_argument("--save_dir", type=str, default="../results/not_specified")
    parser.add_argument("--num_few_shot", type=int, default=0)
    parser.add_argument("--max_length", type=int, default=2048)
    parser.add_argument("--load_in_8bit", action='store_true')
    parser.add_argument("--with_conf", action='store_true')
    parser.add_argument("--cot", action='store_true')
    args = parser.parse_args()
    print('\n\n\n=========================')
    print(f'args = {args}')
    print('=========================')
    
    # LLaMA-family checkpoints (including Alpaca) need the Llama-specific classes
    is_llama = 'llama' in args.model_name_or_path.lower() \
                or 'alpaca' in args.model_name_or_path.lower()
    if is_llama:
        is_chat_model = False  # LLaMA-family models do not expose a model.chat() method
    else:
        is_chat_model = 'chat' in args.model_name_or_path.lower()

    is_chat_history = 'chatglm' in args.model_name_or_path.lower() \
        or 'internlm-chat' in args.model_name_or_path.lower() \
        or 'qwen-7b-chat' in args.model_name_or_path.lower()
    is_chatglm = 'chatglm' in args.model_name_or_path.lower()

    print(f'model: {args.model_name_or_path}')
    if is_llama:
        tokenizer_class = LlamaTokenizer
        model_class = LlamaForCausalLM
    elif is_chatglm:
        tokenizer_class = AutoTokenizer
        model_class = AutoModel
    else:
        tokenizer_class = AutoTokenizer
        model_class = AutoModelForCausalLM
    tic = time.time()
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, trust_remote_code=True)
    model = model_class.from_pretrained(args.model_name_or_path,
                                        trust_remote_code=True,
                                        load_in_8bit=args.load_in_8bit,
                                        device_map="auto"
                                        )
    if is_chatglm and not args.load_in_8bit:
        model = model.half().cuda()  # ChatGLM runs in fp16; skip the cast when 8-bit quantized
    if is_chat_model and not is_chatglm:
        model.generation_config = GenerationConfig.from_pretrained(args.model_name_or_path)
    print(model.generation_config)
    print(f'loaded model: {args.model_name_or_path}  load time = {time.time() - tic:.2f}s')
    
    # Optionally apply LoRA adapter weights on top of the base model
    if args.lora_weights != "":
        model = PeftModel.from_pretrained(
                        model,
                        args.lora_weights,
                        torch_dtype=torch.float16,
                        )
        
    run_eval(model, tokenizer, eval, args)
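
# Example invocations (paths are illustrative, not from this repo):
#   python hf_causal_model.py --model_name_or_path /path/to/Llama-2-7b --num_few_shot 5
#   python hf_causal_model.py --model_name_or_path THUDM/chatglm2-6b --cot
#   python hf_causal_model.py --model_name_or_path /path/to/base-model \
#       --lora_weights /path/to/lora-adapter --load_in_8bit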