"""Score an accuracy run against the reference dataset with ROUGE.

The script loads the reference dataset pickle and the per-query token outputs
dumped by an accuracy run, decodes the generated tokens, computes per-sample
ROUGE scores, and writes the dataset augmented with the generated outputs and
scores to a new pickle.
"""

import argparse
import glob
import os
import pickle
from pathlib import Path

import evaluate
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import LlamaTokenizerFast

# Number of samples the dataset and the accuracy run are both expected to have.
EXPECTED_NUM_SAMPLES = 24576

# Token id treated as end-of-sequence when truncating generated outputs.
EOS_TOKEN_ID = 2


def get_args():
    """Parse and return the command-line arguments."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset-path",
        type=str,
        default=None,
        help="Path to .pkl generated by processorca.py",
    )
    parser.add_argument(
        "--run-outputs",
        type=str,
        default="run_outputs",
        help="Output dir generated by accuracy run.",
    )
    parser.add_argument(
        "--model-dir",
        type=str,
        default=None,
        help="Path to Llamav2 HuggingFace repo clone",
    )
    parser.add_argument(
        "--output-pkl-path",
        type=str,
        default="full_output.pkl",
        help="Path to dump output to",
    )
    return parser.parse_args()


def load_dataset(p: os.PathLike):
    """Load the pickled reference dataset located at *p*.

    The caller indexes the result by column name, so it is expected to be a
    pandas DataFrame.
    """
    print(f"Loading from {p}...")
    return pd.read_pickle(p)


def load_run_outputs(p: os.PathLike):
    """Collect the outputs of an accuracy run, keyed by query id.

    Every file matching ``q*.pkl`` directly inside directory *p* is loaded.
    Each file must hold a mapping with parallel ``"query_ids"`` and
    ``"outputs"`` sequences.

    Returns:
        A dict mapping each query id to its output.

    Raises:
        AssertionError: if a file's ``query_ids`` and ``outputs`` differ in
            length, or if a query id occurs more than once across all files.
    """
    by_query_idx = {}
    for pkl_file in glob.glob(str(Path(p) / "q*.pkl")):
        print(f"Loading from {pkl_file}...")
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at run-output directories you trust.
        with open(pkl_file, "rb") as f:
            d = pickle.load(f)

        query_ids = d["query_ids"]
        outputs = d["outputs"]
        assert len(query_ids) == len(outputs), (
            f"{pkl_file}: {len(query_ids)} query ids but {len(outputs)} outputs"
        )

        for qid, output in zip(query_ids, outputs):
            assert qid not in by_query_idx, f"{pkl_file}: duplicate query id {qid}"
            by_query_idx[qid] = output

    return by_query_idx


def _truncate_at_eos(tokens, eos_token_id=EOS_TOKEN_ID):
    """Return ``(tokens before the first EOS, whether an EOS was found)``."""
    try:
        return tokens[: tokens.index(eos_token_id)], True
    except ValueError:
        return tokens, False


def main(args):
    """Decode the run outputs, score them with ROUGE and dump the results.

    Args:
        args: Namespace providing ``model_dir``, ``dataset_path``,
            ``run_outputs`` and ``output_pkl_path`` (see ``get_args``).

    Raises:
        AssertionError: if the number of run outputs or of per-sample ROUGE
            scores differs from ``EXPECTED_NUM_SAMPLES``.
    """
    # Set up decode and evaluation objects
    tokenizer = LlamaTokenizerFast.from_pretrained(args.model_dir)
    metric = evaluate.load("rouge")
    nltk.download("punkt")

    # Load Data
    df = load_dataset(args.dataset_path)
    run_outputs = load_run_outputs(args.run_outputs)
    assert len(run_outputs) == EXPECTED_NUM_SAMPLES, (
        f"expected {EXPECTED_NUM_SAMPLES} run outputs, found {len(run_outputs)}"
    )

    # Set up columns to add. They are filled by query id rather than in
    # iteration order, hence the pre-sized lists.
    output_tok_ids_col = [None] * EXPECTED_NUM_SAMPLES
    output_text_col = [None] * EXPECTED_NUM_SAMPLES
    output_lens = [None] * EXPECTED_NUM_SAMPLES

    # Process data
    no_eos_ids = []
    for qid, output in tqdm(run_outputs.items()):
        # Keep only the tokens generated before the first EOS token.
        tokens, found_eos = _truncate_at_eos(list(output))
        if not found_eos:
            no_eos_ids.append(qid)

        # An output that begins with EOS truncates to an empty list, so the
        # sanity check must not index into it unconditionally.
        assert not tokens or tokens[-1] != EOS_TOKEN_ID
        output_tok_ids_col[qid] = tokens
        output_lens[qid] = len(tokens)

        # Decode tokens
        output_text_col[qid] = tokenizer.decode(tokens, skip_special_tokens=True)

    print(f"Found {len(no_eos_ids)} samples with no EOS token")

    print("Calculating rouge scores...")

    def _preproc(s):
        # Put each sentence on its own line before scoring.
        return "\n".join(nltk.sent_tokenize(s.strip()))

    preds = [_preproc(text) for text in output_text_col]
    targets = [_preproc(text) for text in df["output"]]
    rouge_scores = metric.compute(
        predictions=preds,
        references=targets,
        use_stemmer=True,
        use_aggregator=False,
    )
    for key in ("rouge1", "rouge2", "rougeL"):
        assert len(rouge_scores[key]) == EXPECTED_NUM_SAMPLES, (
            f"{key}: expected {EXPECTED_NUM_SAMPLES} scores, "
            f"got {len(rouge_scores[key])}"
        )

    # Report the mean of each metric scaled by 100.
    agg = {k: round(np.mean(v) * 100, 4) for k, v in rouge_scores.items()}
    print(agg)
    print("Avg output seqlen:", np.mean(output_lens))

    # Set columns
    df["gen_output_tok_id"] = output_tok_ids_col
    df["gen_output_text"] = output_text_col
    df["gen_output_tok_len"] = output_lens
    df["rouge1"] = rouge_scores["rouge1"]
    df["rouge2"] = rouge_scores["rouge2"]
    df["rougeL"] = rouge_scores["rougeL"]

    p = Path(args.output_pkl_path)
    # parents=True lets the output path point into a not-yet-existing tree.
    p.parent.mkdir(parents=True, exist_ok=True)
    df.to_pickle(p)
    print(f"Dumped to {p}")


if __name__ == "__main__":
    main(get_args())