gaoqiong / lm-evaluation-harness

Commit a8601618, authored Dec 18, 2024 by Baber

add hotpotqa_e

Parent: 8558b8d4

Showing 3 changed files with 296 additions and 0 deletions (+296, -0)
lm_eval/tasks/longbench/longbench.yaml  +20  -0
lm_eval/tasks/longbench/metrics.py      +153 -0
lm_eval/tasks/longbench/utils.py        +123 -0
lm_eval/tasks/longbench/longbench.yaml  0 → 100644
task: longbench
dataset_path: THUDM/LongBench
dataset_name: hotpotqa_e
output_type: generate_until
test_split: test
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: "{{answers}}"
generation_kwargs:
  max_gen_toks: 32
  temperature: 1
  do_sample: false
metric_list:
  - metric: !function metrics.qa_f1_score
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
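The `!function metrics.qa_f1_score` hook points at the scoring function defined in metrics.py below. As a quick smoke test of the config, something like the following sketch should work; it is not part of the commit and assumes the harness's standard simple_evaluate entry point, that the YAML is discovered under its registered name "longbench", and a placeholder HF model.

# Minimal smoke test for the new task config (sketch; not part of this commit).
# Assumes lm-evaluation-harness is installed along with the extra dependencies
# the metrics below pull in (jieba, fuzzywuzzy, rouge).
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model choice
    tasks=["longbench"],  # task name as declared in the YAML above
    limit=8,              # score only a handful of hotpotqa_e documents
)
print(results["results"]["longbench"])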
lm_eval/tasks/longbench/metrics.py  0 → 100644
import re
import string
from collections import Counter

import jieba
from fuzzywuzzy import fuzz
from rouge import Rouge


def normalize_answer(s: str):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text: str):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def normalize_zh_answer(s: str):
    """Lower text and remove punctuation, extra whitespace."""

    def white_space_fix(text):
        return "".join(text.split())

    def remove_punc(text):
        cn_punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
        all_punctuation = set(string.punctuation + cn_punctuation)
        return "".join(ch for ch in text if ch not in all_punctuation)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(s)))

def count_score(prediction, ground_truth, **kwargs):
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)


def retrieval_score(prediction, ground_truth, **kwargs):
    pattern = r"Paragraph (\d+)"
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)


def retrieval_zh_score(prediction, ground_truth, **kwargs):
    pattern = r"段落(\d+)"
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)


def code_sim_score(prediction, ground_truth, **kwargs):
    all_lines = prediction.lstrip("\n").split("\n")
    prediction = ""
    for line in all_lines:
        if ("`" not in line) and ("#" not in line) and ("//" not in line):
            prediction = line
            break
    return fuzz.ratio(prediction, ground_truth) / 100


def classification_score(prediction, ground_truth, **kwargs):
    em_match_list = []
    all_classes = kwargs["all_classes"]
    for class_name in all_classes:
        if class_name in prediction:
            em_match_list.append(class_name)
    for match_term in em_match_list:
        if match_term in ground_truth and match_term != ground_truth:
            em_match_list.remove(match_term)
    if ground_truth in em_match_list:
        score = 1.0 / len(em_match_list)
    else:
        score = 0.0
    return score


def rouge_score(prediction, ground_truth, **kwargs):
    rouge = Rouge()
    try:
        scores = rouge.get_scores([prediction], [ground_truth], avg=True)
    # ruff: noqa
    except:
        return 0.0
    return scores["rouge-l"]["f"]


def rouge_zh_score(prediction, ground_truth, **kwargs):
    prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
    ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
    score = rouge_score(prediction, ground_truth)
    return score


def f1_score(prediction, ground_truth, **kwargs):
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction)
    recall = 1.0 * num_same / len(ground_truth)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def qa_f1_score(*args):
    gold_answer, result = args
    normalized_prediction = normalize_answer(result)
    normalized_ground_truth = normalize_answer(gold_answer)
    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    return f1_score(prediction_tokens, ground_truth_tokens)


def qa_f1_zh_score(prediction, ground_truth, **kwargs):
    prediction_tokens = list(jieba.cut(prediction, cut_all=False))
    ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
    prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
    ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
    prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
    ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
    return f1_score(prediction_tokens, ground_truth_tokens)
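Note that qa_f1_score unpacks its positional arguments as (gold_answer, result), i.e. the reference comes first and the model output second; both are normalized and scored with token-level F1. A small illustrative check, not part of the commit; it assumes you run it from this task directory so that metrics imports directly, mirroring the from-metrics import style used in utils.py below.

# Illustrative check of the scoring path (sketch; not part of this commit).
# Run from lm_eval/tasks/longbench/ so that `metrics` is importable directly.
from metrics import normalize_answer, qa_f1_score

print(normalize_answer("The Golden Gate Bridge!"))  # -> "golden gate bridge"

# gold answer first, model output second, matching the *args unpacking above
print(qa_f1_score("Saint Bernadette Soubirous",
                  "It was Saint Bernadette Soubirous."))  # -> 0.75 (3 shared tokens, 5 predicted, 3 gold)
print(qa_f1_score("Denver Broncos", "the Carolina Panthers"))  # -> 0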
lm_eval/tasks/longbench/utils.py  0 → 100644
import argparse
import json
import os

import numpy as np
from metrics import (
    classification_score,
    code_sim_score,
    count_score,
    qa_f1_score,
    qa_f1_zh_score,
    retrieval_score,
    retrieval_zh_score,
    rouge_score,
    rouge_zh_score,
)

dataset2metric = {
    "narrativeqa": qa_f1_score,
    "qasper": qa_f1_score,
    "multifieldqa_en": qa_f1_score,
    "multifieldqa_zh": qa_f1_zh_score,
    "hotpotqa": qa_f1_score,
    "2wikimqa": qa_f1_score,
    "musique": qa_f1_score,
    "dureader": rouge_zh_score,
    "gov_report": rouge_score,
    "qmsum": rouge_score,
    "multi_news": rouge_score,
    "vcsum": rouge_zh_score,
    "trec": classification_score,
    "triviaqa": qa_f1_score,
    "samsum": rouge_score,
    "lsht": classification_score,
    "passage_retrieval_en": retrieval_score,
    "passage_count": count_score,
    "passage_retrieval_zh": retrieval_zh_score,
    "lcc": code_sim_score,
    "repobench-p": code_sim_score,
}


# def parse_args(args=None):
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--model', type=str, default=None)
#     parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")
#     return parser.parse_args(args)


def scorer_e(dataset, predictions, answers, lengths, all_classes):
    scores = {"0-4k": [], "4-8k": [], "8k+": []}
    for prediction, ground_truths, length in zip(predictions, answers, lengths):
        score = 0.0
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip("\n").split("\n")[0]
        for ground_truth in ground_truths:
            score = max(
                score,
                dataset2metric[dataset](
                    prediction, ground_truth, all_classes=all_classes
                ),
            )
        if length < 4000:
            scores["0-4k"].append(score)
        elif length < 8000:
            scores["4-8k"].append(score)
        else:
            scores["8k+"].append(score)
    for key in scores.keys():
        scores[key] = round(100 * np.mean(scores[key]), 2)
    return scores


def scorer(dataset, predictions, answers, all_classes):
    total_score = 0.0
    for prediction, ground_truths in zip(predictions, answers):
        score = 0.0
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip("\n").split("\n")[0]
        for ground_truth in ground_truths:
            score = max(
                score,
                dataset2metric[dataset](
                    prediction, ground_truth, all_classes=all_classes
                ),
            )
        total_score += score
    return round(100 * total_score / len(predictions), 2)


# if __name__ == '__main__':
#     args = parse_args()
#     scores = dict()
#     if args.e:
#         path = f"pred_e/{args.model}/"
#     else:
#         path = f"pred/{args.model}/"
#     all_files = os.listdir(path)
#     print("Evaluating on:", all_files)
#     for filename in all_files:
#         if not filename.endswith("jsonl"):
#             continue
#         predictions, answers, lengths = [], [], []
#         dataset = filename.split('.')[0]
#         with open(f"{path}{filename}", "r", encoding="utf-8") as f:
#             for line in f:
#                 data = json.loads(line)
#                 predictions.append(data["pred"])
#                 answers.append(data["answers"])
#                 all_classes = data["all_classes"]
#                 if "length" in data:
#                     lengths.append(data["length"])
#         if args.e:
#             score = scorer_e(dataset, predictions, answers, lengths, all_classes)
#         else:
#             score = scorer(dataset, predictions, answers, all_classes)
#         scores[dataset] = score
#     if args.e:
#         out_path = f"pred_e/{args.model}/result.json"
#     else:
#         out_path = f"pred/{args.model}/result.json"
#     with open(out_path, "w") as f:
#         json.dump(scores, f, ensure_ascii=False, indent=4)
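The scorer and scorer_e helpers (and the commented-out driver above) appear to mirror the upstream LongBench evaluation script rather than being wired into the YAML task; scorer_e additionally buckets per-example scores by context length. A toy run with made-up inputs, not part of the commit; it assumes execution from this directory so that utils and metrics import directly, and picks a dataset whose metric accepts the all_classes keyword.

# Toy illustration of scorer_e's length bucketing (sketch; not part of this commit).
# Run from lm_eval/tasks/longbench/ so that `utils` (and its `metrics` import) resolve.
from utils import scorer_e

predictions = ["There are 5 passages.", "7 paragraphs repeat", "2"]
answers = [["5"], ["6"], ["2"]]   # one list of reference answers per prediction
lengths = [3500, 6200, 9100]      # context lengths as reported by LongBench-E

# "passage_count" routes to count_score via dataset2metric
print(scorer_e("passage_count", predictions, answers, lengths, all_classes=None))
# -> {'0-4k': 100.0, '4-8k': 0.0, '8k+': 100.0}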