Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

02e841ce · lintangsutawika · 90ad5db7 · e74ec966 · 02e841ce · 02e841ce
Commit 02e841ce authored Mar 14, 2024 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/french_bench/utils.py
+++ b/lm_eval/tasks/french_bench/utils.py
+import collections
+import re
+import string
+
+import datasets
+import evaluate
+
+
+def normalize_answer(s):
+    """Normalize an answer string before comparison.
+
+    The steps run in this order: lowercase the text, drop every character
+    found in ``string.punctuation``, replace the standalone words
+    un/une/des/le/la/les with a space, then collapse whitespace.
+    """
+    lowered = s.lower()
+
+    # Only the ASCII punctuation listed in string.punctuation is removed.
+    punctuation = set(string.punctuation)
+    unpunctuated = "".join(ch for ch in lowered if ch not in punctuation)
+
+    # Articles are matched on word boundaries and replaced by a space so the
+    # neighbouring words do not get glued together.
+    article_pattern = re.compile(r"\b(un|une|des|le|la|les)\b", re.UNICODE)
+    without_articles = article_pattern.sub(" ", unpunctuated)
+
+    # split() + join collapses whitespace runs and trims both ends.
+    return " ".join(without_articles.split())
+
+
+def get_tokens(s):
+    """Return the whitespace-separated tokens of the normalized form of *s*.
+
+    A falsy *s* (``None`` or an empty string) yields an empty list without
+    being normalized.
+    """
+    return normalize_answer(s).split() if s else []
+
+
+# Exact match: the normalized prediction must equal the normalized gold answer
+def exact(predictions, references):
+    """Return 1 if the first prediction matches the first reference, else 0.
+
+    Both strings are passed through ``normalize_answer`` before comparing;
+    only index 0 of each sequence is looked at.
+    """
+    gold = normalize_answer(references[0])
+    guess = normalize_answer(predictions[0])
+    return 1 if gold == guess else 0
+
+
+# Token-level F-score of the predicted tokens versus the gold answer
+def f1(predictions, references):
+    """Return the token-overlap F1 of the first prediction vs. the first reference.
+
+    Tokens come from ``get_tokens``. When either side has no tokens the
+    result is the int 1 if both are empty and 0 otherwise; when there is no
+    shared token the result is the int 0; otherwise a float is returned.
+    """
+    gold_toks = get_tokens(references[0])
+    pred_toks = get_tokens(predictions[0])
+    if not gold_toks or not pred_toks:
+        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+        return int(gold_toks == pred_toks)
+
+    # Multiset intersection: each shared token counts min(gold, pred) times.
+    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+    num_same = sum(overlap.values())
+    if num_same == 0:
+        return 0
+
+    precision = 1.0 * num_same / len(pred_toks)
+    recall = 1.0 * num_same / len(gold_toks)
+    return (2 * precision * recall) / (precision + recall)
+
+
+def rouge1(items):
+    """
+    Return *items* unchanged; no per-item scoring is done in this function.
+    """
+    return items
+
+
+def rouge1_agg(items):
+    """
+    Score all collected pairs at once and return the scorer's "rouge1" entry.
+
+    Each element of *items* is indexed as (reference, prediction): position 0
+    of every element goes to ``references`` and position 1 to ``predictions``.
+    Higher is better.
+    """
+    # Transpose once: transposing a second time is wasted work and would see
+    # nothing at all if *items* were a one-shot iterator.
+    columns = list(zip(*items))
+    refs, preds = columns[0], columns[1]
+    rouge_scorer = evaluate.load("rouge")
+    scores = rouge_scorer.compute(predictions=preds, references=refs)
+    return scores["rouge1"]
+
+
+def is_included(items):
+    """
+    Return True when ``items[0]`` is contained in ``items[1]``, else False.
+    """
+    needle, haystack = items[0], items[1]
+    return needle in haystack
+
+
+def preprocess(text):
+    """Clean one text field: trim it, rewrite " [title]" and drop bracketed tags."""
+    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
+    without_title = text.strip().replace(" [title]", ". ")
+    without_tags = re.sub("\\[.*?\\]", "", without_title)
+    # One left-to-right pass: each pair of spaces becomes a single space.
+    return without_tags.replace("  ", " ")
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Run ``_process_doc`` over every document with ``dataset.map``."""
+
+    def _process_doc(doc):
+        """Build the ``query``, ``choices`` and ``gold`` fields for one document."""
+        lead = doc["ctx_a"]
+        # str.capitalize() upper-cases the first character and lower-cases the rest.
+        follow = doc["ctx_b"].capitalize()
+        query = preprocess(doc["activity_label"] + ": " + lead + " " + follow)
+        choices = [preprocess(ending) for ending in doc["endings"]]
+        return {
+            "query": query,
+            "choices": choices,
+            "gold": int(doc["label"]),
+        }
+
+    return dataset.map(_process_doc)
--- a/lm_eval/tasks/gpqa/README.md
+++ b/lm_eval/tasks/gpqa/README.md
@@ -35,6 +35,9 @@ This dataset is gated, so you will have to accept the terms of use at https://hu

 * `gpqa_{main, diamond, extended}_zeroshot`
 * `gpqa_{main, diamond, extended}_n_shot`
+* `gpqa_{main, diamond, extended}_generative_n_shot`
+* `gpqa_{main, diamond, extended}_cot_zeroshot`
+* `gpqa_{main, diamond, extended}_cot_n_shot`

 ### Checklist


--- a/lm_eval/tasks/gpqa/cot_n_shot/_generate_configs.py
+++ b/lm_eval/tasks/gpqa/cot_n_shot/_generate_configs.py
+import yaml
+from tqdm import tqdm
+
+
+def main() -> None:
+    """Write one task YAML per GPQA subset for the ``cot_n_shot`` setting.
+
+    Files are named ``gpqa_<subset>_cot_n_shot.yaml`` and are created relative
+    to the current working directory. They are opened in "w" mode, so an
+    existing file with the same name is overwritten.
+    """
+    subset = ["extended", "diamond", "main"]
+    setting = "cot_n_shot"
+    for task in tqdm(subset):
+        file_name = f"gpqa_{task}_{setting}.yaml"
+        # No FileExistsError guard: mode "w" truncates an existing file instead
+        # of raising, so such a handler could never run.
+        with open(file_name, "w", encoding="utf-8") as f:
+            f.write("# Generated by _generate_configs.py\n")
+            yaml.dump(
+                {
+                    "include": f"_gpqa_{setting}_yaml",
+                    "task": f"gpqa_{task}_{setting}",
+                    "dataset_name": f"gpqa_{task}",
+                },
+                f,
+            )
+
+
+if __name__ == "__main__":
+    main()
--- a/lm_eval/tasks/gpqa/cot_n_shot/_gpqa_cot_n_shot_yaml
+++ b/lm_eval/tasks/gpqa/cot_n_shot/_gpqa_cot_n_shot_yaml
+# Shared base config for the gpqa_{main,diamond,extended}_cot_n_shot tasks,
+# which pull it in through their `include` key.
+dataset_path: Idavidrein/gpqa
+group: gpqa
+output_type: generate_until
+process_docs: !function utils.process_docs
+training_split: train
+# The Hugging Face dataset only has a train split, so it is reused for validation.
+validation_split: train
+test_split: null
+description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
+doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nLet's think step by step: "
+# `answer` is the "(A)".."(D)" letter field produced by utils.process_docs.
+doc_to_target: answer
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        # The dot in the trailing lookahead `(?=.)` is unescaped, so it matches
+        # any character (not only a literal period): the group captures the
+        # text after "The answer is " minus the last character on that line.
+        regex_pattern: "(?<=The answer is )(.*)(?=.)"
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "multi_choice_regex"
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        # Matches a single parenthesised capital letter such as "(A)".
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+generation_kwargs:
+  until:
+    - "</s>"
+  do_sample: false
+  temperature: 0.0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/gpqa/cot_n_shot/gpqa_diamond_cot_n_shot.yaml
+++ b/lm_eval/tasks/gpqa/cot_n_shot/gpqa_diamond_cot_n_shot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_diamond
+include: _gpqa_cot_n_shot_yaml
+task: gpqa_diamond_cot_n_shot
--- a/lm_eval/tasks/gpqa/cot_n_shot/gpqa_extended_cot_n_shot.yaml
+++ b/lm_eval/tasks/gpqa/cot_n_shot/gpqa_extended_cot_n_shot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_extended
+include: _gpqa_cot_n_shot_yaml
+task: gpqa_extended_cot_n_shot
--- a/lm_eval/tasks/gpqa/cot_n_shot/gpqa_main_cot_n_shot.yaml
+++ b/lm_eval/tasks/gpqa/cot_n_shot/gpqa_main_cot_n_shot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_main
+include: _gpqa_cot_n_shot_yaml
+task: gpqa_main_cot_n_shot
--- a/lm_eval/tasks/gpqa/cot_n_shot/utils.py
+++ b/lm_eval/tasks/gpqa/cot_n_shot/utils.py
+import random
+import re
+
+import datasets
+
+
+def preprocess(text):
+    """Clean one answer string; ``None`` is mapped to a single space."""
+    if text is None:
+        return " "
+    # Trim, turn " [title]" into ". ", then drop any remaining [bracketed] tag.
+    without_title = text.strip().replace(" [title]", ". ")
+    without_tags = re.sub("\\[.*?\\]", "", without_title)
+    # One left-to-right pass: each pair of spaces becomes a single space.
+    return without_tags.replace("  ", " ")
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Run ``_process_doc`` over every document with ``dataset.map``.
+
+    The answer order is shuffled with the module-level ``random`` generator,
+    so it depends on whatever state that generator is in when this runs.
+    """
+
+    def _process_doc(doc):
+        """Shuffle the four cleaned answers and record the correct letter."""
+        answer_keys = (
+            "Incorrect Answer 1",
+            "Incorrect Answer 2",
+            "Incorrect Answer 3",
+            "Correct Answer",
+        )
+        choices = [preprocess(doc[key]) for key in answer_keys]
+        # Remember the cleaned correct answer before its position is lost.
+        correct = choices[-1]
+
+        random.shuffle(choices)
+        # index() returns the first position holding an equal string.
+        letter = "ABCD"[choices.index(correct)]
+
+        out_doc = {
+            f"choice{pos}": choice for pos, choice in enumerate(choices, start=1)
+        }
+        out_doc["choices"] = list(choices)
+        out_doc["answer"] = f"({letter})"
+        return out_doc
+
+    return dataset.map(_process_doc)
--- a/lm_eval/tasks/gpqa/cot_zeroshot/_generate_configs.py
+++ b/lm_eval/tasks/gpqa/cot_zeroshot/_generate_configs.py
+import yaml
+from tqdm import tqdm
+
+
+def main() -> None:
+    """Write one task YAML per GPQA subset for the ``cot_zeroshot`` setting.
+
+    Files are named ``gpqa_<subset>_cot_zeroshot.yaml`` and are created
+    relative to the current working directory. They are opened in "w" mode,
+    so an existing file with the same name is overwritten.
+    """
+    subset = ["extended", "diamond", "main"]
+    setting = "cot_zeroshot"
+    for task in tqdm(subset):
+        file_name = f"gpqa_{task}_{setting}.yaml"
+        # No FileExistsError guard: mode "w" truncates an existing file instead
+        # of raising, so such a handler could never run.
+        with open(file_name, "w", encoding="utf-8") as f:
+            f.write("# Generated by _generate_configs.py\n")
+            yaml.dump(
+                {
+                    "include": f"_gpqa_{setting}_yaml",
+                    "task": f"gpqa_{task}_{setting}",
+                    "dataset_name": f"gpqa_{task}",
+                },
+                f,
+            )
+
+
+if __name__ == "__main__":
+    main()
--- a/lm_eval/tasks/gpqa/cot_zeroshot/_gpqa_cot_zeroshot_yaml
+++ b/lm_eval/tasks/gpqa/cot_zeroshot/_gpqa_cot_zeroshot_yaml
+# Shared base config for the gpqa_{main,diamond,extended}_cot_zeroshot tasks,
+# which pull it in through their `include` key.
+dataset_path: Idavidrein/gpqa
+group: gpqa
+output_type: generate_until
+process_docs: !function utils.process_docs
+training_split: train
+# The Hugging Face dataset only has a train split, so it is reused for validation.
+validation_split: train
+test_split: null
+doc_to_text: "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nLet's think step by step: "
+# `answer` is the "(A)".."(D)" letter field produced by utils.process_docs.
+doc_to_target: answer
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        # The dot in the trailing lookahead `(?=.)` is unescaped, so it matches
+        # any character (not only a literal period): the group captures the
+        # text after "The answer is " minus the last character on that line.
+        regex_pattern: "(?<=The answer is )(.*)(?=.)"
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "multi_choice_regex"
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        # Matches a single parenthesised capital letter such as "(A)".
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+generation_kwargs:
+  until:
+    - "</s>"
+  do_sample: false
+  temperature: 0.0
+num_fewshot: 0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/gpqa/cot_zeroshot/gpqa_diamond_cot_zeroshot.yaml
+++ b/lm_eval/tasks/gpqa/cot_zeroshot/gpqa_diamond_cot_zeroshot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_diamond
+include: _gpqa_cot_zeroshot_yaml
+task: gpqa_diamond_cot_zeroshot
--- a/lm_eval/tasks/gpqa/cot_zeroshot/gpqa_extended_cot_zeroshot.yaml
+++ b/lm_eval/tasks/gpqa/cot_zeroshot/gpqa_extended_cot_zeroshot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_extended
+include: _gpqa_cot_zeroshot_yaml
+task: gpqa_extended_cot_zeroshot
--- a/lm_eval/tasks/gpqa/cot_zeroshot/gpqa_main_cot_zeroshot.yaml
+++ b/lm_eval/tasks/gpqa/cot_zeroshot/gpqa_main_cot_zeroshot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_main
+include: _gpqa_cot_zeroshot_yaml
+task: gpqa_main_cot_zeroshot
--- a/lm_eval/tasks/gpqa/cot_zeroshot/utils.py
+++ b/lm_eval/tasks/gpqa/cot_zeroshot/utils.py
+import random
+import re
+
+import datasets
+
+
+def preprocess(text):
+    """Clean one answer string; ``None`` is mapped to a single space."""
+    if text is None:
+        return " "
+    # Trim, turn " [title]" into ". ", then drop any remaining [bracketed] tag.
+    without_title = text.strip().replace(" [title]", ". ")
+    without_tags = re.sub("\\[.*?\\]", "", without_title)
+    # One left-to-right pass: each pair of spaces becomes a single space.
+    return without_tags.replace("  ", " ")
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Run ``_process_doc`` over every document with ``dataset.map``.
+
+    The answer order is shuffled with the module-level ``random`` generator,
+    so it depends on whatever state that generator is in when this runs.
+    """
+
+    def _process_doc(doc):
+        """Shuffle the four cleaned answers and record the correct letter."""
+        answer_keys = (
+            "Incorrect Answer 1",
+            "Incorrect Answer 2",
+            "Incorrect Answer 3",
+            "Correct Answer",
+        )
+        choices = [preprocess(doc[key]) for key in answer_keys]
+        # Remember the cleaned correct answer before its position is lost.
+        correct = choices[-1]
+
+        random.shuffle(choices)
+        # index() returns the first position holding an equal string.
+        letter = "ABCD"[choices.index(correct)]
+
+        out_doc = {
+            f"choice{pos}": choice for pos, choice in enumerate(choices, start=1)
+        }
+        out_doc["choices"] = list(choices)
+        out_doc["answer"] = f"({letter})"
+        return out_doc
+
+    return dataset.map(_process_doc)
--- a/lm_eval/tasks/gpqa/generative/_generate_configs.py
+++ b/lm_eval/tasks/gpqa/generative/_generate_configs.py
+import yaml
+from tqdm import tqdm
+
+
+def main() -> None:
+    """Write one task YAML per GPQA subset for the ``generative_n_shot`` setting.
+
+    Files are named ``gpqa_<subset>_generative_n_shot.yaml`` and are created
+    relative to the current working directory. They are opened in "w" mode,
+    so an existing file with the same name is overwritten.
+    """
+    subset = ["extended", "diamond", "main"]
+    setting = "generative_n_shot"
+    for task in tqdm(subset):
+        file_name = f"gpqa_{task}_{setting}.yaml"
+        # No FileExistsError guard: mode "w" truncates an existing file instead
+        # of raising, so such a handler could never run.
+        with open(file_name, "w", encoding="utf-8") as f:
+            f.write("# Generated by _generate_configs.py\n")
+            yaml.dump(
+                {
+                    "include": f"_gpqa_{setting}_yaml",
+                    "task": f"gpqa_{task}_{setting}",
+                    "dataset_name": f"gpqa_{task}",
+                },
+                f,
+            )
+
+
+if __name__ == "__main__":
+    main()
--- a/lm_eval/tasks/gpqa/generative/_gpqa_generative_n_shot_yaml
+++ b/lm_eval/tasks/gpqa/generative/_gpqa_generative_n_shot_yaml
+# Shared base config for the gpqa_{main,diamond,extended}_generative_n_shot
+# tasks, which pull it in through their `include` key.
+dataset_path: Idavidrein/gpqa
+group: gpqa
+output_type: generate_until
+process_docs: !function utils.process_docs
+training_split: train
+# The Hugging Face dataset only has a train split, so it is reused for validation.
+validation_split: train
+test_split: null
+description: "Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n"
+doc_to_text: "Question: {{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer:"
+# `answer` is the "(A)".."(D)" letter field produced by utils.process_docs.
+doc_to_target: answer
+filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        # The dot in the trailing lookahead `(?=.)` is unescaped, so it matches
+        # any character (not only a literal period): the group captures the
+        # text after "The answer is " minus the last character on that line.
+        # NOTE(review): the prompt above ends with "Answer:" and the target is
+        # just the letter, so outputs may never contain "The answer is " --
+        # confirm this filter can match for this task.
+        regex_pattern: "(?<=The answer is )(.*)(?=.)"
+      - function: "take_first"
+  - name: "flexible-extract"
+    filter:
+      - function: "multi_choice_regex"
+        group_select: -1
+        ignore_case: true
+        ignore_punctuation: true
+        # Matches a single parenthesised capital letter such as "(A)".
+        regex_pattern: "(\\([A-Z]\\))"
+      - function: "take_first"
+generation_kwargs:
+  until:
+    - "</s>"
+    - "Question:"
+    - "<|im_end|>"
+  temperature: 0.0
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/gpqa/generative/gpqa_diamond_generative_n_shot.yaml
+++ b/lm_eval/tasks/gpqa/generative/gpqa_diamond_generative_n_shot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_diamond
+include: _gpqa_generative_n_shot_yaml
+task: gpqa_diamond_generative_n_shot
--- a/lm_eval/tasks/gpqa/generative/gpqa_extended_generative_n_shot.yaml
+++ b/lm_eval/tasks/gpqa/generative/gpqa_extended_generative_n_shot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_extended
+include: _gpqa_generative_n_shot_yaml
+task: gpqa_extended_generative_n_shot
--- a/lm_eval/tasks/gpqa/generative/gpqa_main_generative_n_shot.yaml
+++ b/lm_eval/tasks/gpqa/generative/gpqa_main_generative_n_shot.yaml
+# Generated by _generate_configs.py
+dataset_name: gpqa_main
+include: _gpqa_generative_n_shot_yaml
+task: gpqa_main_generative_n_shot
--- a/lm_eval/tasks/gpqa/generative/utils.py
+++ b/lm_eval/tasks/gpqa/generative/utils.py
+import random
+import re
+
+import datasets
+
+
+def preprocess(text):
+    """Clean one answer string; ``None`` is mapped to a single space."""
+    if text is None:
+        return " "
+    # Trim, turn " [title]" into ". ", then drop any remaining [bracketed] tag.
+    without_title = text.strip().replace(" [title]", ". ")
+    without_tags = re.sub("\\[.*?\\]", "", without_title)
+    # One left-to-right pass: each pair of spaces becomes a single space.
+    return without_tags.replace("  ", " ")
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Run ``_process_doc`` over every document with ``dataset.map``.
+
+    The answer order is shuffled with the module-level ``random`` generator,
+    so it depends on whatever state that generator is in when this runs.
+    """
+
+    def _process_doc(doc):
+        """Shuffle the four cleaned answers and record the correct letter."""
+        answer_keys = (
+            "Incorrect Answer 1",
+            "Incorrect Answer 2",
+            "Incorrect Answer 3",
+            "Correct Answer",
+        )
+        choices = [preprocess(doc[key]) for key in answer_keys]
+        # Remember the cleaned correct answer before its position is lost.
+        correct = choices[-1]
+
+        random.shuffle(choices)
+        # index() returns the first position holding an equal string.
+        letter = "ABCD"[choices.index(correct)]
+
+        out_doc = {
+            f"choice{pos}": choice for pos, choice in enumerate(choices, start=1)
+        }
+        out_doc["choices"] = list(choices)
+        out_doc["answer"] = f"({letter})"
+        return out_doc
+
+    return dataset.map(_process_doc)