Commit e2ed49f8 authored by baberabb

Merge remote-tracking branch 'origin/big-refactor' into big-refactor_math_sympy

parents a554e41b 91a37c90
@@ -63,10 +63,10 @@ jobs:
       - name: Test with pytest
         # if new tasks are added, run tests on them
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
       # if api is modified, run tests on it
       - name: Test more tasks with pytest
         env:
           API: true
         if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
@@ -674,22 +674,22 @@ class ConfigurableTask(Task):
             check_choices = test_choice
         else:
             check_choices = [test_target]
-        for choice in check_choices:
-            choice_has_whitespace = True if " " in choice else False
-            delimiter_has_whitespace = (
-                True if " " in self.config.target_delimiter else False
-            )
-            if delimiter_has_whitespace and choice_has_whitespace:
-                eval_logger.warning(
-                    f'Both target_delimiter and target choice: "{choice}" have whitespace'
-                )
-            elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
-                eval_logger.warning(
-                    f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
-                )
+        if self.config.doc_to_choice is not None:
+            for choice in check_choices:
+                choice_has_whitespace = True if choice[0].isspace() else False
+                delimiter_has_whitespace = (
+                    True if self.config.target_delimiter[-1].isspace() else False
+                )
+                if delimiter_has_whitespace and choice_has_whitespace:
+                    eval_logger.warning(
+                        f'Both target_delimiter and target choice: "{choice}" have whitespace'
+                    )
+                elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
+                    eval_logger.warning(
+                        f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
+                    )

     def download(self, dataset_kwargs=None) -> None:
         self.dataset = datasets.load_dataset(
             path=self.DATASET_PATH,
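The change narrows the heuristic: instead of flagging any space anywhere in the choice or the delimiter, it inspects only the two characters that actually meet when delimiter and target are joined in the rendered prompt. A minimal sketch of that boundary logic (`check_boundary` is a hypothetical helper, not harness code; the harness does this inline):

```python
# Only the delimiter's last char and the choice's first char touch in the
# assembled prompt, so those are the characters worth checking.
def check_boundary(target_delimiter: str, choice: str) -> str:
    delimiter_has_whitespace = target_delimiter[-1].isspace()
    choice_has_whitespace = choice[0].isspace()
    if delimiter_has_whitespace and choice_has_whitespace:
        return "double whitespace"  # "Answer: " + " yes" -> "Answer:  yes"
    if not delimiter_has_whitespace and not choice_has_whitespace:
        return "no whitespace"      # "Answer:" + "yes" -> "Answer:yes"
    return "ok"

assert check_boundary("Answer: ", " yes") == "double whitespace"
assert check_boundary("Answer:", "yes") == "no whitespace"
assert check_boundary("Answer: ", "yes") == "ok"
```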
@@ -1070,6 +1070,9 @@ class ConfigurableTask(Task):
             # it assumes that doc_to_target returns a number.
             choices = self.doc_to_choice(doc)
             gold = choices[gold]
+        # we expect multiple_targets to be a list.
+        elif self.multiple_target:
+            gold = list(gold)
         else:
             gold = str(gold)
@@ -1080,6 +1083,10 @@ class ConfigurableTask(Task):
             # return true if any are true
             # TODO: this may break for multiple_target, non zero-or-1 metrics
             scores = []
+            if not isinstance(gold, list):
+                # sometimes a multiple_target dataset has exceptions where one doc has only one string answer
+                gold = [gold]
             for gold_option in gold:
                 try:
                     result_score = self._metric_fn_list[metric](
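Together, the two hunks above normalize `gold` to a list before scoring, so a single loop handles both single- and multi-reference docs, and "return true if any are true" falls out of taking the best per-reference score. A hedged sketch of that path, with `exact_match` standing in for `self._metric_fn_list[metric]`:

```python
# Stand-in for the configured metric function (0/1 score per reference).
def exact_match(ref: str, pred: str) -> int:
    return int(ref.strip() == pred.strip())

def score_multi_target(gold, pred: str) -> float:
    if not isinstance(gold, list):
        # some multiple_target datasets have docs with a single string answer
        gold = [gold]
    # score against every reference, keep the best
    scores = [exact_match(gold_option, pred) for gold_option in gold]
    return max(scores)

assert score_multi_target(["Paris", "City of Light"], "Paris") == 1
assert score_multi_target("Paris", "Lyon") == 0
```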
@@ -16,7 +16,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] MCTACO
 - [x] Pubmed QA
 - [x] SciQ
-- [ ] QASPER
+- [x] QASPER
 - [x] QA4MRE
 - [x] TriviaQA
 - [x] AI2 ARC
@@ -36,7 +36,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] TruthfulQA (mc1)
 - [x] TruthfulQA (mc2)
 - [x] TruthfulQA (gen)
-- [ ] MuTual
+- [x] MuTual
 - [ ] Hendrycks Math (Hailey)
 - [x] Asdiv
 - [ ] GSM8k
# Generated by utils.py
dataset_name: bn
doc_to_target: '{% if answer is not none %}{{answer[16+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nধাপে ধাপে উত্তর:"}}{% else
%}{{"প্রশ্ন: "+question+"\nধাপে ধাপে উত্তর:"}}{% endif %}'
include: cot_yaml
task: mgsm_bn_direct
# Generated by utils.py
dataset_name: de
doc_to_target: '{% if answer is not none %}{{answer[28+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nSchritt-für-Schritt-Antwort:"}}{%
else %}{{"Frage: "+question+"\nSchritt-für-Schritt-Antwort:"}}{% endif %}'
include: cot_yaml
task: mgsm_de_direct
# Generated by utils.py
dataset_name: en
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else
%}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_en_direct
# Generated by utils.py
dataset_name: es
doc_to_target: '{% if answer is not none %}{{answer[22+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{%
else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
include: cot_yaml
task: mgsm_es_direct
# Generated by utils.py
dataset_name: fr
doc_to_target: '{% if answer is not none %}{{answer[25+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{%
else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
include: cot_yaml
task: mgsm_fr_direct
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[10+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題:
"+question+"\nステップごとの答え:"}}{% endif %}'
include: cot_yaml
task: mgsm_ja_direct
# Generated by utils.py
dataset_name: ru
doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else
%}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
include: cot_yaml
task: mgsm_ru_direct
# Generated by utils.py
dataset_name: sw
doc_to_target: '{% if answer is not none %}{{answer[24+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{%
else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
include: cot_yaml
task: mgsm_sw_direct
# Generated by utils.py
dataset_name: te
doc_to_target: '{% if answer is not none %}{{answer[18+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else
%}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
include: cot_yaml
task: mgsm_te_direct
# Generated by utils.py
dataset_name: th
doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else
%}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
include: cot_yaml
task: mgsm_th_direct
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{%
endif %}'
include: cot_yaml
task: mgsm_zh_direct
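Each of the files above is stamped `# Generated by utils.py`, but that generator is not part of this diff. A rough, hypothetical reconstruction of its shape: each language supplies a question prefix and a step-by-step answer prefix, and the `answer[N+1]` offset in `doc_to_target` is the length of the answer prefix plus one (e.g. `20+1` for the 20-character `Step-by-Step Answer:`, `28+1` for `Schritt-für-Schritt-Antwort:`). A sketch for two of the languages:

```python
import yaml

# Hypothetical table: (question prefix, step-by-step answer prefix) per language.
PREFIXES = {
    "en": ("Question: ", "Step-by-Step Answer:"),
    "de": ("Frage: ", "Schritt-für-Schritt-Antwort:"),
}

for lang, (q_prefix, a_prefix) in PREFIXES.items():
    offset = len(a_prefix)  # 20 for English, 28 for German
    config = {
        "include": "cot_yaml",
        "dataset_name": lang,
        "task": f"mgsm_{lang}_direct",
        "doc_to_text": (
            '{% if answer is not none %}{{question+"\\n' + a_prefix + '"}}'
            '{% else %}{{"' + q_prefix + '"+question+"\\n' + a_prefix + '"}}{% endif %}'
        ),
        "doc_to_target": (
            "{% if answer is not none %}{{answer[" + str(offset) + "+1]}}"
            "{% else %}{{answer_number|string}}{% endif %}"
        ),
    }
    with open(f"mgsm_{lang}_direct.yaml", "w", encoding="utf-8") as f:
        f.write("# Generated by utils.py\n")
        yaml.dump(config, f, allow_unicode=True)
```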
# MuTual

### Paper

Title: `MuTual: A Dataset for Multi-Turn Dialogue Reasoning`

Abstract: https://www.aclweb.org/anthology/2020.acl-main.130/

MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, modified from Chinese high school English listening comprehension test data.

Homepage: https://github.com/Nealcly/MuTual

### Citation

```
@inproceedings{mutual,
    title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
    author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
    year = "2020",
    publisher = "Association for Computational Linguistics",
}
```
### Groups and Tasks

#### Groups

* Not part of a group yet.

#### Tasks

* `mutual`
* `mutual_plus`

### Checklist

For adding novel benchmarks/datasets to the library:

* [ ] Is the task an existing benchmark in the literature?
  * [ ] Have you referenced the original paper that introduced the task?
  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?

If other tasks on this dataset are already supported:

* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# mutual_plus.yaml
include: mutual.yaml
task: mutual_plus
dataset_name: mutual_plus

# mutual.yaml
task: mutual
dataset_path: "EleutherAI/mutual"
dataset_name: mutual
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{article}}"
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answers)}}"
doc_to_choice: "{{options}}"
process_docs: !function utils.process_docs
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{article}}"
metric_list:
  - metric: r@1
    aggregation: mean
    higher_is_better: true
  - metric: r@2
    aggregation: mean
    higher_is_better: true
  - metric: mrr
    aggregation: mean
    higher_is_better: true
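The `!function` hooks above (`utils.process_docs`, `utils.process_results`) resolve to the task's `utils.py`, reproduced next: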
import numpy as np


def process_docs(dataset):
    def _detokenize(text):
        text = text.replace(" '", "'")
        text = text.replace(" \n", "\n")
        text = text.replace("\n ", "\n")
        text = text.replace(" n't", "n't")
        text = text.replace("`` ", '"')
        text = text.replace("''", '"')
        # punctuation
        text = text.replace(" :", ":")
        text = text.replace(" ;", ";")
        text = text.replace(" !", "!")
        text = text.replace(" ?", "?")
        text = text.replace(" ,", ",")
        text = text.replace(" .", ".")
        return text

    def _process(doc):
        return {
            "article": _detokenize(doc["article"]),
            "options": [_detokenize(option) for option in doc["options"]],
        }

    return dataset.map(_process)


def process_results(doc, results):
    gold = ["A", "B", "C", "D"].index(doc["answers"])
    r4_1 = np.argmax(results) == gold  # r4_1 = accuracy
    ranks = sorted(results, reverse=True)
    r4_2 = (ranks.index(results[gold]) == 1) + r4_1  # gold within the top two
    mrr = 1.0 / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
    return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr}
task: nq_open
dataset_path: nq_open
output_type: greedy_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}" # TODO: should be multi-target
fewshot_delimiter: "\n"
generation_kwargs:
  until:
    - "\n"
    - "."
    - ","
  do_sample: false
  temperature: 0.0
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
target_delimiter: " "
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - "\ban|a|the\b"