Commit ac50adb5 authored by lintangsutawika

merged with latest big-refactor

parents 6355d06f a3252ed7
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else
%}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_en_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[22+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{%
else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_es_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[25+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{%
else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_fr_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[10+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題:
"+question+"\nステップごとの答え:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_ja_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else
%}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_ru_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[24+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{%
else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_sw_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[18+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else
%}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_te_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else
%}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_th_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_num
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{%
endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_zh_direct
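Taken together, these hunks remove the same six-line `filter` block from every direct-mode config: the old English-only "The answer is" pattern made no sense for direct prompting, and filters are now attached only by the generator script (see the `utils.py` diff below) with per-language patterns. As a minimal sketch of what such a `regex` + `take_first` pair does to a model response (hypothetical response string, not part of the commit):

```
import re

# The "regex" filter collects all matches; "take_first" keeps the first one.
response = "Let's think step by step. 6 x 7 = 42. The answer is 42"
matches = re.findall(r"The answer is (\-?[0-9\.\,]+)", response)
extracted = matches[0] if matches else "[invalid]"
print(extracted)  # -> 42
```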
@@ -4,16 +4,19 @@ import argparse
LANGUAGES = {
    "bn": {  # Bengali
        # "QUESTION": "প্রশ্ন:",
        "QUESTION": "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:",
        # "ANSWER": "ধাপে ধাপে উত্তর:",
        "ANSWER": "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:",
        "DIRECT": "Answer:",
        "REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
    },
    "de": {  # German
        "QUESTION": "Frage:",
        # "ANSWER": "Schritt-für-Schritt-Antwort:",
        "ANSWER": "Schritt-f\u00fcr-Schritt-Antwort:",
        "DIRECT": "Antwort:",
-        "REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
+        "REGEX": "Die Antwort lautet (\\-?[0-9\\.\\,]+)",
    },
    "en": {  # English
        "QUESTION": "Question:",
@@ -24,50 +27,68 @@ LANGUAGES = {
"es": { # Spanish
"QUESTION": "Pregunta:",
"ANSWER": "Respuesta paso a paso:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"DIRECT": "Respuesta:",
"REGEX": "La respuesta es (\\-?[0-9\\.\\,]+)",
},
"fr": { # French
"QUESTION": "Question :",
# "ANSWER": "Réponse étape par étape :"
"ANSWER": "R\u00e9ponse \u00e9tape par \u00e9tape :",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "DIRECT": "Réponse :",
"DIRECT": "R\u00e9ponse :",
# "REGEX": "La réponse est (\\-?[0-9\\.\\,]+)",
"REGEX": "La r\u00e9ponse est (\\-?[0-9\\.\\,]+)",
},
"ru": { # Russian
# "QUESTION": "Задача:",
"QUESTION": "\u0417\u0430\u0434\u0430\u0447\u0430:",
# "ANSWER": "Пошаговоерешение:",
"ANSWER": "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "Ответ — (\\-?[0-9\\.\\,]+)",
"REGEX": "\u041e\u0442\u0432\u0435\u0442 \u2014 (\\-?[0-9\\.\\,]+)",
},
"sw": { # Swahili
"QUESTION": "Swali:",
"ANSWER": "Jibu la Hatua kwa Hatua:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"REGEX": "Jibu ni (\\-?[0-9\\.\\,]+)",
},
"te": { # Telugu
# "QUESTION": "ప్రశ్న:",
"QUESTION": "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:",
# "ANSWER": "దశలవారీగా సమాధానం:",
"ANSWER": "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "సమాధానం (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02 (\\-?[0-9\\.\\,]+)",
},
"th": { # Thai
# "QUESTION": "โจทย์:",
"QUESTION": "\u0e42\u0e08\u0e17\u0e22\u0e4c:",
# "ANSWER": "คำตอบทีละขั้นตอน:",
"ANSWER": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "คำตอบคือ (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e04\u0e37\u0e2d (\\-?[0-9\\.\\,]+)",
},
"ja": { # Japanese
# "QUESTION": "問題:",
"QUESTION": "\u554f\u984c:",
# "ANSWER": "ステップごとの答え:",
"ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "答えは(\\-?[0-9\\.\\,]+)です。",
"REGEX": "\u7b54\u3048\u306f(\\-?[0-9\\.\\,]+)\u3067\u3059\u3002",
},
"zh": { # Chinese
# "QUESTION": "问题:",
"QUESTION": "\u95ee\u9898:",
# "ANSWER": "逐步解答:",
"ANSWER": "\u9010\u6b65\u89e3\u7b54:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "答案是 (\\-?[0-9\\.\\,]+)。",
"REGEX": "\u7b54\u6848\u662f (\\-?[0-9\\.\\,]+)\u3002",
},
}
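For orientation, the QUESTION/ANSWER/DIRECT strings above are the raw ingredients of the generated prompts; the actual templating happens through the generated Jinja YAML, but an illustrative composition looks like this (made-up question, not from MGSM):

```
# Hypothetical direct-mode prompt composed from the German entry above.
lang = {"QUESTION": "Frage:", "DIRECT": "Antwort:"}
question = "Was ist 2 + 3?"
prompt = f'{lang["QUESTION"]} {question}\n{lang["DIRECT"]}'
print(prompt)
# Frage: Was ist 2 + 3?
# Antwort:
```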
@@ -80,15 +101,15 @@ def add_regex_pattern(regex_pattern):
"filter_list": [
{
"name": "get-answer",
},
],
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
},
{
"function": "take_first",
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
},
{
"function": "take_first",
},
],
},
],
}
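The restructure nests the filter pipeline inside the named `filter_list` entry rather than leaving `filter` as a sibling key. A sketch of the YAML the new return value serializes to (assuming the English CoT pattern; not copied from the commit):

```
import yaml

filter_list = {
    "filter_list": [
        {
            "name": "get-answer",
            "filter": [
                {"function": "regex", "regex_pattern": r"The answer is (\-?[0-9\.\,]+)"},
                {"function": "take_first"},
            ],
        },
    ],
}
print(yaml.dump(filter_list, default_flow_style=False))
```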
@@ -107,6 +128,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
            QUESTION = LANGUAGES[lang]["QUESTION"]
            yaml_template = "cot_yaml"
+            filter_list = {}
            if mode == "direct":
                ANSWER = LANGUAGES[lang]["DIRECT"]
                REGEX = None
@@ -116,13 +138,13 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
                ANSWER = LANGUAGES[lang]["ANSWER"]
                REGEX = LANGUAGES[lang]["REGEX"]
                task_name = f"mgsm_{lang}_native-cot"
+                filter_list = add_regex_pattern(REGEX)
            elif mode == "en-cot":
                ANSWER = LANGUAGES["en"]["ANSWER"]
                REGEX = LANGUAGES["en"]["REGEX"]
                task_name = f"mgsm_{lang}_en-cot"

            file_name = f"{task_name}.yaml"
-            filter_list = add_regex_pattern(REGEX)
            with open(
                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
@@ -147,6 +169,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
                    },
                    f,
                    allow_unicode=True,
+                    width=float("inf"),
                )
        except FileExistsError:
            err.append(file_name)
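The added `width=float("inf")` keeps each long Jinja template in the generated YAML on a single physical line; PyYAML otherwise wraps scalars at roughly 80 columns. A quick illustration:

```
import yaml

doc = {"doc_to_text": "{% if answer is not none %}" + "x" * 100 + "{% endif %}"}
print(yaml.dump(doc))                      # wrapped across lines
print(yaml.dump(doc, width=float("inf")))  # stays on one line
```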
# MuTual
### Paper
Title: `MuTual: A Dataset for Multi-Turn Dialogue Reasoning`
Abstract: https://www.aclweb.org/anthology/2020.acl-main.130/
MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is
modified from Chinese high school English listening comprehension test data.
Homepage: https://github.com/Nealcly/MuTual
### Citation
```
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `mutual`
* `mutual_plus`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
include: mutual.yaml
task: mutual_plus
dataset_name: mutual_plus
task: mutual
dataset_path: "EleutherAI/mutual"
dataset_name: mutual
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{article}}"
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answers)}}"
doc_to_choice: "{{options}}"
process_docs: !function utils.process_docs
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{article}}"
metric_list:
- metric: r@1
  aggregation: mean
  higher_is_better: true
- metric: r@2
  aggregation: mean
  higher_is_better: true
- metric: mrr
  aggregation: mean
  higher_is_better: true
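The `doc_to_target` template maps the gold letter onto an index into the `options` list served by `doc_to_choice`; in plain Python, the same lookup is:

```
answers = "C"  # hypothetical gold label from a MuTual record
options = ["opt a", "opt b", "opt c", "opt d"]
target = ["A", "B", "C", "D"].index(answers)
print(target, options[target])  # -> 2 opt c
```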
import numpy as np


def process_docs(dataset):
    def _detokenize(text):
        text = text.replace(" '", "'")
        text = text.replace(" \n", "\n")
        text = text.replace("\n ", "\n")
        text = text.replace(" n't", "n't")
        text = text.replace("`` ", '"')
        text = text.replace("''", '"')
        # punctuation
        text = text.replace(" :", ":")
        text = text.replace(" ;", ";")
        text = text.replace(" !", "!")
        text = text.replace(" ?", "?")
        text = text.replace(" ,", ",")
        text = text.replace(" .", ".")
        return text

    def _process(doc):
        return {
            "article": _detokenize(doc["article"]),
            "options": [_detokenize(option) for option in doc["options"]],
        }

    return dataset.map(_process)
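A worked example of the detokenization (made-up string; the loop below mirrors a subset of the `replace` calls in `_detokenize`, which is nested above):

```
rules = [(" '", "'"), (" n't", "n't"), ("`` ", '"'), ("''", '"'), (" ,", ","), (" .", ".")]
text = "he said , `` i ca n't go '' ."
for old, new in rules:
    text = text.replace(old, new)
print(text)  # -> he said, "i can't go".
```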
def process_results(doc, results):
    gold = ["A", "B", "C", "D"].index(doc["answers"])
    r4_1 = np.argmax(results) == gold  # r4_1 = accuracy
    ranks = sorted(results, reverse=True)
    r4_2 = (ranks.index(results[gold]) == 1) + r4_1  # gold ranked 2nd, or 1st via r4_1
    mrr = 1.0 / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
    return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr}
task: nq_open
dataset_path: nq_open
output_type: greedy_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}" # TODO: should be multi-target
fewshot_delimiter: "\n"
generation_kwargs:
  until:
  - "\n"
  - "."
  - ","
  do_sample: false
  temperature: 0.0
filter_list:
- name: remove_whitespace
  filter:
  - function: remove_whitespace
  - function: take_first
target_delimiter: " "
metric_list:
- metric: exact_match
  aggregation: mean
  higher_is_better: true
  ignore_case: true
  ignore_punctuation: true
  regexes_to_ignore:
  - "\ban|a|the\b"
# QASPER
### Paper
Title: `A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers`
Abstract: https://arxiv.org/abs/2105.03011
QASPER is a dataset of 5,049 questions over 1,585 Natural Language Processing papers.
Each question is written by an NLP practitioner who read only the title and abstract
of the corresponding paper, and the question seeks information present in the full
text. The questions are then answered by a separate set of NLP practitioners who also
provide supporting evidence to answers.
Homepage: https://allenai.org/data/qasper
### Citation
```
@article{DBLP:journals/corr/abs-2105-03011,
author = {Pradeep Dasigi and
Kyle Lo and
Iz Beltagy and
Arman Cohan and
Noah A. Smith and
Matt Gardner},
title = {A Dataset of Information-Seeking Questions and Answers Anchored in
Research Papers},
journal = {CoRR},
volume = {abs/2105.03011},
year = {2021},
url = {https://arxiv.org/abs/2105.03011},
eprinttype = {arXiv},
eprint = {2105.03011},
timestamp = {Fri, 14 May 2021 12:13:30 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
### Groups and Tasks
#### Groups
* `qasper`: executes both `qasper_bool` and `qasper_freeform`
#### Tasks
* `qasper_bool`: Multiple choice task that evaluates the task with `answer_type="bool"`
* `qasper_freeform`: Greedy generation task that evaluates the samples from the task with `answer_type="free form answer"`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: qasper
task: qasper_bool
dataset_path: qasper
output_type: multiple_choice
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_bool
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: 1
doc_to_choice: ["no", "yes"]
metric_list:
- metric: f1
group: qasper
task: qasper_freeform
dataset_path: qasper
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_freeform
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: answer
generation_kwargs:
  until:
  - "\n"
metric_list:
- metric: !function metrics.f1_abstractive
  aggregation: mean
  higher_is_better: true
import re
import string
from collections import Counter


def normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_abstractive(predictions, references):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    """
    prediction_tokens = normalize_answer(predictions[0]).split()
    references_tokens = normalize_answer(references[0]).split()
    common = Counter(prediction_tokens) & Counter(references_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(references_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
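A worked example of the token-level F1 above (hypothetical strings):

```
pred = ["the model was trained on squad"]
ref = ["trained on the squad dataset"]
# After article removal: 5 prediction tokens, 4 reference tokens, 3 shared
# -> precision 0.6, recall 0.75, F1 = 2 * 0.45 / 1.35
print(round(f1_abstractive(pred, ref), 3))  # -> 0.667
```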