Commit a07d05f7 authored by baberabb's avatar baberabb

Merge remote-tracking branch 'origin/big-refactor' into nqopen_baber

# Conflicts:
#	lm_eval/api/task.py
parents b1d468f2 6ba2a2b0
task: drop
dataset_path: EleutherAI/drop
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs
doc_to_text: "{{passage}} {{question}}"
doc_to_target: "{{ answer|join(',')}}"
target_delimiter: ""
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{passage}} {{question}}"
generation_kwargs:
  until:
    - "."
metric_list:
  - metric: em
    aggregation: mean
    higher_is_better: true
  - metric: f1
    aggregation: mean
    higher_is_better: true
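As a quick check of the two Jinja templates above, here is a minimal sketch that renders them directly with `jinja2` against a toy doc. Both the doc and the standalone rendering are illustrative; this is not how the harness invokes the templates internally.

```python
# Render the DROP task's doc_to_text / doc_to_target templates against a
# hypothetical doc.
from jinja2 import Template

doc = {
    "passage": "The Bears won 42 to 10.",
    "question": "How many points did the Bears score?",
    "answer": ["42"],
}
print(Template("{{passage}} {{question}}").render(**doc))
# -> The Bears won 42 to 10. How many points did the Bears score?
print(Template("{{ answer|join(',')}}").render(**doc))
# -> 42
```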
import re
import string

import numpy as np
from scipy.optimize import linear_sum_assignment


_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)

def process_docs(dataset):
    def _process(doc):
        return {
            "id": doc["query_id"],
            "passage": doc["passage"],
            "question": doc["question"],
            "answers": get_answers(doc),
        }

    return dataset.map(_process)

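# Illustration only (not used by the task): process_docs runs over a
# datasets.Dataset. The row below is hypothetical but shaped like the
# EleutherAI/drop schema; note that Arrow storage turns the answer tuples
# produced by get_answers into plain lists. Wrapped in a helper so nothing
# executes when the harness imports this module.
def _demo_process_docs():
    from datasets import Dataset  # demo-only dependency

    row = {
        "query_id": "q1",
        "passage": "The Bears won 42 to 10.",
        "question": "How many points did the Bears score?",
        "answer": {"number": "42", "spans": [], "date": {"day": "", "month": "", "year": ""}},
        "validated_answers": {
            "number": ["42"],
            "date": [{"day": "", "month": "", "year": ""}],
            "spans": [["42 points"]],
        },
    }
    processed = process_docs(Dataset.from_list([row]))
    print(processed[0]["answers"])  # [['42']]
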
def get_answers(doc):
    def _flatten_validated_answers(validated_answers):
        """Flattens a dict of lists of validated answers.

        {"number": ['1', '8'], ...}
        -> [{"number": ['1'], ...}, {"number": ['8'], ...}]
        """
        valid_answers = []
        for i in range(len(validated_answers["number"])):
            valid_answers.append(
                {
                    "number": validated_answers["number"][i],
                    "date": validated_answers["date"][i],
                    "spans": validated_answers["spans"][i],
                }
            )
        return valid_answers

    answers = []
    answers_set = set()
    candidates = [doc["answer"]] + _flatten_validated_answers(doc["validated_answers"])
    for candidate in candidates:
        answer = parse_answer(candidate)
        if answer in answers_set:
            continue
        answers_set.add(answer)
        answers.append(answer)
    return answers

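# Illustration only (hypothetical DROP-style doc): the annotator's answer plus
# its validated alternatives are parsed and de-duplicated. Deferred in a helper
# so nothing runs at import time.
def _demo_get_answers():
    empty_date = {"day": "", "month": "", "year": ""}
    doc = {
        "answer": {"number": "3", "spans": [], "date": empty_date},
        "validated_answers": {
            "number": ["3", ""],
            "date": [empty_date, empty_date],
            "spans": [[], ["three"]],
        },
    }
    print(get_answers(doc))  # [('3',), ('three',)]
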
def parse_answer(answer):
    # NOTE: Everything is returned as a tuple for uniformity and hashability.
    if answer["number"] != "":
        return (str(answer["number"]),)
    if answer["spans"] != []:
        return tuple(answer["spans"])
    return (
        " ".join(
            [answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]]
        ).strip(),
    )

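# Illustration of parse_answer's precedence (number > spans > date); the
# inputs below are hypothetical.
def _demo_parse_answer():
    empty_date = {"day": "", "month": "", "year": ""}
    print(parse_answer({"number": "12", "spans": ["twelve"], "date": empty_date}))  # ('12',)
    print(parse_answer({"number": "", "spans": ["a", "b"], "date": empty_date}))  # ('a', 'b')
    print(parse_answer({"number": "", "spans": [], "date": {"day": "5", "month": "May", "year": "1921"}}))  # ('5 May 1921',)
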
def process_results(doc, results):
    preds, golds = results, doc["answers"]
    max_em = 0
    max_f1 = 0
    for gold_answer in golds:
        exact_match, f1_score = get_metrics(preds, gold_answer)
        if gold_answer[0].strip():
            max_em = max(max_em, exact_match)
            max_f1 = max(max_f1, f1_score)
    return {"em": max_em, "f1": max_f1}

def get_metrics(predicted, gold):
    """
    Takes a predicted answer and a gold answer (that are both either a string or a list of
    strings), and returns exact match and the DROP F1 metric for the prediction. If you are
    writing a script for evaluating objects in memory (say, the output of predictions during
    validation, or while training), this is the function you want to call, after using
    :func:`answer_json_to_strings` when reading the gold answer from the released data file.
    """
    predicted_bags = _answer_to_bags(predicted)
    gold_bags = _answer_to_bags(gold)

    if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len(
        gold_bags[0]
    ):
        exact_match = 1.0
    else:
        exact_match = 0.0

    f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1])
    f1 = np.mean(f1_per_bag)
    f1 = round(f1, 2)
    return exact_match, f1

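# Worked example (hypothetical strings): bag-of-token F1 after DROP
# normalization. "12 yards" vs "12": both normalize "12" to "12.0", so
# precision = 1/2, recall = 1/1, F1 = 2*0.5*1.0/1.5 ≈ 0.67; exact match fails.
def _demo_get_metrics():
    print(get_metrics("12 yards", ("12",)))  # (0.0, 0.67)
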
def _answer_to_bags(answer):
    if isinstance(answer, (list, tuple)):
        raw_spans = answer
    else:
        raw_spans = [answer]
    normalized_spans = []
    token_bags = []
    for raw_span in raw_spans:
        normalized_span = _normalize(raw_span)
        normalized_spans.append(normalized_span)
        token_bags.append(set(normalized_span.split()))
    return normalized_spans, token_bags

def _align_bags(predicted, gold):
    """
    Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
    between them and gets maximum metric values over all the answers.
    """
    scores = np.zeros([len(gold), len(predicted)])
    for gold_index, gold_item in enumerate(gold):
        for pred_index, pred_item in enumerate(predicted):
            if _match_numbers_if_present(gold_item, pred_item):
                scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)
    row_ind, col_ind = linear_sum_assignment(-scores)

    max_scores = np.zeros([max(len(gold), len(predicted))])
    for row, column in zip(row_ind, col_ind):
        max_scores[row] = max(max_scores[row], scores[row, column])
    return max_scores

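# Illustration (hypothetical spans): with multiple spans, the Hungarian
# assignment (scipy's linear_sum_assignment on negated scores) pairs
# "Tom Brady" with "Tom Brady" (F1 = 1.0) and "Brees" with "Drew Brees"
# (F1 ≈ 0.67), so the per-bag mean is 0.83.
def _demo_alignment():
    print(get_metrics(["Tom Brady", "Brees"], ("Drew Brees", "Tom Brady")))  # (0.0, 0.83)
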
def _compute_f1(predicted_bag, gold_bag):
    intersection = len(gold_bag.intersection(predicted_bag))
    if not predicted_bag:
        precision = 1.0
    else:
        precision = intersection / float(len(predicted_bag))
    if not gold_bag:
        recall = 1.0
    else:
        recall = intersection / float(len(gold_bag))
    f1 = (
        (2 * precision * recall) / (precision + recall)
        if not (precision == 0.0 and recall == 0.0)
        else 0.0
    )
    return f1

def _match_numbers_if_present(gold_bag, predicted_bag):
    gold_numbers = set()
    predicted_numbers = set()
    for word in gold_bag:
        if _is_number(word):
            gold_numbers.add(word)
    for word in predicted_bag:
        if _is_number(word):
            predicted_numbers.add(word)
    if (not gold_numbers) or gold_numbers.intersection(predicted_numbers):
        return True
    return False

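# Illustration (hypothetical token bags): spans only score against each other
# when their numbers agree, or when the gold bag contains no numbers at all.
def _demo_match_numbers():
    print(_match_numbers_if_present({"12.0", "yards"}, {"12.0"}))  # True
    print(_match_numbers_if_present({"12.0", "yards"}, {"30.0"}))  # False (12.0 != 30.0)
    print(_match_numbers_if_present({"touchdown"}, {"30.0"}))  # True (no gold numbers)
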
def _is_number(text):
    try:
        float(text)
        return True
    except ValueError:
        return False


def _remove_articles(text):
    return _ARTICLES.sub(" ", text)


def _white_space_fix(text):
    return " ".join(text.split())


def _remove_punc(text):
    exclude = set(string.punctuation)
    if not _is_number(text):
        return "".join(ch for ch in text if ch not in exclude)
    else:
        return text


def _fix_number(text):
    return str(float(text)) if _is_number(text) else text


def _tokenize(text):
    return re.split(" |-", text)


def _normalize(answer):
    tokens = [
        _white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower()))))
        for token in _tokenize(answer)
    ]
    tokens = [token for token in tokens if token.strip()]
    normalized = " ".join(tokens).strip()
    return normalized

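# Illustration of the normalization pipeline on hypothetical spans: articles
# drop out, punctuation is stripped from non-numeric tokens, numeric tokens
# are canonicalized via float(), and hyphens split tokens.
def _demo_normalize():
    print(_normalize("The Bears, 42"))  # bears 42.0
    print(_normalize("forty-two"))  # forty two
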
-def doc_to_text(doc):
+def doc_to_text(doc) -> str:
     return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
         doc["premise"],
         doc["hypothesis"].strip()
...

@@ -15,7 +15,7 @@ def _preproc_doc(doc):
     return doc

-def doc_to_text(doc):
+def doc_to_text(doc) -> str:
     doc = _preproc_doc(doc)
     return f"Scenario 1: {doc['scenarios'][0]}\nScenario 2: {doc['scenarios'][1]}\nQuestion: Is Scenario 1 preferable?\nAnswer:"
...
# MGSM
### Paper
Title: `Language Models are Multilingual Chain-of-Thought Reasoners`
Abstract: https://arxiv.org/abs/2210.03057
Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems, proposed in the paper [Language models are multilingual chain-of-thought reasoners](http://arxiv.org/abs/2210.03057).
The same 250 problems from [GSM8K](https://arxiv.org/abs/2110.14168) were each translated by human annotators into 10 languages:
- Spanish
- French
- German
- Russian
- Chinese
- Japanese
- Thai
- Swahili
- Bengali
- Telugu
GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality linguistically diverse grade school math word problems. The dataset was created to support the task of question answering on basic mathematical problems that require multi-step reasoning.
You can find the input and targets for each of the ten languages (and English) as `.tsv` files.
We also include few-shot exemplars, manually translated into each language, in `exemplars.py`.
Homepage: https://github.com/google-research/url-nlp/tree/main/mgsm
### Citation
```
@misc{cobbe2021training,
title={Training Verifiers to Solve Math Word Problems},
author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
year={2021},
eprint={2110.14168},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{shi2022language,
title={Language Models are Multilingual Chain-of-Thought Reasoners},
author={Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and Jason Wei},
year={2022},
eprint={2210.03057},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
* `mgsm_direct`: Direct question
  * `mgsm_direct_bn`: Bengali
  * `mgsm_direct_de`: German
  * `mgsm_direct_en`: English
  * `mgsm_direct_es`: Spanish
  * `mgsm_direct_fr`: French
  * `mgsm_direct_ja`: Japanese
  * `mgsm_direct_ru`: Russian
  * `mgsm_direct_sw`: Swahili
  * `mgsm_direct_te`: Telugu
  * `mgsm_direct_th`: Thai
  * `mgsm_direct_zh`: Chinese
* `mgsm_cot_native`: Question with answer, followed by a chain-of-thought prompt in the same language as the dataset.
  * `mgsm_cot_native_bn`: Bengali
  * `mgsm_cot_native_de`: German
  * `mgsm_cot_native_en`: English
  * `mgsm_cot_native_es`: Spanish
  * `mgsm_cot_native_fr`: French
  * `mgsm_cot_native_ja`: Japanese
  * `mgsm_cot_native_ru`: Russian
  * `mgsm_cot_native_sw`: Swahili
  * `mgsm_cot_native_te`: Telugu
  * `mgsm_cot_native_th`: Thai
  * `mgsm_cot_native_zh`: Chinese
Exemplar samples: https://github.com/google-research/url-nlp/blob/main/mgsm/exemplars.py
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group: mgsm_direct
dataset_path: juletxara/mgsm
dataset_name: null # Overridden by language-specific config.
output_type: greedy_until
training_split: train
test_split: test
target_delimiter: ""
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
  do_sample: false
  temperature: 0.0
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
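The `filter_list` above post-processes each generation before `exact_match` scoring. A minimal sketch of what the two steps amount to, as plain Python on a hypothetical raw generation (an illustration of the configured behavior, not the harness's internal filter API):

```python
# Roughly: remove_whitespace drops leading whitespace from each candidate,
# then take_first keeps the first candidate for scoring.
raw_candidates = ["  11"]                       # model outputs for one request
stripped = [c.lstrip() for c in raw_candidates]  # remove_whitespace (roughly)
final = stripped[0]                              # take_first
print(final)  # 11
```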
# Generated by utils.py
dataset_name: bn
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"প্রশ্ন: "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_bn
# Generated by utils.py
dataset_name: de
doc_to_target: '{% if answer is not none %}{{answer[7+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAntwort"}}{% else %}{{"Frage: "+question+"\nAntwort"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_de
# Generated by utils.py
dataset_name: en
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"Question: "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_en
# Generated by utils.py
dataset_name: es
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"Pregunta: "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_es
# Generated by utils.py
dataset_name: fr
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"Question : "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_fr
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"問題: "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_ja
# Generated by utils.py
dataset_name: ru
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"Задача: "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_ru
# Generated by utils.py
dataset_name: sw
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"Swali: "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_sw
# Generated by utils.py
dataset_name: te
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"ప్రశ్న: "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_te
# Generated by utils.py
dataset_name: th
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"โจทย์: "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_th
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer"}}{% else %}{{"问题: "+question+"\nAnswer"}}{% endif %}'
include: direct_yaml
task: mgsm_direct_zh
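To see what these generated configs do, here is a small sketch rendering the `mgsm_direct_en` templates with `jinja2` directly, outside the harness. The docs are toy stand-ins: train rows carry the worked `answer` string, test rows only `answer_number`.

```python
# Render mgsm_direct_en's doc_to_text for both branches of the conditional.
from jinja2 import Template

tpl = Template(
    r'{% if answer is not none %}{{question+"\nAnswer"}}'
    r'{% else %}{{"Question: "+question+"\nAnswer"}}{% endif %}'
)
print(tpl.render(question="How many apples?", answer="Answer: 11"))
# How many apples?
# Answer
print(tpl.render(question="How many apples?", answer=None))
# Question: How many apples?
# Answer

# doc_to_target slices the "Answer:" prefix (6 letters + colon) off the gold
# string, which is why target_delimiter is "" in direct_yaml. The answer
# string here is hypothetical.
target = Template(r'{% if answer is not none %}{{answer[6+1:]}}{% else %}{{answer_number|string}}{% endif %}')
print(target.render(answer="Answer: 11", answer_number=11))  # " 11"
```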
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group: mgsm_cot_native
dataset_path: juletxara/mgsm
dataset_name: null # Overridden by language-specific config.
output_type: greedy_until
training_split: train
test_split: test
target_delimiter: " "
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
  do_sample: false
  temperature: 0.0
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
filter_list:
  - name: "get-answer"
    filter:
      - function: "regex"
        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
# Generated by utils.py
dataset_name: bn
doc_to_target: '{% if answer is not none %}{{answer[20+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"প্রশ্ন: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_cot_native_bn
# Generated by utils.py
dataset_name: de
doc_to_target: '{% if answer is not none %}{{answer[20+1:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Frage: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_cot_native_de