"comfy/vscode:/vscode.git/clone" did not exist on "1305fb294ca69d0a44d88c5bf7ce8c682abd0c8a"
Commit d627333a authored by lintangsutawika's avatar lintangsutawika
Browse files

merged with latest

parents 4156a005 4cda3a1c
import datasets
import sacrebleu
import numpy as np
from rouge_score import rouge_scorer, scoring
def process_results_mc2(doc, results):
    lls, is_greedy = zip(*results)

    # Split on the first `0` as everything before it is true (`1`).
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)
    # Compute the normalized probability mass for the correct answer.
    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))

    return {"acc": sum(p_true)}
def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:
    return dataset.map(preprocess_function)
def preprocess_function(examples):
    def _format_answers(answers):
        formatted_answers = []
        for answer in answers:
            answer = answer.strip()
            if len(answer):
                # Add a period after all answers.
                if answer[-1] != ".":
                    formatted_answers.append(answer + ".")
                else:
                    formatted_answers.append(answer)
        return formatted_answers

    incorrect_answers = _format_answers(examples["incorrect_answers"])
    correct_answers = _format_answers(examples["correct_answers"])
    if "I have no comment." not in correct_answers:
        correct_answers.append("I have no comment.")
    return {
        "question": examples["question"].strip(),
        "correct_answers": correct_answers,
        "incorrect_answers": incorrect_answers,
    }
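# Hypothetical illustration (not in the original file): on a toy example such as
#   {"question": " What is 2+2? ", "correct_answers": ["Four"], "incorrect_answers": ["Five"]}
# preprocess_function returns {"question": "What is 2+2?",
#   "correct_answers": ["Four.", "I have no comment."], "incorrect_answers": ["Five."]}.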
def process_results_gen(doc, results):
    completion = results[0]
    true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
    all_refs = true_refs + false_refs

    # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.

    # # BLEURT
    # bleurt_scores_true = self.bleurt.compute(
    #     predictions=[completion] * len(true_refs), references=true_refs
    # )["scores"]
    # bleurt_scores_false = self.bleurt.compute(
    #     predictions=[completion] * len(false_refs), references=false_refs
    # )["scores"]
    # bleurt_correct = max(bleurt_scores_true)
    # bleurt_incorrect = max(bleurt_scores_false)
    # bleurt_max = bleurt_correct
    # bleurt_diff = bleurt_correct - bleurt_incorrect
    # bleurt_acc = int(bleurt_correct > bleurt_incorrect)

    # BLEU
    bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]
    bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])
    bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])
    bleu_max = bleu_correct
    bleu_diff = bleu_correct - bleu_incorrect
    bleu_acc = int(bleu_correct > bleu_incorrect)

    # ROUGE-N
    rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
    # ROUGE-1
    rouge1_scores = [score["rouge1"] for score in rouge_scores]
    rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
    rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
    rouge1_max = rouge1_correct
    rouge1_diff = rouge1_correct - rouge1_incorrect
    rouge1_acc = int(rouge1_correct > rouge1_incorrect)
    # ROUGE-2
    rouge2_scores = [score["rouge2"] for score in rouge_scores]
    rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
    rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
    rouge2_max = rouge2_correct
    rouge2_diff = rouge2_correct - rouge2_incorrect
    rouge2_acc = int(rouge2_correct > rouge2_incorrect)
    # ROUGE-L
    rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
    rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
    rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
    rougeL_max = rougeL_correct
    rougeL_diff = rougeL_correct - rougeL_incorrect
    rougeL_acc = int(rougeL_correct > rougeL_incorrect)

    return {
        # "bleurt_max": bleurt_max,
        # "bleurt_acc": bleurt_acc,
        # "bleurt_diff": bleurt_diff,
        "bleu_max": bleu_max,
        "bleu_acc": bleu_acc,
        "bleu_diff": bleu_diff,
        "rouge1_max": rouge1_max,
        "rouge1_acc": rouge1_acc,
        "rouge1_diff": rouge1_diff,
        "rouge2_max": rouge2_max,
        "rouge2_acc": rouge2_acc,
        "rouge2_diff": rouge2_diff,
        "rougeL_max": rougeL_max,
        "rougeL_acc": rougeL_acc,
        "rougeL_diff": rougeL_diff,
    }
def bleu(refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    score = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    ).score
    return score
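# Hypothetical usage (not in the original file): a single prediction scored against
# a single reference stream, e.g. bleu([["the cat is on the mat"]], ["the cat sat on the mat"]),
# which returns a score on sacrebleu's 0-100 scale.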
def rouge(refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    # Add newlines between sentences to correctly compute `rougeLsum`.
    def _prepare_summary(summary):
        summary = summary.replace(" . ", ".\n")
        return summary

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        ref = _prepare_summary(ref)
        pred = _prepare_summary(pred)
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
...@@ -28,7 +28,13 @@ Homepage: https://github.com/openai/gpt-3/tree/master/data
}
```
### Groups and Tasks
#### Groups
* `unscramble`
#### Tasks
* `anagrams1` - Anagrams of all but the first and last letter.
* `anagrams2` - Anagrams of all but the first and last 2 letters.
...
group:
  - unscramble
task: anagrams1
dataset_path: EleutherAI/unscramble
dataset_name: mid_word_1_anagrams
...
group:
  - unscramble
task: anagrams2
dataset_path: EleutherAI/unscramble
dataset_name: mid_word_2_anagrams
...
group:
  - unscramble
task: cycle_letters
dataset_path: EleutherAI/unscramble
dataset_name: cycle_letters_in_word
...
group:
  - unscramble
task: random_insertion
dataset_path: EleutherAI/unscramble
dataset_name: random_insertion_in_word
...
group:
  - unscramble
task: reversed_words
dataset_path: EleutherAI/unscramble
dataset_name: reversed_words
...
# WEBQs
### Paper
...@@ -33,9 +33,14 @@ Homepage: `https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a
}
```
### Groups and Tasks
#### Groups
* `freebase`
#### Tasks
List or describe tasks defined in this folder, and their names here:
* `webqs`: `Questions with multiple accepted answers.`
### Checklist
...
group:
  - freebase
  - question_answer
task: webqs
dataset_path: web_questions
dataset_name: null
...
...@@ -26,7 +26,13 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `wikitext`: measure perplexity on the Wikitext dataset, via rolling loglikelihoods.
...
group:
  - perplexity
  - loglikelihood_rolling
task: wikitext
dataset_path: EleutherAI/wikitext_document_level
dataset_name: wikitext-2-raw-v1
...
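For intuition, here is a minimal sketch of how corpus-level word perplexity can be derived from the per-document rolling loglikelihoods that the `wikitext` config above requests; the numbers and the aggregation are illustrative, not the harness's exact bookkeeping.

```python
import math

# Hypothetical per-document summed log-likelihoods (nats) and word counts.
loglikelihoods = [-1203.4, -987.2]
word_counts = [412, 355]

# Word perplexity: exponentiate the negative average log-likelihood per word.
word_perplexity = math.exp(-sum(loglikelihoods) / sum(word_counts))
print(word_perplexity)
```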
# WinoGrande
### Paper
Title: `WinoGrande: An Adversarial Winograd Schema Challenge at Scale`
Abstract: https://arxiv.org/abs/1907.10641
WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge
(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and
robustness against the dataset-specific bias. Formulated as a fill-in-a-blank
task with binary options, the goal is to choose the right option for a given
sentence which requires commonsense reasoning.
NOTE: This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
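A rough sketch of that partial-scoring idea, using a hypothetical `loglikelihood_fn(context, continuation)` in place of the model call (an illustration of the approach, not the harness implementation):

```python
import numpy as np

def pick_option(loglikelihood_fn, sentence, options, blank="_"):
    # Fill the blank with each option, then score only the text that follows
    # the blank, conditioned on the filled-in prefix.
    prefix, _, suffix = sentence.partition(blank)
    scores = [
        loglikelihood_fn(context=prefix + option, continuation=suffix)
        for option in options
    ]
    return int(np.argmax(scores))
```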
Homepage: https://leaderboard.allenai.org/winogrande/submissions/public
### Citation
```
@article{sakaguchi2019winogrande,
title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
journal={arXiv preprint arXiv:1907.10641},
year={2019}
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `winogrande`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# WMT16
### Paper
Title: `Findings of the 2016 Conference on Machine Translation`
Abstract: http://www.aclweb.org/anthology/W/W16/W16-2301
Homepage: https://huggingface.co/datasets/wmt16
### Citation
```
@InProceedings{bojar-EtAl:2016:WMT1,
author = {Bojar, Ond{\v{r}}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huck, Matthias and Jimeno Yepes, Antonio and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Neveol, Aurelie and Neves, Mariana and Popel, Martin and Post, Matt and Rubino, Raphael and Scarton, Carolina and Specia, Lucia and Turchi, Marco and Verspoor, Karin and Zampieri, Marcos},
title = {Findings of the 2016 Conference on Machine Translation},
booktitle = {Proceedings of the First Conference on Machine Translation},
month = {August},
year = {2016},
address = {Berlin, Germany},
publisher = {Association for Computational Linguistics},
pages = {131--198},
url = {http://www.aclweb.org/anthology/W/W16/W16-2301}
}
```
### Groups and Tasks
#### Groups
* `wmt-t5-prompt`: Group for all wmt tasks with prompt templates used for T5 (`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`)
#### Tasks
With specific prompt styles
* `wmt-ro-en-t5-prompt`: WMT16 with the prompt template used for T5
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import evaluate


def bleu(predictions, references):
    return (predictions[0], references[0])


def agg_bleu(items):
    bleu_fn = evaluate.load("bleu")
    predictions, references = zip(*items)
    return bleu_fn.compute(predictions=predictions, references=references)["bleu"]
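As a hypothetical illustration (not part of the commit) of how the two functions above fit together: each item pairs one model prediction with its reference(s), here given as a list of strings, the format the `evaluate` BLEU metric expects; `agg_bleu` then computes one corpus-level BLEU over all collected pairs.

```python
# Assumes the `bleu`/`agg_bleu` definitions above; the strings are made up.
items = [
    ("the cat sat on the mat", ["the cat sat on the mat"]),
    ("a quick brown fox", ["the quick brown fox jumps"]),
]
print(agg_bleu(items))  # corpus-level BLEU in [0, 1]
```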
group:
  - wmt-t5-prompt
task: wmt-ro-en-t5-prompt
dataset_path: wmt16
dataset_name: ro-en
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "translate English to Romanian: {{translation.en}}"
doc_to_target: "{{translation.ro}}"
metric_list:
  - metric: wer
    aggregation: mean
    higher_is_better: false
  - metric: !function metrics.bleu
    aggregation: !function metrics.agg_bleu
    higher_is_better: true
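To see how the `doc_to_text` / `doc_to_target` templates above could render for a single document, here is a sketch assuming Jinja-style templating (as the `{{...}}` syntax suggests) and a made-up `ro-en` example:

```python
from jinja2 import Template

doc = {"translation": {"en": "Hello, world!", "ro": "Salut, lume!"}}
prompt = Template("translate English to Romanian: {{translation.en}}").render(**doc)
target = Template("{{translation.ro}}").render(**doc)
print(prompt)  # translate English to Romanian: Hello, world!
print(target)  # Salut, lume!
```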
# XCOPA
### Paper
Title: `XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning`
Abstract: https://ducdauge.github.io/files/xcopa.pdf
The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages.
The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe.
...@@ -8,6 +13,8 @@ All the details about the creation of XCOPA and the implementation of the baseli
Homepage: https://github.com/cambridgeltl/xcopa
### Citation
```
@inproceedings{ponti2020xcopa,
title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
...@@ -17,3 +24,37 @@ Homepage: https://github.com/cambridgeltl/xcopa
url={https://ducdauge.github.io/files/xcopa.pdf}
}
```
### Groups and Tasks
#### Groups
* `xcopa`
#### Tasks
* `xcopa_et`: Estonian
* `xcopa_ht`: Haitian Creole
* `xcopa_id`: Indonesian
* `xcopa_it`: Italian
* `xcopa_qu`: Cusco-Collao Quechua
* `xcopa_sw`: Kiswahili
* `xcopa_ta`: Tamil
* `xcopa_th`: Thai
* `xcopa_tr`: Turkish
* `xcopa_vi`: Vietnamese
* `xcopa_zh`: Mandarin Chinese
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# XStoryCloze
### Paper
Title: `Few-shot Learning with Multilingual Language Models`
Abstract: https://arxiv.org/abs/2112.10668
XStoryCloze consists of the professional translation of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) into 10 non-English languages. This dataset is released by Meta AI.
Homepage: https://github.com/facebookresearch/fairseq/pull/4820
### Citation
```
@article{DBLP:journals/corr/abs-2112-10668,
author = {Xi Victoria Lin and
Todor Mihaylov and
Mikel Artetxe and
Tianlu Wang and
Shuohui Chen and
Daniel Simig and
Myle Ott and
Naman Goyal and
Shruti Bhosale and
Jingfei Du and
Ramakanth Pasunuru and
Sam Shleifer and
Punit Singh Koura and
Vishrav Chaudhary and
Brian O'Horo and
Jeff Wang and
Luke Zettlemoyer and
Zornitsa Kozareva and
Mona T. Diab and
Veselin Stoyanov and
Xian Li},
title = {Few-shot Learning with Multilingual Language Models},
journal = {CoRR},
volume = {abs/2112.10668},
year = {2021},
url = {https://arxiv.org/abs/2112.10668},
eprinttype = {arXiv},
eprint = {2112.10668},
timestamp = {Tue, 04 Jan 2022 15:59:27 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
### Groups and Tasks
#### Groups
* `xstorycloze`
#### Tasks
* `xstorycloze_ar`: Arabic
* `xstorycloze_en`: English
* `xstorycloze_es`: Spanish
* `xstorycloze_eu`: Basque
* `xstorycloze_hi`: Hindi
* `xstorycloze_id`: Indonesian
* `xstorycloze_my`: Burmese
* `xstorycloze_ru`: Russian
* `xstorycloze_sw`: Swahili
* `xstorycloze_te`: Telugu
* `xstorycloze_zh`: Chinese
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
...@@ -31,7 +31,13 @@ Homepage: `https://huggingface.co/datasets/Muennighoff/xwinograd`
}
```
### Groups and Tasks
#### Groups
* `xwinograd`
#### Tasks
List or describe tasks defined in this folder, and their names here:
* `xwinograd_en`: Winograd schema challenges in English.
...
...@@ -2,9 +2,7 @@
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group:
  - xwinograd
dataset_path: Muennighoff/xwinograd
dataset_name: null # Overridden by language-specific config.
output_type: multiple_choice
...
...@@ -2,7 +2,8 @@
### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:`
...@@ -16,11 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
BibTeX-formatted citation goes here
```
### Groups and Tasks
#### Groups
* `group_name`: `Short description`
#### Tasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: ...
### Checklist
...