"comfy/vscode:/vscode.git/clone" did not exist on "1305fb294ca69d0a44d88c5bf7ce8c682abd0c8a"
Commit d627333a authored by lintangsutawika's avatar lintangsutawika
Browse files

merged with latest

parents 4156a005 4cda3a1c
import datasets
import sacrebleu
import numpy as np
from rouge_score import rouge_scorer, scoring
def process_results_mc2(doc, results):
    lls, is_greedy = zip(*results)

    # Split on the first `0` as everything before it is true (`1`).
    split_idx = list(doc["mc2_targets"]["labels"]).index(0)
    # Compute the normalized probability mass for the correct answer.
    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
    p_true = p_true / (sum(p_true) + sum(p_false))

    return {"acc": sum(p_true)}
def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset:
    return dataset.map(preprocess_function)
def preprocess_function(examples):
    def _format_answers(answers):
        formatted_answers = []
        for answer in answers:
            answer = answer.strip()
            if len(answer):
                # Add a period after all answers.
                if answer[-1] != ".":
                    formatted_answers.append(answer + ".")
                else:
                    formatted_answers.append(answer)
        return formatted_answers

    incorrect_answers = _format_answers(examples["incorrect_answers"])
    correct_answers = _format_answers(examples["correct_answers"])
    if "I have no comment." not in correct_answers:
        correct_answers.append("I have no comment.")
    return {
        "question": examples["question"].strip(),
        "correct_answers": correct_answers,
        "incorrect_answers": incorrect_answers,
    }
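# Hypothetical illustration (not in the original file): on a toy example such as
#   {"question": " What is 2+2? ", "correct_answers": ["Four"], "incorrect_answers": ["Five"]}
# preprocess_function returns {"question": "What is 2+2?",
#   "correct_answers": ["Four.", "I have no comment."], "incorrect_answers": ["Five."]}.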
def process_results_gen(doc, results):
    completion = results[0]
    true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"]
    all_refs = true_refs + false_refs

    # Process the sentence-level BLEURT, BLEU, and ROUGE for similarity measures.

    # # BLEURT
    # bleurt_scores_true = self.bleurt.compute(
    #     predictions=[completion] * len(true_refs), references=true_refs
    # )["scores"]
    # bleurt_scores_false = self.bleurt.compute(
    #     predictions=[completion] * len(false_refs), references=false_refs
    # )["scores"]
    # bleurt_correct = max(bleurt_scores_true)
    # bleurt_incorrect = max(bleurt_scores_false)
    # bleurt_max = bleurt_correct
    # bleurt_diff = bleurt_correct - bleurt_incorrect
    # bleurt_acc = int(bleurt_correct > bleurt_incorrect)

    # BLEU
    bleu_scores = [bleu([[ref]], [completion]) for ref in all_refs]
    bleu_correct = np.nanmax(bleu_scores[: len(true_refs)])
    bleu_incorrect = np.nanmax(bleu_scores[len(true_refs) :])
    bleu_max = bleu_correct
    bleu_diff = bleu_correct - bleu_incorrect
    bleu_acc = int(bleu_correct > bleu_incorrect)

    # ROUGE-N
    rouge_scores = [rouge([ref], [completion]) for ref in all_refs]
    # ROUGE-1
    rouge1_scores = [score["rouge1"] for score in rouge_scores]
    rouge1_correct = np.nanmax(rouge1_scores[: len(true_refs)])
    rouge1_incorrect = np.nanmax(rouge1_scores[len(true_refs) :])
    rouge1_max = rouge1_correct
    rouge1_diff = rouge1_correct - rouge1_incorrect
    rouge1_acc = int(rouge1_correct > rouge1_incorrect)
    # ROUGE-2
    rouge2_scores = [score["rouge2"] for score in rouge_scores]
    rouge2_correct = np.nanmax(rouge2_scores[: len(true_refs)])
    rouge2_incorrect = np.nanmax(rouge2_scores[len(true_refs) :])
    rouge2_max = rouge2_correct
    rouge2_diff = rouge2_correct - rouge2_incorrect
    rouge2_acc = int(rouge2_correct > rouge2_incorrect)
    # ROUGE-L
    rougeL_scores = [score["rougeLsum"] for score in rouge_scores]
    rougeL_correct = np.nanmax(rougeL_scores[: len(true_refs)])
    rougeL_incorrect = np.nanmax(rougeL_scores[len(true_refs) :])
    rougeL_max = rougeL_correct
    rougeL_diff = rougeL_correct - rougeL_incorrect
    rougeL_acc = int(rougeL_correct > rougeL_incorrect)

    return {
        # "bleurt_max": bleurt_max,
        # "bleurt_acc": bleurt_acc,
        # "bleurt_diff": bleurt_diff,
        "bleu_max": bleu_max,
        "bleu_acc": bleu_acc,
        "bleu_diff": bleu_diff,
        "rouge1_max": rouge1_max,
        "rouge1_acc": rouge1_acc,
        "rouge1_diff": rouge1_diff,
        "rouge2_max": rouge2_max,
        "rouge2_acc": rouge2_acc,
        "rouge2_diff": rouge2_diff,
        "rougeL_max": rougeL_max,
        "rougeL_acc": rougeL_acc,
        "rougeL_diff": rougeL_diff,
    }
def bleu(refs, preds):
    """
    Returns `t5` style BLEU scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    """
    score = sacrebleu.corpus_bleu(
        preds,
        refs,
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    ).score
    return score
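# Hypothetical usage (not in the original file): a single prediction scored against
# a single reference stream, e.g. bleu([["the cat is on the mat"]], ["the cat sat on the mat"]),
# which returns a score on sacrebleu's 0-100 scale.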
def rouge(refs, preds):
    """
    Returns `t5` style ROUGE scores. See the related implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)

    # Add newlines between sentences to correctly compute `rougeLsum`.
    def _prepare_summary(summary):
        summary = summary.replace(" . ", ".\n")
        return summary

    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        ref = _prepare_summary(ref)
        pred = _prepare_summary(pred)
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return {type: result[type].mid.fmeasure * 100 for type in rouge_types}
...@@ -28,7 +28,13 @@ Homepage: https://github.com/openai/gpt-3/tree/master/data
}
```
### Groups and Tasks
#### Groups
* `unscramble`
#### Tasks
* `anagrams1` - Anagrams of all but the first and last letter.
* `anagrams2` - Anagrams of all but the first and last 2 letters.
...
group:
  - unscramble
task: anagrams1
dataset_path: EleutherAI/unscramble
dataset_name: mid_word_1_anagrams
...
group:
  - unscramble
task: anagrams2
dataset_path: EleutherAI/unscramble
dataset_name: mid_word_2_anagrams
...
group:
  - unscramble
task: cycle_letters
dataset_path: EleutherAI/unscramble
dataset_name: cycle_letters_in_word
...
group:
  - unscramble
task: random_insertion
dataset_path: EleutherAI/unscramble
dataset_name: random_insertion_in_word
...
group:
  - unscramble
task: reversed_words
dataset_path: EleutherAI/unscramble
dataset_name: reversed_words
...
# WEBQs
### Paper
...@@ -33,9 +33,14 @@ Homepage: `https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a
}
```
### Groups and Tasks
#### Groups
* `freebase`
#### Tasks
List or describe tasks defined in this folder, and their names here:
* `webqs`: `Questions with multiple accepted answers.`
### Checklist
...
group:
  - freebase
  - question_answer
task: webqs
dataset_path: web_questions
dataset_name: null
...
...@@ -26,7 +26,13 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `wikitext`: measure perplexity on the Wikitext dataset, via rolling loglikelihoods.
...
group:
  - perplexity
  - loglikelihood_rolling
task: wikitext
dataset_path: EleutherAI/wikitext_document_level
dataset_name: wikitext-2-raw-v1
...
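For intuition, here is a minimal sketch of how corpus-level word perplexity can be derived from the per-document rolling loglikelihoods that the `wikitext` config above requests; the numbers and the aggregation are illustrative, not the harness's exact bookkeeping.

```python
import math

# Hypothetical per-document summed log-likelihoods (nats) and word counts.
loglikelihoods = [-1203.4, -987.2]
word_counts = [412, 355]

# Word perplexity: exponentiate the negative average log-likelihood per word.
word_perplexity = math.exp(-sum(loglikelihoods) / sum(word_counts))
print(word_perplexity)
```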
# WinoGrande
### Paper
Title: `WinoGrande: An Adversarial Winograd Schema Challenge at Scale`
Abstract: https://arxiv.org/abs/1907.10641
WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge
(Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and
robustness against the dataset-specific bias. Formulated as a fill-in-a-blank
task with binary options, the goal is to choose the right option for a given
sentence which requires commonsense reasoning.
NOTE: This evaluation of Winogrande uses partial evaluation as described by
Trinh & Le in Simple Method for Commonsense Reasoning (2018).
See: https://arxiv.org/abs/1806.02847
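A rough sketch of that partial-scoring idea, using a hypothetical `loglikelihood_fn(context, continuation)` in place of the model call (an illustration of the approach, not the harness implementation):

```python
import numpy as np

def pick_option(loglikelihood_fn, sentence, options, blank="_"):
    # Fill the blank with each option, then score only the text that follows
    # the blank, conditioned on the filled-in prefix.
    prefix, _, suffix = sentence.partition(blank)
    scores = [
        loglikelihood_fn(context=prefix + option, continuation=suffix)
        for option in options
    ]
    return int(np.argmax(scores))
```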
Homepage: https://leaderboard.allenai.org/winogrande/submissions/public
### Citation
```
@article{sakaguchi2019winogrande,
title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale},
author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
journal={arXiv preprint arXiv:1907.10641},
year={2019}
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `winogrande`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# WMT16
### Paper
Title: `Findings of the 2016 Conference on Machine Translation`
Abstract: http://www.aclweb.org/anthology/W/W16/W16-2301
Homepage: https://huggingface.co/datasets/wmt16
### Citation
```
@InProceedings{bojar-EtAl:2016:WMT1,
author = {Bojar, Ond{\v{r}}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huck, Matthias and Jimeno Yepes, Antonio and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Neveol, Aurelie and Neves, Mariana and Popel, Martin and Post, Matt and Rubino, Raphael and Scarton, Carolina and Specia, Lucia and Turchi, Marco and Verspoor, Karin and Zampieri, Marcos},
title = {Findings of the 2016 Conference on Machine Translation},
booktitle = {Proceedings of the First Conference on Machine Translation},
month = {August},
year = {2016},
address = {Berlin, Germany},
publisher = {Association for Computational Linguistics},
pages = {131--198},
url = {http://www.aclweb.org/anthology/W/W16/W16-2301}
}
```
### Groups and Tasks
#### Groups
* `wmt-t5-prompt`: Group for all wmt tasks with prompt templates used for T5 (`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`)
#### Tasks
With specific prompt styles
* `wmt-ro-en-t5-prompt`: WMT16 with the prompt template used for T5
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import evaluate


def bleu(predictions, references):
    return (predictions[0], references[0])


def agg_bleu(items):
    bleu_fn = evaluate.load("bleu")
    predictions, references = zip(*items)
    return bleu_fn.compute(predictions=predictions, references=references)["bleu"]
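As a hypothetical illustration (not part of the commit) of how the two functions above fit together: each item pairs one model prediction with its reference(s), here given as a list of strings, the format the `evaluate` BLEU metric expects; `agg_bleu` then computes one corpus-level BLEU over all collected pairs.

```python
# Assumes the `bleu`/`agg_bleu` definitions above; the strings are made up.
items = [
    ("the cat sat on the mat", ["the cat sat on the mat"]),
    ("a quick brown fox", ["the quick brown fox jumps"]),
]
print(agg_bleu(items))  # corpus-level BLEU in [0, 1]
```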
group:
  - wmt-t5-prompt
task: wmt-ro-en-t5-prompt
dataset_path: wmt16
dataset_name: ro-en
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "translate English to Romanian: {{translation.en}}"
doc_to_target: "{{translation.ro}}"
metric_list:
  - metric: wer
    aggregation: mean
    higher_is_better: false
  - metric: !function metrics.bleu
    aggregation: !function metrics.agg_bleu
    higher_is_better: true
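To see how the `doc_to_text` / `doc_to_target` templates above could render for a single document, here is a sketch assuming Jinja-style templating (as the `{{...}}` syntax suggests) and a made-up `ro-en` example:

```python
from jinja2 import Template

doc = {"translation": {"en": "Hello, world!", "ro": "Salut, lume!"}}
prompt = Template("translate English to Romanian: {{translation.en}}").render(**doc)
target = Template("{{translation.ro}}").render(**doc)
print(prompt)  # translate English to Romanian: Hello, world!
print(target)  # Salut, lume!
```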
# XCOPA
### Paper
Title: `XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning`
Abstract: https://ducdauge.github.io/files/xcopa.pdf
The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages.
The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe.
...@@ -8,6 +13,8 @@ All the details about the creation of XCOPA and the implementation of the baseli
Homepage: https://github.com/cambridgeltl/xcopa
### Citation
```
@inproceedings{ponti2020xcopa,
title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
...@@ -17,3 +24,37 @@ Homepage: https://github.com/cambridgeltl/xcopa
url={https://ducdauge.github.io/files/xcopa.pdf}
}
```
### Groups and Tasks
#### Groups
* `xcopa`
#### Tasks
* `xcopa_et`: Estonian
* `xcopa_ht`: Haitian Creole
* `xcopa_id`: Indonesian
* `xcopa_it`: Italian
* `xcopa_qu`: Cusco-Collao Quechua
* `xcopa_sw`: Kiswahili
* `xcopa_ta`: Tamil
* `xcopa_th`: Thai
* `xcopa_tr`: Turkish
* `xcopa_vi`: Vietnamese
* `xcopa_zh`: Mandarin Chinese
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# XStoryCloze
### Paper
Title: `Few-shot Learning with Multilingual Language Models`
Abstract: https://arxiv.org/abs/2112.10668
XStoryCloze consists of the professional translation of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) into 10 non-English languages. This dataset is released by Meta AI.
Homepage: https://github.com/facebookresearch/fairseq/pull/4820
### Citation
```
@article{DBLP:journals/corr/abs-2112-10668,
author = {Xi Victoria Lin and
Todor Mihaylov and
Mikel Artetxe and
Tianlu Wang and
Shuohui Chen and
Daniel Simig and
Myle Ott and
Naman Goyal and
Shruti Bhosale and
Jingfei Du and
Ramakanth Pasunuru and
Sam Shleifer and
Punit Singh Koura and
Vishrav Chaudhary and
Brian O'Horo and
Jeff Wang and
Luke Zettlemoyer and
Zornitsa Kozareva and
Mona T. Diab and
Veselin Stoyanov and
Xian Li},
title = {Few-shot Learning with Multilingual Language Models},
journal = {CoRR},
volume = {abs/2112.10668},
year = {2021},
url = {https://arxiv.org/abs/2112.10668},
eprinttype = {arXiv},
eprint = {2112.10668},
timestamp = {Tue, 04 Jan 2022 15:59:27 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
### Groups and Tasks
#### Groups
* `xstorycloze`
#### Tasks
* `xstorycloze_ar`: Arabic
* `xstorycloze_en`: English
* `xstorycloze_es`: Spanish
* `xstorycloze_eu`: Basque
* `xstorycloze_hi`: Hindi
* `xstorycloze_id`: Indonesian
* `xstorycloze_my`: Burmese
* `xstorycloze_ru`: Russian
* `xstorycloze_sw`: Swahili
* `xstorycloze_te`: Telugu
* `xstorycloze_zh`: Chinese
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
...@@ -31,7 +31,13 @@ Homepage: `https://huggingface.co/datasets/Muennighoff/xwinograd`
}
```
### Groups and Tasks
#### Groups
* `xwinograd`
#### Tasks
List or describe tasks defined in this folder, and their names here:
* `xwinograd_en`: Winograd schema challenges in English.
...
...@@ -2,9 +2,7 @@
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group:
  - xwinograd
dataset_path: Muennighoff/xwinograd
dataset_name: null # Overridden by language-specific config.
output_type: multiple_choice
...
...@@ -2,7 +2,8 @@
### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:`
...@@ -16,11 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
BibTeX-formatted citation goes here
```
### Groups and Tasks
#### Groups
* `group_name`: `Short description`
#### Tasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: ...
### Checklist
...