Commit 4b87456d authored by lintangsutawika

made fixes

parent 64c76fc3
@@ -1002,37 +1002,38 @@ class ConfigurableTask(Task):
                 choices = self.doc_to_choice(doc)
                 gold = choices[gold]
 
-            for key, result in zip(self._metric_fn_list.keys(), results):
-                if self.multiple_target:
-                    # in the case where we have multiple targets,
-                    # return true if any are true
-                    # TODO: this may break for multipLe_target, non zero-or-1 metrics
-                    scores = []
-                    for gold_option in gold:
-                        res = self._metric_fn_list[key](
-                            references=[gold_option],
-                            predictions=[result],
-                            **self._metric_fn_kwargs[key],
-                        )
-                        if isinstance(res, dict):
-                            # TODO: this handles the case where HF evaluate returns a dict.
-                            res = res[key]
-                        scores.append(res)
-                    if any(scores):
-                        result = 1.0
-                    else:
-                        result = 0.0
-                else:
-                    result = self._metric_fn_list[key](
-                        references=[gold],
-                        predictions=[result],
-                        **self._metric_fn_kwargs[key],
-                    )
-
-                if isinstance(result, dict):
-                    result_dict.update(result)
-                else:
-                    result_dict[key] = result
+            for metric in self._metric_fn_list.keys():
+                for result in results:
+                    if self.multiple_target:
+                        # in the case where we have multiple targets,
+                        # return true if any are true
+                        # TODO: this may break for multipLe_target, non zero-or-1 metrics
+                        scores = []
+                        for gold_option in gold:
+                            res = self._metric_fn_list[metric](
+                                references=[gold_option],
+                                predictions=[result],
+                                **self._metric_fn_kwargs[metric],
+                            )
+                            if isinstance(res, dict):
+                                # TODO: this handles the case where HF evaluate returns a dict.
+                                res = res[metric]
+                            scores.append(res)
+                        if any(scores):
+                            result = 1.0
+                        else:
+                            result = 0.0
+                    else:
+                        result = self._metric_fn_list[metric](
+                            references=[gold],
+                            predictions=[result],
+                            **self._metric_fn_kwargs[metric],
+                        )
+
+                    if isinstance(result, dict):
+                        result_dict.update(result)
+                    else:
+                        result_dict[metric] = result
         else:
             raise ValueError(
                 f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
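For context, here is a minimal standalone sketch (not part of the commit) of the multiple-target scoring rule in the diff above: a prediction counts as correct if it matches any of the acceptable gold answers, assuming a metric that returns 0 or 1.

```python
# Sketch only: mirrors the "return true if any are true" rule from the diff,
# assuming a metric that returns 0.0 or 1.0 (e.g. exact match).
def exact_match(references, predictions):
    return float(references[0].strip() == predictions[0].strip())


def score_multiple_target(golds, prediction, metric_fn=exact_match):
    # Score the prediction against every acceptable gold answer;
    # the example counts as correct if any single gold matches.
    scores = [metric_fn(references=[g], predictions=[prediction]) for g in golds]
    return 1.0 if any(scores) else 0.0


print(score_multiple_target(["yes", "yeah"], "yeah"))  # 1.0
print(score_multiple_target(["yes", "yeah"], "no"))    # 0.0
```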
# WMT16
### Paper
Title: `Findings of the 2016 Conference on Machine Translation`
Abstract: http://www.aclweb.org/anthology/W/W16/W16-2301
Homepage: https://huggingface.co/datasets/wmt16
### Citation
```
@InProceedings{bojar-EtAl:2016:WMT1,
  author    = {Bojar, Ond{\v{r}}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huck, Matthias and Jimeno Yepes, Antonio and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Neveol, Aurelie and Neves, Mariana and Popel, Martin and Post, Matt and Rubino, Raphael and Scarton, Carolina and Specia, Lucia and Turchi, Marco and Verspoor, Karin and Zampieri, Marcos},
title = {Findings of the 2016 Conference on Machine Translation},
booktitle = {Proceedings of the First Conference on Machine Translation},
month = {August},
year = {2016},
address = {Berlin, Germany},
publisher = {Association for Computational Linguistics},
pages = {131--198},
url = {http://www.aclweb.org/anthology/W/W16/W16-2301}
}
```
### Groups and Tasks
#### Groups
* `wmt-t5-prompt`: Group for all wmt tasks with prompt templates used for T5 (`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`)
#### Tasks
Tasks with specific prompt styles:
* `wmt-ro-en-t5-prompt`: English-to-Romanian translation on WMT16 (`ro-en` config) using the prompt template from T5; see the usage sketch below.
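A hedged sketch of how this task might be run through the harness's Python API. The `simple_evaluate` entry point and its arguments are assumed from recent lm-evaluation-harness releases and may differ on this branch; the checkpoint name is arbitrary.

```python
# Assumed API: lm_eval.simple_evaluate as exposed in recent harness releases.
# The model checkpoint below is illustrative only.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=google/flan-t5-small",
    tasks=["wmt-ro-en-t5-prompt"],
    batch_size=8,
)
print(results["results"]["wmt-ro-en-t5-prompt"])
```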
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import evaluate


def bleu(predictions, references):
    # Compute corpus BLEU with the HF `evaluate` package and return the
    # scalar score the harness aggregates (compute() returns a dict).
    bleu_fn = evaluate.load("bleu")
    results = bleu_fn.compute(predictions=predictions, references=references)
    return results["bleu"]
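A quick sanity check of the `bleu()` helper above. The strings are hypothetical, and `evaluate.load("bleu")` fetches the metric script on first use.

```python
# Hypothetical usage of the bleu() helper; strings are illustrative only.
preds = ["Pisica stă pe covor."]
refs = [["Pisica stă pe covor."]]  # one list of reference strings per prediction
print(bleu(predictions=preds, references=refs))  # 1.0 for an exact match
```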
group:
  - wmt-t5-prompt
task: wmt-ro-en-t5-prompt
dataset_path: wmt16
dataset_name: ro-en
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "translate English to Romanian: {{translation.en}}"
doc_to_target: "{{translation.ro}}"
metric_list:
  - metric: wer
    aggregation: mean
    higher_is_better: false
  - metric: !function metrics.bleu
    aggregation: mean
    higher_is_better: true
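To see what the two templates produce, here is a small sketch (not part of the task files) that renders `doc_to_text` and `doc_to_target` for one made-up WMT16 `ro-en` record, assuming the harness's Jinja2-style templating.

```python
# Sketch only: renders the YAML templates for one invented wmt16 ro-en record.
from jinja2 import Template

doc = {"translation": {"en": "The cat sat on the mat.", "ro": "Pisica stătea pe covor."}}

doc_to_text = Template("translate English to Romanian: {{translation.en}}")
doc_to_target = Template("{{translation.ro}}")

print(doc_to_text.render(**doc))    # translate English to Romanian: The cat sat on the mat.
print(doc_to_target.render(**doc))  # Pisica stătea pe covor.
```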