Commit 4b87456d authored by lintangsutawika

made fixes

parent 64c76fc3
@@ -1002,37 +1002,38 @@ class ConfigurableTask(Task):
                 choices = self.doc_to_choice(doc)
                 gold = choices[gold]
 
-            for key, result in zip(self._metric_fn_list.keys(), results):
-                if self.multiple_target:
-                    # in the case where we have multiple targets,
-                    # return true if any are true
-                    # TODO: this may break for multipLe_target, non zero-or-1 metrics
-                    scores = []
-                    for gold_option in gold:
-                        res = self._metric_fn_list[key](
-                            references=[gold_option],
-                            predictions=[result],
-                            **self._metric_fn_kwargs[key],
-                        )
-                        if isinstance(res, dict):
-                            # TODO: this handles the case where HF evaluate returns a dict.
-                            res = res[key]
-                        scores.append(res)
-                    if any(scores):
-                        result = 1.0
-                    else:
-                        result = 0.0
-                else:
-                    result = self._metric_fn_list[key](
-                        references=[gold],
-                        predictions=[result],
-                        **self._metric_fn_kwargs[key],
-                    )
-
-                if isinstance(result, dict):
-                    result_dict.update(result)
-                else:
-                    result_dict[key] = result
+            for metric in self._metric_fn_list.keys():
+                for result in results:
+                    if self.multiple_target:
+                        # in the case where we have multiple targets,
+                        # return true if any are true
+                        # TODO: this may break for multipLe_target, non zero-or-1 metrics
+                        scores = []
+                        for gold_option in gold:
+                            res = self._metric_fn_list[metric](
+                                references=[gold_option],
+                                predictions=[result],
+                                **self._metric_fn_kwargs[metric],
+                            )
+                            if isinstance(res, dict):
+                                # TODO: this handles the case where HF evaluate returns a dict.
+                                res = res[metric]
+                            scores.append(res)
+                        if any(scores):
+                            result = 1.0
+                        else:
+                            result = 0.0
+                    else:
+                        result = self._metric_fn_list[metric](
+                            references=[gold],
+                            predictions=[result],
+                            **self._metric_fn_kwargs[metric],
+                        )
+
+                    if isinstance(result, dict):
+                        result_dict.update(result)
+                    else:
+                        result_dict[metric] = result
         else:
             raise ValueError(
                 f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
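For context, here is a minimal standalone sketch (not part of the commit) of the multiple-target scoring rule in the diff above: a prediction counts as correct if it matches any of the acceptable gold answers, assuming a metric that returns 0 or 1.

```python
# Sketch only: mirrors the "return true if any are true" rule from the diff,
# assuming a metric that returns 0.0 or 1.0 (e.g. exact match).
def exact_match(references, predictions):
    return float(references[0].strip() == predictions[0].strip())


def score_multiple_target(golds, prediction, metric_fn=exact_match):
    # Score the prediction against every acceptable gold answer;
    # the example counts as correct if any single gold matches.
    scores = [metric_fn(references=[g], predictions=[prediction]) for g in golds]
    return 1.0 if any(scores) else 0.0


print(score_multiple_target(["yes", "yeah"], "yeah"))  # 1.0
print(score_multiple_target(["yes", "yeah"], "no"))    # 0.0
```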
# WMT16
### Paper
Title: `Findings of the 2016 Conference on Machine Translation`
Abstract: http://www.aclweb.org/anthology/W/W16/W16-2301
Homepage: https://huggingface.co/datasets/wmt16
### Citation
```
@InProceedings{bojar-EtAl:2016:WMT1,
  author    = {Bojar, Ond{\v{r}}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huck, Matthias and Jimeno Yepes, Antonio and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Neveol, Aurelie and Neves, Mariana and Popel, Martin and Post, Matt and Rubino, Raphael and Scarton, Carolina and Specia, Lucia and Turchi, Marco and Verspoor, Karin and Zampieri, Marcos},
title = {Findings of the 2016 Conference on Machine Translation},
booktitle = {Proceedings of the First Conference on Machine Translation},
month = {August},
year = {2016},
address = {Berlin, Germany},
publisher = {Association for Computational Linguistics},
pages = {131--198},
url = {http://www.aclweb.org/anthology/W/W16/W16-2301}
}
```
### Groups and Tasks
#### Groups
* `wmt-t5-prompt`: Group for all wmt tasks with prompt templates used for T5 (`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`)
#### Tasks
Tasks with specific prompt styles:
* `wmt-ro-en-t5-prompt`: English-to-Romanian translation on WMT16 (`ro-en` config) using the prompt template from T5; see the usage sketch below.
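A hedged sketch of how this task might be run through the harness's Python API. The `simple_evaluate` entry point and its arguments are assumed from recent lm-evaluation-harness releases and may differ on this branch; the checkpoint name is arbitrary.

```python
# Assumed API: lm_eval.simple_evaluate as exposed in recent harness releases.
# The model checkpoint below is illustrative only.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=google/flan-t5-small",
    tasks=["wmt-ro-en-t5-prompt"],
    batch_size=8,
)
print(results["results"]["wmt-ro-en-t5-prompt"])
```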
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import evaluate


def bleu(predictions, references):
    # Compute corpus BLEU with the HF `evaluate` package and return the
    # scalar score the harness aggregates (compute() returns a dict).
    bleu_fn = evaluate.load("bleu")
    results = bleu_fn.compute(predictions=predictions, references=references)
    return results["bleu"]
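A quick sanity check of the `bleu()` helper above. The strings are hypothetical, and `evaluate.load("bleu")` fetches the metric script on first use.

```python
# Hypothetical usage of the bleu() helper; strings are illustrative only.
preds = ["Pisica stă pe covor."]
refs = [["Pisica stă pe covor."]]  # one list of reference strings per prediction
print(bleu(predictions=preds, references=refs))  # 1.0 for an exact match
```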
group:
  - wmt-t5-prompt
task: wmt-ro-en-t5-prompt
dataset_path: wmt16
dataset_name: ro-en
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "translate English to Romanian: {{translation.en}}"
doc_to_target: "{{translation.ro}}"
metric_list:
  - metric: wer
    aggregation: mean
    higher_is_better: false
  - metric: !function metrics.bleu
    aggregation: mean
    higher_is_better: true
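To see what the two templates produce, here is a small sketch (not part of the task files) that renders `doc_to_text` and `doc_to_target` for one made-up WMT16 `ro-en` record, assuming the harness's Jinja2-style templating.

```python
# Sketch only: renders the YAML templates for one invented wmt16 ro-en record.
from jinja2 import Template

doc = {"translation": {"en": "The cat sat on the mat.", "ro": "Pisica stătea pe covor."}}

doc_to_text = Template("translate English to Romanian: {{translation.en}}")
doc_to_target = Template("{{translation.ro}}")

print(doc_to_text.render(**doc))    # translate English to Romanian: The cat sat on the mat.
print(doc_to_target.render(**doc))  # Pisica stătea pe covor.
```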