Commit e2ed49f8 authored by baberabb

Merge remote-tracking branch 'origin/big-refactor' into big-refactor_math_sympy

parents a554e41b 91a37c90
@@ -63,10 +63,10 @@ jobs:
       - name: Test with pytest
         # if new tasks are added, run tests on them
         if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
       # if api is modified, run tests on it
       - name: Test more tasks with pytest
         env:
           API: true
         if: steps.changed-tasks.outputs.api_any_modified == 'true'
-        run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+        run: python -m pytest tests/test_tasks.py -s -vv
@@ -674,22 +674,22 @@ class ConfigurableTask(Task):
             check_choices = test_choice
         else:
             check_choices = [test_target]
-        for choice in check_choices:
-            choice_has_whitespace = True if " " in choice else False
-            delimiter_has_whitespace = (
-                True if " " in self.config.target_delimiter else False
-            )
-            if delimiter_has_whitespace and choice_has_whitespace:
-                eval_logger.warning(
-                    f'Both target_delimiter and target choice: "{choice}" have whitespace'
-                )
-            elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
-                eval_logger.warning(
-                    f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
-                )
+        if self.config.doc_to_choice is not None:
+            for choice in check_choices:
+                choice_has_whitespace = True if choice[0].isspace() else False
+                delimiter_has_whitespace = (
+                    True if self.config.target_delimiter[-1].isspace() else False
+                )
+                if delimiter_has_whitespace and choice_has_whitespace:
+                    eval_logger.warning(
+                        f'Both target_delimiter and target choice: "{choice}" have whitespace'
+                    )
+                elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
+                    eval_logger.warning(
+                        f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
+                    )

     def download(self, dataset_kwargs=None) -> None:
         self.dataset = datasets.load_dataset(
             path=self.DATASET_PATH,
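The change narrows the heuristic: instead of flagging any space anywhere in the choice or the delimiter, it inspects only the two characters that actually meet when delimiter and target are joined in the rendered prompt. A minimal sketch of that boundary logic (`check_boundary` is a hypothetical helper, not harness code; the harness does this inline):

```python
# Only the delimiter's last char and the choice's first char touch in the
# assembled prompt, so those are the characters worth checking.
def check_boundary(target_delimiter: str, choice: str) -> str:
    delimiter_has_whitespace = target_delimiter[-1].isspace()
    choice_has_whitespace = choice[0].isspace()
    if delimiter_has_whitespace and choice_has_whitespace:
        return "double whitespace"  # "Answer: " + " yes" -> "Answer:  yes"
    if not delimiter_has_whitespace and not choice_has_whitespace:
        return "no whitespace"      # "Answer:" + "yes" -> "Answer:yes"
    return "ok"

assert check_boundary("Answer: ", " yes") == "double whitespace"
assert check_boundary("Answer:", "yes") == "no whitespace"
assert check_boundary("Answer: ", "yes") == "ok"
```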
@@ -1070,6 +1070,9 @@ class ConfigurableTask(Task):
             # it assumes that doc_to_target returns a number.
             choices = self.doc_to_choice(doc)
             gold = choices[gold]
+        # we expect multiple_targets to be a list.
+        elif self.multiple_target:
+            gold = list(gold)
         else:
             gold = str(gold)
@@ -1080,6 +1083,10 @@ class ConfigurableTask(Task):
             # return true if any are true
             # TODO: this may break for multiple_target, non zero-or-1 metrics
             scores = []
+            if not isinstance(gold, list):
+                # sometimes a multiple_target dataset has exceptions where one doc has only one string answer
+                gold = [gold]
             for gold_option in gold:
                 try:
                     result_score = self._metric_fn_list[metric](
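Together, the two hunks above normalize `gold` to a list before scoring, so a single loop handles both single- and multi-reference docs, and "return true if any are true" falls out of taking the best per-reference score. A hedged sketch of that path, with `exact_match` standing in for `self._metric_fn_list[metric]`:

```python
# Stand-in for the configured metric function (0/1 score per reference).
def exact_match(ref: str, pred: str) -> int:
    return int(ref.strip() == pred.strip())

def score_multi_target(gold, pred: str) -> float:
    if not isinstance(gold, list):
        # some multiple_target datasets have docs with a single string answer
        gold = [gold]
    # score against every reference, keep the best
    scores = [exact_match(gold_option, pred) for gold_option in gold]
    return max(scores)

assert score_multi_target(["Paris", "City of Light"], "Paris") == 1
assert score_multi_target("Paris", "Lyon") == 0
```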
@@ -16,7 +16,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] MCTACO
 - [x] Pubmed QA
 - [x] SciQ
-- [ ] QASPER
+- [x] QASPER
 - [x] QA4MRE
 - [x] TriviaQA
 - [x] AI2 ARC
@@ -36,7 +36,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] TruthfulQA (mc1)
 - [x] TruthfulQA (mc2)
 - [x] TruthfulQA (gen)
-- [ ] MuTual
+- [x] MuTual
 - [ ] Hendrycks Math (Hailey)
 - [x] Asdiv
 - [ ] GSM8k
# Generated by utils.py
dataset_name: bn
doc_to_target: '{% if answer is not none %}{{answer[16+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nধাপে ধাপে উত্তর:"}}{% else
%}{{"প্রশ্ন: "+question+"\nধাপে ধাপে উত্তর:"}}{% endif %}'
include: cot_yaml
task: mgsm_bn_direct
# Generated by utils.py
dataset_name: de
doc_to_target: '{% if answer is not none %}{{answer[28+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nSchritt-für-Schritt-Antwort:"}}{%
else %}{{"Frage: "+question+"\nSchritt-für-Schritt-Antwort:"}}{% endif %}'
include: cot_yaml
task: mgsm_de_direct
# Generated by utils.py
dataset_name: en
doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else
%}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
include: cot_yaml
task: mgsm_en_direct
# Generated by utils.py
dataset_name: es
doc_to_target: '{% if answer is not none %}{{answer[22+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{%
else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
include: cot_yaml
task: mgsm_es_direct
# Generated by utils.py
dataset_name: fr
doc_to_target: '{% if answer is not none %}{{answer[25+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{%
else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
include: cot_yaml
task: mgsm_fr_direct
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[10+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題:
"+question+"\nステップごとの答え:"}}{% endif %}'
include: cot_yaml
task: mgsm_ja_direct
# Generated by utils.py
dataset_name: ru
doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else
%}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
include: cot_yaml
task: mgsm_ru_direct
# Generated by utils.py
dataset_name: sw
doc_to_target: '{% if answer is not none %}{{answer[24+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{%
else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
include: cot_yaml
task: mgsm_sw_direct
# Generated by utils.py
dataset_name: te
doc_to_target: '{% if answer is not none %}{{answer[18+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else
%}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
include: cot_yaml
task: mgsm_te_direct
# Generated by utils.py
dataset_name: th
doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else
%}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
include: cot_yaml
task: mgsm_th_direct
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_number|string}}{%
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{%
endif %}'
include: cot_yaml
task: mgsm_zh_direct
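Each of the files above is stamped `# Generated by utils.py`, but that generator is not part of this diff. A rough, hypothetical reconstruction of its shape: each language supplies a question prefix and a step-by-step answer prefix, and the `answer[N+1]` offset in `doc_to_target` is the length of the answer prefix plus one (e.g. `20+1` for the 20-character `Step-by-Step Answer:`, `28+1` for `Schritt-für-Schritt-Antwort:`). A sketch for two of the languages:

```python
import yaml

# Hypothetical table: (question prefix, step-by-step answer prefix) per language.
PREFIXES = {
    "en": ("Question: ", "Step-by-Step Answer:"),
    "de": ("Frage: ", "Schritt-für-Schritt-Antwort:"),
}

for lang, (q_prefix, a_prefix) in PREFIXES.items():
    offset = len(a_prefix)  # 20 for English, 28 for German
    config = {
        "include": "cot_yaml",
        "dataset_name": lang,
        "task": f"mgsm_{lang}_direct",
        "doc_to_text": (
            '{% if answer is not none %}{{question+"\\n' + a_prefix + '"}}'
            '{% else %}{{"' + q_prefix + '"+question+"\\n' + a_prefix + '"}}{% endif %}'
        ),
        "doc_to_target": (
            "{% if answer is not none %}{{answer[" + str(offset) + "+1]}}"
            "{% else %}{{answer_number|string}}{% endif %}"
        ),
    }
    with open(f"mgsm_{lang}_direct.yaml", "w", encoding="utf-8") as f:
        f.write("# Generated by utils.py\n")
        yaml.dump(config, f, allow_unicode=True)
```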
# MuTual

### Paper

Title: `MuTual: A Dataset for Multi-Turn Dialogue Reasoning`

Abstract: https://www.aclweb.org/anthology/2020.acl-main.130/

MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, modified from Chinese high school English listening comprehension test data.

Homepage: https://github.com/Nealcly/MuTual

### Citation

```
@inproceedings{mutual,
    title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
    author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
    year = "2020",
    publisher = "Association for Computational Linguistics",
}
```
### Groups and Tasks

#### Groups

* Not part of a group yet.

#### Tasks

* `mutual`
* `mutual_plus`

### Checklist

For adding novel benchmarks/datasets to the library:

* [ ] Is the task an existing benchmark in the literature?
  * [ ] Have you referenced the original paper that introduced the task?
  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?

If other tasks on this dataset are already supported:

* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# mutual_plus.yaml
include: mutual.yaml
task: mutual_plus
dataset_name: mutual_plus

# mutual.yaml
task: mutual
dataset_path: "EleutherAI/mutual"
dataset_name: mutual
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{article}}"
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answers)}}"
doc_to_choice: "{{options}}"
process_docs: !function utils.process_docs
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{article}}"
metric_list:
  - metric: r@1
    aggregation: mean
    higher_is_better: true
  - metric: r@2
    aggregation: mean
    higher_is_better: true
  - metric: mrr
    aggregation: mean
    higher_is_better: true
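The `!function` hooks above (`utils.process_docs`, `utils.process_results`) resolve to the task's `utils.py`, reproduced next: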
import numpy as np


def process_docs(dataset):
    def _detokenize(text):
        text = text.replace(" '", "'")
        text = text.replace(" \n", "\n")
        text = text.replace("\n ", "\n")
        text = text.replace(" n't", "n't")
        text = text.replace("`` ", '"')
        text = text.replace("''", '"')
        # punctuation
        text = text.replace(" :", ":")
        text = text.replace(" ;", ";")
        text = text.replace(" !", "!")
        text = text.replace(" ?", "?")
        text = text.replace(" ,", ",")
        text = text.replace(" .", ".")
        return text

    def _process(doc):
        return {
            "article": _detokenize(doc["article"]),
            "options": [_detokenize(option) for option in doc["options"]],
        }

    return dataset.map(_process)


def process_results(doc, results):
    gold = ["A", "B", "C", "D"].index(doc["answers"])
    r4_1 = np.argmax(results) == gold  # r4_1 = accuracy
    ranks = sorted(results, reverse=True)
    r4_2 = (ranks.index(results[gold]) == 1) + r4_1  # gold within the top two
    mrr = 1.0 / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
    return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr}
task: nq_open
dataset_path: nq_open
output_type: greedy_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}" # TODO: should be multi-target
fewshot_delimiter: "\n"
generation_kwargs:
  until:
    - "\n"
    - "."
    - ","
  do_sample: false
  temperature: 0.0
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
target_delimiter: " "
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - "\ban|a|the\b"