Commit ac50adb5 authored by lintangsutawika

merged with latest big-refactor

parents 6355d06f a3252ed7
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[20+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else
%}{{"Question: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_en_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[22+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRespuesta paso a paso:"}}{%
else %}{{"Pregunta: "+question+"\nRespuesta paso a paso:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_es_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[25+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{%
else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_fr_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[10+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題:
"+question+"\nステップごとの答え:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_ja_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else
%}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_ru_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[24+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{%
else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_sw_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[18+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else
%}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_te_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[17+1]}}{% else %}{{answer_nu
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else
%}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_th_direct
@@ -4,11 +4,5 @@ doc_to_target: '{% if answer is not none %}{{answer[5+1]}}{% else %}{{answer_num
endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{%
endif %}'
-filter:
-- function: regex
-  regex_pattern: The answer is (\-?[0-9\.\,]+)
-- function: take_first
-filter_list:
-- name: get-answer
include: cot_yaml
task: mgsm_zh_direct
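Taken together, these hunks remove the same six-line `filter` block from every direct-mode config: the old English-only "The answer is" pattern made no sense for direct prompting, and filters are now attached only by the generator script (see the `utils.py` diff below) with per-language patterns. As a minimal sketch of what such a `regex` + `take_first` pair does to a model response (hypothetical response string, not part of the commit):

```
import re

# The "regex" filter collects all matches; "take_first" keeps the first one.
response = "Let's think step by step. 6 x 7 = 42. The answer is 42"
matches = re.findall(r"The answer is (\-?[0-9\.\,]+)", response)
extracted = matches[0] if matches else "[invalid]"
print(extracted)  # -> 42
```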
@@ -4,16 +4,19 @@ import argparse
LANGUAGES = {
    "bn": {  # Bengali
        # "QUESTION": "প্রশ্ন:",
        "QUESTION": "\u09aa\u09cd\u09b0\u09b6\u09cd\u09a8:",
        # "ANSWER": "ধাপে ধাপে উত্তর:",
        "ANSWER": "\u09a7\u09be\u09aa\u09c7 \u09a7\u09be\u09aa\u09c7 \u0989\u09a4\u09cd\u09a4\u09b0:",
        "DIRECT": "Answer:",
        "REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
    },
    "de": {  # German
        "QUESTION": "Frage:",
        # "ANSWER": "Schritt-für-Schritt-Antwort:",
        "ANSWER": "Schritt-f\u00fcr-Schritt-Antwort:",
        "DIRECT": "Antwort:",
-        "REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
+        "REGEX": "Die Antwort lautet (\\-?[0-9\\.\\,]+)",
    },
    "en": {  # English
        "QUESTION": "Question:",
@@ -24,50 +27,68 @@ LANGUAGES = {
"es": { # Spanish
"QUESTION": "Pregunta:",
"ANSWER": "Respuesta paso a paso:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"DIRECT": "Respuesta:",
"REGEX": "La respuesta es (\\-?[0-9\\.\\,]+)",
},
"fr": { # French
"QUESTION": "Question :",
# "ANSWER": "Réponse étape par étape :"
"ANSWER": "R\u00e9ponse \u00e9tape par \u00e9tape :",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "DIRECT": "Réponse :",
"DIRECT": "R\u00e9ponse :",
# "REGEX": "La réponse est (\\-?[0-9\\.\\,]+)",
"REGEX": "La r\u00e9ponse est (\\-?[0-9\\.\\,]+)",
},
"ru": { # Russian
# "QUESTION": "Задача:",
"QUESTION": "\u0417\u0430\u0434\u0430\u0447\u0430:",
# "ANSWER": "Пошаговоерешение:",
"ANSWER": "\u041f\u043e\u0448\u0430\u0433\u043e\u0432\u043e\u0435\u0440\u0435\u0448\u0435\u043d\u0438\u0435:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "Ответ — (\\-?[0-9\\.\\,]+)",
"REGEX": "\u041e\u0442\u0432\u0435\u0442 \u2014 (\\-?[0-9\\.\\,]+)",
},
"sw": { # Swahili
"QUESTION": "Swali:",
"ANSWER": "Jibu la Hatua kwa Hatua:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
"REGEX": "Jibu ni (\\-?[0-9\\.\\,]+)",
},
"te": { # Telugu
# "QUESTION": "ప్రశ్న:",
"QUESTION": "\u0c2a\u0c4d\u0c30\u0c36\u0c4d\u0c28:",
# "ANSWER": "దశలవారీగా సమాధానం:",
"ANSWER": "\u0c26\u0c36\u0c32\u0c35\u0c3e\u0c30\u0c40\u0c17\u0c3e \u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "సమాధానం (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0c38\u0c2e\u0c3e\u0c27\u0c3e\u0c28\u0c02 (\\-?[0-9\\.\\,]+)",
},
"th": { # Thai
# "QUESTION": "โจทย์:",
"QUESTION": "\u0e42\u0e08\u0e17\u0e22\u0e4c:",
# "ANSWER": "คำตอบทีละขั้นตอน:",
"ANSWER": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e17\u0e35\u0e25\u0e30\u0e02\u0e31\u0e49\u0e19\u0e15\u0e2d\u0e19:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "คำตอบคือ (\\-?[0-9\\.\\,]+)",
"REGEX": "\u0e04\u0e33\u0e15\u0e2d\u0e1a\u0e04\u0e37\u0e2d (\\-?[0-9\\.\\,]+)",
},
"ja": { # Japanese
# "QUESTION": "問題:",
"QUESTION": "\u554f\u984c:",
# "ANSWER": "ステップごとの答え:",
"ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "答えは(\\-?[0-9\\.\\,]+)です。",
"REGEX": "\u7b54\u3048\u306f(\\-?[0-9\\.\\,]+)\u3067\u3059\u3002",
},
"zh": { # Chinese
# "QUESTION": "问题:",
"QUESTION": "\u95ee\u9898:",
# "ANSWER": "逐步解答:",
"ANSWER": "\u9010\u6b65\u89e3\u7b54:",
"DIRECT": "Answer:",
"REGEX": "The answer is (\\-?[0-9\\.\\,]+)",
# "REGEX": "答案是 (\\-?[0-9\\.\\,]+)。",
"REGEX": "\u7b54\u6848\u662f (\\-?[0-9\\.\\,]+)\u3002",
},
}
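For orientation, the QUESTION/ANSWER/DIRECT strings above are the raw ingredients of the generated prompts; the actual templating happens through the generated Jinja YAML, but an illustrative composition looks like this (made-up question, not from MGSM):

```
# Hypothetical direct-mode prompt composed from the German entry above.
lang = {"QUESTION": "Frage:", "DIRECT": "Antwort:"}
question = "Was ist 2 + 3?"
prompt = f'{lang["QUESTION"]} {question}\n{lang["DIRECT"]}'
print(prompt)
# Frage: Was ist 2 + 3?
# Antwort:
```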
@@ -80,15 +101,15 @@ def add_regex_pattern(regex_pattern):
"filter_list": [
{
"name": "get-answer",
},
],
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
},
{
"function": "take_first",
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
},
{
"function": "take_first",
},
],
},
],
}
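The restructure nests the filter pipeline inside the named `filter_list` entry rather than leaving `filter` as a sibling key. A sketch of the YAML the new return value serializes to (assuming the English CoT pattern; not copied from the commit):

```
import yaml

filter_list = {
    "filter_list": [
        {
            "name": "get-answer",
            "filter": [
                {"function": "regex", "regex_pattern": r"The answer is (\-?[0-9\.\,]+)"},
                {"function": "take_first"},
            ],
        },
    ],
}
print(yaml.dump(filter_list, default_flow_style=False))
```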
@@ -107,6 +128,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
            QUESTION = LANGUAGES[lang]["QUESTION"]
            yaml_template = "cot_yaml"
+            filter_list = {}
            if mode == "direct":
                ANSWER = LANGUAGES[lang]["DIRECT"]
                REGEX = None
@@ -116,13 +138,13 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
                ANSWER = LANGUAGES[lang]["ANSWER"]
                REGEX = LANGUAGES[lang]["REGEX"]
                task_name = f"mgsm_{lang}_native-cot"
+                filter_list = add_regex_pattern(REGEX)
            elif mode == "en-cot":
                ANSWER = LANGUAGES["en"]["ANSWER"]
                REGEX = LANGUAGES["en"]["REGEX"]
                task_name = f"mgsm_{lang}_en-cot"

            file_name = f"{task_name}.yaml"
-            filter_list = add_regex_pattern(REGEX)
            with open(
                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
@@ -147,6 +169,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
                    },
                    f,
                    allow_unicode=True,
+                    width=float("inf"),
                )
        except FileExistsError:
            err.append(file_name)
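The added `width=float("inf")` keeps each long Jinja template in the generated YAML on a single physical line; PyYAML otherwise wraps scalars at roughly 80 columns. A quick illustration:

```
import yaml

doc = {"doc_to_text": "{% if answer is not none %}" + "x" * 100 + "{% endif %}"}
print(yaml.dump(doc))                      # wrapped across lines
print(yaml.dump(doc, width=float("inf")))  # stays on one line
```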
# MuTual
### Paper
Title: `MuTual: A Dataset for Multi-Turn Dialogue Reasoning`
Abstract: https://www.aclweb.org/anthology/2020.acl-main.130/
MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is
modified from Chinese high school English listening comprehension test data.
Homepage: https://github.com/Nealcly/MuTual
### Citation
```
@inproceedings{mutual,
title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning",
author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" ,
booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `mutual`
* `mutual_plus`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
include: mutual.yaml
task: mutual_plus
dataset_name: mutual_plus
task: mutual
dataset_path: "EleutherAI/mutual"
dataset_name: mutual
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{article}}"
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answers)}}"
doc_to_choice: "{{options}}"
process_docs: !function utils.process_docs
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "{{article}}"
metric_list:
- metric: r@1
  aggregation: mean
  higher_is_better: true
- metric: r@2
  aggregation: mean
  higher_is_better: true
- metric: mrr
  aggregation: mean
  higher_is_better: true
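The `doc_to_target` template maps the gold letter onto an index into the `options` list served by `doc_to_choice`; in plain Python, the same lookup is:

```
answers = "C"  # hypothetical gold label from a MuTual record
options = ["opt a", "opt b", "opt c", "opt d"]
target = ["A", "B", "C", "D"].index(answers)
print(target, options[target])  # -> 2 opt c
```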
import numpy as np


def process_docs(dataset):
    def _detokenize(text):
        text = text.replace(" '", "'")
        text = text.replace(" \n", "\n")
        text = text.replace("\n ", "\n")
        text = text.replace(" n't", "n't")
        text = text.replace("`` ", '"')
        text = text.replace("''", '"')
        # punctuation
        text = text.replace(" :", ":")
        text = text.replace(" ;", ";")
        text = text.replace(" !", "!")
        text = text.replace(" ?", "?")
        text = text.replace(" ,", ",")
        text = text.replace(" .", ".")
        return text

    def _process(doc):
        return {
            "article": _detokenize(doc["article"]),
            "options": [_detokenize(option) for option in doc["options"]],
        }

    return dataset.map(_process)
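A worked example of the detokenization (made-up string; the loop below mirrors a subset of the `replace` calls in `_detokenize`, which is nested above):

```
rules = [(" '", "'"), (" n't", "n't"), ("`` ", '"'), ("''", '"'), (" ,", ","), (" .", ".")]
text = "he said , `` i ca n't go '' ."
for old, new in rules:
    text = text.replace(old, new)
print(text)  # -> he said, "i can't go".
```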
def process_results(doc, results):
    gold = ["A", "B", "C", "D"].index(doc["answers"])
    r4_1 = np.argmax(results) == gold  # r4_1 = accuracy
    ranks = sorted(results, reverse=True)
    r4_2 = (ranks.index(results[gold]) == 1) + r4_1  # gold ranked 2nd, or 1st via r4_1
    mrr = 1.0 / (ranks.index(results[gold]) + 1)  # `+ 1` for index offset
    return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr}
task: nq_open
dataset_path: nq_open
output_type: greedy_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}" # TODO: should be multi-target
fewshot_delimiter: "\n"
generation_kwargs:
  until:
  - "\n"
  - "."
  - ","
  do_sample: false
  temperature: 0.0
filter_list:
- name: remove_whitespace
  filter:
  - function: remove_whitespace
  - function: take_first
target_delimiter: " "
metric_list:
- metric: exact_match
  aggregation: mean
  higher_is_better: true
  ignore_case: true
  ignore_punctuation: true
  regexes_to_ignore:
  - "\ban|a|the\b"
# QASPER
### Paper
Title: `A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers`
Abstract: https://arxiv.org/abs/2105.03011
QASPER is a dataset of 5,049 questions over 1,585 Natural Language Processing papers.
Each question is written by an NLP practitioner who read only the title and abstract
of the corresponding paper, and the question seeks information present in the full
text. The questions are then answered by a separate set of NLP practitioners who also
provide supporting evidence to answers.
Homepage: https://allenai.org/data/qasper
### Citation
```
@article{DBLP:journals/corr/abs-2105-03011,
author = {Pradeep Dasigi and
Kyle Lo and
Iz Beltagy and
Arman Cohan and
Noah A. Smith and
Matt Gardner},
title = {A Dataset of Information-Seeking Questions and Answers Anchored in
Research Papers},
journal = {CoRR},
volume = {abs/2105.03011},
year = {2021},
url = {https://arxiv.org/abs/2105.03011},
eprinttype = {arXiv},
eprint = {2105.03011},
timestamp = {Fri, 14 May 2021 12:13:30 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2105-03011.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
```
### Groups and Tasks
#### Groups
* `qasper`: executes both `qasper_bool` and `qasper_freeform`
#### Tasks
* `qasper_bool`: Multiple choice task that evaluates the task with `answer_type="bool"`
* `qasper_freeform`: Greedy generation task that evaluates the samples from the task with `answer_type="free form answer"`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: qasper
task: qasper_bool
dataset_path: qasper
output_type: multiple_choice
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_bool
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: 1
doc_to_choice: ["no", "yes"]
metric_list:
- metric: f1
group: qasper
task: qasper_freeform
dataset_path: qasper
output_type: greedy_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_freeform
doc_to_text: "TITLE: {{title}}\nABSTRACT: {{abstract}}\n\nQ: {{question}}\n\nA:"
doc_to_target: answer
generation_kwargs:
  until:
  - "\n"
metric_list:
- metric: !function metrics.f1_abstractive
  aggregation: mean
  higher_is_better: true
import re
import string
from collections import Counter


def normalize_answer(s):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    Lower text and remove punctuation, articles and extra whitespace.
    """

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_abstractive(predictions, references):
    """
    Taken from the official evaluation script for v1.1 of the SQuAD dataset.
    """
    prediction_tokens = normalize_answer(predictions[0]).split()
    references_tokens = normalize_answer(references[0]).split()
    common = Counter(prediction_tokens) & Counter(references_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(references_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
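A worked example of the token-level F1 above (hypothetical strings):

```
pred = ["the model was trained on squad"]
ref = ["trained on the squad dataset"]
# After article removal: 5 prediction tokens, 4 reference tokens, 3 shared
# -> precision 0.6, recall 0.75, F1 = 2 * 0.45 / 1.35
print(round(f1_abstractive(pred, ref), 3))  # -> 0.667
```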