Commit 18e2d093 authored by lintangsutawika
Browse files

added xnli

parent 7634a6ec
"""
@InProceedings{conneau2018xnli,
author = "Conneau, Alexis
and Rinott, Ruty
and Lample, Guillaume
and Williams, Adina
and Bowman, Samuel R.
and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods
in Natural Language Processing",
year = "2018",
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium",
}
"""
import argparse
from typing import Dict, List
import yaml
# Different languages that are part of xnli.
# These correspond to dataset names (Subsets) on HuggingFace.
# A yaml file is generated by this script for each language.
#
# Every entry carries the prompt fragments that gen_lang_yamls splices into
# the generated task config:
#   QUESTION_WORD        - appended to the premise: "{{premise}}, <word>?"
#   ENTAILMENT_LABEL     - prefix of the first answer choice
#   NEUTRAL_LABEL        - prefix of the second answer choice
#   CONTRADICTION_LABEL  - prefix of the third answer choice
# Each choice is rendered as "<label>, {{hypothesis}}".
LANGUAGES = {
    "ar": { # Arabic
        "QUESTION_WORD": "صحيح",
        "ENTAILMENT_LABEL": "نعم",
        "NEUTRAL_LABEL": "لذا",
        # NOTE(review): the value below is the Arabic word for "number", which
        # looks like a mistranslation of "No."; a negation was presumably
        # intended. Confirm before changing: the string is embedded in the
        # generated xnli_ar.yaml, so that file must be regenerated with it.
        "CONTRADICTION_LABEL": "رقم"
    },
    "bg": { # Bulgarian
        "QUESTION_WORD": "правилно",
        "ENTAILMENT_LABEL": "да",
        "NEUTRAL_LABEL": "така",
        "CONTRADICTION_LABEL": "не"
    },
    "de": { # German
        "QUESTION_WORD": "richtig",
        "ENTAILMENT_LABEL": "Ja",
        "NEUTRAL_LABEL": "Auch",
        "CONTRADICTION_LABEL": "Nein"
    },
    "el": { # Greek
        "QUESTION_WORD": "σωστός",
        "ENTAILMENT_LABEL": "Ναί",
        "NEUTRAL_LABEL": "Έτσι",
        "CONTRADICTION_LABEL": "όχι"
    },
    "en": { # English
        "QUESTION_WORD": "right",
        "ENTAILMENT_LABEL": "Yes",
        "NEUTRAL_LABEL": "Also",
        "CONTRADICTION_LABEL": "No"
    },
    "es": { # Spanish
        "QUESTION_WORD": "correcto",
        "ENTAILMENT_LABEL": "Sí",
        "NEUTRAL_LABEL": "Asi que",
        "CONTRADICTION_LABEL": "No"
    },
    "fr": { # French
        "QUESTION_WORD": "correct",
        "ENTAILMENT_LABEL": "Oui",
        "NEUTRAL_LABEL": "Aussi",
        "CONTRADICTION_LABEL": "Non"
    },
    "hi": { # Hindi
        "QUESTION_WORD": "सही",
        "ENTAILMENT_LABEL": "हाँ",
        "NEUTRAL_LABEL": "इसलिए",
        "CONTRADICTION_LABEL": "नहीं"
    },
    "ru": { # Russian
        "QUESTION_WORD": "правильно",
        "ENTAILMENT_LABEL": "Да",
        "NEUTRAL_LABEL": "Так",
        "CONTRADICTION_LABEL": "Нет"
    },
    "sw": { # Swahili
        "QUESTION_WORD": "sahihi",
        "ENTAILMENT_LABEL": "Ndiyo",
        "NEUTRAL_LABEL": "Hivyo",
        "CONTRADICTION_LABEL": "Hapana"
    },
    "th": { # Thai
        "QUESTION_WORD": "ถูกต้อง",
        "ENTAILMENT_LABEL": "ใช่",
        "NEUTRAL_LABEL": "ดังนั้น",
        "CONTRADICTION_LABEL": "ไม่"
    },
    "tr": { # Turkish
        "QUESTION_WORD": "doğru",
        "ENTAILMENT_LABEL": "Evet",
        "NEUTRAL_LABEL": "Böylece",
        "CONTRADICTION_LABEL": "Hayır"
    },
    "ur": { # Urdu
        "QUESTION_WORD": "صحیح",
        "ENTAILMENT_LABEL": "جی ہاں",
        "NEUTRAL_LABEL": "اس لئے",
        "CONTRADICTION_LABEL": "نہیں"
    },
    "vi": { # Vietnamese
        "QUESTION_WORD": "đúng",
        "ENTAILMENT_LABEL": "Vâng",
        "NEUTRAL_LABEL": "Vì vậy",
        "CONTRADICTION_LABEL": "Không"
    },
    "zh": { # Chinese
        "QUESTION_WORD": "正确",
        "ENTAILMENT_LABEL": "是的",
        "NEUTRAL_LABEL": "所以",
        "CONTRADICTION_LABEL": "不是的"
    },
}
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Write one ``xnli_<lang>.yaml`` task config for every entry in LANGUAGES.

    Each file starts with a "Generated by utils.py" comment line followed by
    the yaml dump of the language-specific settings (dataset name, task name,
    prompt text and answer choices) that extend ``xnli_common_yaml``.

    :param output_dir: The directory the yaml files are written into.
    :param overwrite: If True, existing files are replaced. If False, files
        are opened in exclusive-create mode, so existing ones are left
        untouched and reported once all languages have been processed.
    :raises FileExistsError: If ``overwrite`` is False and at least one target
        file already existed. Files that did not exist are still created.
    """
    # Mode "x" makes open() raise FileExistsError instead of clobbering a file.
    open_mode = "w" if overwrite else "x"
    already_present = []

    for lang, fragments in LANGUAGES.items():
        file_name = f"xnli_{lang}.yaml"

        QUESTION_WORD = fragments["QUESTION_WORD"]
        ENTAILMENT_LABEL = fragments["ENTAILMENT_LABEL"]
        NEUTRAL_LABEL = fragments["NEUTRAL_LABEL"]
        CONTRADICTION_LABEL = fragments["CONTRADICTION_LABEL"]

        # Quadruple braces in the f-strings emit a literal "{{...}}" template
        # placeholder into the yaml; double braces emit a single literal brace.
        task_config = {
            "include": "xnli_common_yaml",
            "dataset_name": lang,
            "task": f"xnli_{lang}",
            "doc_to_text": f"{{{{premise}}}}, {QUESTION_WORD}?",
            "doc_to_choice": f"{{[\"{ENTAILMENT_LABEL}, {{{{hypothesis}}}}\", \"{NEUTRAL_LABEL}, {{{{hypothesis}}}}\", \"{CONTRADICTION_LABEL}, {{{{hypothesis}}}}\"]}}"
        }

        try:
            with open(f"{output_dir}/{file_name}", open_mode, encoding='utf8') as out:
                out.write("# Generated by utils.py\n")
                yaml.dump(task_config, out, allow_unicode=True)
        except FileExistsError:
            # Remember the collision and keep going so every conflicting file
            # is reported in a single error at the end.
            already_present.append(file_name)

    if already_present:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(already_present)}"
        )
def main() -> None:
    """Read the command-line options and generate the per-language yaml files."""
    cli = argparse.ArgumentParser()
    # store_true already defaults to False when the flag is absent.
    cli.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite files if they already exist",
    )
    cli.add_argument(
        "--output-dir",
        default=".",
        help="Directory to write yaml files to",
    )
    options = cli.parse_args()
    gen_lang_yamls(
        output_dir=options.output_dir,
        overwrite=options.overwrite,
    )
# Run the generator only when this file is executed as a script, so importing
# the module (e.g. to reuse LANGUAGES) does not write any files.
if __name__ == "__main__":
    main()
# Generated by utils.py
dataset_name: ar
doc_to_choice: '{["نعم, {{hypothesis}}", "لذا, {{hypothesis}}", "رقم, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, صحيح?'
include: xnli_common_yaml
task: xnli_ar
# Generated by utils.py
dataset_name: bg
doc_to_choice: '{["да, {{hypothesis}}", "така, {{hypothesis}}", "не, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, правилно?'
include: xnli_common_yaml
task: xnli_bg
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group: xnli
task: null
dataset_path: xnli
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: null
doc_to_target: label
doc_to_choice: null
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
# Generated by utils.py
dataset_name: de
doc_to_choice: '{["Ja, {{hypothesis}}", "Auch, {{hypothesis}}", "Nein, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, richtig?'
include: xnli_common_yaml
task: xnli_de
# Generated by utils.py
dataset_name: el
doc_to_choice: '{["Ναί, {{hypothesis}}", "Έτσι, {{hypothesis}}", "όχι, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, σωστός?'
include: xnli_common_yaml
task: xnli_el
# Generated by utils.py
dataset_name: en
doc_to_choice: '{["Yes, {{hypothesis}}", "Also, {{hypothesis}}", "No, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, right?'
include: xnli_common_yaml
task: xnli_en
# Generated by utils.py
dataset_name: es
doc_to_choice: '{["Sí, {{hypothesis}}", "Asi que, {{hypothesis}}", "No, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, correcto?'
include: xnli_common_yaml
task: xnli_es
# Generated by utils.py
dataset_name: fr
doc_to_choice: '{["Oui, {{hypothesis}}", "Aussi, {{hypothesis}}", "Non, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, correct?'
include: xnli_common_yaml
task: xnli_fr
# Generated by utils.py
dataset_name: hi
doc_to_choice: '{["हाँ, {{hypothesis}}", "इसलिए, {{hypothesis}}", "नहीं, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, सही?'
include: xnli_common_yaml
task: xnli_hi
# Generated by utils.py
dataset_name: ru
doc_to_choice: '{["Да, {{hypothesis}}", "Так, {{hypothesis}}", "Нет, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, правильно?'
include: xnli_common_yaml
task: xnli_ru
# Generated by utils.py
dataset_name: sw
doc_to_choice: '{["Ndiyo, {{hypothesis}}", "Hivyo, {{hypothesis}}", "Hapana, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, sahihi?'
include: xnli_common_yaml
task: xnli_sw
# Generated by utils.py
dataset_name: th
doc_to_choice: '{["ใช่, {{hypothesis}}", "ดังนั้น, {{hypothesis}}", "ไม่, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, ถูกต้อง?'
include: xnli_common_yaml
task: xnli_th
# Generated by utils.py
dataset_name: tr
doc_to_choice: '{["Evet, {{hypothesis}}", "Böylece, {{hypothesis}}", "Hayır, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, doğru?'
include: xnli_common_yaml
task: xnli_tr
# Generated by utils.py
dataset_name: ur
doc_to_choice: '{["جی ہاں, {{hypothesis}}", "اس لئے, {{hypothesis}}", "نہیں, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, صحیح?'
include: xnli_common_yaml
task: xnli_ur
# Generated by utils.py
dataset_name: vi
doc_to_choice: '{["Vâng, {{hypothesis}}", "Vì vậy, {{hypothesis}}", "Không, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, đúng?'
include: xnli_common_yaml
task: xnli_vi
# Generated by utils.py
dataset_name: zh
doc_to_choice: '{["是的, {{hypothesis}}", "所以, {{hypothesis}}", "不是的, {{hypothesis}}"]}'
doc_to_text: '{{premise}}, 正确?'
include: xnli_common_yaml
task: xnli_zh
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment