# utils.py - writes one xnli_<lang>.yaml file per language defined in this file.
import argparse
from typing import Dict, List

import yaml


# Languages that are part of xnli.
# The keys correspond to dataset names (Subsets) on HuggingFace.
# A yaml file is generated by this script for each language.
#
# Every entry supplies the four words that are spliced into the generated
# ``doc_to_choice`` template: the question word and one label per class.
#
# NOTE(review): several labels look like literal machine translations (for
# example the Arabic CONTRADICTION_LABEL appears to read as "number"). They are
# kept verbatim because they end up in the generated prompt text - confirm
# with the task owners before "correcting" any of them.
LANGUAGES: Dict[str, Dict[str, str]] = {
    "ar": {  # Arabic
        "QUESTION_WORD": "صحيح",
        "ENTAILMENT_LABEL": "نعم",
        "NEUTRAL_LABEL": "لذا",
        "CONTRADICTION_LABEL": "رقم",
    },
    "bg": {  # Bulgarian
        "QUESTION_WORD": "правилно",
        "ENTAILMENT_LABEL": "да",
        "NEUTRAL_LABEL": "така",
        "CONTRADICTION_LABEL": "не",
    },
    "de": {  # German
        "QUESTION_WORD": "richtig",
        "ENTAILMENT_LABEL": "Ja",
        "NEUTRAL_LABEL": "Auch",
        "CONTRADICTION_LABEL": "Nein",
    },
    "el": {  # Greek
        "QUESTION_WORD": "σωστός",
        "ENTAILMENT_LABEL": "Ναί",
        "NEUTRAL_LABEL": "Έτσι",
        "CONTRADICTION_LABEL": "όχι",
    },
    "en": {  # English
        "QUESTION_WORD": "right",
        "ENTAILMENT_LABEL": "Yes",
        "NEUTRAL_LABEL": "Also",
        "CONTRADICTION_LABEL": "No",
    },
    "es": {  # Spanish
        "QUESTION_WORD": "correcto",
        "ENTAILMENT_LABEL": "Sí",
        "NEUTRAL_LABEL": "Asi que",
        "CONTRADICTION_LABEL": "No",
    },
    "fr": {  # French
        "QUESTION_WORD": "correct",
        "ENTAILMENT_LABEL": "Oui",
        "NEUTRAL_LABEL": "Aussi",
        "CONTRADICTION_LABEL": "Non",
    },
    "hi": {  # Hindi
        "QUESTION_WORD": "सही",
        "ENTAILMENT_LABEL": "हाँ",
        "NEUTRAL_LABEL": "इसलिए",
        "CONTRADICTION_LABEL": "नहीं",
    },
    "ru": {  # Russian
        "QUESTION_WORD": "правильно",
        "ENTAILMENT_LABEL": "Да",
        "NEUTRAL_LABEL": "Так",
        "CONTRADICTION_LABEL": "Нет",
    },
    "sw": {  # Swahili
        "QUESTION_WORD": "sahihi",
        "ENTAILMENT_LABEL": "Ndiyo",
        "NEUTRAL_LABEL": "Hivyo",
        "CONTRADICTION_LABEL": "Hapana",
    },
    "th": {  # Thai
        "QUESTION_WORD": "ถูกต้อง",
        "ENTAILMENT_LABEL": "ใช่",
        "NEUTRAL_LABEL": "ดังนั้น",
        "CONTRADICTION_LABEL": "ไม่",
    },
    "tr": {  # Turkish
        "QUESTION_WORD": "doğru",
        "ENTAILMENT_LABEL": "Evet",
        "NEUTRAL_LABEL": "Böylece",
        "CONTRADICTION_LABEL": "Hayır",
    },
    "ur": {  # Urdu
        "QUESTION_WORD": "صحیح",
        "ENTAILMENT_LABEL": "جی ہاں",
        "NEUTRAL_LABEL": "اس لئے",
        "CONTRADICTION_LABEL": "نہیں",
    },
    "vi": {  # Vietnamese
        "QUESTION_WORD": "đúng",
        "ENTAILMENT_LABEL": "Vâng",
        "NEUTRAL_LABEL": "Vì vậy",
        "CONTRADICTION_LABEL": "Không",
    },
    "zh": {  # Chinese
        "QUESTION_WORD": "正确",
        "ENTAILMENT_LABEL": "是的",
        "NEUTRAL_LABEL": "所以",
        "CONTRADICTION_LABEL": "不是的",
    },
}


def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Generate a yaml file for each language.

    One ``xnli_<lang>.yaml`` file is written into ``output_dir`` for every key
    of ``LANGUAGES``. Each file starts with a "Generated by" comment line and
    then holds the ``include``, ``dataset_name``, ``task``, ``doc_to_text`` and
    ``doc_to_choice`` entries for that language.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    :raises FileExistsError: If ``overwrite`` is false and at least one target
        file already existed. The error is raised only after every language
        has been attempted and lists all files that were skipped.
    """
    # "x" is exclusive creation: open() raises FileExistsError instead of
    # truncating a file that is already there.
    mode = "w" if overwrite else "x"
    skipped = []
    for lang, labels in LANGUAGES.items():
        file_name = f"xnli_{lang}.yaml"
        # Built before the file is opened so a missing label key fails before
        # any file is created or truncated.
        doc_to_choice = _build_doc_to_choice(labels)
        try:
            with open(f"{output_dir}/{file_name}", mode, encoding="utf8") as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
                    {
                        "include": "xnli_common_yaml",
                        "dataset_name": lang,
                        "task": f"xnli_{lang}",
                        "doc_to_text": "",
                        "doc_to_choice": doc_to_choice,
                    },
                    f,
                    # Write non-ASCII labels as-is rather than as escape sequences.
                    allow_unicode=True,
                )
        except FileExistsError:
            # Keep going so the final error can name every file that was skipped.
            skipped.append(file_name)

    if skipped:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(skipped)}"
        )


def _build_doc_to_choice(labels: Dict[str, str]) -> str:
    """Return the ``doc_to_choice`` template string for one language's labels."""
    question_word = labels["QUESTION_WORD"]
    # One candidate per class, always in entailment / neutral / contradiction order.
    candidates = [
        f'premise+", {question_word}? {labels[key]}, "+hypothesis'
        for key in ("ENTAILMENT_LABEL", "NEUTRAL_LABEL", "CONTRADICTION_LABEL")
    ]
    # Candidates are separated by a bare comma and wrapped in a literal
    # "{{[" ... "]}}" (presumably evaluated as a template expression by
    # whatever consumes the yaml - confirm against the consumer).
    return "{{[" + ",".join(candidates) + "]}}"


def main() -> None:
    """Read the command-line options and generate the per-language yaml files."""
    cli = argparse.ArgumentParser()

    # Boolean switch: absent means False, present means True.
    cli.add_argument(
        "--overwrite",
        action="store_true",
        default=False,
        help="Overwrite files if they already exist",
    )
    # Target directory; falls back to the current working directory.
    cli.add_argument(
        "--output-dir",
        default=".",
        help="Directory to write yaml files to",
    )

    options = cli.parse_args()
    target_dir = options.output_dir
    replace_existing = options.overwrite
    gen_lang_yamls(output_dir=target_dir, overwrite=replace_existing)


if __name__ == "__main__":
    main()