# utils.py: generates one xnli yaml file per language.
import argparse

import yaml


# Different languages that are part of xnli.
# These correspond to dataset names (Subsets) on HuggingFace.
# A yaml file is generated by this script for each language.

# Maps each language code to the localized words that gen_lang_yamls reads
# when building the per-language "doc_to_choice" string. Every entry has the
# same four keys:
#   QUESTION_WORD       - word placed before the "?" in each choice
#   ENTAILMENT_LABEL    - word used in the first choice
#   NEUTRAL_LABEL       - word used in the second choice
#   CONTRADICTION_LABEL - word used in the third choice
LANGUAGES = {
    "ar": {  # Arabic
        "QUESTION_WORD": "صحيح",
        "ENTAILMENT_LABEL": "نعم",
        "NEUTRAL_LABEL": "لذا",
        "CONTRADICTION_LABEL": "رقم",
    },
    "bg": {  # Bulgarian
        "QUESTION_WORD": "правилно",
        "ENTAILMENT_LABEL": "да",
        "NEUTRAL_LABEL": "така",
        "CONTRADICTION_LABEL": "не",
    },
    "de": {  # German
        "QUESTION_WORD": "richtig",
        "ENTAILMENT_LABEL": "Ja",
        "NEUTRAL_LABEL": "Auch",
        "CONTRADICTION_LABEL": "Nein",
    },
    "el": {  # Greek
        "QUESTION_WORD": "σωστός",
        "ENTAILMENT_LABEL": "Ναί",
        "NEUTRAL_LABEL": "Έτσι",
        "CONTRADICTION_LABEL": "όχι",
    },
    "en": {  # English
        "QUESTION_WORD": "right",
        "ENTAILMENT_LABEL": "Yes",
        "NEUTRAL_LABEL": "Also",
        "CONTRADICTION_LABEL": "No",
    },
    "es": {  # Spanish
        "QUESTION_WORD": "correcto",
        "ENTAILMENT_LABEL": "Sí",
        "NEUTRAL_LABEL": "Asi que",
        "CONTRADICTION_LABEL": "No",
    },
    "fr": {  # French
        "QUESTION_WORD": "correct",
        "ENTAILMENT_LABEL": "Oui",
        "NEUTRAL_LABEL": "Aussi",
        "CONTRADICTION_LABEL": "Non",
    },
    "hi": {  # Hindi
        "QUESTION_WORD": "सही",
        "ENTAILMENT_LABEL": "हाँ",
        "NEUTRAL_LABEL": "इसलिए",
        "CONTRADICTION_LABEL": "नहीं",
    },
    "ru": {  # Russian
        "QUESTION_WORD": "правильно",
        "ENTAILMENT_LABEL": "Да",
        "NEUTRAL_LABEL": "Так",
        "CONTRADICTION_LABEL": "Нет",
    },
    "sw": {  # Swahili
        "QUESTION_WORD": "sahihi",
        "ENTAILMENT_LABEL": "Ndiyo",
        "NEUTRAL_LABEL": "Hivyo",
        "CONTRADICTION_LABEL": "Hapana",
    },
    "th": {  # Thai
        "QUESTION_WORD": "ถูกต้อง",
        "ENTAILMENT_LABEL": "ใช่",
        "NEUTRAL_LABEL": "ดังนั้น",
        "CONTRADICTION_LABEL": "ไม่",
    },
    "tr": {  # Turkish
        "QUESTION_WORD": "doğru",
        "ENTAILMENT_LABEL": "Evet",
        "NEUTRAL_LABEL": "Böylece",
        "CONTRADICTION_LABEL": "Hayır",
    },
    "ur": {  # Urdu
        "QUESTION_WORD": "صحیح",
        "ENTAILMENT_LABEL": "جی ہاں",
        "NEUTRAL_LABEL": "اس لئے",
        "CONTRADICTION_LABEL": "نہیں",
    },
    "vi": {  # Vietnamese
        "QUESTION_WORD": "đúng",
        "ENTAILMENT_LABEL": "Vâng",
        "NEUTRAL_LABEL": "Vì vậy",
        "CONTRADICTION_LABEL": "Không",
    },
    "zh": {  # Chinese
        "QUESTION_WORD": "正确",
        "ENTAILMENT_LABEL": "是的",
        "NEUTRAL_LABEL": "所以",
        "CONTRADICTION_LABEL": "不是的",
    },
}


def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Generate a yaml file for each language.

    For every entry in ``LANGUAGES`` a file named ``xnli_<lang>.yaml`` is
    written into ``output_dir``. Each file starts with a "Generated by" comment
    line followed by the yaml dump of the language-specific settings.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    :raises FileExistsError: If ``overwrite`` is false and at least one target
        file already exists. All languages are still processed first, so files
        that did not exist yet are created; the error lists the skipped ones.
    """
    # "x" is exclusive creation: open() raises FileExistsError instead of
    # truncating an existing file.
    mode = "w" if overwrite else "x"

    err = []
    for lang, labels in LANGUAGES.items():
        file_name = f"xnli_{lang}.yaml"
        question_word = labels["QUESTION_WORD"]
        answer_labels = (
            labels["ENTAILMENT_LABEL"],
            labels["NEUTRAL_LABEL"],
            labels["CONTRADICTION_LABEL"],
        )

        # One choice expression per label, comma-separated and wrapped in
        # literal "{{[" ... "]}}".
        # NOTE(review): the double braces look like template delimiters for
        # whatever consumes the yaml -- confirm against the loader.
        choices = ",".join(
            f'premise+", {question_word}? {label}, "+hypothesis'
            for label in answer_labels
        )
        config = {
            "include": "xnli_common_yaml",
            "dataset_name": lang,
            "task": f"xnli_{lang}",
            "doc_to_text": "",
            "doc_to_choice": "{{[" + choices + "]}}",
        }

        # Only the file creation can raise FileExistsError, so the try covers
        # just the open/write; existing files are collected and reported once.
        try:
            with open(f"{output_dir}/{file_name}", mode, encoding="utf8") as f:
                f.write("# Generated by utils.py\n")
                # allow_unicode keeps the non-ASCII labels readable instead of
                # emitting escape sequences.
                yaml.dump(config, f, allow_unicode=True)
        except FileExistsError:
            err.append(file_name)

    if err:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(err)}"
        )


def main() -> None:
    """Read the command-line options and write the per-language yaml files."""
    cli = argparse.ArgumentParser()

    # Boolean switch: absent means existing files are left untouched.
    cli.add_argument(
        "--overwrite",
        action="store_true",
        default=False,
        help="Overwrite files if they already exist",
    )
    # Destination directory; defaults to the current working directory.
    cli.add_argument(
        "--output-dir",
        default=".",
        help="Directory to write yaml files to",
    )

    options = cli.parse_args()
    gen_lang_yamls(
        output_dir=options.output_dir,
        overwrite=options.overwrite,
    )


# Run the generator only when this file is executed as a script, so importing
# the module does not parse arguments or write any files.
if __name__ == "__main__":
    main()