flores_gen_806ede.py 7.75 KB
Newer Older
Leymore's avatar
Leymore committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import TopkRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import BleuEvaluator
from opencompass.datasets import FloresFirst100Dataset

# Language table driving the FLORES translation subtasks. Each row is
#     [key, flores_code, english_name, family]
# - key: short language key; rows are indexed by it in `flores_lang_map`
#   and it appears in the "eng-xxx" / "xxx-eng" subtask names and in each
#   dataset's `abbr`.
# - flores_code: code of the form `<lang>_<Script>`; used to build the
#   `sentence_<flores_code>` column names and the dataset `name`. It is not
#   always derived from the key (e.g. "fas" -> "pes_Arab",
#   "zho_simpl" -> "zho_Hans").
# - english_name: language name interpolated into the translation prompt.
# - family: language-family label; discarded when rows are unpacked in the
#   dataset-building loop further down.
_flores_lang_map = [
    ["eng", "eng_Latn", "English", "Indo-European-Germanic"],
    ["afr", "afr_Latn", "Afrikaans", "Indo-European-Germanic"],
    ["dan", "dan_Latn", "Danish", "Indo-European-Germanic"],
    ["deu", "deu_Latn", "German", "Indo-European-Germanic"],
    ["isl", "isl_Latn", "Icelandic", "Indo-European-Germanic"],
    ["ltz", "ltz_Latn", "Luxembourgish", "Indo-European-Germanic"],
    ["nld", "nld_Latn", "Dutch", "Indo-European-Germanic"],
    ["nob", "nob_Latn", "Norwegian", "Indo-European-Germanic"],
    ["swe", "swe_Latn", "Swedish", "Indo-European-Germanic"],
    ["ast", "ast_Latn", "Asturian", "Indo-European-Romance"],
    ["cat", "cat_Latn", "Catalan", "Indo-European-Romance"],
    ["fra", "fra_Latn", "French", "Indo-European-Romance"],
    ["glg", "glg_Latn", "Galician", "Indo-European-Romance"],
    ["oci", "oci_Latn", "Occitan", "Indo-European-Romance"],
    ["por", "por_Latn", "Portuguese", "Indo-European-Romance"],
    ["ron", "ron_Latn", "Romanian", "Indo-European-Romance"],
    ["spa", "spa_Latn", "Spanish", "Indo-European-Romance"],
    ["bel", "bel_Cyrl", "Belarusian", "Indo-European-Slavic"],
    ["bos", "bos_Latn", "Bosnian", "Indo-European-Slavic"],
    ["bul", "bul_Cyrl", "Bulgarian", "Indo-European-Slavic"],
    ["ces", "ces_Latn", "Czech", "Indo-European-Slavic"],
    ["hrv", "hrv_Latn", "Croatian", "Indo-European-Slavic"],
    ["mkd", "mkd_Cyrl", "Macedonian", "Indo-European-Slavic"],
    ["pol", "pol_Latn", "Polish", "Indo-European-Slavic"],
    ["rus", "rus_Cyrl", "Russian", "Indo-European-Slavic"],
    ["slk", "slk_Latn", "Slovak", "Indo-European-Slavic"],
    ["slv", "slv_Latn", "Slovenian", "Indo-European-Slavic"],
    ["srp", "srp_Cyrl", "Serbian", "Indo-European-Slavic"],
    ["ukr", "ukr_Cyrl", "Ukrainian", "Indo-European-Slavic"],
    ["asm", "asm_Beng", "Assamese", "Indo-European-Indo-Aryan"],
    ["ben", "ben_Beng", "Bengali", "Indo-European-Indo-Aryan"],
    ["guj", "guj_Gujr", "Gujarati", "Indo-European-Indo-Aryan"],
    ["hin", "hin_Deva", "Hindi", "Indo-European-Indo-Aryan"],
    ["mar", "mar_Deva", "Marathi", "Indo-European-Indo-Aryan"],
    ["npi", "npi_Deva", "Nepali", "Indo-European-Indo-Aryan"],
    ["ory", "ory_Orya", "Oriya", "Indo-European-Indo-Aryan"],
    ["pan", "pan_Guru", "Punjabi", "Indo-European-Indo-Aryan"],
    ["snd", "snd_Arab", "Sindhi", "Indo-European-Indo-Aryan"],
    ["urd", "urd_Arab", "Urdu", "Indo-European-Indo-Aryan"],
    ["ckb", "ckb_Arab", "Kurdish", "Indo-European-Other"],
    ["cym", "cym_Latn", "Welsh", "Indo-European-Other"],
    ["ell", "ell_Grek", "Greek", "Indo-European-Other"],
    ["fas", "pes_Arab", "Persian", "Indo-European-Other"],
    ["gle", "gle_Latn", "Irish", "Indo-European-Other"],
    ["hye", "hye_Armn", "Armenian", "Indo-European-Other"],
    ["ita", "ita_Latn", "Italian", "Indo-European-Other"],
    ["lav", "lvs_Latn", "Latvian", "Indo-European-Other"],
    ["lit", "lit_Latn", "Lithuanian", "Indo-European-Other"],
    ["pus", "pbt_Arab", "Pashto", "Indo-European-Other"],
    ["tgk", "tgk_Cyrl", "Tajik", "Indo-European-Other"],
    ["ceb", "ceb_Latn", "Cebuano", "Austronesian"],
    ["ind", "ind_Latn", "Indonesian", "Austronesian"],
    ["jav", "jav_Latn", "Javanese", "Austronesian"],
    ["mri", "mri_Latn", "Maori", "Austronesian"],
    ["msa", "zsm_Latn", "Malay", "Austronesian"],
    ["tgl", "tgl_Latn", "Tagalog", "Austronesian"],
    ["ibo", "ibo_Latn", "Igbo", "Atlantic-Congo"],
    ["kam", "kam_Latn", "Kamba", "Atlantic-Congo"],
    ["kea", "kea_Latn", "Kabuverdianu", "Atlantic-Congo"],
    ["lin", "lin_Latn", "Lingala", "Atlantic-Congo"],
    ["lug", "lug_Latn", "Luganda", "Atlantic-Congo"],
    ["nso", "nso_Latn", "Northern Sotho", "Atlantic-Congo"],
    ["nya", "nya_Latn", "Nyanja", "Atlantic-Congo"],
    ["sna", "sna_Latn", "Shona", "Atlantic-Congo"],
    ["swh", "swh_Latn", "Swahili", "Atlantic-Congo"],
    ["umb", "umb_Latn", "Umbundu", "Atlantic-Congo"],
    ["wol", "wol_Latn", "Wolof", "Atlantic-Congo"],
    ["xho", "xho_Latn", "Xhosa", "Atlantic-Congo"],
    ["yor", "yor_Latn", "Yoruba", "Atlantic-Congo"],
    ["zul", "zul_Latn", "Zulu", "Atlantic-Congo"],
    ["amh", "amh_Ethi", "Amharic", "Afro-Asiatic"],
    ["ara", "arb_Arab", "Arabic", "Afro-Asiatic"],
    ["ful", "fuv_Latn", "Fulah", "Afro-Asiatic"],
    ["mlt", "mlt_Latn", "Maltese", "Afro-Asiatic"],
    ["orm", "gaz_Latn", "Oromo", "Afro-Asiatic"],
    ["som", "som_Latn", "Somali", "Afro-Asiatic"],
    ["azj", "azj_Latn", "Azerbaijani", "Turkic"],
    ["kaz", "kaz_Cyrl", "Kazakh", "Turkic"],
    ["kir", "kir_Cyrl", "Kyrgyz", "Turkic"],
    ["tur", "tur_Latn", "Turkish", "Turkic"],
    ["uzb", "uzn_Latn", "Uzbek", "Turkic"],
    ["kan", "kan_Knda", "Kannada", "Dravidian"],
    ["mal", "mal_Mlym", "Malayalam", "Dravidian"],
    ["tam", "tam_Taml", "Tamil", "Dravidian"],
    ["tel", "tel_Telu", "Telugu", "Dravidian"],
    ["mya", "mya_Mymr", "Burmese", "Sino-Tibetan"],
    ["zho_simpl", "zho_Hans", "Chinese (Simpl)", "Sino-Tibetan"],
    ["zho_trad", "zho_Hant", "Chinese (Trad)", "Sino-Tibetan"],
    ["est", "est_Latn", "Estonian", "Other"],
    ["fin", "fin_Latn", "Finnish", "Other"],
    ["hau", "hau_Latn", "Hausa", "Other"],
    ["heb", "heb_Hebr", "Hebrew", "Other"],
    ["hun", "hun_Latn", "Hungarian", "Other"],
    ["jpn", "jpn_Jpan", "Japanese", "Other"],
    ["kat", "kat_Geor", "Georgian", "Other"],
    ["khm", "khm_Khmr", "Khmer", "Other"],
    ["kor", "kor_Hang", "Korean", "Other"],
    ["lao", "lao_Laoo", "Lao", "Other"],
    ["luo", "luo_Latn", "Luo", "Other"],
    ["mon", "khk_Cyrl", "Mongolian", "Other"],
    ["tha", "tha_Thai", "Thai", "Other"],
    ["vie", "vie_Latn", "Vietnamese", "Other"],
]
# Index the language table by each row's short key (its first entry).
flores_lang_map = {row[0]: row for row in _flores_lang_map}

# Pair every non-English language with English in both directions: all
# "eng-xxx" subtasks come first, followed by all "xxx-eng" subtasks.
_flores_subtasks = [
    f"eng-{lang}" if from_english else f"{lang}-eng"
    for from_english in (True, False)
    for lang in flores_lang_map
    if lang != "eng"
]

# Build one dataset config per translation direction listed in
# `_flores_subtasks` and collect them in `flores_datasets`.
flores_datasets = []
for _flores_subtask in _flores_subtasks:
    # Subtask names look like "<src>-<tgt>"; the language keys themselves
    # contain no "-", so the split always yields exactly two parts.
    _src, _tgt = _flores_subtask.split("-")
    # Row layout: [key, flores_code, english_name, family]; only the FLORES
    # code and the English name are needed here.
    _, _flores_source, _src_inst, _ = flores_lang_map[_src]
    _, _flores_target, _tgt_inst, _ = flores_lang_map[_tgt]

    # The source sentence column is the model input and the target sentence
    # column is the reference output.
    flores_reader_cfg = dict(
        input_columns=f"sentence_{_flores_source}",
        output_column=f"sentence_{_flores_target}",
        train_split="dev",
        test_split="devtest"
    )
    flores_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                # "</E>" is also passed as `ice_token` below; presumably it
                # marks where in-context examples are spliced in (confirm
                # against PromptTemplate).
                begin="</E>",
                round=[
                    dict(
                        role="HUMAN",
                        # The doubled braces leave a literal
                        # "{sentence_<code>}" placeholder in the prompt
                        # after f-string formatting.
                        prompt=
                        f"Translate the following {_src_inst} statements to {_tgt_inst}.\n{{sentence_{_flores_source}}}"
                    ),
                    dict(role="BOT", prompt=f"{{sentence_{_flores_target}}}"),
                ],
            ),
            ice_token="</E>",
        ),
        retriever=dict(type=TopkRetriever, ice_num=8),
        inferencer=dict(type=GenInferencer),
    )
    flores_eval_cfg = dict(
        evaluator=dict(type=BleuEvaluator),
        pred_role="BOT",
    )
    # Only Simplified Chinese targets get the "flores" postprocessor, applied
    # to both predictions and references before scoring.
    if _tgt == "zho_simpl":
        flores_eval_cfg["pred_postprocessor"] = dict(type="flores")
        flores_eval_cfg["dataset_postprocessor"] = dict(type="flores")
    flores_datasets.append(
        dict(
            abbr=f"flores_100_{_src}-{_tgt}",
            type=FloresFirst100Dataset,
            path='./data/flores_first100',
            name=f"{_flores_source}-{_flores_target}",
            # Shallow copies: each entry owns its top-level config dict.
            reader_cfg=flores_reader_cfg.copy(),
            infer_cfg=flores_infer_cfg.copy(),
            eval_cfg=flores_eval_cfg.copy(),
        ))