Commit 2e2f28a5 authored by haileyschoelkopf
Browse files

add XX->en direction to translation tasks

parent 129762c2
# Generated by utils.py
dataset_name: iwslt2017-ar-en
dataset_name: iwslt2017-en-ar
dataset_path: iwslt2017
doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'Arabic phrase: {{translation["ar"]}}
......
......@@ -30,7 +30,7 @@ gpt3_translation_benchmarks = {
# Maps each dataset (used as `dataset_path`) to the language pairs for which
# task YAML files are generated.
# NOTE(review): the two "iwslt2017" entries below look like the removed and
# added sides of a diff hunk shown together — confirm against the real file.
# As written, the later duplicate key wins, so only ["en-ar"] takes effect.
LANGUAGES = {
**gpt3_translation_benchmarks,
# "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
"iwslt2017": ["en-ar", "ar-en"], # Arabic
"iwslt2017": ["en-ar"], # Arabic
}
......@@ -49,45 +49,48 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
"""
err = []
for lang in LANGUAGES.keys():
for lang_pair in LANGUAGES[lang]:
file_name = f"{lang}_{lang_pair}.yaml"
try:
src_lang, _, tgt_lang = lang_pair.partition("-")
source, target = code_to_language(src_lang), code_to_language(tgt_lang)
groups = ["greedy_until", "translation", lang]
if lang in gpt3_translation_benchmarks.keys():
groups += ["gpt3_translation_benchmarks"]
with open(
f"{output_dir}/{file_name}",
"w" if overwrite else "x",
encoding="utf8",
) as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
"include": "wmt_common_yaml",
"group": groups,
"dataset_path": lang,
"dataset_name": lang_pair
if not (lang == "iwslt2017")
else "iwslt2017-" + lang_pair,
"task": f"{lang}-{lang_pair}",
"doc_to_text": f"{source} phrase: "
+ "{{translation["
+ f'"{src_lang}"'
+ "]}}\n"
+ f"{target} phrase:",
"doc_to_target": " {{"
+ "translation["
+ f'"{tgt_lang}"]'
+ "}}",
},
f,
)
except FileExistsError:
err.append(file_name)
for dataset_name in LANGUAGES[lang]:
src_lang, _, tgt_lang = dataset_name.partition("-")
for src, tgt in [[src_lang, tgt_lang], [tgt_lang, src_lang]]:
# both translation directions for each lang pair
lang_pair = src + "-" + tgt
file_name = f"{lang}_{lang_pair}.yaml"
try:
source, target = code_to_language(src), code_to_language(tgt)
groups = ["greedy_until", "translation", lang]
if lang in gpt3_translation_benchmarks.keys():
groups += ["gpt3_translation_benchmarks"]
with open(
f"{output_dir}/{file_name}",
"w" if overwrite else "x",
encoding="utf8",
) as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
"include": "wmt_common_yaml",
"group": groups,
"dataset_path": lang,
"dataset_name": dataset_name
if not (lang == "iwslt2017")
else "iwslt2017-" + dataset_name,
"task": f"{lang}-{lang_pair}",
"doc_to_text": f"{source} phrase: "
+ "{{translation["
+ f'"{src}"'
+ "]}}\n"
+ f"{target} phrase:",
"doc_to_target": " {{"
+ "translation["
+ f'"{tgt}"]'
+ "}}",
},
f,
)
except FileExistsError:
err.append(file_name)
if len(err) > 0:
raise FileExistsError(
......
# Generated by utils.py
dataset_name: fr-en
dataset_path: wmt14
doc_to_target: ' {{translation["fr"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
French phrase:'
group:
- greedy_until
- translation
- wmt14
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt14-en-fr
# Generated by utils.py
dataset_name: de-en
dataset_path: wmt16
doc_to_target: ' {{translation["de"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
German phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-en-de
# Generated by utils.py
dataset_name: ro-en
dataset_path: wmt16
doc_to_target: ' {{translation["ro"]}}'
doc_to_text: 'English phrase: {{translation["en"]}}
Romanian phrase:'
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
include: wmt_common_yaml
task: wmt16-en-ro
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment