Commit 2e2f28a5 authored by haileyschoelkopf
Browse files

add XX->en direction to translation tasks

parent 129762c2
# Generated by utils.py # Generated by utils.py
dataset_name: iwslt2017-ar-en dataset_name: iwslt2017-en-ar
dataset_path: iwslt2017 dataset_path: iwslt2017
doc_to_target: ' {{translation["en"]}}' doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'Arabic phrase: {{translation["ar"]}} doc_to_text: 'Arabic phrase: {{translation["ar"]}}
......
...@@ -30,7 +30,7 @@ gpt3_translation_benchmarks = { ...@@ -30,7 +30,7 @@ gpt3_translation_benchmarks = {
LANGUAGES = { LANGUAGES = {
**gpt3_translation_benchmarks, **gpt3_translation_benchmarks,
# "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"), # "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
"iwslt2017": ["en-ar", "ar-en"], # Arabic "iwslt2017": ["en-ar"], # Arabic
} }
...@@ -49,45 +49,48 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None: ...@@ -49,45 +49,48 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
""" """
err = [] err = []
for lang in LANGUAGES.keys(): for lang in LANGUAGES.keys():
for lang_pair in LANGUAGES[lang]: for dataset_name in LANGUAGES[lang]:
file_name = f"{lang}_{lang_pair}.yaml" src_lang, _, tgt_lang = dataset_name.partition("-")
try: for src, tgt in [[src_lang, tgt_lang], [tgt_lang, src_lang]]:
src_lang, _, tgt_lang = lang_pair.partition("-") # both translation directions for each lang pair
source, target = code_to_language(src_lang), code_to_language(tgt_lang) lang_pair = src + "-" + tgt
file_name = f"{lang}_{lang_pair}.yaml"
groups = ["greedy_until", "translation", lang] try:
if lang in gpt3_translation_benchmarks.keys(): source, target = code_to_language(src), code_to_language(tgt)
groups += ["gpt3_translation_benchmarks"]
groups = ["greedy_until", "translation", lang]
with open( if lang in gpt3_translation_benchmarks.keys():
f"{output_dir}/{file_name}", groups += ["gpt3_translation_benchmarks"]
"w" if overwrite else "x",
encoding="utf8", with open(
) as f: f"{output_dir}/{file_name}",
f.write("# Generated by utils.py\n") "w" if overwrite else "x",
yaml.dump( encoding="utf8",
{ ) as f:
"include": "wmt_common_yaml", f.write("# Generated by utils.py\n")
"group": groups, yaml.dump(
"dataset_path": lang, {
"dataset_name": lang_pair "include": "wmt_common_yaml",
if not (lang == "iwslt2017") "group": groups,
else "iwslt2017-" + lang_pair, "dataset_path": lang,
"task": f"{lang}-{lang_pair}", "dataset_name": dataset_name
"doc_to_text": f"{source} phrase: " if not (lang == "iwslt2017")
+ "{{translation[" else "iwslt2017-" + dataset_name,
+ f'"{src_lang}"' "task": f"{lang}-{lang_pair}",
+ "]}}\n" "doc_to_text": f"{source} phrase: "
+ f"{target} phrase:", + "{{translation["
"doc_to_target": " {{" + f'"{src}"'
+ "translation[" + "]}}\n"
+ f'"{tgt_lang}"]' + f"{target} phrase:",
+ "}}", "doc_to_target": " {{"
}, + "translation["
f, + f'"{tgt}"]'
) + "}}",
except FileExistsError: },
err.append(file_name) f,
)
except FileExistsError:
err.append(file_name)
if len(err) > 0: if len(err) > 0:
raise FileExistsError( raise FileExistsError(
......
# Generated by utils.py
# Task config: English -> French translation on wmt14.
# dataset_name is the dataset's language-pair name ("fr-en"); the generator
# reuses that name for both translation directions, so it does not have to
# match the direction given in `task`.
dataset_name: fr-en
dataset_path: wmt14
# Target: the French side of the pair, rendered with a leading space.
doc_to_target: ' {{translation["fr"]}}'
# Prompt: the English source text followed by a "French phrase:" cue.
doc_to_text: 'English phrase: {{translation["en"]}}
French phrase:'
# Groups this task is listed under.
group:
- greedy_until
- translation
- wmt14
- gpt3_translation_benchmarks
# Settings not given here come from the included file
# (presumably shared WMT defaults - confirm against wmt_common_yaml).
include: wmt_common_yaml
task: wmt14-en-fr
# Generated by utils.py
# Task config: English -> German translation on wmt16.
# dataset_name is the dataset's language-pair name ("de-en"); the generator
# reuses that name for both translation directions, so it does not have to
# match the direction given in `task`.
dataset_name: de-en
dataset_path: wmt16
# Target: the German side of the pair, rendered with a leading space.
doc_to_target: ' {{translation["de"]}}'
# Prompt: the English source text followed by a "German phrase:" cue.
doc_to_text: 'English phrase: {{translation["en"]}}
German phrase:'
# Groups this task is listed under.
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
# Settings not given here come from the included file
# (presumably shared WMT defaults - confirm against wmt_common_yaml).
include: wmt_common_yaml
task: wmt16-en-de
# Generated by utils.py
# Task config: English -> Romanian translation on wmt16.
# dataset_name is the dataset's language-pair name ("ro-en"); the generator
# reuses that name for both translation directions, so it does not have to
# match the direction given in `task`.
dataset_name: ro-en
dataset_path: wmt16
# Target: the Romanian side of the pair, rendered with a leading space.
doc_to_target: ' {{translation["ro"]}}'
# Prompt: the English source text followed by a "Romanian phrase:" cue.
doc_to_text: 'English phrase: {{translation["en"]}}
Romanian phrase:'
# Groups this task is listed under.
group:
- greedy_until
- translation
- wmt16
- gpt3_translation_benchmarks
# Settings not given here come from the included file
# (presumably shared WMT defaults - confirm against wmt_common_yaml).
include: wmt_common_yaml
task: wmt16-en-ro
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment