Commit 173b2bc3 authored by Baber

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_nb
dataset_name: nb
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_pl
dataset_name: pl
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_pt
dataset_name: pt
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_sv
dataset_name: sv
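Each fragment above relies on the harness's `include` mechanism: the referenced YAML is loaded as the base config, and the remaining keys in the including file override it. Conceptually, the merge works like a dict update (a minimal sketch with a hypothetical base config; the real loader lives in `lm_eval`):

```python
# Sketch of how an `include` directive is resolved: the base config is
# loaded first, then the including file's keys take precedence.
base = {  # hypothetical contents of arc_challenge_mt_fi.yaml
    "task": "arc_challenge_mt_fi",
    "dataset_name": "fi",
    "output_type": "multiple_choice",
}
override = {"task": "arc_challenge_mt_sv", "dataset_name": "sv"}

merged = {**base, **override}  # later keys win
print(merged["task"])  # the overriding task name
```

Only the overridden keys change; everything else (output type, metrics, splits) is inherited from the base file.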
...@@ -27,9 +27,9 @@ Homepage: https://github.com/openai/gpt-3/tree/master/data
}
```
### Groups, Tags, and Tasks
#### Tags
* `arithmetic`: Evaluates `1dc` to `5ds`
...
tag:
- arithmetic
task: arithmetic_1dc
dataset_path: EleutherAI/arithmetic
...
...@@ -32,7 +32,7 @@ Homepage: https://github.com/chaochun/nlu-asdiv-dataset
}
```
### Groups, Tags, and Tasks
#### Groups
...@@ -41,6 +41,11 @@ Homepage: https://github.com/chaochun/nlu-asdiv-dataset
#### Tasks
* `asdiv`
* `asdiv_cot_llama`: ASDIV with prompt formatting modified to conform to the evaluation settings described by Meta here: https://huggingface.co/datasets/meta-llama/Meta-Llama-3.1-8B-Instruct-evals/viewer/Meta-Llama-3.1-8B-Instruct-evals__gsm8k__details?row=0
- Note that the CoT prompt from https://arxiv.org/pdf/2201.11903 is used exactly as in GSM8K-CoT
- This task is set up to run identically to `gsm8k_cot_llama`, but on ASDIV.
- Run this task with `--fewshot_as_multiturn` and `--apply_chat_template` to evaluate Llama Instruct models correctly.
### Checklist
...
dataset_path: EleutherAI/asdiv
doc_to_target: "{{answer.split(' (')[0] if answer is defined else target}}"
doc_to_text: "Given the following problem, reason and give a final answer to the problem.\nProblem: {{body if body is defined}} {{question}}\nYour response should end with \"The final answer is [answer]\" where [answer] is the response to the problem.\n"
fewshot_config:
sampler: first_n
samples:
- question: There are 15 trees in the grove. Grove workers will plant trees in the
grove today. After they are done, there will be 21 trees. How many trees did
the grove workers plant today?
target: There are 15 trees originally. Then there were 21 trees after some more
were planted. So there must have been 21 - 15 = 6. The final answer is 6
- question: If there are 3 cars in the parking lot and 2 more cars arrive, how many
cars are in the parking lot?
target: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The final answer
is 5
- question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many
pieces do they have left in total?
target: Originally, Leah had 32 chocolates. Her sister had 42. So in total they
had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The final answer is 39
- question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12
lollipops. How many lollipops did Jason give to Denny?
target: Jason started with 20 lollipops. Then he had 12 after giving some to Denny.
So he gave Denny 20 - 12 = 8. The final answer is 8
- question: Shawn has five toys. For Christmas, he got two toys each from his mom and
dad. How many toys does he have now?
target: Shawn started with 5 toys. If he got 2 toys each from his mom and dad,
then that is 4 more toys. 5 + 4 = 9. The final answer is 9
- question: There were nine computers in the server room. Five more computers were
installed each day, from monday to thursday. How many computers are now in the
server room?
target: There were originally 9 computers. For each of 4 days, 5 more computers
were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The final answer is
29
- question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday,
he lost 2 more. How many golf balls did he have at the end of wednesday?
target: Michael started with 58 golf balls. After losing 23 on tuesday, he had
58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The final answer
is 33
- question: Olivia has $23. She bought five bagels for $3 each. How much money does
she have left?
target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
filter_list:
- filter:
- function: regex
group_select: -1
regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
- function: take_first
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- '<|eot_id|>'
- '<|start_header_id|>user<|end_header_id|>'
- 'Q:'
- </s>
- <|im_end|>
tag:
- chain_of_thought
metadata:
version: 1.0
metric_list:
- aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: false
metric: exact_match
regexes_to_ignore:
- ','
- \$
- '(?s).*#### '
- \.$
num_fewshot: 8
output_type: generate_until
repeats: 1
task: asdiv_cot_llama
validation_split: validation
test_split: validation
should_decontaminate: true
doc_to_decontamination_query: "{{body}} {{question}}"
dataset_kwargs:
trust_remote_code: true
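The two filters and the `exact_match` preprocessing in the config above can be traced by hand. A minimal sketch of the strict-match extraction (`group_select: -1`, i.e. the last regex match) and the `regexes_to_ignore`/`ignore_case` normalization, reimplemented outside the harness for illustration:

```python
import re

# strict-match filter: all matches, keep the last one (group_select: -1)
STRICT = r"The final answer is ((-?[\$0-9.,]{2,})|(-?[0-9]+))"

def extract_strict(completion: str) -> str:
    matches = re.findall(STRICT, completion)
    return matches[-1][0] if matches else "[invalid]"

# exact_match preprocessing from metric_list above
IGNORE = [",", r"\$", r"(?s).*#### ", r"\.$"]

def normalize(text: str) -> str:
    for pattern in IGNORE:
        text = re.sub(pattern, "", text)
    return text.lower()  # ignore_case: true

pred = extract_strict("5 x 3 = 15. 23 - 15 is 8. The final answer is 8")
print(normalize(pred))  # "8"
```

The flexible-extract filter differs only in dropping the `The final answer is` anchor, so it picks up the last number-like span anywhere in the completion.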
...@@ -21,12 +21,16 @@ Homepage: https://github.com/facebookarchive/bAbI-tasks
}
```
### Groups, Tags, and Tasks
#### Groups
* Not part of a group yet
#### Tags
* No tags applied.
#### Tasks
* `babi`
...
# BasqueBench
### Paper
BasqueBench is a benchmark for evaluating language models on Basque tasks; that is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench combines pre-existing open datasets with datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon.
The new evaluation datasets included in BasqueBench are:
| Task | Category | Homepage |
|:-------------:|:-----:|:-----:|
| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
| PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu |
| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/wnli-eu |
| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
The datasets included in BasqueBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
| Belebele_eu | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele |
| EusExams | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusExams |
| EusProficiency | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusProficiency |
| EusReading | Reading Comprehension | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusReading |
| EusTrivia | Question Answering | [Latxa: An Open Language Model and Evaluation Suite for Basque](https://arxiv.org/abs/2403.20266) | https://huggingface.co/datasets/HiTZ/EusTrivia |
| FLORES_eu | Translation | [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) | https://huggingface.co/datasets/facebook/flores |
| QNLIeu | Natural Language Inference | [BasqueGLUE: A Natural Language Understanding Benchmark for Basque](https://aclanthology.org/2022.lrec-1.172/) | https://huggingface.co/datasets/orai-nlp/basqueGLUE |
| XNLIeu | Natural Language Inference | [XNLIeu: a dataset for cross-lingual NLI in Basque](https://arxiv.org/abs/2404.06996) | https://huggingface.co/datasets/HiTZ/xnli-eu |
| XStoryCloze_eu | Commonsense Reasoning | [Few-shot Learning with Multilingual Generative Language Models](https://aclanthology.org/2022.emnlp-main.616/) | https://huggingface.co/datasets/juletxara/xstory_cloze |
### Citation
Paper for BasqueBench coming soon.
### Groups and Tasks
#### Groups
- `basque_bench`: All tasks included in BasqueBench.
- `flores_eu`: All FLORES translation tasks from or to Basque.
#### Tasks
The following tasks evaluate models on the BasqueBench datasets using various scoring methods.
- `belebele_eus_Latn`
- `eus_exams_eu`
- `eus_proficiency`
- `eus_reading`
- `eus_trivia`
- `flores_eu`
- `flores_eu-ca`
- `flores_eu-de`
- `flores_eu-en`
- `flores_eu-es`
- `flores_eu-fr`
- `flores_eu-gl`
- `flores_eu-it`
- `flores_eu-pt`
- `flores_ca-eu`
- `flores_de-eu`
- `flores_en-eu`
- `flores_es-eu`
- `flores_fr-eu`
- `flores_gl-eu`
- `flores_it-eu`
- `flores_pt-eu`
- `mgsm_direct_eu`
- `mgsm_native_cot_eu`
- `piqa_eu`
- `qnlieu`
- `wnli_eu`
- `xcopa_eu`
- `xnli_eu`
- `xnli_eu_native`
- `xstorycloze_eu`
Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. These are:
- `belebele_eus_Latn`: Belebele Basque
- `qnlieu`: From BasqueGLUE
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
* [ ] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: basque_bench
task:
- belebele_eus_Latn
- xstorycloze_eu
- flores_eu
- eus_reading
- eus_proficiency
- eus_trivia
- eus_exams_eu
- qnlieu
- xnli_eu
- xnli_eu_native
- wnli_eu
- xcopa_eu
- mgsm_direct_eu
- mgsm_native_cot_eu
- piqa_eu
metadata:
version: 1.0
tag: flores
dataset_path: facebook/flores
dataset_name: all
output_type: generate_until
#! The test split of flores is not publicly available! (See paper section 6.1)
training_split: dev
validation_split: dev
test_split: devtest
fewshot_split: dev
target_delimiter: ''
generation_kwargs:
until:
- "\n"
metric_list:
- metric: bleu
aggregation: bleu
higher_is_better: true
- metric: ter
aggregation: ter
higher_is_better: false
- metric: chrf
aggregation: chrf
higher_is_better: true
metadata:
version: 0.1
dataset_kwargs:
trust_remote_code: true
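The `until` list in `generation_kwargs` tells the harness where to cut the model's continuation; for these translation tasks a single newline ends each hypothesis. A minimal sketch of that stop-sequence truncation (not the harness's actual implementation):

```python
def truncate_at_stop(text: str, until: list[str]) -> str:
    # Cut at the earliest occurrence of any stop sequence.
    cut = len(text)
    for stop in until:
        idx = text.find(stop)
        if idx != -1:
            cut = min(cut, idx)
    return text[:cut]

generated = "Kaixo mundua!\nEnglish sentence: Hello world!"
print(truncate_at_stop(generated, ["\n"]))  # "Kaixo mundua!"
```

With `target_delimiter: ''` the reference follows the prompt with no extra separator, so the truncated first line is exactly what BLEU/TER/chrF score against.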
# ruff: noqa: E731, E741
"""
Script to generate task YAMLs for the FLORES-200 dataset.
Based on `tasks/translation/utils.py`.
"""
import argparse
import itertools
import yaml
from langcodes import Language
# utils
flatten = lambda l: list(itertools.chain(*l))
# constants
_LANGUAGES = [
"ace_Arab",
"bam_Latn",
"dzo_Tibt",
"hin_Deva",
"khm_Khmr",
"mag_Deva",
"pap_Latn",
"sot_Latn",
"tur_Latn",
"ace_Latn",
"ban_Latn",
"ell_Grek",
"hne_Deva",
"kik_Latn",
"mai_Deva",
"pbt_Arab",
"spa_Latn",
"twi_Latn",
"acm_Arab",
"bel_Cyrl",
"eng_Latn",
"hrv_Latn",
"kin_Latn",
"mal_Mlym",
"pes_Arab",
"srd_Latn",
"tzm_Tfng",
"acq_Arab",
"bem_Latn",
"epo_Latn",
"hun_Latn",
"kir_Cyrl",
"mar_Deva",
"plt_Latn",
"srp_Cyrl",
"uig_Arab",
"aeb_Arab",
"ben_Beng",
"est_Latn",
"hye_Armn",
"kmb_Latn",
"min_Arab",
"pol_Latn",
"ssw_Latn",
"ukr_Cyrl",
"afr_Latn",
"bho_Deva",
"eus_Latn",
"ibo_Latn",
"kmr_Latn",
"min_Latn",
"por_Latn",
"sun_Latn",
"umb_Latn",
"ajp_Arab",
"bjn_Arab",
"ewe_Latn",
"ilo_Latn",
"knc_Arab",
"mkd_Cyrl",
"prs_Arab",
"swe_Latn",
"urd_Arab",
"aka_Latn",
"bjn_Latn",
"fao_Latn",
"ind_Latn",
"knc_Latn",
"mlt_Latn",
"quy_Latn",
"swh_Latn",
"uzn_Latn",
"als_Latn",
"bod_Tibt",
"fij_Latn",
"isl_Latn",
"kon_Latn",
"mni_Beng",
"ron_Latn",
"szl_Latn",
"vec_Latn",
"amh_Ethi",
"bos_Latn",
"fin_Latn",
"ita_Latn",
"kor_Hang",
"mos_Latn",
"run_Latn",
"tam_Taml",
"vie_Latn",
"apc_Arab",
"bug_Latn",
"fon_Latn",
"jav_Latn",
"lao_Laoo",
"mri_Latn",
"rus_Cyrl",
"taq_Latn",
"war_Latn",
"arb_Arab",
"bul_Cyrl",
"fra_Latn",
"jpn_Jpan",
"lij_Latn",
"mya_Mymr",
"sag_Latn",
"taq_Tfng",
"wol_Latn",
"arb_Latn",
"cat_Latn",
"fur_Latn",
"kab_Latn",
"lim_Latn",
"nld_Latn",
"san_Deva",
"tat_Cyrl",
"xho_Latn",
"ars_Arab",
"ceb_Latn",
"fuv_Latn",
"kac_Latn",
"lin_Latn",
"nno_Latn",
"sat_Olck",
"tel_Telu",
"ydd_Hebr",
"ary_Arab",
"ces_Latn",
"gaz_Latn",
"kam_Latn",
"lit_Latn",
"nob_Latn",
"scn_Latn",
"tgk_Cyrl",
"yor_Latn",
"arz_Arab",
"cjk_Latn",
"gla_Latn",
"kan_Knda",
"lmo_Latn",
"npi_Deva",
"shn_Mymr",
"tgl_Latn",
"yue_Hant",
"asm_Beng",
"ckb_Arab",
"gle_Latn",
"kas_Arab",
"ltg_Latn",
"nso_Latn",
"sin_Sinh",
"tha_Thai",
"zho_Hans",
"ast_Latn",
"crh_Latn",
"glg_Latn",
"kas_Deva",
"ltz_Latn",
"nus_Latn",
"slk_Latn",
"tir_Ethi",
"zho_Hant",
"awa_Deva",
"cym_Latn",
"grn_Latn",
"kat_Geor",
"lua_Latn",
"nya_Latn",
"slv_Latn",
"tpi_Latn",
"zsm_Latn",
"ayr_Latn",
"dan_Latn",
"guj_Gujr",
"kaz_Cyrl",
"lug_Latn",
"oci_Latn",
"smo_Latn",
"tsn_Latn",
"zul_Latn",
"azb_Arab",
"deu_Latn",
"hat_Latn",
"kbp_Latn",
"luo_Latn",
"ory_Orya",
"sna_Latn",
"tso_Latn",
"azj_Latn",
"dik_Latn",
"hau_Latn",
"kea_Latn",
"lus_Latn",
"pag_Latn",
"snd_Arab",
"tuk_Latn",
"bak_Cyrl",
"dyu_Latn",
"heb_Hebr",
"khk_Cyrl",
"lvs_Latn",
"pan_Guru",
"som_Latn",
"tum_Latn",
]
LANGUAGE_PAIRS = [
(a, b) for idx, a in enumerate(_LANGUAGES) for b in _LANGUAGES[idx + 1 :]
]
LANGUAGES_OF_INTEREST = [
"cat_Latn",
"spa_Latn",
"eng_Latn",
"glg_Latn",
"eus_Latn",
"ita_Latn",
"deu_Latn",
"por_Latn",
"fra_Latn",
]
MAIN_LANG = "eus_Latn"
LANGUAGE_PAIRS = [
(a, b)
for (a, b) in LANGUAGE_PAIRS
if a in LANGUAGES_OF_INTEREST and b in LANGUAGES_OF_INTEREST and MAIN_LANG in (a, b)
]
# auxiliary functions
code_to_language_name = lambda code: Language.make(
language=Language.get(code)["language"]
).display_name()
code_to_short_name = lambda code: Language.get(code)["language"]
jinja_var = (
lambda s: "{{" + s + "}}"
) # wrapper to avoid having to escape { } in format strings
def doc_to_text(src: str, tgt: str) -> str:
src_name, tgt_name = map(code_to_language_name, [src, tgt])
return f"""\
{src_name} sentence: {jinja_var('sentence_' + src)}
{tgt_name} sentence:"""
def doc_to_target(tgt: str) -> str:
return f"{jinja_var('sentence_' + tgt)}"
# main function
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
"""
Generate a YAML file for each translation direction.
"""
err = []
for src, tgt in LANGUAGE_PAIRS:
# do both translation directions for each lang pair
for src, tgt in [(src, tgt), (tgt, src)]:
lang_pair_name = f"{code_to_short_name(src)}-{code_to_short_name(tgt)}"
yaml_file_name = f"flores_{lang_pair_name}.yaml"
try:
with open(
f"{output_dir}/{yaml_file_name}",
"w" if overwrite else "x",
encoding="utf-8",
) as outfile:
print(f"Creating {yaml_file_name}...")
outfile.write("# File generated by `create-yamls.py`\n")
yaml.dump(
{
# "group": [f"{BENCH_NAME}_bench", f"{BENCH_NAME}_bench_flores"],
# "group": "flores_eu",
"include": "_flores_common_yaml",
"task": f"flores_{lang_pair_name}",
"doc_to_text": doc_to_text(src, tgt),
"doc_to_target": doc_to_target(tgt),
},
outfile,
sort_keys=False,
)
except FileExistsError:
err.append(yaml_file_name)
if len(err) > 0:
raise FileExistsError(
"Files were not created because they already exist:"
f" {', '.join(err)}"
"\nUse flag --overwrite to overwrite them."
)
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
"--overwrite",
default=False,
action="store_true",
help="Overwrite files if they already exist",
)
parser.add_argument(
"--output-dir", default=".", help="Directory to write yaml files to"
)
args = parser.parse_args()
gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)
if __name__ == "__main__":
main()
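With nine languages of interest and `eus_Latn` as the required pivot, the filter above keeps eight unordered pairs, and the generation loop then emits one YAML per direction, sixteen files in total. A small check of that arithmetic, independent of the script:

```python
import itertools

# Same LANGUAGES_OF_INTEREST and MAIN_LANG as in the script above.
langs = ["cat_Latn", "spa_Latn", "eng_Latn", "glg_Latn", "eus_Latn",
         "ita_Latn", "deu_Latn", "por_Latn", "fra_Latn"]
main = "eus_Latn"

pairs = [p for p in itertools.combinations(langs, 2) if main in p]
directions = [(a, b) for a, b in pairs] + [(b, a) for a, b in pairs]

print(len(pairs), len(directions))  # 8 16
```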
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_ca-eu
doc_to_text: 'Catalan sentence: {{sentence_cat_Latn}}
Basque sentence:'
doc_to_target: '{{sentence_eus_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_de-eu
doc_to_text: 'German sentence: {{sentence_deu_Latn}}
Basque sentence:'
doc_to_target: '{{sentence_eus_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_en-eu
doc_to_text: 'English sentence: {{sentence_eng_Latn}}
Basque sentence:'
doc_to_target: '{{sentence_eus_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_es-eu
doc_to_text: 'Spanish sentence: {{sentence_spa_Latn}}
Basque sentence:'
doc_to_target: '{{sentence_eus_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_eu-ca
doc_to_text: 'Basque sentence: {{sentence_eus_Latn}}
Catalan sentence:'
doc_to_target: '{{sentence_cat_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_eu-de
doc_to_text: 'Basque sentence: {{sentence_eus_Latn}}
German sentence:'
doc_to_target: '{{sentence_deu_Latn}}'
# File generated by `create-yamls.py`
include: _flores_common_yaml
task: flores_eu-en
doc_to_text: 'Basque sentence: {{sentence_eus_Latn}}
English sentence:'
doc_to_target: '{{sentence_eng_Latn}}'
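At evaluation time the harness renders these `doc_to_text`/`doc_to_target` Jinja templates against each FLORES record. Mimicking that rendering with plain string substitution (jinja2 handles it in the harness itself; the sample sentence here is made up):

```python
# Naive {{var}} substitution standing in for Jinja rendering.
def render(template: str, doc: dict) -> str:
    for key, value in doc.items():
        template = template.replace("{{" + key + "}}", value)
    return template

doc = {"sentence_eus_Latn": "Kaixo mundua!", "sentence_eng_Latn": "Hello world!"}

doc_to_text = "Basque sentence: {{sentence_eus_Latn}}\nEnglish sentence:"
doc_to_target = "{{sentence_eng_Latn}}"

prompt = render(doc_to_text, doc)
target = render(doc_to_target, doc)
print(prompt)
print(target)  # "Hello world!"
```

The model is prompted with `prompt` and its newline-truncated continuation is scored against `target`.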