Commit 2106fbeb authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
include: metabench_truthfulqa.yaml
task: metabench_truthfulqa_secondary
test_split: secondary
metadata:
  version: 0.0
include: metabench_truthfulqa_permute.yaml
task: metabench_truthfulqa_secondary_permute
test_split: secondary
metadata:
  version: 0.0
task: metabench_winogrande
tag:
  - metabench_winogrande_subset
dataset_path: HCAI/metabench
dataset_name: Winogrande
process_docs: !function process_docs.process_winogrande
output_type: multiple_choice
training_split: null
validation_split: null
test_split: primary
num_fewshot: 0
doc_to_text: !function process_docs.winogrande_doc_to_text
doc_to_target: !function process_docs.winogrande_doc_to_target
doc_to_choice: !function process_docs.winogrande_doc_to_choice
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
include: metabench_winogrande.yaml
task: metabench_winogrande_permute
process_docs: !function process_docs_permute.process_winogrande
metadata:
  version: 0.0
include: metabench_winogrande.yaml
task: metabench_winogrande_secondary
test_split: secondary
metadata:
  version: 0.0
include: metabench_winogrande_permute.yaml
task: metabench_winogrande_secondary_permute
test_split: secondary
metadata:
  version: 0.0
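The variant files above rely on lm-eval's `include:` mechanism: each `*_secondary` / `*_permute` YAML only overrides a few keys (task name, `test_split`, `process_docs`) on top of the base task. As a rough illustration of how that composition behaves (a simplified sketch, not the harness's actual config loader), the effective config is essentially a recursive dict merge:

```python
# Simplified sketch of the include/override pattern used by the YAMLs above.
# This is NOT lm-eval's actual loader; it only illustrates that keys in the
# including file take precedence over the included base.
import yaml

# The metabench YAMLs use a custom `!function` tag; for this sketch we simply keep
# the tag's string value instead of resolving it to a Python callable.
yaml.SafeLoader.add_constructor("!function", lambda loader, node: node.value)


def load_task_config(path: str) -> dict:
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    base = cfg.pop("include", None)
    if base is not None:
        merged = load_task_config(base)  # assumes the base file sits in the same directory
        merged.update(cfg)  # child keys (task, test_split, ...) override the base
        return merged
    return cfg


# e.g. load_task_config("metabench_winogrande_secondary.yaml") would yield the
# metabench_winogrande config with `task` and `test_split` replaced.
```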
import hashlib
import re

import datasets


def hash_string(string: str) -> str:
    return hashlib.sha256(string.encode("utf-8")).hexdigest()
def process_arc(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 26):
            question = doc[f"arc_question_shot_{shot}"]
            doc.pop(f"arc_question_shot_{shot}")
            answer_lab = doc[f"arc_answerKey_shot_{shot}"]
            doc.pop(f"arc_answerKey_shot_{shot}")
            answer_idx = doc[f"arc_choices_shot_{shot}"]["label"].index(answer_lab)
            answer = doc[f"arc_choices_shot_{shot}"]["text"][answer_idx]
            doc.pop(f"arc_choices_shot_{shot}")
            doc.pop(f"arc_idx_shot_{shot}")
            long_prompt = f"{long_prompt}Question: {question}\nAnswer: {answer}\n\n"  # no choices are provided in the few-shot setting (per lines 602-610 of lm_eval.api.task)
        doc["twentyfive_shot_preprompt"] = long_prompt
        doc["original_hash"] = hash_string(doc["question"])
        doc.pop("alltwentyfiveshot_longprompt")
        return doc

    return dataset.map(_subprocess)
def process_gsm8k(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 6):
            question = doc[f"gsm8k_prompt_shot_{shot}"]
            doc.pop(f"gsm8k_prompt_shot_{shot}")
            answer = doc[f"gsm8k_answer_shot_{shot}"]
            doc.pop(f"gsm8k_answer_shot_{shot}")
            doc.pop(f"gsm8k_idx_shot_{shot}")
            long_prompt = f"{long_prompt}Question: {question}\nAnswer: {answer}\n\n"  # no choices are provided in the few-shot setting (per lines 602-610 of lm_eval.api.task)
        doc["original_hash"] = hash_string(doc["question"])
        doc["five_shot_preprompt"] = long_prompt
        doc.pop("allfiveshot_longprompt")
        return doc

    return dataset.map(_subprocess)
def process_hellaswag(dataset: datasets.Dataset) -> datasets.Dataset:
    def process_txt(text):  # mirrored from hellaswag task
        text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text

    def _preprocess(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        doc.pop("ctx_a")
        doc.pop("ctx_b")
        doc.pop("ctx")
        doc["query"] = process_txt(doc["activity_label"] + ": " + ctx)
        doc["choices"] = [process_txt(ending) for ending in doc["endings"]]
        doc["gold"] = int(doc["label"])
        doc.pop("activity_label")
        doc.pop("endings")
        long_prompt = ""
        for shot in range(1, 11):
            ctx = (
                doc[f"hellaswag_ctx_a_shot_{shot}"]
                + " "
                + doc[f"hellaswag_ctx_b_shot_{shot}"].capitalize()
            )
            doc.pop(f"hellaswag_ctx_a_shot_{shot}")
            doc.pop(f"hellaswag_ctx_b_shot_{shot}")
            doc.pop(f"hellaswag_ctx_shot_{shot}")
            question = process_txt(
                doc[f"hellaswag_activity_labels_shot_{shot}"] + ": " + ctx
            )
            ending = process_txt(
                doc[f"hellaswag_endings_shot_{shot}"][
                    int(doc[f"hellaswag_label_shot_{shot}"])
                ]
            )
            doc.pop(f"hellaswag_activity_labels_shot_{shot}")
            doc.pop(f"hellaswag_endings_shot_{shot}")
            doc.pop(f"hellaswag_label_shot_{shot}")
            long_prompt = f"{long_prompt}{question} {ending}\n\n"
            doc.pop(f"hellaswag_ind_shot_{shot}")
            doc.pop(f"hellaswag_source_id_shot_{shot}")
            doc.pop(f"hellaswag_split_shot_{shot}")
            doc.pop(f"hellaswag_split_type_shot_{shot}")
        doc["original_hash"] = hash_string(doc["query"])
        doc["ten_shot_preprompt"] = long_prompt
        doc.pop("alltenshot_longprompt")
        return doc

    return dataset.map(_preprocess)
def process_mmlu(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        choices = ["A", "B", "C", "D"]
        long_prompt = f"The following are multiple choice questions (with answers) about {' '.join(doc['subject'].split('_'))}.\n\n"
        for shot in range(1, 6):
            question = doc[f"mmlu_question_shot_{shot}"].strip()
            doc.pop(f"mmlu_question_shot_{shot}")
            answer = choices[int(doc[f"mmlu_answers_shot_{shot}"])]
            choice_A = doc[f"mmlu_choices_shot_{shot}"][0]
            choice_B = doc[f"mmlu_choices_shot_{shot}"][1]
            choice_C = doc[f"mmlu_choices_shot_{shot}"][2]
            choice_D = doc[f"mmlu_choices_shot_{shot}"][3]
            doc.pop(f"mmlu_choices_shot_{shot}")
            doc.pop(f"mmlu_answers_shot_{shot}")
            doc.pop(f"mmlu_ind_shot_{shot}")
            long_prompt = f"{long_prompt}{question}\nA. {choice_A}\nB. {choice_B}\nC. {choice_C}\nD. {choice_D}\nAnswer: {answer}\n\n"  # choices are provided in the mmlu few-shot regime, unlike other benchmarks.
        doc["original_hash"] = hash_string(doc["question"])
        doc["five_shot_preprompt"] = long_prompt
        doc.pop("allfiveshot_longprompt")
        return doc

    return dataset.map(_subprocess)
def process_truthfulqa(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        doc["original_hash"] = hash_string(doc["question"])
        return doc

    return dataset.map(_subprocess)
def process_winogrande(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 6):
            if doc[f"winogrande_answer_shot_{shot}"] == "1":
                answer = doc[f"winogrande_option1_shot_{shot}"]
            elif doc[f"winogrande_answer_shot_{shot}"] == "2":
                answer = doc[f"winogrande_option2_shot_{shot}"]
            else:
                raise ValueError("Answer not recognised.")
            question = doc[f"winogrande_prompt_shot_{shot}"].replace("_", answer)
            doc.pop(f"winogrande_prompt_shot_{shot}")
            doc.pop(f"winogrande_answer_shot_{shot}")
            doc.pop(f"winogrande_idx_shot_{shot}")
            doc.pop(f"winogrande_option1_shot_{shot}")
            doc.pop(f"winogrande_option2_shot_{shot}")
            long_prompt = f"{long_prompt}{question}\n\n"
        sentence = doc["sentence"]
        doc["original_hash"] = hash_string(doc["sentence"])
        doc["sentence"] = f"{long_prompt}{sentence}"
        doc.pop("allfiveshot_longprompt")
        return doc

    return dataset.map(_subprocess)
def winogrande_doc_to_text(doc):  # Mirrored from the winogrande task
    answer_to_num = {"1": 0, "2": 1}
    return answer_to_num[doc["answer"]]


def winogrande_doc_to_target(doc):  # Mirrored from the winogrande task
    idx = doc["sentence"].index("_") + 1
    return doc["sentence"][idx:].strip()


def winogrande_doc_to_choice(doc):  # Mirrored from the winogrande task
    idx = doc["sentence"].index("_")
    options = [doc["option1"], doc["option2"]]
    return [doc["sentence"][:idx] + opt for opt in options]
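A small worked example of how the three `winogrande_doc_to_*` helpers above fit together (the doc below is a hypothetical, hand-written record in the shape the processed dataset uses; it is not taken from the benchmark):

```python
doc = {
    "sentence": "The trophy didn't fit in the suitcase because the _ was too big.",
    "option1": "trophy",
    "option2": "suitcase",
    "answer": "1",
}

# Two candidate contexts: the sentence up to the blank, completed with each option.
print(winogrande_doc_to_choice(doc))
# -> ["The trophy didn't fit in the suitcase because the trophy",
#     "The trophy didn't fit in the suitcase because the suitcase"]

# The shared continuation after the blank.
print(winogrande_doc_to_target(doc))  # -> "was too big."

# The answer label mapped to a 0-based index.
print(winogrande_doc_to_text(doc))  # -> 0
```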
import hashlib
import random
import re

import datasets


def hash_string(string: str) -> str:
    return hashlib.sha256(string.encode("utf-8")).hexdigest()
def process_arc(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 26):
            question = doc[f"arc_question_shot_{shot}"]
            doc.pop(f"arc_question_shot_{shot}")
            answer_lab = doc[f"arc_answerKey_shot_{shot}"]
            doc.pop(f"arc_answerKey_shot_{shot}")
            answer_idx = doc[f"arc_choices_shot_{shot}"]["label"].index(answer_lab)
            answer = doc[f"arc_choices_shot_{shot}"]["text"][answer_idx]
            doc.pop(f"arc_choices_shot_{shot}")
            doc.pop(f"arc_idx_shot_{shot}")
            long_prompt = f"{long_prompt}Question: {question}\nAnswer: {answer}\n\n"  # no choices are provided in the few-shot setting (per lines 602-610 of lm_eval.api.task)
        doc["twentyfive_shot_preprompt"] = long_prompt
        doc.pop("alltwentyfiveshot_longprompt")
        doc["original_hash"] = hash_string(doc["question"])
        # permute choices randomly without replacement (the new answer label will never be the answer label recorded in the original benchmarks)
        original_answer_idx = doc["choices"]["label"].index(doc["answerKey"])
        correct_answer_text = doc["choices"]["text"][original_answer_idx]
        new_answer_idx = original_answer_idx
        while new_answer_idx == original_answer_idx:
            random.shuffle(doc["choices"]["text"])
            new_answer_idx = doc["choices"]["text"].index(correct_answer_text)
        doc["answerKey"] = doc["choices"]["label"][new_answer_idx]
        return doc

    return dataset.map(_subprocess)
def process_hellaswag(dataset: datasets.Dataset) -> datasets.Dataset:
    def process_txt(text):  # mirrored from hellaswag task
        text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text

    def _preprocess(doc):
        ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
        doc.pop("ctx_a")
        doc.pop("ctx_b")
        doc.pop("ctx")
        doc["query"] = process_txt(doc["activity_label"] + ": " + ctx)
        # permute choices randomly without replacement (the new answer label will never be the answer label recorded in the original benchmarks)
        original_answer_idx = int(doc["label"])
        correct_answer_text = doc["endings"][original_answer_idx]
        new_answer_idx = original_answer_idx
        while new_answer_idx == original_answer_idx:
            random.shuffle(doc["endings"])
            new_answer_idx = doc["endings"].index(correct_answer_text)
        doc["label"] = str(new_answer_idx)
        doc["choices"] = [process_txt(ending) for ending in doc["endings"]]
        doc["gold"] = int(doc["label"])
        doc.pop("activity_label")
        doc.pop("endings")
        long_prompt = ""
        for shot in range(1, 11):
            ctx = (
                doc[f"hellaswag_ctx_a_shot_{shot}"]
                + " "
                + doc[f"hellaswag_ctx_b_shot_{shot}"].capitalize()
            )
            doc.pop(f"hellaswag_ctx_a_shot_{shot}")
            doc.pop(f"hellaswag_ctx_b_shot_{shot}")
            doc.pop(f"hellaswag_ctx_shot_{shot}")
            question = process_txt(
                doc[f"hellaswag_activity_labels_shot_{shot}"] + ": " + ctx
            )
            ending = process_txt(
                doc[f"hellaswag_endings_shot_{shot}"][
                    int(doc[f"hellaswag_label_shot_{shot}"])
                ]
            )
            doc.pop(f"hellaswag_activity_labels_shot_{shot}")
            doc.pop(f"hellaswag_endings_shot_{shot}")
            doc.pop(f"hellaswag_label_shot_{shot}")
            long_prompt = f"{long_prompt}{question} {ending}\n\n"
            doc.pop(f"hellaswag_ind_shot_{shot}")
            doc.pop(f"hellaswag_source_id_shot_{shot}")
            doc.pop(f"hellaswag_split_shot_{shot}")
            doc.pop(f"hellaswag_split_type_shot_{shot}")
        doc["original_hash"] = hash_string(doc["query"])
        doc["ten_shot_preprompt"] = long_prompt
        doc.pop("alltenshot_longprompt")
        return doc

    return dataset.map(_preprocess)
def process_mmlu(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        choices = ["A", "B", "C", "D"]
        long_prompt = f"The following are multiple choice questions (with answers) about {' '.join(doc['subject'].split('_'))}.\n\n"
        for shot in range(1, 6):
            question = doc[f"mmlu_question_shot_{shot}"].strip()
            doc.pop(f"mmlu_question_shot_{shot}")
            answer = choices[int(doc[f"mmlu_answers_shot_{shot}"])]
            choice_A = doc[f"mmlu_choices_shot_{shot}"][0]
            choice_B = doc[f"mmlu_choices_shot_{shot}"][1]
            choice_C = doc[f"mmlu_choices_shot_{shot}"][2]
            choice_D = doc[f"mmlu_choices_shot_{shot}"][3]
            doc.pop(f"mmlu_choices_shot_{shot}")
            doc.pop(f"mmlu_answers_shot_{shot}")
            doc.pop(f"mmlu_ind_shot_{shot}")
            long_prompt = f"{long_prompt}{question}\nA. {choice_A}\nB. {choice_B}\nC. {choice_C}\nD. {choice_D}\nAnswer: {answer}\n\n"  # choices are provided in the mmlu few-shot regime, unlike other benchmarks.
        doc["original_hash"] = hash_string(doc["question"])
        doc["five_shot_preprompt"] = long_prompt
        doc.pop("allfiveshot_longprompt")
        # permute choices randomly without replacement (the new answer label will never be the answer label recorded in the original benchmarks)
        original_answer_idx = int(doc["answer"])
        correct_answer_text = doc["choices"][original_answer_idx]
        new_answer_idx = original_answer_idx
        while new_answer_idx == original_answer_idx:
            random.shuffle(doc["choices"])
            new_answer_idx = doc["choices"].index(correct_answer_text)
        doc["answer"] = new_answer_idx
        return doc

    return dataset.map(_subprocess)
def process_truthfulqa(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(
        doc,
    ):  # currently only permuting the mc1 targets as metabench does not use mc2 targets.
        original_answer_idx = 0  # always 0 in truthfulqa
        correct_answer_text = doc["mc1_targets"]["choices"][original_answer_idx]
        new_answer_idx = original_answer_idx
        while new_answer_idx == original_answer_idx:
            random.shuffle(doc["mc1_targets"]["choices"])
            new_answer_idx = doc["mc1_targets"]["choices"].index(correct_answer_text)
        labels = [0] * len(doc["mc1_targets"]["labels"])
        labels[new_answer_idx] = 1
        doc["original_hash"] = hash_string(doc["question"])
        doc["mc1_targets"]["labels"] = labels
        doc["answer"] = new_answer_idx
        return doc

    return dataset.map(_subprocess)
def process_winogrande(dataset: datasets.Dataset) -> datasets.Dataset:
    def _subprocess(doc):
        long_prompt = ""
        for shot in range(1, 6):
            if doc[f"winogrande_answer_shot_{shot}"] == "1":
                answer = doc[f"winogrande_option1_shot_{shot}"]
            elif doc[f"winogrande_answer_shot_{shot}"] == "2":
                answer = doc[f"winogrande_option2_shot_{shot}"]
            else:
                raise ValueError("Answer not recognised.")
            question = doc[f"winogrande_prompt_shot_{shot}"].replace("_", answer)
            doc.pop(f"winogrande_prompt_shot_{shot}")
            doc.pop(f"winogrande_answer_shot_{shot}")
            doc.pop(f"winogrande_idx_shot_{shot}")
            doc.pop(f"winogrande_option1_shot_{shot}")
            doc.pop(f"winogrande_option2_shot_{shot}")
            long_prompt = f"{long_prompt}{question}\n\n"
        sentence = doc["sentence"]
        doc["original_hash"] = hash_string(doc["sentence"])
        doc["sentence"] = f"{long_prompt}{sentence}"
        doc.pop("allfiveshot_longprompt")
        # permute choices by swapping them
        option1 = doc["option1"]
        option2 = doc["option2"]
        answer = doc["answer"]
        doc["option1"] = option2
        doc["option2"] = option1
        if answer == "1":
            doc["answer"] = "2"
        elif answer == "2":
            doc["answer"] = "1"
        return doc

    return dataset.map(_subprocess)
def winogrande_doc_to_text(doc):  # Mirrored from the winogrande task
    answer_to_num = {"1": 0, "2": 1}
    return answer_to_num[doc["answer"]]


def winogrande_doc_to_target(doc):  # Mirrored from the winogrande task
    idx = doc["sentence"].index("_") + 1
    return doc["sentence"][idx:].strip()


def winogrande_doc_to_choice(doc):  # Mirrored from the winogrande task
    idx = doc["sentence"].index("_")
    options = [doc["option1"], doc["option2"]]
    return [doc["sentence"][:idx] + opt for opt in options]
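All of the permute functions above share the same shuffle-until-moved loop. A minimal standalone sketch of that invariant, with made-up toy choices (seeded only so the run is reproducible):

```python
import random

random.seed(0)  # only for a reproducible demo

choices = ["Paris", "London", "Berlin", "Madrid"]  # hypothetical toy options
original_answer_idx = 0
correct_answer_text = choices[original_answer_idx]

# Reshuffle until the correct answer lands on a different index, so the
# recorded answer label is guaranteed to change.
new_answer_idx = original_answer_idx
while new_answer_idx == original_answer_idx:
    random.shuffle(choices)
    new_answer_idx = choices.index(correct_answer_text)

assert new_answer_idx != original_answer_idx
print(choices, new_answer_idx)
```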
@@ -92,3 +92,7 @@ If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# changelog
- (en_cot, direct) ver 3; (native_cot) ver 4: issue #2578; PR #2587
- fix fewshot format: changed the inconsistent mix of the ASCII colon ':' and the full-width colon '：' in the question prefix so that a single colon form is used consistently.
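To see why the mixed colon forms mattered (an illustrative sketch, not the harness's actual stopping logic): a stop string written with one colon variant never matches text generated with the other, so generation is not cut off where the few-shot format expects.

```python
ascii_q = "問題:"       # U+003A, ASCII colon
fullwidth_q = "問題："  # U+FF1A, full-width colon

# Hypothetical model continuation that starts the next question with the full-width form.
generated = "答えは 42 です。\n\n問題：次の問題..."

print(ascii_q in generated)      # False -> an ASCII-colon stop string would not fire here
print(fullwidth_q in generated)  # True
```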
@@ -32,4 +32,4 @@ metric_list:
ignore_case: true
ignore_punctuation: true
metadata:
-version: 2.0
+version: 3.0
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
-doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"問題: "+question+"\nAnswer:"}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"問題 "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
-- '問題:'
+- 問題
- </s>
- <|im_end|>
include: direct_yaml
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
-doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"问题: "+question+"\nAnswer:"}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"问题 "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
-- '问题:'
+- 问题
- </s>
- <|im_end|>
include: direct_yaml
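For reference, the conditional `doc_to_text` templates above render differently depending on whether the doc carries a reference `answer`. A minimal sketch of the two branches, rendered with `jinja2` directly rather than through the harness's own template handling (the questions are invented, and the "\nAnswer:" suffix is left out to keep the demo on one line):

```python
from jinja2 import Template

# Simplified version of the templates above.
template = Template(
    '{% if answer is not none %}{{ question }}'
    '{% else %}{{ "問題 " + question }}{% endif %}'
)

print(template.render(question="1+1は?", answer="2"))   # -> "1+1は?"
print(template.render(question="1+1は?", answer=None))  # -> "問題 1+1は?"
```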
@@ -33,4 +33,4 @@ filter_list:
- function: take_first
name: flexible-extract
metadata:
-version: 2.0
+version: 3.0
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
-doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"問題: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"問題 "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
-- '問題:'
+- 問題
- </s>
- <|im_end|>
include: cot_yaml
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
-doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题 "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
-- '问题:'
+- 问题
- </s>
- <|im_end|>
include: cot_yaml
@@ -28,4 +28,4 @@ filter_list:
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first"
metadata:
-version: 3.0
+version: 4.0
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
-doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題: "+question+"\nステップごとの答え:"}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題 "+question+"\nステップごとの答え:"}}{% endif %}'
filter_list:
- filter:
- function: regex
@@ -17,7 +17,7 @@ filter_list:
generation_kwargs:
do_sample: false
until:
-- '問題:'
+- 問題
- </s>
- <|im_end|>
include: cot_yaml
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
-doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题 "+question+"\n逐步解答:"}}{% endif %}'
filter_list:
- filter:
- function: regex
@@ -17,7 +17,7 @@ filter_list:
generation_kwargs:
do_sample: false
until:
-- '问题:'
+- 问题
- </s>
- <|im_end|>
include: cot_yaml
@@ -75,7 +75,7 @@ LANGUAGES = {
},
"ja": { # Japanese
# "QUESTION": "問題:",
"QUESTION": "\u554f\u984c:",
"QUESTION": "\u554f\u984c",
# "ANSWER": "ステップごとの答え:",
"ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
"DIRECT": "Answer:",
@@ -84,7 +84,7 @@ LANGUAGES = {
},
"zh": { # Chinese
# "QUESTION": "问题:",
"QUESTION": "\u95ee\u9898:",
"QUESTION": "\u95ee\u9898",
# "ANSWER": "逐步解答:",
"ANSWER": "\u9010\u6b65\u89e3\u7b54:",
"DIRECT": "Answer:",
# MLQA
### Paper
Title: `MLQA: Evaluating Cross-lingual Extractive Question Answering`
Abstract: `https://arxiv.org/abs/1910.07475`
MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating cross-lingual question answering performance.
MLQA consists of over 5K extractive QA instances (12K in English) in SQuAD format in seven languages - English, Arabic,
German, Spanish, Hindi, Vietnamese and Simplified Chinese. MLQA is highly parallel, with QA instances parallel between
4 different languages on average.
Homepage: `https://github.com/facebookresearch/MLQA`
### Citation
```
@misc{lewis2020mlqaevaluatingcrosslingualextractive,
title={MLQA: Evaluating Cross-lingual Extractive Question Answering},
author={Patrick Lewis and Barlas Oğuz and Ruty Rinott and Sebastian Riedel and Holger Schwenk},
year={2020},
eprint={1910.07475},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1910.07475},
}
```
### Groups, Tags, and Tasks
#### Groups
* Not part of a group yet
#### Tasks
Tasks are of the form `mlqa_<context-lang>_<question-lang>.yaml` (a usage sketch follows the list below):
* `mlqa_ar_ar.yaml`
* `mlqa_ar_de.yaml`
* `mlqa_ar_vi.yaml`
* `mlqa_ar_zh.yaml`
* `mlqa_ar_en.yaml`
* `mlqa_ar_es.yaml`
* `mlqa_ar_hi.yaml`
* `mlqa_de_ar.yaml`
* `mlqa_de_de.yaml`
* `mlqa_de_vi.yaml`
* `mlqa_de_zh.yaml`
* `mlqa_de_en.yaml`
* `mlqa_de_es.yaml`
* `mlqa_de_hi.yaml`
* `mlqa_vi_ar.yaml`
* `mlqa_vi_de.yaml`
* `mlqa_vi_vi.yaml`
* `mlqa_vi_zh.yaml`
* `mlqa_vi_en.yaml`
* `mlqa_vi_es.yaml`
* `mlqa_vi_hi.yaml`
* `mlqa_zh_ar.yaml`
* `mlqa_zh_de.yaml`
* `mlqa_zh_vi.yaml`
* `mlqa_zh_zh.yaml`
* `mlqa_zh_en.yaml`
* `mlqa_zh_es.yaml`
* `mlqa_zh_hi.yaml`
* `mlqa_en_ar.yaml`
* `mlqa_en_de.yaml`
* `mlqa_en_vi.yaml`
* `mlqa_en_zh.yaml`
* `mlqa_en_en.yaml`
* `mlqa_en_es.yaml`
* `mlqa_en_hi.yaml`
* `mlqa_es_ar.yaml`
* `mlqa_es_de.yaml`
* `mlqa_es_vi.yaml`
* `mlqa_es_zh.yaml`
* `mlqa_es_en.yaml`
* `mlqa_es_es.yaml`
* `mlqa_es_hi.yaml`
* `mlqa_hi_ar.yaml`
* `mlqa_hi_de.yaml`
* `mlqa_hi_vi.yaml`
* `mlqa_hi_zh.yaml`
* `mlqa_hi_en.yaml`
* `mlqa_hi_es.yaml`
* `mlqa_hi_hi.yaml`
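A hypothetical usage sketch for one context/question pair, via lm-eval's Python entry point (the checkpoint below is only a placeholder):

```python
import lm_eval

# English contexts with German questions; any other pair from the list above works the same way.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model
    tasks=["mlqa_en_de"],
)
print(results["results"]["mlqa_en_de"])
```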
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?