Unverified Commit a72babbf authored by Lei Chen's avatar Lei Chen Committed by GitHub
Browse files

PR fixing the issue #1391 (wrong contexts in the mgsm task) (#1440)



* fix the issue #1391, wrong contexts in mgsm tasks

* Fix the YAML issue of having two target_delimiter lines. For CoT tasks, keep the one with a space (the default)

* regenerate all task yaml files
- change naming so that each file name matches its task name
- task and file names follow one consistent scheme, mgsm_(mode)_(lang), for the three modes: direct, en_cot, and native_cot

* English CoTs should have a space as target_delimiter

* Update utils.py

* Apply suggestions from code review

---------
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
parent 00dc9960
# Generated by utils.py
# Task config for the French (fr) native chain-of-thought variant of mgsm.
# NOTE(review): the list nesting under filter_list appears flattened in this
# view; confirm the indentation against the generated file.
dataset_name: fr
# Target: when `answer` is present, drop its first 26 characters (the
# 25-character prompt prefix "Réponse étape par étape :" plus one more
# character, presumably the space that follows it; confirm). Otherwise fall
# back to `answer_number` rendered as a string.
doc_to_target: '{% if answer is not none %}{{answer[26:]}}{% else %}{{answer_number|string}}{% endif %}'
# Prompt: the question, a newline, then the step-by-step answer prefix. When
# `answer` is none the question is additionally prefixed with "Question : ";
# presumably docs that have an answer already carry that prefix (confirm).
doc_to_text: '{% if answer is not none %}{{question+"\nRéponse étape par étape :"}}{% else %}{{"Question : "+question+"\nRéponse étape par étape :"}}{% endif %}'
# Post-processing pipeline named "get-answer": a regex captures the number
# that follows "La réponse est"; take_first presumably keeps the first match.
filter_list:
- filter:
- function: regex
regex_pattern: La réponse est (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
# Remaining settings are pulled in from the cot_yaml template.
include: cot_yaml
task: mgsm_native_cot_fr
# Generated by utils.py
# Task config for the Japanese (ja) native chain-of-thought variant of mgsm.
# NOTE(review): the list nesting under filter_list appears flattened in this
# view; confirm the indentation against the generated file.
dataset_name: ja
# Target: when `answer` is present, drop its first 11 characters (the
# 10-character prompt prefix "ステップごとの答え:" plus one more character;
# confirm what that character is in the data). Otherwise fall back to
# `answer_number` rendered as a string.
doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
# Prompt: the question, a newline, then the step-by-step answer prefix. When
# `answer` is none the question is additionally prefixed with "問題: ";
# presumably docs that have an answer already carry that prefix (confirm).
doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題: "+question+"\nステップごとの答え:"}}{% endif %}'
# Post-processing pipeline named "get-answer": a regex captures the number
# between "答えは" and "です。"; take_first presumably keeps the first match.
filter_list:
- filter:
- function: regex
regex_pattern: 答えは(\-?[0-9\.\,]+)です。
- function: take_first
name: get-answer
# Remaining settings are pulled in from the cot_yaml template.
include: cot_yaml
# Empty string placed between prompt and target; presumably overrides a
# single-space default from the included template (confirm).
target_delimiter: ""
task: mgsm_native_cot_ja
# Generated by utils.py
# Task config for the Russian (ru) native chain-of-thought variant of mgsm.
# NOTE(review): the list nesting under filter_list appears flattened in this
# view; confirm the indentation against the generated file.
dataset_name: ru
# Target: when `answer` is present, drop its first 18 characters (the
# 17-character prompt prefix "Пошаговоерешение:" plus one more character,
# presumably the space that follows it; confirm). Otherwise fall back to
# `answer_number` rendered as a string.
# NOTE(review): the prefix has no space between its two words; the slice
# offset depends on that exact length, so verify against the dataset before
# changing either one.
doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
# Prompt: the question, a newline, then the step-by-step answer prefix. When
# `answer` is none the question is additionally prefixed with "Задача: ";
# presumably docs that have an answer already carry that prefix (confirm).
doc_to_text: '{% if answer is not none %}{{question+"\nПошаговоерешение:"}}{% else %}{{"Задача: "+question+"\nПошаговоерешение:"}}{% endif %}'
# Post-processing pipeline named "get-answer": a regex captures the number
# that follows "Ответ — "; take_first presumably keeps the first match.
filter_list:
- filter:
- function: regex
regex_pattern: Ответ — (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
# Remaining settings are pulled in from the cot_yaml template.
include: cot_yaml
task: mgsm_native_cot_ru
# Generated by utils.py
# Task config for the Swahili (sw) native chain-of-thought variant of mgsm.
# NOTE(review): the list nesting under filter_list appears flattened in this
# view; confirm the indentation against the generated file.
dataset_name: sw
# Target: when `answer` is present, drop its first 25 characters (the
# 24-character prompt prefix "Jibu la Hatua kwa Hatua:" plus one more
# character, presumably the space that follows it; confirm). Otherwise fall
# back to `answer_number` rendered as a string.
doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}'
# Prompt: the question, a newline, then the step-by-step answer prefix. When
# `answer` is none the question is additionally prefixed with "Swali: ";
# presumably docs that have an answer already carry that prefix (confirm).
doc_to_text: '{% if answer is not none %}{{question+"\nJibu la Hatua kwa Hatua:"}}{% else %}{{"Swali: "+question+"\nJibu la Hatua kwa Hatua:"}}{% endif %}'
# Post-processing pipeline named "get-answer": a regex captures the number
# that follows "Jibu ni "; take_first presumably keeps the first match.
filter_list:
- filter:
- function: regex
regex_pattern: Jibu ni (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
# Remaining settings are pulled in from the cot_yaml template.
include: cot_yaml
task: mgsm_native_cot_sw
# Generated by utils.py
# Task config for the Telugu (te) native chain-of-thought variant of mgsm.
# NOTE(review): the list nesting under filter_list appears flattened in this
# view; confirm the indentation against the generated file.
dataset_name: te
# Target: when `answer` is present, drop its first 19 characters (the prompt
# prefix "దశలవారీగా సమాధానం:", 18 code points, plus one more character,
# presumably the space that follows it; confirm). Otherwise fall back to
# `answer_number` rendered as a string.
doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}'
# Prompt: the question, a newline, then the step-by-step answer prefix. When
# `answer` is none the question is additionally prefixed with "ప్రశ్న: ";
# presumably docs that have an answer already carry that prefix (confirm).
doc_to_text: '{% if answer is not none %}{{question+"\nదశలవారీగా సమాధానం:"}}{% else %}{{"ప్రశ్న: "+question+"\nదశలవారీగా సమాధానం:"}}{% endif %}'
# Post-processing pipeline named "get-answer": a regex captures the number
# that follows "సమాధానం "; take_first presumably keeps the first match.
filter_list:
- filter:
- function: regex
regex_pattern: సమాధానం (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
# Remaining settings are pulled in from the cot_yaml template.
include: cot_yaml
task: mgsm_native_cot_te
# Generated by utils.py
# Task config for the Thai (th) native chain-of-thought variant of mgsm.
# NOTE(review): the list nesting under filter_list appears flattened in this
# view; confirm the indentation against the generated file.
dataset_name: th
# Target: when `answer` is present, drop its first 18 characters (the prompt
# prefix "คำตอบทีละขั้นตอน:", 17 code points, plus one more character,
# presumably the space that follows it; confirm). Otherwise fall back to
# `answer_number` rendered as a string.
doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
# Prompt: the question, a newline, then the step-by-step answer prefix. When
# `answer` is none the question is additionally prefixed with "โจทย์: ";
# presumably docs that have an answer already carry that prefix (confirm).
doc_to_text: '{% if answer is not none %}{{question+"\nคำตอบทีละขั้นตอน:"}}{% else %}{{"โจทย์: "+question+"\nคำตอบทีละขั้นตอน:"}}{% endif %}'
# Post-processing pipeline named "get-answer": a regex captures the number
# that follows "คำตอบคือ "; take_first presumably keeps the first match.
filter_list:
- filter:
- function: regex
regex_pattern: คำตอบคือ (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
# Remaining settings are pulled in from the cot_yaml template.
include: cot_yaml
task: mgsm_native_cot_th
# Generated by utils.py
# Task config for the Chinese (zh) native chain-of-thought variant of mgsm.
# NOTE(review): the list nesting under filter_list appears flattened in this
# view; confirm the indentation against the generated file.
dataset_name: zh
# Target: when `answer` is present, drop its first 6 characters (the
# 5-character prompt prefix "逐步解答:" plus one more character; confirm
# what that character is in the data). Otherwise fall back to
# `answer_number` rendered as a string.
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
# Prompt: the question, a newline, then the step-by-step answer prefix. When
# `answer` is none the question is additionally prefixed with "问题: ";
# presumably docs that have an answer already carry that prefix (confirm).
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{% endif %}'
# Post-processing pipeline named "get-answer": a regex captures the number
# between "答案是 " and "。"; take_first presumably keeps the first match.
filter_list:
- filter:
- function: regex
regex_pattern: 答案是 (\-?[0-9\.\,]+)。
- function: take_first
name: get-answer
# Remaining settings are pulled in from the cot_yaml template.
include: cot_yaml
# Empty string placed between prompt and target; presumably overrides a
# single-space default from the included template (confirm).
target_delimiter: ""
task: mgsm_native_cot_zh
...@@ -128,23 +128,25 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: ...@@ -128,23 +128,25 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
yaml_template = "cot_yaml" yaml_template = "cot_yaml"
filter_list = {} filter_list = {}
DELIMITER = None
if mode == "direct": if mode == "direct":
ANSWER = LANGUAGES[lang]["DIRECT"] ANSWER = LANGUAGES[lang]["DIRECT"]
REGEX = None REGEX = None
task_name = f"mgsm_{lang}_direct" task_name = f"mgsm_direct_{lang}"
yaml_template = "direct_yaml" yaml_template = "direct_yaml"
elif mode == "native-cot": elif mode == "native-cot":
ANSWER = LANGUAGES[lang]["ANSWER"] ANSWER = LANGUAGES[lang]["ANSWER"]
REGEX = LANGUAGES[lang]["REGEX"] REGEX = LANGUAGES[lang]["REGEX"]
task_name = f"mgsm_{lang}_native-cot" task_name = f"mgsm_native_cot_{lang}"
filter_list = add_regex_pattern(REGEX) filter_list = add_regex_pattern(REGEX)
DELIMITER = "" if lang in ["zh", "ja"]
elif mode == "en-cot": elif mode == "en-cot":
ANSWER = LANGUAGES["en"]["ANSWER"] ANSWER = LANGUAGES["en"]["ANSWER"]
REGEX = LANGUAGES["en"]["REGEX"] REGEX = LANGUAGES["en"]["REGEX"]
task_name = f"mgsm_{lang}_en-cot" task_name = f"mgsm_en_cot_{lang}"
file_name = f"{task_name}.yaml" file_name = f"{task_name}.yaml"
ANSWER_TO_SKIP = len(LANGUAGES[lang]["ANSWER"])+1
with open( with open(
f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8" f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
) as f: ) as f:
...@@ -153,18 +155,19 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: ...@@ -153,18 +155,19 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
{ {
"include": yaml_template, "include": yaml_template,
"dataset_name": lang, "dataset_name": lang,
"task": f"mgsm_{lang}_direct", "task": f"{task_name}",
"doc_to_text": f"""{{% if answer is not none %}}""" "doc_to_text": f"""{{% if answer is not none %}}"""
f"""{{{{question+"\\n{ANSWER}"}}}}""" f"""{{{{question+"\\n{ANSWER}"}}}}"""
f"""{{% else %}}""" f"""{{% else %}}"""
f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}""" f"""{{{{"{QUESTION} "+question+"\\n{ANSWER}"}}}}"""
f"""{{% endif %}}""", f"""{{% endif %}}""",
"doc_to_target": f"""{{% if answer is not none %}}""" "doc_to_target": f"""{{% if answer is not none %}}"""
f"""{{{{answer[{len(ANSWER)}+1]}}}}""" f"""{{{{answer[{ANSWER_TO_SKIP}:]}}}}"""
f"""{{% else %}}""" f"""{{% else %}}"""
f"""{{{{answer_number|string}}}}""" f"""{{{{answer_number|string}}}}"""
f"""{{% endif %}}""", f"""{{% endif %}}""",
**filter_list, **filter_list,
**({"target_delimiter": DELIMITER} if DELIMITER else {}),
}, },
f, f,
allow_unicode=True, allow_unicode=True,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment