Unverified commit 8371662c, authored by thnkinbtfly and committed by GitHub
Browse files

update parsing logic of mgsm following gsm8k (#1462)

parent 75ac1f47
......@@ -2,5 +2,11 @@
dataset_name: sw
doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Swali: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- 'Swali:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_en_cot_sw
......@@ -2,5 +2,11 @@
dataset_name: te
doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"ప్రశ్న: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- 'ప్రశ్న:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_en_cot_te
......@@ -2,5 +2,11 @@
dataset_name: th
doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"โจทย์: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- 'โจทย์:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_en_cot_th
......@@ -2,5 +2,11 @@
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- '问题:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_en_cot_zh
......@@ -28,4 +28,4 @@ filter_list:
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first"
metadata:
version: 2.0
version: 3.0
......@@ -7,6 +7,18 @@ filter_list:
- function: regex
regex_pattern: The answer is (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'প্রশ্ন:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_native_cot_bn
......@@ -7,6 +7,18 @@ filter_list:
- function: regex
regex_pattern: Die Antwort lautet (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Frage:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_native_cot_de
......@@ -7,6 +7,18 @@ filter_list:
- function: regex
regex_pattern: The answer is (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Question:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_native_cot_en
......@@ -7,6 +7,18 @@ filter_list:
- function: regex
regex_pattern: La respuesta es (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Pregunta:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_native_cot_es
......@@ -7,6 +7,18 @@ filter_list:
- function: regex
regex_pattern: La réponse est (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Question :'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_native_cot_fr
......@@ -7,7 +7,18 @@ filter_list:
- function: regex
regex_pattern: 答えは(\-?[0-9\.\,]+)です。
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- '問題:'
- </s>
- <|im_end|>
include: cot_yaml
target_delimiter: ""
task: mgsm_native_cot_ja
......@@ -7,6 +7,18 @@ filter_list:
- function: regex
regex_pattern: Ответ — (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Задача:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_native_cot_ru
......@@ -7,6 +7,18 @@ filter_list:
- function: regex
regex_pattern: Jibu ni (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Swali:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_native_cot_sw
......@@ -7,6 +7,18 @@ filter_list:
- function: regex
regex_pattern: సమాధానం (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'ప్రశ్న:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_native_cot_te
......@@ -7,6 +7,18 @@ filter_list:
- function: regex
regex_pattern: คำตอบคือ (\-?[0-9\.\,]+)
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'โจทย์:'
- </s>
- <|im_end|>
include: cot_yaml
task: mgsm_native_cot_th
......@@ -7,7 +7,18 @@ filter_list:
- function: regex
regex_pattern: 答案是 (\-?[0-9\.\,]+)。
- function: take_first
name: get-answer
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- '问题:'
- </s>
- <|im_end|>
include: cot_yaml
target_delimiter: ""
task: mgsm_native_cot_zh
......@@ -99,11 +99,24 @@ def add_regex_pattern(regex_pattern):
return {
"filter_list": [
{
"name": "get-answer",
"name": "strict-match",
"filter": [
{
"function": "regex",
"regex_pattern": regex_pattern,
"regex_pattern": f"""{regex_pattern}""",
},
{
"function": "take_first",
},
],
},
{
"name": "flexible-extract",
"filter": [
{
"function": "regex",
"regex_pattern": """(-?[$0-9.,]{2,})|(-?[0-9]+)""",
"group_select": -1,
},
{
"function": "take_first",
......@@ -113,7 +126,6 @@ def add_regex_pattern(regex_pattern):
],
}
def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
"""
Generate a yaml file for each language.
......@@ -139,7 +151,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
REGEX = LANGUAGES[lang]["REGEX"]
task_name = f"mgsm_native_cot_{lang}"
filter_list = add_regex_pattern(REGEX)
DELIMITER = "" if lang in ["zh", "ja"]
DELIMITER = "" if lang in ["zh", "ja"] else None
elif mode == "en-cot":
ANSWER = LANGUAGES["en"]["ANSWER"]
REGEX = LANGUAGES["en"]["REGEX"]
......@@ -167,6 +179,10 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
f"""{{{{answer_number|string}}}}"""
f"""{{% endif %}}""",
**filter_list,
"generation_kwargs": {
"until": [QUESTION, "</s>", "<|im_end|>"],
"do_sample": False
},
**({"target_delimiter": DELIMITER} if DELIMITER else {}),
},
f,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment