"app/vscode:/vscode.git/clone" did not exist on "ed6abba75a5de5e9934187cca1fe115e40584e76"
Unverified commit 8371662c, authored by thnkinbtfly and committed by GitHub
Browse files

update parsing logic of mgsm following gsm8k (#1462)

parent 75ac1f47
...@@ -2,5 +2,11 @@ ...@@ -2,5 +2,11 @@
dataset_name: sw dataset_name: sw
doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}' doc_to_target: '{% if answer is not none %}{{answer[25:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Swali: "+question+"\nStep-by-Step Answer:"}}{% endif %}' doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"Swali: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- 'Swali:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_en_cot_sw task: mgsm_en_cot_sw
...@@ -2,5 +2,11 @@ ...@@ -2,5 +2,11 @@
dataset_name: te dataset_name: te
doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}' doc_to_target: '{% if answer is not none %}{{answer[19:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"ప్రశ్న: "+question+"\nStep-by-Step Answer:"}}{% endif %}' doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"ప్రశ్న: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- 'ప్రశ్న:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_en_cot_te task: mgsm_en_cot_te
...@@ -2,5 +2,11 @@ ...@@ -2,5 +2,11 @@
dataset_name: th dataset_name: th
doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}' doc_to_target: '{% if answer is not none %}{{answer[18:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"โจทย์: "+question+"\nStep-by-Step Answer:"}}{% endif %}' doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"โจทย์: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- 'โจทย์:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_en_cot_th task: mgsm_en_cot_th
...@@ -2,5 +2,11 @@ ...@@ -2,5 +2,11 @@
dataset_name: zh dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}' doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题: "+question+"\nStep-by-Step Answer:"}}{% endif %}' doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- '问题:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_en_cot_zh task: mgsm_en_cot_zh
...@@ -28,4 +28,4 @@ filter_list: ...@@ -28,4 +28,4 @@ filter_list:
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first" - function: "take_first"
metadata: metadata:
version: 2.0 version: 3.0
...@@ -7,6 +7,18 @@ filter_list: ...@@ -7,6 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: The answer is (\-?[0-9\.\,]+) regex_pattern: The answer is (\-?[0-9\.\,]+)
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'প্রশ্ন:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_native_cot_bn task: mgsm_native_cot_bn
...@@ -7,6 +7,18 @@ filter_list: ...@@ -7,6 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: Die Antwort lautet (\-?[0-9\.\,]+) regex_pattern: Die Antwort lautet (\-?[0-9\.\,]+)
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Frage:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_native_cot_de task: mgsm_native_cot_de
...@@ -7,6 +7,18 @@ filter_list: ...@@ -7,6 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: The answer is (\-?[0-9\.\,]+) regex_pattern: The answer is (\-?[0-9\.\,]+)
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Question:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_native_cot_en task: mgsm_native_cot_en
...@@ -7,6 +7,18 @@ filter_list: ...@@ -7,6 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: La respuesta es (\-?[0-9\.\,]+) regex_pattern: La respuesta es (\-?[0-9\.\,]+)
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Pregunta:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_native_cot_es task: mgsm_native_cot_es
...@@ -7,6 +7,18 @@ filter_list: ...@@ -7,6 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: La réponse est (\-?[0-9\.\,]+) regex_pattern: La réponse est (\-?[0-9\.\,]+)
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Question :'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_native_cot_fr task: mgsm_native_cot_fr
...@@ -7,7 +7,18 @@ filter_list: ...@@ -7,7 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: 答えは(\-?[0-9\.\,]+)です。 regex_pattern: 答えは(\-?[0-9\.\,]+)です。
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- '問題:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
target_delimiter: ""
task: mgsm_native_cot_ja task: mgsm_native_cot_ja
...@@ -7,6 +7,18 @@ filter_list: ...@@ -7,6 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: Ответ — (\-?[0-9\.\,]+) regex_pattern: Ответ — (\-?[0-9\.\,]+)
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Задача:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_native_cot_ru task: mgsm_native_cot_ru
...@@ -7,6 +7,18 @@ filter_list: ...@@ -7,6 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: Jibu ni (\-?[0-9\.\,]+) regex_pattern: Jibu ni (\-?[0-9\.\,]+)
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'Swali:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_native_cot_sw task: mgsm_native_cot_sw
...@@ -7,6 +7,18 @@ filter_list: ...@@ -7,6 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: సమాధానం (\-?[0-9\.\,]+) regex_pattern: సమాధానం (\-?[0-9\.\,]+)
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'ప్రశ్న:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_native_cot_te task: mgsm_native_cot_te
...@@ -7,6 +7,18 @@ filter_list: ...@@ -7,6 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: คำตอบคือ (\-?[0-9\.\,]+) regex_pattern: คำตอบคือ (\-?[0-9\.\,]+)
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- 'โจทย์:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
task: mgsm_native_cot_th task: mgsm_native_cot_th
...@@ -7,7 +7,18 @@ filter_list: ...@@ -7,7 +7,18 @@ filter_list:
- function: regex - function: regex
regex_pattern: 答案是 (\-?[0-9\.\,]+)。 regex_pattern: 答案是 (\-?[0-9\.\,]+)。
- function: take_first - function: take_first
name: get-answer name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- '问题:'
- </s>
- <|im_end|>
include: cot_yaml include: cot_yaml
target_delimiter: ""
task: mgsm_native_cot_zh task: mgsm_native_cot_zh
...@@ -99,11 +99,24 @@ def add_regex_pattern(regex_pattern): ...@@ -99,11 +99,24 @@ def add_regex_pattern(regex_pattern):
return { return {
"filter_list": [ "filter_list": [
{ {
"name": "get-answer", "name": "strict-match",
"filter": [ "filter": [
{ {
"function": "regex", "function": "regex",
"regex_pattern": regex_pattern, "regex_pattern": f"""{regex_pattern}""",
},
{
"function": "take_first",
},
],
},
{
"name": "flexible-extract",
"filter": [
{
"function": "regex",
"regex_pattern": """(-?[$0-9.,]{2,})|(-?[0-9]+)""",
"group_select": -1,
}, },
{ {
"function": "take_first", "function": "take_first",
...@@ -113,7 +126,6 @@ def add_regex_pattern(regex_pattern): ...@@ -113,7 +126,6 @@ def add_regex_pattern(regex_pattern):
], ],
} }
def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
""" """
Generate a yaml file for each language. Generate a yaml file for each language.
...@@ -139,7 +151,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: ...@@ -139,7 +151,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
REGEX = LANGUAGES[lang]["REGEX"] REGEX = LANGUAGES[lang]["REGEX"]
task_name = f"mgsm_native_cot_{lang}" task_name = f"mgsm_native_cot_{lang}"
filter_list = add_regex_pattern(REGEX) filter_list = add_regex_pattern(REGEX)
DELIMITER = "" if lang in ["zh", "ja"] DELIMITER = "" if lang in ["zh", "ja"] else None
elif mode == "en-cot": elif mode == "en-cot":
ANSWER = LANGUAGES["en"]["ANSWER"] ANSWER = LANGUAGES["en"]["ANSWER"]
REGEX = LANGUAGES["en"]["REGEX"] REGEX = LANGUAGES["en"]["REGEX"]
...@@ -167,6 +179,10 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None: ...@@ -167,6 +179,10 @@ def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
f"""{{{{answer_number|string}}}}""" f"""{{{{answer_number|string}}}}"""
f"""{{% endif %}}""", f"""{{% endif %}}""",
**filter_list, **filter_list,
"generation_kwargs": {
"until": [QUESTION, "</s>", "<|im_end|>"],
"do_sample": False
},
**({"target_delimiter": DELIMITER} if DELIMITER else {}), **({"target_delimiter": DELIMITER} if DELIMITER else {}),
}, },
f, f,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment