"docs/en/vscode:/vscode.git/clone" did not exist on "17ccaa5980cec6961b9ce63b964b94697b580a6a"
Unverified commit 89deeeaf, authored by thnkinbtfly, committed by GitHub
Browse files

update bbh, gsm8k, mmlu parsing logic and prompts (Orca2 bbh_cot_zeroshot 0% -> 42%) (#1356)



* update bbh, gsm8k, mmlu parsing logic and prompts

* remove the formatting prompt (bbh) + minor update (mmlu)

* update bbh, gsm8k, mmlu zeroshot, revert fewshots

* update bbh, gsm8k, mmlu version, forward changes to gsm8k-cot

* remove take_last, update to use docs parameters

* add newline

* ruff formatting

* Update pyproject.toml

* fix format

---------
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
parent 19cbb292
...@@ -7,7 +7,10 @@ class RegexFilter(Filter): ...@@ -7,7 +7,10 @@ class RegexFilter(Filter):
""" """ """ """
def __init__( def __init__(
self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", fallback: str = "[invalid]" self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
fallback: str = "[invalid]",
) -> None: ) -> None:
""" """
pass a string `regex` to run `re.compile(r"regex")` on. pass a string `regex` to run `re.compile(r"regex")` on.
...@@ -15,6 +18,7 @@ class RegexFilter(Filter): ...@@ -15,6 +18,7 @@ class RegexFilter(Filter):
""" """
self.regex_pattern = regex_pattern self.regex_pattern = regex_pattern
self.regex = re.compile(regex_pattern) self.regex = re.compile(regex_pattern)
self.group_select = group_select
self.fallback = fallback self.fallback = fallback
def apply(self, resps, docs): def apply(self, resps, docs):
...@@ -25,9 +29,12 @@ class RegexFilter(Filter): ...@@ -25,9 +29,12 @@ class RegexFilter(Filter):
def filter_set(inst): def filter_set(inst):
filtered = [] filtered = []
for resp in inst: for resp in inst:
match = self.regex.search(resp) match = self.regex.findall(resp)
if match: if match:
match = match.group(1).strip() match = match[self.group_select]
if isinstance(match, tuple):
match = [m for m in match if m][0]
match = match.strip()
else: else:
match = self.fallback match = self.fallback
filtered.append(match) filtered.append(match)
......
...@@ -7,21 +7,21 @@ metric_list: ...@@ -7,21 +7,21 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
# ignore_case: true ignore_case: true
# ignore_punctuation: true # ignore_punctuation: true
regexes_to_ignore:
- "\\.$"
- ","
- "\\\\"
- "\n"
- '"'
generation_kwargs: generation_kwargs:
until: until:
- "</s>" - "</s>"
- "Q" - "Q:"
- "\n\n" - "<|im_end|>"
do_sample: false do_sample: false
temperature: 0.0 temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
num_fewshot: 0 num_fewshot: 0
metadata: metadata:
version: 1.0 version: 2.0
"dataset_name": "boolean_expressions" "dataset_name": "boolean_expressions"
"description": "Evaluate the result of a random Boolean expression.\n\n" "description": "Evaluate the result of a random Boolean expression.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_boolean_expressions" "task": "bbh_cot_zeroshot_boolean_expressions"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(True|False)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "causal_judgement" "dataset_name": "causal_judgement"
"description": "Answer questions about causal attribution.\n\n" "description": "Answer questions about causal attribution.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_causal_judgement" "task": "bbh_cot_zeroshot_causal_judgement"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(Yes|No|yes|no)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "date_understanding" "dataset_name": "date_understanding"
"description": "Infer the date from context.\n\n" "description": "Infer the date from context.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_date_understanding" "task": "bbh_cot_zeroshot_date_understanding"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "disambiguation_qa" "dataset_name": "disambiguation_qa"
"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n" "description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_disambiguation_qa" "task": "bbh_cot_zeroshot_disambiguation_qa"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "dyck_languages" "dataset_name": "dyck_languages"
"description": "Correctly close a Dyck-n word.\n\n" "description": "Correctly close a Dyck-n word.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_dyck_languages" "task": "bbh_cot_zeroshot_dyck_languages"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "(?<= )([\" \\[\\(<{}>\\)\\]]+)|([\" \\[\\(<{}>\\)\\]]+)"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "formal_fallacies" "dataset_name": "formal_fallacies"
"description": "Distinguish deductively valid arguments from formal fallacies.\n\n" "description": "Distinguish deductively valid arguments from formal fallacies.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_formal_fallacies" "task": "bbh_cot_zeroshot_formal_fallacies"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(valid|invalid)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "geometric_shapes" "dataset_name": "geometric_shapes"
"description": "Name geometric shapes from their SVG paths.\n\n" "description": "Name geometric shapes from their SVG paths.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_geometric_shapes" "task": "bbh_cot_zeroshot_geometric_shapes"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "hyperbaton" "dataset_name": "hyperbaton"
"description": "Order adjectives correctly in English sentences.\n\n" "description": "Order adjectives correctly in English sentences.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_hyperbaton" "task": "bbh_cot_zeroshot_hyperbaton"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "logical_deduction_five_objects" "dataset_name": "logical_deduction_five_objects"
"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_logical_deduction_five_objects" "task": "bbh_cot_zeroshot_logical_deduction_five_objects"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "logical_deduction_seven_objects" "dataset_name": "logical_deduction_seven_objects"
"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_logical_deduction_seven_objects" "task": "bbh_cot_zeroshot_logical_deduction_seven_objects"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "logical_deduction_three_objects" "dataset_name": "logical_deduction_three_objects"
"description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n" "description": "A logical deduction task which requires deducing the order of a sequence of objects.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_logical_deduction_three_objects" "task": "bbh_cot_zeroshot_logical_deduction_three_objects"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "movie_recommendation" "dataset_name": "movie_recommendation"
"description": "Recommend movies similar to the given list of movies.\n\n" "description": "Recommend movies similar to the given list of movies.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_movie_recommendation" "task": "bbh_cot_zeroshot_movie_recommendation"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "multistep_arithmetic_two" "dataset_name": "multistep_arithmetic_two"
"description": "Solve multi-step arithmetic problems.\n\n" "description": "Solve multi-step arithmetic problems.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_multistep_arithmetic_two" "task": "bbh_cot_zeroshot_multistep_arithmetic_two"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.NumberParseRegexFilter
group_select: -1
regex_pattern: "([-0-9]+)"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "navigate" "dataset_name": "navigate"
"description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n" "description": "Given a series of navigation instructions, determine whether one would end up back at the starting point.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_navigate" "task": "bbh_cot_zeroshot_navigate"
filter_list:
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "\\b(Yes|No|yes|no)\\b"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "object_counting" "dataset_name": "object_counting"
"description": "Questions that involve enumerating objects and asking the model to count them.\n\n" "description": "Questions that involve enumerating objects and asking the model to count them.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_object_counting" "task": "bbh_cot_zeroshot_object_counting"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.NumberParseRegexFilter
group_select: -1
regex_pattern: "([-0-9]+)"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "penguins_in_a_table" "dataset_name": "penguins_in_a_table"
"description": "Answer questions about a table of penguins and their attributes.\n\n" "description": "Answer questions about a table of penguins and their attributes.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_penguins_in_a_table" "task": "bbh_cot_zeroshot_penguins_in_a_table"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "reasoning_about_colored_objects" "dataset_name": "reasoning_about_colored_objects"
"description": "Answer extremely simple questions about the colors of objects on a surface.\n\n" "description": "Answer extremely simple questions about the colors of objects on a surface.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_reasoning_about_colored_objects" "task": "bbh_cot_zeroshot_reasoning_about_colored_objects"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
"dataset_name": "ruin_names" "dataset_name": "ruin_names"
"description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" "description": "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n"
"doc_to_text": "Q: {{input}}\nA: Let's think step by step.\n" "doc_to_text": "Q: {{input}}\nA: Let's think step by step."
"include": "_cot_zeroshot_template_yaml" "include": "_cot_zeroshot_template_yaml"
"task": "bbh_cot_zeroshot_ruin_names" "task": "bbh_cot_zeroshot_ruin_names"
filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.MultiChoiceRegexFilter
group_select: -1
ignore_case: true
ignore_punctuation: true
regex_pattern: "(\\([A-Z]\\))"
- function: "take_first"
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "((?<=The answer is )(.*)(?=.)|(?<=the answer is )(.*)(?=.)|(?<=The answer: )(.*)(?=.)|(?<=The final answer: )(.*)(?=.))"
- function: "take_first"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment