Commit e6b798f9 authored by Baber
Browse files

Merge branch 'main' into metrics

# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 14a29ade 4f8195f1
include: arithmetic_1dc.yaml
task: arithmetic_2da
dataset_name: arithmetic_2da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_2dm
dataset_name: arithmetic_2dm
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_2ds
dataset_name: arithmetic_2ds
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_3da
dataset_name: arithmetic_3da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_3ds
dataset_name: arithmetic_3ds
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_4da
dataset_name: arithmetic_4da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_4ds
dataset_name: arithmetic_4ds
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_5da
dataset_name: arithmetic_5da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml
task: arithmetic_5ds
dataset_name: arithmetic_5ds
dataset_kwargs:
trust_remote_code: true
......@@ -41,41 +41,41 @@ fewshot_config:
target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
filter_list:
- filter:
- function: regex
group_select: -1
regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
- function: take_first
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
- filter:
- function: regex
group_select: -1
regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
- function: take_first
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- '<|eot_id|>'
- '<|start_header_id|>user<|end_header_id|>'
- 'Q:'
- </s>
- <|im_end|>
- '<|eot_id|>'
- '<|start_header_id|>user<|end_header_id|>'
- 'Q:'
- </s>
- <|im_end|>
tag:
- chain_of_thought
- chain_of_thought
metadata:
version: 1.0
metric_list:
- aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: false
metric: exact_match
regexes_to_ignore:
- ','
- \$
- '(?s).*#### '
- \.$
- aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: false
metric: exact_match
regexes_to_ignore:
- ','
- \$
- '(?s).*#### '
- \.$
num_fewshot: 8
output_type: generate_until
repeats: 1
......@@ -84,5 +84,3 @@ validation_split: validation
test_split: validation
should_decontaminate: true
doc_to_decontamination_query: "{{body}} {{question}}"
dataset_kwargs:
trust_remote_code: true
......@@ -12,5 +12,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true
metadata:
version: 0.1
dataset_kwargs:
trust_remote_code: true
......@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
# https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
english_number_regex = regex.compile(
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
)
for r in resps:
......@@ -161,7 +161,7 @@ class WordSortFilter(Filter):
class MultiChoiceRegexFilter(ExtendedRegexFilter):
def __init__(self, *args, **kwargs):
"""
r"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
......@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
rf":[\s]*({without_paren_fallback_regex})"
)
filtered = []
......
......@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
# https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
english_number_regex = regex.compile(
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
)
for r in resps:
......@@ -161,7 +161,7 @@ class WordSortFilter(Filter):
class MultiChoiceRegexFilter(ExtendedRegexFilter):
def __init__(self, *args, **kwargs):
"""
r"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
......@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
rf":[\s]*({without_paren_fallback_regex})"
)
filtered = []
......
......@@ -20,5 +20,4 @@ dataset_kwargs:
train: en/c4-train.00000-of-01024.json.gz
validation: en/c4-validation.00000-of-00008.json.gz
# following the choice of https://arxiv.org/abs/2410.07461
trust_remote_code: true
verification_mode: "no_checks"
......@@ -21,5 +21,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -20,5 +20,3 @@ metric_list:
higher_is_better: true
metadata:
version: 3.0
dataset_kwargs:
trust_remote_code: true
......@@ -9,5 +9,3 @@ metric_list:
higher_is_better: true
metadata:
version: 0.0
dataset_kwargs:
trust_remote_code: true
......@@ -20,5 +20,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -13,5 +13,3 @@ metric_list:
higher_is_better: true
metadata:
version: 0.0
dataset_kwargs:
trust_remote_code: true
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment