Commit e6b798f9 authored by Baber
Browse files

Merge branch 'main' into metrics

# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 14a29ade 4f8195f1
include: arithmetic_1dc.yaml include: arithmetic_1dc.yaml
task: arithmetic_2da task: arithmetic_2da
dataset_name: arithmetic_2da dataset_name: arithmetic_2da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml include: arithmetic_1dc.yaml
task: arithmetic_2dm task: arithmetic_2dm
dataset_name: arithmetic_2dm dataset_name: arithmetic_2dm
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml include: arithmetic_1dc.yaml
task: arithmetic_2ds task: arithmetic_2ds
dataset_name: arithmetic_2ds dataset_name: arithmetic_2ds
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml include: arithmetic_1dc.yaml
task: arithmetic_3da task: arithmetic_3da
dataset_name: arithmetic_3da dataset_name: arithmetic_3da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml include: arithmetic_1dc.yaml
task: arithmetic_3ds task: arithmetic_3ds
dataset_name: arithmetic_3ds dataset_name: arithmetic_3ds
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml include: arithmetic_1dc.yaml
task: arithmetic_4da task: arithmetic_4da
dataset_name: arithmetic_4da dataset_name: arithmetic_4da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml include: arithmetic_1dc.yaml
task: arithmetic_4ds task: arithmetic_4ds
dataset_name: arithmetic_4ds dataset_name: arithmetic_4ds
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml include: arithmetic_1dc.yaml
task: arithmetic_5da task: arithmetic_5da
dataset_name: arithmetic_5da dataset_name: arithmetic_5da
dataset_kwargs:
trust_remote_code: true
include: arithmetic_1dc.yaml include: arithmetic_1dc.yaml
task: arithmetic_5ds task: arithmetic_5ds
dataset_name: arithmetic_5ds dataset_name: arithmetic_5ds
dataset_kwargs:
trust_remote_code: true
...@@ -41,13 +41,13 @@ fewshot_config: ...@@ -41,13 +41,13 @@ fewshot_config:
target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
filter_list: filter_list:
- filter: - filter:
- function: regex - function: regex
group_select: -1 group_select: -1
regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+)) regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
- function: take_first - function: take_first
name: strict-match name: strict-match
- filter: - filter:
- function: regex - function: regex
group_select: -1 group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+) regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
...@@ -62,11 +62,11 @@ generation_kwargs: ...@@ -62,11 +62,11 @@ generation_kwargs:
- </s> - </s>
- <|im_end|> - <|im_end|>
tag: tag:
- chain_of_thought - chain_of_thought
metadata: metadata:
version: 1.0 version: 1.0
metric_list: metric_list:
- aggregation: mean - aggregation: mean
higher_is_better: true higher_is_better: true
ignore_case: true ignore_case: true
ignore_punctuation: false ignore_punctuation: false
...@@ -84,5 +84,3 @@ validation_split: validation ...@@ -84,5 +84,3 @@ validation_split: validation
test_split: validation test_split: validation
should_decontaminate: true should_decontaminate: true
doc_to_decontamination_query: "{{body}} {{question}}" doc_to_decontamination_query: "{{body}} {{question}}"
dataset_kwargs:
trust_remote_code: true
...@@ -12,5 +12,3 @@ metric_list: ...@@ -12,5 +12,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -23,5 +23,3 @@ metric_list: ...@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.1 version: 0.1
dataset_kwargs:
trust_remote_code: true
...@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter): ...@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
# https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
english_number_regex = regex.compile( english_number_regex = regex.compile(
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))" "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
) )
for r in resps: for r in resps:
...@@ -161,7 +161,7 @@ class WordSortFilter(Filter): ...@@ -161,7 +161,7 @@ class WordSortFilter(Filter):
class MultiChoiceRegexFilter(ExtendedRegexFilter): class MultiChoiceRegexFilter(ExtendedRegexFilter):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
""" r"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
...@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter): ...@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
fallback_regex = re.compile("|".join(fallback_regexes)) fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes) without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile( without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})" rf":[\s]*({without_paren_fallback_regex})"
) )
filtered = [] filtered = []
......
...@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter): ...@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
# https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
english_number_regex = regex.compile( english_number_regex = regex.compile(
"((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))" "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
) )
for r in resps: for r in resps:
...@@ -161,7 +161,7 @@ class WordSortFilter(Filter): ...@@ -161,7 +161,7 @@ class WordSortFilter(Filter):
class MultiChoiceRegexFilter(ExtendedRegexFilter): class MultiChoiceRegexFilter(ExtendedRegexFilter):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
""" r"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
...@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter): ...@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
fallback_regex = re.compile("|".join(fallback_regexes)) fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes) without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile( without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})" rf":[\s]*({without_paren_fallback_regex})"
) )
filtered = [] filtered = []
......
...@@ -20,5 +20,4 @@ dataset_kwargs: ...@@ -20,5 +20,4 @@ dataset_kwargs:
train: en/c4-train.00000-of-01024.json.gz train: en/c4-train.00000-of-01024.json.gz
validation: en/c4-validation.00000-of-00008.json.gz validation: en/c4-validation.00000-of-00008.json.gz
# following the choice of https://arxiv.org/abs/2410.07461 # following the choice of https://arxiv.org/abs/2410.07461
trust_remote_code: true
verification_mode: "no_checks" verification_mode: "no_checks"
...@@ -21,5 +21,3 @@ metric_list: ...@@ -21,5 +21,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -20,5 +20,3 @@ metric_list: ...@@ -20,5 +20,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 3.0 version: 3.0
dataset_kwargs:
trust_remote_code: true
...@@ -9,5 +9,3 @@ metric_list: ...@@ -9,5 +9,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 0.0
dataset_kwargs:
trust_remote_code: true
...@@ -20,5 +20,3 @@ metric_list: ...@@ -20,5 +20,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -13,5 +13,3 @@ metric_list: ...@@ -13,5 +13,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 0.0 version: 0.0
dataset_kwargs:
trust_remote_code: true
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment