Merge branch 'main' into metrics

# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml

Merge branch 'main' into metrics
# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
e6b798f9 · Baber · 14a29ade · 4f8195f1 · e6b798f9 · e6b798f9
Commit e6b798f9 authored Jul 25, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/arithmetic/arithmetic_2da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_2da.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_2da
 dataset_name: arithmetic_2da
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_2dm
 dataset_name: arithmetic_2dm
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_2ds
 dataset_name: arithmetic_2ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_3da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_3da.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_3da
 dataset_name: arithmetic_3da
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_3ds
 dataset_name: arithmetic_3ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_4da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_4da.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_4da
 dataset_name: arithmetic_4da
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_4ds
 dataset_name: arithmetic_4ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_5da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_5da.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_5da
 dataset_name: arithmetic_5da
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_5ds
 dataset_name: arithmetic_5ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/asdiv/asdiv-cot-llama.yaml
+++ b/lm_eval/tasks/asdiv/asdiv-cot-llama.yaml
@@ -41,13 +41,13 @@ fewshot_config:
    target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
      dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
 filter_list:
- filter:
+  - filter:
      - function: regex
        group_select: -1
        regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
      - function: take_first
    name: strict-match
- filter:
+  - filter:
      - function: regex
        group_select: -1
        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
@@ -62,11 +62,11 @@ generation_kwargs:
    - </s>
    - <|im_end|>
 tag:
- chain_of_thought
+  - chain_of_thought
 metadata:
  version: 1.0
 metric_list:
- aggregation: mean
+  - aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
@@ -84,5 +84,3 @@ validation_split: validation
 test_split: validation
 should_decontaminate: true
 doc_to_decontamination_query: "{{body}} {{question}}"
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/asdiv/default.yaml
+++ b/lm_eval/tasks/asdiv/default.yaml
@@ -12,5 +12,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/basque_bench/flores_eu/_flores_common_yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 0.1
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/bbh/cot_zeroshot/utils.py
+++ b/lm_eval/tasks/bbh/cot_zeroshot/utils.py
@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
        # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
        english_number_regex = regex.compile(
-            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
+            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
        )
        for r in resps:
@@ -161,7 +161,7 @@ class WordSortFilter(Filter):
 class MultiChoiceRegexFilter(ExtendedRegexFilter):
    def __init__(self, *args, **kwargs):
-        """
+        r"""
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
-                f":[\s]*({without_paren_fallback_regex})"
+                rf":[\s]*({without_paren_fallback_regex})"
            )
            filtered = []

--- a/lm_eval/tasks/bbh/zeroshot/utils.py
+++ b/lm_eval/tasks/bbh/zeroshot/utils.py
@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter):
        # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
        english_number_regex = regex.compile(
-            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
+            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
        )
        for r in resps:
@@ -161,7 +161,7 @@ class WordSortFilter(Filter):
 class MultiChoiceRegexFilter(ExtendedRegexFilter):
    def __init__(self, *args, **kwargs):
-        """
+        r"""
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
-                f":[\s]*({without_paren_fallback_regex})"
+                rf":[\s]*({without_paren_fallback_regex})"
            )
            filtered = []

--- a/lm_eval/tasks/c4/c4.yaml
+++ b/lm_eval/tasks/c4/c4.yaml
@@ -20,5 +20,4 @@ dataset_kwargs:
    train: en/c4-train.00000-of-01024.json.gz
    validation: en/c4-validation.00000-of-00008.json.gz
  # following the choice of https://arxiv.org/abs/2410.07461
-  trust_remote_code: true
  verification_mode: "no_checks"
--- a/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml
+++ b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml
@@ -21,5 +21,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/coqa/default.yaml
+++ b/lm_eval/tasks/coqa/default.yaml
@@ -20,5 +20,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 3.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/darija_bench/darija_sentiment/default_darija_sentiment_template_yaml
+++ b/lm_eval/tasks/darija_bench/darija_sentiment/default_darija_sentiment_template_yaml
@@ -9,5 +9,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/darijahellaswag/darijahellaswag.yaml
+++ b/lm_eval/tasks/darijahellaswag/darijahellaswag.yaml
@@ -20,5 +20,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/darijammlu/_default_darijammlu_template_yaml
+++ b/lm_eval/tasks/darijammlu/_default_darijammlu_template_yaml
@@ -13,5 +13,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true