remove trust-remote-code in configs; fix escape sequences (#3180)

* remove trust-remote-code
* add W605 rule

remove trust-remote-code in configs; fix escape sequences (#3180)
* remove trust-remote-code
* add W605 rule
314f7176 · Baber Abbasi · GitHub · 8c6fde08 · 314f7176 · 314f7176
Unverified Commit 314f7176 authored Jul 23, 2025 by Baber Abbasi Committed by GitHub Jul 23, 2025
20 changed files
--- a/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_4ds
 dataset_name: arithmetic_4ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_5da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_5da.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_5da
 dataset_name: arithmetic_5da
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_5ds
 dataset_name: arithmetic_5ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/asdiv/asdiv-cot-llama.yaml
+++ b/lm_eval/tasks/asdiv/asdiv-cot-llama.yaml
@@ -41,41 +41,41 @@ fewshot_config:
    target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
      dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
 filter_list:
- filter:
-  - function: regex
-    group_select: -1
-    regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
-  - function: take_first
-  name: strict-match
- filter:
-  - function: regex
-    group_select: -1
-    regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
-  - function: take_first
-  name: flexible-extract
+  - filter:
+      - function: regex
+        group_select: -1
+        regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
+      - function: take_first
+    name: strict-match
+  - filter:
+      - function: regex
+        group_select: -1
+        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
+      - function: take_first
+    name: flexible-extract
 generation_kwargs:
  do_sample: false
  until:
-  - '<|eot_id|>'
-  - '<|start_header_id|>user<|end_header_id|>'
-  - 'Q:'
-  - </s>
-  - <|im_end|>
+    - '<|eot_id|>'
+    - '<|start_header_id|>user<|end_header_id|>'
+    - 'Q:'
+    - </s>
+    - <|im_end|>
 tag:
- chain_of_thought
+  - chain_of_thought
 metadata:
  version: 1.0
 metric_list:
- aggregation: mean
-  higher_is_better: true
-  ignore_case: true
-  ignore_punctuation: false
-  metric: exact_match
-  regexes_to_ignore:
-  - ','
-  - \$
-  - '(?s).*#### '
-  - \.$
+  - aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: false
+    metric: exact_match
+    regexes_to_ignore:
+      - ','
+      - \$
+      - '(?s).*#### '
+      - \.$
 num_fewshot: 8
 output_type: generate_until
 repeats: 1
@@ -84,5 +84,3 @@ validation_split: validation
 test_split: validation
 should_decontaminate: true
 doc_to_decontamination_query: "{{body}} {{question}}"
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/asdiv/default.yaml
+++ b/lm_eval/tasks/asdiv/default.yaml
@@ -12,5 +12,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/basque_bench/flores_eu/_flores_common_yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 0.1
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/bbh/cot_zeroshot/utils.py
+++ b/lm_eval/tasks/bbh/cot_zeroshot/utils.py
@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter):

        # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
        english_number_regex = regex.compile(
-            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
+            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
        )

        for r in resps:
@@ -161,7 +161,7 @@ class WordSortFilter(Filter):

 class MultiChoiceRegexFilter(ExtendedRegexFilter):
    def __init__(self, *args, **kwargs):
-        """
+        r"""
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
-                f":[\s]*({without_paren_fallback_regex})"
+                rf":[\s]*({without_paren_fallback_regex})"
            )

            filtered = []

--- a/lm_eval/tasks/bbh/zeroshot/utils.py
+++ b/lm_eval/tasks/bbh/zeroshot/utils.py
@@ -118,7 +118,7 @@ class NumberParseRegexFilter(ExtendedRegexFilter):

        # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
        english_number_regex = regex.compile(
-            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
+            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\\S\r\n]|,|and|&)+)?(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
        )

        for r in resps:
@@ -161,7 +161,7 @@ class WordSortFilter(Filter):

 class MultiChoiceRegexFilter(ExtendedRegexFilter):
    def __init__(self, *args, **kwargs):
-        """
+        r"""
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
@@ -202,7 +202,7 @@ class MultiChoiceRegexFilter(ExtendedRegexFilter):
            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
-                f":[\s]*({without_paren_fallback_regex})"
+                rf":[\s]*({without_paren_fallback_regex})"
            )

            filtered = []

--- a/lm_eval/tasks/c4/c4.yaml
+++ b/lm_eval/tasks/c4/c4.yaml
@@ -20,5 +20,4 @@ dataset_kwargs:
    train: en/c4-train.00000-of-01024.json.gz
    validation: en/c4-validation.00000-of-00008.json.gz
  # following the choice of https://arxiv.org/abs/2410.07461
-  trust_remote_code: true
  verification_mode: "no_checks"
--- a/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml
+++ b/lm_eval/tasks/catalan_bench/flores_ca/_flores_common_yaml
@@ -21,5 +21,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/coqa/default.yaml
+++ b/lm_eval/tasks/coqa/default.yaml
@@ -20,5 +20,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 3.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/darija_bench/darija_sentiment/default_darija_sentiment_template_yaml
+++ b/lm_eval/tasks/darija_bench/darija_sentiment/default_darija_sentiment_template_yaml
@@ -9,5 +9,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/darijahellaswag/darijahellaswag.yaml
+++ b/lm_eval/tasks/darijahellaswag/darijahellaswag.yaml
@@ -20,5 +20,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/darijammlu/_default_darijammlu_template_yaml
+++ b/lm_eval/tasks/darijammlu/_default_darijammlu_template_yaml
@@ -13,5 +13,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/drop/default.yaml
+++ b/lm_eval/tasks/drop/default.yaml
@@ -22,5 +22,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 3.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/egyhellaswag/egyhellaswag.yaml
+++ b/lm_eval/tasks/egyhellaswag/egyhellaswag.yaml
@@ -20,5 +20,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml
+++ b/lm_eval/tasks/galician_bench/flores_gl/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/groundcocoa/groundcocoa.yaml
+++ b/lm_eval/tasks/groundcocoa/groundcocoa.yaml
@@ -14,5 +14,4 @@ metric_list:
    aggregation: mean
    higher_is_better: true
 dataset_kwargs:
-  trust_remote_code: true
  streaming: true
--- a/lm_eval/tasks/hellaswag/hellaswag.yaml
+++ b/lm_eval/tasks/hellaswag/hellaswag.yaml
@@ -20,5 +20,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml
+++ b/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml
@@ -7,7 +7,7 @@ dataset_name: algebra
 output_type: generate_until
 training_split: train
 test_split: test
-doc_to_text:  "Problem: {{problem}}\nAnswer:"
+doc_to_text: "Problem: {{problem}}\nAnswer:"
 process_results: !function utils.process_results
 doc_to_target: "{{answer}}"
 generation_kwargs:
@@ -21,5 +21,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true