remove trust-remote-code in configs; fix escape sequences (#3180)

* remove trust-remote-code
* add W605 rule

remove trust-remote-code in configs; fix escape sequences (#3180)
* remove trust-remote-code
* add W605 rule
314f7176 · Baber Abbasi · GitHub · 8c6fde08 · 314f7176 · 314f7176
Unverified commit 314f7176, authored Jul 23, 2025 by Baber Abbasi; committed by GitHub Jul 23, 2025
18 changed files
--- a/lm_eval/tasks/race/race.yaml
+++ b/lm_eval/tasks/race/race.yaml
@@ -12,5 +12,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/agi_eval/non_greedy_robustness_agieval_aqua_rat.yaml
+++ b/lm_eval/tasks/score/agi_eval/non_greedy_robustness_agieval_aqua_rat.yaml
@@ -28,9 +28,7 @@ generation_kwargs:
 process_results: !function utils_agieval.non_greedy_robustness_process_results
 metric_list:
  - metric: non_greedy_accuracy
-    aggregation:  !function utils_agieval.non_greedy_accuracy
+    aggregation: !function utils_agieval.non_greedy_accuracy
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
+++ b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
@@ -27,21 +27,19 @@ generation_kwargs:
 process_results: !function utils_agieval.option_order_robustness_process_results
 metric_list:
  - metric: per_option_accuracy_A
-    aggregation:  !function utils_agieval.per_option_accuracy_a
+    aggregation: !function utils_agieval.per_option_accuracy_a
    higher_is_better: true
  - metric: per_option_accuracy_B
-    aggregation:  !function utils_agieval.per_option_accuracy_b
+    aggregation: !function utils_agieval.per_option_accuracy_b
    higher_is_better: true
  - metric: per_option_accuracy_C
-    aggregation:  !function utils_agieval.per_option_accuracy_c
+    aggregation: !function utils_agieval.per_option_accuracy_c
    higher_is_better: true
  - metric: per_option_accuracy_D
-    aggregation:  !function utils_agieval.per_option_accuracy_d
+    aggregation: !function utils_agieval.per_option_accuracy_d
    higher_is_better: true
  - metric: options_consistency_rate
-    aggregation:  !function utils_agieval.options_consistency_rate
+    aggregation: !function utils_agieval.options_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
+++ b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
@@ -27,39 +27,37 @@ generation_kwargs:
 process_results: !function utils_agieval.prompt_robustness_process_results
 metric_list:
  - metric: 0_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_0
+    aggregation: !function utils_agieval.per_prompt_accuracy_0
    higher_is_better: true
  - metric: 1_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_1
+    aggregation: !function utils_agieval.per_prompt_accuracy_1
    higher_is_better: true
  - metric: 2_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_2
+    aggregation: !function utils_agieval.per_prompt_accuracy_2
    higher_is_better: true
  - metric: 3_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_3
+    aggregation: !function utils_agieval.per_prompt_accuracy_3
    higher_is_better: true
  - metric: 4_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_4
+    aggregation: !function utils_agieval.per_prompt_accuracy_4
    higher_is_better: true
  - metric: 5_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_5
+    aggregation: !function utils_agieval.per_prompt_accuracy_5
    higher_is_better: true
  - metric: 6_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_6
+    aggregation: !function utils_agieval.per_prompt_accuracy_6
    higher_is_better: true
  - metric: 7_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_7
+    aggregation: !function utils_agieval.per_prompt_accuracy_7
    higher_is_better: true
  - metric: 8_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_8
+    aggregation: !function utils_agieval.per_prompt_accuracy_8
    higher_is_better: true
  - metric: 9_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_9
+    aggregation: !function utils_agieval.per_prompt_accuracy_9
    higher_is_better: true
  - metric: consistency_rate
-    aggregation:  !function utils_agieval.agi_eval_prompt_consistency_rate
+    aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/math/math_grader.py
+++ b/lm_eval/tasks/score/math/math_grader.py
@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
        "\\\\textit",
    ]:
        expr = expr.replace(surround_str, "")
-        pattern = f"^{surround_str}" + "\{(?P<text>.+?)\}$"
+        pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
        m = re.search(pattern, expr)
        if m is not None:
            expr = m.group("text")
-    expr = expr.replace("\!", "")
+    expr = expr.replace(r"\!", "")
    expr = expr.replace("\\%", "%")
    expr = expr.replace("\\$", "$")
    expr = expr.replace("$", "")
@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
        "p.m.",
        "PM",
    ]:
-        expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
+        expr = re.sub(rf"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
    if "day" in expr:
        days = [
@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
        if not weekday_expressed:
            expr = re.sub("day(s)?", "", expr)
-    expr = re.sub("\^ *\\\\circ", "", expr)
+    expr = re.sub("\\^ *\\\\circ", "", expr)
    if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
        expr = expr[1:-1]

--- a/lm_eval/tasks/score/math/non_greedy_robustness_math_algebra.yaml
+++ b/lm_eval/tasks/score/math/non_greedy_robustness_math_algebra.yaml
@@ -18,7 +18,7 @@ dataset_name: algebra
 output_type: generate_until
 test_split: test
 process_docs: !function utils_math.non_greedy_robustness_process_docs
-doc_to_text:  !function utils_math.math_robustness_doc_to_text
+doc_to_text: !function utils_math.math_robustness_doc_to_text
 doc_to_target: answer
 generation_kwargs:
  max_gen_toks: 1024
@@ -28,9 +28,7 @@ generation_kwargs:
 process_results: !function utils_math.non_greedy_robustness_process_results
 metric_list:
  - metric: non_greedy_accuracy
-    aggregation:  !function utils_math.non_greedy_accuracy
+    aggregation: !function utils_math.non_greedy_accuracy
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
+++ b/lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs
 dataset_name: algebra
 output_type: generate_until
 test_split: test
-doc_to_text:  !function utils_math.math_robustness_doc_to_text
+doc_to_text: !function utils_math.math_robustness_doc_to_text
 process_results: !function utils_math.process_results
 doc_to_target: answer
 generation_kwargs:
@@ -28,39 +28,37 @@ generation_kwargs:
  max_gen_toks: 1024
 metric_list:
  - metric: 0_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_0
+    aggregation: !function utils_math.per_prompt_accuracy_0
    higher_is_better: true
  - metric: 1_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_1
+    aggregation: !function utils_math.per_prompt_accuracy_1
    higher_is_better: true
  - metric: 2_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_2
+    aggregation: !function utils_math.per_prompt_accuracy_2
    higher_is_better: true
  - metric: 3_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_3
+    aggregation: !function utils_math.per_prompt_accuracy_3
    higher_is_better: true
  - metric: 4_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_4
+    aggregation: !function utils_math.per_prompt_accuracy_4
    higher_is_better: true
  - metric: 5_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_5
+    aggregation: !function utils_math.per_prompt_accuracy_5
    higher_is_better: true
  - metric: 6_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_6
+    aggregation: !function utils_math.per_prompt_accuracy_6
    higher_is_better: true
  - metric: 7_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_7
+    aggregation: !function utils_math.per_prompt_accuracy_7
    higher_is_better: true
  - metric: 8_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_8
+    aggregation: !function utils_math.per_prompt_accuracy_8
    higher_is_better: true
  - metric: 9_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_9
+    aggregation: !function utils_math.per_prompt_accuracy_9
    higher_is_better: true
  - metric: consistency_rate
-    aggregation:  !function utils_math.math_prompt_consistency_rate
+    aggregation: !function utils_math.math_prompt_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/mmlu_pro/score_non_greedy_robustness_mmlu_pro.yaml
+++ b/lm_eval/tasks/score/mmlu_pro/score_non_greedy_robustness_mmlu_pro.yaml
@@ -30,9 +30,7 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results
 metric_list:
  - metric: non_greedy_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.non_greedy_macro_accuracy
+    aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
+++ b/lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
@@ -29,39 +29,37 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.option_order_robustness_process_results
 metric_list:
  - metric: per_option_macro_accuracy_A
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_a
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
    higher_is_better: true
  - metric: per_option_macro_accuracy_B
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_b
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
    higher_is_better: true
  - metric: per_option_macro_accuracy_C
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_c
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
    higher_is_better: true
  - metric: per_option_macro_accuracy_D
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_d
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
    higher_is_better: true
  - metric: per_option_macro_accuracy_E
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_e
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
    higher_is_better: true
  - metric: per_option_macro_accuracy_F
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_f
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
    higher_is_better: true
  - metric: per_option_macro_accuracy_G
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_g
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
    higher_is_better: true
  - metric: per_option_macro_accuracy_H
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_h
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
    higher_is_better: true
  - metric: per_option_macro_accuracy_I
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_i
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
    higher_is_better: true
  - metric: per_option_macro_accuracy_J
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_j
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
    higher_is_better: true
  - metric: options_consistency_rate
-    aggregation:  !function utils_mmlu_pro.options_consistency_rate
+    aggregation: !function utils_mmlu_pro.options_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
+++ b/lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
@@ -29,39 +29,37 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.prompt_robustness_process_results
 metric_list:
  - metric: 0_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_0
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
    higher_is_better: true
  - metric: 1_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_1
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
    higher_is_better: true
  - metric: 2_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_2
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
    higher_is_better: true
  - metric: 3_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_3
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
    higher_is_better: true
  - metric: 4_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_4
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
    higher_is_better: true
  - metric: 5_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_5
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
    higher_is_better: true
  - metric: 6_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_6
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
    higher_is_better: true
  - metric: 7_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_7
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
    higher_is_better: true
  - metric: 8_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_8
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
    higher_is_better: true
  - metric: 9_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_9
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
    higher_is_better: true
  - metric: consistency_rate
-    aggregation:  !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
+    aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/spanish_bench/flores_es/_flores_common_yaml
+++ b/lm_eval/tasks/spanish_bench/flores_es/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/unscramble/anagrams1.yaml
+++ b/lm_eval/tasks/unscramble/anagrams1.yaml
@@ -18,5 +18,3 @@ metric_list:
    ignore_punctuation: false
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/unscramble/anagrams2.yaml
+++ b/lm_eval/tasks/unscramble/anagrams2.yaml
@@ -18,5 +18,3 @@ metric_list:
    ignore_punctuation: false
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/unscramble/cycle_letters.yaml
+++ b/lm_eval/tasks/unscramble/cycle_letters.yaml
@@ -18,5 +18,3 @@ metric_list:
    ignore_punctuation: false
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/unscramble/random_insertion.yaml
+++ b/lm_eval/tasks/unscramble/random_insertion.yaml
@@ -18,5 +18,3 @@ metric_list:
    ignore_punctuation: false
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/wikitext/wikitext.yaml
+++ b/lm_eval/tasks/wikitext/wikitext.yaml
@@ -16,5 +16,3 @@ metric_list:
  - metric: bits_per_byte
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/winogrande/default.yaml
+++ b/lm_eval/tasks/winogrande/default.yaml
@@ -15,5 +15,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -106,7 +106,7 @@ plugins.md029.allow_extended_start_values = true # ol-prefix
 plugins.md034.enabled = false # no-bare-urls
 [tool.ruff.lint]
-extend-select = ["I"]
+extend-select = ["I", "W605"]
 [tool.ruff.lint.isort]
 lines-after-imports = 2