remove trust-remote-code in configs; fix escape sequences (#3180)

* remove trust-remote-code
* add W605 rule

remove trust-remote-code in configs; fix escape sequences (#3180)
* remove trust-remote-code
* add W605 rule
314f7176 · Baber Abbasi · GitHub · 8c6fde08 · 314f7176 · 314f7176
Unverified commit 314f7176, authored Jul 23, 2025 by Baber Abbasi; committed by GitHub on Jul 23, 2025
20 changed files
--- a/lm_eval/tasks/mediqa_qa2019/mediqa_qa2019_perplexity.yaml
+++ b/lm_eval/tasks/mediqa_qa2019/mediqa_qa2019_perplexity.yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: false
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
+++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml
@@ -25,8 +25,6 @@ metric_list:
 num_fewshot: 4
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
 fewshot_config:
  sampler: first_n
  samples: !function utils.list_fewshot_samples
--- a/lm_eval/tasks/mlqa/mlqa_common_yaml
+++ b/lm_eval/tasks/mlqa/mlqa_common_yaml
 dataset_path: facebook/mlqa
-dataset_kwargs:
-  trust_remote_code: true
 test_split: test
 validation_split: validation
 output_type: generate_until

--- a/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
+++ b/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml
@@ -9,5 +9,3 @@ doc_to_choice: "{{choices}}"
 doc_to_target: "{{answer}}"
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/default/_default_template_yaml
+++ b/lm_eval/tasks/mmlu/default/_default_template_yaml
@@ -13,5 +13,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml
@@ -26,5 +26,3 @@ metric_list:
    ignore_punctuation: true
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
@@ -34,5 +34,3 @@ metric_list:
    ignore_punctuation: true
 metadata:
  version: 3.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py
@@ -17,7 +17,7 @@ class MultiChoiceRegexFilter(RegexFilter):
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
-        """
+        r"""
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
@@ -90,7 +90,7 @@ class MultiChoiceRegexFilter(RegexFilter):
            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
-                f":[\s]*({without_paren_fallback_regex})"
+                rf":[\s]*({without_paren_fallback_regex})"
            )
            filtered = []

--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
@@ -30,5 +30,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 3.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py
@@ -17,7 +17,7 @@ class MultiChoiceRegexFilter(RegexFilter):
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
-        """
+        r"""
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
@@ -90,7 +90,7 @@ class MultiChoiceRegexFilter(RegexFilter):
            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
-                f":[\s]*({without_paren_fallback_regex})"
+                rf":[\s]*({without_paren_fallback_regex})"
            )
            filtered = []

--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml
@@ -13,5 +13,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/generative/_default_template_yaml
+++ b/lm_eval/tasks/mmlu/generative/_default_template_yaml
@@ -30,5 +30,3 @@ filter_list:
      - function: take_first
 metadata:
  version: 3.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu_prox/mmlu_prox_config_generator.py
+++ b/lm_eval/tasks/mmlu_prox/mmlu_prox_config_generator.py
@@ -44,7 +44,7 @@ if __name__ == "__main__":
                    line = line.format(lang=lang_abbr)
                if "{ans_regex}" in line:
                    ans_regex = lang_lib_list[-1].replace(
-                        "({})", "\(?([ABCDEFGHIJ])\)?"
+                        "({})", r"\(?([ABCDEFGHIJ])\)?"
                    )
                    if lang_abbr == "en":
                        ans_regex = ans_regex.lstrip("the").strip()

--- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
@@ -12,5 +12,3 @@ metric_list:
  - metric: acc
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
@@ -12,5 +12,3 @@ metric_list:
  - metric: acc
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml
@@ -12,5 +12,3 @@ metric_list:
  - metric: acc
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mutual/mutual.yaml
+++ b/lm_eval/tasks/mutual/mutual.yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/noreval/tatoeba/_tatoeba_yaml
+++ b/lm_eval/tasks/noreval/tatoeba/_tatoeba_yaml
@@ -2,8 +2,6 @@ dataset_path: Helsinki-NLP/tatoeba_mt
 training_split: validation
 test_split: test
 output_type: generate_until
-dataset_kwargs:
-  trust_remote_code: true
 metric_list:
  - metric: bleu
    higher_is_better: true

--- a/lm_eval/tasks/piqa/piqa.yaml
+++ b/lm_eval/tasks/piqa/piqa.yaml
@@ -19,5 +19,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/portuguese_bench/flores_pt/_flores_common_yaml
+++ b/lm_eval/tasks/portuguese_bench/flores_pt/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true