Merge branch 'main' into metrics

# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml

Merge branch 'main' into metrics
# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
e6b798f9 · Baber · 14a29ade · 4f8195f1 · e6b798f9 · e6b798f9
Commit e6b798f9 authored Jul 25, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml
@@ -34,5 +34,3 @@ metric_list:
    ignore_punctuation: true
 metadata:
  version: 3.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py
+++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/utils.py
@@ -17,7 +17,7 @@ class MultiChoiceRegexFilter(RegexFilter):
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
-        """
+        r"""
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
@@ -90,7 +90,7 @@ class MultiChoiceRegexFilter(RegexFilter):
            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
-                f":[\s]*({without_paren_fallback_regex})"
+                rf":[\s]*({without_paren_fallback_regex})"
            )

            filtered = []

--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
@@ -30,5 +30,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 3.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/utils.py
@@ -17,7 +17,7 @@ class MultiChoiceRegexFilter(RegexFilter):
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
-        """
+        r"""
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
@@ -90,7 +90,7 @@ class MultiChoiceRegexFilter(RegexFilter):
            fallback_regex = re.compile("|".join(fallback_regexes))
            without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
            without_paren_fallback_regex = re.compile(
-                f":[\s]*({without_paren_fallback_regex})"
+                rf":[\s]*({without_paren_fallback_regex})"
            )

            filtered = []

--- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml
@@ -13,5 +13,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu/generative/_default_template_yaml
+++ b/lm_eval/tasks/mmlu/generative/_default_template_yaml
@@ -30,5 +30,3 @@ filter_list:
      - function: take_first
 metadata:
  version: 3.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mmlu_prox/mmlu_prox_config_generator.py
+++ b/lm_eval/tasks/mmlu_prox/mmlu_prox_config_generator.py
@@ -44,7 +44,7 @@ if __name__ == "__main__":
                    line = line.format(lang=lang_abbr)
                if "{ans_regex}" in line:
                    ans_regex = lang_lib_list[-1].replace(
-                        "({})", "\(?([ABCDEFGHIJ])\)?"
+                        "({})", r"\(?([ABCDEFGHIJ])\)?"
                    )
                    if lang_abbr == "en":
                        ans_regex = ans_regex.lstrip("the").strip()

--- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml
@@ -12,5 +12,3 @@ metric_list:
  - metric: acc
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml
@@ -12,5 +12,3 @@ metric_list:
  - metric: acc
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml
+++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml
@@ -12,5 +12,3 @@ metric_list:
  - metric: acc
 metadata:
  version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/mutual/mutual.yaml
+++ b/lm_eval/tasks/mutual/mutual.yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/noreval/tatoeba/_tatoeba_yaml
+++ b/lm_eval/tasks/noreval/tatoeba/_tatoeba_yaml
@@ -2,8 +2,6 @@ dataset_path: Helsinki-NLP/tatoeba_mt
 training_split: validation
 test_split: test
 output_type: generate_until
-dataset_kwargs:
-  trust_remote_code: true
 metric_list:
  - metric: bleu
    higher_is_better: true

--- a/lm_eval/tasks/piqa/piqa.yaml
+++ b/lm_eval/tasks/piqa/piqa.yaml
@@ -19,5 +19,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/portuguese_bench/flores_pt/_flores_common_yaml
+++ b/lm_eval/tasks/portuguese_bench/flores_pt/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/race/race.yaml
+++ b/lm_eval/tasks/race/race.yaml
@@ -12,5 +12,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/agi_eval/non_greedy_robustness_agieval_aqua_rat.yaml
+++ b/lm_eval/tasks/score/agi_eval/non_greedy_robustness_agieval_aqua_rat.yaml
@@ -28,9 +28,7 @@ generation_kwargs:
 process_results: !function utils_agieval.non_greedy_robustness_process_results
 metric_list:
  - metric: non_greedy_accuracy
-    aggregation:  !function utils_agieval.non_greedy_accuracy
+    aggregation: !function utils_agieval.non_greedy_accuracy
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
+++ b/lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
@@ -27,21 +27,19 @@ generation_kwargs:
 process_results: !function utils_agieval.option_order_robustness_process_results
 metric_list:
  - metric: per_option_accuracy_A
-    aggregation:  !function utils_agieval.per_option_accuracy_a
+    aggregation: !function utils_agieval.per_option_accuracy_a
    higher_is_better: true
  - metric: per_option_accuracy_B
-    aggregation:  !function utils_agieval.per_option_accuracy_b
+    aggregation: !function utils_agieval.per_option_accuracy_b
    higher_is_better: true
  - metric: per_option_accuracy_C
-    aggregation:  !function utils_agieval.per_option_accuracy_c
+    aggregation: !function utils_agieval.per_option_accuracy_c
    higher_is_better: true
  - metric: per_option_accuracy_D
-    aggregation:  !function utils_agieval.per_option_accuracy_d
+    aggregation: !function utils_agieval.per_option_accuracy_d
    higher_is_better: true
  - metric: options_consistency_rate
-    aggregation:  !function utils_agieval.options_consistency_rate
+    aggregation: !function utils_agieval.options_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
+++ b/lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
@@ -27,39 +27,37 @@ generation_kwargs:
 process_results: !function utils_agieval.prompt_robustness_process_results
 metric_list:
  - metric: 0_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_0
+    aggregation: !function utils_agieval.per_prompt_accuracy_0
    higher_is_better: true
  - metric: 1_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_1
+    aggregation: !function utils_agieval.per_prompt_accuracy_1
    higher_is_better: true
  - metric: 2_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_2
+    aggregation: !function utils_agieval.per_prompt_accuracy_2
    higher_is_better: true
  - metric: 3_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_3
+    aggregation: !function utils_agieval.per_prompt_accuracy_3
    higher_is_better: true
  - metric: 4_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_4
+    aggregation: !function utils_agieval.per_prompt_accuracy_4
    higher_is_better: true
  - metric: 5_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_5
+    aggregation: !function utils_agieval.per_prompt_accuracy_5
    higher_is_better: true
  - metric: 6_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_6
+    aggregation: !function utils_agieval.per_prompt_accuracy_6
    higher_is_better: true
  - metric: 7_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_7
+    aggregation: !function utils_agieval.per_prompt_accuracy_7
    higher_is_better: true
  - metric: 8_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_8
+    aggregation: !function utils_agieval.per_prompt_accuracy_8
    higher_is_better: true
  - metric: 9_accuracy
-    aggregation:  !function utils_agieval.per_prompt_accuracy_9
+    aggregation: !function utils_agieval.per_prompt_accuracy_9
    higher_is_better: true
  - metric: consistency_rate
-    aggregation:  !function utils_agieval.agi_eval_prompt_consistency_rate
+    aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/math/math_grader.py
+++ b/lm_eval/tasks/score/math/math_grader.py
@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
        "\\\\textit",
    ]:
        expr = expr.replace(surround_str, "")
-        pattern = f"^{surround_str}" + "\{(?P<text>.+?)\}$"
+        pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
        m = re.search(pattern, expr)
        if m is not None:
            expr = m.group("text")

-    expr = expr.replace("\!", "")
+    expr = expr.replace(r"\!", "")
    expr = expr.replace("\\%", "%")
    expr = expr.replace("\\$", "$")
    expr = expr.replace("$", "")
@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
        "p.m.",
        "PM",
    ]:
-        expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
+        expr = re.sub(rf"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)

    if "day" in expr:
        days = [
@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
        if not weekday_expressed:
            expr = re.sub("day(s)?", "", expr)

-    expr = re.sub("\^ *\\\\circ", "", expr)
+    expr = re.sub("\\^ *\\\\circ", "", expr)

    if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
        expr = expr[1:-1]

--- a/lm_eval/tasks/score/math/non_greedy_robustness_math_algebra.yaml
+++ b/lm_eval/tasks/score/math/non_greedy_robustness_math_algebra.yaml
@@ -18,7 +18,7 @@ dataset_name: algebra
 output_type: generate_until
 test_split: test
 process_docs: !function utils_math.non_greedy_robustness_process_docs
-doc_to_text:  !function utils_math.math_robustness_doc_to_text
+doc_to_text: !function utils_math.math_robustness_doc_to_text
 doc_to_target: answer
 generation_kwargs:
  max_gen_toks: 1024
@@ -28,9 +28,7 @@ generation_kwargs:
 process_results: !function utils_math.non_greedy_robustness_process_results
 metric_list:
  - metric: non_greedy_accuracy
-    aggregation:  !function utils_math.non_greedy_accuracy
+    aggregation: !function utils_math.non_greedy_accuracy
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true