Merge branch 'main' into metrics

# Conflicts: # .pre-commit-config.yaml # lm_eval/api/task.py # lm_eval/models/huggingface.py # lm_eval/models/vllm_causallms.py # pyproject.toml

Merge branch 'main' into metrics
# Conflicts: # .pre-commit-config.yaml # lm_eval/api/task.py # lm_eval/models/huggingface.py # lm_eval/models/vllm_causallms.py # pyproject.toml
e6b798f9 · Baber · 14a29ade · 4f8195f1 · e6b798f9 · e6b798f9
Commit e6b798f9 authored Jul 25, 2025 by Baber
13 changed files
--- a/lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
+++ b/lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs
 dataset_name: algebra
 output_type: generate_until
 test_split: test
-doc_to_text:  !function utils_math.math_robustness_doc_to_text
+doc_to_text: !function utils_math.math_robustness_doc_to_text
 process_results: !function utils_math.process_results
 doc_to_target: answer
 generation_kwargs:
@@ -28,39 +28,37 @@ generation_kwargs:
  max_gen_toks: 1024
 metric_list:
  - metric: 0_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_0
+    aggregation: !function utils_math.per_prompt_accuracy_0
    higher_is_better: true
  - metric: 1_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_1
+    aggregation: !function utils_math.per_prompt_accuracy_1
    higher_is_better: true
  - metric: 2_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_2
+    aggregation: !function utils_math.per_prompt_accuracy_2
    higher_is_better: true
  - metric: 3_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_3
+    aggregation: !function utils_math.per_prompt_accuracy_3
    higher_is_better: true
  - metric: 4_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_4
+    aggregation: !function utils_math.per_prompt_accuracy_4
    higher_is_better: true
  - metric: 5_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_5
+    aggregation: !function utils_math.per_prompt_accuracy_5
    higher_is_better: true
  - metric: 6_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_6
+    aggregation: !function utils_math.per_prompt_accuracy_6
    higher_is_better: true
  - metric: 7_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_7
+    aggregation: !function utils_math.per_prompt_accuracy_7
    higher_is_better: true
  - metric: 8_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_8
+    aggregation: !function utils_math.per_prompt_accuracy_8
    higher_is_better: true
  - metric: 9_accuracy
-    aggregation:  !function utils_math.per_prompt_accuracy_9
+    aggregation: !function utils_math.per_prompt_accuracy_9
    higher_is_better: true
  - metric: consistency_rate
-    aggregation:  !function utils_math.math_prompt_consistency_rate
+    aggregation: !function utils_math.math_prompt_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/mmlu_pro/score_non_greedy_robustness_mmlu_pro.yaml
+++ b/lm_eval/tasks/score/mmlu_pro/score_non_greedy_robustness_mmlu_pro.yaml
@@ -30,9 +30,7 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results
 metric_list:
  - metric: non_greedy_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.non_greedy_macro_accuracy
+    aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
+++ b/lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
@@ -29,39 +29,37 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.option_order_robustness_process_results
 metric_list:
  - metric: per_option_macro_accuracy_A
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_a
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
    higher_is_better: true
  - metric: per_option_macro_accuracy_B
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_b
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
    higher_is_better: true
  - metric: per_option_macro_accuracy_C
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_c
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
    higher_is_better: true
  - metric: per_option_macro_accuracy_D
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_d
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
    higher_is_better: true
  - metric: per_option_macro_accuracy_E
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_e
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
    higher_is_better: true
  - metric: per_option_macro_accuracy_F
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_f
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
    higher_is_better: true
  - metric: per_option_macro_accuracy_G
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_g
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
    higher_is_better: true
  - metric: per_option_macro_accuracy_H
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_h
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
    higher_is_better: true
  - metric: per_option_macro_accuracy_I
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_i
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
    higher_is_better: true
  - metric: per_option_macro_accuracy_J
-    aggregation:  !function utils_mmlu_pro.per_option_macro_accuracy_j
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
    higher_is_better: true
  - metric: options_consistency_rate
-    aggregation:  !function utils_mmlu_pro.options_consistency_rate
+    aggregation: !function utils_mmlu_pro.options_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
+++ b/lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
@@ -29,39 +29,37 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.prompt_robustness_process_results
 metric_list:
  - metric: 0_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_0
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
    higher_is_better: true
  - metric: 1_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_1
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
    higher_is_better: true
  - metric: 2_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_2
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
    higher_is_better: true
  - metric: 3_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_3
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
    higher_is_better: true
  - metric: 4_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_4
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
    higher_is_better: true
  - metric: 5_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_5
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
    higher_is_better: true
  - metric: 6_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_6
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
    higher_is_better: true
  - metric: 7_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_7
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
    higher_is_better: true
  - metric: 8_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_8
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
    higher_is_better: true
  - metric: 9_macro_accuracy
-    aggregation:  !function utils_mmlu_pro.per_prompt_accuracy_9
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
    higher_is_better: true
  - metric: consistency_rate
-    aggregation:  !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
+    aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/spanish_bench/flores_es/_flores_common_yaml
+++ b/lm_eval/tasks/spanish_bench/flores_es/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/unscramble/anagrams1.yaml
+++ b/lm_eval/tasks/unscramble/anagrams1.yaml
@@ -18,5 +18,3 @@ metric_list:
    ignore_punctuation: false
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/unscramble/anagrams2.yaml
+++ b/lm_eval/tasks/unscramble/anagrams2.yaml
@@ -18,5 +18,3 @@ metric_list:
    ignore_punctuation: false
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/unscramble/cycle_letters.yaml
+++ b/lm_eval/tasks/unscramble/cycle_letters.yaml
@@ -18,5 +18,3 @@ metric_list:
    ignore_punctuation: false
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/unscramble/random_insertion.yaml
+++ b/lm_eval/tasks/unscramble/random_insertion.yaml
@@ -18,5 +18,3 @@ metric_list:
    ignore_punctuation: false
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/wikitext/wikitext.yaml
+++ b/lm_eval/tasks/wikitext/wikitext.yaml
@@ -16,5 +16,3 @@ metric_list:
  - metric: bits_per_byte
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/winogrande/default.yaml
+++ b/lm_eval/tasks/winogrande/default.yaml
@@ -15,5 +15,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -29,12 +29,28 @@ HIGHER_IS_BETTER_SYMBOLS = {
 }
+def wrap_text(string: str, width: int = 140, **kwargs) -> str | None:
+    """
+    Wraps the given string to the specified width.
+    """
+    import textwrap
+    return textwrap.fill(
+        inspect.cleandoc(string),
+        width=width,
+        initial_indent="",
+        subsequent_indent=" " * 8,
+        break_long_words=False,
+        break_on_hyphens=False,
+        **kwargs,
+    )
 def setup_logging(verbosity=logging.INFO):
    # Configure the root logger
    class CustomFormatter(logging.Formatter):
        def format(self, record):
-            if record.name.startswith("lm_eval."):
+            record.name = record.name.removeprefix("lm_eval.")
-                record.name = record.name[len("lm_eval.") :]
            return super().format(record)
    formatter = CustomFormatter(

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,7 +14,7 @@ classifiers = [
 ]
 dependencies = [
  "accelerate>=0.26.0",
-  "datasets>=2.16.0",
+  "datasets>=2.16.0,<4.0",
  "evaluate>=0.4.0",
  "peft>=0.2.0",
  "pytablewriter",
@@ -55,6 +55,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 ipex = ["optimum"]
 japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
 longbench = ["jieba", "fuzzywuzzy", "rouge"]
+libra = ["pymorphy2"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
@@ -70,7 +71,9 @@ tasks = [
  "lm_eval[ifeval]",
  "lm_eval[japanese_leaderboard]",
  "lm_eval[longbench]",
-  "lm_eval[math]",
+  "lm_eval[libra]",
+    "lm_eval[mamba]",
+    "lm_eval[math]",
  "lm_eval[multilingual]",
  "lm_eval[ruler]"
 ]
@@ -98,7 +101,7 @@ plugins.md034.enabled = false # no-bare-urls
 [tool.ruff]
 target-version = "py39"
-lint.extend-select = ["I", "UP", "E", "C419", "F", "B", "SIM", "RUF034", "W605", "FURB"]
+lint.extend-select = ["I", "UP", "E", "C419", "F", "B", "SIM", "RUF034", "W605", "FURB"]
 lint.fixable = ["I001", "F401", "UP"]
 lint.ignore = ["E402", "E731", "E501", "E111", "E114", "E117", "E741"]