gaoqiong / lm-evaluation-harness, commit b58e5556
Authored Jul 27, 2025 by Baber
Parents: 6e1866f5, 4f8195f1

Merge branch 'main' into tasklist

# Conflicts:
#	pyproject.toml
Changes: 340 files in the merge; showing 20 changed files with 16 additions and 36 deletions (+16 -36).
lm_eval/tasks/llama3/instruct/mmlu_de/_continuation_template_yaml   +0 -2
lm_eval/tasks/llama3/instruct/mmlu_es/_continuation_template_yaml   +0 -2
lm_eval/tasks/llama3/instruct/mmlu_fr/_continuation_template_yaml   +0 -2
lm_eval/tasks/llama3/instruct/mmlu_hi/_continuation_template_yaml   +0 -2
lm_eval/tasks/llama3/instruct/mmlu_it/_continuation_template_yaml   +0 -2
lm_eval/tasks/llama3/instruct/mmlu_pro/_default_template_yaml       +0 -2
lm_eval/tasks/llama3/instruct/mmlu_pt/_continuation_template_yaml   +0 -2
lm_eval/tasks/llama3/instruct/mmlu_th/_continuation_template_yaml   +0 -2
lm_eval/tasks/logiqa/logiqa.yaml                                    +0 -2
lm_eval/tasks/logiqa2/logieval.yaml                                 +0 -2
lm_eval/tasks/meddialog/utils.py                                    +3 -1
lm_eval/tasks/mediqa_qa2019/mediqa_qa2019_perplexity.yaml           +0 -2
lm_eval/tasks/mediqa_qa2019/utils.py                                +3 -1
lm_eval/tasks/medtext/utils.py                                      +3 -1
lm_eval/tasks/meqsum/utils.py                                       +3 -1
lm_eval/tasks/mimic_repsum/utils.py                                 +3 -1
lm_eval/tasks/minerva_math/minerva_math_algebra.yaml                +1 -3
lm_eval/tasks/mlqa/mlqa_common_yaml                                 +0 -2
lm_eval/tasks/mmlu/continuation/_continuation_template_yaml         +0 -2
lm_eval/tasks/mmlu/default/_default_template_yaml                   +0 -2
lm_eval/tasks/llama3/instruct/mmlu_de/_continuation_template_yaml

@@ -28,5 +28,3 @@ filter_list:
       - function: take_first
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
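Every `+0 -2` YAML hunk on this page deletes the same two lines. In Hugging Face `datasets`, `trust_remote_code=True` is the `load_dataset` flag that permits executing a dataset repository's custom loading script. A minimal sketch of how a task's `dataset_kwargs` stanza would be merged into the loader call; the helper function and the example `dataset_path` are illustrative, not the harness's actual code:

```python
# Sketch (assumption): mirrors how a task YAML's optional dataset_kwargs
# would be forwarded to datasets.load_dataset. Illustrative only.

def build_load_kwargs(task_config: dict) -> dict:
    """Merge a task's optional dataset_kwargs into the load_dataset arguments."""
    kwargs = {"path": task_config["dataset_path"]}
    kwargs.update(task_config.get("dataset_kwargs", {}))
    return kwargs

# Before this commit, the templates carried the now-removed stanza:
old_cfg = {
    "dataset_path": "hails/mmlu_no_train",  # illustrative dataset path
    "dataset_kwargs": {"trust_remote_code": True},
}
# After the commit, the flag is simply absent:
new_cfg = {"dataset_path": "hails/mmlu_no_train"}

assert build_load_kwargs(old_cfg)["trust_remote_code"] is True
assert "trust_remote_code" not in build_load_kwargs(new_cfg)
```

With the stanza gone, `load_dataset` falls back to its default behavior for remote code, so tasks whose datasets ship a loading script would prompt or fail rather than run it silently.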
lm_eval/tasks/llama3/instruct/mmlu_es/_continuation_template_yaml

@@ -28,5 +28,3 @@ filter_list:
       - function: take_first
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/llama3/instruct/mmlu_fr/_continuation_template_yaml

@@ -28,5 +28,3 @@ filter_list:
       - function: take_first
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/llama3/instruct/mmlu_hi/_continuation_template_yaml

@@ -28,5 +28,3 @@ filter_list:
       - function: take_first
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/llama3/instruct/mmlu_it/_continuation_template_yaml

@@ -28,5 +28,3 @@ filter_list:
       - function: take_first
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/llama3/instruct/mmlu_pro/_default_template_yaml

@@ -31,5 +31,3 @@ filter_list:
       - function: take_first
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/llama3/instruct/mmlu_pt/_continuation_template_yaml

@@ -28,5 +28,3 @@ filter_list:
       - function: take_first
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/llama3/instruct/mmlu_th/_continuation_template_yaml

@@ -28,5 +28,3 @@ filter_list:
       - function: take_first
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/logiqa/logiqa.yaml

@@ -19,5 +19,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/logiqa2/logieval.yaml

@@ -25,5 +25,3 @@ filter_list:
       - function: "take_first"
 metadata:
   version: 0.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/meddialog/utils.py

@@ -11,7 +11,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py "
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(
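The five `utils.py` hunks all make the same change: the old single install-hint string is replaced by three adjacent string literals, which Python concatenates at compile time into one message listing every metrics dependency. A minimal sketch of the pattern; the guard function and its call are illustrative, only the concatenation behavior is the point:

```python
# Sketch (assumption): an optional-dependency guard in the style of the
# utils.py files above. The function name is illustrative.
def require_metrics_deps(available: bool) -> None:
    if not available:
        # Adjacent string literals are joined at compile time, so the three
        # added lines in the diff form one single-line error message.
        raise ModuleNotFoundError(
            "Please install evaluation metrics via pip install evaluate bert-score "
            "rouge_score>=0.1.2 nltk absl-py "
            "git+https://github.com/google-research/bleurt.git"
        )

try:
    require_metrics_deps(available=False)
except ModuleNotFoundError as e:
    msg = str(e)

# The fragments merge seamlessly because each (except the last) ends in a space.
assert "bert-score rouge_score>=0.1.2 nltk absl-py git+" in msg
```

Note the trailing comma on the removed line: the old call passed the string as one argument too, so the change is purely to the message text, not the exception signature.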
lm_eval/tasks/mediqa_qa2019/mediqa_qa2019_perplexity.yaml

@@ -23,5 +23,3 @@ metric_list:
     higher_is_better: false
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/mediqa_qa2019/utils.py

@@ -11,7 +11,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py "
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(
lm_eval/tasks/medtext/utils.py

@@ -11,7 +11,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py "
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(
lm_eval/tasks/meqsum/utils.py

@@ -11,7 +11,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py "
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(
lm_eval/tasks/mimic_repsum/utils.py

@@ -15,7 +15,9 @@ try:
 except (ModuleNotFoundError, ImportError):
     raise ModuleNotFoundError(
-        "Please install evaluation metrics via pip install evaluate and pip install bert-score",
+        "Please install evaluation metrics via pip install evaluate bert-score "
+        "rouge_score>=0.1.2 nltk absl-py radgraph"
+        "git+https://github.com/google-research/bleurt.git"
     )
 except Exception as e:
     raise RuntimeError(
lm_eval/tasks/minerva_math/minerva_math_algebra.yaml

@@ -7,7 +7,7 @@ dataset_name: algebra
 output_type: generate_until
 training_split: train
 test_split: test
 doc_to_text: !function utils.doc_to_text
-doc_to_text: !function utils.doc_to_text
+process_results: !function utils.process_results
 doc_to_target: "{{answer if few_shot is undefined else solution}}"
 generation_kwargs:

@@ -25,8 +25,6 @@ metric_list:
 num_fewshot: 4
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
 fewshot_config:
   sampler: first_n
   samples: !function utils.list_fewshot_samples
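The Jinja expression in `doc_to_target` yields the short final answer for scored test docs but the full worked solution for few-shot examples, where `few_shot` is defined in the render context. A Python equivalent of that branch; the `answer`/`solution` field names come from the template above, while modeling the `few_shot` marker as a plain dict key is an illustrative simplification of Jinja's `is undefined` test:

```python
# Illustrative Python equivalent of the Jinja template
# "{{answer if few_shot is undefined else solution}}". In the harness the
# template is rendered by Jinja; this helper only mirrors its branch logic.
def doc_to_target(doc: dict) -> str:
    # Scored test docs carry no "few_shot" marker, so the short final answer
    # is the target; few-shot example docs use the full worked solution.
    return doc["answer"] if "few_shot" not in doc else doc["solution"]

test_doc = {"answer": "42", "solution": "Expand, collect terms... the answer is 42."}
fewshot_doc = {**test_doc, "few_shot": 1}

assert doc_to_target(test_doc) == "42"
assert doc_to_target(fewshot_doc).startswith("Expand")
```

This matters for few-shot prompting: the in-context examples show full chain-of-thought solutions, while the model is scored against the short answer.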
lm_eval/tasks/mlqa/mlqa_common_yaml

 dataset_path: facebook/mlqa
-dataset_kwargs:
-  trust_remote_code: true
 test_split: test
 validation_split: validation
 output_type: generate_until
lm_eval/tasks/mmlu/continuation/_continuation_template_yaml

@@ -9,5 +9,3 @@ doc_to_choice: "{{choices}}"
 doc_to_target: "{{answer}}"
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/mmlu/default/_default_template_yaml

@@ -13,5 +13,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true