changes

e5161a6d · lintangsutawika · 57f08e40 · e5161a6d · e5161a6d · e5161a6d
Commit e5161a6d authored Jul 14, 2023 by lintangsutawika
5 changed files
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -61,6 +61,7 @@ def include_benchmarks(task_dir, benchmark_dir="benchmarks"):
        if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
            for f in file_list:
                if f.endswith(".yaml"):
+                    try:
                        benchmark_path = os.path.join(root, f)
                        with open(benchmark_path, "rb") as file:
@@ -76,6 +77,14 @@ def include_benchmarks(task_dir, benchmark_dir="benchmarks"):
                                    GROUP_REGISTRY[group].append(task)
                                else:
                                    GROUP_REGISTRY[group] = [task]
+                                    ALL_TASKS.add(group)
+                    except Exception as error:
+                        eval_logger.warning(
+                            "Failed to load benchmark in\n"
+                            f"                                 {benchmark_path}\n"
+                            "                                 Benchmark will not be added to registry\n"
+                            f"                                 Error: {error}"
+                        )
 task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"

--- a/lm_eval/tasks/benchmarks/pythia.yaml
+++ b/lm_eval/tasks/benchmarks/pythia.yaml
@@ -8,6 +8,6 @@ task:
  - winogrande
  - arc_challenge
  - arc_easy
-  - logiqa
+  # - logiqa
-  - blimp_*
+  # - blimp_*
-  - hendrycksTest*
+  # - hendrycksTest*
--- a/lm_eval/tasks/winogrande/default.yaml
+++ b/lm_eval/tasks/winogrande/default.yaml
-group:
-  - super-glue-lm-eval-v1
 task: winogrande
 dataset_path: winogrande
 dataset_name: winogrande_xl
 output_type: multiple_choice
-should_decontaminate: true
-doc_to_decontamination_query: "{{sentence}}"
 training_split: train
 validation_split: validation
+doc_to_text: !function preprocess_winogrande.doc_to_text
+doc_to_target: !function preprocess_winogrande.doc_to_target
+doc_to_choice: !function preprocess_winogrande.doc_to_choice
+should_decontaminate: true
+doc_to_decontamination_query: sentence
 metric_list:
-  - metric: exact_match
+  - metric: acc
    aggregation: mean
    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
--- a/lm_eval/tasks/winogrande/preprocess.py
+++ b/lm_eval/tasks/winogrande/preprocess.py
-import re
-from lm_eval.utils import general_detokenize
-def partial_context(doc, option):
-    # Substitute the pronoun in the sentence with the specified option
-    # and ignore everything after.
-    pronoun_loc = doc["sentence"].index("_")
-    return doc["sentence"][:pronoun_loc] + option
-def partial_target(doc):
-    # The target is everything after the document specified pronoun.
-    pronoun_loc = doc["sentence"].index("_") + 1
-    return " " + doc["sentence"][pronoun_loc:].strip()
--- a/lm_eval/tasks/winogrande/winogrande.yaml
+++ b/lm_eval/tasks/winogrande/winogrande.yaml
-task: winogrande
-dataset_path: winogrande
-dataset_name: winogrande_xl
-output_type: multiple_choice
-training_split: train
-validation_split: validation
-doc_to_text: !function preprocess_winogrande.doc_to_text
-doc_to_target: !function preprocess_winogrande.doc_to_target
-doc_to_choice: !function preprocess_winogrande.doc_to_choice
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true