Commit d859d1ca authored by Nathan Habib

batch commit

parent 6e49b1f6
@@ -8,7 +8,6 @@ Requires the installation of
`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`
and is included so that the bigbench dependency can be avoided.
"""
import bigbench.api.util as bb_utils
import datasets
from tqdm import tqdm
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
......
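The "Take in a YAML, and output all other splits with this YAML" docstring recurs across several task folders touched by this commit. As a rough illustration of the pattern such generator scripts follow, here is a hypothetical sketch; the argument names, split list, and emitted keys are assumptions for illustration, not the repo's actual code:

```python
import argparse
import os

import yaml  # assumes PyYAML is installed

SPLITS = ["validation", "test"]  # assumed split names

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_yaml_path", required=True)
    parser.add_argument("--save_prefix_path", default="generated")
    args = parser.parse_args()

    with open(args.base_yaml_path, encoding="utf-8") as f:
        base = yaml.safe_load(f)

    # Emit one YAML per split, inheriting everything else from the base file.
    base_name = os.path.basename(args.base_yaml_path)
    for split in SPLITS:
        out = {
            "include": base_name,
            "task": f"{base['task']}_{split}",
            "test_split": split,
        }
        with open(f"{args.save_prefix_path}_{split}.yaml", "w", encoding="utf-8") as f:
            yaml.dump(out, f)
```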
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
......
"""
"""
import re
from typing import List
......
@@ -13,7 +13,6 @@
# limitations under the License.
"""Library of instructions."""
import collections
import json
import logging
......
@@ -13,7 +13,6 @@
# limitations under the License.
"""Registry of all instructions."""
from lm_eval.tasks.ifeval import instructions
......
""" """
Take in a YAML, and output all "other" splits with this YAML Take in a YAML, and output all "other" splits with this YAML
""" """
import argparse import argparse
import logging import logging
import os import os
......
@@ -19,5 +19,3 @@ metric_list:
    higher_is_better: true
metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
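The two removed lines forwarded a flag to the `datasets` library: `dataset_kwargs` entries in a task YAML are passed through as keyword arguments to `datasets.load_dataset`, so the deleted config was roughly equivalent to the sketch below (the dataset path is a placeholder, not this task's real dataset):

```python
import datasets

# `dataset_kwargs` from the task YAML become keyword arguments here,
# so `trust_remote_code: true` turns into trust_remote_code=True.
# "org/dataset_name" is a placeholder path.
ds = datasets.load_dataset("org/dataset_name", trust_remote_code=True)
```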
"""
"""
import re
from typing import List
......
@@ -13,7 +13,6 @@ also determine when no answer is supported by the paragraph and abstain from answering.
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
from functools import partial
from math import exp
......
"""This code mirrors the utils of the original winogrande task""" """ This code mirrors the utils of the original winogrande task """
def doc_to_text(doc): def doc_to_text(doc):
......
""" """
Take in a YAML, and output all "other" splits with this YAML Take in a YAML, and output all "other" splits with this YAML
""" """
import argparse import argparse
import os import os
......
@@ -152,55 +152,6 @@ def general_detokenize(string):
    return string
-def get_file_task_name(filename: str) -> str:
-    """
-    Given the sample results filenames, extracts and returns the task name.
-    """
-    return filename[filename.find("_") + 1 : filename.rfind("_")]
-
-
-def get_file_datetime(filename: str) -> str:
-    """
-    Given the results and sample results filenames, extracts and returns the datetime.
-    """
-    return filename[filename.rfind("_") + 1 :].replace(".json", "")
-
-
-def sanitize_model_name(model_name: str) -> str:
-    """
-    Given the model name, returns a sanitized version of it.
-    """
-    return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
-
-
-def sanitize_task_name(task_name: str) -> str:
-    """
-    Given the task name, returns a sanitized version of it.
-    """
-    return re.sub(r"\W", "_", task_name)
-
-
-def get_latest_filename(filenames: List[str]) -> str:
-    """
-    Given a list of filenames, returns the filename with the latest datetime.
-    """
-    return max(filenames, key=lambda f: get_file_datetime(f))
-
-
-def get_results_filenames(filenames: List[str]) -> List[str]:
-    """
-    Extracts filenames that correspond to aggregated results.
-    """
-    return [f for f in filenames if "/results_" in f and ".json" in f]
-
-
-def get_sample_results_filenames(filenames: List[str]) -> List[str]:
-    """
-    Extracts filenames that correspond to sample results.
-    """
-    return [f for f in filenames if "/samples_" in f and ".json" in f]
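Taken together, the removed helpers composed as in this hypothetical usage sketch; the filenames are invented, but they follow the `results_<datetime>.json` / `samples_<task>_<datetime>.json` layout the string slicing assumes:

```python
files = [
    "outputs/model_a/results_2024-01-02T03-04-05.json",
    "outputs/model_a/samples_arc_easy_2024-01-02T03-04-05.json",
    "outputs/model_a/samples_arc_easy_2024-01-01T00-00-00.json",
]
results = get_results_filenames(files)         # the single results_*.json entry
samples = get_sample_results_filenames(files)  # both samples_*.json entries
latest = get_latest_filename(samples)          # max by the datetime suffix
task = get_file_task_name("samples_arc_easy_2024-01-01T00-00-00.json")  # "arc_easy"
```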
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
    """
    - context_len allows for a rolling window context, allowing each prediction window to potentially
......
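`get_rolling_token_windows`, left untouched by this commit, is the harness's way of scoring sequences longer than the model's context: it slides a window across the tokens so that every token is predicted exactly once, while up to `context_len` tokens of overlap are reused as conditioning. A simplified sketch of the idea, not the harness's exact implementation:

```python
def rolling_windows(tokens, prefix_token, max_seq_len, context_len):
    # Illustrative only (assumes max_seq_len > context_len). For a causal
    # LM, the input that predicts tokens[i:j] is tokens[i-1:j-1], with
    # prefix_token standing in at position -1; in practice only the last
    # len(targets) logits of each window are scored.
    stride = max_seq_len - context_len
    pred_start = 0
    while pred_start < len(tokens):
        pred_end = min(pred_start + stride, len(tokens))
        ctx_start = max(0, pred_start - context_len)
        prefix = [prefix_token] if pred_start == 0 else []
        inputs = prefix + tokens[ctx_start : pred_end - 1]
        targets = tokens[pred_start:pred_end]
        yield inputs, targets
        pred_start = pred_end
```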
@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165)
the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed
4) Training data sets split into more than `too_dirty_cutoff` are considered
-completely contaminated and removed
+completey contaminated and removed
OpenAI used:
```
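A hypothetical sketch of the splitting rule that steps 2–4 describe, assuming the n-gram matches have already been located as `(start, end)` character spans; `window_chars` is an assumed parameter name and the defaults are placeholders:

```python
def split_contaminated(doc: str, match_spans, window_chars=200,
                       minimum_slice_length=200, too_dirty_cutoff=10):
    # Hypothetical sketch of the decontamination splitting step.
    chunks, cursor = [], 0
    for start, end in sorted(match_spans):
        # keep the slice before the match, minus a window of characters
        chunks.append(doc[cursor : max(cursor, start - window_chars)])
        # skip past the match plus the trailing window
        cursor = max(cursor, end + window_chars)
    chunks.append(doc[cursor:])
    # 3) drop slices that are too short
    chunks = [c for c in chunks if len(c) >= minimum_slice_length]
    # 4) too many cuts -> treat the whole document as contaminated
    if len(chunks) > too_dirty_cutoff:
        return []
    return chunks
```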
@@ -2,7 +2,6 @@
Usage:
   python make_table_tasks.py --output <markdown_filename>
"""
import json
import logging
import os
......
@@ -2,7 +2,6 @@
Usage:
   python make_table_tasks.py --output <markdown_filename>
"""
import argparse
import logging
......
@@ -70,11 +70,6 @@ def main():
        if docs is not None:
            iters.append(docs)

-        if len(iters) == 0:
-            raise ValueError(
-                f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
-            )
-
        docs = join_iters(iters)
        with open(
......
@@ -7,12 +7,7 @@ from pathlib import Path
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric

-from lm_eval.utils import (
-    eval_logger,
-    get_latest_filename,
-    get_results_filenames,
-    get_sample_results_filenames,
-)
+from lm_eval.utils import eval_logger
def parse_args():
@@ -50,15 +45,13 @@ def main():
    assert len(models) > 0, "No model directories found in the data_path."
-    # Get the tasks from the latest results file of the first model.
    tasks = set(tasks_for_model(models[0], args.data_path))
-    # Get tasks names from the latest results file for each model
-    # Get intersection of tasks for all models
-    for model in models:
+    for model in models:  # Make sure that all models have the same tasks.
        old_tasks = tasks.copy()
        task_count = len(tasks)
-        model_tasks = set(tasks_for_model(model, args.data_path))
+        model_tasks = tasks_for_model(model, args.data_path)
        tasks.intersection(set(model_tasks))
        if task_count != len(tasks):
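(Note that the `tasks.intersection(set(model_tasks))` context line, unchanged on both sides of this commit, discards its result: `set.intersection` returns a new set rather than mutating `tasks`. The in-place form would be `tasks.intersection_update(model_tasks)`, or equivalently `tasks &= set(model_tasks)`.)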
@@ -73,36 +66,22 @@ def main():
    for task in tasks:
        # Upload data for all models
        for model_index, model in enumerate(models):
-            # Get latest results and sample results for a model
-            model_dir = Path(args.data_path, model)
-            model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
-            model_results_filenames = get_results_filenames(model_files)
-            model_sample_filenames = get_sample_results_filenames(model_files)
-            latest_results = get_latest_filename(
-                [Path(f).name for f in model_results_filenames]
-            )
-            latest_sample_results = get_latest_filename(
-                [Path(f).name for f in model_sample_filenames if task in f]
-            )
            model_args = re.sub(
                r"[\"<>:/\|\\?\*\[\]]+",
                "__",
                json.load(
-                    open(Path(args.data_path, model, latest_results), encoding="utf-8")
+                    open(Path(args.data_path, model, "results.json"), encoding="utf-8")
                )["config"]["model_args"],
            )
-            print(model_args)
-            data = []
            with open(
-                Path(args.data_path, model, latest_sample_results),
+                Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
                "r",
                encoding="utf-8",
            ) as file:
-                for line in file:
-                    data.append(json.loads(line.strip()))
+                data = json.loads(file.read())
            configs = json.load(
-                open(Path(args.data_path, model, latest_results), encoding="utf-8")
+                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
            )["configs"]
            config = configs[task]
config = configs[task] config = configs[task]
@@ -146,12 +125,10 @@ def tasks_for_model(model: str, data_path: str):
    Returns:
        list: A list of tasks for the model.
    """
-    # get latest model results for a given name
-    model_dir = Path(data_path, model)
-    model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
-    model_results_filenames = get_results_filenames(model_files)
-    latest_results = get_latest_filename(model_results_filenames)
-    config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
+    dir_path = Path(data_path, model)
+    config = (
+        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+    )
    return list(config[0].keys())
......
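Both `main()` and `tasks_for_model()` now key everything off a single `results.json` per model directory. For orientation, a rough sketch of the fields this script touches; the values are placeholders and unrelated top-level keys are omitted:

```python
# Rough shape of the results.json fields used above.
example_results = {
    "config": {
        "model_args": "pretrained=org/model,dtype=float16",
    },
    "configs": {
        "arc_easy": {},   # one entry per evaluated task (contents elided)
        "hellaswag": {},
    },
}

# tasks_for_model() effectively returns:
tasks = list(example_results["configs"].keys())  # ["arc_easy", "hellaswag"]
```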
@@ -23,7 +23,6 @@ DEEPSPARSE_MODELS_TASKS = [
]

-@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
    lm = get_model("sparseml").create_from_arg_string(
......