Unverified Commit 69952581 authored by KonradSzafer's avatar KonradSzafer Committed by GitHub
Browse files

Results filenames handling fix (#1926)

* results filenames handling moved to utils

* zeno results handling fix

* tasks_for_model backward compatibility

* results files logic moved to tasks_for_model

* moved sanitize_model_name to utils
parent 305fb636
......@@ -17,9 +17,15 @@ from huggingface_hub import (
from lm_eval.utils import (
eval_logger,
get_file_datetime,
get_file_task_name,
get_results_filenames,
get_sample_results_filenames,
handle_non_serializable,
hash_string,
sanitize_list,
sanitize_model_name,
sanitize_task_name,
)
......@@ -78,9 +84,7 @@ class GeneralConfigTracker:
"""Logs model parameters and job ID."""
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
)
self.model_name_sanitized = sanitize_model_name(self.model_name)
self.system_instruction = system_instruction
self.system_instruction_sha = (
hash_string(system_instruction) if system_instruction else None
......@@ -319,23 +323,14 @@ class EvaluationTracker:
Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
"""
def get_file_task_name(filename: str) -> str:
return filename[filename.find("_") + 1 : filename.rfind("_")]
def get_file_datetime(filename: str) -> str:
return filename[filename.rfind("_") + 1 :].replace(".json", "")
def sanitize_task_name(task_name: str) -> str:
return re.sub(r"\W", "_", task_name)
eval_logger.info("Recreating metadata card")
repo_id = (
self.hub_results_repo if self.public_repo else self.hub_results_repo_private
)
files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
results_files = [f for f in files_in_repo if "/results_" in f and ".json" in f]
sample_files = [f for f in files_in_repo if "/samples_" in f and ".json" in f]
results_files = get_results_filenames(files_in_repo)
sample_files = get_sample_results_filenames(files_in_repo)
# Build a dictionary to store the latest evaluation datetime for:
# - Each tested model and its aggregated results
......
......@@ -152,6 +152,55 @@ def general_detokenize(string):
return string
def get_file_task_name(filename: str) -> str:
    """
    Given a sample results filename of the form ``samples_<task>_<datetime>.json``,
    extract and return the task name (the text between the first and the last
    underscore).
    """
    # Everything after the first "_" and before the last "_" is the task name;
    # this tolerates task names that themselves contain underscores.
    start = filename.find("_") + 1
    end = filename.rfind("_")
    return filename[start:end]
def get_file_datetime(filename: str) -> str:
    """
    Given a results or sample-results filename, extract and return the datetime
    suffix: the part after the last underscore, with the file extension removed.
    """
    datetime_part = filename[filename.rfind("_") + 1 :]
    # Strip ".jsonl" before ".json": sample files may end in ".jsonl", and
    # ".jsonl".replace(".json", "") would otherwise leave a trailing "l"
    # glued onto the datetime.
    return datetime_part.replace(".jsonl", "").replace(".json", "")
def sanitize_model_name(model_name: str) -> str:
    """
    Given the model name, returns a sanitized version of it.

    Runs of characters that are unsafe in file or repo paths (quotes, angle
    brackets, colons, slashes, backslashes, pipes, ``?``, ``*``, and square
    brackets) are each collapsed into a double underscore.
    """
    unsafe_run = re.compile(r"[\"<>:/\|\\?\*\[\]]+")
    return unsafe_run.sub("__", model_name)
def sanitize_task_name(task_name: str) -> str:
    """
    Given the task name, returns a sanitized version of it.

    Each non-word character (anything outside ``[a-zA-Z0-9_]``) is replaced
    with a single underscore.
    """
    non_word = re.compile(r"\W")
    return non_word.sub("_", task_name)
def get_latest_filename(filenames: List[str]) -> str:
    """
    Given a list of filenames, returns the filename with the latest datetime.

    "Latest" is decided by lexicographic comparison of the datetime portion of
    each filename (as extracted by ``get_file_datetime``), which matches
    chronological order for the ISO-style timestamps used in results filenames.

    Raises:
        ValueError: if ``filenames`` is empty.
    """
    # Pass the key function directly; wrapping it in a lambda adds nothing.
    return max(filenames, key=get_file_datetime)
def get_results_filenames(filenames: List[str]) -> List[str]:
    """
    Extracts filenames that correspond to aggregated results.

    A filename qualifies when it contains a path component starting with
    ``results_`` and has a ``.json`` extension somewhere in its name.
    """

    def _is_results_file(name: str) -> bool:
        # Note the leading "/": bare filenames without a directory prefix
        # are intentionally not matched.
        return "/results_" in name and ".json" in name

    return list(filter(_is_results_file, filenames))
def get_sample_results_filenames(filenames: List[str]) -> List[str]:
    """
    Extracts filenames that correspond to sample results.

    A filename qualifies when it contains a path component starting with
    ``samples_`` and has a ``.json`` extension somewhere in its name
    (which also admits ``.jsonl`` files).
    """

    def _is_sample_file(name: str) -> bool:
        # Note the leading "/": bare filenames without a directory prefix
        # are intentionally not matched.
        return "/samples_" in name and ".json" in name

    return list(filter(_is_sample_file, filenames))
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
......
......@@ -7,7 +7,12 @@ from pathlib import Path
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
from lm_eval.utils import eval_logger
from lm_eval.utils import (
eval_logger,
get_latest_filename,
get_results_filenames,
get_sample_results_filenames,
)
def parse_args():
......@@ -45,13 +50,15 @@ def main():
assert len(models) > 0, "No model directories found in the data_path."
# Get the tasks from the latest results file of the first model.
tasks = set(tasks_for_model(models[0], args.data_path))
for model in models: # Make sure that all models have the same tasks.
# Get tasks names from the latest results file for each model
# Get intersection of tasks for all models
for model in models:
old_tasks = tasks.copy()
task_count = len(tasks)
model_tasks = tasks_for_model(model, args.data_path)
model_tasks = set(tasks_for_model(model, args.data_path))
tasks.intersection(set(model_tasks))
if task_count != len(tasks):
......@@ -66,22 +73,36 @@ def main():
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
# Get latest results and sample results for a model
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
model_sample_filenames = get_sample_results_filenames(model_files)
latest_results = get_latest_filename(
[Path(f).name for f in model_results_filenames]
)
latest_sample_results = get_latest_filename(
[Path(f).name for f in model_sample_filenames if task in f]
)
model_args = re.sub(
r"[\"<>:/\|\\?\*\[\]]+",
"__",
json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["config"]["model_args"],
)
print(model_args)
data = []
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
Path(args.data_path, model, latest_sample_results),
"r",
encoding="utf-8",
) as file:
data = json.loads(file.read())
for line in file:
data.append(json.loads(line.strip()))
configs = json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["configs"]
config = configs[task]
......@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
Returns:
list: A list of tasks for the model.
"""
dir_path = Path(data_path, model)
config = (
json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
)
# get latest model results for a given name
model_dir = Path(data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
latest_results = get_latest_filename(model_results_filenames)
config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
return list(config[0].keys())
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment