Merge branch 'group-agg-rework' of...

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt

Merge branch 'group-agg-rework' of...
Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
88486e57 · lintangsutawika · 5971f2ca · ba73d131 · 88486e57 · 88486e57
Commit 88486e57 authored Jul 05, 2024 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/xcopa/default_et.yaml
+++ b/lm_eval/tasks/xcopa/default_et.yaml
-group: xcopa
 task: xcopa_et
 dataset_path: xcopa
 dataset_name: et

--- a/lm_eval/tasks/xnli/_xnli.yaml
+++ b/lm_eval/tasks/xnli/_xnli.yaml
+group: xnli
+task:
+  - xnli_ar
+  - xnli_bg
+  - xnli_de
+  - xnli_el
+  - xnli_en
+  - xnli_es
+  - xnli_fr
+  - xnli_hi
+  - xnli_ru
+  - xnli_sw
+  - xnli_th
+  - xnli_tr
+  - xnli_ur
+  - xnli_vi
+  - xnli_zh
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/xnli/xnli_common_yaml
+++ b/lm_eval/tasks/xnli/xnli_common_yaml
 # This file will be included in the generated language-specific task configs.
 # It doesn't have a yaml file extension as it is not meant to be imported directly
 # by the harness.
-group: xnli
 task: null
 dataset_path: xnli
 dataset_name: null

--- a/lm_eval/tasks/xnli_eu/README.md
+++ b/lm_eval/tasks/xnli_eu/README.md
@@ -24,9 +24,9 @@ Homepage: https://github.com/hitz-zentroa/xnli-eu
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
-#### Groups
+#### Tags
 * `xnli_eu_mt_native`: Includes MT and Native variants of the XNLIeu dataset.

--- a/lm_eval/tasks/xnli_eu/xnli_common_yaml
+++ b/lm_eval/tasks/xnli_eu/xnli_common_yaml
-group: xnli
 task: null
 dataset_path: xnli
 dataset_name: null

--- a/lm_eval/tasks/xnli_eu/xnli_eu_mt.yaml
+++ b/lm_eval/tasks/xnli_eu/xnli_eu_mt.yaml
 include: xnli_eu.yaml
-group: xnli_eu_mt_native
+tag: xnli_eu_mt_native
 task: xnli_eu_mt
 dataset_name: eu_mt
--- a/lm_eval/tasks/xnli_eu/xnli_eu_native.yaml
+++ b/lm_eval/tasks/xnli_eu/xnli_eu_native.yaml
 include: xnli_eu.yaml
-group: xnli_eu_mt_native
+tag: xnli_eu_mt_native
 task: xnli_eu_native
 training_split: null
 validation_split: null

--- a/lm_eval/tasks/xstorycloze/_xstorycloze.yaml
+++ b/lm_eval/tasks/xstorycloze/_xstorycloze.yaml
+group: xstorycloze
+task:
+  - xstorycloze_ar
+  - xstorycloze_en
+  - xstorycloze_es
+  - xstorycloze_eu
+  - xstorycloze_hi
+  - xstorycloze_id
+  - xstorycloze_my
+  - xstorycloze_ru
+  - xstorycloze_sw
+  - xstorycloze_te
+  - xstorycloze_zh
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/xstorycloze/default_ar.yaml
+++ b/lm_eval/tasks/xstorycloze/default_ar.yaml
-group: xstorycloze
 task: xstorycloze_ar
 dataset_path: juletxara/xstory_cloze
 dataset_name: ar

--- a/lm_eval/tasks/xwinograd/_xwinograd.yaml
+++ b/lm_eval/tasks/xwinograd/_xwinograd.yaml
+group: xwinograd
+task:
+  - xwinograd_en
+  - xwinograd_fr
+  - xwinograd_jp
+  - xwinograd_pt
+  - xwinograd_ru
+  - xwinograd_zh
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/xwinograd/xwinograd_common_yaml
+++ b/lm_eval/tasks/xwinograd/xwinograd_common_yaml
 # This file will be included in the generated language-specific task configs.
 # It doesn't have a yaml file extension as it is not meant to be imported directly
 # by the harness.
-group:
-  - xwinograd
 dataset_path: Muennighoff/xwinograd
 dataset_name: null  # Overridden by language-specific config.
 output_type: multiple_choice

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -152,6 +152,55 @@ def general_detokenize(string):
    return string
+def get_file_task_name(filename: str) -> str:
+    """
+    Given the sample results filenames, extracts and returns the task name.
+    """
+    return filename[filename.find("_") + 1 : filename.rfind("_")]
+def get_file_datetime(filename: str) -> str:
+    """
+    Given the results and sample results filenames, extracts and returns the datetime.
+    """
+    return filename[filename.rfind("_") + 1 :].replace(".json", "")
+def sanitize_model_name(model_name: str) -> str:
+    """
+    Given the model name, returns a sanitized version of it.
+    """
+    return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
+def sanitize_task_name(task_name: str) -> str:
+    """
+    Given the task name, returns a sanitized version of it.
+    """
+    return re.sub(r"\W", "_", task_name)
+def get_latest_filename(filenames: List[str]) -> str:
+    """
+    Given a list of filenames, returns the filename with the latest datetime.
+    """
+    return max(filenames, key=lambda f: get_file_datetime(f))
+def get_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to aggregated results.
+    """
+    return [f for f in filenames if "/results_" in f and ".json" in f]
+def get_sample_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to sample results.
+    """
+    return [f for f in filenames if "/samples_" in f and ".json" in f]
 def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
    """
    - context_len allows for a rolling window context, allowing each prediction window to potentially
@@ -289,7 +338,9 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
    keys = result_dict[column].keys()
    if sort_results:
-        # sort entries alphabetically
+        # sort entries alphabetically by task or group name.
+        # NOTE: we default here to false, because order matters for multi-level table printing a la mmlu.
+        # sorting here would mess that up
        keys = sorted(keys)
    for k in keys:
        dic = result_dict[column][k]
@@ -300,20 +351,21 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
        if "alias" in dic:
            k = dic.pop("alias")
-        for (mf), v in dic.items():
+        metric_items = dic.items()
+        if sort_results:
+            metric_items = sorted(metric_items)
+        for (mf), v in metric_items:
            m, _, f = mf.partition(",")
            if m.endswith("_stderr"):
                continue
-            if v != " ":
-                v = "%.4f" % v
            hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")
            if m + "_stderr" + "," + f in dic:
                se = dic[m + "_stderr" + "," + f]
-                if se != "N/A":
+                se = "   N/A" if se == "N/A" else "%.4f" % se
-                    se = "%.4f" % se
+                values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se])
-                values.append([k, version, f, n, m, hib, v, "±", se])
            else:
                values.append([k, version, f, n, m, hib, v, "", ""])
            k = ""

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.2"
+version = "0.4.3"
 authors = [
    {name="EleutherAI", email="contact@eleuther.ai"}
 ]
@@ -39,7 +39,6 @@ dependencies = [
    "dill",
    "word2number",
    "more_itertools",
-    "shortuuid",
 ]
 [tool.setuptools.packages.find]

--- a/scripts/clean_training_data/README.md
+++ b/scripts/clean_training_data/README.md
@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
    the match, splitting the training data into chunks
   3) Any chunks less than `minimum_slice_length` are removed
   4) Training data sets split into more than `too_dirty_cutoff` are considered
-    completey contaminated and removed
+    completely contaminated and removed
 OpenAI used:
 ```

--- a/scripts/make_table_results.py
+++ b/scripts/make_table_results.py
@@ -2,6 +2,7 @@
 Usage:
   python make_table_tasks.py --output <markdown_filename>
 """
 import json
 import logging
 import os

--- a/scripts/make_table_tasks.py
+++ b/scripts/make_table_tasks.py
@@ -2,6 +2,7 @@
 Usage:
   python make_table_tasks.py --output <markdown_filename>
 """
 import argparse
 import logging

--- a/scripts/write_out.py
+++ b/scripts/write_out.py
@@ -70,6 +70,11 @@ def main():
            if docs is not None:
                iters.append(docs)
+        if len(iters) == 0:
+            raise ValueError(
+                f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
+            )
        docs = join_iters(iters)
        with open(

--- a/scripts/zeno_visualize.py
+++ b/scripts/zeno_visualize.py
@@ -7,7 +7,12 @@ from pathlib import Path
 import pandas as pd
 from zeno_client import ZenoClient, ZenoMetric
-from lm_eval.utils import eval_logger
+from lm_eval.utils import (
+    eval_logger,
+    get_latest_filename,
+    get_results_filenames,
+    get_sample_results_filenames,
+)
 def parse_args():
@@ -45,13 +50,15 @@ def main():
    assert len(models) > 0, "No model directories found in the data_path."
+    # Get the tasks from the latest results file of the first model.
    tasks = set(tasks_for_model(models[0], args.data_path))
-    for model in models:  # Make sure that all models have the same tasks.
+    # Get tasks names from the latest results file for each model
+    # Get intersection of tasks for all models
+    for model in models:
        old_tasks = tasks.copy()
        task_count = len(tasks)
+        model_tasks = set(tasks_for_model(model, args.data_path))
-        model_tasks = tasks_for_model(model, args.data_path)
        tasks.intersection(set(model_tasks))
        if task_count != len(tasks):
@@ -66,22 +73,36 @@ def main():
    for task in tasks:
        # Upload data for all models
        for model_index, model in enumerate(models):
+            # Get latest results and sample results for a model
+            model_dir = Path(args.data_path, model)
+            model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
+            model_results_filenames = get_results_filenames(model_files)
+            model_sample_filenames = get_sample_results_filenames(model_files)
+            latest_results = get_latest_filename(
+                [Path(f).name for f in model_results_filenames]
+            )
+            latest_sample_results = get_latest_filename(
+                [Path(f).name for f in model_sample_filenames if task in f]
+            )
            model_args = re.sub(
                r"[\"<>:/\|\\?\*\[\]]+",
                "__",
                json.load(
-                    open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+                    open(Path(args.data_path, model, latest_results), encoding="utf-8")
                )["config"]["model_args"],
            )
+            print(model_args)
+            data = []
            with open(
-                Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+                Path(args.data_path, model, latest_sample_results),
                "r",
                encoding="utf-8",
            ) as file:
-                data = json.loads(file.read())
+                for line in file:
+                    data.append(json.loads(line.strip()))
            configs = json.load(
-                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+                open(Path(args.data_path, model, latest_results), encoding="utf-8")
            )["configs"]
            config = configs[task]
@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
    Returns:
        list: A list of tasks for the model.
    """
-    dir_path = Path(data_path, model)
+    # get latest model results for a given name
-    config = (
+    model_dir = Path(data_path, model)
-        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+    model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
-    )
+    model_results_filenames = get_results_filenames(model_files)
+    latest_results = get_latest_filename(model_results_filenames)
+    config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
    return list(config[0].keys())

--- a/templates/new_yaml_task/README.md
+++ b/templates/new_yaml_task/README.md
@@ -17,12 +17,16 @@ Homepage: `homepage to the benchmark's website goes here, if applicable`
 BibTeX-formatted citation goes here
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups
 * `group_name`: `Short description`
+#### Tags
+* `tag_name`: `Short description`
 #### Tasks
 * `task_name`: `1-sentence description of what this particular task does`

--- a/tests/models/test_neuralmagic.py
+++ b/tests/models/test_neuralmagic.py
@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
 ]
+@pytest.mark.skip(reason="test failing")
 @pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
 def test_sparseml_eval(model_id, task):
    lm = get_model("sparseml").create_from_arg_string(