Unverified Commit da211969 authored by Jess, committed by GitHub

Merge branch 'EleutherAI:main' into main

parents 1b97e487 801322e0
include: _paloma_template
task: paloma_wikitext_103
task_alias: Wikitext-103
dataset_name: wikitext_103
@@ -19,3 +19,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
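A note on the `dataset_kwargs` lines added above: in lm-evaluation-harness, keys under `dataset_kwargs` are forwarded to `datasets.load_dataset` when the task's data is loaded, so the change is roughly equivalent to the call sketched below. The dataset path and config name are illustrative placeholders, not values taken from this diff.

```python
from datasets import load_dataset

# Rough equivalent of the YAML change: dataset_kwargs entries become keyword
# arguments to load_dataset. Path and config below are hypothetical placeholders.
ds = load_dataset(
    "some-org/some-dataset",   # hypothetical dataset_path
    name="some_config",        # hypothetical dataset_name
    trust_remote_code=True,    # allow the dataset's custom loading script to run
)
```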
 include: polemo2_in.yaml
 task: polemo2_out
 dataset_path: allegro/klej-polemo2-out
-dataset_name: klej-polemo2-out
+dataset_name: null
@@ -4,12 +4,12 @@ from functools import reduce

 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
-from datasets import load_metric
+from datasets import Dataset, load_metric
 from transformers import AutoTokenizer

 from lm_eval.api.instance import Instance
 from lm_eval.api.metrics import mean
-from lm_eval.api.task import Task
+from lm_eval.api.task import ConfigurableTask


 _CITATION = """
@@ -108,7 +108,7 @@ def _num_cpu_cores():
     return len(os.sched_getaffinity(0))


-class _SCROLLSTask(Task):
+class _SCROLLSTask(ConfigurableTask):
     VERSION = 2
     DATASET_PATH = "tau/scrolls"
     DATASET_NAME = None
@@ -117,7 +117,7 @@ class _SCROLLSTask(Task):
     PRUNE_NUM_PROC = None

     def __init__(self):
-        super().__init__()
+        super().__init__(config={"metadata": {"version": self.VERSION}})
         if self.DATASET_NAME is not None:
             self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
@@ -131,12 +131,26 @@ class _SCROLLSTask(Task):
         return False

     def training_docs(self):
-        for doc in self.dataset["train"]:
-            yield from self._process_doc(doc)
+        processed_docs = list(map(self._process_doc, self.dataset["train"]))
+
+        # Flatten the list of lists since _process_doc returns a list of one element.
+        processed_docs = [item for sublist in processed_docs for item in sublist]
+        processed_dict = {
+            key: [d[key] for d in processed_docs] for key in processed_docs[0]
+        }
+        return Dataset.from_dict(processed_dict)

     def validation_docs(self):
-        for doc in self.dataset["validation"]:
-            yield from self._process_doc(doc)
+        processed_docs = list(map(self._process_doc, self.dataset["validation"]))
+
+        # Flatten the list of lists since _process_doc returns a list of one element.
+        processed_docs = [item for sublist in processed_docs for item in sublist]
+        processed_dict = {
+            key: [d[key] for d in processed_docs] for key in processed_docs[0]
+        }
+        return Dataset.from_dict(processed_dict)

     def should_decontaminate(self):
         return True
...
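The rewritten `training_docs`/`validation_docs` above return a `datasets.Dataset` instead of yielding raw documents. A minimal sketch of the flatten-and-pivot step with made-up documents, assuming `_process_doc` returns a one-element list per input document:

```python
from datasets import Dataset

# Two toy documents, each already wrapped in a one-element list by _process_doc.
processed_docs = [[{"id": "a", "label": 0}], [{"id": "b", "label": 1}]]

# Flatten the list of lists, then pivot the row dicts into column lists.
processed_docs = [item for sublist in processed_docs for item in sublist]
processed_dict = {key: [d[key] for d in processed_docs] for key in processed_docs[0]}

ds = Dataset.from_dict(processed_dict)
print(ds.num_rows)  # 2
print(ds[0])        # {'id': 'a', 'label': 0}
```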
@@ -6,10 +6,7 @@ training_split: train
 validation_split: validation
 doc_to_text: "Q: {{context}} {{question}}\nA:"
 target_delimiter: " "
-doc_to_choice:
-  - "{{answerA}}"
-  - "{{answerB}}"
-  - "{{answerC}}"
+doc_to_choice: "{{[answerA, answerB, answerC]}}"
 doc_to_target: "{{ (label|int) - 1 }}"
 metric_list:
   - metric: acc
...
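The `doc_to_choice` change above replaces a per-choice list of templates with a single Jinja expression that renders the three answer fields as one list literal. A small illustration of the template semantics (not harness code; the field values are invented), showing that the rendered string can be parsed back into a list of choices:

```python
import ast
from jinja2 import Template

doc = {"answerA": "go home", "answerB": "stay", "answerC": "call a friend"}
rendered = Template("{{[answerA, answerB, answerC]}}").render(**doc)
print(rendered)                    # "['go home', 'stay', 'call a friend']" (a string)
print(ast.literal_eval(rendered))  # back to a real Python list of the three choices
```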
"""
"""
import re import re
from typing import List from typing import List
......
@@ -13,6 +13,7 @@ also determine when no answer is supported by the paragraph and abstain from answering.
 Homepage: https://rajpurkar.github.io/SQuAD-explorer/
 """
 from functools import partial
 from math import exp
...
@@ -2,49 +2,31 @@
 ### Paper

-Title: `Few-shot Learning with Multilingual Language Models`
-Abstract: `https://arxiv.org/abs/2112.10668`
+Title: `A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories`
+Abstract: `https://arxiv.org/abs/1604.01696`

-XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 non-English languages. This dataset is released by Meta AI.
+Homepage: https://cs.rochester.edu/nlp/rocstories/

-Homepage: https://github.com/facebookresearch/fairseq/pull/4820
+'Story Cloze Test' is a new commonsense reasoning framework for evaluating story understanding, story generation, and script learning. This test requires a system to choose the correct ending to a four-sentence story.

 ### Citation

 ```
-@article{DBLP:journals/corr/abs-2112-10668,
-  author     = {Xi Victoria Lin and
-                Todor Mihaylov and
-                Mikel Artetxe and
-                Tianlu Wang and
-                Shuohui Chen and
-                Daniel Simig and
-                Myle Ott and
-                Naman Goyal and
-                Shruti Bhosale and
-                Jingfei Du and
-                Ramakanth Pasunuru and
-                Sam Shleifer and
-                Punit Singh Koura and
-                Vishrav Chaudhary and
-                Brian O'Horo and
-                Jeff Wang and
-                Luke Zettlemoyer and
-                Zornitsa Kozareva and
-                Mona T. Diab and
-                Veselin Stoyanov and
-                Xian Li},
-  title      = {Few-shot Learning with Multilingual Language Models},
-  journal    = {CoRR},
-  volume     = {abs/2112.10668},
-  year       = {2021},
-  url        = {https://arxiv.org/abs/2112.10668},
-  eprinttype = {arXiv},
-  eprint     = {2112.10668},
-  timestamp  = {Tue, 04 Jan 2022 15:59:27 +0100},
-  biburl     = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
-  bibsource  = {dblp computer science bibliography, https://dblp.org}
+@misc{mostafazadeh2016corpus,
+      title={A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories},
+      author={Nasrin Mostafazadeh and
+              Nathanael Chambers and
+              Xiaodong He and
+              Devi Parikh and
+              Dhruv Batra and
+              Lucy Vanderwende and
+              Pushmeet Kohli and
+              James Allen},
+      year={2016},
+      eprint={1604.01696},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
 }
 ```
...
""" This code mirrors the utils of the original winogrande task """ """This code mirrors the utils of the original winogrande task"""
def doc_to_text(doc): def doc_to_text(doc):
......
""" """
Take in a YAML, and output all "other" splits with this YAML Take in a YAML, and output all "other" splits with this YAML
""" """
import argparse import argparse
import os import os
......
@@ -26,6 +26,11 @@ eval_logger = logging.getLogger("lm-eval")

 SPACING = " " * 47

+HIGHER_IS_BETTER_SYMBOLS = {
+    True: "↑",
+    False: "↓",
+}
+

 def hash_string(string: str) -> str:
     return hashlib.sha256(string.encode("utf-8")).hexdigest()
@@ -76,6 +81,18 @@ def handle_non_serializable(o):
         return str(o)


+def sanitize_list(sub):
+    """
+    Takes possible nested list and recursively converts all inner component to strings
+    """
+    if isinstance(sub, list):
+        return [sanitize_list(item) for item in sub]
+    if isinstance(sub, tuple):
+        return tuple(sanitize_list(item) for item in sub)
+    else:
+        return str(sub)
+
+
 def simple_parse_args_string(args_string):
     """
     Parses something like
@@ -135,6 +152,55 @@ def general_detokenize(string):
     return string


+def get_file_task_name(filename: str) -> str:
+    """
+    Given the sample results filenames, extracts and returns the task name.
+    """
+    return filename[filename.find("_") + 1 : filename.rfind("_")]
+
+
+def get_file_datetime(filename: str) -> str:
+    """
+    Given the results and sample results filenames, extracts and returns the datetime.
+    """
+    return filename[filename.rfind("_") + 1 :].replace(".json", "")
+
+
+def sanitize_model_name(model_name: str) -> str:
+    """
+    Given the model name, returns a sanitized version of it.
+    """
+    return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
+
+
+def sanitize_task_name(task_name: str) -> str:
+    """
+    Given the task name, returns a sanitized version of it.
+    """
+    return re.sub(r"\W", "_", task_name)
+
+
+def get_latest_filename(filenames: List[str]) -> str:
+    """
+    Given a list of filenames, returns the filename with the latest datetime.
+    """
+    return max(filenames, key=lambda f: get_file_datetime(f))
+
+
+def get_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to aggregated results.
+    """
+    return [f for f in filenames if "/results_" in f and ".json" in f]
+
+
+def get_sample_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to sample results.
+    """
+    return [f for f in filenames if "/samples_" in f and ".json" in f]
+
+
 def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
     """
     - context_len allows for a rolling window context, allowing each prediction window to potentially
@@ -257,6 +323,7 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
         "Filter",
         "n-shot",
         "Metric",
+        "",
         "Value",
         "",
         "Stderr",
@@ -277,22 +344,29 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
         dic = result_dict[column][k]
         version = result_dict["versions"].get(k, "N/A")
         n = str(result_dict["n-shot"][k])
+        higher_is_better = result_dict.get("higher_is_better", {}).get(k, {})

         if "alias" in dic:
             k = dic.pop("alias")

-        for (mf), v in dic.items():
+        metric_items = dic.items()
+        if sort_results:
+            metric_items = sorted(metric_items)
+
+        for (mf), v in metric_items:
             m, _, f = mf.partition(",")
             if m.endswith("_stderr"):
                 continue

+            hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")
+
             if m + "_stderr" + "," + f in dic:
                 se = dic[m + "_stderr" + "," + f]
                 if se != "N/A":
                     se = "%.4f" % se
-                values.append([k, version, f, n, m, "%.4f" % v, "±", se])
+                values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se])
             else:
-                values.append([k, version, f, n, m, "%.4f" % v, "", ""])
+                values.append([k, version, f, n, m, hib, "%.4f" % v, "", ""])
             k = ""
             version = ""
     md_writer.value_matrix = values
...
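The filename helpers added in the hunk above assume output files whose names end in a datetime that sorts lexicographically, e.g. aggregated `results_<datetime>.json` files and per-task `samples_<task>_<datetime>.json` files (the exact names here are illustrative). A short usage sketch with invented paths:

```python
from lm_eval.utils import (
    get_file_datetime,
    get_file_task_name,
    get_latest_filename,
    get_results_filenames,
)

# Invented filenames, only to illustrate the naming scheme the helpers expect.
files = [
    "outputs/my-model/results_2024-05-01T10-00-00.json",
    "outputs/my-model/results_2024-05-02T09-30-00.json",
    "outputs/my-model/samples_hellaswag_2024-05-02T09-30-00.json",
]

print(get_results_filenames(files))
# both results_*.json paths, but not the samples file
print(get_latest_filename(get_results_filenames(files)))
# 'outputs/my-model/results_2024-05-02T09-30-00.json'
print(get_file_task_name("samples_hellaswag_2024-05-02T09-30-00.json"))
# 'hellaswag'
print(get_file_datetime("samples_hellaswag_2024-05-02T09-30-00.json"))
# '2024-05-02T09-30-00'
```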
@@ -19,7 +19,7 @@ classifiers = [
 requires-python = ">=3.8"
 license = { "text" = "MIT" }
 dependencies = [
-    "accelerate>=0.21.0",
+    "accelerate>=0.26.0",
     "evaluate",
     "datasets>=2.16.0",
     "evaluate>=0.4.0",
@@ -73,7 +73,7 @@ promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98"]
 sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
-vllm = ["vllm==0.3.2"]
+vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 unitxt = ["unitxt"]
...
@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
    the match, splitting the training data into chunks
 3) Any chunks less than `minimum_slice_length` are removed
 4) Training data sets split into more than `too_dirty_cutoff` are considered
-   completey contaminated and removed
+   completely contaminated and removed

 OpenAI used:
 ```
...
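Steps 2-4 above describe the slicing behavior only in prose; a rough sketch of that logic follows, with invented names and default values (the harness's actual decontamination code differs in detail):

```python
def split_around_matches(tokens, match_spans, minimum_slice_length=200, too_dirty_cutoff=10):
    """Illustrative sketch of steps 2-4: cut a training document around contaminated
    spans, drop slices shorter than minimum_slice_length, and discard the document
    entirely if it splits into more than too_dirty_cutoff pieces.
    match_spans: list of (start, end) index pairs that overlap evaluation data.
    Default thresholds are assumptions, not values read from this repository."""
    slices, cursor = [], 0
    for start, end in sorted(match_spans):
        if start - cursor >= minimum_slice_length:  # step 3: keep only long-enough chunks
            slices.append(tokens[cursor:start])
        cursor = max(cursor, end)
    if len(tokens) - cursor >= minimum_slice_length:
        slices.append(tokens[cursor:])
    if len(slices) > too_dirty_cutoff:  # step 4: too fragmented => treat as fully contaminated
        return []
    return slices
```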
@@ -2,6 +2,7 @@
 Usage:
    python make_table_tasks.py --output <markdown_filename>
 """
 import json
 import logging
 import os
...
@@ -2,6 +2,7 @@
 Usage:
    python make_table_tasks.py --output <markdown_filename>
 """
 import argparse
 import logging
...
@@ -70,6 +70,11 @@ def main():
         if docs is not None:
             iters.append(docs)

+    if len(iters) == 0:
+        raise ValueError(
+            f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
+        )
+
     docs = join_iters(iters)

     with open(
...
@@ -7,7 +7,12 @@ from pathlib import Path

 import pandas as pd
 from zeno_client import ZenoClient, ZenoMetric

-from lm_eval.utils import eval_logger
+from lm_eval.utils import (
+    eval_logger,
+    get_latest_filename,
+    get_results_filenames,
+    get_sample_results_filenames,
+)


 def parse_args():
@@ -45,13 +50,15 @@ def main():

     assert len(models) > 0, "No model directories found in the data_path."

+    # Get the tasks from the latest results file of the first model.
     tasks = set(tasks_for_model(models[0], args.data_path))

-    for model in models:  # Make sure that all models have the same tasks.
+    # Get tasks names from the latest results file for each model
+    # Get intersection of tasks for all models
+    for model in models:
         old_tasks = tasks.copy()
         task_count = len(tasks)
-        model_tasks = tasks_for_model(model, args.data_path)
+        model_tasks = set(tasks_for_model(model, args.data_path))
         tasks.intersection(set(model_tasks))

         if task_count != len(tasks):
@@ -66,22 +73,36 @@ def main():
     for task in tasks:
         # Upload data for all models
         for model_index, model in enumerate(models):
+            # Get latest results and sample results for a model
+            model_dir = Path(args.data_path, model)
+            model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
+            model_results_filenames = get_results_filenames(model_files)
+            model_sample_filenames = get_sample_results_filenames(model_files)
+            latest_results = get_latest_filename(
+                [Path(f).name for f in model_results_filenames]
+            )
+            latest_sample_results = get_latest_filename(
+                [Path(f).name for f in model_sample_filenames if task in f]
+            )
             model_args = re.sub(
                 r"[\"<>:/\|\\?\*\[\]]+",
                 "__",
                 json.load(
-                    open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+                    open(Path(args.data_path, model, latest_results), encoding="utf-8")
                 )["config"]["model_args"],
             )
+            print(model_args)
+            data = []
             with open(
-                Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+                Path(args.data_path, model, latest_sample_results),
                 "r",
                 encoding="utf-8",
             ) as file:
-                data = json.loads(file.read())
+                for line in file:
+                    data.append(json.loads(line.strip()))

             configs = json.load(
-                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+                open(Path(args.data_path, model, latest_results), encoding="utf-8")
             )["configs"]
             config = configs[task]
@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
     Returns:
         list: A list of tasks for the model.
     """
-    dir_path = Path(data_path, model)
-    config = (
-        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
-    )
+    # get latest model results for a given name
+    model_dir = Path(data_path, model)
+    model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
+    model_results_filenames = get_results_filenames(model_files)
+    latest_results = get_latest_filename(model_results_filenames)
+    config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
     return list(config[0].keys())
...
@@ -15,11 +15,11 @@ base_url = "https://matthoffner-ggml-llm-api.hf.space"
 def gguf_completion_mock(base_url=None, **kwargs):
     # Generate a hash from the parameters
     hash_kwargs = {"base_url": base_url, **kwargs}
-    hash = hashlib.sha256(
+    parameters_hash = hashlib.sha256(
         json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
     ).hexdigest()

-    fname = f"./tests/testdata/gguf_test_{hash}.pkl"
+    fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl"

     if os.path.exists(fname):
         with open(fname, "rb") as fh:
...
 from __future__ import annotations

+import os
 import sys
 from pathlib import Path

 import numpy as np
 import torch

-import lm_eval.tasks as tasks
+from lm_eval import tasks
 from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM

+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 task_manager = tasks.TaskManager()

+TEST_STRING = "foo bar"


 class Test_HFLM:
     torch.use_deterministic_algorithms(True)
@@ -107,7 +111,7 @@ class Test_HFLM:
         file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
         file_path = file_path.resolve()
-        with open(file_path, "w") as f:
+        with open(file_path, "w", encoding="utf-8") as f:
             f.write("\n".join(str(x) for x in _res))
         assert np.allclose(_res, _RES, atol=1e-2)
         # check indices for Multiple Choice
@@ -126,19 +130,19 @@ class Test_HFLM:
         assert np.allclose(res, self.ROLLING_RES, atol=1e-1)

     def test_toc_encode(self) -> None:
-        res = self.LM.tok_encode("foo bar")
+        res = self.LM.tok_encode(TEST_STRING)
         assert res == [12110, 2534]

     def test_toc_decode(self) -> None:
         res = self.LM.tok_decode([12110, 2534])
-        assert res == "foo bar"
+        assert res == TEST_STRING

     def test_batch_encode(self) -> None:
-        res = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
+        res = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])[0].tolist()
         assert res == [[12110, 2534], [2009, 17374]]

     def test_model_generate(self) -> None:
-        context = self.LM.tok_batch_encode(["foo bar"])[0]
+        context = self.LM.tok_batch_encode([TEST_STRING])[0]
         res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
         res = self.LM.tok_decode(res[0])
         assert res == "foo bar\n<bazhang>!info bar"
 import pytest

-import lm_eval.evaluator as evaluator
+from lm_eval import evaluator
 from lm_eval.api.registry import get_model
@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
 ]


+@pytest.mark.skip(reason="test failing")
 @pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
 def test_sparseml_eval(model_id, task):
     lm = get_model("sparseml").create_from_arg_string(
...