merged main

90ad5db7 · lintangsutawika · f692caa9 · b177c82c · 90ad5db7 · 90ad5db7
Commit 90ad5db7 authored Mar 01, 2024 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_te_mc2.yaml
+++ b/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_te_mc2.yaml
@@ -3,5 +3,5 @@ task: truthfulqa_te_mc2
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: te
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
--- a/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc1.yaml
+++ b/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc1.yaml
@@ -3,5 +3,5 @@ task: truthfulqa_uk_mc1
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: uk
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
--- a/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc2.yaml
+++ b/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_uk_mc2.yaml
@@ -3,5 +3,5 @@ task: truthfulqa_uk_mc2
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: uk
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
--- a/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc1.yaml
+++ b/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc1.yaml
@@ -3,5 +3,5 @@ task: truthfulqa_vi_mc1
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: vi
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
--- a/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc2.yaml
+++ b/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_vi_mc2.yaml
@@ -3,5 +3,5 @@ task: truthfulqa_vi_mc2
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: vi
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
--- a/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc1.yaml
+++ b/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc1.yaml
@@ -3,5 +3,5 @@ task: truthfulqa_zh_mc1
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: zh
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
--- a/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc2.yaml
+++ b/lm_eval/tasks/okapi/truthfulqa_multilingual/truthfulqa_zh_mc2.yaml
@@ -3,5 +3,5 @@ task: truthfulqa_zh_mc2
 dataset_path: alexandrainst/m_truthfulqa
 dataset_name: zh
 training_split: null
-validation_split: validation
+validation_split: val
 test_split: null
--- a/lm_eval/tasks/okapi/truthfulqa_multilingual/utils.py
+++ b/lm_eval/tasks/okapi/truthfulqa_multilingual/utils.py
-import datasets
 import re
+
+import datasets
 import numpy as np

+
 QA_PROMPT = (
    "Q: What is human life expectancy in the United States?\n"
    "A: Human life expectancy in the United States is 78 years.\n\n"
@@ -17,6 +19,7 @@ QA_PROMPT = (
    "A: The 1992 Olympics were held in Barcelona, Spain."
 )

+
 def preprocess(text):
    if text is None:
        return " "

--- a/lm_eval/tasks/qasper/metrics.py
+++ b/lm_eval/tasks/qasper/metrics.py
 import re
 import string
-
 from collections import Counter



--- a/lm_eval/tasks/qasper/utils.py
+++ b/lm_eval/tasks/qasper/utils.py
-from datasets import Dataset
 from functools import partial

+from datasets import Dataset
+

 def process_docs(dataset, set_answer_type="bool"):
    FEATURES = ["title", "abstract", "question", "answer", "answer_type"]

--- a/lm_eval/tasks/realtoxicityprompts/metric.py
+++ b/lm_eval/tasks/realtoxicityprompts/metric.py
-import os
 import json
-import requests
+import os
+
 import numpy as np
+import requests

 from lm_eval.utils import eval_logger


--- a/lm_eval/tasks/scrolls/task.py
+++ b/lm_eval/tasks/scrolls/task.py
 import re
+from abc import abstractmethod
+from functools import reduce
+
 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
-
-from abc import abstractmethod
 from datasets import load_metric
 from transformers import AutoTokenizer
-from functools import reduce

-from lm_eval.api.task import Task
-from lm_eval.api.metrics import mean
 from lm_eval.api.instance import Instance
-from lm_eval.api.registry import register_task
+from lm_eval.api.metrics import mean
+from lm_eval.api.task import Task
+

 _CITATION = """
 @inproceedings{shaham-etal-2022-scrolls,
@@ -44,6 +44,7 @@ _CITATION = """
 def _download_metric():
    import os
    import shutil
+
    from huggingface_hub import hf_hub_download

    scrolls_metric_path = hf_hub_download(
@@ -148,7 +149,7 @@ class _SCROLLSTask(Task):
        del self.dataset["test"]
        for split in self.dataset:
            self.dataset[split] = _drop_duplicates_in_input(self.dataset[split])
-        if self.PRUNE_TOKENIZERS is not None and self.PRUNE_TOKENIZERS is not None:
+        if self.PRUNE_TOKENIZERS is not None:
            self.prune()

    def _get_prune_text(self, sample):

--- a/lm_eval/tasks/squadv2/task.py
+++ b/lm_eval/tasks/squadv2/task.py
@@ -13,14 +13,15 @@ also determine when no answer is supported by the paragraph and abstain from ans

 Homepage: https://rajpurkar.github.io/SQuAD-explorer/
 """
-import datasets
-
-from math import exp
 from functools import partial
+from math import exp
+
+import datasets
 from packaging import version

-from lm_eval.api.task import ConfigurableTask
 from lm_eval.api.instance import Instance
+from lm_eval.api.task import ConfigurableTask
+

 _CITATION = """
 @misc{rajpurkar2018know,
@@ -35,7 +36,6 @@ _CITATION = """


 def _squad_metric(predictions, references):
-    # squad_metric = load("squad_v2")
    squad_metric = datasets.load_metric("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)

@@ -52,7 +52,7 @@ class SQuAD2(ConfigurableTask):
    DATASET_NAME = None

    def __init__(self):
-        super().__init__(config={'metadata': {'version': self.VERSION}})
+        super().__init__(config={"metadata": {"version": self.VERSION}})

    # HF changed squad on us so we have to make sure we aren't running the old one
    assert version.parse(datasets.__version__) >= version.parse(

--- a/lm_eval/tasks/super_glue/cb/aggregate.py
+++ b/lm_eval/tasks/super_glue/cb/aggregate.py
-import sklearn
 import numpy as np
+import sklearn


 def cb_multi_fi(items):

--- a/lm_eval/tasks/super_glue/record/t5_utils.py
+++ b/lm_eval/tasks/super_glue/record/t5_utils.py
+import collections
 import re
 import string
-import collections
-import numpy as np

+import numpy as np
 from datasets import Dataset

 from lm_eval.api.metrics import metric_max_over_ground_truths

--- a/lm_eval/tasks/super_glue/wsc/t5_utils.py
+++ b/lm_eval/tasks/super_glue/wsc/t5_utils.py
 import re
 from typing import List

+
 def doc_to_text(x):
    text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
    return "wsc: " + text
@@ -23,14 +24,14 @@ def _wsc_inputs(x):
            [
                " ".join(words[:pronoun_index]),
                "X",
-                " ".join(words[pronoun_index + 1:]),
+                " ".join(words[pronoun_index + 1 :]),
            ]
        )

    # Handle some special cases.
    if (
-            x["text"]
-            == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
+        x["text"]
+        == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
    ):
        return (
            "The boy continued to whip the pony , and eventually the pony threw "
@@ -39,8 +40,8 @@ def _wsc_inputs(x):

    # Using the span2_index, we get 'use' instead of 'it'.
    if (
-            x["text"]
-            == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
+        x["text"]
+        == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
    ):
        return (
            "When they had eventually calmed down a bit , and had gotten home, "

--- a/lm_eval/tasks/truthfulqa/utils.py
+++ b/lm_eval/tasks/truthfulqa/utils.py
 import datasets
-import sacrebleu
 import numpy as np
-
+import sacrebleu
 from rouge_score import rouge_scorer, scoring



--- a/lm_eval/tasks/xwinograd/utils.py
+++ b/lm_eval/tasks/xwinograd/utils.py
@@ -51,7 +51,9 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    for lang in LANGUAGES:
        file_name = f"xwinograd_{lang}.yaml"
        try:
-            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
+            with open(
+                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8"
+            ) as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
                    {

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -5,16 +5,9 @@ import importlib.util
 import inspect
 import logging
 import os
-import pathlib
 import re
-import subprocess
-import sys
 from itertools import islice
-from typing import (
-    Any,
-    Callable,
-    List,
-)
+from typing import Any, Callable, List

 import numpy as np
 import yaml
@@ -249,7 +242,7 @@ def make_table(result_dict, column: str = "results"):
    values = []

    for k, dic in result_dict[column].items():
-        version = result_dict["versions"][k]
+        version = result_dict["versions"].get(k, "N/A")
        n = str(result_dict["n-shot"][k])

        if "alias" in dic:
@@ -297,61 +290,6 @@ def positional_deprecated(fn):
    return _wrapper


-@positional_deprecated
-def find_test_root(start_path: pathlib.Path) -> pathlib.Path:
-    """
-    Search upward in the directory tree to a maximum of three layers
-    to find and return the package root (containing the 'tests' folder)
-    """
-    cur_path = start_path.resolve()
-    max_layers = 3
-    for _ in range(max_layers):
-        if (cur_path / "tests" / "test_version_stable.py").exists():
-            return cur_path
-        else:
-            cur_path = cur_path.parent.resolve()
-    raise FileNotFoundError(
-        f"Unable to find package root within {max_layers} upwards" + f"of {start_path}"
-    )
-
-
-@positional_deprecated
-def run_task_tests(task_list: List[str]):
-    """
-    Find the package root and run the tests for the given tasks
-    """
-    import pytest
-
-    package_root = find_test_root(start_path=pathlib.Path(__file__))
-    task_string = " or ".join(task_list)
-    args = [
-        f"{package_root}/tests/test_version_stable.py",
-        f"--rootdir={package_root}",
-        "-k",
-        f"{task_string}",
-    ]
-    sys.path.append(str(package_root))
-    pytest_return_val = pytest.main(args)
-    if pytest_return_val:
-        raise ValueError(
-            f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
-        )
-
-
-def get_git_commit_hash():
-    """
-    Gets the git commit hash of your current repo (if it exists).
-    Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
-    """
-    try:
-        git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
-        git_hash = git_hash.decode()
-    except subprocess.CalledProcessError or FileNotFoundError:
-        # FileNotFoundError occurs when git not installed on system
-        git_hash = None
-    return git_hash
-
-
 def ignore_constructor(loader, node):
    return node

@@ -433,16 +371,10 @@ def apply_template(template: str, doc: dict) -> str:
    return rtemplate.render(**doc)


-def create_iterator(raw_iterator, rank, world_size, limit=None):
+def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
    """
    Method for creating a (potentially) sliced and limited
    iterator from a raw document iterator. Used for splitting data
    among ranks in multigpu setting or only pulling a sample of documents
    """
    return islice(raw_iterator, rank, limit, world_size)
-
-
-# Multi-token stopping criteria
-
-
-# from more_itertools
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,7 @@ dependencies = [
    "tqdm-multiprocess",
    "transformers>=4.1",
    "zstandard",
+    "dill",
    "word2number",
 ]

@@ -71,6 +72,7 @@ sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm<=0.2.5"]
 zeno = ["pandas", "zeno-client"]
+wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 all = [
    "lm_eval[anthropic]",
    "lm_eval[dev]",
@@ -86,11 +88,9 @@ all = [
    "lm_eval[testing]",
    "lm_eval[vllm]",
    "lm_eval[zeno]",
+    "lm_eval[wandb]",
 ]

-[tool.ruff]
-extend-exclude = ["lm_eval/tasks/*.py"]
-
 [tool.ruff.lint]
 extend-select = ["I"]

@@ -99,5 +99,4 @@ lines-after-imports = 2
 known-first-party = ["lm_eval"]

 [tool.ruff.extend-per-file-ignores]
-"__init__.py" = ["F401","F402","F403","I"]
-"lm_eval/tasks/*"= ["E721"]
+"__init__.py" = ["F401","F402","F403"]