gaoqiong / lm-evaluation-harness · Commits

Commit 50e99bd7
Authored Sep 20, 2023 by Herbie Bradley

Merge remote-tracking branch 'origin/big-refactor' into calibration

Parents: 3d4c4cd6, a3252ed7
Changes: 49. Showing 9 changed files on this page, with 107 additions and 93 deletions (+107 −93).
Files in this view:

lm_eval/tasks/translation/utils.py    +1   -1
lm_eval/utils.py                      +10  -10
main.py                               +2   -3
mypy.ini                              +1   -1
pyproject.toml                        +81  -0
scripts/write_out.py                  +6   -2
setup.py                              +2   -74
tests/test_evaluator.py               +2   -1
tests/utils.py                        +2   -1
lm_eval/tasks/translation/utils.py

@@ -10,7 +10,7 @@ try:
 except ModuleNotFoundError:
     raise Exception(
-        "`pycountry` is required for generating translation task prompt templates. \
-please install pycountry via pip install lm-eval[multilingua] or pip install -e .[multilingual]",
+        "`pycountry` is required for generating translation task prompt templates. \
+please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
     )
lm_eval/utils.py

@@ -16,7 +16,6 @@ import gc
 import torch
 import transformers
-from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
     args_string = args_string.strip()
     if not args_string:
         return {}
-    arg_list = args_string.split(",")
-    args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+    arg_list = [arg for arg in args_string.split(",") if arg]
+    args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
     return args_dict
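For context on this hunk: the refactor drops the OmegaConf dependency and parses argument strings with plain string splitting. Below is a minimal standalone sketch of the new behaviour, not the harness's exact code; the example input is made up.

def simple_parse_args_string(args_string: str) -> dict:
    # Strip surrounding whitespace; an empty string yields an empty dict.
    args_string = args_string.strip()
    if not args_string:
        return {}
    # Drop empty segments so a trailing comma is tolerated.
    arg_list = [arg for arg in args_string.split(",") if arg]
    # Each segment is expected to look like "key=value"; values stay strings.
    return {k: v for k, v in [arg.split("=") for arg in arg_list]}

print(simple_parse_args_string("pretrained=gpt2,dtype=float32,"))
# {'pretrained': 'gpt2', 'dtype': 'float32'}

One likely behavioural difference: OmegaConf's dotlist parsing coerces numbers and booleans, whereas the plain split keeps every value as a string.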
@@ -267,9 +266,9 @@ def make_table(result_dict, column: str = "results"):
     from pytablewriter import MarkdownTableWriter, LatexTableWriter

     if column == "results":
-        column_name = "Task"
-    elif column == "aggregate":
-        column_name = "Benchmark"
+        column_name = "Tasks"
+    elif column == "groups":
+        column_name = "Groups"
     md_writer = MarkdownTableWriter()
     latex_writer = LatexTableWriter()
@@ -395,8 +394,10 @@ def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)
-    module_name, function_name = function_name.split(".")
-    module_path = os.path.join(yaml_path, "{}.py".format(module_name))
+    *module_name, function_name = function_name.split(".")
+    if type(module_name) == list:
+        module_name = ".".join(module_name)
+    module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name)))
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     module = importlib.util.module_from_spec(spec)
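The switch to extended unpacking lets a function reference contain dots in the module part. A small illustration of the unpacking itself; the dotted name below is a hypothetical example, not taken from the repo.

# Everything before the last dot is collected into a list; the final
# component becomes the function name.
*module_name, function_name = "custom_utils.metrics.process_docs".split(".")
print(module_name)    # ['custom_utils', 'metrics']
print(function_name)  # 'process_docs'

# The list is then re-joined before the module path is built:
module_name = ".".join(module_name)  # 'custom_utils.metrics'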
@@ -430,8 +431,7 @@ def load_yaml_config(yaml_path):
     # If not found, assume the included yaml
     # is in the same dir as the original yaml
     if not os.path.isfile(path):
-        path = os.path.join(yaml_dir, path)
+        path = os.path.normpath(os.path.join(yaml_dir, path))
     try:
         included_yaml_config = load_yaml_config(path)
         final_yaml_config.update(included_yaml_config)
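Wrapping the join in os.path.normpath matters when an included YAML uses a relative path with ".." segments. A quick sketch; the directory and file names below are hypothetical.

import os

yaml_dir = "lm_eval/tasks/some_task"
include = "../_templates/default.yaml"
print(os.path.join(yaml_dir, include))
# lm_eval/tasks/some_task/../_templates/default.yaml
print(os.path.normpath(os.path.join(yaml_dir, include)))
# lm_eval/tasks/_templates/default.yaml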
main.py

@@ -11,7 +11,6 @@ from lm_eval import evaluator, utils
 from lm_eval.api.registry import ALL_TASKS
 from lm_eval.logger import eval_logger, SPACING
 from lm_eval.tasks import include_task_folder
-from lm_eval.benchmarks import include_benchmarks

 os.environ["TOKENIZERS_PARALLELISM"] = "false"

@@ -209,8 +208,8 @@ def main() -> None:
         f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
     )
     print(evaluator.make_table(results))
-    if "aggregate" in results:
-        print(evaluator.make_table(results, "aggregate"))
+    if "groups" in results:
+        print(evaluator.make_table(results, "groups"))

 if __name__ == "__main__":
mypy.ini

 [mypy]
-python_version = 3.9
+python_version = 3.8
 show_traceback = True
 check_untyped_defs = True
 no_implicit_reexport = True
pyproject.toml

[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "lm_eval"
version = "1.0.0"
authors = [{name="EleutherAI", email="contact@eleuther.ai"}]
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
    "Development Status :: 3 - Alpha",
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
requires-python = ">=3.8"
license = { "text" = "MIT" }
dependencies = [
    "accelerate>=0.21.0",
    "evaluate",
    "datasets>=2.0.0",
    "evaluate>=0.4.0",
    "jsonlines",
    "numexpr",
    "peft>=0.2.0",
    "pybind11>=2.6.2",
    "pytablewriter",
    "rouge-score>=0.0.4",
    "sacrebleu>=1.5.0",
    "scikit-learn>=0.24.1",
    "sqlitedict",
    "torch>=1.8",
    "tqdm-multiprocess",
    "transformers>=4.1",
    "zstandard",
]

[tool.setuptools]
packages = ["lm_eval"]

# required to include yaml files in pip installation
[tool.setuptools.package-data]
lm_eval = ["**/*.yaml", "tasks/**/*"]
examples = ["**/*.yaml"]

[project.scripts]
lm-eval = "main:main"
lm_eval = "main:main"

[project.urls]
Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
Repository = "https://github.com/EleutherAI/lm-evaluation-harness"

[project.optional-dependencies]
dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"]
linting = ["flake8", "pylint", "mypy", "pre-commit"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
promptsource = ["promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
all = [
    "lm_eval[dev]",
    "lm_eval[testing]",
    "lm_eval[linting]",
    "lm_eval[multilingual]",
    "lm_eval[sentencepiece]",
    "lm_eval[promptsource]",
    "lm_eval[gptq]",
    "lm_eval[anthropic]",
    "lm_eval[openai]",
]
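As a quick sanity check of the new metadata, the file can be parsed with the standard-library TOML reader. A small sketch assuming Python 3.11+ and that it is run from the repository root.

import tomllib  # standard library on Python 3.11+

with open("pyproject.toml", "rb") as f:
    meta = tomllib.load(f)

print(meta["project"]["requires-python"])                # >=3.8
print(sorted(meta["project"]["optional-dependencies"]))  # ['all', 'anthropic', 'dev', ...]
print(meta["project"]["scripts"])                        # {'lm-eval': 'main:main', 'lm_eval': 'main:main'}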
scripts/write_out.py

@@ -38,17 +38,21 @@ def main():
         iters = []

         for set in args.sets.split(","):
+            docs = None
             if set == "train" and task.has_training_docs():
                 docs = task.training_docs()
             if set == "val" and task.has_validation_docs():
                 docs = task.validation_docs()
             if set == "test" and task.has_test_docs():
                 docs = task.test_docs()
-            iters.append(docs)
+            if docs is not None:
+                iters.append(docs)

         docs = join_iters(iters)

-        with open(os.path.join(args.output_base_path, task_name), "w") as f:
+        with open(os.path.join(args.output_base_path, task_name), "w", encoding="utf8") as f:
             for i, doc in (
                 zip(range(args.num_examples), docs)
                 if args.num_examples > 0
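The last context lines of this hunk rely on zip(range(n), docs) to cap output at num_examples documents, since zip stops at the shorter iterable. A tiny illustration with made-up data:

docs = iter(["doc0", "doc1", "doc2", "doc3"])
num_examples = 2
print([doc for _, doc in zip(range(num_examples), docs)])
# ['doc0', 'doc1']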
setup.py

 import setuptools
-import itertools
-
-with open("README.md", "r", encoding="utf-8") as fh:
-    long_description = fh.read()
+
+# This is to make sure that the package supports editable installs
+setuptools.setup()
-
-extras_require = {
-    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-    "linting": ["flake8", "pylint", "mypy", "pre-commit"],
-    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
-    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"],
-    "promptsource": [
-        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-    ],
-    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-    "anthropic": ["anthropic"],
-    "openai": ["openai", "tiktoken"],
-}
-extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
-
-setuptools.setup(
-    name="lm_eval",
-    version="1.0.0",
-    author="EleutherAI",
-    author_email="contact@eleuther.ai",
-    description="A framework for evaluating language models",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/EleutherAI/lm-evaluation-harness",
-    packages=setuptools.find_packages(),
-    # required to include yaml files in pip installation
-    package_data={
-        "lm_eval": ["**/*.yaml", "tasks/**/*"],
-        "examples": ["**/*.yaml"],
-    },
-    entry_points={
-        "console_scripts": ["lm-eval = main:main", "lm_eval = main:main"],
-    },
-    include_package_data=True,
-    classifiers=[
-        "Development Status :: 3 - Alpha",
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires=">=3.9",
-    install_requires=[
-        "accelerate>=0.21.0",
-        "evaluate",
-        "datasets>=2.0.0",
-        "evaluate>=0.4.0",
-        "jsonlines",
-        "numexpr",
-        "omegaconf>=2.2",
-        "peft>=0.2.0",
-        "pybind11>=2.6.2",
-        "pytablewriter",
-        "rouge-score>=0.0.4",
-        "sacrebleu>=1.5.0",
-        "scikit-learn>=0.24.1",
-        "sqlitedict",
-        "torch>=1.8",
-        "tqdm-multiprocess",
-        "transformers>=4.1",
-        "zstandard",
-    ],
-    extras_require=extras_require,
-)
tests/test_evaluator.py

@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
+from typing import List

 import random
 import pytest

@@ -26,7 +27,7 @@ import pytest
         )
     ],
 )
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
     task_name = task_name
     limit = 10
tests/utils.py

@@ -9,6 +9,7 @@ import os
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"

 # reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:

@@ -32,7 +33,7 @@ def parser(full_path: List[str]) -> List[str]:
     return list(_output)

-def new_tasks() -> Union[list[str], None]:
+def new_tasks() -> Union[List[str], None]:
     FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
     if os.path.exists(FILENAME):
         # If tasks folder has changed then we get the list of files from FILENAME
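Both test files swap the built-in generic list[str] for typing.List[str]. This matters because subscripting the built-in list type only works on Python 3.9+, while the project now targets Python 3.8 (see the mypy.ini and pyproject.toml changes above). A short illustration; the stub function is hypothetical and only mirrors the annotated signature.

from typing import List, Union

# On Python 3.8:
#   list[str]                -> TypeError: 'type' object is not subscriptable
#   List[str]                -> works
#   Union[List[str], None]   -> equivalent to Optional[List[str]]

def new_tasks_stub() -> Union[List[str], None]:
    # Hypothetical stub demonstrating the 3.8-compatible annotation.
    return None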