Unverified commit 1d8107bf, authored by Stella Biderman, committed by GitHub

Merge pull request #362 from EleutherAI/cleanup-for-release

Cleanup `README.md` and package deps

Parents: fdd3dbc3 1e5d55d9
@@ -32,7 +32,9 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install flake8 pytest pytest-cov
-        pip install -e .[dev]
+        pip install -e .[dev,multilingual]
+        # Install optional git dependencies
+        pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: Lint with flake8
       run: |
This diff is collapsed.
This diff is collapsed.
@@ -16,6 +16,20 @@ from lm_eval import metrics
 from lm_eval.base import Task, rf
 from typing import List

+try:
+    import nagisa
+
+    HAS_NAGISA = True
+except ImportError:
+    HAS_NAGISA = False
+
+try:
+    import jieba
+
+    HAS_JIEBA = True
+except ImportError:
+    HAS_JIEBA = False
+
 _CITATION = """
 @inproceedings{post-2018-call,

@@ -63,14 +77,22 @@ def create_tasks_from_benchmarks(benchmark_dict):
 def zh_split(zh_text: List[str]) -> List[str]:
     """Chinese splitting"""
-    import jieba
+    if not HAS_JIEBA:
+        raise ImportError(
+            "Chinese text splitting requires the `jieba` package. "
+            "Please install it with:\npip install jieba"
+        )

     return [" ".join(jieba.cut(txt.strip())) for txt in zh_text]


 def ja_split(ja_text: List[str]) -> List[str]:
     """Japanese splitting"""
-    import nagisa
+    if not HAS_NAGISA:
+        raise ImportError(
+            "Japanese text splitting requires the `nagisa` package. "
+            "Please install it with:\npip install nagisa"
+        )

     return [" ".join(nagisa.tagging(txt.strip()).words) for txt in ja_text]
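Note: the guarded imports above make `jieba` and `nagisa` optional at import time; a missing package only matters once Chinese or Japanese splitting is actually requested. A minimal standalone sketch of the same pattern (the `segment` helper and example string are illustrative, not from this diff):

```python
from typing import List

try:
    import jieba  # optional; provided by the `multilingual` extra

    HAS_JIEBA = True
except ImportError:
    HAS_JIEBA = False


def segment(zh_text: List[str]) -> List[str]:
    """Whitespace-join jieba tokens so word-level BLEU can be computed."""
    if not HAS_JIEBA:
        # Importing the module never fails; only using the feature does.
        raise ImportError("Chinese splitting needs jieba: pip install jieba")
    return [" ".join(jieba.cut(txt.strip())) for txt in zh_text]


if __name__ == "__main__":
    if HAS_JIEBA:
        print(segment(["今天天气很好。"]))
    else:
        print("jieba not installed; segment() would raise ImportError")
```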
@@ -27,6 +27,14 @@ from lm_eval.base import rf, Task
 from lm_eval.metrics import mean

+try:
+    import bleurt
+
+    HAS_BLEURT = True
+except ImportError:
+    HAS_BLEURT = False
+
 _CITATION = """
 @misc{lin2021truthfulqa,
     title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},

@@ -164,6 +172,12 @@ class TruthfulQAGeneration(Task):
     def __init__(self):
         super().__init__()
+        if not HAS_BLEURT:
+            raise ImportError(
+                "`TruthfulQAGeneration` requires the `bleurt` package. Please install it with:\n"
+                "pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
+                "\nWARNING: Installing any other version of bleurt may result in different results."
+            )
         self.bleurt = datasets.load_metric("bleurt")

     def has_training_docs(self):
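Note: once the pinned `bleurt` package is installed, `datasets.load_metric("bleurt")` pulls a default BLEURT checkpoint and can score prediction/reference pairs. A rough sketch of that usage (not the harness's own scoring path; the example sentences are made up):

```python
import datasets

# Requires the pinned bleurt install from the error message above; the first
# call also downloads a default BLEURT checkpoint.
bleurt = datasets.load_metric("bleurt")

result = bleurt.compute(
    predictions=["Paris is the capital of France."],
    references=["The capital of France is Paris."],
)
print(result["scores"])  # one similarity score per prediction/reference pair
```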
...@@ -5,7 +5,6 @@ import collections ...@@ -5,7 +5,6 @@ import collections
import functools import functools
import inspect import inspect
import sys import sys
import pytest
from typing import List from typing import List
...@@ -187,6 +186,8 @@ def run_task_tests(task_list: List[str]): ...@@ -187,6 +186,8 @@ def run_task_tests(task_list: List[str]):
""" """
Find the package root and run the tests for the given tasks Find the package root and run the tests for the given tasks
""" """
import pytest
package_root = find_test_root(start_path=pathlib.Path(__file__)) package_root = find_test_root(start_path=pathlib.Path(__file__))
task_string = " or ".join(task_list) task_string = " or ".join(task_list)
args = [ args = [
......
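Note: moving `import pytest` inside `run_task_tests` means importing the library no longer requires a dev-only dependency; pytest is resolved lazily, only when task tests are actually run. A small sketch of the same deferred-import pattern (the function and arguments here are illustrative, not the harness's real code):

```python
def run_selected_tests(task_names):
    # Deferred import: users who never call this function don't need pytest installed.
    import pytest

    # Build a `-k` expression like "lambada or hellaswag" and let pytest collect
    # the matching tests; pytest.main returns a shell-style exit code.
    expression = " or ".join(task_names)
    return pytest.main(["-x", "-k", expression])


if __name__ == "__main__":
    raise SystemExit(run_selected_tests(["lambada", "hellaswag"]))
```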
"""
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import argparse
import logging
from lm_eval import tasks from lm_eval import tasks
from pytablewriter import MarkdownTableWriter from pytablewriter import MarkdownTableWriter
writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"]
values = [] logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def chk(tf): def check(tf):
if tf: if tf:
return "✓" return "✓"
else: else:
return " " return " "
for tname, Task in tasks.TASK_REGISTRY.items(): if __name__ == "__main__":
task = Task() parser = argparse.ArgumentParser()
parser.add_argument("--output", type=str, default="task_table.md")
v = [ args = parser.parse_args()
tname,
chk(task.has_training_docs()), writer = MarkdownTableWriter()
chk(task.has_validation_docs()), writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"]
chk(task.has_test_docs()), values = []
len(list(task.test_docs() if task.has_test_docs() else task.validation_docs())),
", ".join(task.aggregation().keys()), tasks = tasks.TASK_REGISTRY.items()
] tasks = sorted(tasks, key=lambda x: x[0])
print(v) for tname, Task in tasks:
values.append(v) task = Task()
v = [
writer.value_matrix = values tname,
check(task.has_training_docs()),
print(writer.dumps()) check(task.has_validation_docs()),
check(task.has_test_docs()),
len(
list(
task.test_docs() if task.has_test_docs() else task.validation_docs()
)
),
", ".join(task.aggregation().keys()),
]
logger.info(v)
values.append(v)
writer.value_matrix = values
table = writer.dumps()
with open(args.output, "w") as f:
f.write(table)
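Note: the rewritten script sorts the task registry, logs each row, and writes the table to the `--output` path instead of printing to stdout. For reference, a tiny standalone sketch of how `pytablewriter` renders such a table (the row values below are invented):

```python
from pytablewriter import MarkdownTableWriter

writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"]
writer.value_matrix = [["example_task", "✓", "✓", " ", 1000, "acc, acc_norm"]]

# dumps() returns the Markdown source for the table, which the script then
# writes to the file given via --output.
with open("task_table.md", "w") as f:
    f.write(writer.dumps())
```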
@@ -14,6 +14,7 @@ setuptools.setup(
     url="https://github.com/EleutherAI/lm-evaluation-harness",
     packages=setuptools.find_packages(),
     classifiers=[
+        "Development Status :: 3 - Alpha",
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",

@@ -21,29 +22,23 @@ setuptools.setup(
     python_requires=">=3.6",
     install_requires=[
         "datasets>=2.0.0",
-        "click>=7.1",
+        "jsonlines",
+        "numexpr",
+        "openai>=0.6.4",
+        "pybind11>=2.6.2",
+        "pycountry",
+        "pytablewriter",
+        "rouge-score>=0.0.4",
+        "sacrebleu==1.5.0",
         "scikit-learn>=0.24.1",
+        "sqlitedict",
         "torch>=1.7",
+        "tqdm-multiprocess",
         "transformers>=4.1",
-        "sqlitedict==1.6.0",
-        "pytablewriter==0.58.0",
-        "sacrebleu==1.5.0",
-        "rouge-score==0.0.4",
-        "pycountry==20.7.3",
-        "numexpr>=2.7.2",
-        "lm_dataformat==0.0.20",
-        "pybind11==2.6.2",
-        "tqdm-multiprocess==0.0.11",
-        "zstandard==0.15.2",
-        "jsonlines==2.0.0",
-        "mock==4.0.3",
-        "openai==0.6.4",
-        "jieba==0.42.1",
-        "nagisa==0.2.7",
-        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
-    ],
-    dependency_links=[
-        "https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
+        "zstandard",
     ],
-    extras_require={"dev": ["pytest", "black", "pre-commit"]},
+    extras_require={
+        "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
+        "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
+    },
 )
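Note: the dependency cleanup relaxes most version pins and moves the CJK tokenizer packages behind a `multilingual` extra. A condensed sketch of how such an `extras_require` split behaves (hypothetical package name; `pip install -e .` pulls only `install_requires`, while `pip install -e ".[dev,multilingual]"` adds the extras):

```python
import setuptools

setuptools.setup(
    name="example-harness",  # hypothetical; illustrates the extras mechanism only
    version="0.0.1",
    packages=setuptools.find_packages(),
    install_requires=["datasets>=2.0.0"],  # always installed
    extras_require={
        # installed only when the [dev] extra is requested
        "dev": ["pytest", "black"],
        # installed only when the [multilingual] extra is requested
        "multilingual": ["jieba>=0.42.1", "nagisa>=0.2.7"],
    },
)
```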
@@ -258,8 +258,9 @@ def textsynth_mock_completion(**kwargs):
     import requests

     os.makedirs("tests/testdata", exist_ok=True)
+    hash_kwargs = {k: v for k, v in kwargs.items() if k != "headers"}
     hash = hashlib.sha256(
-        json.dumps(kwargs, sort_keys=True).encode("utf-8")
+        json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
     ).hexdigest()
     fname = f"tests/testdata/textsynth_test_{hash}.pkl"
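Note: dropping `headers` before hashing keeps the cached fixture filename stable regardless of which API credentials are in the request. A small self-contained sketch of that effect (helper name and kwargs are illustrative):

```python
import hashlib
import json


def cache_name(kwargs: dict) -> str:
    # Same idea as the change above: exclude the volatile/secret "headers"
    # field so the cache key depends only on the request payload.
    hash_kwargs = {k: v for k, v in kwargs.items() if k != "headers"}
    digest = hashlib.sha256(
        json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
    ).hexdigest()
    return f"textsynth_test_{digest}.pkl"


a = cache_name({"prompt": "2+2=", "headers": {"Authorization": "Bearer key-A"}})
b = cache_name({"prompt": "2+2=", "headers": {"Authorization": "Bearer key-B"}})
assert a == b  # different API keys, same cached fixture
```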
@@ -7,10 +7,7 @@ from itertools import islice

 @pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
 def test_basic_interface(taskname, task_class):
     print("Evaluating task", taskname)
-    # dl = task_class.download
-    # task_class.download = MagicMock()
     task = task_class()
-    # task_class.download = dl

     assert task.has_training_docs() in [True, False]
     assert task.has_validation_docs() in [True, False]
@@ -51,7 +51,7 @@ def flatten(d, parent_key="", sep="."):
     items = []
     for k, v in d.items():
         new_key = parent_key + sep + k if parent_key else k
-        if isinstance(v, collections.MutableMapping):
+        if isinstance(v, collections.abc.MutableMapping):
             items.extend(flatten(v, new_key, sep=sep).items())
         else:
             items.append((new_key, v))
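Note: `collections.MutableMapping` was only an alias for `collections.abc.MutableMapping`, deprecated since Python 3.3 and removed in Python 3.10, so the qualified form is needed on newer interpreters. A quick sketch of the flattening behaviour with the fixed check (example data invented):

```python
import collections.abc


def flatten(d, parent_key="", sep="."):
    """Flatten nested mappings into a single dict with dotted keys."""
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, collections.abc.MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)


print(flatten({"results": {"lambada": {"ppl": 3.2, "acc": 0.68}}}))
# {'results.lambada.ppl': 3.2, 'results.lambada.acc': 0.68}
```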