Merge pull request #1 from EleutherAI/toxicity-test

Toxicity test

Merge pull request #1 from EleutherAI/toxicity-test
Toxicity test
35a24652 · Aflah · GitHub · 52213e29 · 0021de21 · 35a24652
Unverified commit 35a24652, authored Aug 15, 2023 by Aflah; committed by GitHub on Aug 15, 2023
17 changed files
--- a/lm_eval/tasks/xnli/xnli_el.yaml
+++ b/lm_eval/tasks/xnli/xnli_el.yaml
+# Generated by utils.py
+dataset_name: el
+doc_to_choice: '{{[premise+", σωστός? Ναί, "+hypothesis,premise+", σωστός? Έτσι, "+hypothesis,premise+",
+  σωστός? όχι, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_el
--- a/lm_eval/tasks/xnli/xnli_en.yaml
+++ b/lm_eval/tasks/xnli/xnli_en.yaml
+# Generated by utils.py
+dataset_name: en
+doc_to_choice: '{{[premise+", right? Yes, "+hypothesis,premise+", right? Also, "+hypothesis,premise+",
+  right? No, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_en
--- a/lm_eval/tasks/xnli/xnli_es.yaml
+++ b/lm_eval/tasks/xnli/xnli_es.yaml
+# Generated by utils.py
+dataset_name: es
+doc_to_choice: '{{[premise+", correcto? Sí, "+hypothesis,premise+", correcto? Asi
+  que, "+hypothesis,premise+", correcto? No, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_es
--- a/lm_eval/tasks/xnli/xnli_fr.yaml
+++ b/lm_eval/tasks/xnli/xnli_fr.yaml
+# Generated by utils.py
+dataset_name: fr
+doc_to_choice: '{{[premise+", correct? Oui, "+hypothesis,premise+", correct? Aussi,
+  "+hypothesis,premise+", correct? Non, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_fr
--- a/lm_eval/tasks/xnli/xnli_hi.yaml
+++ b/lm_eval/tasks/xnli/xnli_hi.yaml
+# Generated by utils.py
+dataset_name: hi
+doc_to_choice: '{{[premise+", सही? हाँ, "+hypothesis,premise+", सही? इसलिए, "+hypothesis,premise+",
+  सही? नहीं, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_hi
--- a/lm_eval/tasks/xnli/xnli_ru.yaml
+++ b/lm_eval/tasks/xnli/xnli_ru.yaml
+# Generated by utils.py
+dataset_name: ru
+doc_to_choice: '{{[premise+", правильно? Да, "+hypothesis,premise+", правильно? Так,
+  "+hypothesis,premise+", правильно? Нет, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_ru
--- a/lm_eval/tasks/xnli/xnli_sw.yaml
+++ b/lm_eval/tasks/xnli/xnli_sw.yaml
+# Generated by utils.py
+dataset_name: sw
+doc_to_choice: '{{[premise+", sahihi? Ndiyo, "+hypothesis,premise+", sahihi? Hivyo,
+  "+hypothesis,premise+", sahihi? Hapana, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_sw
--- a/lm_eval/tasks/xnli/xnli_th.yaml
+++ b/lm_eval/tasks/xnli/xnli_th.yaml
+# Generated by utils.py
+dataset_name: th
+doc_to_choice: '{{[premise+", ถูกต้อง? ใช่, "+hypothesis,premise+", ถูกต้อง? ดังนั้น,
+  "+hypothesis,premise+", ถูกต้อง? ไม่, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_th
--- a/lm_eval/tasks/xnli/xnli_tr.yaml
+++ b/lm_eval/tasks/xnli/xnli_tr.yaml
+# Generated by utils.py
+dataset_name: tr
+doc_to_choice: '{{[premise+", doğru? Evet, "+hypothesis,premise+", doğru? Böylece,
+  "+hypothesis,premise+", doğru? Hayır, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_tr
--- a/lm_eval/tasks/xnli/xnli_ur.yaml
+++ b/lm_eval/tasks/xnli/xnli_ur.yaml
+# Generated by utils.py
+dataset_name: ur
+doc_to_choice: '{{[premise+", صحیح? جی ہاں, "+hypothesis,premise+", صحیح? اس لئے,
+  "+hypothesis,premise+", صحیح? نہیں, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_ur
--- a/lm_eval/tasks/xnli/xnli_vi.yaml
+++ b/lm_eval/tasks/xnli/xnli_vi.yaml
+# Generated by utils.py
+dataset_name: vi
+doc_to_choice: '{{[premise+", đúng? Vâng, "+hypothesis,premise+", đúng? Vì vậy, "+hypothesis,premise+",
+  đúng? Không, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_vi
--- a/lm_eval/tasks/xnli/xnli_zh.yaml
+++ b/lm_eval/tasks/xnli/xnli_zh.yaml
+# Generated by utils.py
+dataset_name: zh
+doc_to_choice: '{{[premise+", 正确? 是的, "+hypothesis,premise+", 正确? 所以, "+hypothesis,premise+",
+  正确? 不是的, "+hypothesis]}}'
+doc_to_text: ''
+include: xnli_common_yaml
+task: xnli_zh
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -286,7 +286,6 @@ def make_table(result_dict, column="results"):
    latex_writer.headers = [
        column_name,
        "Version",
-        "Fewshot",
        "Filter",
        "Metric",
        "Value",
@@ -298,7 +297,6 @@ def make_table(result_dict, column="results"):

    for k, dic in result_dict[column].items():
        version = result_dict["versions"][k]
-        n = str(result_dict["configs"][k]["num_fewshot"])
        for (mf), v in dic.items():
            m, _, f = mf.partition(",")
            if m.endswith("_stderr"):
@@ -306,11 +304,10 @@ def make_table(result_dict, column="results"):

            if m + "_stderr" + "," + f in dic:
                se = dic[m + "_stderr" + "," + f]
-                values.append([k, version, n, f, m, "%.4f" % v, "±", "%.4f" % se])
+                values.append([k, version, f, m, "%.4f" % v, "±", "%.4f" % se])
            else:
-                values.append([k, version, n, f, m, "%.4f" % v, "", ""])
+                values.append([k, version, f, m, "%.4f" % v, "", ""])
            k = ""
-            n = ""
            version = ""
    md_writer.value_matrix = values
    latex_writer.value_matrix = values
@@ -459,7 +456,7 @@ env = Environment(loader=BaseLoader, undefined=StrictUndefined)
 env.filters["regex_replace"] = regex_replace


-def apply_template(template, doc):
+def apply_template(template: str, doc: dict) -> str:
    rtemplate = env.from_string(template)
    return rtemplate.render(**doc)


--- a/main.py
+++ b/main.py
@@ -32,7 +32,7 @@ def parse_args():
        default=None,
        help="Number of examples in few-shot context",
    )
-    parser.add_argument("--batch_size", type=int, default=1)  # TODO: only integers
+    parser.add_argument("--batch_size", type=str, default=1)
    parser.add_argument(
        "--max_batch_size",
        type=int,

--- a/setup.py
+++ b/setup.py
 import setuptools
+import itertools

 with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

+
+extras_require = {
+    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
+    "linting": [
+        "flake8",
+        "pylint",
+        "mypy",
+        "pre-commit",
+    ],
+    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
+    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
+    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
+    "promptsource": [
+        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
+    ],
+    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
+    "anthropic": ["anthropic"],
+    "openai": ["openai", "tiktoken"],
+}
+extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
+
+
 setuptools.setup(
    name="lm_eval",
    version="1.0.0",
@@ -15,7 +38,7 @@ setuptools.setup(
    packages=setuptools.find_packages(),
    # required to include yaml files in pip installation
    package_data={
-        "lm_eval": ["**/*.yaml"],
+        "lm_eval": ["**/*.yaml", "tasks/**/*"],
        "examples": ["**/*.yaml"],
    },
    entry_points={
@@ -36,7 +59,6 @@ setuptools.setup(
        "evaluate>=0.4.0",
        "jsonlines",
        "numexpr",
-        "openai>=0.6.4",
        "omegaconf>=2.2",
        "peft>=0.2.0",
        "pybind11>=2.6.2",
@@ -51,21 +73,5 @@ setuptools.setup(
        "transformers>=4.1",
        "zstandard",
    ],
-    extras_require={
-        "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-        "linting": [
-            "flake8",
-            "pylint",
-            "mypy",
-            "pre-commit",
-        ],
-        "testing": ["pytest", "pytest-cov", "pytest-xdist"],
-        "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-        "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
-        "promptsource": [
-            "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-        ],
-        "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-        "anthropic": ["anthropic"],
-    },
+    extras_require=extras_require,
 )
--- a/tests/extra/test_new_tasks.py
+++ b/tests/extra/test_new_tasks.py
@@ -92,7 +92,7 @@ class TestNewTasks:
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
-        if "multiple_choice" in task._config.group:
+        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
@@ -106,8 +106,8 @@ class TestNewTasks:
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
-        assert all(isinstance(label, int) for label in _array_target)
-        assert len(_array_target) == limit if limit else True
+        if task._config.output_type == "multiple_choice":
+            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if  len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
@@ -116,6 +116,7 @@ class TestNewTasks:
        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

+    # TODO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
@@ -124,5 +125,5 @@ class TestNewTasks:
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-        assert all(isinstance(doc, list) for doc in requests)
+        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -83,7 +83,7 @@ def test_create_choices(task_class, limit):
        if task.has_test_docs()
        else list(islice(task.validation_docs(), limit))
    )
-    if "multiple_choice" in task._config.group:
+    if "multiple_choice" in task._config.output_type:
        _array = [task.doc_to_choice(doc) for doc in arr]
        # assert all(len(x) == 4 for x in _array)
        assert all(isinstance(x, list) for x in _array)
@@ -98,8 +98,8 @@ def test_doc_to_target(task_class, limit):
        else list(islice(task.validation_docs(), limit))
    )
    _array_target = [task.doc_to_target(doc) for doc in arr]
-    assert all(isinstance(label, int) for label in _array_target)
-    assert len(_array_target) == limit if limit else True
+    if task._config.output_type == "multiple_choice":
+        assert all(isinstance(label, int) for label in _array_target)
    # _array_text = [task.doc_to_text(doc) for doc in arr]
    # Not working
    # assert all(tgt[0] == " " or txt[-1] == "\n" if  len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
@@ -110,6 +110,7 @@ def test_build_all_requests(task_class, limit):
    assert task_class.instances is not None


+# TODO: Add proper testing
 def test_construct_requests(task_class, limit):
    task = task_class()
    arr = (
@@ -118,7 +119,7 @@ def test_construct_requests(task_class, limit):
        else list(islice(task.validation_docs(), limit))
    )
    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-    assert all(isinstance(doc, list) for doc in requests)
+    # assert all(isinstance(doc, list) for doc in requests)
    assert len(requests) == limit if limit else True