Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into superglue

33f2f9bf · lintangsutawika · e1fdf2a8 · 7634a6ec · 33f2f9bf · 33f2f9bf
Commit 33f2f9bf authored Aug 10, 2023 by lintangsutawika
9 changed files
--- a/lm_eval/tasks/xcopa/default_tr.yaml
+++ b/lm_eval/tasks/xcopa/default_tr.yaml
+include: default_et.yaml
+task: xcopa_tr
+dataset_name: tr
+doc_to_text: !function utils.doc_to_text_tr
--- a/lm_eval/tasks/xcopa/default_vi.yaml
+++ b/lm_eval/tasks/xcopa/default_vi.yaml
+include: default_et.yaml
+task: xcopa_vi
+dataset_name: vi
+doc_to_text: !function utils.doc_to_text_vi
--- a/lm_eval/tasks/xcopa/default_zh.yaml
+++ b/lm_eval/tasks/xcopa/default_zh.yaml
+include: default_et.yaml
+task: xcopa_zh
+dataset_name: zh
+doc_to_text: !function utils.doc_to_text_zh
--- a/lm_eval/tasks/xcopa/utils.py
+++ b/lm_eval/tasks/xcopa/utils.py
+from functools import partial
+
+
+def convert_choice(choice):
+    """Return ``choice`` with its first character lower-cased.
+
+    Everything after the first character is left untouched. An empty
+    string is returned unchanged rather than raising ``IndexError``.
+    """
+    # Slice instead of indexing so that empty input is handled gracefully.
+    return choice[:1].lower() + choice[1:]
+
+
+def doc_to_text(doc, connector):
+    """Build the prompt prefix: the premise followed by a connector word.
+
+    ``connector`` maps the value of ``doc["question"]`` to the word that is
+    appended after the premise. The premise is stripped of surrounding
+    whitespace and its final character is removed before the connector is
+    added, separated by a single space.
+    """
+    link_word = connector[doc["question"]]
+    premise = doc["premise"].strip()
+    # The last character is removed unconditionally; it is expected to be
+    # the sentence-final period.
+    return f"{premise[:-1]} {link_word}"
+
+
+def doc_to_choice(doc):
+    """Return the two answer options of ``doc`` as a list.
+
+    Each option (``choice1`` then ``choice2``) is passed through
+    ``convert_choice`` before being returned.
+    """
+    return [convert_choice(doc[field]) for field in ("choice1", "choice2")]
+
+
+def _bind_connector(cause, effect):
+    """Return ``doc_to_text`` with the given connector words pre-bound."""
+    return partial(doc_to_text, connector={"cause": cause, "effect": effect})
+
+
+# One prompt builder per dataset language code. Each binds the word used for
+# "cause" questions and the word used for "effect" questions.
+doc_to_text_et = _bind_connector("sest", "seetõttu")
+doc_to_text_ht = _bind_connector("poukisa", "donk sa")
+doc_to_text_it = _bind_connector("perché", "quindi")
+doc_to_text_id = _bind_connector("karena", "maka")
+doc_to_text_qu = _bind_connector("imataq", "chaymi")
+doc_to_text_sw = _bind_connector("kwa sababu", "kwa hiyo")
+doc_to_text_zh = _bind_connector("因为", "所以")
+doc_to_text_ta = _bind_connector("காரணமாக", "எனவே")
+doc_to_text_th = _bind_connector("เพราะ", "ดังนั้น")
+doc_to_text_tr = _bind_connector("çünkü", "bu yüzden")
+doc_to_text_vi = _bind_connector("bởi vì", "vì vậy")
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -456,7 +456,7 @@ env = Environment(loader=BaseLoader, undefined=StrictUndefined)
 env.filters["regex_replace"] = regex_replace


-def apply_template(template, doc):
+def apply_template(template: str, doc: dict) -> str:
    rtemplate = env.from_string(template)
    return rtemplate.render(**doc)


--- a/main.py
+++ b/main.py
@@ -32,7 +32,7 @@ def parse_args():
        default=None,
        help="Number of examples in few-shot context",
    )
-    parser.add_argument("--batch_size", type=int, default=1)  # TODO: only integers
+    parser.add_argument("--batch_size", type=str, default=1)
    parser.add_argument(
        "--max_batch_size",
        type=int,

--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@ setuptools.setup(
    packages=setuptools.find_packages(),
    # required to include yaml files in pip installation
    package_data={
-        "lm_eval": ["**/*.yaml"],
+        "lm_eval": ["**/*.yaml", "tasks/**/*"],
        "examples": ["**/*.yaml"],
    },
    entry_points={
@@ -36,7 +36,6 @@ setuptools.setup(
        "evaluate>=0.4.0",
        "jsonlines",
        "numexpr",
-        "openai>=0.6.4",
        "omegaconf>=2.2",
        "peft>=0.2.0",
        "pybind11>=2.6.2",
@@ -67,5 +66,6 @@ setuptools.setup(
        ],
        "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
        "anthropic": ["anthropic"],
+        "openai": ["openai", "tiktoken"],
    },
 )
--- a/tests/extra/test_new_tasks.py
+++ b/tests/extra/test_new_tasks.py
@@ -92,7 +92,7 @@ class TestNewTasks:
            if task.has_test_docs()
            else list(islice(task.validation_docs(), limit))
        )
-        if "multiple_choice" in task._config.group:
+        if "multiple_choice" in task._config.output_type:
            _array = [task.doc_to_choice(doc) for doc in arr]
            # assert all(len(x) == 4 for x in _array)
            assert all(isinstance(x, list) for x in _array)
@@ -106,8 +106,8 @@ class TestNewTasks:
            else list(islice(task.validation_docs(), limit))
        )
        _array_target = [task.doc_to_target(doc) for doc in arr]
-        assert all(isinstance(label, int) for label in _array_target)
-        assert len(_array_target) == limit if limit else True
+        if task._config.output_type == "multiple_choice":
+            assert all(isinstance(label, int) for label in _array_target)
        # _array_text = [task.doc_to_text(doc) for doc in arr]
        # Not working
        # assert all(tgt[0] == " " or txt[-1] == "\n" if  len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
@@ -116,6 +116,7 @@ class TestNewTasks:
        task_class().build_all_requests(rank=1, limit=limit, world_size=1)
        assert task_class.instances is not None

+    # TODO: Add proper testing
    def test_construct_requests(self, task_class, limit):
        task = task_class()
        arr = (
@@ -124,5 +125,5 @@ class TestNewTasks:
            else list(islice(task.validation_docs(), limit))
        )
        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-        assert all(isinstance(doc, list) for doc in requests)
+        # assert all(isinstance(doc, list) for doc in requests)
        assert len(requests) == limit if limit else True
--- a/tests/test_tasks.py
+++ b/tests/test_tasks.py
@@ -83,7 +83,7 @@ def test_create_choices(task_class, limit):
        if task.has_test_docs()
        else list(islice(task.validation_docs(), limit))
    )
-    if "multiple_choice" in task._config.group:
+    if "multiple_choice" in task._config.output_type:
        _array = [task.doc_to_choice(doc) for doc in arr]
        # assert all(len(x) == 4 for x in _array)
        assert all(isinstance(x, list) for x in _array)
@@ -98,8 +98,8 @@ def test_doc_to_target(task_class, limit):
        else list(islice(task.validation_docs(), limit))
    )
    _array_target = [task.doc_to_target(doc) for doc in arr]
-    assert all(isinstance(label, int) for label in _array_target)
-    assert len(_array_target) == limit if limit else True
+    if task._config.output_type == "multiple_choice":
+        assert all(isinstance(label, int) for label in _array_target)
    # _array_text = [task.doc_to_text(doc) for doc in arr]
    # Not working
    # assert all(tgt[0] == " " or txt[-1] == "\n" if  len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
@@ -110,6 +110,7 @@ def test_build_all_requests(task_class, limit):
    assert task_class.instances is not None


+# TODO: Add proper testing
 def test_construct_requests(task_class, limit):
    task = task_class()
    arr = (
@@ -118,7 +119,7 @@ def test_construct_requests(task_class, limit):
        else list(islice(task.validation_docs(), limit))
    )
    requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-    assert all(isinstance(doc, list) for doc in requests)
+    # assert all(isinstance(doc, list) for doc in requests)
    assert len(requests) == limit if limit else True