Commit 835cc40e authored by lintangsutawika's avatar lintangsutawika
Browse files

merged latest and added altworld files

parents 8da401e0 c9bbec6e
...@@ -15,3 +15,5 @@ metric_list: ...@@ -15,3 +15,5 @@ metric_list:
- metric: !function metrics.bleu - metric: !function metrics.bleu
aggregation: !function metrics.agg_bleu aggregation: !function metrics.agg_bleu
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -13,3 +13,5 @@ metric_list: ...@@ -13,3 +13,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -10,3 +10,5 @@ doc_to_target: label ...@@ -10,3 +10,5 @@ doc_to_target: label
doc_to_choice: !function utils.doc_to_choice doc_to_choice: !function utils.doc_to_choice
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 1.0
...@@ -15,3 +15,5 @@ metric_list: ...@@ -15,3 +15,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -14,3 +14,5 @@ metric_list: ...@@ -14,3 +14,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -16,3 +16,5 @@ metric_list: ...@@ -16,3 +16,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -10,7 +10,7 @@ import collections ...@@ -10,7 +10,7 @@ import collections
import importlib.util import importlib.util
import fnmatch import fnmatch
from typing import Iterator, List, Literal, Union from typing import Iterator, List, Literal, Union, Any, Callable
import gc import gc
import torch import torch
...@@ -60,7 +60,12 @@ def handle_arg_string(arg): ...@@ -60,7 +60,12 @@ def handle_arg_string(arg):
return True return True
elif arg.lower() == "false": elif arg.lower() == "false":
return False return False
return arg elif arg.isnumeric():
return int(arg)
try:
return float(arg)
except ValueError:
return arg
def simple_parse_args_string(args_string): def simple_parse_args_string(args_string):
...@@ -85,6 +90,32 @@ def join_iters(iters): ...@@ -85,6 +90,32 @@ def join_iters(iters):
def chunks(iter, n: int = 0, fn=None): def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = [] arr = []
for i, x in enumerate(iter): for i, x in enumerate(iter):
arr.append(x) arr.append(x)
...@@ -201,7 +232,13 @@ def make_disjoint_window(pair): ...@@ -201,7 +232,13 @@ def make_disjoint_window(pair):
class Reorderer: class Reorderer:
def __init__(self, arr, fn) -> None: def __init__(self, arr: List[Any], fn: Callable) -> None:
"""Reorder an array according to some function
Args:
arr (List[Any]): The initial array
fn (Callable[[Any], Any]): A function to determine the priority of elements
"""
self.size = len(arr) self.size = len(arr)
arr = list(enumerate(arr)) arr = list(enumerate(arr))
arr = group(arr, lambda x: fn(x[1])) arr = group(arr, lambda x: fn(x[1]))
...@@ -213,9 +250,22 @@ class Reorderer: ...@@ -213,9 +250,22 @@ class Reorderer:
self.arr = arr self.arr = arr
def get_reordered(self): def get_reordered(self):
"""Gets the reordered array
Returns:
List[Any]: The reordered array
"""
return [x[1] for x in self.arr] return [x[1] for x in self.arr]
def get_original(self, newarr): def get_original(self, newarr):
"""Restores the original order of a new array based on the old array's order
Args:
newarr (List[Any]): The array to be restored
Returns:
List[Any]: The array restored to the original order
"""
res = [None] * self.size res = [None] * self.size
cov = [False] * self.size cov = [False] * self.size
...@@ -296,31 +346,27 @@ def make_table(result_dict, column: str = "results"): ...@@ -296,31 +346,27 @@ def make_table(result_dict, column: str = "results"):
elif column == "groups": elif column == "groups":
column_name = "Groups" column_name = "Groups"
md_writer = MarkdownTableWriter() all_headers = [
latex_writer = LatexTableWriter()
md_writer.headers = [
column_name,
"Version",
"Filter",
"Metric",
"Value",
"",
"Stderr",
]
latex_writer.headers = [
column_name, column_name,
"Version", "Version",
"Filter", "Filter",
"n-shot",
"Metric", "Metric",
"Value", "Value",
"", "",
"Stderr", "Stderr",
] ]
md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
md_writer.headers = all_headers
latex_writer.headers = all_headers
values = [] values = []
for k, dic in result_dict[column].items(): for k, dic in result_dict[column].items():
version = result_dict["versions"][k] version = result_dict["versions"][k]
n = str(result_dict["n-shot"][k])
if "alias" in dic: if "alias" in dic:
k = dic.pop("alias") k = dic.pop("alias")
...@@ -332,9 +378,9 @@ def make_table(result_dict, column: str = "results"): ...@@ -332,9 +378,9 @@ def make_table(result_dict, column: str = "results"):
if m + "_stderr" + "," + f in dic: if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f] se = dic[m + "_stderr" + "," + f]
values.append([k, version, f, m, "%.4f" % v, "±", "%.4f" % se]) values.append([k, version, f, n, m, "%.4f" % v, "±", "%.4f" % se])
else: else:
values.append([k, version, f, m, "%.4f" % v, "", ""]) values.append([k, version, f, n, m, "%.4f" % v, "", ""])
k = "" k = ""
version = "" version = ""
md_writer.value_matrix = values md_writer.value_matrix = values
...@@ -442,7 +488,6 @@ yaml.add_constructor("!function", import_function) ...@@ -442,7 +488,6 @@ yaml.add_constructor("!function", import_function)
def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None): def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
if yaml_config is None: if yaml_config is None:
with open(yaml_path, "rb") as file: with open(yaml_path, "rb") as file:
yaml_config = yaml.full_load(file) yaml_config = yaml.full_load(file)
...@@ -463,7 +508,6 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None): ...@@ -463,7 +508,6 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
include_path.reverse() include_path.reverse()
final_yaml_config = {} final_yaml_config = {}
for path in include_path: for path in include_path:
# Assumes that path is a full path. # Assumes that path is a full path.
# If not found, assume the included yaml # If not found, assume the included yaml
# is in the same dir as the original yaml # is in the same dir as the original yaml
......
...@@ -70,7 +70,8 @@ promptsource = [ ...@@ -70,7 +70,8 @@ promptsource = [
] ]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"] gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"] anthropic = ["anthropic"]
openai = ["openai", "tiktoken"] openai = ["openai>=1.3.5", "tiktoken"]
vllm = ["vllm"]
all = [ all = [
"lm_eval[dev]", "lm_eval[dev]",
"lm_eval[testing]", "lm_eval[testing]",
...@@ -80,5 +81,6 @@ all = [ ...@@ -80,5 +81,6 @@ all = [
"lm_eval[promptsource]", "lm_eval[promptsource]",
"lm_eval[gptq]", "lm_eval[gptq]",
"lm_eval[anthropic]", "lm_eval[anthropic]",
"lm_eval[openai]" "lm_eval[openai]",
"lm_eval[vllm]",
] ]
import pytest
from typing import List
from lm_eval.api.instance import Instance
import lm_eval.tasks as tasks
import sys
import torch
@pytest.mark.skip(reason="requires CUDA")
class TestVLLM:
    """Smoke tests for the vLLM causal-LM backend.

    Builds a small pythia-70m model and runs the three request types
    (loglikelihood, generate_until, loglikelihood_rolling) over a handful
    of task instances, asserting only result count and element types.

    NOTE(review): the class was previously named ``TEST_VLLM``, which does
    not match pytest's default ``python_classes = "Test*"`` collection
    pattern (matching is case-sensitive), so these tests were never
    collected. Renamed to ``TestVLLM`` so pytest picks them up.

    NOTE(review): everything below runs at class-definition (import) time,
    even though the class is skip-marked — consider moving this setup into
    a fixture or ``setup_class`` so importing this module stays cheap.
    """

    # Skip the whole class if vllm is not installed.
    vllm = pytest.importorskip("vllm")

    try:
        from lm_eval.models.vllm_causallms import VLLM

        LM = VLLM(pretrained="EleutherAI/pythia-70m")
    except ModuleNotFoundError:
        pass

    torch.use_deterministic_algorithms(True)
    tasks.initialize_tasks()

    # 10 multiple-choice instances from arc_easy for loglikelihood requests.
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances

    # 10 gsm8k instances for generation, capped at 10 generated tokens.
    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until: List[Instance] = generate_until_task.instances

    # 10 wikitext instances for rolling loglikelihood.
    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    # TODO: make proper tests

    def test_loglikelihood(self) -> None:
        """Each multiple-choice request yields a (logprob, is_greedy) pair."""
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(res) == len(self.MULTIPLE_CH)
        for x in res:
            assert isinstance(x[0], float)

    def test_generate_until(self) -> None:
        """Each generation request yields a string continuation."""
        res = self.LM.generate_until(self.generate_until)
        assert len(res) == len(self.generate_until)
        for x in res:
            assert isinstance(x, str)

    def test_loglikelihood_rolling(self) -> None:
        """Each rolling request yields a float total loglikelihood."""
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        for x in res:
            assert isinstance(x, float)
import hashlib import hashlib
import json import json
import openai
import os import os
import pickle import pickle
import pytest import pytest
...@@ -8,6 +7,10 @@ import unittest.mock as mock ...@@ -8,6 +7,10 @@ import unittest.mock as mock
import lm_eval.models as models import lm_eval.models as models
from openai import OpenAI
client = OpenAI()
LOGLIKELIHOOD_TEST_CASES = [ LOGLIKELIHOOD_TEST_CASES = [
("The quick brown fox jumps over the lazy", " dog"), ("The quick brown fox jumps over the lazy", " dog"),
...@@ -172,7 +175,7 @@ def openai_mock_completion(**kwargs): ...@@ -172,7 +175,7 @@ def openai_mock_completion(**kwargs):
if os.path.exists(fname): if os.path.exists(fname):
with open(fname, "rb") as fh: with open(fname, "rb") as fh:
return pickle.load(fh) return pickle.load(fh)
ret = openai.Completion.create(**kwargs) ret = client.completions.create(**kwargs)
ret.api_key = "" ret.api_key = ""
with open(fname, "wb") as fh: with open(fname, "wb") as fh:
pickle.dump(ret, fh) pickle.dump(ret, fh)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment