Commit 2106fbeb authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
include: xquad_common_yaml
task: xquad_hi
dataset_name: xquad.hi
doc_to_text: "प्रसंग: {{context}}\n\nसवाल: {{question}}\n\nउत्तर:"

include: xquad_common_yaml
task: xquad_ro
dataset_name: xquad.ro
doc_to_text: "Context: {{context}}\n\nÎntrebare: {{question}}\n\nRăspuns:"

include: xquad_common_yaml
task: xquad_ru
dataset_name: xquad.ru
doc_to_text: "Контекст: {{context}}\n\nВопрос: {{question}}\n\nОтвет:"

include: xquad_common_yaml
task: xquad_th
dataset_name: xquad.th
doc_to_text: "บริบท: {{context}}\n\nคำถาม: {{question}}\n\nคำตอบ:"

include: xquad_common_yaml
task: xquad_tr
dataset_name: xquad.tr
doc_to_text: "Bağlam: {{context}}\n\nSoru: {{question}}\n\nCevap:"

include: xquad_common_yaml
task: xquad_vi
dataset_name: xquad.vi
doc_to_text: "Bối cảnh: {{context}}\n\nCâu hỏi: {{question}}\n\nTrả lời:"

include: xquad_common_yaml
task: xquad_zh
dataset_name: xquad.zh
doc_to_text: "语境: {{context}}\n\n问题: {{question}}\n\n回答:"
@@ -10,7 +10,7 @@ import os
 import re
 from dataclasses import asdict, is_dataclass
 from itertools import islice
-from typing import Any, Callable, List
+from typing import Any, Callable, Generator, List, Tuple

 import numpy as np
 import yaml
@@ -104,7 +104,8 @@ def simple_parse_args_string(args_string):
         return {}
     arg_list = [arg for arg in args_string.split(",") if arg]
     args_dict = {
-        k: handle_arg_string(v) for k, v in [arg.split("=") for arg in arg_list]
+        kv[0]: handle_arg_string("=".join(kv[1:]))
+        for kv in [arg.split("=") for arg in arg_list]
     }
     return args_dict
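The rewritten comprehension splits each key=value pair only on the first "=", so values that themselves contain "=" are no longer mangled. A minimal standalone sketch of that parsing logic, with the handle_arg_string type coercion left out:

```python
# Standalone sketch of the revised parsing; the real helper additionally runs
# each value through handle_arg_string to coerce bools and numbers.
def parse_args_string(args_string: str) -> dict:
    if not args_string:
        return {}
    arg_list = [arg for arg in args_string.split(",") if arg]
    return {
        kv[0]: "=".join(kv[1:])  # everything after the first "=" stays intact
        for kv in [arg.split("=") for arg in arg_list]
    }


print(parse_args_string("a=1,b=x=y"))
# {'a': '1', 'b': 'x=y'}
```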
@@ -201,7 +202,9 @@ def get_sample_results_filenames(filenames: List[str]) -> List[str]:
     return [f for f in filenames if "/samples_" in f and ".json" in f]


-def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
+def get_rolling_token_windows(
+    token_list: List[int], prefix_token: int, max_seq_len: int, context_len: int
+) -> Generator[Tuple[List[int], List[int]], None, None]:
     """
     - context_len allows for a rolling window context, allowing each prediction window to potentially
       condition on some context
@@ -228,7 +231,7 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
     # Special handling for first window: predict all tokens
     first_seq_len = min(max_seq_len, len(token_list))
-    yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len])
+    yield [prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len]
     predicted += first_seq_len

     while predicted < len(token_list):
@@ -242,7 +245,9 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
         predicted += window_pred_len


-def make_disjoint_window(pair):
+def make_disjoint_window(
+    pair: Tuple[List[int], List[int]],
+) -> Tuple[List[int], List[int]]:
     """Takes output from get_rolling_token_windows and makes the context not overlap with the continuation"""
     a, b = pair
     return a[: len(a) - (len(b) - 1)], b
...
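The two helpers are typically chained: get_rolling_token_windows produces overlapping (input, prediction) windows and make_disjoint_window trims the overlap out of the context. A small usage sketch; the token ids and window sizes are arbitrary illustrative values:

```python
# Usage sketch for the now-annotated helpers from lm_eval.utils; the printed
# window contents depend on max_seq_len and context_len.
from lm_eval.utils import get_rolling_token_windows, make_disjoint_window

tokens = list(range(10))  # pretend token ids 0..9
windows = get_rolling_token_windows(
    token_list=tokens,
    prefix_token=-1,  # stand-in for a BOS/EOT token id
    max_seq_len=4,
    context_len=2,
)
for context, continuation in map(make_disjoint_window, windows):
    print(context, "->", continuation)
```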
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.5"
+version = "0.4.7"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
@@ -16,7 +16,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 license = { "text" = "MIT" }
 dependencies = [
     "accelerate>=0.26.0",
@@ -62,6 +62,7 @@ dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
 deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 hf_transfer = ["hf_transfer"]
+ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
 ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 neuronx = ["optimum[neuronx]"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
@@ -75,12 +76,15 @@ testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
+gptqmodel = ["gptqmodel>=1.0.9"]
+japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
 all = [
     "lm_eval[anthropic]",
     "lm_eval[dev]",
     "lm_eval[deepsparse]",
     "lm_eval[gptq]",
     "lm_eval[hf_transfer]",
+    "lm_eval[ibm_watsonx_ai]",
     "lm_eval[ifeval]",
     "lm_eval[mamba]",
     "lm_eval[math]",
@@ -93,6 +97,7 @@ all = [
     "lm_eval[vllm]",
     "lm_eval[zeno]",
     "lm_eval[wandb]",
+    "lm_eval[japanese_leaderboard]",
 ]

 [tool.ruff.lint]
...
@@ -55,7 +55,7 @@ def yield_pile(start_offsets=None, checkpoint_offset=None):
         print(
             "We expect the pile archives to be in the 'pile' directory, but this was not found."
         )
-        raise Exception("Pile directory not found.")
+        raise FileNotFoundError("Pile directory not found.")
     files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))
...
@@ -109,13 +109,14 @@ def main():
         if model_index == 0:  # Only need to assemble data for the first model
             metrics = []
             for metric in config["metric_list"]:
-                metrics.append(
-                    ZenoMetric(
-                        name=metric["metric"],
-                        type="mean",
-                        columns=[metric["metric"]],
-                    )
-                )
+                if metric.get("aggregation") == "mean":
+                    metrics.append(
+                        ZenoMetric(
+                            name=metric["metric"],
+                            type="mean",
+                            columns=[metric["metric"]],
+                        )
+                    )
             project = client.create_project(
                 name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
                 view="text-classification",
@@ -168,7 +169,11 @@ def generate_dataset(
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     labels = [x["target"] for x in data]
     instance = [""] * len(ids)
@@ -190,6 +195,7 @@ def generate_dataset(
     return pd.DataFrame(
         {
             "id": ids,
+            "doc_id": [x["doc_id"] for x in data],
             "data": instance,
             "input_len": [len(x) for x in instance],
             "labels": labels,
@@ -208,8 +214,15 @@ def generate_system_df(data, config):
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     system_dict = {"id": ids}
+    system_dict["doc_id"] = [x["doc_id"] for x in data]
+    if config.get("filter_list"):
+        system_dict["filter"] = [x["filter"] for x in data]
     system_dict["output"] = [""] * len(ids)

     if config["output_type"] == "loglikelihood":
@@ -228,11 +241,10 @@ def generate_system_df(data, config):
         system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
         system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]
-    metrics = {}
-    for metric in config["metric_list"]:
-        if "aggregation" in metric and metric["aggregation"] == "mean":
-            metrics[metric["metric"]] = [x[metric["metric"]] for x in data]
+    metrics = {
+        metric["metric"]: [x[metric["metric"]] for x in data]
+        for metric in config["metric_list"]
+    }
     system_dict.update(metrics)
     system_df = pd.DataFrame(system_dict)
     return system_df
...
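When a task defines filters, the same doc_id now appears once per filter, so the row ids are disambiguated by appending the filter name while the raw doc_id is kept in its own column. A minimal sketch of that id scheme (the sample dicts are illustrative values, not real harness output):

```python
# Illustrative only: how ids are disambiguated when a filter_list is configured.
data = [
    {"doc_id": 0, "filter": "strict-match"},
    {"doc_id": 0, "filter": "flexible-extract"},
]
config = {"filter_list": [{"name": "strict-match"}, {"name": "flexible-extract"}]}

ids = (
    [x["doc_id"] for x in data]
    if not config.get("filter_list")
    else [f"{x['doc_id']}.{x['filter']}" for x in data]
)
print(ids)  # ['0.strict-match', '0.flexible-extract']
```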
@@ -63,13 +63,13 @@ def test_create_payload_loglikelihood(api):
         (
             ["Hello, how are"],
             True,
-            {"max_gen_toks": 100, "temperature": 0.7},
+            {"max_gen_toks": 100, "temperature": 0.7, "until": ["hi"]},
             {
                 "prompt": "Hello, how are",
                 "model": "gpt-3.5-turbo",
                 "max_tokens": 100,
                 "temperature": 0.7,
-                "stop": ["<|endoftext|>"],
+                "stop": ["hi"],
                 "seed": 1234,
             },
         ),
@@ -82,7 +82,7 @@ def test_create_payload_loglikelihood(api):
                 "model": "gpt-3.5-turbo",
                 "max_tokens": 256,
                 "temperature": 0,
-                "stop": ["<|endoftext|>"],
+                "stop": [],
                 "seed": 1234,
             },
         ),
...
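The updated expectations reflect that generation payloads now take their stop sequences from the caller-supplied "until" list and default to an empty list instead of a hard-coded "<|endoftext|>". A hedged sketch of that mapping (build_stop is a stand-in name, not the harness's API):

```python
# Stand-in helper illustrating the expected until -> stop mapping; not the
# harness's actual implementation.
def build_stop(gen_kwargs: dict) -> list:
    return list(gen_kwargs.get("until", []))


print(build_stop({"max_gen_toks": 100, "temperature": 0.7, "until": ["hi"]}))  # ['hi']
print(build_stop({"max_gen_toks": 256}))  # []
```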
from typing import List

import pytest

import lm_eval


def assert_less_than(value, threshold, desc):
    if value is not None:
        assert float(value) < threshold, f"{desc} should be less than {threshold}"


@pytest.mark.skip(reason="requires CUDA")
class Test_GPTQModel:
    gptqmodel = pytest.importorskip("gptqmodel", minversion="1.0.9")
    MODEL_ID = "ModelCloud/Opt-125-GPTQ-4bit-10-25-2024"

    def test_gptqmodel(self) -> None:
        acc = "acc"
        acc_norm = "acc_norm"
        acc_value = None
        acc_norm_value = None

        task = "arc_easy"
        model_args = f"pretrained={self.MODEL_ID},gptqmodel=True"
        tasks: List[str] = [task]
        results = lm_eval.simple_evaluate(
            model="hf",
            model_args=model_args,
            tasks=tasks,
            device="cuda",
        )

        column = "results"
        dic = results.get(column, {}).get(task)
        if dic is not None:
            if "alias" in dic:
                _ = dic.pop("alias")

            items = sorted(dic.items())
            for k, v in items:
                # result keys look like "acc,none": metric name, then filter
                m, _, f = k.partition(",")
                if m.endswith("_stderr"):
                    continue

                if m == acc:
                    acc_value = "%.4f" % v if isinstance(v, float) else v
                if m == acc_norm:
                    acc_norm_value = "%.4f" % v if isinstance(v, float) else v

        assert_less_than(acc_value, 0.43, "acc")
        assert_less_than(acc_norm_value, 0.39, "acc_norm")
 import os
 from itertools import islice

+import datasets
 import pytest

 import lm_eval.tasks as tasks
@@ -10,6 +11,7 @@ from lm_eval.evaluator_utils import get_task_list
 from .utils import new_tasks

+datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 task_manager = tasks.TaskManager()

 # Default Task
@@ -77,10 +79,17 @@ class TestNewTasks:
         )
         _array = [task.doc_to_text(doc) for doc in arr]
         # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-        assert all(
-            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
-            for x in _array
-        )
+        target_delimiter: str = task.config.target_delimiter
+        if not task.multiple_input:
+            for x in _array:
+                assert isinstance(x, str)
+                assert (
+                    (x[-1].isspace() is False if len(x) > 0 else True)
+                    if target_delimiter.isspace()
+                    else True
+                ), "doc_to_text ends in a whitespace and target delimiter also a whitespace"
+        else:
+            pass

     def test_create_choices(self, task_class, limit):
         task = task_class
@@ -121,5 +130,11 @@ class TestNewTasks:
             if task.has_test_docs()
             else list(islice(task.validation_docs(), limit))
         )
-        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        # ctx is "" for multiple input tasks
+        requests = [
+            task.construct_requests(
+                doc=doc, ctx="" if task.multiple_input else task.doc_to_text(doc)
+            )
+            for doc in arr
+        ]
         assert len(requests) == limit if limit else True
...
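The tightened check only rejects trailing whitespace in doc_to_text when the task's target delimiter is itself whitespace, since otherwise the delimiter would double up. A standalone restatement of that rule (the helper name and prompts are illustrative, not part of the test suite):

```python
# Standalone restatement of the whitespace convention the updated test enforces.
def violates_convention(prompt: str, target_delimiter: str = " ") -> bool:
    if not target_delimiter.isspace() or len(prompt) == 0:
        return False
    return prompt[-1].isspace()


print(violates_convention("Question: 2+2=?\nAnswer:"))   # False
print(violates_convention("Question: 2+2=?\nAnswer: "))  # True
```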