Commit 60c9c170 authored by haileyschoelkopf

Merge branch 'main' into inverse-scaling-tasks

parents 4b2d565b b4cd85d4
group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_f1_micro
    aggregation: unitxt
    higher_is_better: true
  - metric: unitxt_accuracy
    aggregation: unitxt
    higher_is_better: true
  - metric: unitxt_f1_macro
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_f1_micro_multi_label
    aggregation: unitxt
    higher_is_better: true
  - metric: unitxt_accuracy
    aggregation: unitxt
    higher_is_better: true
  - metric: unitxt_f1_macro_multi_label
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_char_edit_dist_accuracy
    aggregation: unitxt
    higher_is_better: true
  - metric: unitxt_rouge
    aggregation: unitxt
    higher_is_better: true
  - metric: unitxt_char_edit_distance[reference_field=original_text]
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_squad
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_spearman
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_ner
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
group:
  - unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
  until:
    - </s>
metric_list:
  - metric: unitxt_rouge
    aggregation: unitxt
    higher_is_better: true
metadata:
  version: 1.0
try:
    from unitxt import evaluate
except ImportError:
    raise ImportError(
        "Package 'unitxt' is not installed. To install it, use `pip install 'lm_eval[unitxt]'`"
    )

from lm_eval.api.registry import AGGREGATION_REGISTRY, METRIC_REGISTRY, register_metric


def unitxt_agg_metric(items):
    preds = [pred[0] for pred, _, _ in items]
    refs = [ref for _, ref, _ in items]
    metric_name = items[0][2].replace("unitxt_", "metrics.")
    for ref in refs:
        ref["metrics"] = [metric_name]
    result_metrics = evaluate(preds, refs)
    return result_metrics[0]["score"]["global"]["score"]


AGGREGATION_REGISTRY["unitxt"] = unitxt_agg_metric


def unitxt_metric(items):  # This is a passthrough function
    return items


def process_results(doc, results):
    metrics = doc["metrics"]
    scores = {}
    for metric in metrics:
        metric = metric.replace("metrics.", "unitxt_")
        scores[metric] = (results, doc, metric)
        if metric not in METRIC_REGISTRY:
            register_metric(
                metric=metric,
                higher_is_better=True,
                output_type="generate_until",
                aggregation="unitxt",
            )(unitxt_metric)
    return scores
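For orientation, here is a minimal, self-contained sketch of the data flow this wrapper sets up: `process_results` emits one `(results, doc, metric)` tuple per document, and the `unitxt` aggregation later hands the collected predictions and references to `unitxt.evaluate`. The documents below are invented and `evaluate` is replaced by an exact-match stub, so the snippet runs without `unitxt` installed; the real path always goes through `unitxt.evaluate`.

```python
def stub_evaluate(preds, refs):
    # Stand-in for unitxt.evaluate, returning a unitxt-shaped result.
    hits = sum(p == r["target"] for p, r in zip(preds, refs))
    return [{"score": {"global": {"score": hits / len(preds)}}}]


# One (results, doc, metric_name) tuple per document, as produced by process_results.
items = [
    (["positive"], {"target": "positive", "metrics": ["metrics.accuracy"]}, "unitxt_accuracy"),
    (["negative"], {"target": "positive", "metrics": ["metrics.accuracy"]}, "unitxt_accuracy"),
]

preds = [pred[0] for pred, _, _ in items]  # first generation per document
refs = [ref for _, ref, _ in items]        # full dataset rows, carrying target + metrics
print(stub_evaluate(preds, refs)[0]["score"]["global"]["score"])  # 0.5
```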
#
include: unitxt_tasks.summarization.abstractive
task: xsum
dataset_name: card=cards.xsum,template=templates.summarization.abstractive.full
include: unitxt_tasks.classification.multi_class
task: yahoo_answers_topics
dataset_name: card=cards.yahoo_answers_topics,template=templates.classification.multi_class.title
# XNLIeu
### Paper
Title: XNLIeu: a dataset for cross-lingual NLI in Basque
Abstract: https://arxiv.org/abs/2404.06996
XNLI is a popular Natural Language Inference (NLI) benchmark widely used to evaluate cross-lingual Natural Language Understanding (NLU) capabilities across languages. In this paper, we expand XNLI to include Basque, a low-resource language that can greatly benefit from transfer-learning approaches. The new dataset, dubbed XNLIeu, has been developed by first machine-translating the English XNLI corpus into Basque, followed by a manual post-edition step. We have conducted a series of experiments using mono- and multilingual LLMs to assess a) the effect of professional post-edition on the MT system; b) the best cross-lingual strategy for NLI in Basque; and c) whether the choice of the best cross-lingual strategy is influenced by the fact that the dataset is built by translation. The results show that post-edition is necessary and that the translate-train cross-lingual strategy obtains better results overall, although the gain is lower when tested in a dataset that has been built natively from scratch. Our code and datasets are publicly available under open licenses at https://github.com/hitz-zentroa/xnli-eu.
Homepage: https://github.com/hitz-zentroa/xnli-eu
### Citation
```bibtex
@misc{heredia2024xnlieu,
      title={XNLIeu: a dataset for cross-lingual NLI in Basque},
      author={Maite Heredia and Julen Etxaniz and Muitze Zulaika and Xabier Saralegi and Jeremy Barnes and Aitor Soroa},
      year={2024},
      eprint={2404.06996},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
* `xnli_eu_mt_native`: Includes MT and Native variants of the XNLIeu dataset.
#### Tasks
* `xnli_eu`: XNLI in Basque, machine-translated from English and then manually post-edited.
* `xnli_eu_mt`: XNLI in Basque, machine-translated from English without post-editing.
* `xnli_eu_native`: XNLI in Basque, created natively from scratch.
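The following is a minimal sketch of running these variants through the harness's Python API; the `gpt2` checkpoint and the small `limit` are placeholder choices for a quick smoke test, not part of any reference setup:

```python
import lm_eval.evaluator as evaluator
from lm_eval.api.registry import get_model

# Any HuggingFace causal LM works here; "gpt2" is just a small placeholder.
lm = get_model("hf").create_from_arg_string(
    "pretrained=gpt2",
    {"batch_size": 1, "device": "cpu"},
)

results = evaluator.simple_evaluate(
    model=lm,
    tasks=["xnli_eu", "xnli_eu_mt", "xnli_eu_native"],
    num_fewshot=0,
    limit=10,  # evaluate only a few examples per task
)
print(results["results"])
```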
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: xnli
task: null
dataset_path: xnli
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: null
doc_to_target: label
doc_to_choice: null
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
include: xnli_common_yaml
task: xnli_eu
dataset_path: HiTZ/xnli-eu
dataset_name: eu
doc_to_choice: '{{[premise+", ezta? Bai, "+hypothesis,premise+", ezta? Gainera, "+hypothesis,premise+", ezta? Ez, "+hypothesis]}}'
doc_to_text: ""
test_split: test
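As a plain-Python illustration of what the `doc_to_choice` template above produces, consider the sketch below; the premise/hypothesis pair is invented for illustration and is not taken from the dataset.

```python
# Hypothetical premise/hypothesis pair (rough glosses: "It is raining" / "the street is wet").
premise = "Euria ari du"
hypothesis = "kalea bustita dago"

# The Jinja expression builds one continuation per XNLI label:
choices = [
    premise + ", ezta? Bai, " + hypothesis,      # entailment
    premise + ", ezta? Gainera, " + hypothesis,  # neutral
    premise + ", ezta? Ez, " + hypothesis,       # contradiction
]
print(choices)
```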
include: xnli_eu.yaml
group: xnli_eu_mt_native
task: xnli_eu_mt
dataset_name: eu_mt
include: xnli_eu.yaml
group: xnli_eu_mt_native
task: xnli_eu_native
training_split: null
validation_split: null
dataset_name: eu_native
 import collections
 import fnmatch
 import functools
+import hashlib
 import importlib.util
 import inspect
+import json
 import logging
 import os
 import re
+from dataclasses import asdict, is_dataclass
 from itertools import islice
 from typing import Any, Callable, List
@@ -24,6 +27,10 @@ eval_logger = logging.getLogger("lm-eval")
 SPACING = " " * 47
 
 
+def hash_string(string: str) -> str:
+    return hashlib.sha256(string.encode("utf-8")).hexdigest()
+
+
 def escaped_split(text, sep_char, maxsplit=-1):
     """Split text into a list on occurrences of the given separation
     character `sep_char`. The separation character may be escaped by a
@@ -60,6 +67,15 @@ def handle_arg_string(arg):
     return arg
 
 
+def handle_non_serializable(o):
+    if isinstance(o, np.int64) or isinstance(o, np.int32):
+        return int(o)
+    elif isinstance(o, set):
+        return list(o)
+    else:
+        return str(o)
+
+
 def simple_parse_args_string(args_string):
     """
     Parses something like
@@ -166,6 +182,18 @@ def make_disjoint_window(pair):
     return a[: len(a) - (len(b) - 1)], b
 
 
+class EnhancedJSONEncoder(json.JSONEncoder):
+    """
+    Provides a proper json encoding for the loggers and trackers json dumps.
+    Notably manages the json encoding of dataclasses.
+    """
+
+    def default(self, o):
+        if is_dataclass(o):
+            return asdict(o)
+        return super().default(o)
+
+
 class Reorderer:
     def __init__(self, arr: List[Any], fn: Callable) -> None:
         """Reorder an array according to some function
@@ -214,7 +242,7 @@ class Reorderer:
         return res
 
 
-def make_table(result_dict, column: str = "results"):
+def make_table(result_dict, column: str = "results", sort_results: bool = True):
     """Generate table of results."""
     from pytablewriter import LatexTableWriter, MarkdownTableWriter
@@ -241,7 +269,12 @@ def make_table(result_dict, column: str = "results"):
     values = []
 
-    for k, dic in result_dict[column].items():
+    keys = result_dict[column].keys()
+    if sort_results:
+        # sort entries alphabetically
+        keys = sorted(keys)
+    for k in keys:
+        dic = result_dict[column][k]
         version = result_dict["versions"].get(k, "N/A")
         n = str(result_dict["n-shot"][k])
...
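A short sketch of how the helpers added above might be used, assuming they live in `lm_eval.utils` as the surrounding hunks suggest; the `SampleLog` dataclass is a made-up stand-in for the harness's logging payloads:

```python
import json
from dataclasses import dataclass

import numpy as np

from lm_eval.utils import EnhancedJSONEncoder, handle_non_serializable, hash_string


@dataclass
class SampleLog:  # hypothetical payload, only for illustration
    doc_id: int
    acc: float


# Dataclasses are serialized via EnhancedJSONEncoder.default -> asdict(...)
print(json.dumps(SampleLog(doc_id=3, acc=0.75), cls=EnhancedJSONEncoder))

# Sets and numpy scalars fall back to handle_non_serializable
print(json.dumps({"doc_ids": {1, 2}, "n": np.int64(7)}, default=handle_non_serializable))

# hash_string gives a stable sha256 identifier for a prompt/request string
print(hash_string("The quick brown fox"))
```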
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "lm_eval"
-version = "0.4.1"
+version = "0.4.2"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
@@ -59,6 +59,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 [project.optional-dependencies]
 anthropic = ["anthropic"]
 dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
+deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 hf_transfer = ["hf_transfer"]
 ifeval = ["langdetect", "immutabledict"]
@@ -69,14 +70,17 @@ multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 openai = ["openai==1.3.9", "tiktoken"]
 optimum = ["optimum[openvino]"]
 promptsource = ["promptsource>=0.2.3"]
-sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
+sentencepiece = ["sentencepiece>=0.1.98"]
+sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
-vllm = ["vllm==0.3.2"]
+vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
+unitxt = ["unitxt"]
 all = [
     "lm_eval[anthropic]",
     "lm_eval[dev]",
+    "lm_eval[deepsparse]",
     "lm_eval[gptq]",
     "lm_eval[hf_transfer]",
     "lm_eval[ifeval]",
@@ -86,10 +90,12 @@ all = [
     "lm_eval[openai]",
     "lm_eval[promptsource]",
     "lm_eval[sentencepiece]",
+    "lm_eval[sparseml]",
     "lm_eval[testing]",
     "lm_eval[vllm]",
     "lm_eval[zeno]",
     "lm_eval[wandb]",
+    "lm_eval[unitxt]"
 ]
 
 [tool.ruff.lint]
...
@@ -67,7 +67,7 @@ def main():
     # Upload data for all models
     for model_index, model in enumerate(models):
         model_args = re.sub(
-            "/|=",
+            r"[\"<>:/\|\\?\*\[\]]+",
             "__",
             json.load(
                 open(Path(args.data_path, model, "results.json"), encoding="utf-8")
...
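To see what the widened pattern does, here is a small illustrative example; the `model_args` string is invented, and the point is that characters which are unsafe in file or project names get collapsed to `__`:

```python
import re

# Hypothetical model_args string as it might appear in results.json
model_args = 'pretrained=EleutherAI/pythia-160m,dtype="float16"'

# The old pattern only replaced "/" and "="; the new one also strips quotes,
# angle brackets, colons, pipes, backslashes, ?, * and square brackets.
print(re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_args))
# -> pretrained=EleutherAI__pythia-160m,dtype=__float16__
```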
@@ -23,6 +23,7 @@ class Test_HFLM:
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
     generate_until_task = task_list["gsm8k"]  # type: ignore
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
+    generate_until_task.set_fewshot_seed(1234)  # fewshot random generator seed
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until: list[Instance] = generate_until_task.instances
     rolling_task = task_list["wikitext"]  # type: ignore
...
import pytest

import lm_eval.evaluator as evaluator
from lm_eval.api.registry import get_model


SPARSEML_MODELS_TASKS = [
    # loglikelihood
    ("facebook/opt-125m", "lambada_openai"),
    # loglikelihood_rolling
    ("hf-internal-testing/tiny-random-gpt2", "wikitext"),
    # generate_until
    ("mgoin/tiny-random-llama-2-quant", "gsm8k"),
]

DEEPSPARSE_MODELS_TASKS = [
    # loglikelihood
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "lambada_openai"),
    # loglikelihood_rolling (not supported yet)
    # ("hf:mgoin/llama2.c-stories15M-quant-ds", "wikitext"),
    # generate_until
    ("hf:mgoin/llama2.c-stories15M-quant-ds", "gsm8k"),
]


@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
    lm = get_model("sparseml").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
            "device": "cpu",
            "dtype": "float32",
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )


@pytest.mark.parametrize("model_id,task", DEEPSPARSE_MODELS_TASKS)
def test_deepsparse_eval(model_id, task):
    lm = get_model("deepsparse").create_from_arg_string(
        f"pretrained={model_id}",
        {
            "batch_size": 1,
        },
    )
    limit = 5
    evaluator.simple_evaluate(
        model=lm,
        tasks=[task],
        num_fewshot=0,
        limit=limit,
    )