Commit 60c9c170 authored by haileyschoelkopf

Merge branch 'main' into inverse-scaling-tasks

parents 4b2d565b b4cd85d4
group:
- unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
until:
- </s>
metric_list:
- metric: unitxt_f1_micro
aggregation: unitxt
higher_is_better: true
- metric: unitxt_accuracy
aggregation: unitxt
higher_is_better: true
- metric: unitxt_f1_macro
aggregation: unitxt
higher_is_better: true
metadata:
version: 1.0
group:
- unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
until:
- </s>
metric_list:
- metric: unitxt_f1_micro_multi_label
aggregation: unitxt
higher_is_better: true
- metric: unitxt_accuracy
aggregation: unitxt
higher_is_better: true
- metric: unitxt_f1_macro_multi_label
aggregation: unitxt
higher_is_better: true
metadata:
version: 1.0
group:
- unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
until:
- </s>
metric_list:
- metric: unitxt_char_edit_dist_accuracy
aggregation: unitxt
higher_is_better: true
- metric: unitxt_rouge
aggregation: unitxt
higher_is_better: true
- metric: unitxt_char_edit_distance[reference_field=original_text]
aggregation: unitxt
higher_is_better: true
metadata:
version: 1.0
group:
- unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
until:
- </s>
metric_list:
- metric: unitxt_squad
aggregation: unitxt
higher_is_better: true
metadata:
version: 1.0
group:
- unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
until:
- </s>
metric_list:
- metric: unitxt_spearman
aggregation: unitxt
higher_is_better: true
metadata:
version: 1.0
group:
- unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
until:
- </s>
metric_list:
- metric: unitxt_ner
aggregation: unitxt
higher_is_better: true
metadata:
version: 1.0
group:
- unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
until:
- </s>
metric_list:
- metric: unitxt_rouge
aggregation: unitxt
higher_is_better: true
metadata:
version: 1.0
try:
from unitxt import evaluate
except ImportError:
raise ImportError(
"Package 'unitxt' is not installed. To install it, use `pip install 'lm_eval[unitxt]'`"
)
from lm_eval.api.registry import AGGREGATION_REGISTRY, METRIC_REGISTRY, register_metric
def unitxt_agg_metric(items):
preds = [pred[0] for pred, _, _ in items]
refs = [ref for _, ref, _ in items]
metric_name = items[0][2].replace("unitxt_", "metrics.")
for ref in refs:
ref["metrics"] = [metric_name]
result_metrics = evaluate(preds, refs)
return result_metrics[0]["score"]["global"]["score"]
AGGREGATION_REGISTRY["unitxt"] = unitxt_agg_metric
def unitxt_metric(items): # This is a passthrough function
return items
def process_results(doc, results):
metrics = doc["metrics"]
scores = {}
for metric in metrics:
metric = metric.replace("metrics.", "unitxt_")
scores[metric] = (results, doc, metric)
if metric not in METRIC_REGISTRY:
register_metric(
metric=metric,
higher_is_better=True,
output_type="generate_until",
aggregation="unitxt",
)(unitxt_metric)
return scores
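For orientation, a minimal sketch (illustrative values only, not harness code) of the flow these two functions set up: `process_results` stores one `(predictions, doc, metric_name)` tuple per metric requested by the document, registering each metric name as a passthrough on first sight, and the harness later hands the collected tuples to `unitxt_agg_metric`, which restores the `metrics.` prefix and delegates scoring to `unitxt.evaluate`.

```python
# Hypothetical document and model output, just to show the shapes involved.
doc = {"metrics": ["metrics.f1_micro"], "source": "...", "target": "positive"}
results = ["positive"]  # the model's generation(s) for this document

scores = process_results(doc, results)
# scores == {"unitxt_f1_micro": (["positive"], doc, "unitxt_f1_micro")}
# Side effect: "unitxt_f1_micro" is now registered with the "unitxt" aggregation,
# so unitxt_agg_metric receives all such tuples at aggregation time.
```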
#
include: unitxt_tasks.summarization.abstractive
task: xsum
dataset_name: card=cards.xsum,template=templates.summarization.abstractive.full
include: unitxt_tasks.classification.multi_class
task: yahoo_answers_topics
dataset_name: card=cards.yahoo_answers_topics,template=templates.classification.multi_class.title
# XNLIeu
### Paper
Title: XNLIeu: a dataset for cross-lingual NLI in Basque
Abstract: https://arxiv.org/abs/2404.06996
XNLI is a popular Natural Language Inference (NLI) benchmark widely used to evaluate cross-lingual Natural Language Understanding (NLU) capabilities across languages. In this paper, we expand XNLI to include Basque, a low-resource language that can greatly benefit from transfer-learning approaches. The new dataset, dubbed XNLIeu, has been developed by first machine-translating the English XNLI corpus into Basque, followed by a manual post-edition step. We have conducted a series of experiments using mono- and multilingual LLMs to assess a) the effect of professional post-edition on the MT system; b) the best cross-lingual strategy for NLI in Basque; and c) whether the choice of the best cross-lingual strategy is influenced by the fact that the dataset is built by translation. The results show that post-edition is necessary and that the translate-train cross-lingual strategy obtains better results overall, although the gain is lower when tested in a dataset that has been built natively from scratch. Our code and datasets are publicly available under open licenses at https://github.com/hitz-zentroa/xnli-eu.
Homepage: https://github.com/hitz-zentroa/xnli-eu
### Citation
```bibtex
@misc{heredia2024xnlieu,
title={XNLIeu: a dataset for cross-lingual NLI in Basque},
author={Maite Heredia and Julen Etxaniz and Muitze Zulaika and Xabier Saralegi and Jeremy Barnes and Aitor Soroa},
year={2024},
eprint={2404.06996},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
* `xnli_eu_mt_native`: Includes MT and Native variants of the XNLIeu dataset.
#### Tasks
* `xnli_eu`: XNLI in Basque, post-edited from machine translation.
* `xnli_eu_mt`: XNLI in Basque, machine-translated from English.
* `xnli_eu_native`: XNLI in Basque, natively created from scratch.
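As a quick sanity check, one way to run the post-edited variant from Python, mirroring the harness's own test setup (the model below is only an illustrative choice):

```python
import lm_eval.evaluator as evaluator
from lm_eval.api.registry import get_model

# Illustrative model; any Hugging Face causal LM identifier can be substituted.
lm = get_model("hf").create_from_arg_string(
    "pretrained=EleutherAI/pythia-160m",
    {"batch_size": 1, "device": "cpu"},
)
results = evaluator.simple_evaluate(model=lm, tasks=["xnli_eu"], num_fewshot=0, limit=10)
print(results["results"]["xnli_eu"])
```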
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: xnli
task: null
dataset_path: xnli
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: null
doc_to_target: label
doc_to_choice: null
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
include: xnli_common_yaml
task: xnli_eu
dataset_path: HiTZ/xnli-eu
dataset_name: eu
doc_to_choice: '{{[premise+", ezta? Bai, "+hypothesis,premise+", ezta? Gainera,
"+hypothesis,premise+", ezta? Ez, "+hypothesis]}}'
doc_to_text: ""
test_split: test
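The `doc_to_choice` template above turns each premise/hypothesis pair into three candidate continuations, one per NLI label ("Bai" / "Gainera" / "Ez" for entailment / neutral / contradiction). A rough sketch of the expansion, rendered with plain Jinja2 and placeholder strings (the harness performs this rendering internally):

```python
from jinja2 import Template

choice_template = Template(
    '{{[premise+", ezta? Bai, "+hypothesis,'
    'premise+", ezta? Gainera, "+hypothesis,'
    'premise+", ezta? Ez, "+hypothesis]}}'
)
doc = {"premise": "<premise>", "hypothesis": "<hypothesis>"}
# Prints the list of the three candidate continuations:
# ['<premise>, ezta? Bai, <hypothesis>', '<premise>, ezta? Gainera, <hypothesis>',
#  '<premise>, ezta? Ez, <hypothesis>']
print(choice_template.render(**doc))
```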
include: xnli_eu.yaml
group: xnli_eu_mt_native
task: xnli_eu_mt
dataset_name: eu_mt
include: xnli_eu.yaml
group: xnli_eu_mt_native
task: xnli_eu_native
training_split: null
validation_split: null
dataset_name: eu_native
import collections
import fnmatch
import functools
import hashlib
import importlib.util
import inspect
import json
import logging
import os
import re
from dataclasses import asdict, is_dataclass
from itertools import islice
from typing import Any, Callable, List
@@ -24,6 +27,10 @@ eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
def hash_string(string: str) -> str:
return hashlib.sha256(string.encode("utf-8")).hexdigest()
def escaped_split(text, sep_char, maxsplit=-1):
"""Split text into a list on occurrences of the given separation
character `sep_char`. The separation character may be escaped by a
@@ -60,6 +67,15 @@ def handle_arg_string(arg):
return arg
def handle_non_serializable(o):
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
def simple_parse_args_string(args_string):
"""
Parses something like
@@ -166,6 +182,18 @@ def make_disjoint_window(pair):
return a[: len(a) - (len(b) - 1)], b
class EnhancedJSONEncoder(json.JSONEncoder):
"""
Provides a proper json encoding for the loggers and trackers json dumps.
Notably manages the json encoding of dataclasses.
"""
def default(self, o):
if is_dataclass(o):
return asdict(o)
return super().default(o)
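A minimal sketch (illustrative values, assuming numpy is installed and that the helpers above live in `lm_eval.utils`) of one way the two additions can be used when dumping results: `handle_non_serializable` as a `json.dumps` fallback for numpy scalars and sets, and `EnhancedJSONEncoder` as the encoder class for dataclass-based records.

```python
import json
from dataclasses import dataclass

import numpy as np

from lm_eval.utils import EnhancedJSONEncoder, handle_non_serializable


@dataclass
class DummyRecord:  # hypothetical record type, for illustration only
    task: str
    n_shot: int


# numpy ints become plain ints, sets become lists
print(json.dumps({"count": np.int64(5), "ids": {1, 2}}, default=handle_non_serializable))
# dataclasses are converted via dataclasses.asdict
print(json.dumps(DummyRecord(task="xnli_eu", n_shot=0), cls=EnhancedJSONEncoder))
```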
class Reorderer:
def __init__(self, arr: List[Any], fn: Callable) -> None:
"""Reorder an array according to some function
@@ -214,7 +242,7 @@ class Reorderer:
return res
def make_table(result_dict, column: str = "results"):
def make_table(result_dict, column: str = "results", sort_results: bool = True):
"""Generate table of results."""
from pytablewriter import LatexTableWriter, MarkdownTableWriter
@@ -241,7 +269,12 @@ def make_table(result_dict, column: str = "results"):
values = []
for k, dic in result_dict[column].items():
keys = result_dict[column].keys()
if sort_results:
# sort entries alphabetically
keys = sorted(keys)
for k in keys:
dic = result_dict[column][k]
version = result_dict["versions"].get(k, "N/A")
n = str(result_dict["n-shot"][k])
......
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "0.4.1"
version = "0.4.2"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
@@ -59,6 +59,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
[project.optional-dependencies]
anthropic = ["anthropic"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
gptq = ["auto-gptq[triton]>=0.6.0"]
hf_transfer = ["hf_transfer"]
ifeval = ["langdetect", "immutabledict"]
@@ -69,14 +70,17 @@ multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
openai = ["openai==1.3.9", "tiktoken"]
optimum = ["optimum[openvino]"]
promptsource = ["promptsource>=0.2.3"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
sentencepiece = ["sentencepiece>=0.1.98"]
sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm==0.3.2"]
vllm = ["vllm>=0.4.2"]
zeno = ["pandas", "zeno-client"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
unitxt = ["unitxt"]
all = [
"lm_eval[anthropic]",
"lm_eval[dev]",
"lm_eval[deepsparse]",
"lm_eval[gptq]",
"lm_eval[hf_transfer]",
"lm_eval[ifeval]",
@@ -86,10 +90,12 @@ all = [
"lm_eval[openai]",
"lm_eval[promptsource]",
"lm_eval[sentencepiece]",
"lm_eval[sparseml]",
"lm_eval[testing]",
"lm_eval[vllm]",
"lm_eval[zeno]",
"lm_eval[wandb]",
"lm_eval[unitxt]"
]
[tool.ruff.lint]
......
@@ -67,7 +67,7 @@ def main():
# Upload data for all models
for model_index, model in enumerate(models):
model_args = re.sub(
"/|=",
r"[\"<>:/\|\\?\*\[\]]+",
"__",
json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
......
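A toy illustration (not part of the script) of what the broadened pattern does: rather than replacing only `/` and `=`, any run of characters that are unsafe in file or directory names is collapsed to `__`, so the `model_args` string stays usable as an output path component.

```python
import re

# Pattern from the diff above: quotes, angle brackets, colons, slashes,
# pipes, backslashes, '?', '*', and square brackets.
pattern = r"[\"<>:/\|\\?\*\[\]]+"

model_args = "pretrained=EleutherAI/pythia-160m,revision=step3000"
print(re.sub(pattern, "__", model_args))
# -> pretrained=EleutherAI__pythia-160m,revision=step3000
```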
@@ -23,6 +23,7 @@ class Test_HFLM:
MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
generate_until_task = task_list["gsm8k"] # type: ignore
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until_task.set_fewshot_seed(1234) # fewshot random generator seed
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until: list[Instance] = generate_until_task.instances
rolling_task = task_list["wikitext"] # type: ignore
......
import pytest
import lm_eval.evaluator as evaluator
from lm_eval.api.registry import get_model
SPARSEML_MODELS_TASKS = [
# loglikelihood
("facebook/opt-125m", "lambada_openai"),
# loglikelihood_rolling
("hf-internal-testing/tiny-random-gpt2", "wikitext"),
# generate_until
("mgoin/tiny-random-llama-2-quant", "gsm8k"),
]
DEEPSPARSE_MODELS_TASKS = [
# loglikelihood
("hf:mgoin/llama2.c-stories15M-quant-ds", "lambada_openai"),
# loglikelihood_rolling (not supported yet)
# ("hf:mgoin/llama2.c-stories15M-quant-ds", "wikitext"),
# generate_until
("hf:mgoin/llama2.c-stories15M-quant-ds", "gsm8k"),
]
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
lm = get_model("sparseml").create_from_arg_string(
f"pretrained={model_id}",
{
"batch_size": 1,
"device": "cpu",
"dtype": "float32",
},
)
limit = 5
evaluator.simple_evaluate(
model=lm,
tasks=[task],
num_fewshot=0,
limit=limit,
)
@pytest.mark.parametrize("model_id,task", DEEPSPARSE_MODELS_TASKS)
def test_deepsparse_eval(model_id, task):
lm = get_model("deepsparse").create_from_arg_string(
f"pretrained={model_id}",
{
"batch_size": 1,
},
)
limit = 5
evaluator.simple_evaluate(
model=lm,
tasks=[task],
num_fewshot=0,
limit=limit,
)