Commit 25991f98 authored by hepj

Update README

parent ac192496
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("xnli")
launch_gradio_widget(module)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" XNLI benchmark metric. """
import datasets
import evaluate
_CITATION = """\
@InProceedings{conneau2018xnli,
author = "Conneau, Alexis
and Rinott, Ruty
and Lample, Guillaume
and Williams, Adina
and Bowman, Samuel R.
and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods
in Natural Language Processing",
year = "2018",
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium",
}
"""
_DESCRIPTION = """\
XNLI is a subset of a few thousand examples from MNLI which has been translated
into 14 different languages (some of them relatively low-resource). As with MNLI, the goal is
to predict textual entailment (does sentence A imply/contradict/neither sentence
B), framed as a classification task (given two sentences, predict one of three
labels).
"""
_KWARGS_DESCRIPTION = """
Computes XNLI score which is just simple accuracy.
Args:
predictions: Predicted labels.
references: Ground truth labels.
Returns:
'accuracy': accuracy
Examples:
>>> predictions = [0, 1]
>>> references = [0, 1]
>>> xnli_metric = evaluate.load("xnli")
>>> results = xnli_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0}
"""
def simple_accuracy(preds, labels):
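# predictions and references arrive as NumPy arrays (the module declares format="numpy" below), so == compares element-wise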
return (preds == labels).mean()
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Xnli(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("int64" if self.config_name != "sts-b" else "float32"),
"references": datasets.Value("int64" if self.config_name != "sts-b" else "float32"),
}
),
codebase_urls=[],
reference_urls=[],
format="numpy",
)
def _compute(self, predictions, references):
return {"accuracy": simple_accuracy(predictions, references)}
---
title: XTREME-S
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
XTREME-S is a benchmark to evaluate universal cross-lingual speech representations in many languages.
XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval.
---
# Metric Card for XTREME-S
## Metric Description
The XTREME-S metric aims to evaluate model performance on the Cross-lingual TRansfer Evaluation of Multilingual Encoders for Speech (XTREME-S) benchmark.
This benchmark was designed to evaluate speech representations across languages, tasks, domains and data regimes. It covers 102 languages from 10+ language families, 3 different domains and 4 task families: speech recognition, translation, classification and retrieval.
## How to Use
There are two steps: (1) loading the XTREME-S metric relevant to the subset of the benchmark being used for evaluation; and (2) calculating the metric.
1. **Loading the relevant XTREME-S metric** : the subsets of XTREME-S are the following: `mls`, `voxpopuli`, `covost2`, `fleurs-asr`, `fleurs-lang_id`, `minds14` and `babel`. More information about the different subsets can be found on the [XTREME-S benchmark page](https://huggingface.co/datasets/google/xtreme_s).
```python
>>> xtreme_s_metric = evaluate.load('xtreme_s', 'mls')
```
2. **Calculating the metric**: the metric takes two inputs:
- `predictions`: a list of predictions to score, with each prediction a `str` (or an integer label for the `fleurs-lang_id` and `minds14` subsets).
- `references`: a list of references, one per prediction, of the same type as the predictions.
```python
>>> references = ["it is sunny here", "paper and pen are essentials"]
>>> predictions = ["it's sunny", "paper pen are essential"]
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
```
It also has two optional arguments (see the sketch after this list):
- `bleu_kwargs`: a `dict` of keywords to be passed when computing the `bleu` metric for the `covost2` subset. Keywords can be one of `smooth_method`, `smooth_value`, `force`, `lowercase`, `tokenize`, `use_effective_order`.
- `wer_kwargs`: optional dict of keywords to be passed when computing `wer` and `cer`, which are computed for the `mls`, `fleurs-asr`, `voxpopuli`, and `babel` subsets. Keywords are `concatenate_texts`.
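A minimal sketch of how these keyword dictionaries might be passed to `compute` (the specific values shown are illustrative, not recommendations):
```python
>>> import evaluate
>>> references = ["it is sunny here", "paper and pen are essentials"]
>>> predictions = ["it's sunny", "paper pen are essential"]
>>> mls_metric = evaluate.load('xtreme_s', 'mls')
>>> results = mls_metric.compute(
...     predictions=predictions,
...     references=references,
...     wer_kwargs={"concatenate_texts": True},
... )
>>> covost2_metric = evaluate.load('xtreme_s', 'covost2')
>>> results = covost2_metric.compute(
...     predictions=predictions,
...     references=references,
...     bleu_kwargs={"lowercase": True, "smooth_method": "exp"},
... )
```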
## Output values
The output of the metric depends on the XTREME-S subset chosen, consisting of a dictionary that contains one or several of the following metrics:
- `accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information). This is returned for the `fleurs-lang_id` and `minds14` subsets.
- `f1`: the harmonic mean of the precision and recall (see [F1 score](https://huggingface.co/metrics/f1) for more information). Its range is 0-1 -- its lowest possible value is 0, if either the precision or the recall is 0, and its highest possible value is 1.0, which means perfect precision and recall. It is returned for the `minds14` subset.
- `wer`: Word error rate (WER) is a common metric of the performance of an automatic speech recognition system. The lower the value, the better the performance of the ASR system, with a WER of 0 being a perfect score (see [WER score](https://huggingface.co/metrics/wer) for more information). It is returned for the `mls`, `fleurs-asr`, `voxpopuli` and `babel` subsets of the benchmark.
- `cer`: Character error rate (CER) is similar to WER, but operates on characters instead of words. The lower the CER value, the better the performance of the ASR system, with a CER of 0 being a perfect score (see [CER score](https://huggingface.co/metrics/cer) for more information). It is returned for the `mls`, `fleurs-asr`, `voxpopuli` and `babel` subsets of the benchmark.
- `bleu`: the BLEU score, calculated according to the SacreBLEU metric approach. It can take any value between 0.0 and 100.0, inclusive, with higher values being better (see [SacreBLEU](https://huggingface.co/metrics/sacrebleu) for more details). This is returned for the `covost2` subset.
### Values from popular papers
The [original XTREME-S paper](https://arxiv.org/pdf/2203.10752.pdf) reported average WERs ranging from 9.2 to 14.6, a BLEU score of 20.6, an accuracy of 73.3 and an F1 score of 86.9, depending on the subset of the dataset evaluated.
## Examples
For the `mls` subset (which outputs `wer` and `cer`):
```python
>>> xtreme_s_metric = evaluate.load('xtreme_s', 'mls')
>>> references = ["it is sunny here", "paper and pen are essentials"]
>>> predictions = ["it's sunny", "paper pen are essential"]
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
>>> print({k: round(v, 2) for k, v in results.items()})
{'wer': 0.56, 'cer': 0.27}
```
For the `covost2` subset (which outputs `bleu`):
```python
>>> xtreme_s_metric = evaluate.load('xtreme_s', 'covost2')
>>> references = ["bonjour paris", "il est necessaire de faire du sport de temps en temp"]
>>> predictions = ["bonjour paris", "il est important de faire du sport souvent"]
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
>>> print({k: round(v, 2) for k, v in results.items()})
{'bleu': 31.65}
```
For the `fleurs-lang_id` subset (which outputs `accuracy`):
```python
>>> xtreme_s_metric = evaluate.load('xtreme_s', 'fleurs-lang_id')
>>> references = [0, 1, 0, 0, 1]
>>> predictions = [0, 1, 1, 0, 0]
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
>>> print({k: round(v, 2) for k, v in results.items()})
{'accuracy': 0.6}
```
For the `minds14` subset (which outputs `f1` and `accuracy`):
```python
>>> xtreme_s_metric = evaluate.load('xtreme_s', 'minds14')
>>> references = [0, 1, 0, 0, 1]
>>> predictions = [0, 1, 1, 0, 0]
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
>>> print({k: round(v, 2) for k, v in results.items()})
{'f1': 0.58, 'accuracy': 0.6}
```
## Limitations and bias
This metric works only with datasets that have the same format as the [XTREME-S dataset](https://huggingface.co/datasets/google/xtreme_s).
While the XTREME-S dataset is meant to represent a variety of languages and tasks, it has inherent biases: it is missing many languages that are important and under-represented in NLP datasets.
It also has a particular focus on read speech, because common evaluation benchmarks like CoVoST-2 or LibriSpeech evaluate on this type of speech; this can result in a mismatch between performance measured in a read-speech setting and performance in noisier settings (in production or live deployment, for instance).
## Citation
```bibtex
@article{conneau2022xtreme,
title={XTREME-S: Evaluating Cross-lingual Speech Representations},
author={Conneau, Alexis and Bapna, Ankur and Zhang, Yu and Ma, Min and von Platen, Patrick and Lozhkov, Anton and Cherry, Colin and Jia, Ye and Rivera, Clara and Kale, Mihir and others},
journal={arXiv preprint arXiv:2203.10752},
year={2022}
}
```
## Further References
- [XTREME-S dataset](https://huggingface.co/datasets/google/xtreme_s)
- [XTREME-S GitHub repository](https://github.com/google-research/xtreme)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("xtreme_s", "mls")
launch_gradio_widget(module)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
scikit-learn
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" XTREME-S benchmark metric. """
from typing import List
import datasets
from datasets.config import PY_VERSION
from packaging import version
from sklearn.metrics import f1_score
import evaluate
if PY_VERSION < version.parse("3.8"):
import importlib_metadata
else:
import importlib.metadata as importlib_metadata
# TODO(Patrick/Anton)
_CITATION = """\
"""
_DESCRIPTION = """\
XTREME-S is a benchmark to evaluate universal cross-lingual speech representations in many languages.
XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval.
"""
_KWARGS_DESCRIPTION = """
Compute XTREME-S evaluation metric associated to each XTREME-S dataset.
Args:
predictions: list of predictions to score.
Each prediction is a `str` for the ASR and translation subsets, or a label id for the
classification subsets ('fleurs-lang_id', 'minds14').
references: list of references, one per prediction, with the same type as the predictions.
bleu_kwargs: optional dict of keywords to be passed when computing 'bleu'.
Keywords can be one of 'smooth_method', 'smooth_value', 'force', 'lowercase',
'tokenize', 'use_effective_order'.
wer_kwargs: optional dict of keywords to be passed when computing 'wer' and 'cer'.
Keywords include 'concatenate_texts'.
Returns: depending on the XTREME-S task, one or several of:
"accuracy": Accuracy - for 'fleurs-lang_id', 'minds14'
"f1": F1 score - for 'minds14'
"wer": Word error rate - for 'mls', 'fleurs-asr', 'voxpopuli', 'babel'
"cer": Character error rate - for 'mls', 'fleurs-asr', 'voxpopuli', 'babel'
"bleu": BLEU score according to the `sacrebleu` metric - for 'covost2'
Examples:
>>> xtreme_s_metric = evaluate.load('xtreme_s', 'mls') # 'mls', 'voxpopuli', 'fleurs-asr' or 'babel'
>>> references = ["it is sunny here", "paper and pen are essentials"]
>>> predictions = ["it's sunny", "paper pen are essential"]
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
>>> print({k: round(v, 2) for k, v in results.items()})
{'wer': 0.56, 'cer': 0.27}
>>> xtreme_s_metric = evaluate.load('xtreme_s', 'covost2')
>>> references = ["bonjour paris", "il est necessaire de faire du sport de temps en temp"]
>>> predictions = ["bonjour paris", "il est important de faire du sport souvent"]
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
>>> print({k: round(v, 2) for k, v in results.items()})
{'bleu': 31.65}
>>> xtreme_s_metric = evaluate.load('xtreme_s', 'fleurs-lang_id')
>>> references = [0, 1, 0, 0, 1]
>>> predictions = [0, 1, 1, 0, 0]
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
>>> print({k: round(v, 2) for k, v in results.items()})
{'accuracy': 0.6}
>>> xtreme_s_metric = evaluate.load('xtreme_s', 'minds14')
>>> references = [0, 1, 0, 0, 1]
>>> predictions = [0, 1, 1, 0, 0]
>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
>>> print({k: round(v, 2) for k, v in results.items()})
{'f1': 0.58, 'accuracy': 0.6}
"""
_CONFIG_NAMES = ["fleurs-asr", "mls", "voxpopuli", "babel", "covost2", "fleurs-lang_id", "minds14"]
SENTENCE_DELIMITER = ""
try:
from jiwer import transforms as tr
_jiwer_available = True
except ImportError:
_jiwer_available = False
if _jiwer_available and version.parse(importlib_metadata.version("jiwer")) < version.parse("2.3.0"):
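# older jiwer releases (< 2.3.0) do not provide the ReduceToSingleSentence/ReduceToListOfListOfChars
# transforms used in the branch below, so a character-splitting transform is defined by hand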
class SentencesToListOfCharacters(tr.AbstractTransform):
def __init__(self, sentence_delimiter: str = " "):
self.sentence_delimiter = sentence_delimiter
def process_string(self, s: str):
return list(s)
def process_list(self, inp: List[str]):
chars = []
for sent_idx, sentence in enumerate(inp):
chars.extend(self.process_string(sentence))
if self.sentence_delimiter is not None and self.sentence_delimiter != "" and sent_idx < len(inp) - 1:
chars.append(self.sentence_delimiter)
return chars
cer_transform = tr.Compose(
[tr.RemoveMultipleSpaces(), tr.Strip(), SentencesToListOfCharacters(SENTENCE_DELIMITER)]
)
elif _jiwer_available:
cer_transform = tr.Compose(
[
tr.RemoveMultipleSpaces(),
tr.Strip(),
tr.ReduceToSingleSentence(SENTENCE_DELIMITER),
tr.ReduceToListOfListOfChars(),
]
)
else:
cer_transform = None
def simple_accuracy(preds, labels):
return float((preds == labels).mean())
def f1_and_simple_accuracy(preds, labels):
return {
"f1": float(f1_score(y_true=labels, y_pred=preds, average="macro")),
"accuracy": simple_accuracy(preds, labels),
}
def bleu(
preds,
labels,
smooth_method="exp",
smooth_value=None,
force=False,
lowercase=False,
tokenize=None,
use_effective_order=False,
):
# xtreme-s can only have one label
labels = [[label] for label in labels]
preds = list(preds)
try:
import sacrebleu as scb
except ImportError:
raise ValueError(
"sacrebleu has to be installed in order to apply the bleu metric for covost2."
"You can install it via `pip install sacrebleu`."
)
if version.parse(scb.__version__) < version.parse("1.4.12"):
raise ImportWarning(
"To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
'You can install it with `pip install "sacrebleu>=1.4.12"`.'
)
references_per_prediction = len(labels[0])
if any(len(refs) != references_per_prediction for refs in labels):
raise ValueError("Sacrebleu requires the same number of references for each prediction")
transformed_references = [[refs[i] for refs in labels] for i in range(references_per_prediction)]
output = scb.corpus_bleu(
preds,
transformed_references,
smooth_method=smooth_method,
smooth_value=smooth_value,
force=force,
lowercase=lowercase,
use_effective_order=use_effective_order,
**(dict(tokenize=tokenize) if tokenize else {}),
)
return {"bleu": output.score}
def wer_and_cer(preds, labels, concatenate_texts, config_name):
try:
from jiwer import compute_measures
except ImportError:
raise ValueError(
f"jiwer has to be installed in order to apply the wer metric for {config_name}."
"You can install it via `pip install jiwer`."
)
if concatenate_texts:
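# corpus-level scoring: a single jiwer call over the full lists; CER reuses the "wer" field of
# measures computed under the character-level transform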
wer = compute_measures(labels, preds)["wer"]
cer = compute_measures(labels, preds, truth_transform=cer_transform, hypothesis_transform=cer_transform)["wer"]
return {"wer": wer, "cer": cer}
else:
def compute_score(preds, labels, score_type="wer"):
incorrect = 0
total = 0
for prediction, reference in zip(preds, labels):
if score_type == "wer":
measures = compute_measures(reference, prediction)
elif score_type == "cer":
measures = compute_measures(
reference, prediction, truth_transform=cer_transform, hypothesis_transform=cer_transform
)
incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
total += measures["substitutions"] + measures["deletions"] + measures["hits"]
return incorrect / total
return {"wer": compute_score(preds, labels, "wer"), "cer": compute_score(preds, labels, "cer")}
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class XtremeS(evaluate.Metric):
def _info(self):
if self.config_name not in _CONFIG_NAMES:
raise KeyError(f"You should supply a configuration name selected in {_CONFIG_NAMES}")
pred_type = "int64" if self.config_name in ["fleurs-lang_id", "minds14"] else "string"
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{"predictions": datasets.Value(pred_type), "references": datasets.Value(pred_type)}
),
codebase_urls=[],
reference_urls=[],
format="numpy",
)
def _compute(self, predictions, references, bleu_kwargs=None, wer_kwargs=None):
bleu_kwargs = bleu_kwargs if bleu_kwargs is not None else {}
wer_kwargs = wer_kwargs if wer_kwargs is not None else {}
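# dispatch on the configuration name: accuracy for language ID, F1 + accuracy for minds14,
# BLEU for covost2, WER/CER for the ASR subsets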
if self.config_name == "fleurs-lang_id":
return {"accuracy": simple_accuracy(predictions, references)}
elif self.config_name == "minds14":
return f1_and_simple_accuracy(predictions, references)
elif self.config_name == "covost2":
smooth_method = bleu_kwargs.pop("smooth_method", "exp")
smooth_value = bleu_kwargs.pop("smooth_value", None)
force = bleu_kwargs.pop("force", False)
lowercase = bleu_kwargs.pop("lowercase", False)
tokenize = bleu_kwargs.pop("tokenize", None)
use_effective_order = bleu_kwargs.pop("use_effective_order", False)
return bleu(
preds=predictions,
labels=references,
smooth_method=smooth_method,
smooth_value=smooth_value,
force=force,
lowercase=lowercase,
tokenize=tokenize,
use_effective_order=use_effective_order,
)
elif self.config_name in ["fleurs-asr", "mls", "voxpopuli", "babel"]:
concatenate_texts = wer_kwargs.pop("concatenate_texts", False)
return wer_and_cer(predictions, references, concatenate_texts, self.config_name)
else:
raise KeyError(f"You should supply a configuration name selected in {_CONFIG_NAMES}")
[metadata]
license_file = LICENSE
[isort]
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True
[flake8]
ignore = E203, E501, W503
max-line-length = 119
exclude =
src/datasets/datasets
src/datasets/metrics
per-file-ignores =
metrics/*:F401
# Lint as: python3
""" HuggingFace/Evaluate is an open library for evaluation.
Note:
VERSION needs to be formatted following the MAJOR.MINOR.PATCH convention
(we need to follow this convention to be able to retrieve versioned scripts)
To create the package for PyPI:
1. Open a PR and change the version in:
- __init__.py
- setup.py
Then merge the PR once it's approved.
2. Add a tag "vVERSION" (e.g. v0.4.1) in git to mark the release: "git tag vVERSION -m 'Add tag vVERSION for pypi'"
Push the tag to remote: git push --tags origin main
Then verify that the 'Python release' CI job runs and succeeds.
3. Fill in the release notes for the tag on GitHub once everything is looking hunky-dory.
4. Open a PR to change the version in __init__.py and setup.py to X.X.X+1.dev0 (e.g. VERSION=0.4.1 -> 0.4.2.dev0).
Then merge the PR once it's approved.
"""
import os
from setuptools import find_packages, setup
REQUIRED_PKGS = [
# We need datasets as a backend
"datasets>=2.0.0",
# We use numpy>=1.17 to have np.random.Generator (Dataset shuffling)
"numpy>=1.17",
# For smart caching dataset processing
"dill",
# For performance gains with apache arrow
"pandas",
# for downloading datasets over HTTPS
"requests>=2.19.0",
# progress bars in download and scripts
"tqdm>=4.62.1",
# for fast hashing
"xxhash",
# for better multiprocessing
"multiprocess",
# to get metadata of optional dependencies such as torch or tensorflow for Python versions that don't have it
"importlib_metadata;python_version<'3.8'",
# to save datasets locally or on any filesystem
# minimum 2021.05.0 to have the AbstractArchiveFileSystem
"fsspec[http]>=2021.05.0",
# To get datasets from the Datasets Hub on huggingface.co
"huggingface-hub>=0.7.0",
# Utilities from PyPA to e.g., compare versions
"packaging",
]
TEMPLATE_REQUIRE = [
# to populate metric template
"cookiecutter",
# for the gradio widget
"gradio>=3.0.0"
]
EVALUATOR_REQUIRE = [
"transformers",
# for bootstrap computations in Evaluator
"scipy>=1.7.1",
]
TESTS_REQUIRE = [
# test dependencies
"absl-py",
"charcut>=1.1.1", # for charcut_mt
"cer>=1.2.0", # for characTER
"nltk", # for NIST and probably others
"pytest",
"pytest-datadir",
"pytest-xdist",
# optional dependencies
"tensorflow>=2.3,!=2.6.0,!=2.6.1, <=2.10",
"torch",
# metrics dependencies
"accelerate", # for frugalscore (calls transformers' Trainer)
"bert_score>=0.3.6",
"rouge_score>=0.1.2",
"sacrebleu",
"sacremoses",
"scipy>=1.10.0",
"seqeval",
"scikit-learn",
"jiwer",
"sentencepiece", # for bleurt
"transformers", # for evaluator
"mauve-text",
"trectools",
# to speed up pip backtracking
"toml>=0.10.1",
"requests_file>=1.5.1",
"tldextract>=3.1.0",
"texttable>=1.6.3",
"unidecode>=1.3.4",
"Werkzeug>=1.0.1",
"six~=1.15.0",
]
QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0", "pyyaml>=5.3.1"]
EXTRAS_REQUIRE = {
"tensorflow": ["tensorflow>=2.2.0,!=2.6.0,!=2.6.1"],
"tensorflow_gpu": ["tensorflow-gpu>=2.2.0,!=2.6.0,!=2.6.1"],
"torch": ["torch"],
"dev": TESTS_REQUIRE + QUALITY_REQUIRE,
"tests": TESTS_REQUIRE,
"quality": QUALITY_REQUIRE,
"docs": [
# Might need to add doc-builder and some specific deps in the future
"s3fs",
],
"template": TEMPLATE_REQUIRE,
"evaluator": EVALUATOR_REQUIRE
}
setup(
name="evaluate",
version="0.4.2", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
description="HuggingFace community-driven open-source library of evaluation",
long_description=open("README.md", encoding="utf-8").read(),
long_description_content_type="text/markdown",
author="HuggingFace Inc.",
author_email="leandro@huggingface.co",
url="https://github.com/huggingface/evaluate",
download_url="https://github.com/huggingface/evaluate/tags",
license="Apache 2.0",
package_dir={"": "src"},
packages=find_packages("src"),
entry_points={"console_scripts": ["evaluate-cli=evaluate.commands.evaluate_cli:main"]},
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
python_requires=">=3.8.0",
classifiers=[
"Development Status :: 5 - Production/Stable",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
keywords="metrics machine learning evaluate evaluation",
zip_safe=False, # Required for mypy to find the py.typed file
)
# flake8: noqa
# Copyright 2020 The HuggingFace Evaluate Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
# pylint: enable=line-too-long
# pylint: disable=g-import-not-at-top,g-bad-import-order,wrong-import-position
__version__ = "0.4.2"
from packaging import version
SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__
del version
from .evaluation_suite import EvaluationSuite
from .evaluator import (
AudioClassificationEvaluator,
AutomaticSpeechRecognitionEvaluator,
Evaluator,
ImageClassificationEvaluator,
QuestionAnsweringEvaluator,
SummarizationEvaluator,
Text2TextGenerationEvaluator,
TextClassificationEvaluator,
TextGenerationEvaluator,
TokenClassificationEvaluator,
TranslationEvaluator,
evaluator,
)
from .hub import push_to_hub
from .info import ComparisonInfo, EvaluationModuleInfo, MeasurementInfo, MetricInfo
from .inspect import inspect_evaluation_module, list_evaluation_modules
from .loading import load
from .module import CombinedEvaluations, Comparison, EvaluationModule, Measurement, Metric, combine
from .saving import save
from .utils import *
from .utils import gradio, logging
import argparse
import os
import subprocess
from pathlib import Path
from cookiecutter.main import cookiecutter
from huggingface_hub import HfApi, Repository, create_repo
from evaluate.utils.logging import get_logger
logger = get_logger(__name__)
INSTRUCTIONS = """\
A new repository for your module "{module_name}" of type "{module_type}" has been created at {output_dir} and pushed to the Hugging Face Hub: {repo_url}.
Here are the next steps:
- implement the module logic in {module_slug}/{module_slug}.py
- document your module in {module_slug}/README.md
- add test cases for your module in {module_slug}/tests.py
- if your module has any dependencies, update them in {module_slug}/requirements.txt
You can test your module's widget locally by running:
```
python {output_dir}/{module_slug}/app.py
```
When you are happy with your changes, you can push them to the Hugging Face Hub with the following commands:
```
cd {output_dir}/{module_slug}
git add .
git commit -m "Updating module"
git push
```
You should then see the updated widget on the Hugging Face Hub: {repo_url}
And you can load your module in Python with the following code:
```
from evaluate import load
module = load("{namespace}/{module_slug}")
```
"""
def main():
parser = argparse.ArgumentParser("HuggingFace Evaluate CLI tool", usage="evaluate-cli <command> [<args>]")
subparsers = parser.add_subparsers()
parser_create = subparsers.add_parser("create", help="Create new evaluation module.")
parser_create.add_argument(
"module_name", type=str, help='Pretty name of new evaluation module, e.g. "Recall" or "Exact Match".'
)
parser_create.add_argument(
"--module_type",
default="metric",
type=str,
help="Type of module, has to be one of [metric|comparison|measurement].",
)
parser_create.add_argument(
"--dataset_name", default="", type=str, help="Name of dataset if evaluation module is dataset specific."
)
parser_create.add_argument("--module_description", type=str, help="Short description of evaluation module.")
parser_create.add_argument("--output_dir", default=Path.cwd(), type=str, help="Path to output directory.")
parser_create.add_argument(
"--organization", default=None, type=str, help="Organization on the Hub to push evaluation module to."
)
parser_create.add_argument("--private", action="store_true", help="Sets evaluation module repository to private.")
args = vars(parser.parse_args())
if args["module_type"] not in ["metric", "comparison", "measurement"]:
raise ValueError("The module_type needs to be one of metric, comparison, or measurement")
if "-" in args["module_name"]:
raise ValueError("Hyphens ('-') are not allowed in module names.")
output_dir = Path(args["output_dir"])
organization = args["organization"]
module_slug = args["module_name"].lower().replace(" ", "_")
if organization is None:
hfapi = HfApi()
namespace = hfapi.whoami()["name"]
else:
namespace = organization
args["namespace"] = namespace
repo_url = f"https://huggingface.co/spaces/{namespace}/{module_slug}"
try:
create_repo(namespace + "/" + module_slug, repo_type="space", space_sdk="gradio", private=args["private"])
except Exception as exception:
logger.error(
f"Could not create Space for module at hf.co/spaces/{namespace}/{module_slug}. Make sure this space does not exist already."
)
raise exception
subprocess.run(
f"git clone {repo_url}".split(),
stderr=subprocess.PIPE,
stdout=subprocess.PIPE,
check=True,
encoding="utf-8",
cwd=output_dir,
env=os.environ.copy(),
)
repo = Repository(
local_dir=output_dir / module_slug,
)
cookiecutter(
"https://github.com/huggingface/evaluate/",
directory="templates",
no_input=True,
extra_context=args,
output_dir=output_dir,
overwrite_if_exists=True,
)
repo.git_add()
repo.git_commit("add module default template")
repo.git_push()
print(
INSTRUCTIONS.format(
module_name=args["module_name"],
module_type=args["module_type"],
module_slug=module_slug,
namespace=namespace,
repo_url=repo_url,
output_dir=output_dir,
)
)
if __name__ == "__main__":
main()
import importlib
import os
import platform
from pathlib import Path
from packaging import version
from .utils.logging import get_logger
logger = get_logger(__name__)
# Metrics
S3_METRICS_BUCKET_PREFIX = "https://s3.amazonaws.com/datasets.huggingface.co/datasets/metrics"
CLOUDFRONT_METRICS_DISTRIB_PREFIX = "https://cdn-datasets.huggingface.co/datasets/metric"
REPO_METRICS_URL = "https://raw.githubusercontent.com/huggingface/evaluate/{revision}/metrics/{path}/{name}"
REPO_MEASUREMENTS_URL = "https://raw.githubusercontent.com/huggingface/evaluate/{revision}/measurements/{path}/{name}"
REPO_COMPARISONS_URL = "https://raw.githubusercontent.com/huggingface/evaluate/{revision}/comparisons/{path}/{name}"
# Evaluation module types
EVALUATION_MODULE_TYPES = ["metric", "comparison", "measurement"]
# Hub
HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
HF_LIST_ENDPOINT = HF_ENDPOINT + "/api/spaces?filter={type}"
HUB_EVALUATE_URL = HF_ENDPOINT + "/spaces/{path}/resolve/{revision}/{name}"
HUB_DEFAULT_VERSION = "main"
PY_VERSION = version.parse(platform.python_version())
if PY_VERSION < version.parse("3.8"):
import importlib_metadata
else:
import importlib.metadata as importlib_metadata
# General environment variables accepted values for booleans
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
# Imports
PANDAS_VERSION = version.parse(importlib_metadata.version("pandas"))
PYARROW_VERSION = version.parse(importlib_metadata.version("pyarrow"))
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
USE_JAX = os.environ.get("USE_JAX", "AUTO").upper()
TORCH_VERSION = "N/A"
TORCH_AVAILABLE = False
if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None
if TORCH_AVAILABLE:
try:
TORCH_VERSION = version.parse(importlib_metadata.version("torch"))
logger.info(f"PyTorch version {TORCH_VERSION} available.")
except importlib_metadata.PackageNotFoundError:
pass
else:
logger.info("Disabling PyTorch because USE_TF is set")
TF_VERSION = "N/A"
TF_AVAILABLE = False
if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
TF_AVAILABLE = importlib.util.find_spec("tensorflow") is not None
if TF_AVAILABLE:
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
for package in [
"tensorflow",
"tensorflow-cpu",
"tensorflow-gpu",
"tf-nightly",
"tf-nightly-cpu",
"tf-nightly-gpu",
"intel-tensorflow",
"tensorflow-rocm",
"tensorflow-macos",
]:
try:
TF_VERSION = version.parse(importlib_metadata.version(package))
except importlib_metadata.PackageNotFoundError:
continue
else:
break
else:
TF_AVAILABLE = False
if TF_AVAILABLE:
if TF_VERSION.major < 2:
logger.info(f"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.")
TF_AVAILABLE = False
else:
logger.info(f"TensorFlow version {TF_VERSION} available.")
else:
logger.info("Disabling Tensorflow because USE_TORCH is set")
JAX_VERSION = "N/A"
JAX_AVAILABLE = False
if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
JAX_AVAILABLE = importlib.util.find_spec("jax") is not None
if JAX_AVAILABLE:
try:
JAX_VERSION = version.parse(importlib_metadata.version("jax"))
logger.info(f"JAX version {JAX_VERSION} available.")
except importlib_metadata.PackageNotFoundError:
pass
else:
logger.info("Disabling JAX because USE_JAX is set to False")
# Cache location
DEFAULT_XDG_CACHE_HOME = "~/.cache"
XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))
DEFAULT_HF_EVALUATE_CACHE = os.path.join(HF_CACHE_HOME, "evaluate")
HF_EVALUATE_CACHE = Path(os.getenv("HF_EVALUATE_CACHE", DEFAULT_HF_EVALUATE_CACHE))
DEFAULT_HF_METRICS_CACHE = os.path.join(HF_CACHE_HOME, "metrics")
HF_METRICS_CACHE = Path(os.getenv("HF_METRICS_CACHE", DEFAULT_HF_METRICS_CACHE))
DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules")
HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE))
DOWNLOADED_DATASETS_DIR = "downloads"
DEFAULT_DOWNLOADED_EVALUATE_PATH = os.path.join(HF_EVALUATE_CACHE, DOWNLOADED_DATASETS_DIR)
DOWNLOADED_EVALUATE_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_EVALUATE_PATH", DEFAULT_DOWNLOADED_EVALUATE_PATH))
EXTRACTED_EVALUATE_DIR = "extracted"
DEFAULT_EXTRACTED_EVALUATE_PATH = os.path.join(DEFAULT_DOWNLOADED_EVALUATE_PATH, EXTRACTED_EVALUATE_DIR)
EXTRACTED_EVALUATE_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_EVALUATE_PATH", DEFAULT_EXTRACTED_EVALUATE_PATH))
# Download count for the website
HF_UPDATE_DOWNLOAD_COUNTS = (
os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
)
# Offline mode
HF_EVALUATE_OFFLINE = os.environ.get("HF_EVALUATE_OFFLINE", "AUTO").upper() in ENV_VARS_TRUE_VALUES
# File names
LICENSE_FILENAME = "LICENSE"
METRIC_INFO_FILENAME = "metric_info.json"
DATASETDICT_JSON_FILENAME = "dataset_dict.json"
MODULE_NAME_FOR_DYNAMIC_MODULES = "evaluate_modules"
HF_HUB_ALLOWED_TASKS = [
"image-classification",
"translation",
"image-segmentation",
"fill-mask",
"automatic-speech-recognition",
"token-classification",
"sentence-similarity",
"audio-classification",
"question-answering",
"summarization",
"zero-shot-classification",
"table-to-text",
"feature-extraction",
"other",
"multiple-choice",
"text-classification",
"text-to-image",
"text2text-generation",
"zero-shot-image-classification",
"tabular-classification",
"tabular-regression",
"image-to-image",
"tabular-to-text",
"unconditional-image-generation",
"text-retrieval",
"text-to-speech",
"object-detection",
"audio-to-audio",
"text-generation",
"conversational",
"table-question-answering",
"visual-question-answering",
"image-to-text",
"reinforcement-learning",
"voice-activity-detection",
"time-series-forecasting",
"document-question-answering",
]
import importlib
import inspect
from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Dict, Optional, Union
from datasets import Dataset, DownloadConfig, DownloadMode, load_dataset
from datasets.utils.version import Version
from ..evaluator import evaluator
from ..loading import evaluation_module_factory
from ..utils.logging import get_logger
logger = get_logger(__name__)
@dataclass
class SubTask:
task_type: str
data: Optional[Union[str, Dataset]] = None
subset: Optional[str] = None
split: Optional[str] = None
data_preprocessor: Optional[Callable] = None
args_for_task: Optional[dict] = None
def __post_init__(self):
if type(self.task_type) is not str:
raise ValueError(f"'task_type' must be type 'str', got {type(self.task_type)}")
if type(self.data) not in [Dataset, str]:
raise ValueError(
f"'data' must be an already-instantiated Dataset object or type 'str', got {type(self.data)}"
)
if self.subset and type(self.subset) is not str:
raise ValueError(f"'subset' must be type 'str', got {type(self.subset)}")
if self.split and type(self.split) is not str:
raise ValueError(f"'split' must be type 'str', got {type(self.split)}")
if self.data_preprocessor and not callable(self.data_preprocessor):
raise ValueError(f"'data_preprocessor' must be a Callable', got {self.data_preprocessor}")
if self.args_for_task and type(self.args_for_task) is not dict:
raise ValueError(f"'args_for_task' must be type 'dict', got {type(self.args_for_task)}")
def import_main_class(module_path):
"""Import a module at module_path and return the EvaluationSuite class"""
module = importlib.import_module(module_path)
module_main_cls = None
for name, obj in module.__dict__.items():
if isinstance(obj, type) and obj.__name__ == "Suite":
if inspect.isabstract(obj):
continue
module_main_cls = obj
break
return module_main_cls
class EvaluationSuite:
"""
This class instantiates an evaluation suite made up of multiple tasks, where each task consists of a dataset and
an associated metric, and runs evaluation on a model or pipeline. Evaluation suites can be a Python script found
either locally or uploaded as a Space on the Hugging Face Hub.
Usage:
```python
from evaluate import EvaluationSuite
suite = EvaluationSuite.load("evaluate/evaluation-suite-ci")
results = suite.run("lvwerra/distilbert-imdb")
```
"""
def __init__(self, name):
self.name = name
@staticmethod
def load(
path: str,
download_mode: Optional[DownloadMode] = None,
revision: Optional[Union[str, Version]] = None,
download_config: Optional[DownloadConfig] = None,
):
download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
evaluation_module = evaluation_module_factory(
path, module_type=None, revision=revision, download_config=download_config, download_mode=download_mode
)
name = Path(path).stem
evaluation_cls = import_main_class(evaluation_module.module_path)
evaluation_instance = evaluation_cls(name)
return evaluation_instance
def __repr__(self):
self.tasks = [str(task) for task in self.suite]
return f'EvaluationSuite name: "{self.name}", ' f"Tasks: {self.tasks})"
def assert_suite_nonempty(self):
if not self.suite:
raise ValueError(
"No evaluation tasks found. The EvaluationSuite must include at least one SubTask definition."
)
def run(
self, model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"] # noqa: F821
) -> Dict[str, float]:
self.assert_suite_nonempty()
results_all = []
for task in self.suite:
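# for each SubTask: optionally preprocess its dataset, build the matching task evaluator,
# and forward the task's stored arguments to compute()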
task_name = task.data
if task.data_preprocessor: # task requires extra preprocessing
ds = load_dataset(task.data, name=task.subset, split=task.split)
task.data = ds.map(task.data_preprocessor)
task_evaluator = evaluator(task.task_type)
args_for_task = task.args_for_task
args_for_task["model_or_pipeline"] = model_or_pipeline
args_for_task["data"] = task.data
args_for_task["subset"] = task.subset
args_for_task["split"] = task.split
results = task_evaluator.compute(**args_for_task)
results["task_name"] = task_name + "/" + task.subset if task.subset else task_name
results["data_preprocessor"] = str(task.data_preprocessor) if task.data_preprocessor is not None else None
results_all.append(results)
return results_all
# Copyright 2022 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
try:
from transformers.pipelines import SUPPORTED_TASKS as SUPPORTED_PIPELINE_TASKS
from transformers.pipelines import TASK_ALIASES
from transformers.pipelines import check_task as check_pipeline_task
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
from typing import Dict, List
from .audio_classification import AudioClassificationEvaluator
from .automatic_speech_recognition import AutomaticSpeechRecognitionEvaluator
from .base import Evaluator
from .image_classification import ImageClassificationEvaluator
from .question_answering import QuestionAnsweringEvaluator
from .text2text_generation import SummarizationEvaluator, Text2TextGenerationEvaluator, TranslationEvaluator
from .text_classification import TextClassificationEvaluator
from .text_generation import TextGenerationEvaluator
from .token_classification import TokenClassificationEvaluator
SUPPORTED_EVALUATOR_TASKS = {
"text-classification": {
"implementation": TextClassificationEvaluator,
"default_metric_name": "accuracy",
},
"image-classification": {
"implementation": ImageClassificationEvaluator,
"default_metric_name": "accuracy",
},
"question-answering": {
"implementation": QuestionAnsweringEvaluator,
"default_metric_name": "squad",
},
"token-classification": {
"implementation": TokenClassificationEvaluator,
"default_metric_name": "seqeval",
},
"text-generation": {
"implementation": TextGenerationEvaluator,
"default_metric_name": "word_count",
},
"text2text-generation": {
"implementation": Text2TextGenerationEvaluator,
"default_metric_name": "bleu",
},
"summarization": {
"implementation": SummarizationEvaluator,
"default_metric_name": "rouge",
},
"translation": {
"implementation": TranslationEvaluator,
"default_metric_name": "bleu",
},
"automatic-speech-recognition": {
"implementation": AutomaticSpeechRecognitionEvaluator,
"default_metric_name": "wer",
},
"audio-classification": {
"implementation": AudioClassificationEvaluator,
"default_metric_name": "accuracy",
},
}
def get_supported_tasks() -> List[str]:
"""
Returns a list of supported task strings.
"""
return list(SUPPORTED_EVALUATOR_TASKS.keys())
def check_task(task: str) -> Dict:
"""
Checks an incoming task string to validate it is correct and returns the default Evaluator class and default metric
name. It first performs a check to validate that the string is a valid `Pipeline` task, then it checks if it is a
valid `Evaluator` task. `Evaluator` tasks are a subset of `Pipeline` tasks.
Args:
task (`str`):
The task defining which evaluator will be returned. Currently accepted tasks are:
- `"image-classification"`
- `"question-answering"`
- `"text-classification"` (alias `"sentiment-analysis"` available)
- `"token-classification"`
Returns:
task_defaults: `dict`, contains the implementation class of a given Evaluator and the default metric name.
"""
if task in TASK_ALIASES:
task = TASK_ALIASES[task]
if not check_pipeline_task(task):
raise KeyError(f"Unknown task {task}, available tasks are: {get_supported_tasks()}.")
if task in SUPPORTED_EVALUATOR_TASKS.keys() and task in SUPPORTED_PIPELINE_TASKS.keys():
return SUPPORTED_EVALUATOR_TASKS[task]
raise KeyError(f"Unknown task {task}, available tasks are: {get_supported_tasks()}.")
def evaluator(task: str = None) -> Evaluator:
"""
Utility factory method to build an [`Evaluator`].
Evaluators encapsulate a task and a default metric name. They leverage `pipeline` functionality from `transformers`
to simplify the evaluation of multiple combinations of models, datasets and metrics for a given task.
Args:
task (`str`):
The task defining which evaluator will be returned. Currently accepted tasks are:
- `"image-classification"`: will return a [`ImageClassificationEvaluator`].
- `"question-answering"`: will return a [`QuestionAnsweringEvaluator`].
- `"text-classification"` (alias `"sentiment-analysis"` available): will return a [`TextClassificationEvaluator`].
- `"token-classification"`: will return a [`TokenClassificationEvaluator`].
Returns:
[`Evaluator`]: An evaluator suitable for the task.
Examples:
```python
>>> from evaluate import evaluator
>>> # Sentiment analysis evaluator
>>> evaluator("sentiment-analysis")
```"""
if not TRANSFORMERS_AVAILABLE:
raise ImportError(
"If you want to use the `Evaluator` you need `transformers`. Run `pip install evaluate[transformers]`."
)
targeted_task = check_task(task)
evaluator_class = targeted_task["implementation"]
default_metric_name = targeted_task["default_metric_name"]
return evaluator_class(task=task, default_metric_name=default_metric_name)
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from numbers import Number
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
from datasets import Dataset
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_end_docstrings, add_start_docstrings
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
TASK_DOCUMENTATION = r"""
Examples:
<Tip>
Remember that, in order to process audio files, you need ffmpeg installed (https://ffmpeg.org/download.html)
</Tip>
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("audio-classification")
>>> data = load_dataset("superb", 'ks', split="test[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="superb/wav2vec2-base-superb-ks",
>>> data=data,
>>> label_column="label",
>>> input_column="file",
>>> metric="accuracy",
>>> label_mapping={0: "yes", 1: "no", 2: "up", 3: "down"}
>>> )
```
<Tip>
The evaluator supports raw audio data as well, in the form of a numpy array. However, be aware that accessing
the audio column automatically decodes and resamples the audio files, which can be slow for large datasets.
</Tip>
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("audio-classification")
>>> data = load_dataset("superb", 'ks', split="test[:40]")
>>> data = data.map(lambda example: {"audio": example["audio"]["array"]})
>>> results = task_evaluator.compute(
>>> model_or_pipeline="superb/wav2vec2-base-superb-ks",
>>> data=data,
>>> label_column="label",
>>> input_column="audio",
>>> metric="accuracy",
>>> label_mapping={0: "yes", 1: "no", 2: "up", 3: "down"}
>>> )
```
"""
class AudioClassificationEvaluator(Evaluator):
"""
Audio classification evaluator.
This audio classification evaluator can currently be loaded from [`evaluator`] using the default task name
`audio-classification`.
Methods in this class assume a data format compatible with the [`transformers.AudioClassificationPipeline`].
"""
PIPELINE_KWARGS = {}
def __init__(self, task="audio-classification", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def predictions_processor(self, predictions, label_mapping):
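# keep the highest-scoring label for each example, then map pipeline label names to dataset label ids
# when a label_mapping is provided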
pred_label = [max(pred, key=lambda x: x["score"])["label"] for pred in predictions]
pred_label = [label_mapping[pred] if label_mapping is not None else pred for pred in pred_label]
return {"predictions": pred_label}
@add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
@add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "file",
label_column: str = "label",
label_mapping: Optional[Dict[str, Number]] = None,
) -> Tuple[Dict[str, float], Any]:
"""
input_column (`str`, defaults to `"file"`):
The name of the column containing either the audio files or a raw waveform, represented as a numpy array, in the dataset specified by `data`.
label_column (`str`, defaults to `"label"`):
The name of the column containing the labels in the dataset specified by `data`.
label_mapping (`Dict[str, Number]`, *optional*, defaults to `None`):
We want to map class labels defined by the model in the pipeline to values consistent with those
defined in the `label_column` of the `data` dataset.
"""
result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
subset=subset,
split=split,
metric=metric,
tokenizer=tokenizer,
feature_extractor=feature_extractor,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
device=device,
random_state=random_state,
input_column=input_column,
label_column=label_column,
label_mapping=label_mapping,
)
return result
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
from datasets import Dataset
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_end_docstrings, add_start_docstrings
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
TASK_DOCUMENTATION = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("automatic-speech-recognition")
>>> data = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="https://huggingface.co/openai/whisper-tiny.en",
>>> data=data,
>>> input_column="path",
>>> label_column="sentence",
>>> metric="wer",
>>> )
```
"""
class AutomaticSpeechRecognitionEvaluator(Evaluator):
"""
Automatic speech recognition evaluator.
This automatic speech recognition evaluator can currently be loaded from [`evaluator`] using the default task name
`automatic-speech-recognition`.
Methods in this class assume a data format compatible with the [`AutomaticSpeechRecognitionPipeline`].
"""
PIPELINE_KWARGS = {"truncation": True}
def __init__(self, task="automatic-speech-recognition", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def predictions_processor(self, predictions, label_mapping):
return {"predictions": [pred["text"] for pred in predictions]}
@add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
@add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "path",
label_column: str = "sentence",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
"""
input_column (`str`, defaults to `"path"`):
the name of the column containing the input audio path in the dataset specified by `data`.
label_column (`str`, defaults to `"sentence"`):
the name of the column containing the labels in the dataset specified by `data`.
generation_kwargs (`Dict`, *optional*, defaults to `None`):
The generation kwargs are passed to the pipeline and set the text generation strategy.
"""
if generation_kwargs is not None:
self.PIPELINE_KWARGS.update(generation_kwargs)
result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
subset=subset,
split=split,
metric=metric,
tokenizer=tokenizer,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
device=device,
random_state=random_state,
input_column=input_column,
label_column=label_column,
)
return result
# Copyright 2022 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod
from numbers import Number
from typing import Any, Callable, Dict, List, Optional, Union
# Lint as: python3
from datasets import Dataset, load_dataset
from evaluate.evaluator.utils import choose_split
try:
from scipy.stats import bootstrap
SCIPY_AVAILABLE = True
except ImportError:
SCIPY_AVAILABLE = False
try:
import transformers
from transformers import Pipeline, pipeline
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
from time import perf_counter
from typing_extensions import Literal
from ..loading import load
from ..module import EvaluationModule
from ..utils.logging import get_logger
from .utils import DatasetColumn
logger = get_logger(__name__)
EVALUTOR_COMPUTE_START_DOCSTRING = r"""
Compute the metric for a given pipeline and dataset combination.
Args:
model_or_pipeline (`str` or `Pipeline` or `Callable` or `PreTrainedModel` or `TFPreTrainedModel`, defaults to `None`):
If the argument is not specified, we initialize the default pipeline for the task (in this case
`text-classification` or its alias - `sentiment-analysis`). If the argument is of the type `str` or
is a model instance, we use it to initialize a new `Pipeline` with the given model. Otherwise we assume the
argument specifies a pre-initialized pipeline.
data (`str` or `Dataset`, defaults to `None`):
Specifies the dataset we will run evaluation on. If it is of type `str`, we treat it as the dataset
name, and load it. Otherwise we assume it represents a pre-loaded dataset.
subset (`str`, defaults to `None`):
Defines which dataset subset to load. If `None` is passed the default subset is loaded.
split (`str`, defaults to `None`):
Defines which dataset split to load. If `None` is passed, infers based on the `choose_split` function.
metric (`str` or `EvaluationModule`, defaults to `None`):
Specifies the metric we use in the evaluator. If it is of type `str`, we treat it as the metric name, and
load it. Otherwise we assume it represents a pre-loaded metric.
tokenizer (`str` or `PreTrainedTokenizer`, *optional*, defaults to `None`):
Argument can be used to overwrite a default tokenizer if `model_or_pipeline` represents a model for
which we build a pipeline. If `model_or_pipeline` is `None` or a pre-initialized pipeline, we ignore
this argument.
strategy (`Literal["simple", "bootstrap"]`, defaults to "simple"):
Specifies the evaluation strategy. Possible values are:
- `"simple"` - we evaluate the metric and return the scores.
- `"bootstrap"` - on top of computing the metric scores, we calculate the confidence interval for each
of the returned metric keys, using `scipy`'s `bootstrap` method
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html.
confidence_level (`float`, defaults to `0.95`):
The `confidence_level` value passed to `bootstrap` if `"bootstrap"` strategy is chosen.
n_resamples (`int`, defaults to `9999`):
The `n_resamples` value passed to `bootstrap` if `"bootstrap"` strategy is chosen.
device (`int`, defaults to `None`):
Device ordinal for CPU/GPU support of the pipeline. Setting this to -1 will run on CPU, while a non-negative
integer will run the model on the associated CUDA device ID. If `None` is provided it will be inferred and
CUDA:0 used if available, CPU otherwise.
random_state (`int`, *optional*, defaults to `None`):
The `random_state` value passed to `bootstrap` if `"bootstrap"` strategy is chosen. Useful for
debugging.
"""
EVALUATOR_COMPUTE_RETURN_DOCSTRING = r"""
Return:
A `Dict`. The keys represent metric keys calculated for the `metric` specified in the function arguments. For the
`"simple"` strategy, the value is the metric score. For the `"bootstrap"` strategy, the value is a `Dict`
containing the score, the confidence interval and the standard error calculated for each metric key.
"""
class Evaluator(ABC):
"""
The [`Evaluator`] class is the base class from which all evaluators inherit. It implements the operations
shared across the different evaluators; refer to this class for methods common to all of them.
"""
PIPELINE_KWARGS = {}
METRIC_KWARGS = {}
def __init__(self, task: str, default_metric_name: str = None):
if not TRANSFORMERS_AVAILABLE:
raise ImportError(
"If you want to use the `Evaluator` you need `transformers`. Run `pip install evaluate[evaluator]`."
)
if not SCIPY_AVAILABLE:
raise ImportError(
"If you want to use the `Evaluator` you need `scipy>=1.7.1`. Run `pip install evaluate[evaluator]`."
)
self.task = task
self.default_metric_name = default_metric_name
@staticmethod
def _compute_confidence_interval(
metric,
metric_inputs,
metric_keys: List[str],
confidence_level: float = 0.95,
n_resamples: int = 9999,
random_state: Optional[int] = None,
) -> Dict[str, Any]:
"""
A utility function enabling the confidence interval calculation for metrics computed
by the evaluator based on `scipy`'s `bootstrap` method.
"""
# bootstrap only works with functions that use args and no kwargs
def build_args_metric(metric, key, **kwargs):
def args_metric(*args):
return metric.compute(**{k: v for k, v in zip(kwargs.keys(), args)})[key]
return args_metric
bootstrap_dict = {}
for key in metric_keys:
bs = bootstrap(
data=list(metric_inputs.values()),
statistic=build_args_metric(metric, key, **metric_inputs),
paired=True,
vectorized=False,
confidence_level=confidence_level,
n_resamples=n_resamples,
random_state=random_state,
)
bootstrap_dict[key] = {
"confidence_interval": (bs.confidence_interval.low, bs.confidence_interval.high),
"standard_error": bs.standard_error,
}
return bootstrap_dict
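# Why the wrapper above builds a positional-argument statistic: `scipy.stats.bootstrap`
# resamples the sequences in `data` and calls `statistic(*resampled)` positionally, while
# `EvaluationModule.compute` only accepts keyword arguments. A standalone sketch of the
# same adaptation, using the stock "accuracy" metric and made-up labels:
import evaluate
from scipy.stats import bootstrap

accuracy = evaluate.load("accuracy")
metric_inputs = {"references": [0, 1, 1, 0, 1, 1], "predictions": [0, 1, 0, 0, 1, 1]}

def accuracy_statistic(*args):
    # zip the positional resamples back onto the original keyword names
    return accuracy.compute(**dict(zip(metric_inputs.keys(), args)))["accuracy"]

bs = bootstrap(
    data=list(metric_inputs.values()),
    statistic=accuracy_statistic,
    paired=True,
    vectorized=False,
    n_resamples=999,
)
# (bs.confidence_interval.low, bs.confidence_interval.high) brackets the accuracy score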
@staticmethod
def _compute_time_perf(start_time: float, end_time: float, num_samples: int) -> Dict[str, Any]:
"""
A utility function computing time performance metrics:
- `total_time_in_seconds` - pipeline inference runtime for the evaluation data in seconds,
- `samples_per_second` - pipeline throughput in number of samples per second,
- `latency_in_seconds` - pipeline inference runtime in seconds per sample.
"""
latency = end_time - start_time
throughput = num_samples / latency
latency_sample = 1.0 / throughput
return {
"total_time_in_seconds": latency,
"samples_per_second": throughput,
"latency_in_seconds": latency_sample,
}
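# Worked example for the numbers above (illustrative values): if the pipeline processed
# num_samples=200 items and end_time - start_time == 8.0 seconds, the method returns
#   {"total_time_in_seconds": 8.0,
#    "samples_per_second": 25.0,    # 200 / 8.0
#    "latency_in_seconds": 0.04}    # 8.0 / 200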
@staticmethod
def _infer_device() -> int:
"""Helper function to check if GPU or CPU is available for inference."""
# try infer with torch first
try:
import torch
if torch.cuda.is_available():
device = 0 # first GPU
else:
device = -1 # CPU
except ImportError:
# if not available try TF
try:
import tensorflow as tf
if len(tf.config.list_physical_devices("GPU")) > 0:
device = 0 # first GPU
else:
device = -1 # CPU
except ImportError:
device = -1
if device == -1:
logger.info("No GPU found. The default device for pipeline inference is set to CPU.")
else:
logger.info("GPU found. The default device for pipeline inference is set to GPU (CUDA:0).")
return device
@abstractmethod
def predictions_processor(self, *args, **kwargs):
"""
A core method of the `Evaluator` class, which processes the pipeline outputs for compatibility with the metric.
"""
raise NotImplementedError()
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "text",
label_column: str = "label",
label_mapping: Optional[Dict[str, Number]] = None,
) -> Dict[str, float]:
result = {}
self.check_for_mismatch_in_device_setup(device, model_or_pipeline)
# Prepare inputs
data = self.load_data(data=data, subset=subset, split=split)
metric_inputs, pipe_inputs = self.prepare_data(data=data, input_column=input_column, label_column=label_column)
pipe = self.prepare_pipeline(
model_or_pipeline=model_or_pipeline,
tokenizer=tokenizer,
feature_extractor=feature_extractor,
device=device,
)
metric = self.prepare_metric(metric)
# Compute predictions
predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
predictions = self.predictions_processor(predictions, label_mapping)
metric_inputs.update(predictions)
# Compute metrics from references and predictions
metric_results = self.compute_metric(
metric=metric,
metric_inputs=metric_inputs,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
random_state=random_state,
)
# TODO: To clarify why `wer` and `cer` return float
# even though metric.compute contract says that it
# returns Optional[dict].
if type(metric_results) is float:
metric_results = {metric.name: metric_results}
result.update(metric_results)
result.update(perf_results)
return result
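# A sketch of how this generic compute flow is typically driven through a concrete
# subclass (here the text-classification evaluator); the checkpoint, dataset slice and
# label mapping below are illustrative choices, not mandated by this module.
from datasets import load_dataset
from evaluate import evaluator

task_evaluator = evaluator("text-classification")
eval_results = task_evaluator.compute(
    model_or_pipeline="distilbert-base-uncased-finetuned-sst-2-english",
    data=load_dataset("rotten_tomatoes", split="test[:100]"),
    metric="accuracy",
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},  # pipeline labels -> dataset label ids
)
# eval_results holds the metric score plus the timing keys added from `_compute_time_perf`.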
@staticmethod
def check_for_mismatch_in_device_setup(device, model_or_pipeline):
if device is not None and device != -1 and isinstance(model_or_pipeline, Pipeline):
if model_or_pipeline.device.type == "cpu":
raise ValueError(
"The value of the `device` kwarg passed to `compute` suggests that this pipe should be run on an "
"accelerator, but the pipe was instantiated on CPU. Pass `device` to the pipeline during "
"initialization to use an accelerator, or pass `device=None` to `compute`. "
)
elif device != model_or_pipeline.device.index:
raise ValueError(
f"This pipeline was instantiated on device {model_or_pipeline.device.index} but device={device} was passed to `compute`."
)
def check_required_columns(self, data: Union[str, Dataset], columns_names: Dict[str, str]):
"""
Ensure the columns required for the evaluation are present in the dataset.
Args:
data (`str` or [`Dataset`]):
Specifies the dataset we will run evaluation on.
columns_names (`Dict[str, str]`):
Dictionary of column names to check in the dataset. The keys are the arguments to the [`evaluate.EvaluationModule.compute`] method,
while the values are the column names to check.
Example:
```py
>>> from datasets import load_dataset
>>> from evaluate import evaluator
>>> data = load_dataset("rotten_tomatoes', split="train")
>>> evaluator.check_required_columns(data, {"input_column": "text", "label_column": "label"})
```
"""
for input_name, column_name in columns_names.items():
if column_name not in data.column_names:
raise ValueError(
f"Invalid `{input_name}` {column_name} specified. The dataset contains the following columns: {data.column_names}."
)
@staticmethod
def get_dataset_split(data, subset=None, split=None):
"""
Infers which split to use if `None` is given.
Args:
data (`str`):
Name of dataset.
subset (`str`):
Name of config for datasets with multiple configurations (e.g. 'glue/cola').
split (`str`, defaults to `None`):
Split to use.
Returns:
`split`: `str` containing which split to use
Example:
```py
>>> from evaluate import evaluator
>>> evaluator("text-classification").get_dataset_split(data="rotten_tomatoes")
WARNING:evaluate.evaluator.base:Dataset split not defined! Automatically evaluating with split: TEST
'test'
```
"""
if split is None:
split = choose_split(data, subset)
logger.warning(f"Dataset split not defined! Automatically evaluating with split: {split.upper()}")
return split
def load_data(self, data: Union[str, Dataset], subset: str = None, split: str = None):
"""
Load dataset with given subset and split.
Args:
data ([`Dataset`] or `str`, defaults to `None`):
Specifies the dataset we will run evaluation on. If it is of
type `str`, we treat it as the dataset name, and load it. Otherwise we assume it represents a pre-loaded dataset.
subset (`str`, defaults to `None`):
Specifies dataset subset to be passed to `name` in `load_dataset`. To be
used with datasets with several configurations (e.g. glue/sst2).
split (`str`, defaults to `None`):
User-defined dataset split by name (e.g. train, validation, test). Supports slice-split (`test[:n]`).
If not defined and data is a `str` type, will automatically select the best one via `choose_split()`.
Returns:
data ([`Dataset`]): Loaded dataset which will be used for evaluation.
Example:
```py
>>> from evaluate import evaluator
>>> evaluator("text-classification").load_data(data="rotten_tomatoes", split="train")
Dataset({
features: ['text', 'label'],
num_rows: 8530
})
```
"""
if isinstance(data, str):
split = self.get_dataset_split(data, subset, split)
data = load_dataset(data, name=subset, split=split)
return data
elif isinstance(data, Dataset):
if split is not None or subset is not None:
logger.warning("`data` is a preloaded Dataset! Ignoring `subset` and `split`.")
return data
else:
raise ValueError(
"Please specify a valid `data` object - either a `str` with a name or a `Dataset` object."
)
def prepare_data(self, data: Dataset, input_column: str, label_column: str, *args, **kwargs):
"""
Prepare data.
Args:
data ([`Dataset`]):
Specifies the dataset we will run evaluation on.
input_column (`str`, defaults to `"text"`):
The name of the column containing the text feature in the dataset specified by `data`.
second_input_column (`str`, *optional*):
The name of the column containing the second text feature if there is one. Otherwise, set to `None`.
label_column (`str`, defaults to `"label"`):
The name of the column containing the labels in the dataset specified by `data`.
Returns:
`dict`: metric inputs.
`list`: pipeline inputs.
Example:
```py
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> ds = load_dataset("rotten_tomatoes", split="train")
>>> evaluator("text-classification").prepare_data(ds, input_column="text", second_input_column=None, label_column="label")
```
"""
self.check_required_columns(data, {"input_column": input_column, "label_column": label_column})
return {"references": data[label_column]}, DatasetColumn(data, input_column)
def prepare_pipeline(
self,
model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"], # noqa: F821
tokenizer: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None, # noqa: F821
feature_extractor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None, # noqa: F821
device: int = None,
):
"""
Prepare pipeline.
Args:
model_or_pipeline (`str` or [`~transformers.Pipeline`] or `Callable` or [`~transformers.PreTrainedModel`] or [`~transformers.TFPreTrainedModel`], defaults to `None`):
If the argument is not specified, we initialize the default pipeline for the task. If the argument is of the type `str` or
is a model instance, we use it to initialize a new [`~transformers.Pipeline`] with the given model. Otherwise we assume the
argument specifies a pre-initialized pipeline.
tokenizer or feature_extractor ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.FeatureExtractionMixin`], *optional*, defaults to `None`):
Argument can be used to overwrite a default preprocessor if `model_or_pipeline` represents a model for
which we build a pipeline. If `model_or_pipeline` is `None` or a pre-initialized pipeline, we ignore
this argument.
Returns:
The initialized pipeline.
Example:
```py
>>> from evaluate import evaluator
>>> evaluator("text-classification").prepare_pipeline(model_or_pipeline="distilbert-base-uncased")
```
"""
if device is None:
device = self._infer_device()
if (
isinstance(model_or_pipeline, str)
or isinstance(model_or_pipeline, transformers.PreTrainedModel)
or isinstance(model_or_pipeline, transformers.TFPreTrainedModel)
):
pipe = pipeline(
self.task,
model=model_or_pipeline,
tokenizer=tokenizer,
feature_extractor=feature_extractor,
device=device,
)
else:
if model_or_pipeline is None:
pipe = pipeline(self.task, device=device)
else:
pipe = model_or_pipeline
if tokenizer is not None and feature_extractor is not None:
logger.warning("Ignoring the value of the preprocessor argument (`tokenizer` or `feature_extractor`).")
if (pipe.task != self.task) and not (self.task == "translation" and pipe.task.startswith("translation")):
raise ValueError(
f"Incompatible `model_or_pipeline`. Please specify `model_or_pipeline` compatible with the `{self.task}` task."
)
return pipe
def prepare_metric(self, metric: Union[str, EvaluationModule]):
"""
Prepare metric.
Args:
metric (`str` or [`EvaluationModule`], defaults to `None`):
Specifies the metric we use in the evaluator. If it is of type `str`, we treat it as the metric name, and
load it. Otherwise we assume it represents a pre-loaded metric.
Returns:
The loaded metric.
Example:
```py
>>> from evaluate import evaluator
>>> evaluator("text-classification").prepare_metric("accuracy")
```
"""
# Prepare metric.
if metric is None:
if self.default_metric_name is None:
raise ValueError(
"`Evaluator` doesn't specify a default metric. Please specify a valid `metric` argument."
)
metric = load(self.default_metric_name)
elif isinstance(metric, str):
metric = load(metric)
return metric
def call_pipeline(self, pipe, *args, **kwargs):
start_time = perf_counter()
pipe_output = pipe(*args, **kwargs, **self.PIPELINE_KWARGS)
end_time = perf_counter()
return pipe_output, self._compute_time_perf(start_time, end_time, len(pipe_output))
def compute_metric(
self,
metric: EvaluationModule,
metric_inputs: Dict,
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
random_state: Optional[int] = None,
):
"""Compute and return metrics."""
result = metric.compute(**metric_inputs, **self.METRIC_KWARGS)
if strategy == "bootstrap":
metric_keys = result.keys()
bootstrap_dict = self._compute_confidence_interval(
metric,
metric_inputs,
metric_keys,
confidence_level,
n_resamples,
random_state,
)
for key in metric_keys:
bootstrap_dict[key]["score"] = result[key]
return bootstrap_dict
return result
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from numbers import Number
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
from datasets import Dataset
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_end_docstrings, add_start_docstrings
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
TASK_DOCUMENTATION = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("image-classification")
>>> data = load_dataset("beans", split="test[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="nateraw/vit-base-beans",
>>> data=data,
>>> label_column="labels",
>>> metric="accuracy",
>>> label_mapping={'angular_leaf_spot': 0, 'bean_rust': 1, 'healthy': 2},
>>> strategy="bootstrap"
>>> )
```
"""
class ImageClassificationEvaluator(Evaluator):
"""
Image classification evaluator.
This image classification evaluator can currently be loaded from [`evaluator`] using the default task name
`image-classification`.
Methods in this class assume a data format compatible with the [`ImageClassificationPipeline`].
"""
PIPELINE_KWARGS = {}
def __init__(self, task="image-classification", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def predictions_processor(self, predictions, label_mapping):
pred_label = [max(pred, key=lambda x: x["score"])["label"] for pred in predictions]
pred_label = [label_mapping[pred] if label_mapping is not None else pred for pred in pred_label]
return {"predictions": pred_label}
@add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
@add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "image",
label_column: str = "label",
label_mapping: Optional[Dict[str, Number]] = None,
) -> Tuple[Dict[str, float], Any]:
"""
input_column (`str`, defaults to `"image"`):
The name of the column containing the images as PIL ImageFile in the dataset specified by `data`.
label_column (`str`, defaults to `"label"`):
The name of the column containing the labels in the dataset specified by `data`.
label_mapping (`Dict[str, Number]`, *optional*, defaults to `None`):
A mapping from the class labels defined by the model in the pipeline to values consistent with those
defined in the `label_column` of the `data` dataset.
"""
result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
subset=subset,
split=split,
metric=metric,
tokenizer=tokenizer,
feature_extractor=feature_extractor,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
device=device,
random_state=random_state,
input_column=input_column,
label_column=label_column,
label_mapping=label_mapping,
)
return result
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
# Lint as: python3
from datasets import Dataset
try:
import transformers  # noqa: F401 -- only used to check that transformers is installed
TRANSFORMERS_AVAILABLE = True
except ImportError:
TRANSFORMERS_AVAILABLE = False
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_end_docstrings, add_start_docstrings
from ..utils.logging import get_logger
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
from .utils import DatasetColumn
if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
logger = get_logger(__name__)
TASK_DOCUMENTATION = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("question-answering")
>>> data = load_dataset("squad", split="validation[:2]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="sshleifer/tiny-distilbert-base-cased-distilled-squad",
>>> data=data,
>>> metric="squad",
>>> )
```
<Tip>
Datasets where the answer may be missing in the context are supported, for example the SQuAD v2 dataset. In this case, it is safer to pass `squad_v2_format=True` to
the compute() call.
</Tip>
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("question-answering")
>>> data = load_dataset("squad_v2", split="validation[:2]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="mrm8488/bert-tiny-finetuned-squadv2",
>>> data=data,
>>> metric="squad_v2",
>>> squad_v2_format=True,
>>> )
```
"""
class QuestionAnsweringEvaluator(Evaluator):
"""
Question answering evaluator. This evaluator handles
[**extractive** question answering](https://huggingface.co/docs/transformers/task_summary#extractive-question-answering),
where the answer to the question is extracted from a context.
This question answering evaluator can currently be loaded from [`evaluator`] using the default task name
`question-answering`.
Methods in this class assume a data format compatible with the
[`~transformers.QuestionAnsweringPipeline`].
"""
PIPELINE_KWARGS = {}
def __init__(self, task="question-answering", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def prepare_data(
self, data: Dataset, question_column: str, context_column: str, id_column: str, label_column: str
):
"""Prepare data."""
if data is None:
raise ValueError(
"Please specify a valid `data` object - either a `str` with a name or a `Dataset` object."
)
self.check_required_columns(
data,
{
"question_column": question_column,
"context_column": context_column,
"id_column": id_column,
"label_column": label_column,
},
)
metric_inputs = dict()
metric_inputs["references"] = [
{"id": element[id_column], "answers": element[label_column]} for element in data
]
return metric_inputs, {
"question": DatasetColumn(data, question_column),
"context": DatasetColumn(data, context_column),
}
def is_squad_v2_format(self, data: Dataset, label_column: str = "answers"):
"""
Check whether the provided dataset follows the SQuAD v2 data schema, i.e. whether it may contain samples where the answer is not in the context.
In this case, the answer text list should be `[]`.
"""
original_num_rows = data.num_rows
nonempty_num_rows = data.filter(
lambda x: len(x[label_column]["text"]) > 0, load_from_cache_file=False
).num_rows
if original_num_rows > nonempty_num_rows:
return True
else:
return False
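# A minimal sketch of the check above, assuming an in-memory toy dataset: the SQuAD v2
# schema allows unanswerable questions, encoded as an empty `text` list in the answers
# column, so filtering those rows out shrinks the dataset.
from datasets import Dataset

toy_data = Dataset.from_dict(
    {"answers": [{"text": ["Paris"], "answer_start": [10]}, {"text": [], "answer_start": []}]}
)
# QuestionAnsweringEvaluator().is_squad_v2_format(toy_data)  ->  True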
def predictions_processor(self, predictions: List, squad_v2_format: bool, ids: List):
result = []
for i in range(len(predictions)):
pred = {"prediction_text": predictions[i]["answer"], "id": ids[i]}
if squad_v2_format:
pred["no_answer_probability"] = predictions[i]["score"]
result.append(pred)
return {"predictions": result}
@add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
@add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
question_column: str = "question",
context_column: str = "context",
id_column: str = "id",
label_column: str = "answers",
squad_v2_format: Optional[bool] = None,
) -> Tuple[Dict[str, float], Any]:
"""
question_column (`str`, defaults to `"question"`):
The name of the column containing the question in the dataset specified by `data`.
context_column (`str`, defaults to `"context"`):
The name of the column containing the context in the dataset specified by `data`.
id_column (`str`, defaults to `"id"`):
The name of the column containing the identification field of the question and answer pair in the
dataset specified by `data`.
label_column (`str`, defaults to `"answers"`):
The name of the column containing the answers in the dataset specified by `data`.
squad_v2_format (`bool`, *optional*, defaults to `None`):
Whether the dataset follows the format of squad_v2 dataset. This is the case when the provided dataset
has questions where the answer is not in the context, more specifically when the answers are given as
`{"text": [], "answer_start": []}` in the answer column. If all questions have at least one answer, this parameter
should be set to `False`. If this parameter is not provided, the format will be automatically inferred.
"""
result = {}
self.check_for_mismatch_in_device_setup(device, model_or_pipeline)
data = self.load_data(data=data, subset=subset, split=split)
metric_inputs, pipe_inputs = self.prepare_data(
data=data,
question_column=question_column,
context_column=context_column,
id_column=id_column,
label_column=label_column,
)
if squad_v2_format is None:
squad_v2_format = self.is_squad_v2_format(data=data, label_column=label_column)
logger.warning(
f"`squad_v2_format` parameter not provided to QuestionAnsweringEvaluator.compute(). Automatically inferred `squad_v2_format` as {squad_v2_format}."
)
pipe = self.prepare_pipeline(model_or_pipeline=model_or_pipeline, tokenizer=tokenizer, device=device)
metric = self.prepare_metric(metric)
if squad_v2_format and metric.name == "squad":
logger.warning(
"The dataset has SQuAD v2 format but you are using the SQuAD metric. Consider passing the 'squad_v2' metric."
)
if not squad_v2_format and metric.name == "squad_v2":
logger.warning(
"The dataset has SQuAD v1 format but you are using the SQuAD v2 metric. Consider passing the 'squad' metric."
)
if squad_v2_format:
self.PIPELINE_KWARGS["handle_impossible_answer"] = True
else:
self.PIPELINE_KWARGS["handle_impossible_answer"] = False
# Compute predictions
predictions, perf_results = self.call_pipeline(pipe, **pipe_inputs)
predictions = self.predictions_processor(predictions, squad_v2_format=squad_v2_format, ids=data[id_column])
metric_inputs.update(predictions)
# Compute metrics from references and predictions
metric_results = self.compute_metric(
metric=metric,
metric_inputs=metric_inputs,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
random_state=random_state,
)
result.update(metric_results)
result.update(perf_results)
return result