修改readme

25991f98 · hepj · ac192496 · 25991f98 · 25991f98 · 25991f98
Commit 25991f98 authored Jul 25, 2024 by hepj
20 changed files
--- a/evaluate-0.4.2/metrics/xnli/app.py
+++ b/evaluate-0.4.2/metrics/xnli/app.py
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# Load the XNLI metric and hand it to the Gradio widget launcher.
+module = evaluate.load("xnli")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/xnli/requirements.txt
+++ b/evaluate-0.4.2/metrics/xnli/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/xnli/xnli.py
+++ b/evaluate-0.4.2/metrics/xnli/xnli.py
+# Copyright 2020 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XNLI benchmark metric. """
+import datasets
+import evaluate
+_CITATION = """\
+@InProceedings{conneau2018xnli,
+  author = "Conneau, Alexis
+                 and Rinott, Ruty
+                 and Lample, Guillaume
+                 and Williams, Adina
+                 and Bowman, Samuel R.
+                 and Schwenk, Holger
+                 and Stoyanov, Veselin",
+  title = "XNLI: Evaluating Cross-lingual Sentence Representations",
+  booktitle = "Proceedings of the 2018 Conference on Empirical Methods
+               in Natural Language Processing",
+  year = "2018",
+  publisher = "Association for Computational Linguistics",
+  location = "Brussels, Belgium",
+}
+"""
+_DESCRIPTION = """\
+XNLI is a subset of a few thousand examples from MNLI which has been translated
+into a 14 different languages (some low-ish resource). As with MNLI, the goal is
+to predict textual entailment (does sentence A imply/contradict/neither sentence
+B) and is a classification task (given two sentences, predict one of three
+labels).
+"""
+_KWARGS_DESCRIPTION = """
+Computes XNLI score which is just simple accuracy.
+Args:
+    predictions: Predicted labels.
+    references: Ground truth labels.
+Returns:
+    'accuracy': accuracy
+Examples:
+    >>> predictions = [0, 1]
+    >>> references = [0, 1]
+    >>> xnli_metric = evaluate.load("xnli")
+    >>> results = xnli_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'accuracy': 1.0}
+"""
+def simple_accuracy(preds, labels):
+    return (preds == labels).mean()
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Xnli(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Value("int64" if self.config_name != "sts-b" else "float32"),
+                    "references": datasets.Value("int64" if self.config_name != "sts-b" else "float32"),
+                }
+            ),
+            codebase_urls=[],
+            reference_urls=[],
+            format="numpy",
+        )
+    def _compute(self, predictions, references):
+        return {"accuracy": simple_accuracy(predictions, references)}
--- a/evaluate-0.4.2/metrics/xtreme_s/README.md
+++ b/evaluate-0.4.2/metrics/xtreme_s/README.md
+---
+title: XTREME-S
+emoji: 🤗 
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  XTREME-S is a benchmark to evaluate universal cross-lingual speech representations in many languages.
+  XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval.
+---
+# Metric Card for XTREME-S
+## Metric Description
+The XTREME-S metric aims to evaluate model performance on the Cross-lingual TRansfer Evaluation of Multilingual Encoders for Speech (XTREME-S) benchmark.
+This benchmark was designed to evaluate speech representations across languages, tasks, domains and data regimes. It covers 102 languages from 10+ language families, 3 different domains and 4 task families: speech recognition, translation, classification and retrieval.
+## How to Use
+There are two steps: (1) loading the XTREME-S metric relevant to the subset of the benchmark being used for evaluation; and (2) calculating the metric.
+1. **Loading the relevant XTREME-S metric** : the subsets of XTREME-S are the following: `mls`, `voxpopuli`, `covost2`, `fleurs-asr`, `fleurs-lang_id`,  `minds14`  and `babel`. More information about the different subsets can be found on the [XTREME-S benchmark page](https://huggingface.co/datasets/google/xtreme_s).
+```python
+>>> xtreme_s_metric = evaluate.load('xtreme_s', 'mls')
+```
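+2. **Calculating the metric**: the metric takes two inputs:
+- `predictions`: a list of predictions to score, with each prediction a `str` (or an `int` class label for the `fleurs-lang_id` and `minds14` subsets).
+- `references`: a list of references, one per prediction, with each reference a `str` (or an `int` class label for the `fleurs-lang_id` and `minds14` subsets).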
+```python
+>>> references = ["it is sunny here", "paper and pen are essentials"]
+>>> predictions = ["it's sunny", "paper pen are essential"]
+>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
+```
+It also has two optional arguments: 
+- `bleu_kwargs`: a `dict` of keywords to be passed when computing the `bleu` metric for the `covost2` subset. Keywords can be one of `smooth_method`, `smooth_value`, `force`, `lowercase`, `tokenize`, `use_effective_order`.
+- `wer_kwargs`: a `dict` of keywords to be passed when computing `wer` and `cer`, which are computed for the `mls`, `fleurs-asr`, `voxpopuli`, and `babel` subsets. The only supported keyword is `concatenate_texts`.
+## Output values
+The output of the metric depends on the XTREME-S subset chosen, consisting of a dictionary that contains one or several of the following metrics:
+- `accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information). This is returned for the `fleurs-lang_id` and `minds14` subsets.
+- `f1`: the harmonic mean of the precision and recall (see [F1 score](https://huggingface.co/metrics/f1) for more information). Its range is 0-1 -- its lowest possible value is 0, if either the precision or the recall is 0, and its highest possible value is 1.0, which means perfect precision and recall. It is returned for the `minds14` subset.
+- `wer`: Word error rate (WER) is a common metric of the performance of an automatic speech recognition system. The lower the value, the better the performance of the ASR system, with a WER of 0 being a perfect score (see [WER score](https://huggingface.co/metrics/wer) for more information). It is returned for the `mls`, `fleurs-asr`, `voxpopuli` and `babel` subsets of the benchmark.
+- `cer`: Character error rate (CER) is similar to WER, but operates on characters instead of words. The lower the CER value, the better the performance of the ASR system, with a CER of 0 being a perfect score (see [CER score](https://huggingface.co/metrics/cer) for more information). It is returned for the `mls`, `fleurs-asr`, `voxpopuli` and `babel` subsets of the benchmark.
+- `bleu`: the BLEU score, calculated according to the SacreBLEU metric approach. It can take any value between 0.0 and 100.0, inclusive, with higher values being better (see [SacreBLEU](https://huggingface.co/metrics/sacrebleu) for more details).  This is returned for the `covost2` subset.
+### Values from popular papers
+The [original XTREME-S paper](https://arxiv.org/pdf/2203.10752.pdf) reported average WERs ranging from 9.2 to 14.6, a BLEU score of 20.6, an accuracy of 73.3 and F1 score of 86.9, depending on the subsets of the dataset tested on. 
+## Examples 
+For the `mls` subset (which outputs `wer` and `cer`):
+```python
+>>> xtreme_s_metric = evaluate.load('xtreme_s', 'mls')  
+>>> references = ["it is sunny here", "paper and pen are essentials"]
+>>> predictions = ["it's sunny", "paper pen are essential"]
+>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
+>>> print({k: round(v, 2) for k, v in results.items()})
+{'wer': 0.56, 'cer': 0.27}
+```
+For the `covost2` subset (which outputs `bleu`):
+```python
+>>> xtreme_s_metric = evaluate.load('xtreme_s', 'covost2')
+>>> references = ["bonjour paris", "il est necessaire de faire du sport de temps en temp"]
+>>> predictions = ["bonjour paris", "il est important de faire du sport souvent"]
+>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
+>>> print({k: round(v, 2) for k, v in results.items()})
+{'bleu': 31.65}
+```
+For the `fleurs-lang_id` subset (which outputs `accuracy`):
+```python
+>>> xtreme_s_metric = evaluate.load('xtreme_s', 'fleurs-lang_id')
+>>> references = [0, 1, 0, 0, 1]
+>>> predictions = [0, 1, 1, 0, 0]
+>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
+>>> print({k: round(v, 2) for k, v in results.items()})
+{'accuracy': 0.6}
+ ```
+For the `minds14` subset (which outputs `f1` and `accuracy`):
+```python
+>>> xtreme_s_metric = evaluate.load('xtreme_s', 'minds14')
+>>> references = [0, 1, 0, 0, 1]
+>>> predictions = [0, 1, 1, 0, 0]
+>>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
+>>> print({k: round(v, 2) for k, v in results.items()})
+{'f1': 0.58, 'accuracy': 0.6}
+```
+## Limitations and bias
+This metric works only with datasets that have the same format as the [XTREME-S dataset](https://huggingface.co/datasets/google/xtreme_s).
+While the XTREME-S dataset is meant to represent a variety of languages and tasks, it has inherent biases: it is missing many languages that are important and under-represented in NLP datasets. 
+It also has a particular focus on read-speech because common evaluation benchmarks like CoVoST-2 or LibriSpeech evaluate on this type of speech, which results in a mismatch between performance obtained in a read-speech setting and a more noisy setting (in production or live deployment, for instance). 
+## Citation
+```bibtex
+@article{conneau2022xtreme,
+  title={XTREME-S: Evaluating Cross-lingual Speech Representations},
+  author={Conneau, Alexis and Bapna, Ankur and Zhang, Yu and Ma, Min and von Platen, Patrick and Lozhkov, Anton and Cherry, Colin and Jia, Ye and Rivera, Clara and Kale, Mihir and others},
+  journal={arXiv preprint arXiv:2203.10752},
+  year={2022}
+}
+```
+## Further References 
+- [XTREME-S dataset](https://huggingface.co/datasets/google/xtreme_s)
+- [XTREME-S github repository](https://github.com/google-research/xtreme)
--- a/evaluate-0.4.2/metrics/xtreme_s/app.py
+++ b/evaluate-0.4.2/metrics/xtreme_s/app.py
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# Load the XTREME-S metric with the "mls" configuration and hand it to the Gradio widget launcher.
+module = evaluate.load("xtreme_s", "mls")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/xtreme_s/requirements.txt
+++ b/evaluate-0.4.2/metrics/xtreme_s/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+scikit-learn
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/xtreme_s/xtreme_s.py
+++ b/evaluate-0.4.2/metrics/xtreme_s/xtreme_s.py
+# Copyright 2022 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XTREME-S benchmark metric. """
+from typing import List
+import datasets
+from datasets.config import PY_VERSION
+from packaging import version
+from sklearn.metrics import f1_score
+import evaluate
+if PY_VERSION < version.parse("3.8"):
+    import importlib_metadata
+else:
+    import importlib.metadata as importlib_metadata
+# TODO(Patrick/Anton)
+_CITATION = """\
+"""
+_DESCRIPTION = """\
+    XTREME-S is a benchmark to evaluate universal cross-lingual speech representations in many languages.
+    XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval.
+"""
+_KWARGS_DESCRIPTION = """
+Compute XTREME-S evaluation metric associated to each XTREME-S dataset.
+Args:
+    predictions: list of predictions to score.
+        Each translation should be tokenized into a list of tokens.
+    references: list of lists of references for each translation.
+        Each reference should be tokenized into a list of tokens.
+    bleu_kwargs: optional dict of keywords to be passed when computing 'bleu'.
+        Keywords can be one of 'smooth_method', 'smooth_value', 'force', 'lowercase',
+        'tokenize', 'use_effective_order'.
+    wer_kwargs: optional dict of keywords to be passed when computing 'wer' and 'cer'.
+        Keywords include 'concatenate_texts'.
+Returns: depending on the XTREME-S task, one or several of:
+    "accuracy": Accuracy - for 'fleurs-lang_id', 'minds14'
+    "f1": F1 score - for 'minds14'
+    "wer": Word error rate - for 'mls', 'fleurs-asr', 'voxpopuli', 'babel'
+    "cer": Character error rate - for 'mls', 'fleurs-asr', 'voxpopuli', 'babel'
+    "bleu": BLEU score according to the `sacrebleu` metric - for 'covost2'
+Examples:
+    >>> xtreme_s_metric = evaluate.load('xtreme_s', 'mls')  # 'mls', 'voxpopuli', 'fleurs-asr' or 'babel'
+    >>> references = ["it is sunny here", "paper and pen are essentials"]
+    >>> predictions = ["it's sunny", "paper pen are essential"]
+    >>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
+    >>> print({k: round(v, 2) for k, v in results.items()})
+    {'wer': 0.56, 'cer': 0.27}
+    >>> xtreme_s_metric = evaluate.load('xtreme_s', 'covost2')
+    >>> references = ["bonjour paris", "il est necessaire de faire du sport de temps en temp"]
+    >>> predictions = ["bonjour paris", "il est important de faire du sport souvent"]
+    >>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
+    >>> print({k: round(v, 2) for k, v in results.items()})
+    {'bleu': 31.65}
+    >>> xtreme_s_metric = evaluate.load('xtreme_s', 'fleurs-lang_id')
+    >>> references = [0, 1, 0, 0, 1]
+    >>> predictions = [0, 1, 1, 0, 0]
+    >>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
+    >>> print({k: round(v, 2) for k, v in results.items()})
+    {'accuracy': 0.6}
+    >>> xtreme_s_metric = evaluate.load('xtreme_s', 'minds14')
+    >>> references = [0, 1, 0, 0, 1]
+    >>> predictions = [0, 1, 1, 0, 0]
+    >>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
+    >>> print({k: round(v, 2) for k, v in results.items()})
+    {'f1': 0.58, 'accuracy': 0.6}
+"""
+_CONFIG_NAMES = ["fleurs-asr", "mls", "voxpopuli", "babel", "covost2", "fleurs-lang_id", "minds14"]
+SENTENCE_DELIMITER = ""
+# jiwer is an optional dependency: record whether it could be imported so the
+# character-error-rate transform below can be set up (or left as None).
+try:
+    from jiwer import transforms as tr
+    _jiwer_available = True
+except ImportError:
+    _jiwer_available = False
+# jiwer releases older than 2.3.0 get a hand-written sentences-to-characters transform;
+# newer releases are composed from transforms that jiwer itself provides.
+if _jiwer_available and version.parse(importlib_metadata.version("jiwer")) < version.parse("2.3.0"):
+    class SentencesToListOfCharacters(tr.AbstractTransform):
+        # Flattens a list of sentences into a single list of characters, inserting
+        # `sentence_delimiter` between sentences when it is a non-empty string.
+        def __init__(self, sentence_delimiter: str = " "):
+            self.sentence_delimiter = sentence_delimiter
+        def process_string(self, s: str):
+            # A single sentence becomes the list of its characters.
+            return list(s)
+        def process_list(self, inp: List[str]):
+            chars = []
+            for sent_idx, sentence in enumerate(inp):
+                chars.extend(self.process_string(sentence))
+                # The delimiter goes between sentences only, never after the last one.
+                if self.sentence_delimiter is not None and self.sentence_delimiter != "" and sent_idx < len(inp) - 1:
+                    chars.append(self.sentence_delimiter)
+            return chars
+    cer_transform = tr.Compose(
+        [tr.RemoveMultipleSpaces(), tr.Strip(), SentencesToListOfCharacters(SENTENCE_DELIMITER)]
+    )
+elif _jiwer_available:
+    cer_transform = tr.Compose(
+        [
+            tr.RemoveMultipleSpaces(),
+            tr.Strip(),
+            tr.ReduceToSingleSentence(SENTENCE_DELIMITER),
+            tr.ReduceToListOfListOfChars(),
+        ]
+    )
+else:
+    # Without jiwer there is no transform.
+    cer_transform = None
+def simple_accuracy(preds, labels):
+    return float((preds == labels).mean())
+def f1_and_simple_accuracy(preds, labels):
+    return {
+        "f1": float(f1_score(y_true=labels, y_pred=preds, average="macro")),
+        "accuracy": simple_accuracy(preds, labels),
+    }
+def bleu(
+    preds,
+    labels,
+    smooth_method="exp",
+    smooth_value=None,
+    force=False,
+    lowercase=False,
+    tokenize=None,
+    use_effective_order=False,
+):
+    # xtreme-s can only have one label
+    labels = [[label] for label in labels]
+    preds = list(preds)
+    try:
+        import sacrebleu as scb
+    except ImportError:
+        raise ValueError(
+            "sacrebleu has to be installed in order to apply the bleu metric for covost2."
+            "You can install it via `pip install sacrebleu`."
+        )
+    if version.parse(scb.__version__) < version.parse("1.4.12"):
+        raise ImportWarning(
+            "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
+            'You can install it with `pip install "sacrebleu>=1.4.12"`.'
+        )
+    references_per_prediction = len(labels[0])
+    if any(len(refs) != references_per_prediction for refs in labels):
+        raise ValueError("Sacrebleu requires the same number of references for each prediction")
+    transformed_references = [[refs[i] for refs in labels] for i in range(references_per_prediction)]
+    output = scb.corpus_bleu(
+        preds,
+        transformed_references,
+        smooth_method=smooth_method,
+        smooth_value=smooth_value,
+        force=force,
+        lowercase=lowercase,
+        use_effective_order=use_effective_order,
+        **(dict(tokenize=tokenize) if tokenize else {}),
+    )
+    return {"bleu": output.score}
+def wer_and_cer(preds, labels, concatenate_texts, config_name):
+    try:
+        from jiwer import compute_measures
+    except ImportError:
+        raise ValueError(
+            f"jiwer has to be installed in order to apply the wer metric for {config_name}."
+            "You can install it via `pip install jiwer`."
+        )
+    if concatenate_texts:
+        wer = compute_measures(labels, preds)["wer"]
+        cer = compute_measures(labels, preds, truth_transform=cer_transform, hypothesis_transform=cer_transform)["wer"]
+        return {"wer": wer, "cer": cer}
+    else:
+        def compute_score(preds, labels, score_type="wer"):
+            incorrect = 0
+            total = 0
+            for prediction, reference in zip(preds, labels):
+                if score_type == "wer":
+                    measures = compute_measures(reference, prediction)
+                elif score_type == "cer":
+                    measures = compute_measures(
+                        reference, prediction, truth_transform=cer_transform, hypothesis_transform=cer_transform
+                    )
+                incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
+                total += measures["substitutions"] + measures["deletions"] + measures["hits"]
+            return incorrect / total
+        return {"wer": compute_score(preds, labels, "wer"), "cer": compute_score(preds, labels, "cer")}
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class XtremeS(evaluate.Metric):
+    def _info(self):
+        if self.config_name not in _CONFIG_NAMES:
+            raise KeyError(f"You should supply a configuration name selected in {_CONFIG_NAMES}")
+        pred_type = "int64" if self.config_name in ["fleurs-lang_id", "minds14"] else "string"
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {"predictions": datasets.Value(pred_type), "references": datasets.Value(pred_type)}
+            ),
+            codebase_urls=[],
+            reference_urls=[],
+            format="numpy",
+        )
+    def _compute(self, predictions, references, bleu_kwargs=None, wer_kwargs=None):
+        bleu_kwargs = bleu_kwargs if bleu_kwargs is not None else {}
+        wer_kwargs = wer_kwargs if wer_kwargs is not None else {}
+        if self.config_name == "fleurs-lang_id":
+            return {"accuracy": simple_accuracy(predictions, references)}
+        elif self.config_name == "minds14":
+            return f1_and_simple_accuracy(predictions, references)
+        elif self.config_name == "covost2":
+            smooth_method = bleu_kwargs.pop("smooth_method", "exp")
+            smooth_value = bleu_kwargs.pop("smooth_value", None)
+            force = bleu_kwargs.pop("force", False)
+            lowercase = bleu_kwargs.pop("lowercase", False)
+            tokenize = bleu_kwargs.pop("tokenize", None)
+            use_effective_order = bleu_kwargs.pop("use_effective_order", False)
+            return bleu(
+                preds=predictions,
+                labels=references,
+                smooth_method=smooth_method,
+                smooth_value=smooth_value,
+                force=force,
+                lowercase=lowercase,
+                tokenize=tokenize,
+                use_effective_order=use_effective_order,
+            )
+        elif self.config_name in ["fleurs-asr", "mls", "voxpopuli", "babel"]:
+            concatenate_texts = wer_kwargs.pop("concatenate_texts", False)
+            return wer_and_cer(predictions, references, concatenate_texts, self.config_name)
+        else:
+            raise KeyError(f"You should supply a configuration name selected in {_CONFIG_NAMES}")
--- a/evaluate-0.4.2/setup.cfg
+++ b/evaluate-0.4.2/setup.cfg
+[metadata]
+license_file = LICENSE
+[isort]
+ensure_newline_before_comments = True
+force_grid_wrap = 0
+include_trailing_comma = True
+line_length = 119
+lines_after_imports = 2
+multi_line_output = 3
+use_parentheses = True
+[flake8]
+ignore = E203, E501, W503
+max-line-length = 119
+exclude =
+    src/datasets/datasets
+    src/datasets/metrics
+per-file-ignores =
+    metrics/*:F401
--- a/evaluate-0.4.2/setup.py
+++ b/evaluate-0.4.2/setup.py
+# Lint as: python3
+""" HuggingFace/Evaluate is an open library for evaluation.
+Note:
+   VERSION needs to be formatted following the MAJOR.MINOR.PATCH convention
+   (we need to follow this convention to be able to retrieve versioned scripts)
+To create the package for pypi.
+1. Open a PR and change the version in:
+   - __init__.py
+   - setup.py
+   Then merge the PR once it's approved.
+2. Add a tag "vVERSION" (e.g. v0.4.1) in git to mark the release: "git tag vVERSION -m 'Add tag vVERSION for pypi'"
+   Push the tag to remote: git push --tags origin main
+   Then verify that the 'Python release' CI job runs and succeeds.
+3. Fill release notes in the tag in github once everything is looking hunky-dory.
+4. Open a PR to change the version in __init__.py and setup.py to X.X.X+1.dev0 (e.g. VERSION=0.4.1 -> 0.4.2.dev0).
+   Then merge the PR once it's approved.
+"""
+import os
+from setuptools import find_packages, setup
+REQUIRED_PKGS = [
+    # We need datasets as a backend
+    "datasets>=2.0.0",
+    # We use numpy>=1.17 to have np.random.Generator (Dataset shuffling)
+    "numpy>=1.17",
+    # For smart caching dataset processing
+    "dill",
+    # For performance gains with apache arrow
+    "pandas",
+    # for downloading datasets over HTTPS
+    "requests>=2.19.0",
+    # progress bars in download and scripts
+    "tqdm>=4.62.1",
+    # for fast hashing
+    "xxhash",
+    # for better multiprocessing
+    "multiprocess",
+    # to get metadata of optional dependencies such as torch or tensorflow for Python versions that don't have it
+    "importlib_metadata;python_version<'3.8'",
+    # to save datasets locally or on any filesystem
+    # minimum 2021.05.0 to have the AbstractArchiveFileSystem
+    "fsspec[http]>=2021.05.0",
+    # To get datasets from the Datasets Hub on huggingface.co
+    "huggingface-hub>=0.7.0",
+    # Utilities from PyPA to e.g., compare versions
+    "packaging",
+]
+TEMPLATE_REQUIRE = [
+    # to populate metric template
+    "cookiecutter",
+    # for the gradio widget
+    "gradio>=3.0.0"
+]
+EVALUATOR_REQUIRE = [
+   "transformers",
+   # for bootstrap computations in Evaluator
+   "scipy>=1.7.1",
+]
+TESTS_REQUIRE = [
+    # test dependencies
+    "absl-py",
+    "charcut>=1.1.1",  # for charcut_mt
+    "cer>=1.2.0",  # for characTER
+    "nltk",  # for NIST and probably others
+    "pytest",
+    "pytest-datadir",
+    "pytest-xdist",
+    # optional dependencies
+    "tensorflow>=2.3,!=2.6.0,!=2.6.1, <=2.10",
+    "torch",
+    # metrics dependencies
+    "accelerate",  # for frugalscore (calls transformers' Trainer)
+    "bert_score>=0.3.6",
+    "rouge_score>=0.1.2",
+    "sacrebleu",
+    "sacremoses",
+    "scipy>=1.10.0",
+    "seqeval",
+    "scikit-learn",
+    "jiwer",
+    "sentencepiece",  # for bleurt
+    "transformers", # for evaluator
+    "mauve-text",
+    "trectools",
+    # to speed up pip backtracking
+    "toml>=0.10.1",
+    "requests_file>=1.5.1",
+    "tldextract>=3.1.0",
+    "texttable>=1.6.3",
+    "unidecode>=1.3.4",
+    "Werkzeug>=1.0.1",
+    "six~=1.15.0",
+]
+QUALITY_REQUIRE = ["black~=22.0", "flake8>=3.8.3", "isort>=5.0.0", "pyyaml>=5.3.1"]
+EXTRAS_REQUIRE = {
+    "tensorflow": ["tensorflow>=2.2.0,!=2.6.0,!=2.6.1"],
+    "tensorflow_gpu": ["tensorflow-gpu>=2.2.0,!=2.6.0,!=2.6.1"],
+    "torch": ["torch"],
+    "dev": TESTS_REQUIRE + QUALITY_REQUIRE,
+    "tests": TESTS_REQUIRE,
+    "quality": QUALITY_REQUIRE,
+    "docs": [
+        # Might need to add doc-builder and some specific deps in the future
+        "s3fs",
+    ],
+    "template": TEMPLATE_REQUIRE,
+    "evaluator": EVALUATOR_REQUIRE
+}
+setup(
+    name="evaluate",
+    version="0.4.2",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    description="HuggingFace community-driven open-source library of evaluation",
+    long_description=open("README.md", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    author="HuggingFace Inc.",
+    author_email="leandro@huggingface.co",
+    url="https://github.com/huggingface/evaluate",
+    download_url="https://github.com/huggingface/evaluate/tags",
+    license="Apache 2.0",
+    package_dir={"": "src"},
+    packages=find_packages("src"),
+    entry_points={"console_scripts": ["evaluate-cli=evaluate.commands.evaluate_cli:main"]},
+    install_requires=REQUIRED_PKGS,
+    extras_require=EXTRAS_REQUIRE,
+    python_requires=">=3.8.0",
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Education",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+    keywords="metrics machine learning evaluate evaluation",
+    zip_safe=False,  # Required for mypy to find the py.typed file
+)
--- a/evaluate-0.4.2/src/evaluate/__init__.py
+++ b/evaluate-0.4.2/src/evaluate/__init__.py
+# flake8: noqa
+# Copyright 2020 The HuggingFace Evaluate Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Lint as: python3
+# pylint: enable=line-too-long
+# pylint: disable=g-import-not-at-top,g-bad-import-order,wrong-import-position
+__version__ = "0.4.2"
+from packaging import version
+# Dev releases resolve SCRIPTS_VERSION to "main"; any other release uses its own version string.
+SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__
+# Remove the helper import so `version` is not left behind as a package attribute.
+del version
+from .evaluation_suite import EvaluationSuite
+from .evaluator import (
+    AudioClassificationEvaluator,
+    AutomaticSpeechRecognitionEvaluator,
+    Evaluator,
+    ImageClassificationEvaluator,
+    QuestionAnsweringEvaluator,
+    SummarizationEvaluator,
+    Text2TextGenerationEvaluator,
+    TextClassificationEvaluator,
+    TextGenerationEvaluator,
+    TokenClassificationEvaluator,
+    TranslationEvaluator,
+    evaluator,
+)
+from .hub import push_to_hub
+from .info import ComparisonInfo, EvaluationModuleInfo, MeasurementInfo, MetricInfo
+from .inspect import inspect_evaluation_module, list_evaluation_modules
+from .loading import load
+from .module import CombinedEvaluations, Comparison, EvaluationModule, Measurement, Metric, combine
+from .saving import save
+from .utils import *
+from .utils import gradio, logging
--- a/evaluate-0.4.2/src/evaluate/commands/__init__.py
+++ b/evaluate-0.4.2/src/evaluate/commands/__init__.py
--- a/evaluate-0.4.2/src/evaluate/commands/evaluate_cli.py
+++ b/evaluate-0.4.2/src/evaluate/commands/evaluate_cli.py
+import argparse
+import os
+import subprocess
+from pathlib import Path
+from cookiecutter.main import cookiecutter
+from huggingface_hub import HfApi, Repository, create_repo
+from evaluate.utils.logging import get_logger
+logger = get_logger(__name__)
+INSTRUCTIONS = """\
+A new repository for your module "{module_name}" of type "{module_type}" has been created at {output_dir} and pushed to the Hugging Face Hub: {repo_url}.
+Here are the next steps:
+- implement the module logic in {module_slug}/{module_slug}.py
+- document your module in {module_slug}/README.md
+- add test cases for your module in {module_slug}/tests.py
+- if your module has any dependencies update them in {module_slug}/requirements.txt
+You can test your module's widget locally by running:
+```
+python {output_dir}/{module_slug}/app.py
+```
+When you are happy with your changes you can push your changes with the following commands to the Hugging Face Hub:
+```
+cd {output_dir}/{module_slug}
+git add .
+git commit -m "Updating module"
+git push
+```
+You should then see the update widget on the Hugging Face Hub: {repo_url}
+And you can load your module in Python with the following code:
+```
+from evaluate import load
+module = load("{namespace}/{module_slug}")
+```
+"""
+def main():
+    """Entry point of the `evaluate-cli` command line tool.
+
+    The only sub-command is `create`. It:
+      1. validates the module type and name,
+      2. creates a Gradio Space named `<namespace>/<module_slug>` on the Hugging Face Hub,
+      3. clones that Space into `--output_dir`,
+      4. renders the `templates` cookiecutter from the evaluate GitHub repository over the clone,
+      5. commits and pushes the rendered files, and
+      6. prints `INSTRUCTIONS` describing the next steps.
+
+    Raises:
+        SystemExit: via `parser.error` when no sub-command is given (argparse exits with status 2).
+        ValueError: if `--module_type` is not one of metric/comparison/measurement, or if the
+            module name contains a hyphen.
+        Exception: whatever `create_repo` raised, re-raised after logging.
+        subprocess.CalledProcessError: if `git clone` fails (`check=True`).
+    """
+    parser = argparse.ArgumentParser("HuggingFace Evaluate CLI tool", usage="evaluate-cli <command> [<args>]")
+    subparsers = parser.add_subparsers()
+    parser_create = subparsers.add_parser("create", help="Create new evaluation module.")
+    parser_create.add_argument(
+        "module_name", type=str, help='Pretty name of new evaluation module, e.g. "Recall" or "Exact Match".'
+    )
+    parser_create.add_argument(
+        "--module_type",
+        default="metric",
+        type=str,
+        help="Type of module, has to be one of [metric|comparison|measurement].",
+    )
+    parser_create.add_argument(
+        "--dataset_name", default="", type=str, help="Name of dataset if evaluation module is dataset specific."
+    )
+    parser_create.add_argument("--module_description", type=str, help="Short description of evaluation module.")
+    parser_create.add_argument("--output_dir", default=Path.cwd(), type=str, help="Path to output directory.")
+    parser_create.add_argument(
+        "--organization", default=None, type=str, help="Organization on the Hub to push evaluation module to."
+    )
+    parser_create.add_argument("--private", action="store_true", help="Sets evaluation module repository to private.")
+    args = vars(parser.parse_args())
+    # Sub-commands are optional in argparse by default, so invoking the tool without `create`
+    # yields an empty namespace. Report that as a usage error instead of a KeyError below.
+    if "module_name" not in args:
+        parser.error("a command is required, e.g. `evaluate-cli create <module_name>`")
+    if args["module_type"] not in ["metric", "comparison", "measurement"]:
+        raise ValueError("The module_type needs to be one of metric, comparison, or measurement")
+    if "-" in args["module_name"]:
+        raise ValueError("Hyphens ('-') are not allowed in module names.")
+    output_dir = Path(args["output_dir"])
+    organization = args["organization"]
+    # The slug is used both as the Space name and as the directory name of the clone.
+    module_slug = args["module_name"].lower().replace(" ", "_")
+    if organization is None:
+        # No organization given: push to the namespace of the currently logged-in user.
+        hfapi = HfApi()
+        namespace = hfapi.whoami()["name"]
+    else:
+        namespace = organization
+    # `args` is handed to cookiecutter as `extra_context` below, so the namespace is stored in it.
+    args["namespace"] = namespace
+    repo_url = f"https://huggingface.co/spaces/{namespace}/{module_slug}"
+    try:
+        create_repo(namespace + "/" + module_slug, repo_type="space", space_sdk="gradio", private=args["private"])
+    except Exception:
+        logger.error(
+            f"Could not create Space for module at hf.co/spaces/{namespace}/{module_slug}. Make sure this space does not exist already."
+        )
+        raise
+    subprocess.run(
+        f"git clone {repo_url}".split(),
+        stderr=subprocess.PIPE,
+        stdout=subprocess.PIPE,
+        check=True,
+        encoding="utf-8",
+        cwd=output_dir,
+        env=os.environ.copy(),
+    )
+    repo = Repository(
+        local_dir=output_dir / module_slug,
+    )
+    # Render the template on top of the fresh clone (hence `overwrite_if_exists=True`).
+    cookiecutter(
+        "https://github.com/huggingface/evaluate/",
+        directory="templates",
+        no_input=True,
+        extra_context=args,
+        output_dir=output_dir,
+        overwrite_if_exists=True,
+    )
+    repo.git_add()
+    repo.git_commit("add module default template")
+    repo.git_push()
+    print(
+        INSTRUCTIONS.format(
+            module_name=args["module_name"],
+            module_type=args["module_type"],
+            module_slug=module_slug,
+            namespace=namespace,
+            repo_url=repo_url,
+            output_dir=output_dir,
+        )
+    )
+if __name__ == "__main__":
+    main()
--- a/evaluate-0.4.2/src/evaluate/config.py
+++ b/evaluate-0.4.2/src/evaluate/config.py
+import importlib
+import importlib.util
+import os
+import platform
+from pathlib import Path
+from packaging import version
+from .utils.logging import get_logger
+logger = get_logger(__name__)
+# Metrics
+# Base URLs of the legacy S3 bucket / CloudFront distribution for metrics.
+S3_METRICS_BUCKET_PREFIX = "https://s3.amazonaws.com/datasets.huggingface.co/datasets/metrics"
+CLOUDFRONT_METRICS_DISTRIB_PREFIX = "https://cdn-datasets.huggingface.co/datasets/metric"
+# URL templates for files in the evaluate GitHub repository, one per module type.
+# `{revision}`, `{path}` and `{name}` are placeholders to be filled in by the caller.
+REPO_METRICS_URL = "https://raw.githubusercontent.com/huggingface/evaluate/{revision}/metrics/{path}/{name}"
+REPO_MEASUREMENTS_URL = "https://raw.githubusercontent.com/huggingface/evaluate/{revision}/measurements/{path}/{name}"
+REPO_COMPARISONS_URL = "https://raw.githubusercontent.com/huggingface/evaluate/{revision}/comparisons/{path}/{name}"
+# Evaluation module types
+EVALUATION_MODULE_TYPES = ["metric", "comparison", "measurement"]
+# Hub
+# The Hub endpoint can be overridden with the HF_ENDPOINT environment variable.
+HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
+# URL templates derived from the endpoint; `{type}`, `{path}`, `{revision}` and `{name}` are placeholders.
+HF_LIST_ENDPOINT = HF_ENDPOINT + "/api/spaces?filter={type}"
+HUB_EVALUATE_URL = HF_ENDPOINT + "/spaces/{path}/resolve/{revision}/{name}"
+HUB_DEFAULT_VERSION = "main"
+# Version of the running interpreter, parsed so it can be compared with other parsed versions.
+PY_VERSION = version.parse(platform.python_version())
+if PY_VERSION < version.parse("3.8"):
+    import importlib_metadata
+else:
+    import importlib.metadata as importlib_metadata
+# General environment variables accepted values for booleans
+ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
+ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
+# Imports
+# Versions of pandas and pyarrow, read from the installed package metadata.
+PANDAS_VERSION = version.parse(importlib_metadata.version("pandas"))
+PYARROW_VERSION = version.parse(importlib_metadata.version("pyarrow"))
+# Per-framework switches. Values are upper-cased and compared against the sets above;
+# "AUTO" (the default) means "use the framework if it is installed".
+USE_TF = os.environ.get("USE_TF", "AUTO").upper()
+USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
+USE_JAX = os.environ.get("USE_JAX", "AUTO").upper()
+# PyTorch is considered only if it is not disabled and TensorFlow was not explicitly requested.
+# `importlib.util` is a submodule and must be imported explicitly (see the module imports).
+TORCH_VERSION = "N/A"
+TORCH_AVAILABLE = False
+if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
+    TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None
+    if TORCH_AVAILABLE:
+        try:
+            TORCH_VERSION = version.parse(importlib_metadata.version("torch"))
+            logger.info(f"PyTorch version {TORCH_VERSION} available.")
+        except importlib_metadata.PackageNotFoundError:
+            # Importable but without distribution metadata: stays available, version stays "N/A".
+            pass
+elif USE_TF in ENV_VARS_TRUE_VALUES:
+    logger.info("Disabling PyTorch because USE_TF is set")
+else:
+    # Reached only when USE_TORCH itself holds a non-true, non-AUTO value.
+    logger.info("Disabling PyTorch because USE_TORCH is set to False")
+# TensorFlow is considered only if it is not disabled and PyTorch was not explicitly requested.
+TF_VERSION = "N/A"
+TF_AVAILABLE = False
+if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
+    TF_AVAILABLE = importlib.util.find_spec("tensorflow") is not None
+    if TF_AVAILABLE:
+        # For the metadata, we have to look for both tensorflow and tensorflow-cpu
+        for package in [
+            "tensorflow",
+            "tensorflow-cpu",
+            "tensorflow-gpu",
+            "tf-nightly",
+            "tf-nightly-cpu",
+            "tf-nightly-gpu",
+            "intel-tensorflow",
+            "tensorflow-rocm",
+            "tensorflow-macos",
+        ]:
+            try:
+                TF_VERSION = version.parse(importlib_metadata.version(package))
+            except importlib_metadata.PackageNotFoundError:
+                continue
+            else:
+                # Stop at the first distribution whose metadata is found.
+                break
+        else:
+            # None of the known distributions is installed: treat TensorFlow as unavailable.
+            TF_AVAILABLE = False
+    if TF_AVAILABLE:
+        if TF_VERSION.major < 2:
+            logger.info(f"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.")
+            TF_AVAILABLE = False
+        else:
+            logger.info(f"TensorFlow version {TF_VERSION} available.")
+elif USE_TORCH in ENV_VARS_TRUE_VALUES:
+    logger.info("Disabling Tensorflow because USE_TORCH is set")
+else:
+    # Reached only when USE_TF itself holds a non-true, non-AUTO value.
+    logger.info("Disabling Tensorflow because USE_TF is set to False")
+# JAX does not compete with the other two frameworks; only its own flag is consulted.
+JAX_VERSION = "N/A"
+JAX_AVAILABLE = False
+if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
+    JAX_AVAILABLE = importlib.util.find_spec("jax") is not None
+    if JAX_AVAILABLE:
+        try:
+            JAX_VERSION = version.parse(importlib_metadata.version("jax"))
+            logger.info(f"JAX version {JAX_VERSION} available.")
+        except importlib_metadata.PackageNotFoundError:
+            pass
+else:
+    logger.info("Disabling JAX because USE_JAX is set to False")
+# Cache location
+# Resolution order for the cache root: HF_HOME if set, otherwise $XDG_CACHE_HOME/huggingface,
+# otherwise ~/.cache/huggingface. `expanduser` is applied to the resulting root.
+DEFAULT_XDG_CACHE_HOME = "~/.cache"
+XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
+DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
+HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))
+# Each sub-cache defaults to a directory under the cache root and can be overridden
+# individually through the environment variable of the same name.
+DEFAULT_HF_EVALUATE_CACHE = os.path.join(HF_CACHE_HOME, "evaluate")
+HF_EVALUATE_CACHE = Path(os.getenv("HF_EVALUATE_CACHE", DEFAULT_HF_EVALUATE_CACHE))
+DEFAULT_HF_METRICS_CACHE = os.path.join(HF_CACHE_HOME, "metrics")
+HF_METRICS_CACHE = Path(os.getenv("HF_METRICS_CACHE", DEFAULT_HF_METRICS_CACHE))
+DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules")
+HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE))
+# Downloads live in "<evaluate cache>/downloads" unless overridden.
+DOWNLOADED_DATASETS_DIR = "downloads"
+DEFAULT_DOWNLOADED_EVALUATE_PATH = os.path.join(HF_EVALUATE_CACHE, DOWNLOADED_DATASETS_DIR)
+DOWNLOADED_EVALUATE_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_EVALUATE_PATH", DEFAULT_DOWNLOADED_EVALUATE_PATH))
+# NOTE(review): the extracted default is derived from DEFAULT_DOWNLOADED_EVALUATE_PATH, not from
+# DOWNLOADED_EVALUATE_PATH, so overriding the download path does not move it — confirm this is intended.
+EXTRACTED_EVALUATE_DIR = "extracted"
+DEFAULT_EXTRACTED_EVALUATE_PATH = os.path.join(DEFAULT_DOWNLOADED_EVALUATE_PATH, EXTRACTED_EVALUATE_DIR)
+EXTRACTED_EVALUATE_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_EVALUATE_PATH", DEFAULT_EXTRACTED_EVALUATE_PATH))
+# Download count for the website
+# True when the variable is unset ("AUTO") or holds one of the accepted true values.
+HF_UPDATE_DOWNLOAD_COUNTS = (
+    os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
+)
+# Offline mode
+# "AUTO" is not an accepted true value here, so offline mode is off unless explicitly enabled.
+HF_EVALUATE_OFFLINE = os.environ.get("HF_EVALUATE_OFFLINE", "AUTO").upper() in ENV_VARS_TRUE_VALUES
+# File names
+LICENSE_FILENAME = "LICENSE"
+METRIC_INFO_FILENAME = "metric_info.json"
+DATASETDICT_JSON_FILENAME = "dataset_dict.json"
+MODULE_NAME_FOR_DYNAMIC_MODULES = "evaluate_modules"
+# Task identifiers accepted for the Hub.
+# NOTE(review): presumably used to validate task names when pushing results — confirm against the caller.
+HF_HUB_ALLOWED_TASKS = [
+    "image-classification",
+    "translation",
+    "image-segmentation",
+    "fill-mask",
+    "automatic-speech-recognition",
+    "token-classification",
+    "sentence-similarity",
+    "audio-classification",
+    "question-answering",
+    "summarization",
+    "zero-shot-classification",
+    "table-to-text",
+    "feature-extraction",
+    "other",
+    "multiple-choice",
+    "text-classification",
+    "text-to-image",
+    "text2text-generation",
+    "zero-shot-image-classification",
+    "tabular-classification",
+    "tabular-regression",
+    "image-to-image",
+    "tabular-to-text",
+    "unconditional-image-generation",
+    "text-retrieval",
+    "text-to-speech",
+    "object-detection",
+    "audio-to-audio",
+    "text-generation",
+    "conversational",
+    "table-question-answering",
+    "visual-question-answering",
+    "image-to-text",
+    "reinforcement-learning",
+    "voice-activity-detection",
+    "time-series-forecasting",
+    "document-question-answering",
+]
--- a/evaluate-0.4.2/src/evaluate/evaluation_suite/__init__.py
+++ b/evaluate-0.4.2/src/evaluate/evaluation_suite/__init__.py
+import importlib
+import inspect
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Dict, Optional, Union
+from datasets import Dataset, DownloadConfig, DownloadMode, load_dataset
+from datasets.utils.version import Version
+from ..evaluator import evaluator
+from ..loading import evaluation_module_factory
+from ..utils.logging import get_logger
+logger = get_logger(__name__)
+@dataclass
+class SubTask:
+    """Definition of one task of an evaluation suite.
+
+    All fields are validated in `__post_init__`; a `ValueError` is raised for a field of the
+    wrong type. Although `data` has a `None` default, validation requires it to be a `str` or a
+    `Dataset`, so it must always be provided.
+    """
+    # Name of the evaluator task, e.g. "text-classification".
+    task_type: str
+    # Dataset name (str) or an already-instantiated `Dataset`.
+    data: Optional[Union[str, Dataset]] = None
+    # Dataset configuration name and split.
+    subset: Optional[str] = None
+    split: Optional[str] = None
+    # Optional function applied to the dataset with `Dataset.map` before evaluation.
+    data_preprocessor: Optional[Callable] = None
+    # Extra keyword arguments for the evaluator's `compute` call.
+    args_for_task: Optional[dict] = None
+    def __post_init__(self):
+        # `isinstance` (rather than an exact `type(...) is` comparison) also accepts subclasses,
+        # e.g. a `str` subclass or an `OrderedDict`.
+        if not isinstance(self.task_type, str):
+            raise ValueError(f"'task_type' must be type 'str', got {type(self.task_type)}")
+        if not isinstance(self.data, (Dataset, str)):
+            raise ValueError(
+                f"'data' must be an already-instantiated Dataset object or type 'str', got {type(self.data)}"
+            )
+        # The optional fields are only checked when they hold a truthy value.
+        if self.subset and not isinstance(self.subset, str):
+            raise ValueError(f"'subset' must be type 'str', got {type(self.subset)}")
+        if self.split and not isinstance(self.split, str):
+            raise ValueError(f"'split' must be type 'str', got {type(self.split)}")
+        if self.data_preprocessor and not callable(self.data_preprocessor):
+            raise ValueError(f"'data_preprocessor' must be a Callable, got {self.data_preprocessor}")
+        if self.args_for_task and not isinstance(self.args_for_task, dict):
+            raise ValueError(f"'args_for_task' must be type 'dict', got {type(self.args_for_task)}")
+def import_main_class(module_path):
+    """Import the module at `module_path` and return its concrete `Suite` class.
+
+    The module namespace is scanned in definition order; the first class whose `__name__` is
+    "Suite" and that is not abstract is returned. `None` is returned when there is no such class.
+    """
+    loaded_module = importlib.import_module(module_path)
+    suite_classes = (
+        member
+        for member in vars(loaded_module).values()
+        if isinstance(member, type) and member.__name__ == "Suite" and not inspect.isabstract(member)
+    )
+    return next(suite_classes, None)
+class EvaluationSuite:
+    """
+    This class instantiates an evaluation suite made up of multiple tasks, where each task consists of a dataset and
+    an associated metric, and runs evaluation on a model or pipeline. Evaluation suites can be a Python script found
+    either locally or uploaded as a Space on the Hugging Face Hub.
+    Subclasses are expected to set `self.suite` to a list of `SubTask` objects.
+    Usage:
+    ```python
+    from evaluate import EvaluationSuite
+    suite = EvaluationSuite.load("evaluate/evaluation-suite-ci")
+    results = suite.run("lvwerra/distilbert-imdb")
+    ```
+    """
+    def __init__(self, name):
+        self.name = name
+    @staticmethod
+    def load(
+        path: str,
+        download_mode: Optional[DownloadMode] = None,
+        revision: Optional[Union[str, Version]] = None,
+        download_config: Optional[DownloadConfig] = None,
+    ):
+        """Resolve `path` to an evaluation suite script and return an instance of its `Suite` class.
+
+        Args:
+            path: Location of the suite, passed to `evaluation_module_factory`; its stem becomes the suite name.
+            download_mode: Defaults to `DownloadMode.REUSE_DATASET_IF_EXISTS` when not given.
+            revision: Forwarded to `evaluation_module_factory`.
+            download_config: Forwarded to `evaluation_module_factory`.
+
+        Raises:
+            ValueError: if the loaded module does not define a concrete class named `Suite`.
+        """
+        download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
+        evaluation_module = evaluation_module_factory(
+            path, module_type=None, revision=revision, download_config=download_config, download_mode=download_mode
+        )
+        name = Path(path).stem
+        evaluation_cls = import_main_class(evaluation_module.module_path)
+        # `import_main_class` returns None when nothing matches; fail with a clear message
+        # instead of "'NoneType' object is not callable".
+        if evaluation_cls is None:
+            raise ValueError(f"No concrete class named 'Suite' was found in the module loaded from '{path}'.")
+        evaluation_instance = evaluation_cls(name)
+        return evaluation_instance
+    def __repr__(self):
+        # `suite` is set by subclasses; fall back to an empty list so repr never raises.
+        self.tasks = [str(task) for task in getattr(self, "suite", [])]
+        return f'EvaluationSuite name: "{self.name}", ' f"Tasks: {self.tasks})"
+    def assert_suite_nonempty(self):
+        """Raise `ValueError` when the suite defines no tasks (or no `suite` attribute at all)."""
+        if not getattr(self, "suite", None):
+            raise ValueError(
+                "No evaluation tasks found. The EvaluationSuite must include at least one SubTask definition."
+            )
+    def run(
+        self, model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"]  # noqa: F821
+    ) -> list:
+        """Evaluate `model_or_pipeline` on every task of the suite.
+
+        Returns:
+            A list with one result dict per task, in suite order. Each dict is the output of the task
+            evaluator's `compute`, extended with the keys "task_name" and "data_preprocessor".
+
+        Raises:
+            ValueError: if the suite has no tasks.
+        """
+        self.assert_suite_nonempty()
+        results_all = []
+        for task in self.suite:
+            task_name = task.data
+            # Work on local values so the SubTask definition is left untouched and the suite
+            # can be run again (e.g. on another model).
+            data = task.data
+            if task.data_preprocessor:  # task requires extra preprocessing
+                ds = load_dataset(task.data, name=task.subset, split=task.split)
+                data = ds.map(task.data_preprocessor)
+            task_evaluator = evaluator(task.task_type)
+            # Copy the user-supplied arguments; `args_for_task` may also be None.
+            args_for_task = dict(task.args_for_task or {})
+            args_for_task["model_or_pipeline"] = model_or_pipeline
+            args_for_task["data"] = data
+            args_for_task["subset"] = task.subset
+            args_for_task["split"] = task.split
+            results = task_evaluator.compute(**args_for_task)
+            results["task_name"] = task_name + "/" + task.subset if task.subset else task_name
+            results["data_preprocessor"] = str(task.data_preprocessor) if task.data_preprocessor is not None else None
+            results_all.append(results)
+        return results_all
--- a/evaluate-0.4.2/src/evaluate/evaluator/__init__.py
+++ b/evaluate-0.4.2/src/evaluate/evaluator/__init__.py
+# Copyright 2022 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+try:
+    from transformers.pipelines import SUPPORTED_TASKS as SUPPORTED_PIPELINE_TASKS
+    from transformers.pipelines import TASK_ALIASES
+    from transformers.pipelines import check_task as check_pipeline_task
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    TRANSFORMERS_AVAILABLE = False
+from typing import Dict, List
+from .audio_classification import AudioClassificationEvaluator
+from .automatic_speech_recognition import AutomaticSpeechRecognitionEvaluator
+from .base import Evaluator
+from .image_classification import ImageClassificationEvaluator
+from .question_answering import QuestionAnsweringEvaluator
+from .text2text_generation import SummarizationEvaluator, Text2TextGenerationEvaluator, TranslationEvaluator
+from .text_classification import TextClassificationEvaluator
+from .text_generation import TextGenerationEvaluator
+from .token_classification import TokenClassificationEvaluator
+# Registry of the tasks an evaluator exists for. Each entry maps the task name to the
+# evaluator class ("implementation") and the name of the metric used by default
+# ("default_metric_name"); both keys are read by `evaluator()`.
+SUPPORTED_EVALUATOR_TASKS = {
+    "text-classification": {
+        "implementation": TextClassificationEvaluator,
+        "default_metric_name": "accuracy",
+    },
+    "image-classification": {
+        "implementation": ImageClassificationEvaluator,
+        "default_metric_name": "accuracy",
+    },
+    "question-answering": {
+        "implementation": QuestionAnsweringEvaluator,
+        "default_metric_name": "squad",
+    },
+    "token-classification": {
+        "implementation": TokenClassificationEvaluator,
+        "default_metric_name": "seqeval",
+    },
+    "text-generation": {
+        "implementation": TextGenerationEvaluator,
+        "default_metric_name": "word_count",
+    },
+    "text2text-generation": {
+        "implementation": Text2TextGenerationEvaluator,
+        "default_metric_name": "bleu",
+    },
+    "summarization": {
+        "implementation": SummarizationEvaluator,
+        "default_metric_name": "rouge",
+    },
+    "translation": {
+        "implementation": TranslationEvaluator,
+        "default_metric_name": "bleu",
+    },
+    "automatic-speech-recognition": {
+        "implementation": AutomaticSpeechRecognitionEvaluator,
+        "default_metric_name": "wer",
+    },
+    "audio-classification": {
+        "implementation": AudioClassificationEvaluator,
+        "default_metric_name": "accuracy",
+    },
+}
+def get_supported_tasks() -> List[str]:
+    """
+    Return the names of all tasks registered in `SUPPORTED_EVALUATOR_TASKS`, in registration order.
+    """
+    return [task_name for task_name in SUPPORTED_EVALUATOR_TASKS]
+def check_task(task: str) -> Dict:
+    """
+    Validates an incoming task string and returns the default Evaluator class and default metric name for it.
+    Aliases listed in `TASK_ALIASES` are resolved first. The task is then checked against the `transformers`
+    pipeline tasks and finally against the `Evaluator` tasks. `Evaluator` tasks are a subset of `Pipeline` tasks.
+    Args:
+        task (`str`):
+            The task defining which evaluator will be returned. Currently accepted tasks are:
+            - `"audio-classification"`
+            - `"automatic-speech-recognition"`
+            - `"image-classification"`
+            - `"question-answering"`
+            - `"summarization"`
+            - `"text-classification"` (alias `"sentiment-analysis"` available)
+            - `"text-generation"`
+            - `"text2text-generation"`
+            - `"token-classification"`
+            - `"translation"`
+    Returns:
+        task_defaults: `dict`, contains the implementation class of a given Evaluator and the default metric name.
+    Raises:
+        ImportError: if `transformers` is not installed.
+        KeyError: if the task is not a supported evaluator task.
+    """
+    # The names imported from `transformers` only exist when the import at the top of the module
+    # succeeded; without this guard a missing dependency surfaces as a NameError.
+    if not TRANSFORMERS_AVAILABLE:
+        raise ImportError(
+            "If you want to use the `Evaluator` you need `transformers`. Run `pip install evaluate[transformers]`."
+        )
+    if task in TASK_ALIASES:
+        task = TASK_ALIASES[task]
+    if not check_pipeline_task(task):
+        raise KeyError(f"Unknown task {task}, available tasks are: {get_supported_tasks()}.")
+    if task in SUPPORTED_EVALUATOR_TASKS and task in SUPPORTED_PIPELINE_TASKS:
+        return SUPPORTED_EVALUATOR_TASKS[task]
+    raise KeyError(f"Unknown task {task}, available tasks are: {get_supported_tasks()}.")
+def evaluator(task: str = None) -> Evaluator:
+    """
+    Factory that builds the [`Evaluator`] registered for a task.
+    An evaluator bundles a task with a default metric name. It relies on the `pipeline` functionality of
+    `transformers`, which therefore has to be installed.
+    Args:
+        task (`str`):
+            The task defining which evaluator will be returned. Currently accepted tasks are:
+            - `"audio-classification"`: will return a [`AudioClassificationEvaluator`].
+            - `"automatic-speech-recognition"`: will return a [`AutomaticSpeechRecognitionEvaluator`].
+            - `"image-classification"`: will return a [`ImageClassificationEvaluator`].
+            - `"question-answering"`: will return a [`QuestionAnsweringEvaluator`].
+            - `"summarization"`: will return a [`SummarizationEvaluator`].
+            - `"text-classification"` (alias `"sentiment-analysis"` available): will return a [`TextClassificationEvaluator`].
+            - `"text-generation"`: will return a [`TextGenerationEvaluator`].
+            - `"text2text-generation"`: will return a [`Text2TextGenerationEvaluator`].
+            - `"token-classification"`: will return a [`TokenClassificationEvaluator`].
+            - `"translation"`: will return a [`TranslationEvaluator`].
+    Returns:
+        [`Evaluator`]: An evaluator suitable for the task.
+    Examples:
+    ```python
+    >>> from evaluate import evaluator
+    >>> # Sentiment analysis evaluator
+    >>> evaluator("sentiment-analysis")
+    ```"""
+    if TRANSFORMERS_AVAILABLE:
+        task_defaults = check_task(task)
+        implementation = task_defaults["implementation"]
+        # The task string is forwarded as given by the caller (an alias is not replaced here).
+        return implementation(task=task, default_metric_name=task_defaults["default_metric_name"])
+    raise ImportError(
+        "If you want to use the `Evaluator` you need `transformers`. Run `pip install evaluate[transformers]`."
+    )
--- a/evaluate-0.4.2/src/evaluate/evaluator/audio_classification.py
+++ b/evaluate-0.4.2/src/evaluate/evaluator/audio_classification.py
+# Copyright 2022 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from numbers import Number
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
+from datasets import Dataset
+from typing_extensions import Literal
+from ..module import EvaluationModule
+from ..utils.file_utils import add_end_docstrings, add_start_docstrings
+from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
+if TYPE_CHECKING:
+    from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
+# Usage examples appended to the docstring of `AudioClassificationEvaluator.compute`
+# through the `add_end_docstrings` decorator.
+TASK_DOCUMENTATION = r"""
+    Examples:
+    <Tip>
+    Remember that, in order to process audio files, you need ffmpeg installed (https://ffmpeg.org/download.html)
+    </Tip>
+    ```python
+    >>> from evaluate import evaluator
+    >>> from datasets import load_dataset
+    >>> task_evaluator = evaluator("audio-classification")
+    >>> data = load_dataset("superb", 'ks', split="test[:40]")
+    >>> results = task_evaluator.compute(
+    >>>     model_or_pipeline="superb/wav2vec2-base-superb-ks",
+    >>>     data=data,
+    >>>     label_column="label",
+    >>>     input_column="file",
+    >>>     metric="accuracy",
+    >>>     label_mapping={0: "yes", 1: "no", 2: "up", 3: "down"}
+    >>> )
+    ```
+    <Tip>
+    The evaluator supports raw audio data as well, in the form of a numpy array. However, be aware that calling
+    the audio column automatically decodes and resamples the audio files, which can be slow for large datasets.
+    </Tip>
+    ```python
+    >>> from evaluate import evaluator
+    >>> from datasets import load_dataset
+    >>> task_evaluator = evaluator("audio-classification")
+    >>> data = load_dataset("superb", 'ks', split="test[:40]")
+    >>> data = data.map(lambda example: {"audio": example["audio"]["array"]})
+    >>> results = task_evaluator.compute(
+    >>>     model_or_pipeline="superb/wav2vec2-base-superb-ks",
+    >>>     data=data,
+    >>>     label_column="label",
+    >>>     input_column="audio",
+    >>>     metric="accuracy",
+    >>>     label_mapping={0: "yes", 1: "no", 2: "up", 3: "down"}
+    >>> )
+    ```
+"""
+class AudioClassificationEvaluator(Evaluator):
+    """
+    Evaluator for the `audio-classification` task.
+    It can be obtained from [`evaluator`] with the task name `audio-classification`.
+    The data format is expected to be compatible with the [`transformers.AudioClassificationPipeline`].
+    """
+    # No extra keyword arguments are defined for this task.
+    PIPELINE_KWARGS = {}
+    def __init__(self, task="audio-classification", default_metric_name=None):
+        super().__init__(task, default_metric_name=default_metric_name)
+    def predictions_processor(self, predictions, label_mapping):
+        """Reduce each prediction to the label of its highest-scoring candidate.
+
+        When `label_mapping` is given, every selected label is translated through it.
+        """
+        top_labels = []
+        for candidates in predictions:
+            best = max(candidates, key=lambda candidate: candidate["score"])
+            top_labels.append(best["label"])
+        if label_mapping is not None:
+            top_labels = [label_mapping[label] for label in top_labels]
+        return {"predictions": top_labels}
+    @add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
+    @add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
+    def compute(
+        self,
+        model_or_pipeline: Union[
+            str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"  # noqa: F821
+        ] = None,
+        data: Union[str, Dataset] = None,
+        subset: Optional[str] = None,
+        split: Optional[str] = None,
+        metric: Union[str, EvaluationModule] = None,
+        tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None,  # noqa: F821
+        feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None,  # noqa: F821
+        strategy: Literal["simple", "bootstrap"] = "simple",
+        confidence_level: float = 0.95,
+        n_resamples: int = 9999,
+        device: int = None,
+        random_state: Optional[int] = None,
+        input_column: str = "file",
+        label_column: str = "label",
+        label_mapping: Optional[Dict[str, Number]] = None,
+    ) -> Tuple[Dict[str, float], Any]:
+        """
+        input_column (`str`, defaults to `"file"`):
+            The name of the column containing either the audio files or a raw waveform, represented as a numpy array, in the dataset specified by `data`.
+        label_column (`str`, defaults to `"label"`):
+            The name of the column containing the labels in the dataset specified by `data`.
+        label_mapping (`Dict[str, Number]`, *optional*, defaults to `None`):
+            We want to map class labels defined by the model in the pipeline to values consistent with those
+            defined in the `label_column` of the `data` dataset.
+        """
+        # Every argument is forwarded unchanged to the generic implementation.
+        return super().compute(
+            model_or_pipeline=model_or_pipeline,
+            data=data,
+            subset=subset,
+            split=split,
+            metric=metric,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            strategy=strategy,
+            confidence_level=confidence_level,
+            n_resamples=n_resamples,
+            device=device,
+            random_state=random_state,
+            input_column=input_column,
+            label_column=label_column,
+            label_mapping=label_mapping,
+        )
--- a/evaluate-0.4.2/src/evaluate/evaluator/automatic_speech_recognition.py
+++ b/evaluate-0.4.2/src/evaluate/evaluator/automatic_speech_recognition.py
+# Copyright 2022 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
+from datasets import Dataset
+from typing_extensions import Literal
+from ..module import EvaluationModule
+from ..utils.file_utils import add_end_docstrings, add_start_docstrings
+from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
+if TYPE_CHECKING:
+    from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
+# Usage example appended to the docstring of `AutomaticSpeechRecognitionEvaluator.compute`
+# through the `add_end_docstrings` decorator. The model is referenced by its Hub repository id
+# (`namespace/name`), not by the URL of its web page.
+TASK_DOCUMENTATION = r"""
+    Examples:
+    ```python
+    >>> from evaluate import evaluator
+    >>> from datasets import load_dataset
+    >>> task_evaluator = evaluator("automatic-speech-recognition")
+    >>> data = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="validation[:40]")
+    >>> results = task_evaluator.compute(
+    >>>     model_or_pipeline="openai/whisper-tiny.en",
+    >>>     data=data,
+    >>>     input_column="path",
+    >>>     label_column="sentence",
+    >>>     metric="wer",
+    >>> )
+    ```
+"""
+class AutomaticSpeechRecognitionEvaluator(Evaluator):
+    """
+    Automatic speech recognition evaluator.
+    This automatic speech recognition evaluator can currently be loaded from [`evaluator`] using the default task name
+    `automatic-speech-recognition`.
+    Methods in this class assume a data format compatible with the [`AutomaticSpeechRecognitionPipeline`].
+    """
+    # Class-level defaults shared by all instances; `compute` must not modify this dict in place.
+    PIPELINE_KWARGS = {"truncation": True}
+    def __init__(self, task="automatic-speech-recognition", default_metric_name=None):
+        super().__init__(task, default_metric_name=default_metric_name)
+    def predictions_processor(self, predictions, label_mapping):
+        """Extract the transcribed text of every prediction; `label_mapping` is not used."""
+        return {"predictions": [pred["text"] for pred in predictions]}
+    @add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
+    @add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
+    def compute(
+        self,
+        model_or_pipeline: Union[
+            str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"  # noqa: F821
+        ] = None,
+        data: Union[str, Dataset] = None,
+        subset: Optional[str] = None,
+        split: Optional[str] = None,
+        metric: Union[str, EvaluationModule] = None,
+        tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None,  # noqa: F821
+        strategy: Literal["simple", "bootstrap"] = "simple",
+        confidence_level: float = 0.95,
+        n_resamples: int = 9999,
+        device: int = None,
+        random_state: Optional[int] = None,
+        input_column: str = "path",
+        label_column: str = "sentence",
+        generation_kwargs: dict = None,
+    ) -> Tuple[Dict[str, float], Any]:
+        """
+        input_column (`str`, defaults to `"path"`):
+            the name of the column containing the input audio path in the dataset specified by `data`.
+        label_column (`str`, defaults to `"sentence"`):
+            the name of the column containing the labels in the dataset specified by `data`.
+        generation_kwargs (`Dict`, *optional*, defaults to `None`):
+            The generation kwargs are passed to the pipeline and set the text generation strategy.
+        """
+        if generation_kwargs is not None:
+            # Rebind a merged copy on the instance instead of calling `.update()` on the class-level
+            # dict, which would leak these kwargs into every other evaluator instance.
+            # NOTE(review): assumes the base class reads the kwargs via `self.PIPELINE_KWARGS` — confirm.
+            self.PIPELINE_KWARGS = {**self.PIPELINE_KWARGS, **generation_kwargs}
+        result = super().compute(
+            model_or_pipeline=model_or_pipeline,
+            data=data,
+            subset=subset,
+            split=split,
+            metric=metric,
+            tokenizer=tokenizer,
+            strategy=strategy,
+            confidence_level=confidence_level,
+            n_resamples=n_resamples,
+            device=device,
+            random_state=random_state,
+            input_column=input_column,
+            label_column=label_column,
+        )
+        return result
--- a/evaluate-0.4.2/src/evaluate/evaluator/base.py
+++ b/evaluate-0.4.2/src/evaluate/evaluator/base.py
+# Copyright 2022 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import ABC, abstractmethod
+from numbers import Number
+from typing import Any, Callable, Dict, List, Optional, Union
+# Lint as: python3
+from datasets import Dataset, load_dataset
+from evaluate.evaluator.utils import choose_split
+try:
+    from scipy.stats import bootstrap
+    SCIPY_AVAILABLE = True
+except ImportError:
+    SCIPY_AVAILABLE = False
+try:
+    import transformers
+    from transformers import Pipeline, pipeline
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    TRANSFORMERS_AVAILABLE = False
+from time import perf_counter
+from typing_extensions import Literal
+from ..loading import load
+from ..module import EvaluationModule
+from ..utils.logging import get_logger
+from .utils import DatasetColumn
+logger = get_logger(__name__)
+EVALUTOR_COMPUTE_START_DOCSTRING = r"""
+    Compute the metric for a given pipeline and dataset combination.
+    Args:
+        model_or_pipeline (`str` or `Pipeline` or `Callable` or `PreTrainedModel` or `TFPreTrainedModel`, defaults to `None`):
+            If the argument is not specified, we initialize the default pipeline for the task (in this case
+            `text-classification` or its alias - `sentiment-analysis`). If the argument is of the type `str` or
+            is a model instance, we use it to initialize a new `Pipeline` with the given model. Otherwise we assume the
+            argument specifies a pre-initialized pipeline.
+        data (`str` or `Dataset`, defaults to `None`):
+            Specifies the dataset we will run evaluation on. If it is of type `str`, we treat it as the dataset
+            name, and load it. Otherwise we assume it represents a pre-loaded dataset.
+        subset (`str`, defaults to `None`):
+            Defines which dataset subset to load. If `None` is passed the default subset is loaded.
+        split (`str`, defaults to `None`):
+            Defines which dataset split to load. If `None` is passed, infers based on the `choose_split` function.
+        metric (`str` or `EvaluationModule`, defaults to `None`):
+            Specifies the metric we use in evaluator. If it is of type `str`, we treat it as the metric name, and
+            load it. Otherwise we assume it represents a pre-loaded metric.
+        tokenizer (`str` or `PreTrainedTokenizer`, *optional*, defaults to `None`):
+            Argument can be used to overwrite a default tokenizer if `model_or_pipeline` represents a model for
+            which we build a pipeline. If `model_or_pipeline` is `None` or a pre-initialized pipeline, we ignore
+            this argument.
+        strategy (`Literal["simple", "bootstrap"]`, defaults to "simple"):
+            specifies the evaluation strategy. Possible values are:
+            - `"simple"` - we evaluate the metric and return the scores.
+            - `"bootstrap"` - on top of computing the metric scores, we calculate the confidence interval for each
+            of the returned metric keys, using `scipy`'s `bootstrap` method
+            https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html.
+        confidence_level (`float`, defaults to `0.95`):
+            The `confidence_level` value passed to `bootstrap` if `"bootstrap"` strategy is chosen.
+        n_resamples (`int`, defaults to `9999`):
+            The `n_resamples` value passed to `bootstrap` if `"bootstrap"` strategy is chosen.
+        device (`int`, defaults to `None`):
+            Device ordinal for CPU/GPU support of the pipeline. Setting this to -1 will leverage CPU, a positive
+            integer will run the model on the associated CUDA device ID. If `None` is provided it will be inferred and
+            CUDA:0 used if available, CPU otherwise.
+        random_state (`int`, *optional*, defaults to `None`):
+            The `random_state` value passed to `bootstrap` if `"bootstrap"` strategy is chosen. Useful for
+            debugging.
+"""
+EVALUATOR_COMPUTE_RETURN_DOCSTRING = r"""
+    Return:
+        A `Dict`. The keys represent metric keys calculated for the `metric` specified in function arguments. For the
+        `"simple"` strategy, the value is the metric score. For the `"bootstrap"` strategy, the value is a `Dict`
+        containing the score, the confidence interval and the standard error calculated for each metric key.
+"""
+class Evaluator(ABC):
+    """
+    The [`Evaluator`] class is the class from which all evaluators inherit. Refer to this class for methods shared across
+    different evaluators.
+    Base class implementing evaluator operations.
+    """
+    # Extra keyword arguments that `call_pipeline` splats into every pipeline invocation.
+    # NOTE(review): this dict is a class attribute, so an in-place update made through `self.PIPELINE_KWARGS`
+    # is visible to every instance sharing the same class-level dict.
+    PIPELINE_KWARGS = {}
+    # Extra keyword arguments that `compute_metric` splats into `metric.compute`.
+    METRIC_KWARGS = {}
+    def __init__(self, task: str, default_metric_name: str = None):
+        """
+        Record the task and default metric once the optional dependencies are known to be importable.
+        Args:
+            task (`str`):
+                Pipeline task name handled by this evaluator.
+            default_metric_name (`str`, defaults to `None`):
+                Name of the metric loaded by `prepare_metric` when the caller supplies none.
+        Raises:
+            ImportError: if `transformers` or `scipy` failed to import at module load time.
+        """
+        # Order matters: a missing `transformers` is reported before a missing `scipy`.
+        dependency_checks = (
+            (
+                TRANSFORMERS_AVAILABLE,
+                "If you want to use the `Evaluator` you need `transformers`. Run `pip install evaluate[evaluator]`.",
+            ),
+            (
+                SCIPY_AVAILABLE,
+                "If you want to use the `Evaluator` you need `scipy>=1.7.1`. Run `pip install evaluate[evaluator]`.",
+            ),
+        )
+        for is_available, message in dependency_checks:
+            if not is_available:
+                raise ImportError(message)
+        self.default_metric_name = default_metric_name
+        self.task = task
+    @staticmethod
+    def _compute_confidence_interval(
+        metric,
+        metric_inputs,
+        metric_keys: List[str],
+        confidence_level: float = 0.95,
+        n_resamples: int = 9999,
+        random_state: Optional[int] = None,
+    ) -> Dict[str, Any]:
+        """
+        Estimate a confidence interval and standard error for each metric key with `scipy`'s `bootstrap`.
+        Args:
+            metric: object whose `compute(**inputs)` returns a mapping indexable by the entries of `metric_keys`.
+            metric_inputs: mapping from `metric.compute` argument name to the corresponding samples.
+            metric_keys (`List[str]`): keys of the metric result to bootstrap, one `bootstrap` run per key.
+            confidence_level (`float`): forwarded to `bootstrap`.
+            n_resamples (`int`): forwarded to `bootstrap`.
+            random_state (`int`, *optional*): forwarded to `bootstrap`.
+        Returns:
+            `Dict[str, Any]` mapping each key to `{"confidence_interval": (low, high), "standard_error": ...}`.
+        """
+        input_names = list(metric_inputs.keys())
+        def make_statistic(key):
+            # `bootstrap` passes the resampled data positionally, so the argument names are re-attached here
+            # before delegating to `metric.compute`.
+            def statistic(*resampled):
+                named_inputs = dict(zip(input_names, resampled))
+                return metric.compute(**named_inputs)[key]
+            return statistic
+        intervals = {}
+        for key in metric_keys:
+            outcome = bootstrap(
+                data=list(metric_inputs.values()),
+                statistic=make_statistic(key),
+                paired=True,
+                vectorized=False,
+                confidence_level=confidence_level,
+                n_resamples=n_resamples,
+                random_state=random_state,
+            )
+            interval = outcome.confidence_interval
+            intervals[key] = {
+                "confidence_interval": (interval.low, interval.high),
+                "standard_error": outcome.standard_error,
+            }
+        return intervals
+    @staticmethod
+    def _compute_time_perf(start_time: float, end_time: float, num_samples: int) -> Dict[str, Any]:
+        """
+        Derive timing statistics for one pipeline run.
+        Args:
+            start_time (`float`): timestamp taken before the pipeline call.
+            end_time (`float`): timestamp taken after the pipeline call.
+            num_samples (`int`): number of samples processed between the two timestamps.
+        Returns:
+            `Dict[str, Any]` with:
+            - `total_time_in_seconds` - `end_time - start_time`,
+            - `samples_per_second` - `num_samples` divided by the total time,
+            - `latency_in_seconds` - reciprocal of `samples_per_second`, i.e. seconds per sample.
+        """
+        elapsed = end_time - start_time
+        samples_per_second = num_samples / elapsed
+        # Kept as the reciprocal of the throughput (rather than elapsed / num_samples) on purpose.
+        seconds_per_sample = 1.0 / samples_per_second
+        return dict(
+            total_time_in_seconds=elapsed,
+            samples_per_second=samples_per_second,
+            latency_in_seconds=seconds_per_sample,
+        )
+    @staticmethod
+    def _infer_device() -> int:
+        """
+        Pick the default pipeline device: `0` when a GPU is detected, `-1` (CPU) otherwise.
+        `torch` is probed first; `tensorflow` is only consulted when `torch` cannot be imported. If neither
+        framework is importable the CPU is selected.
+        """
+        try:
+            import torch
+            gpu_found = torch.cuda.is_available()
+        except ImportError:
+            # torch is unavailable, fall back to TensorFlow's device listing
+            try:
+                import tensorflow as tf
+                gpu_found = len(tf.config.list_physical_devices("GPU")) > 0
+            except ImportError:
+                gpu_found = False
+        if gpu_found:
+            logger.info("GPU found. The default device for pipeline inference is set to GPU (CUDA:0).")
+            return 0  # first GPU
+        logger.info("No GPU found. The default device for pipeline inference is set to CPU.")
+        return -1  # CPU
+    @abstractmethod
+    def predictions_processor(self, *args, **kwargs):
+        """
+        Convert raw pipeline outputs into the keyword arguments expected by the metric.
+        Every concrete evaluator must override this method; `compute` merges the returned mapping into the
+        metric inputs.
+        """
+        raise NotImplementedError
+    def compute(
+        self,
+        model_or_pipeline: Union[
+            str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"  # noqa: F821
+        ] = None,
+        data: Union[str, Dataset] = None,
+        subset: Optional[str] = None,
+        split: Optional[str] = None,
+        metric: Union[str, EvaluationModule] = None,
+        tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None,  # noqa: F821
+        feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None,  # noqa: F821
+        strategy: Literal["simple", "bootstrap"] = "simple",
+        confidence_level: float = 0.95,
+        n_resamples: int = 9999,
+        device: int = None,
+        random_state: Optional[int] = None,
+        input_column: str = "text",
+        label_column: str = "label",
+        label_mapping: Optional[Dict[str, Number]] = None,
+    ) -> Dict[str, float]:
+        """
+        Run the full evaluation: load the data, build the pipeline, predict, and score the predictions.
+        The steps are delegated to `load_data`, `prepare_data`, `prepare_pipeline`, `prepare_metric`,
+        `call_pipeline`, `predictions_processor` and `compute_metric`, in that order.
+        Returns:
+            `Dict` holding the metric results merged with the timing entries produced by `call_pipeline`.
+        Raises:
+            ValueError: propagated from the helper methods, e.g. on a device mismatch or invalid `data`.
+        """
+        result = {}
+        self.check_for_mismatch_in_device_setup(device, model_or_pipeline)
+        # Prepare inputs
+        data = self.load_data(data=data, subset=subset, split=split)
+        metric_inputs, pipe_inputs = self.prepare_data(data=data, input_column=input_column, label_column=label_column)
+        pipe = self.prepare_pipeline(
+            model_or_pipeline=model_or_pipeline,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            device=device,
+        )
+        metric = self.prepare_metric(metric)
+        # Compute predictions
+        predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
+        predictions = self.predictions_processor(predictions, label_mapping)
+        metric_inputs.update(predictions)
+        # Compute metrics from references and predictions
+        metric_results = self.compute_metric(
+            metric=metric,
+            metric_inputs=metric_inputs,
+            strategy=strategy,
+            confidence_level=confidence_level,
+            n_resamples=n_resamples,
+            random_state=random_state,
+        )
+        # TODO: To clarify why `wer` and `cer` return float
+        # even though metric.compute contract says that it
+        # returns Optional[dict].
+        # `isinstance` (rather than an exact type check) also wraps float subclasses such as `numpy.float64`,
+        # which would otherwise make `result.update` fail on a bare scalar.
+        if isinstance(metric_results, float):
+            metric_results = {metric.name: metric_results}
+        result.update(metric_results)
+        result.update(perf_results)
+        return result
+    @staticmethod
+    def check_for_mismatch_in_device_setup(device, model_or_pipeline):
+        """
+        Reject a `device` argument that contradicts the device of an already-instantiated pipeline.
+        Nothing is checked when `device` is `None` or `-1`, or when `model_or_pipeline` is not a `Pipeline`.
+        Raises:
+            ValueError: if an accelerator was requested but the pipeline lives on CPU, or if the requested
+                device index differs from the pipeline's device index.
+        """
+        requests_accelerator = device is not None and device != -1
+        if not (requests_accelerator and isinstance(model_or_pipeline, Pipeline)):
+            return
+        pipe_device = model_or_pipeline.device
+        if pipe_device.type == "cpu":
+            raise ValueError(
+                "The value of the `device` kwarg passed to `compute` suggests that this pipe should be run on an "
+                "accelerator, but the pipe was instantiated on CPU. Pass `device` to the pipeline during "
+                "initialization to use an accelerator, or pass `device=None` to `compute`. "
+            )
+        if device != pipe_device.index:
+            raise ValueError(
+                f"This pipeline was instantiated on device {model_or_pipeline.device.index} but device={device} was passed to `compute`."
+            )
+    def check_required_columns(self, data: Union[str, Dataset], columns_names: Dict[str, str]):
+        """
+        Verify that every column needed for the evaluation exists in the dataset.
+        Args:
+            data (`str` or [`Dataset`]):
+                Specifies the dataset we will run evaluation on. Its `column_names` attribute is inspected.
+            columns_names (`Dict[str, str]`):
+                Mapping from argument name (used only in the error message) to the column name that must be
+                present in the dataset.
+        Raises:
+            ValueError: for the first requested column that is missing from `data.column_names`.
+        Example:
+        ```py
+        >>> from datasets import load_dataset
+        >>> from evaluate import evaluator
+        >>> data = load_dataset("rotten_tomatoes", split="train")
+        >>> evaluator("text-classification").check_required_columns(data, {"input_column": "text", "label_column": "label"})
+        ```
+        """
+        for input_name, column_name in columns_names.items():
+            if column_name in data.column_names:
+                continue
+            raise ValueError(
+                f"Invalid `{input_name}` {column_name} specified. The dataset contains the following columns: {data.column_names}."
+            )
+    @staticmethod
+    def get_dataset_split(data, subset=None, split=None):
+        """
+        Return the split to evaluate on, inferring one with `choose_split` when `split` is `None`.
+        Args:
+             data (`str`):
+                Name of dataset.
+             subset (`str`):
+                Name of config for datasets with multiple configurations (e.g. 'glue/cola').
+             split (`str`, defaults to `None`):
+                Split to use. Returned unchanged when it is not `None`.
+        Returns:
+            `split`: `str` containing which split to use. A warning is logged when the split had to be inferred.
+        Example:
+        ```py
+        >>> from evaluate import evaluator
+        >>> evaluator("text-classification").get_dataset_split(data="rotten_tomatoes")
+        WARNING:evaluate.evaluator.base:Dataset split not defined! Automatically evaluating with split: TEST
+        'test'
+        ```
+        """
+        if split is not None:
+            return split
+        inferred_split = choose_split(data, subset)
+        logger.warning(f"Dataset split not defined! Automatically evaluating with split: {inferred_split.upper()}")
+        return inferred_split
+    def load_data(self, data: Union[str, Dataset], subset: str = None, split: str = None):
+        """
+        Resolve `data` into a dataset object ready for evaluation.
+        Args:
+            data ([`Dataset`] or `str`, defaults to `None`):
+                Either a dataset name, which is loaded with `load_dataset`, or an already loaded dataset, which
+                is returned as is.
+            subset (`str`, defaults to `None`):
+                Passed as `name` to `load_dataset`. To be used with datasets with several configurations
+                (e.g. glue/sst2). Ignored, with a warning, for a preloaded dataset.
+            split (`str`, defaults to `None`):
+                Dataset split by name (e.g. train, validation, test). Supports slice-split (`test[:n]`).
+                When `data` is a `str` and no split is given, one is selected through `get_dataset_split`.
+                Ignored, with a warning, for a preloaded dataset.
+        Returns:
+            data ([`Dataset`]): Loaded dataset which will be used for evaluation.
+        Raises:
+            ValueError: if `data` is neither a `str` nor a [`Dataset`].
+        Example:
+        ```py
+        >>> from evaluate import evaluator
+        >>> evaluator("text-classification").load_data(data="rotten_tomatoes", split="train")
+        Dataset({
+            features: ['text', 'label'],
+            num_rows: 8530
+        })
+        ```
+        """
+        if isinstance(data, str):
+            resolved_split = self.get_dataset_split(data, subset, split)
+            return load_dataset(data, name=subset, split=resolved_split)
+        if not isinstance(data, Dataset):
+            raise ValueError(
+                "Please specify a valid `data` object - either a `str` with a name or a `Dataset` object."
+            )
+        if not (split is None and subset is None):
+            logger.warning("`data` is a preloaded Dataset! Ignoring `subset` and `split`.")
+        return data
+    def prepare_data(self, data: Dataset, input_column: str, label_column: str, *args, **kwargs):
+        """
+        Split the dataset into metric references and pipeline inputs.
+        Args:
+            data ([`Dataset`]):
+                Specifies the dataset we will run evaluation on.
+            input_column (`str`):
+                The name of the column fed to the pipeline.
+            label_column (`str`):
+                The name of the column containing the labels in the dataset specified by `data`.
+            *args, **kwargs:
+                Accepted and ignored by this base implementation.
+        Returns:
+            `dict`:  metric inputs, `{"references": <label column values>}`.
+            `DatasetColumn`:  pipeline inputs wrapping `input_column` of `data`.
+        Raises:
+            ValueError: if either column is missing from the dataset (see `check_required_columns`).
+        Example:
+        ```py
+        >>> from evaluate import evaluator
+        >>> from datasets import load_dataset
+        >>> ds = load_dataset("rotten_tomatoes", split="train")
+        >>> evaluator("text-classification").prepare_data(ds, input_column="text", label_column="label")
+        ```
+        """
+        required_columns = {"input_column": input_column, "label_column": label_column}
+        self.check_required_columns(data, required_columns)
+        metric_inputs = {"references": data[label_column]}
+        pipeline_inputs = DatasetColumn(data, input_column)
+        return metric_inputs, pipeline_inputs
+    def prepare_pipeline(
+        self,
+        model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"],  # noqa: F821
+        tokenizer: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None,  # noqa: F821
+        feature_extractor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None,  # noqa: F821
+        device: int = None,
+    ):
+        """
+        Prepare pipeline.
+        Args:
+            model_or_pipeline (`str` or [`~transformers.Pipeline`] or `Callable` or [`~transformers.PreTrainedModel`] or [`~transformers.TFPreTrainedModel`], defaults to `None`):
+                If the argument is not specified, we initialize the default pipeline for the task. If the argument is of the type `str` or
+                is a model instance, we use it to initialize a new [`~transformers.Pipeline`] with the given model. Otherwise we assume the
+                argument specifies a pre-initialized pipeline.
+            tokenizer ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.FeatureExtractionMixin`], *optional*, defaults to `None`):
+                Forwarded to `pipeline` only when `model_or_pipeline` is a model name or a model instance. If
+                `model_or_pipeline` is `None` or a pre-initialized pipeline, this argument is ignored and a warning is logged.
+            feature_extractor ([`~transformers.PreTrainedTokenizerBase`] or [`~transformers.FeatureExtractionMixin`], *optional*, defaults to `None`):
+                Handled exactly like `tokenizer`.
+            device (`int`, defaults to `None`):
+                Device passed to `pipeline` when a new pipeline is created. If `None`, it is chosen by `_infer_device`.
+        Returns:
+            The initialized pipeline.
+        Raises:
+            ValueError: if the task of the resulting pipeline is incompatible with the evaluator's task.
+        Example:
+        ```py
+        >>> from evaluate import evaluator
+        >>> evaluator("text-classification").prepare_pipeline(model_or_pipeline="distilbert-base-uncased")
+        ```
+        """
+        if device is None:
+            device = self._infer_device()
+        # The checks are chained with `or` so that the `transformers` model classes are only looked up when the
+        # earlier, cheaper checks did not already match.
+        if (
+            isinstance(model_or_pipeline, str)
+            or isinstance(model_or_pipeline, transformers.PreTrainedModel)
+            or isinstance(model_or_pipeline, transformers.TFPreTrainedModel)
+        ):
+            pipe = pipeline(
+                self.task,
+                model=model_or_pipeline,
+                tokenizer=tokenizer,
+                feature_extractor=feature_extractor,
+                device=device,
+            )
+        else:
+            if model_or_pipeline is None:
+                pipe = pipeline(self.task, device=device)
+            else:
+                pipe = model_or_pipeline
+            # Neither preprocessor is used on this path, so warn as soon as either one was supplied
+            # (previously the warning required both to be set).
+            if tokenizer is not None or feature_extractor is not None:
+                logger.warning("Ignoring the value of the preprocessor argument (`tokenizer` or `feature_extractor`).")
+        # Any `translation*` pipeline task is accepted by the generic "translation" evaluator task.
+        if (pipe.task != self.task) and not (self.task == "translation" and pipe.task.startswith("translation")):
+            raise ValueError(
+                f"Incompatible `model_or_pipeline`. Please specify `model_or_pipeline` compatible with the `{self.task}` task."
+            )
+        return pipe
+    def prepare_metric(self, metric: Union[str, EvaluationModule]):
+        """
+        Resolve the metric used by the evaluator.
+        Args:
+            metric (`str` or [`EvaluationModule`], defaults to `None`):
+                A metric name, which is loaded with `load`, or an already loaded metric, which is returned
+                unchanged. When `None`, the evaluator's `default_metric_name` is loaded instead.
+        Returns:
+            The loaded metric.
+        Raises:
+            ValueError: if `metric` is `None` and the evaluator defines no default metric.
+        Example:
+        ```py
+        >>> from evaluate import evaluator
+        >>> evaluator("text-classification").prepare_metric("accuracy")
+        ```
+        """
+        if isinstance(metric, str):
+            return load(metric)
+        if metric is not None:
+            # Already a loaded metric object.
+            return metric
+        if self.default_metric_name is None:
+            raise ValueError(
+                "`Evaluator` doesn't specify a default metric. Please specify a valid `metric` argument."
+            )
+        return load(self.default_metric_name)
+    def call_pipeline(self, pipe, *args, **kwargs):
+        """
+        Invoke the pipeline and time the call.
+        `args` and `kwargs` are forwarded to `pipe` together with the evaluator's `PIPELINE_KWARGS`.
+        Returns:
+            A tuple `(pipeline output, timing dict)`; the timing dict comes from `_compute_time_perf` and uses
+            `len(pipeline output)` as the sample count.
+        """
+        started_at = perf_counter()
+        outputs = pipe(*args, **kwargs, **self.PIPELINE_KWARGS)
+        finished_at = perf_counter()
+        timings = self._compute_time_perf(started_at, finished_at, len(outputs))
+        return outputs, timings
+    def compute_metric(
+        self,
+        metric: EvaluationModule,
+        metric_inputs: Dict,
+        strategy: Literal["simple", "bootstrap"] = "simple",
+        confidence_level: float = 0.95,
+        n_resamples: int = 9999,
+        random_state: Optional[int] = None,
+    ):
+        """
+        Score the metric inputs, optionally adding bootstrap statistics.
+        `metric.compute` is called with `metric_inputs` plus the evaluator's `METRIC_KWARGS`. For any strategy
+        other than `"bootstrap"` its result is returned directly. For `"bootstrap"`, the result of
+        `_compute_confidence_interval` is returned with each key's plain score stored under `"score"`.
+        """
+        scores = metric.compute(**metric_inputs, **self.METRIC_KWARGS)
+        if strategy != "bootstrap":
+            return scores
+        metric_keys = scores.keys()
+        intervals = self._compute_confidence_interval(
+            metric,
+            metric_inputs,
+            metric_keys,
+            confidence_level,
+            n_resamples,
+            random_state,
+        )
+        for key in metric_keys:
+            intervals[key]["score"] = scores[key]
+        return intervals
--- a/evaluate-0.4.2/src/evaluate/evaluator/image_classification.py
+++ b/evaluate-0.4.2/src/evaluate/evaluator/image_classification.py
+# Copyright 2022 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from numbers import Number
+from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
+from datasets import Dataset
+from typing_extensions import Literal
+from ..module import EvaluationModule
+from ..utils.file_utils import add_end_docstrings, add_start_docstrings
+from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
+if TYPE_CHECKING:
+    from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
+TASK_DOCUMENTATION = r"""
+    Examples:
+    ```python
+    >>> from evaluate import evaluator
+    >>> from datasets import load_dataset
+    >>> task_evaluator = evaluator("image-classification")
+    >>> data = load_dataset("beans", split="test[:40]")
+    >>> results = task_evaluator.compute(
+    >>>     model_or_pipeline="nateraw/vit-base-beans",
+    >>>     data=data,
+    >>>     label_column="labels",
+    >>>     metric="accuracy",
+    >>>     label_mapping={'angular_leaf_spot': 0, 'bean_rust': 1, 'healthy': 2},
+    >>>     strategy="bootstrap"
+    >>> )
+    ```
+"""
+class ImageClassificationEvaluator(Evaluator):
+    """
+    Image classification evaluator.
+    This image classification evaluator can currently be loaded from [`evaluator`] using the default task name
+    `image-classification`.
+    Methods in this class assume a data format compatible with the [`ImageClassificationPipeline`].
+    """
+    # Class-level extra keyword arguments for the pipeline call; empty for this evaluator.
+    PIPELINE_KWARGS = {}
+    def __init__(self, task="image-classification", default_metric_name=None):
+        """Create the evaluator for `task`, optionally with a default metric name."""
+        super().__init__(task, default_metric_name=default_metric_name)
+    def predictions_processor(self, predictions, label_mapping):
+        """
+        Keep the highest-scoring label of every prediction and optionally translate it.
+        Args:
+            predictions: iterable whose items are collections of dicts with `"score"` and `"label"` entries.
+            label_mapping: optional mapping applied to each selected label; `None` leaves labels untouched.
+        Returns:
+            `{"predictions": [...]}` with one entry per input prediction.
+        """
+        top_labels = [max(candidates, key=lambda candidate: candidate["score"])["label"] for candidates in predictions]
+        if label_mapping is not None:
+            top_labels = [label_mapping[label] for label in top_labels]
+        return {"predictions": top_labels}
+    @add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
+    @add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
+    def compute(
+        self,
+        model_or_pipeline: Union[
+            str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"  # noqa: F821
+        ] = None,
+        data: Union[str, Dataset] = None,
+        subset: Optional[str] = None,
+        split: Optional[str] = None,
+        metric: Union[str, EvaluationModule] = None,
+        tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None,  # noqa: F821
+        feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None,  # noqa: F821
+        strategy: Literal["simple", "bootstrap"] = "simple",
+        confidence_level: float = 0.95,
+        n_resamples: int = 9999,
+        device: int = None,
+        random_state: Optional[int] = None,
+        input_column: str = "image",
+        label_column: str = "label",
+        label_mapping: Optional[Dict[str, Number]] = None,
+    ) -> Tuple[Dict[str, float], Any]:
+        """
+        input_column (`str`, defaults to `"image"`):
+            The name of the column containing the images as PIL ImageFile in the dataset specified by `data`.
+        label_column (`str`, defaults to `"label"`):
+            The name of the column containing the labels in the dataset specified by `data`.
+        label_mapping (`Dict[str, Number]`, *optional*, defaults to `None`):
+            We want to map class labels defined by the model in the pipeline to values consistent with those
+            defined in the `label_column` of the `data` dataset.
+        """
+        # All arguments are forwarded unchanged to the base implementation.
+        return super().compute(
+            model_or_pipeline=model_or_pipeline,
+            data=data,
+            subset=subset,
+            split=split,
+            metric=metric,
+            tokenizer=tokenizer,
+            feature_extractor=feature_extractor,
+            strategy=strategy,
+            confidence_level=confidence_level,
+            n_resamples=n_resamples,
+            device=device,
+            random_state=random_state,
+            input_column=input_column,
+            label_column=label_column,
+            label_mapping=label_mapping,
+        )
--- a/evaluate-0.4.2/src/evaluate/evaluator/question_answering.py
+++ b/evaluate-0.4.2/src/evaluate/evaluator/question_answering.py
+# Copyright 2022 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+# Lint as: python3
+from datasets import Dataset
+try:
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    TRANSFORMERS_AVAILABLE = False
+from typing_extensions import Literal
+from ..module import EvaluationModule
+from ..utils.file_utils import add_end_docstrings, add_start_docstrings
+from ..utils.logging import get_logger
+from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
+from .utils import DatasetColumn
+if TYPE_CHECKING:
+    from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
+logger = get_logger(__name__)
+TASK_DOCUMENTATION = r"""
+    Examples:
+    ```python
+    >>> from evaluate import evaluator
+    >>> from datasets import load_dataset
+    >>> task_evaluator = evaluator("question-answering")
+    >>> data = load_dataset("squad", split="validation[:2]")
+    >>> results = task_evaluator.compute(
+    >>>     model_or_pipeline="sshleifer/tiny-distilbert-base-cased-distilled-squad",
+    >>>     data=data,
+    >>>     metric="squad",
+    >>> )
+    ```
+    <Tip>
+    Datasets where the answer may be missing in the context are supported, for example SQuAD v2 dataset. In this case, it is safer to pass `squad_v2_format=True` to
+    the compute() call.
+    </Tip>
+    ```python
+    >>> from evaluate import evaluator
+    >>> from datasets import load_dataset
+    >>> task_evaluator = evaluator("question-answering")
+    >>> data = load_dataset("squad_v2", split="validation[:2]")
+    >>> results = task_evaluator.compute(
+    >>>     model_or_pipeline="mrm8488/bert-tiny-finetuned-squadv2",
+    >>>     data=data,
+    >>>     metric="squad_v2",
+    >>>     squad_v2_format=True,
+    >>> )
+    ```
+"""
+class QuestionAnsweringEvaluator(Evaluator):
+    """
+    Question answering evaluator. This evaluator handles
+    [**extractive** question answering](https://huggingface.co/docs/transformers/task_summary#extractive-question-answering),
+    where the answer to the question is extracted from a context.
+    This question answering evaluator can currently be loaded from [`evaluator`] using the default task name
+    `question-answering`.
+    Methods in this class assume a data format compatible with the
+    [`~transformers.QuestionAnsweringPipeline`].
+    """
+    PIPELINE_KWARGS = {}
+    def __init__(self, task="question-answering", default_metric_name=None):
+        """Create the evaluator for `task`, optionally with a default metric name."""
+        super().__init__(task=task, default_metric_name=default_metric_name)
+    def prepare_data(
+        self, data: Dataset, question_column: str, context_column: str, id_column: str, label_column: str
+    ):
+        """
+        Build the metric references and the pipeline inputs for question answering.
+        Args:
+            data ([`Dataset`]): dataset to evaluate on; must not be `None`.
+            question_column (`str`): column holding the questions.
+            context_column (`str`): column holding the contexts.
+            id_column (`str`): column holding the example ids.
+            label_column (`str`): column holding the reference answers.
+        Returns:
+            A tuple of `{"references": [{"id": ..., "answers": ...}, ...]}` and a dict with `"question"` and
+            `"context"` entries, each a `DatasetColumn` over `data`.
+        Raises:
+            ValueError: if `data` is `None` or one of the columns is missing.
+        """
+        if data is None:
+            raise ValueError(
+                "Please specify a valid `data` object - either a `str` with a name or a `Dataset` object."
+            )
+        required_columns = {
+            "question_column": question_column,
+            "context_column": context_column,
+            "id_column": id_column,
+            "label_column": label_column,
+        }
+        self.check_required_columns(data, required_columns)
+        references = []
+        for row in data:
+            references.append({"id": row[id_column], "answers": row[label_column]})
+        metric_inputs = {"references": references}
+        pipeline_inputs = {
+            "question": DatasetColumn(data, question_column),
+            "context": DatasetColumn(data, context_column),
+        }
+        return metric_inputs, pipeline_inputs
+    def is_squad_v2_format(self, data: Dataset, label_column: str = "answers"):
+        """
+        Report whether `data` contains at least one example with no answer text.
+        An example counts as unanswered when the `"text"` list inside its `label_column` entry
+        is empty, which is how the squad v2 schema marks questions whose answer is not in the
+        context.
+        Args:
+            data: Dataset to inspect; its `num_rows` and `filter` are used.
+            label_column: Name of the answers column, `"answers"` by default.
+        Returns:
+            `True` if filtering out unanswered examples removes at least one row, else `False`.
+        """
+        total_rows = data.num_rows
+        # `load_from_cache_file=False` is passed so the filter is recomputed on this dataset.
+        answered = data.filter(
+            lambda example: len(example[label_column]["text"]) > 0, load_from_cache_file=False
+        )
+        return total_rows > answered.num_rows
+    def predictions_processor(self, predictions: List, squad_v2_format: bool, ids: List):
+        """
+        Convert raw pipeline outputs into the `predictions` input of the metric.
+        Args:
+            predictions: Pipeline outputs; each item is indexed with `"answer"` and, when
+                `squad_v2_format` is true, with `"score"`.
+            squad_v2_format: When true, each entry also gets a `"no_answer_probability"` field
+                holding that item's `"score"` value.
+            ids: Example identifiers, looked up by the position of each prediction.
+        Returns:
+            A dict `{"predictions": [...]}` with one `{"prediction_text", "id"}` entry per
+            prediction (plus `"no_answer_probability"` in squad v2 mode).
+        """
+        formatted = []
+        for position, prediction in enumerate(predictions):
+            # `ids` is indexed rather than zipped so a too-short `ids` still raises IndexError.
+            entry = {"prediction_text": prediction["answer"], "id": ids[position]}
+            if squad_v2_format:
+                entry["no_answer_probability"] = prediction["score"]
+            formatted.append(entry)
+        return {"predictions": formatted}
+    @add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
+    @add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
+    def compute(
+        self,
+        model_or_pipeline: Union[
+            str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"  # noqa: F821
+        ] = None,
+        data: Union[str, Dataset] = None,
+        subset: Optional[str] = None,
+        split: Optional[str] = None,
+        metric: Union[str, EvaluationModule] = None,
+        tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None,  # noqa: F821
+        strategy: Literal["simple", "bootstrap"] = "simple",
+        confidence_level: float = 0.95,
+        n_resamples: int = 9999,
+        device: int = None,
+        random_state: Optional[int] = None,
+        question_column: str = "question",
+        context_column: str = "context",
+        id_column: str = "id",
+        label_column: str = "answers",
+        squad_v2_format: Optional[bool] = None,
+    ) -> Tuple[Dict[str, float], Any]:
+        """
+        question_column (`str`, defaults to `"question"`):
+            The name of the column containing the question in the dataset specified by `data`.
+        context_column (`str`, defaults to `"context"`):
+            The name of the column containing the context in the dataset specified by `data`.
+        id_column (`str`, defaults to `"id"`):
+            The name of the column containing the identification field of the question and answer pair in the
+            dataset specified by `data`.
+        label_column (`str`, defaults to `"answers"`):
+            The name of the column containing the answers in the dataset specified by `data`.
+        squad_v2_format (`bool`, *optional*, defaults to `None`):
+            Whether the dataset follows the format of squad_v2 dataset. This is the case when the provided dataset
+            has questions where the answer is not in the context, more specifically when there are answers such as
+            `{"text": [], "answer_start": []}` in the answer column. If all questions have at least one answer, this parameter
+            should be set to `False`. If this parameter is not provided, the format will be automatically inferred.
+        """
+        # NOTE(review): the docstring above deliberately has no summary line; the two decorators
+        # presumably wrap it with the shared start/return documentation - confirm in their definitions.
+        result = {}
+        self.check_for_mismatch_in_device_setup(device, model_or_pipeline)
+        data = self.load_data(data=data, subset=subset, split=split)
+        metric_inputs, pipe_inputs = self.prepare_data(
+            data=data,
+            question_column=question_column,
+            context_column=context_column,
+            id_column=id_column,
+            label_column=label_column,
+        )
+        if squad_v2_format is None:
+            # The caller made no explicit choice: infer it from the data and report what was picked.
+            squad_v2_format = self.is_squad_v2_format(data=data, label_column=label_column)
+            logger.warning(
+                f"`squad_v2_format` parameter not provided to QuestionAnsweringEvaluator.compute(). Automatically inferred `squad_v2_format` as {squad_v2_format}."
+            )
+        pipe = self.prepare_pipeline(model_or_pipeline=model_or_pipeline, tokenizer=tokenizer, device=device)
+        metric = self.prepare_metric(metric)
+        # A metric/data-format mismatch is only warned about; evaluation still proceeds.
+        if squad_v2_format and metric.name == "squad":
+            logger.warning(
+                "The dataset has SQuAD v2 format but you are using the SQuAD metric. Consider passing the 'squad_v2' metric."
+            )
+        if not squad_v2_format and metric.name == "squad_v2":
+            logger.warning(
+                "The dataset has SQuAD v1 format but you are using the SQuAD v2 metric. Consider passing the 'squad' metric."
+            )
+        # Rebind a fresh dict on the instance instead of writing into the existing one in place:
+        # `PIPELINE_KWARGS` is declared at class level, so an in-place write would change the
+        # dict shared by every evaluator instance (and by subclasses inheriting the attribute).
+        self.PIPELINE_KWARGS = {
+            **self.PIPELINE_KWARGS,
+            "handle_impossible_answer": bool(squad_v2_format),
+        }
+        # Compute predictions
+        predictions, perf_results = self.call_pipeline(pipe, **pipe_inputs)
+        predictions = self.predictions_processor(predictions, squad_v2_format=squad_v2_format, ids=data[id_column])
+        metric_inputs.update(predictions)
+        # Compute metrics from references and predictions
+        metric_results = self.compute_metric(
+            metric=metric,
+            metric_inputs=metric_inputs,
+            strategy=strategy,
+            confidence_level=confidence_level,
+            n_resamples=n_resamples,
+            random_state=random_state,
+        )
+        # Metric scores and pipeline performance figures are merged into one flat result dict.
+        result.update(metric_results)
+        result.update(perf_results)
+        return result