修改readme

25991f98 · hepj · ac192496 · 25991f98 · 25991f98 · 25991f98
Commit 25991f98 authored Jul 25, 2024 by hepj
20 changed files
--- a/evaluate-0.4.2/metrics/super_glue/record_evaluation.py
+++ b/evaluate-0.4.2/metrics/super_glue/record_evaluation.py
+"""
+Official evaluation script for ReCoRD v1.0.
+(Some functions are adopted from the SQuAD evaluation script.)
+"""
+
+
+import argparse
+import json
+import re
+import string
+import sys
+from collections import Counter
+
+
+def normalize_answer(s):
+    """Return a canonical form of an answer string for comparison.
+
+    The steps run in this order: lowercase the text, drop every character
+    found in ``string.punctuation``, replace the standalone words "a", "an"
+    and "the" with a space, and finally collapse all whitespace runs into
+    single spaces (which also strips leading/trailing whitespace).
+    """
+    punctuation = set(string.punctuation)
+    lowered = s.lower()
+    without_punct = "".join(char for char in lowered if char not in punctuation)
+    # Articles are removed after punctuation so that e.g. "the," is matched too.
+    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct)
+    return " ".join(without_articles.split())
+
+
+def f1_score(prediction, ground_truth):
+    """Compute the token-level F1 between a prediction and one ground truth.
+
+    Both strings are normalized with ``normalize_answer`` and split on
+    whitespace; the overlap is the multiset intersection of the two token
+    lists. Returns 0 when no token is shared, otherwise the harmonic mean of
+    precision and recall.
+    """
+    predicted = normalize_answer(prediction).split()
+    expected = normalize_answer(ground_truth).split()
+    overlap = sum((Counter(predicted) & Counter(expected)).values())
+    # No shared token: bail out before dividing (either list may be empty).
+    if overlap == 0:
+        return 0
+    precision = 1.0 * overlap / len(predicted)
+    recall = 1.0 * overlap / len(expected)
+    return (2 * precision * recall) / (precision + recall)
+
+
+def exact_match_score(prediction, ground_truth):
+    """Return True when both strings are equal after ``normalize_answer``."""
+    normalized_prediction = normalize_answer(prediction)
+    normalized_truth = normalize_answer(ground_truth)
+    return normalized_prediction == normalized_truth
+
+
+def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+    """Score the prediction against every ground truth and keep the best score.
+
+    ``metric_fn`` is called as ``metric_fn(prediction, ground_truth)`` once per
+    entry of ``ground_truths``. An empty ``ground_truths`` makes ``max`` raise
+    ``ValueError``.
+    """
+    return max([metric_fn(prediction, truth) for truth in ground_truths])
+
+
+def evaluate(dataset, predictions):
+    """Score predictions against a ReCoRD-style dataset.
+
+    ``dataset`` is iterated as passages, each holding a ``"qas"`` list whose
+    entries have an ``"id"`` and an ``"answers"`` list of ``{"text": ...}``
+    dicts. ``predictions`` maps question ids to predicted answer strings.
+
+    Returns a tuple ``(metrics, correct_ids)`` where ``metrics`` holds the
+    ``"exact_match"`` and ``"f1"`` averages over all questions, and
+    ``correct_ids`` lists the ids whose prediction exactly matched an answer.
+    Questions without a prediction are reported on stderr; they still count in
+    the denominator and therefore contribute a score of 0.
+    """
+    f1_sum = exact_match_sum = 0
+    question_count = 0
+    correct_ids = []
+    questions = (qa for passage in dataset for qa in passage["qas"])
+    for qa in questions:
+        question_count += 1
+        if qa["id"] not in predictions:
+            print(f'Unanswered question {qa["id"]} will receive score 0.', file=sys.stderr)
+            continue
+
+        answers = [answer["text"] for answer in qa["answers"]]
+        predicted = predictions[qa["id"]]
+
+        matched = metric_max_over_ground_truths(exact_match_score, predicted, answers)
+        if int(matched) == 1:
+            correct_ids.append(qa["id"])
+        exact_match_sum += matched
+        f1_sum += metric_max_over_ground_truths(f1_score, predicted, answers)
+
+    metrics = {
+        "exact_match": exact_match_sum / question_count,
+        "f1": f1_sum / question_count,
+    }
+    return metrics, correct_ids
+
+
+if __name__ == "__main__":
+    # Command-line entry point: load a dataset file and a prediction file,
+    # score the predictions with evaluate(), print the metrics, and optionally
+    # dump the ids of the correctly answered questions to correct_ids.json.
+    expected_version = "1.0"
+    parser = argparse.ArgumentParser("Official evaluation script for ReCoRD v1.0.")
+    parser.add_argument("data_file", help="The dataset file in JSON format.")
+    parser.add_argument("pred_file", help="The model prediction file in JSON format.")
+    parser.add_argument("--output_correct_ids", action="store_true", help="Output the correctly answered query IDs.")
+    args = parser.parse_args()
+
+    # Read JSON as UTF-8 explicitly instead of relying on the platform's default encoding.
+    with open(args.data_file, encoding="utf-8") as data_file:
+        dataset_json = json.load(data_file)
+    # A version mismatch is only reported on stderr; evaluation still proceeds.
+    if dataset_json["version"] != expected_version:
+        print(
+            f'Evaluation expects v-{expected_version}, but got dataset with v-{dataset_json["version"]}',
+            file=sys.stderr,
+        )
+    dataset = dataset_json["data"]
+
+    with open(args.pred_file, encoding="utf-8") as pred_file:
+        predictions = json.load(pred_file)
+
+    metrics, correct_ids = evaluate(dataset, predictions)
+    # Report the scores: previously they were computed and then discarded, so
+    # the script produced no result unless --output_correct_ids was passed.
+    print(json.dumps(metrics))
+
+    if args.output_correct_ids:
+        print(f"Output {len(correct_ids)} correctly answered question IDs.")
+        with open("correct_ids.json", "w", encoding="utf-8") as f:
+            json.dump(correct_ids, f)
--- a/evaluate-0.4.2/metrics/super_glue/requirements.txt
+++ b/evaluate-0.4.2/metrics/super_glue/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+scikit-learn
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/super_glue/super_glue.py
+++ b/evaluate-0.4.2/metrics/super_glue/super_glue.py
+# Copyright 2020 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The SuperGLUE benchmark metric."""
+
+import datasets
+from sklearn.metrics import f1_score, matthews_corrcoef
+
+import evaluate
+
+from .record_evaluation import evaluate as evaluate_record
+
+
+_CITATION = """\
+@article{wang2019superglue,
+  title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
+  author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},
+  journal={arXiv preprint arXiv:1905.00537},
+  year={2019}
+}
+"""
+
+_DESCRIPTION = """\
+SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after
+GLUE with a new set of more difficult language understanding tasks, improved
+resources, and a new public leaderboard.
+"""
+
+_KWARGS_DESCRIPTION = """
+Compute SuperGLUE evaluation metric associated to each SuperGLUE dataset.
+Args:
+    predictions: list of predictions to score. Depending on the SuperGLUE subset:
+        - for 'record': list of question-answer dictionaries with the following keys:
+            - 'idx': index of the question as specified by the dataset
+            - 'prediction_text': the predicted answer text
+        - for 'multirc': list of question-answer dictionaries with the following keys:
+            - 'idx': index of the question-answer pair as specified by the dataset
+            - 'prediction': the predicted answer label
+        - otherwise: list of predicted labels
+    references: list of reference labels. Depending on the SuperGLUE subset:
+        - for 'record': list of question-answers dictionaries with the following keys:
+            - 'idx': index of the question as specified by the dataset
+            - 'answers': list of possible answers
+        - otherwise: list of reference labels
+Returns: depending on the SuperGLUE subset:
+    - for 'record':
+        - 'exact_match': Exact match between answer and gold answer
+        - 'f1': F1 score
+    - for 'multirc':
+        - 'exact_match': Exact match between answer and gold answer
+        - 'f1_m': Per-question macro-F1 score
+        - 'f1_a': Average F1 score over all answers
+    - for 'axb':
+        - 'matthews_correlation': Matthews correlation coefficient
+    - for 'cb':
+        - 'accuracy': Accuracy
+        - 'f1': F1 score
+    - for all others:
+        - 'accuracy': Accuracy
+Examples:
+
+    >>> super_glue_metric = evaluate.load('super_glue', 'copa')  # any of ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]
+    >>> predictions = [0, 1]
+    >>> references = [0, 1]
+    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'accuracy': 1.0}
+
+    >>> super_glue_metric = evaluate.load('super_glue', 'cb')
+    >>> predictions = [0, 1]
+    >>> references = [0, 1]
+    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'accuracy': 1.0, 'f1': 1.0}
+
+    >>> super_glue_metric = evaluate.load('super_glue', 'record')
+    >>> predictions = [{'idx': {'passage': 0, 'query': 0}, 'prediction_text': 'answer'}]
+    >>> references = [{'idx': {'passage': 0, 'query': 0}, 'answers': ['answer', 'another_answer']}]
+    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'exact_match': 1.0, 'f1': 1.0}
+
+    >>> super_glue_metric = evaluate.load('super_glue', 'multirc')
+    >>> predictions = [{'idx': {'answer': 0, 'paragraph': 0, 'question': 0}, 'prediction': 0}, {'idx': {'answer': 1, 'paragraph': 2, 'question': 3}, 'prediction': 1}]
+    >>> references = [0, 1]
+    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'exact_match': 1.0, 'f1_m': 1.0, 'f1_a': 1.0}
+
+    >>> super_glue_metric = evaluate.load('super_glue', 'axb')
+    >>> references = [0, 1]
+    >>> predictions = [0, 1]
+    >>> results = super_glue_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'matthews_correlation': 1.0}
+"""
+
+
+def simple_accuracy(preds, labels):
+    """Return the fraction of positions where ``preds`` equals ``labels``.
+
+    The element-wise comparison result must expose ``.mean()``, so the inputs
+    are expected to be array-like objects such as NumPy arrays.
+    """
+    matches = preds == labels
+    return float(matches.mean())
+
+
+def acc_and_f1(preds, labels, f1_avg="binary"):
+    """Return accuracy and F1 for the given predictions.
+
+    ``f1_avg`` is forwarded to sklearn's ``f1_score`` as its ``average``
+    argument. The result is a dict with the keys ``"accuracy"`` and ``"f1"``.
+    """
+    return {
+        "accuracy": simple_accuracy(preds, labels),
+        "f1": float(f1_score(y_true=labels, y_pred=preds, average=f1_avg)),
+    }
+
+
+def evaluate_multirc(ids_preds, labels):
+    """
+    Computes F1 score and Exact Match for MultiRC predictions.
+
+    Each entry of ``ids_preds`` carries an ``"idx"`` dict (with ``"paragraph"``
+    and ``"question"``) and a ``"prediction"``; ``labels`` is aligned with it.
+    Answers are grouped per paragraph/question pair. Returns a dict with:
+      - "exact_match": share of questions whose answers are all predicted correctly
+      - "f1_m": mean over questions of the per-question macro F1
+      - "f1_a": F1 over all answers, using f1_score's default averaging
+    """
+    grouped = {}
+    for id_pred, label in zip(ids_preds, labels):
+        idx = id_pred["idx"]
+        key = f'{idx["paragraph"]}-{idx["question"]}'
+        grouped.setdefault(key, []).append((id_pred["prediction"], label))
+
+    per_question_f1 = []
+    per_question_em = []
+    for pairs in grouped.values():
+        question_preds, question_labels = zip(*pairs)
+        per_question_f1.append(f1_score(y_true=question_labels, y_pred=question_preds, average="macro"))
+        # A question only counts as an exact match when every answer is right.
+        per_question_em.append(int(all(pred == gold for pred, gold in pairs)))
+
+    all_predictions = [id_pred["prediction"] for id_pred in ids_preds]
+    return {
+        "exact_match": sum(per_question_em) / len(per_question_em),
+        "f1_m": float(sum(per_question_f1) / len(per_question_f1)),
+        "f1_a": float(f1_score(y_true=labels, y_pred=all_predictions)),
+    }
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class SuperGlue(evaluate.Metric):
+    # Metric whose input schema and scoring function are both selected by
+    # ``self.config_name`` (one of the SuperGLUE subset names).
+
+    def _info(self):
+        """Validate the configuration name and build the MetricInfo."""
+        supported = (
+            "boolq",
+            "cb",
+            "copa",
+            "multirc",
+            "record",
+            "rte",
+            "wic",
+            "wsc",
+            "wsc.fixed",
+            "axb",
+            "axg",
+        )
+        if self.config_name not in supported:
+            raise KeyError(
+                "You should supply a configuration name selected in "
+                '["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]'
+            )
+        # "record" and "multirc" take nested dict inputs, so no numpy format is requested for them.
+        has_structured_inputs = self.config_name in ("record", "multirc")
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(self._get_feature_types()),
+            codebase_urls=[],
+            reference_urls=[],
+            format=None if has_structured_inputs else "numpy",
+        )
+
+    def _get_feature_types(self):
+        """Return the predictions/references feature schema for the current configuration."""
+        if self.config_name == "record":
+            prediction_schema = {
+                "idx": {
+                    "passage": datasets.Value("int64"),
+                    "query": datasets.Value("int64"),
+                },
+                "prediction_text": datasets.Value("string"),
+            }
+            reference_schema = {
+                "idx": {
+                    "passage": datasets.Value("int64"),
+                    "query": datasets.Value("int64"),
+                },
+                "answers": datasets.Sequence(datasets.Value("string")),
+            }
+            return {"predictions": prediction_schema, "references": reference_schema}
+
+        if self.config_name == "multirc":
+            prediction_schema = {
+                "idx": {
+                    "answer": datasets.Value("int64"),
+                    "paragraph": datasets.Value("int64"),
+                    "question": datasets.Value("int64"),
+                },
+                "prediction": datasets.Value("int64"),
+            }
+            return {"predictions": prediction_schema, "references": datasets.Value("int64")}
+
+        # Every other configuration scores plain integer labels.
+        return {
+            "predictions": datasets.Value("int64"),
+            "references": datasets.Value("int64"),
+        }
+
+    def _compute(self, predictions, references):
+        """Dispatch to the scoring routine that matches the current configuration."""
+        config = self.config_name
+        if config == "axb":
+            return {"matthews_correlation": matthews_corrcoef(references, predictions)}
+        if config == "cb":
+            return acc_and_f1(predictions, references, f1_avg="macro")
+        if config == "record":
+            # Reshape the flat references into the passage/"qas" layout that
+            # evaluate_record iterates over; all questions go into one passage.
+            questions = [
+                {"id": ref["idx"]["query"], "answers": [{"text": ans} for ans in ref["answers"]]}
+                for ref in references
+            ]
+            answers_by_query = {pred["idx"]["query"]: pred["prediction_text"] for pred in predictions}
+            scores, _ = evaluate_record([{"qas": questions}], answers_by_query)
+            return scores
+        if config == "multirc":
+            return evaluate_multirc(predictions, references)
+        if config in ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]:
+            return {"accuracy": simple_accuracy(predictions, references)}
+        raise KeyError(
+            "You should supply a configuration name selected in "
+            '["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]'
+        )
--- a/evaluate-0.4.2/metrics/ter/README.md
+++ b/evaluate-0.4.2/metrics/ter/README.md
+---
+title: TER
+emoji: 🤗 
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a
+  hypothesis requires to match a reference translation. We use the implementation that is already present in sacrebleu
+  (https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the TERCOM implementation, which can be found
+  here: https://github.com/jhclark/tercom.
+  
+  The implementation here is slightly different from sacrebleu in terms of the required input format. The length of
+  the references and hypotheses lists need to be the same, so you may need to transpose your references compared to
+  sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
+  
+  See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information.
+---
+
+# Metric Card for TER
+
+## Metric Description
+TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a hypothesis requires to match a reference translation. We use the implementation that is already present in [sacrebleu](https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the [TERCOM implementation](https://github.com/jhclark/tercom).
+
+The implementation here is slightly different from sacrebleu in terms of the required input format. The references and hypotheses lists need to have the same length, so you may need to transpose your references compared to sacrebleu's required input format. See [this github issue](https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534).
+
+See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information.
+
+
+## How to Use
+This metric takes, at minimum, predicted sentences and reference sentences:
+```python
+>>> predictions = ["does this sentence match??",
+...                     "what about this sentence?",
+...                     "What did the TER metric user say to the developer?"]
+>>> references = [["does this sentence match", "does this sentence match!?!"],
+...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
+...             ["Your jokes are...", "...TERrible"]]
+>>> ter = evaluate.load("ter")
+>>> results = ter.compute(predictions=predictions,
+...                         references=references,
+...                         case_sensitive=True)
+>>> print(results)
+{'score': 150.0, 'num_edits': 15, 'ref_length': 10.0}
+```
+
+### Inputs
+This metric takes the following as input:
+- **`predictions`** (`list` of `str`): The system stream (a sequence of segments).
+- **`references`** (`list` of `list` of `str`): A list of one or more reference streams (each a sequence of segments).
+- **`normalized`** (`boolean`): If `True`, applies basic tokenization and normalization to sentences. Defaults to `False`.
+- **`ignore_punct`** (`boolean`): If `True`, removes punctuation from sentences before scoring. Defaults to `False`.
+- **`support_zh_ja_chars`** (`boolean`): If `True`, tokenization/normalization supports processing of Chinese characters, as well as Japanese Kanji, Hiragana, Katakana, and Phonetic Extensions of Katakana. Only applies if `normalized = True`. Defaults to `False`.
+- **`case_sensitive`** (`boolean`): If `False`, makes all predictions and references lowercase to ignore differences in case. Defaults to `False`.
+
+### Output Values
+This metric returns the following:
+- **`score`** (`float`): TER score (num_edits / sum_ref_lengths * 100)
+- **`num_edits`** (`int`): The cumulative number of edits
+- **`ref_length`** (`float`): The cumulative average reference length
+
+The output takes the following form:
+```python
+{'score': ter_score, 'num_edits': num_edits, 'ref_length': ref_length}
+```
+
+The metric can take on any value `0` and above. `0` is a perfect score, meaning the predictions exactly match the references and no edits were necessary. Higher scores are worse. Scores above 100 mean that the cumulative number of edits, `num_edits`, is higher than the cumulative length of the references, `ref_length`.
+
+#### Values from Popular Papers
+
+
+### Examples
+Basic example with only predictions and references as inputs:
+```python
+>>> predictions = ["does this sentence match??",
+...                     "what about this sentence?"]
+>>> references = [["does this sentence match", "does this sentence match!?!"],
+...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
+>>> ter = evaluate.load("ter")
+>>> results = ter.compute(predictions=predictions, 
+...                         references=references,
+...                         case_sensitive=True)
+>>> print(results)
+{'score': 62.5, 'num_edits': 5, 'ref_length': 8.0}
+```
+
+Example with `normalized = True`:
+```python
+>>> predictions = ["does this sentence match??",
+...                     "what about this sentence?"]
+>>> references = [["does this sentence match", "does this sentence match!?!"],
+...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
+>>> ter = evaluate.load("ter")
+>>> results = ter.compute(predictions=predictions, 
+...                         references=references, 
+...                         normalized=True,
+...                         case_sensitive=True)
+>>> print(results)
+{'score': 57.14285714285714, 'num_edits': 6, 'ref_length': 10.5}
+```
+
+Example ignoring punctuation and capitalization, and everything matches:
+```python
+>>> predictions = ["does this sentence match??",
+...                     "what about this sentence?"]
+>>> references = [["does this sentence match", "does this sentence match!?!"],
+...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
+>>> ter = evaluate.load("ter")
+>>> results = ter.compute(predictions=predictions, 
+...                         references=references, 
+...                         ignore_punct=True,
+...                         case_sensitive=False)
+>>> print(results)
+{'score': 0.0, 'num_edits': 0, 'ref_length': 8.0}
+```
+
+Example ignoring punctuation and capitalization, but with an extra (incorrect) sample:
+```python
+>>> predictions = ["does this sentence match??",
+...                    "what about this sentence?",
+...                    "What did the TER metric user say to the developer?"]
+>>> references = [["does this sentence match", "does this sentence match!?!"],
+...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
+...             ["Your jokes are...", "...TERrible"]]
+>>> ter = evaluate.load("ter")
+>>> results = ter.compute(predictions=predictions, 
+...                         references=references,
+...                         ignore_punct=True,
+...                         case_sensitive=False)
+>>> print(results)
+{'score': 100.0, 'num_edits': 10, 'ref_length': 10.0}
+```
+
+
+## Limitations and Bias
+
+
+## Citation
+```bibtex
+@inproceedings{snover-etal-2006-study,
+    title = "A Study of Translation Edit Rate with Targeted Human Annotation",
+    author = "Snover, Matthew  and
+      Dorr, Bonnie  and
+      Schwartz, Rich  and
+      Micciulla, Linnea  and
+      Makhoul, John",
+    booktitle = "Proceedings of the 7th Conference of the Association for Machine Translation in the Americas: Technical Papers",
+    month = aug # " 8-12",
+    year = "2006",
+    address = "Cambridge, Massachusetts, USA",
+    publisher = "Association for Machine Translation in the Americas",
+    url = "https://aclanthology.org/2006.amta-papers.25",
+    pages = "223--231",
+}
+@inproceedings{post-2018-call,
+    title = "A Call for Clarity in Reporting {BLEU} Scores",
+    author = "Post, Matt",
+    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
+    month = oct,
+    year = "2018",
+    address = "Belgium, Brussels",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/W18-6319",
+    pages = "186--191",
+}
+```
+
+## Further References
+- See [the sacreBLEU github repo](https://github.com/mjpost/sacreBLEU#ter) for more information.
--- a/evaluate-0.4.2/metrics/ter/app.py
+++ b/evaluate-0.4.2/metrics/ter/app.py
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+# Load the "ter" metric module and pass it to the Gradio widget launcher.
+module = evaluate.load("ter")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/ter/requirements.txt
+++ b/evaluate-0.4.2/metrics/ter/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+sacrebleu
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/ter/ter.py
+++ b/evaluate-0.4.2/metrics/ter/ter.py
+# Copyright 2021 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TER metric as available in sacrebleu. """
+import datasets
+import sacrebleu as scb
+from packaging import version
+from sacrebleu import TER
+
+import evaluate
+
+
+_CITATION = """\
+@inproceedings{snover-etal-2006-study,
+    title = "A Study of Translation Edit Rate with Targeted Human Annotation",
+    author = "Snover, Matthew  and
+      Dorr, Bonnie  and
+      Schwartz, Rich  and
+      Micciulla, Linnea  and
+      Makhoul, John",
+    booktitle = "Proceedings of the 7th Conference of the Association for Machine Translation in the Americas: Technical Papers",
+    month = aug # " 8-12",
+    year = "2006",
+    address = "Cambridge, Massachusetts, USA",
+    publisher = "Association for Machine Translation in the Americas",
+    url = "https://aclanthology.org/2006.amta-papers.25",
+    pages = "223--231",
+}
+@inproceedings{post-2018-call,
+    title = "A Call for Clarity in Reporting {BLEU} Scores",
+    author = "Post, Matt",
+    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
+    month = oct,
+    year = "2018",
+    address = "Belgium, Brussels",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/W18-6319",
+    pages = "186--191",
+}
+"""
+
+_DESCRIPTION = """\
+TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a
+hypothesis requires to match a reference translation. We use the implementation that is already present in sacrebleu
+(https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the TERCOM implementation, which can be found
+here: https://github.com/jhclark/tercom.
+
+The implementation here is slightly different from sacrebleu in terms of the required input format. The length of
+the references and hypotheses lists need to be the same, so you may need to transpose your references compared to
+sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
+
+See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information.
+"""
+
+_KWARGS_DESCRIPTION = """
+Produces TER scores alongside the number of edits and reference length.
+
+Args:
+    predictions (list of str): The system stream (a sequence of segments).
+    references (list of list of str): A list of one or more reference streams (each a sequence of segments).
+    normalized (boolean): If `True`, applies basic tokenization and normalization to sentences. Defaults to `False`.
+    ignore_punct (boolean): If `True`, removes punctuation from sentences before scoring. Defaults to `False`.
+    support_zh_ja_chars (boolean): If `True`, tokenization/normalization supports processing of Chinese characters,
+                                    as well as Japanese Kanji, Hiragana, Katakana, and Phonetic Extensions of Katakana.
+                                    Only applies if `normalized = True`. Defaults to `False`.
+    case_sensitive (boolean): If `False`, makes all predictions and references lowercase to ignore differences in case. Defaults to `False`.
+
+Returns:
+    'score' (float): TER score (num_edits / sum_ref_lengths * 100)
+    'num_edits' (int): The cumulative number of edits
+    'ref_length' (float): The cumulative average reference length
+
+Examples:
+    Example 1:
+        >>> predictions = ["does this sentence match??",
+        ...                     "what about this sentence?",
+        ...                     "What did the TER metric user say to the developer?"]
+        >>> references = [["does this sentence match", "does this sentence match!?!"],
+        ...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
+        ...             ["Your jokes are...", "...TERrible"]]
+        >>> ter = evaluate.load("ter")
+        >>> results = ter.compute(predictions=predictions,
+        ...                         references=references,
+        ...                         case_sensitive=True)
+        >>> print(results)
+        {'score': 150.0, 'num_edits': 15, 'ref_length': 10.0}
+
+    Example 2:
+        >>> predictions = ["does this sentence match??",
+        ...                     "what about this sentence?"]
+        >>> references = [["does this sentence match", "does this sentence match!?!"],
+        ...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
+        >>> ter = evaluate.load("ter")
+        >>> results = ter.compute(predictions=predictions,
+        ...                         references=references,
+        ...                         case_sensitive=True)
+        >>> print(results)
+        {'score': 62.5, 'num_edits': 5, 'ref_length': 8.0}
+
+    Example 3:
+        >>> predictions = ["does this sentence match??",
+        ...                     "what about this sentence?"]
+        >>> references = [["does this sentence match", "does this sentence match!?!"],
+        ...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
+        >>> ter = evaluate.load("ter")
+        >>> results = ter.compute(predictions=predictions,
+        ...                         references=references,
+        ...                         normalized=True,
+        ...                         case_sensitive=True)
+        >>> print(results)
+        {'score': 57.14285714285714, 'num_edits': 6, 'ref_length': 10.5}
+
+    Example 4:
+        >>> predictions = ["does this sentence match??",
+        ...                     "what about this sentence?"]
+        >>> references = [["does this sentence match", "does this sentence match!?!"],
+        ...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
+        >>> ter = evaluate.load("ter")
+        >>> results = ter.compute(predictions=predictions,
+        ...                         references=references,
+        ...                         ignore_punct=True,
+        ...                         case_sensitive=False)
+        >>> print(results)
+        {'score': 0.0, 'num_edits': 0, 'ref_length': 8.0}
+
+    Example 5:
+        >>> predictions = ["does this sentence match??",
+        ...                    "what about this sentence?",
+        ...                    "What did the TER metric user say to the developer?"]
+        >>> references = [["does this sentence match", "does this sentence match!?!"],
+        ...             ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
+        ...             ["Your jokes are...", "...TERrible"]]
+        >>> ter = evaluate.load("ter")
+        >>> results = ter.compute(predictions=predictions,
+        ...                         references=references,
+        ...                         ignore_punct=True,
+        ...                         case_sensitive=False)
+        >>> print(results)
+        {'score': 100.0, 'num_edits': 10, 'ref_length': 10.0}
+"""
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Ter(evaluate.Metric):
+    # Wrapper that scores predictions with sacrebleu's TER implementation.
+
+    def _info(self):
+        """Check the installed sacrebleu version and build the MetricInfo."""
+        minimum_version = version.parse("1.4.12")
+        installed_version = version.parse(scb.__version__)
+        if installed_version < minimum_version:
+            raise ImportWarning(
+                "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
+                'You can install it with `pip install "sacrebleu>=1.4.12"`.'
+            )
+
+        # Two accepted input layouts: several references per prediction, or one.
+        multi_reference_features = datasets.Features(
+            {
+                "predictions": datasets.Value("string", id="sequence"),
+                "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
+            }
+        )
+        single_reference_features = datasets.Features(
+            {
+                "predictions": datasets.Value("string", id="sequence"),
+                "references": datasets.Value("string", id="sequence"),
+            }
+        )
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            homepage="http://www.cs.umd.edu/~snover/tercom/",
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=[multi_reference_features, single_reference_features],
+            codebase_urls=["https://github.com/mjpost/sacreBLEU#ter"],
+            reference_urls=[
+                "https://github.com/jhclark/tercom",
+            ],
+        )
+
+    def _compute(
+        self,
+        predictions,
+        references,
+        normalized: bool = False,
+        ignore_punct: bool = False,
+        support_zh_ja_chars: bool = False,
+        case_sensitive: bool = False,
+    ):
+        """Score ``predictions`` against ``references`` with sacrebleu's TER.
+
+        ``references`` holds either one string or a list of strings per
+        prediction; every prediction must have the same number of references,
+        otherwise ``ValueError`` is raised. Returns a dict with the keys
+        ``"score"``, ``"num_edits"`` and ``"ref_length"``.
+        """
+        # if only one reference is provided make sure we still use list of lists
+        if isinstance(references[0], str):
+            references = [[ref] for ref in references]
+
+        expected_count = len(references[0])
+        for refs in references:
+            if len(refs) != expected_count:
+                raise ValueError("Sacrebleu requires the same number of references for each prediction")
+
+        # Transpose from one list of references per prediction to one stream
+        # per reference position, which is the layout passed to corpus_score.
+        reference_streams = [list(stream) for stream in zip(*references)]
+
+        scorer = TER(
+            normalized=normalized,
+            no_punct=ignore_punct,
+            asian_support=support_zh_ja_chars,
+            case_sensitive=case_sensitive,
+        )
+        result = scorer.corpus_score(predictions, reference_streams)
+
+        return {"score": result.score, "num_edits": result.num_edits, "ref_length": result.ref_length}
--- a/evaluate-0.4.2/metrics/trec_eval/README.md
+++ b/evaluate-0.4.2/metrics/trec_eval/README.md
+---
+title: TREC Eval
+emoji: 🤗 
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  The TREC Eval metric combines a number of information retrieval metrics such as precision and nDCG. It is used to score rankings of retrieved documents with reference values.
+---
+
+# Metric Card for TREC Eval
+
+## Metric Description
+
+The TREC Eval metric combines a number of information retrieval metrics such as precision and normalized Discounted Cumulative Gain (nDCG). It is used to score rankings of retrieved documents with reference values.
+
+## How to Use
+```Python
+from evaluate import load
+trec_eval = load("trec_eval")
+results = trec_eval.compute(predictions=[run], references=[qrel])
+```
+
+### Inputs
+- **predictions** *(dict): a single retrieval run.*
+    - **query** *(int): Query ID.*
+    - **q0** *(str): Literal `"q0"`.*
+    - **docid** *(str): Document ID.*
+    - **rank** *(int): Rank of document.*
+    - **score** *(float): Score of document.*
+    - **system** *(str): Tag for current run.*
+- **references** *(dict): a single qrel.*
+    - **query** *(int): Query ID.*
+    - **q0** *(str): Literal `"q0"`.*
+    - **docid** *(str): Document ID.*
+    - **rel** *(int): Relevance of document.*
+
+### Output Values
+- **runid** *(str): Run name.*  
+- **num_ret** *(int): Number of retrieved documents.*  
+- **num_rel** *(int): Number of relevant documents.*  
+- **num_rel_ret** *(int): Number of retrieved relevant documents.*  
+- **num_q** *(int): Number of queries.*  
+- **map** *(float): Mean average precision.*
+- **gm_map** *(float): geometric mean average precision.*  
+- **bpref** *(float): binary preference score.*  
+- **Rprec** *(float): precision@R, where R is number of relevant documents.*  
+- **recip_rank** *(float): reciprocal rank*  
+- **P@k** *(float): precision@k (k in [5, 10, 15, 20, 30, 100, 200, 500, 1000]).*  
+- **NDCG@k** *(float): nDCG@k (k in [5, 10, 15, 20, 30, 100, 200, 500, 1000]).*  
+
+### Examples
+
+A minimal example looks as follows:
+```Python
+qrel = {
+    "query": [0],
+    "q0": ["q0"],
+    "docid": ["doc_1"],
+    "rel": [2]
+}
+run = {
+    "query": [0, 0],
+    "q0": ["q0", "q0"],
+    "docid": ["doc_2", "doc_1"],
+    "rank": [0, 1],
+    "score": [1.5, 1.2],
+    "system": ["test", "test"]
+}
+
+trec_eval = evaluate.load("trec_eval")
+results = trec_eval.compute(references=[qrel], predictions=[run])
+results["P@5"]
+0.2
+```
+
+A more realistic use case with an example from [`trectools`](https://github.com/joaopalotti/trectools):
+
+```python
+qrel = pd.read_csv("robust03_qrels.txt", sep=r"\s+", names=["query", "q0", "docid", "rel"])
+qrel["q0"] = qrel["q0"].astype(str)
+qrel = qrel.to_dict(orient="list")
+
+run = pd.read_csv("input.InexpC2", sep=r"\s+", names=["query", "q0", "docid", "rank", "score", "system"])
+run = run.to_dict(orient="list")
+
+trec_eval = evaluate.load("trec_eval")
+result = trec_eval.compute(predictions=[run], references=[qrel])
+```
+
+```python
+result
+
+{'runid': 'InexpC2',
+ 'num_ret': 100000,
+ 'num_rel': 6074,
+ 'num_rel_ret': 3198,
+ 'num_q': 100,
+ 'map': 0.22485930431817494,
+ 'gm_map': 0.10411523825735523,
+ 'bpref': 0.217511695914079,
+ 'Rprec': 0.2502547201167236,
+ 'recip_rank': 0.6646545943335417,
+ 'P@5': 0.44,
+ 'P@10': 0.37,
+ 'P@15': 0.34600000000000003,
+ 'P@20': 0.30999999999999994,
+ 'P@30': 0.2563333333333333,
+ 'P@100': 0.1428,
+ 'P@200': 0.09510000000000002,
+ 'P@500': 0.05242,
+ 'P@1000': 0.03198,
+ 'NDCG@5': 0.4101480395089769,
+ 'NDCG@10': 0.3806761417784469,
+ 'NDCG@15': 0.37819463408955706,
+ 'NDCG@20': 0.3686080836061317,
+ 'NDCG@30': 0.352474353427451,
+ 'NDCG@100': 0.3778329431025776,
+ 'NDCG@200': 0.4119129817248979,
+ 'NDCG@500': 0.4585354576461375,
+ 'NDCG@1000': 0.49092149290805653}
+```
+
+## Limitations and Bias
+The `trec_eval` metric requires the inputs to be in the TREC run and qrel formats for predictions and references.
+
+
+## Citation
+
+```bibtex
+@inproceedings{palotti2019,
+ author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
+ title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
+ series = {SIGIR'19},
+ year = {2019},
+ location = {Paris, France},
+ publisher = {ACM}
+} 
+```
+
+## Further References
+
+- Homepage: https://github.com/joaopalotti/trectools
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/trec_eval/app.py
+++ b/evaluate-0.4.2/metrics/trec_eval/app.py
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+module = evaluate.load("trec_eval")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/trec_eval/requirements.txt
+++ b/evaluate-0.4.2/metrics/trec_eval/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+trectools
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/trec_eval/trec_eval.py
+++ b/evaluate-0.4.2/metrics/trec_eval/trec_eval.py
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Module to compute TREC evaluation scores."""
+
+import datasets
+import pandas as pd
+from trectools import TrecEval, TrecQrel, TrecRun
+
+import evaluate
+
+
+_CITATION = """\
+@inproceedings{palotti2019,
+ author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
+ title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
+ series = {SIGIR'19},
+ year = {2019},
+ location = {Paris, France},
+ publisher = {ACM}
+}
+"""
+
+# Short description of the module.
+_DESCRIPTION = """\
+The TREC Eval metric combines a number of information retrieval metrics such as \
+precision and nDCG. It is used to score rankings of retrieved documents with reference values."""
+
+
+# Description of the module's arguments and return value, with a usage example.
+_KWARGS_DESCRIPTION = """
+Calculates TREC evaluation scores based on a run and qrel.
+Args:
+    predictions: list containing a single run.
+    references: list containing a single qrel.
+Returns:
+    dict: TREC evaluation scores.
+Examples:
+    >>> trec = evaluate.load("trec_eval")
+    >>> qrel = {
+    ...     "query": [0],
+    ...     "q0": ["q0"],
+    ...     "docid": ["doc_1"],
+    ...     "rel": [2]
+    ... }
+    >>> run = {
+    ...     "query": [0, 0],
+    ...     "q0": ["q0", "q0"],
+    ...     "docid": ["doc_2", "doc_1"],
+    ...     "rank": [0, 1],
+    ...     "score": [1.5, 1.2],
+    ...     "system": ["test", "test"]
+    ... }
+    >>> results = trec.compute(references=[qrel], predictions=[run])
+    >>> print(results["P@5"])
+    0.2
+"""
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class TRECEval(evaluate.Metric):
+    """Compute TREC evaluation scores."""
+
+    def _info(self):
+        """Describe the metric; both inputs are dicts of equally named columns."""
+        # Columns of a retrieval run.
+        run_columns = {
+            "query": datasets.Sequence(datasets.Value("int64")),
+            "q0": datasets.Sequence(datasets.Value("string")),
+            "docid": datasets.Sequence(datasets.Value("string")),
+            "rank": datasets.Sequence(datasets.Value("int64")),
+            "score": datasets.Sequence(datasets.Value("float")),
+            "system": datasets.Sequence(datasets.Value("string")),
+        }
+        # Columns of a qrel (relevance judgements).
+        qrel_columns = {
+            "query": datasets.Sequence(datasets.Value("int64")),
+            "q0": datasets.Sequence(datasets.Value("string")),
+            "docid": datasets.Sequence(datasets.Value("string")),
+            "rel": datasets.Sequence(datasets.Value("int64")),
+        }
+        return evaluate.MetricInfo(
+            module_type="metric",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features({"predictions": run_columns, "references": qrel_columns}),
+            homepage="https://github.com/joaopalotti/trectools",
+        )
+
+    def _compute(self, references, predictions):
+        """Return the TREC evaluation scores for one run against one qrel.
+
+        Args:
+            references: list holding a single qrel (dict of columns).
+            predictions: list holding a single run (dict of columns).
+
+        Returns:
+            A dict with run statistics ("runid", "num_ret", "num_rel",
+            "num_rel_ret", "num_q"), aggregate measures ("map", "gm_map",
+            "bpref", "Rprec", "recip_rank") and "P@k" / "NDCG@k" entries.
+
+        Raises:
+            ValueError: if more than one prediction or reference is passed.
+        """
+        run_count, qrel_count = len(predictions), len(references)
+        if run_count > 1 or qrel_count > 1:
+            raise ValueError(
+                f"You can only pass one prediction and reference per evaluation. You passed {run_count} prediction(s) and {qrel_count} reference(s)."
+            )
+
+        # The trectools containers are created empty and filled in by hand
+        # from in-memory data frames; the filename is only a placeholder.
+        run = TrecRun()
+        run.filename = "placeholder.file"
+        run.run_data = pd.DataFrame(predictions[0])
+
+        qrel = TrecQrel()
+        qrel.filename = "placeholder.file"
+        qrel.qrels_data = pd.DataFrame(references[0])
+
+        evaluator = TrecEval(run, qrel)
+
+        # NOTE(review): MAP and GM-MAP use depth=10000 while the other
+        # measures use depth=1000 - confirm this difference is intentional.
+        result = {
+            "runid": evaluator.run.get_runid(),
+            "num_ret": evaluator.get_retrieved_documents(per_query=False),
+            "num_rel": evaluator.get_relevant_documents(per_query=False),
+            "num_rel_ret": evaluator.get_relevant_retrieved_documents(per_query=False),
+            "num_q": len(evaluator.run.topics()),
+            "map": evaluator.get_map(depth=10000, per_query=False, trec_eval=True),
+            "gm_map": evaluator.get_geometric_map(depth=10000, trec_eval=True),
+            "bpref": evaluator.get_bpref(depth=1000, per_query=False, trec_eval=True),
+            "Rprec": evaluator.get_rprec(depth=1000, per_query=False, trec_eval=True),
+            "recip_rank": evaluator.get_reciprocal_rank(depth=1000, per_query=False, trec_eval=True),
+        }
+
+        # All precision entries are added before any nDCG entry, so the key
+        # order of the returned dict is P@5..P@1000 followed by NDCG@5..NDCG@1000.
+        cutoffs = (5, 10, 15, 20, 30, 100, 200, 500, 1000)
+        result.update(
+            {f"P@{k}": evaluator.get_precision(depth=k, per_query=False, trec_eval=True) for k in cutoffs}
+        )
+        result.update(
+            {f"NDCG@{k}": evaluator.get_ndcg(depth=k, per_query=False, trec_eval=True) for k in cutoffs}
+        )
+
+        return result
--- a/evaluate-0.4.2/metrics/wer/README.md
+++ b/evaluate-0.4.2/metrics/wer/README.md
+---
+title: WER
+emoji: 🤗 
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  Word error rate (WER) is a common metric of the performance of an automatic speech recognition system.
+  
+  The general difficulty of measuring performance lies in the fact that the recognized word sequence can have a different length from the reference word sequence (supposedly the correct one). The WER is derived from the Levenshtein distance, working at the word level instead of the phoneme level. The WER is a valuable tool for comparing different systems as well as for evaluating improvements within one system. This kind of measurement, however, provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.
+  
+  This problem is solved by first aligning the recognized word sequence with the reference (spoken) word sequence using dynamic string alignment. Examination of this issue is seen through a theory called the power law that states the correlation between perplexity and word error rate.
+  
+  Word error rate can then be computed as:
+  
+  WER = (S + D + I) / N = (S + D + I) / (S + D + C)
+  
+  where
+  
+  S is the number of substitutions,
+  D is the number of deletions,
+  I is the number of insertions,
+  C is the number of correct words,
+  N is the number of words in the reference (N=S+D+C).
+  
+  This value indicates the average number of errors per reference word. The lower the value, the better the
+  performance of the ASR system with a WER of 0 being a perfect score.
+---
+
+# Metric Card for WER
+
+## Metric description
+Word error rate (WER) is a common metric of the performance of an automatic speech recognition (ASR) system. 
+
+The general difficulty of measuring the performance of ASR systems lies in the fact that the recognized word sequence can have a different length from the reference word sequence (supposedly the correct one). The WER is derived from the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance), working at the word level.
+
+This problem is solved by first aligning the recognized word sequence with the reference (spoken) word sequence using dynamic string alignment. Examination of this issue is seen through a theory called the power law that states the correlation between [perplexity](https://huggingface.co/metrics/perplexity) and word error rate (see [this article](https://www.cs.cmu.edu/~roni/papers/eval-metrics-bntuw-9802.pdf) for further information).
+
+Word error rate can then be computed as:
+
+`WER = (S + D + I) / N = (S + D + I) / (S + D + C)`
+
+where
+
+`S` is the number of substitutions,
+
+`D` is the number of deletions,
+
+`I` is the number of insertions,
+
+`C` is the number of correct words,
+
+`N` is the number of words in the reference (`N=S+D+C`).
+
+
+## How to use 
+
+The metric takes two inputs: references (a list of references for each speech input) and predictions (a list of transcriptions to score).
+
+
+```python
+from evaluate import load
+wer = load("wer")
+wer_score = wer.compute(predictions=predictions, references=references)
+```
+## Output values
+
+This metric outputs a float representing the word error rate.
+
+```
+print(wer_score)
+0.5
+```
+
+This value indicates the average number of errors per reference word. 
+
+The **lower** the value, the **better** the performance of the ASR system, with a WER of 0 being a perfect score.
+
+### Values from popular papers
+
+This metric is highly dependent on the content and quality of the dataset, and therefore users can expect very different values for the same model but on different datasets.
+
+For example, datasets such as [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) report a WER in the 1.8-3.3 range, whereas ASR models evaluated on [Timit](https://huggingface.co/datasets/timit_asr) report a WER in the 8.3-20.4 range. 
+See the leaderboards for [LibriSpeech](https://paperswithcode.com/sota/speech-recognition-on-librispeech-test-clean) and [Timit](https://paperswithcode.com/sota/speech-recognition-on-timit) for the most recent values.
+
+## Examples 
+
+Perfect match between prediction and reference:
+
+```python
+from evaluate import load
+wer = load("wer")
+predictions = ["hello world", "good night moon"]
+references = ["hello world", "good night moon"]
+wer_score = wer.compute(predictions=predictions, references=references)
+print(wer_score)
+0.0
+```
+
+Partial match between prediction and reference:
+
+```python
+from evaluate import load
+wer = load("wer")
+predictions = ["this is the prediction", "there is an other sample"]
+references = ["this is the reference", "there is another one"]
+wer_score = wer.compute(predictions=predictions, references=references)
+print(wer_score)
+0.5
+```
+
+No match between prediction and reference:
+
+```python
+from evaluate import load
+wer = load("wer")
+predictions = ["hello world", "good night moon"]
+references = ["hi everyone", "have a great day"]
+wer_score = wer.compute(predictions=predictions, references=references)
+print(wer_score)
+1.0
+```
+
+## Limitations and bias
+
+WER is a valuable tool for comparing different systems as well as for evaluating improvements within one system. This kind of measurement, however, provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort. 
+
+## Citation
+
+```bibtex
+@inproceedings{woodard1982,
+author = {Woodard, J.P. and Nelson, J.T.},
+year = {1982},
+journal = {Workshop on standardisation for speech I/O technology, Naval Air Development Center, Warminster, PA},
+title = {An information theoretic measure of speech recognition performance}
+}
+```
+
+```bibtex
+@inproceedings{morris2004,
+author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
+year = {2004},
+month = {01},
+pages = {},
+title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
+}
+```
+
+## Further References 
+
+- [Word Error Rate -- Wikipedia](https://en.wikipedia.org/wiki/Word_error_rate)
+- [Hugging Face Tasks -- Automatic Speech Recognition](https://huggingface.co/tasks/automatic-speech-recognition)
--- a/evaluate-0.4.2/metrics/wer/app.py
+++ b/evaluate-0.4.2/metrics/wer/app.py
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+module = evaluate.load("wer")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/wer/requirements.txt
+++ b/evaluate-0.4.2/metrics/wer/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+jiwer
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/wer/wer.py
+++ b/evaluate-0.4.2/metrics/wer/wer.py
+# Copyright 2021 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Word Error Rate (WER) metric."""
+
+import datasets
+from jiwer import compute_measures
+
+import evaluate
+
+
+_CITATION = """\
+@inproceedings{inproceedings,
+    author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
+    year = {2004},
+    month = {01},
+    pages = {},
+    title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
+}
+"""
+
+_DESCRIPTION = """\
+Word error rate (WER) is a common metric of the performance of an automatic speech recognition system.
+
+The general difficulty of measuring performance lies in the fact that the recognized word sequence can have a different length from the reference word sequence (supposedly the correct one). The WER is derived from the Levenshtein distance, working at the word level instead of the phoneme level. The WER is a valuable tool for comparing different systems as well as for evaluating improvements within one system. This kind of measurement, however, provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.
+
+This problem is solved by first aligning the recognized word sequence with the reference (spoken) word sequence using dynamic string alignment. Examination of this issue is seen through a theory called the power law that states the correlation between perplexity and word error rate.
+
+Word error rate can then be computed as:
+
+WER = (S + D + I) / N = (S + D + I) / (S + D + C)
+
+where
+
+S is the number of substitutions,
+D is the number of deletions,
+I is the number of insertions,
+C is the number of correct words,
+N is the number of words in the reference (N=S+D+C).
+
+This value indicates the average number of errors per reference word. The lower the value, the better the
+performance of the ASR system with a WER of 0 being a perfect score.
+"""
+
+_KWARGS_DESCRIPTION = """
+Compute WER score of transcribed segments against references.
+
+Args:
+    references: List of references for each speech input.
+    predictions: List of transcriptions to score.
+    concatenate_texts (bool, default=False): Whether to concatenate all input texts or compute WER iteratively.
+
+Returns:
+    (float): the word error rate
+
+Examples:
+
+    >>> predictions = ["this is the prediction", "there is an other sample"]
+    >>> references = ["this is the reference", "there is another one"]
+    >>> wer = evaluate.load("wer")
+    >>> wer_score = wer.compute(predictions=predictions, references=references)
+    >>> print(wer_score)
+    0.5
+"""
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class WER(evaluate.Metric):
+    # Word error rate metric; the alignment counts come from `compute_measures`.
+
+    def _info(self):
+        """Describe the metric: one prediction string per reference string."""
+        string_pair_schema = datasets.Features(
+            {
+                "predictions": datasets.Value("string", id="sequence"),
+                "references": datasets.Value("string", id="sequence"),
+            }
+        )
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=string_pair_schema,
+            codebase_urls=["https://github.com/jitsi/jiwer/"],
+            reference_urls=[
+                "https://en.wikipedia.org/wiki/Word_error_rate",
+            ],
+        )
+
+    def _compute(self, predictions=None, references=None, concatenate_texts=False):
+        """Return the word error rate of `predictions` against `references`.
+
+        Args:
+            predictions: list of transcriptions to score.
+            references: list of reference texts, aligned with `predictions`.
+            concatenate_texts: when true, hand both lists to
+                `compute_measures` in a single call and return its "wer"
+                entry; otherwise accumulate the counts pair by pair.
+
+        Returns:
+            The word error rate.
+        """
+        if concatenate_texts:
+            return compute_measures(references, predictions)["wer"]
+
+        error_count = 0
+        reference_word_count = 0
+        for reference, prediction in zip(references, predictions):
+            counts = compute_measures(reference, prediction)
+            # Substitutions and deletions count both as errors and as
+            # reference words, so they appear in both running sums.
+            substituted_or_deleted = counts["substitutions"] + counts["deletions"]
+            error_count += substituted_or_deleted + counts["insertions"]
+            reference_word_count += substituted_or_deleted + counts["hits"]
+        return error_count / reference_word_count
--- a/evaluate-0.4.2/metrics/wiki_split/README.md
+++ b/evaluate-0.4.2/metrics/wiki_split/README.md
+---
+title: WikiSplit
+emoji: 🤗 
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  WIKI_SPLIT is the combination of three metrics: SARI, EXACT and SACREBLEU.
+  It can be used to evaluate the quality of machine-generated texts.
+---
+
+# Metric Card for WikiSplit
+
+## Metric description
+
+WikiSplit is the combination of three metrics: [SARI](https://huggingface.co/metrics/sari), [exact match](https://huggingface.co/metrics/exact_match) and [SacreBLEU](https://huggingface.co/metrics/sacrebleu). 
+
+It can be used to evaluate the quality of sentence splitting approaches, which require rewriting a long sentence into two or more coherent short sentences, e.g. based on the [WikiSplit dataset](https://huggingface.co/datasets/wiki_split).
+
+## How to use 
+
+The WIKI_SPLIT metric takes three inputs:
+
+`sources`: a list of source sentences, where each sentence should be a string.
+
+`predictions`: a list of predicted sentences, where each sentence should be a string.
+
+`references`: a list of lists of reference sentences, where each sentence should be a string.
+
+```python
+>>> wiki_split = evaluate.load("wiki_split")
+>>> sources = ["About 95 species are currently accepted ."]
+>>> predictions = ["About 95 you now get in ."]
+>>> references= [["About 95 species are currently known ."]]
+>>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
+```
+## Output values
+
+This metric outputs a dictionary containing three scores:
+
+`sari`: the [SARI](https://huggingface.co/metrics/sari) score, whose range is between `0.0` and `100.0` -- the higher the value, the better the performance of the model being evaluated, with a SARI of 100 being a perfect score.
+
+`sacrebleu`: the [SacreBLEU](https://huggingface.co/metrics/sacrebleu) score, which can take any value between `0.0` and `100.0`, inclusive.
+
+`exact`: the [exact match](https://huggingface.co/metrics/exact_match) score, which represents the sum of all of the individual exact match scores in the set, divided by the total number of predictions in the set. It ranges from `0.0` to `100.0`, inclusive. Here, `0.0` means no prediction/reference pairs were matches, while `100.0` means they all were.
+
+```python
+>>> print(results)
+{'sari': 21.805555555555557, 'sacrebleu': 14.535768424205482, 'exact': 0.0}
+```
+
+### Values from popular papers
+
+This metric was initially used by [Rothe et al.(2020)](https://arxiv.org/pdf/1907.12461.pdf) to evaluate the performance of different split-and-rephrase approaches on the [WikiSplit dataset](https://huggingface.co/datasets/wiki_split). They reported a SARI score of 63.5, a SacreBLEU score of 77.2, and an EXACT_MATCH score of 16.3.
+
+## Examples 
+
+Perfect match between prediction and reference:
+
+```python
+>>> wiki_split = evaluate.load("wiki_split")
+>>> sources = ["About 95 species are currently accepted ."]
+>>> predictions = ["About 95 species are currently accepted ."]
+>>> references= [["About 95 species are currently accepted ."]]
+>>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
+>>> print(results)
+{'sari': 100.0, 'sacrebleu': 100.00000000000004, 'exact': 100.0}
+```
+
+Partial match between prediction and reference:
+
+```python
+>>> wiki_split = evaluate.load("wiki_split")
+>>> sources = ["About 95 species are currently accepted ."]
+>>> predictions = ["About 95 you now get in ."]
+>>> references= [["About 95 species are currently known ."]]
+>>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
+>>> print(results)
+{'sari': 21.805555555555557, 'sacrebleu': 14.535768424205482, 'exact': 0.0}
+```
+
+No match between prediction and reference:
+
+```python
+>>> wiki_split = evaluate.load("wiki_split")
+>>> sources = ["About 95 species are currently accepted ."]
+>>> predictions = ["Hello world ."]
+>>> references= [["About 95 species are currently known ."]]
+>>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
+>>> print(results)
+{'sari': 14.047619047619046, 'sacrebleu': 0.0, 'exact': 0.0}
+```
+## Limitations and bias
+
+This metric is not the official metric to evaluate models on the [WikiSplit dataset](https://huggingface.co/datasets/wiki_split). It was initially proposed by [Rothe et al.(2020)](https://arxiv.org/pdf/1907.12461.pdf), whereas the [original paper introducing the WikiSplit dataset (2018)](https://aclanthology.org/D18-1080.pdf) uses different metrics to evaluate performance, such as corpus-level [BLEU](https://huggingface.co/metrics/bleu) and sentence-level BLEU. 
+
+## Citation
+
+```bibtex
+@article{rothe2020leveraging,
+  title={Leveraging pre-trained checkpoints for sequence generation tasks},
+  author={Rothe, Sascha and Narayan, Shashi and Severyn, Aliaksei},
+  journal={Transactions of the Association for Computational Linguistics},
+  volume={8},
+  pages={264--280},
+  year={2020},
+  publisher={MIT Press}
+}
+```
+
+## Further References 
+
+- [WikiSplit dataset](https://huggingface.co/datasets/wiki_split)
+- [WikiSplit paper (Botha et al., 2018)](https://aclanthology.org/D18-1080.pdf) 
--- a/evaluate-0.4.2/metrics/wiki_split/app.py
+++ b/evaluate-0.4.2/metrics/wiki_split/app.py
+import evaluate
+from evaluate.utils import launch_gradio_widget
+
+
+module = evaluate.load("wiki_split")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/wiki_split/requirements.txt
+++ b/evaluate-0.4.2/metrics/wiki_split/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+sacrebleu
+sacremoses
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/wiki_split/wiki_split.py
+++ b/evaluate-0.4.2/metrics/wiki_split/wiki_split.py
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" WIKI_SPLIT metric."""
+
+import re
+import string
+from collections import Counter
+
+import datasets
+import sacrebleu
+import sacremoses
+from packaging import version
+
+import evaluate
+
+
+_CITATION = """
+@inproceedings{xu-etal-2016-optimizing,
+    title = {Optimizing Statistical Machine Translation for Text Simplification},
+    author = {Xu, Wei and Napoles, Courtney and Pavlick, Ellie and Chen, Quanze and Callison-Burch, Chris},
+    journal = {Transactions of the Association for Computational Linguistics},
+    volume = {4},
+    year={2016},
+    url = {https://www.aclweb.org/anthology/Q16-1029},
+    pages = {401--415}
+}
+@inproceedings{post-2018-call,
+    title = "A Call for Clarity in Reporting {BLEU} Scores",
+    author = "Post, Matt",
+    booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
+    month = oct,
+    year = "2018",
+    address = "Belgium, Brussels",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/W18-6319",
+    pages = "186--191",
+}
+"""
+
+# One-paragraph summary of the metric. It is handed to `add_start_docstrings`
+# on the WikiSplit class and used as `MetricInfo.description` in `_info`.
+_DESCRIPTION = """\
+WIKI_SPLIT is the combination of three metrics SARI, EXACT and SACREBLEU
+It can be used to evaluate the quality of machine-generated texts.
+"""
+
+
+# Usage documentation (arguments, returned keys and a doctest-style example).
+# It is handed to `add_start_docstrings` on the WikiSplit class and used as
+# `MetricInfo.inputs_description` in `_info`. The text is a runtime string, so
+# the example output must stay in sync with what `_compute` returns.
+_KWARGS_DESCRIPTION = """
+Calculates sari score (between 0 and 100) given a list of source and predicted
+sentences, and a list of lists of reference sentences. It also computes the BLEU score as well as the exact match score.
+Args:
+    sources: list of source sentences where each sentence should be a string.
+    predictions: list of predicted sentences where each sentence should be a string.
+    references: list of lists of reference sentences where each sentence should be a string.
+Returns:
+    sari: sari score
+    sacrebleu: sacrebleu score
+    exact: exact score
+
+Examples:
+    >>> sources=["About 95 species are currently accepted ."]
+    >>> predictions=["About 95 you now get in ."]
+    >>> references=[["About 95 species are currently known ."]]
+    >>> wiki_split = evaluate.load("wiki_split")
+    >>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
+    >>> print(results)
+    {'sari': 21.805555555555557, 'sacrebleu': 14.535768424205482, 'exact': 0.0}
+"""
+
+
+def normalize_answer(s):
+    """Lower text and remove punctuation, articles and extra whitespace."""
+
+    def remove_articles(text):
+        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
+        return re.sub(regex, " ", text)
+
+    def white_space_fix(text):
+        return " ".join(text.split())
+
+    def remove_punc(text):
+        exclude = set(string.punctuation)
+        return "".join(ch for ch in text if ch not in exclude)
+
+    def lower(text):
+        return text.lower()
+
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def compute_exact(a_gold, a_pred):
+    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
+
+
+def compute_em(predictions, references):
+    scores = [any([compute_exact(ref, pred) for ref in refs]) for pred, refs in zip(predictions, references)]
+    return (sum(scores) / len(scores)) * 100
+
+
+def SARIngram(sgrams, cgrams, rgramslist, numref):
+    rgramsall = [rgram for rgrams in rgramslist for rgram in rgrams]
+    rgramcounter = Counter(rgramsall)
+
+    sgramcounter = Counter(sgrams)
+    sgramcounter_rep = Counter()
+    for sgram, scount in sgramcounter.items():
+        sgramcounter_rep[sgram] = scount * numref
+
+    cgramcounter = Counter(cgrams)
+    cgramcounter_rep = Counter()
+    for cgram, ccount in cgramcounter.items():
+        cgramcounter_rep[cgram] = ccount * numref
+
+    # KEEP
+    keepgramcounter_rep = sgramcounter_rep & cgramcounter_rep
+    keepgramcountergood_rep = keepgramcounter_rep & rgramcounter
+    keepgramcounterall_rep = sgramcounter_rep & rgramcounter
+
+    keeptmpscore1 = 0
+    keeptmpscore2 = 0
+    for keepgram in keepgramcountergood_rep:
+        keeptmpscore1 += keepgramcountergood_rep[keepgram] / keepgramcounter_rep[keepgram]
+        # Fix an alleged bug [2] in the keep score computation.
+        # keeptmpscore2 += keepgramcountergood_rep[keepgram] / keepgramcounterall_rep[keepgram]
+        keeptmpscore2 += keepgramcountergood_rep[keepgram]
+    # Define 0/0=1 instead of 0 to give higher scores for predictions that match
+    #      a target exactly.
+    keepscore_precision = 1
+    keepscore_recall = 1
+    if len(keepgramcounter_rep) > 0:
+        keepscore_precision = keeptmpscore1 / len(keepgramcounter_rep)
+    if len(keepgramcounterall_rep) > 0:
+        # Fix an alleged bug [2] in the keep score computation.
+        # keepscore_recall = keeptmpscore2 / len(keepgramcounterall_rep)
+        keepscore_recall = keeptmpscore2 / sum(keepgramcounterall_rep.values())
+    keepscore = 0
+    if keepscore_precision > 0 or keepscore_recall > 0:
+        keepscore = 2 * keepscore_precision * keepscore_recall / (keepscore_precision + keepscore_recall)
+
+    # DELETION
+    delgramcounter_rep = sgramcounter_rep - cgramcounter_rep
+    delgramcountergood_rep = delgramcounter_rep - rgramcounter
+    delgramcounterall_rep = sgramcounter_rep - rgramcounter
+    deltmpscore1 = 0
+    deltmpscore2 = 0
+    for delgram in delgramcountergood_rep:
+        deltmpscore1 += delgramcountergood_rep[delgram] / delgramcounter_rep[delgram]
+        deltmpscore2 += delgramcountergood_rep[delgram] / delgramcounterall_rep[delgram]
+    # Define 0/0=1 instead of 0 to give higher scores for predictions that match
+    # a target exactly.
+    delscore_precision = 1
+    if len(delgramcounter_rep) > 0:
+        delscore_precision = deltmpscore1 / len(delgramcounter_rep)
+
+    # ADDITION
+    addgramcounter = set(cgramcounter) - set(sgramcounter)
+    addgramcountergood = set(addgramcounter) & set(rgramcounter)
+    addgramcounterall = set(rgramcounter) - set(sgramcounter)
+
+    addtmpscore = 0
+    for addgram in addgramcountergood:
+        addtmpscore += 1
+
+    # Define 0/0=1 instead of 0 to give higher scores for predictions that match
+    # a target exactly.
+    addscore_precision = 1
+    addscore_recall = 1
+    if len(addgramcounter) > 0:
+        addscore_precision = addtmpscore / len(addgramcounter)
+    if len(addgramcounterall) > 0:
+        addscore_recall = addtmpscore / len(addgramcounterall)
+    addscore = 0
+    if addscore_precision > 0 or addscore_recall > 0:
+        addscore = 2 * addscore_precision * addscore_recall / (addscore_precision + addscore_recall)
+
+    return (keepscore, delscore_precision, addscore)
+
+
+def SARIsent(ssent, csent, rsents):
+    numref = len(rsents)
+
+    s1grams = ssent.split(" ")
+    c1grams = csent.split(" ")
+    s2grams = []
+    c2grams = []
+    s3grams = []
+    c3grams = []
+    s4grams = []
+    c4grams = []
+
+    r1gramslist = []
+    r2gramslist = []
+    r3gramslist = []
+    r4gramslist = []
+    for rsent in rsents:
+        r1grams = rsent.split(" ")
+        r2grams = []
+        r3grams = []
+        r4grams = []
+        r1gramslist.append(r1grams)
+        for i in range(0, len(r1grams) - 1):
+            if i < len(r1grams) - 1:
+                r2gram = r1grams[i] + " " + r1grams[i + 1]
+                r2grams.append(r2gram)
+            if i < len(r1grams) - 2:
+                r3gram = r1grams[i] + " " + r1grams[i + 1] + " " + r1grams[i + 2]
+                r3grams.append(r3gram)
+            if i < len(r1grams) - 3:
+                r4gram = r1grams[i] + " " + r1grams[i + 1] + " " + r1grams[i + 2] + " " + r1grams[i + 3]
+                r4grams.append(r4gram)
+        r2gramslist.append(r2grams)
+        r3gramslist.append(r3grams)
+        r4gramslist.append(r4grams)
+
+    for i in range(0, len(s1grams) - 1):
+        if i < len(s1grams) - 1:
+            s2gram = s1grams[i] + " " + s1grams[i + 1]
+            s2grams.append(s2gram)
+        if i < len(s1grams) - 2:
+            s3gram = s1grams[i] + " " + s1grams[i + 1] + " " + s1grams[i + 2]
+            s3grams.append(s3gram)
+        if i < len(s1grams) - 3:
+            s4gram = s1grams[i] + " " + s1grams[i + 1] + " " + s1grams[i + 2] + " " + s1grams[i + 3]
+            s4grams.append(s4gram)
+
+    for i in range(0, len(c1grams) - 1):
+        if i < len(c1grams) - 1:
+            c2gram = c1grams[i] + " " + c1grams[i + 1]
+            c2grams.append(c2gram)
+        if i < len(c1grams) - 2:
+            c3gram = c1grams[i] + " " + c1grams[i + 1] + " " + c1grams[i + 2]
+            c3grams.append(c3gram)
+        if i < len(c1grams) - 3:
+            c4gram = c1grams[i] + " " + c1grams[i + 1] + " " + c1grams[i + 2] + " " + c1grams[i + 3]
+            c4grams.append(c4gram)
+
+    (keep1score, del1score, add1score) = SARIngram(s1grams, c1grams, r1gramslist, numref)
+    (keep2score, del2score, add2score) = SARIngram(s2grams, c2grams, r2gramslist, numref)
+    (keep3score, del3score, add3score) = SARIngram(s3grams, c3grams, r3gramslist, numref)
+    (keep4score, del4score, add4score) = SARIngram(s4grams, c4grams, r4gramslist, numref)
+    avgkeepscore = sum([keep1score, keep2score, keep3score, keep4score]) / 4
+    avgdelscore = sum([del1score, del2score, del3score, del4score]) / 4
+    avgaddscore = sum([add1score, add2score, add3score, add4score]) / 4
+    finalscore = (avgkeepscore + avgdelscore + avgaddscore) / 3
+    return finalscore
+
+
+def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_str: bool = True):
+
+    # Normalization is requried for the ASSET dataset (one of the primary
+    # datasets in sentence simplification) to allow using space
+    # to split the sentence. Even though Wiki-Auto and TURK datasets,
+    # do not require normalization, we do it for consistency.
+    # Code adapted from the EASSE library [1] written by the authors of the ASSET dataset.
+    # [1] https://github.com/feralvam/easse/blob/580bba7e1378fc8289c663f864e0487188fe8067/easse/utils/preprocessing.py#L7
+
+    if lowercase:
+        sentence = sentence.lower()
+
+    if tokenizer in ["13a", "intl"]:
+        if version.parse(sacrebleu.__version__).major >= 2:
+            normalized_sent = sacrebleu.metrics.bleu._get_tokenizer(tokenizer)()(sentence)
+        else:
+            normalized_sent = sacrebleu.TOKENIZERS[tokenizer]()(sentence)
+    elif tokenizer == "moses":
+        normalized_sent = sacremoses.MosesTokenizer().tokenize(sentence, return_str=True, escape=False)
+    elif tokenizer == "penn":
+        normalized_sent = sacremoses.MosesTokenizer().penn_tokenize(sentence, return_str=True)
+    else:
+        normalized_sent = sentence
+
+    if not return_str:
+        normalized_sent = normalized_sent.split()
+
+    return normalized_sent
+
+
+def compute_sari(sources, predictions, references):
+
+    if not (len(sources) == len(predictions) == len(references)):
+        raise ValueError("Sources length must match predictions and references lengths.")
+    sari_score = 0
+    for src, pred, refs in zip(sources, predictions, references):
+        sari_score += SARIsent(normalize(src), normalize(pred), [normalize(sent) for sent in refs])
+    sari_score = sari_score / len(predictions)
+    return 100 * sari_score
+
+
+def compute_sacrebleu(
+    predictions,
+    references,
+    smooth_method="exp",
+    smooth_value=None,
+    force=False,
+    lowercase=False,
+    use_effective_order=False,
+):
+    references_per_prediction = len(references[0])
+    if any(len(refs) != references_per_prediction for refs in references):
+        raise ValueError("Sacrebleu requires the same number of references for each prediction")
+    transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]
+    output = sacrebleu.corpus_bleu(
+        predictions,
+        transformed_references,
+        smooth_method=smooth_method,
+        smooth_value=smooth_value,
+        force=force,
+        lowercase=lowercase,
+        use_effective_order=use_effective_order,
+    )
+    return output.score
+
+
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class WikiSplit(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Value("string", id="sequence"),
+                    }
+                ),
+            ],
+            codebase_urls=[
+                "https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py",
+                "https://github.com/cocoxu/simplification/blob/master/SARI.py",
+                "https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/sari_hook.py",
+                "https://github.com/mjpost/sacreBLEU",
+            ],
+            reference_urls=[
+                "https://www.aclweb.org/anthology/Q16-1029.pdf",
+                "https://github.com/mjpost/sacreBLEU",
+                "https://en.wikipedia.org/wiki/BLEU",
+                "https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213",
+            ],
+        )
+
+    def _compute(self, sources, predictions, references):
+        # if only one reference is provided make sure we still use list of lists
+        if isinstance(references[0], str):
+            references = [[ref] for ref in references]
+        result = {}
+        result.update({"sari": compute_sari(sources=sources, predictions=predictions, references=references)})
+        result.update({"sacrebleu": compute_sacrebleu(predictions=predictions, references=references)})
+        result.update({"exact": compute_em(predictions=predictions, references=references)})
+        return result
--- a/evaluate-0.4.2/metrics/xnli/README.md
+++ b/evaluate-0.4.2/metrics/xnli/README.md
+---
+title: XNLI
+emoji: 🤗 
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  XNLI is a subset of a few thousand examples from MNLI which has been translated
+  into 14 different languages (some low-ish resource). As with MNLI, the goal is
+  to predict textual entailment (does sentence A imply/contradict/neither sentence
+  B) and is a classification task (given two sentences, predict one of three
+  labels).
+---
+
+# Metric Card for XNLI
+
+## Metric description
+
+The XNLI metric evaluates a model's score on the [XNLI dataset](https://huggingface.co/datasets/xnli), which is a subset of a few thousand examples from the [MNLI dataset](https://huggingface.co/datasets/glue/viewer/mnli) that have been translated into 14 different languages, some of which are relatively low resource such as Swahili and Urdu.
+
+As with MNLI, the task is to predict textual entailment (does sentence A imply/contradict/neither sentence B) and is a classification task (given two sentences, predict one of three labels).
+
+## How to use 
+
+The XNLI metric is computed based on the `predictions` (a list of predicted labels) and the `references` (a list of ground truth labels).
+
+```python
+from evaluate import load
+xnli_metric = load("xnli")
+predictions = [0, 1]
+references = [0, 1]
+results = xnli_metric.compute(predictions=predictions, references=references)
+```
+
+## Output values
+
+The output of the XNLI metric is simply the `accuracy`, i.e. the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information). 
+
+### Values from popular papers
+The [original XNLI paper](https://arxiv.org/pdf/1809.05053.pdf) reported accuracies ranging from 59.3 (for `ur`) to 73.7 (for `en`) for the BiLSTM-max model.
+
+For more recent model performance, see the [dataset leaderboard](https://paperswithcode.com/dataset/xnli).
+
+## Examples 
+
+Maximal values:
+
+```python
+>>> from evaluate import load
+>>> xnli_metric = load("xnli")
+>>> predictions = [0, 1]
+>>> references = [0, 1]
+>>> results = xnli_metric.compute(predictions=predictions, references=references)
+>>> print(results)
+{'accuracy': 1.0}
+```
+
+Minimal values:
+
+```python
+>>> from evaluate import load
+>>> xnli_metric = load("xnli")
+>>> predictions = [1, 0]
+>>> references = [0, 1]
+>>> results = xnli_metric.compute(predictions=predictions, references=references)
+>>> print(results)
+{'accuracy': 0.0}
+```
+
+Partial match:
+
+```python
+>>> from evaluate import load
+>>> xnli_metric = load("xnli")
+>>> predictions = [1, 0, 1]
+>>> references = [1, 0, 0]
+>>> results = xnli_metric.compute(predictions=predictions, references=references)
+>>> print(results)
+{'accuracy': 0.6666666666666666}
+```
+
+## Limitations and bias
+
+While accuracy alone does give a certain indication of performance, it can be supplemented by error analysis and a better understanding of the model's mistakes on each of the categories represented in the dataset, especially if they are unbalanced. 
+
+While the XNLI dataset is multilingual and represents a diversity of languages, in reality, cross-lingual sentence understanding goes beyond translation, given that there are many cultural differences that have an impact on human sentiment annotations. Since the XNLI dataset was obtained by translation based on English sentences, it does not capture these cultural differences. 
+
+
+
+## Citation
+
+```bibtex
+@InProceedings{conneau2018xnli,
+  author = "Conneau, Alexis
+                 and Rinott, Ruty
+                 and Lample, Guillaume
+                 and Williams, Adina
+                 and Bowman, Samuel R.
+                 and Schwenk, Holger
+                 and Stoyanov, Veselin",
+  title = "XNLI: Evaluating Cross-lingual Sentence Representations",
+  booktitle = "Proceedings of the 2018 Conference on Empirical Methods
+               in Natural Language Processing",
+  year = "2018",
+  publisher = "Association for Computational Linguistics",
+  location = "Brussels, Belgium",
+}
+```
+    
+## Further References 
+
+- [XNLI Dataset GitHub](https://github.com/facebookresearch/XNLI)
+- [HuggingFace Tasks -- Text Classification](https://huggingface.co/tasks/text-classification)