Character error rate (CER) is a common metric of the performance of an automatic speech recognition system.
CER is similar to Word Error Rate (WER), but operates on characters instead of words. Please refer to the WER documentation for further information.
Character error rate can be computed as:
CER = (S + D + I) / N = (S + D + I) / (S + D + C)
where
S is the number of substitutions,
D is the number of deletions,
I is the number of insertions,
C is the number of correct characters,
N is the number of characters in the reference (N=S+D+C).
CER's output is not always a number between 0 and 1, in particular when there is a high number of insertions. This value is often associated with the percentage of characters that were incorrectly predicted. The lower the value, the better the
performance of the ASR system, with a CER of 0 being a perfect score.
---
# Metric Card for CER
## Metric description
Character error rate (CER) is a common metric of the performance of an automatic speech recognition (ASR) system. CER is similar to Word Error Rate (WER), but operates on characters instead of words.
Character error rate can be computed as:
`CER = (S + D + I) / N = (S + D + I) / (S + D + C)`
where
`S` is the number of substitutions,
`D` is the number of deletions,
`I` is the number of insertions,
`C` is the number of correct characters,
`N` is the number of characters in the reference (`N=S+D+C`).
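To make the formula concrete, here is a minimal, illustrative sketch (not this metric's actual implementation) that computes CER as a character-level Levenshtein distance divided by the reference length:
```python
# Illustrative sketch only: CER as character-level edit distance / reference length.
def character_error_rate(prediction: str, reference: str) -> float:
    n, m = len(reference), len(prediction)
    # dp[i][j] = minimum number of edits turning reference[:i] into prediction[:j]
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i  # i deletions
    for j in range(m + 1):
        dp[0][j] = j  # j insertions
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            substitution = 0 if reference[i - 1] == prediction[j - 1] else 1
            dp[i][j] = min(
                dp[i - 1][j] + 1,                # deletion
                dp[i][j - 1] + 1,                # insertion
                dp[i - 1][j - 1] + substitution  # substitution or match
            )
    return dp[n][m] / n  # (S + D + I) / N

print(character_error_rate("helo wrld", "hello world"))  # 2 edits / 11 reference characters ≈ 0.18
```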
## How to use
The metric takes two inputs: references (a list of references for each speech input) and predictions (a list of transcriptions to score).
This metric outputs a float representing the character error rate.
```python
from evaluate import load

cer = load("cer")
predictions = ["this is the prediction", "there is an other sample"]
references = ["this is the reference", "there is another one"]
cer_score = cer.compute(predictions=predictions, references=references)
print(cer_score)
# 0.34146341463414637
```
The **lower** the CER value, the **better** the performance of the ASR system, with a CER of 0 being a perfect score.
However, CER's output is not always a number between 0 and 1, in particular when there is a high number of insertions (see the worked illustration below).
### Values from popular papers
This metric is highly dependent on the content and quality of the dataset; users can therefore expect very different values for the same model on different datasets.
Multilingual datasets such as [Common Voice](https://huggingface.co/datasets/common_voice) report different CERs depending on the language, ranging from 0.02-0.03 for languages such as French and Italian, to 0.05-0.07 for English (see [here](https://github.com/speechbrain/speechbrain/tree/develop/recipes/CommonVoice/ASR/CTC) for more values).
CER is useful for comparing different models for tasks such as automatic speech recognition (ASR) and optical character recognition (OCR), especially for multilingual datasets where WER is not suitable given the diversity of languages. However, CER provides no details on the nature of the errors, and further work is therefore required to identify the main source(s) of error and to focus any research effort.
Also, in some cases, instead of reporting the raw CER, a normalized CER is reported where the number of mistakes is divided by the sum of the number of edit operations (`I` + `S` + `D`) and `C` (the number of correct characters), which results in CER values that fall within the range of 0–100%.
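As a hypothetical illustration of both points, take the reference `"ab"` and the prediction `"abxyz"`: every reference character is matched (`C = 2`, `S = D = 0`) but three extra characters are inserted (`I = 3`), so:
```
raw CER        = (S + D + I) / N               = (0 + 0 + 3) / 2               = 1.5
normalized CER = (S + D + I) / (S + D + I + C) = (0 + 0 + 3) / (0 + 0 + 3 + 2) = 0.6
```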
## Citation
```bibtex
@inproceedings{morris2004,
author={Morris, Andrew and Maier, Viktoria and Green, Phil},
year={2004},
month={01},
pages={},
title={From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}
```
## Further References
- [Hugging Face Tasks -- Automatic Speech Recognition](https://huggingface.co/tasks/automatic-speech-recognition)
_CITATION = """\
@inproceedings{morris2004,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}
"""
_DESCRIPTION="""\
Character error rate (CER) is a common metric of the performance of an automatic speech recognition system.
CER is similar to Word Error Rate (WER), but operates on characters instead of words. Please refer to the WER documentation for further information.
Character error rate can be computed as:
CER = (S + D + I) / N = (S + D + I) / (S + D + C)
where
S is the number of substitutions,
D is the number of deletions,
I is the number of insertions,
C is the number of correct characters,
N is the number of characters in the reference (N=S+D+C).
CER's output is not always a number between 0 and 1, in particular when there is a high number of insertions. This value is often associated with the percentage of characters that were incorrectly predicted. The lower the value, the better the
performance of the ASR system, with a CER of 0 being a perfect score.
"""
_KWARGS_DESCRIPTION="""
Computes CER score of transcribed segments against references.
Args:
references: list of references for each speech input.
predictions: list of transcriptions to score.
concatenate_texts: Whether or not to concatenate sentences before evaluation; set to True for a more accurate result.
Returns:
(float): the character error rate
Examples:
>>> predictions = ["this is the prediction", "there is an other sample"]
>>> references = ["this is the reference", "there is another one"]
>>> cer = evaluate.load("cer")
>>> cer_score = cer.compute(predictions=predictions, references=references)
>>> print(cer_score)
0.34146341463414637
"""
---
# Metric Card for CharCut
### Inputs
- **predictions**: a single prediction or a list of predictions to score. Each prediction should be a string with tokens separated by spaces.
- **references**: a single reference or a list of references, one for each prediction. Each reference should be a string with tokens separated by spaces.
### Output Values
- **`charcut_mt`**: the CharCut evaluation score (lower is better)
### Output Example
```python
{'charcut_mt': 0.1971153846153846}
```
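A minimal usage sketch, assuming the metric is exposed through the Hugging Face `evaluate` library under the id `charcut_mt` (the id is an assumption here, inferred from the output key above):
```python
import evaluate

# Hypothetical example inputs; tokens are separated by spaces as described above.
predictions = ["this is an example prediction"]
references = ["this is an example reference"]

charcut = evaluate.load("charcut_mt")  # assumed metric id
results = charcut.compute(predictions=predictions, references=references)
print(results)  # e.g. {'charcut_mt': <float>}; lower is better
```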
## Citation
```bibtex
@inproceedings{lardilleux-lepage-2017-charcut,
title="{CHARCUT}: Human-Targeted Character-Based {MT} Evaluation with Loose Differences",
author="Lardilleux, Adrien and
Lepage, Yves",
booktitle="Proceedings of the 14th International Conference on Spoken Language Translation",
month=dec#" 14-15",
year="2017",
address="Tokyo, Japan",
publisher="International Workshop on Spoken Language Translation",
url="https://aclanthology.org/2017.iwslt-1.20",
pages="146--153",
abstract="We present CHARCUT, a character-based machine translation evaluation metric derived from a human-targeted segment difference visualisation algorithm. It combines an iterative search for longest common substrings between the candidate and the reference translation with a simple length-based threshold, enabling loose differences that limit noisy character matches. Its main advantage is to produce scores that directly reflect human-readable string differences, making it a useful support tool for the manual analysis of MT output and its display to end users. Experiments on WMT16 metrics task data show that it is on par with the best {``}un-trained{''} metrics in terms of correlation with human judgement, well above BLEU and TER baselines, on both system and segment tasks.",
}
```
## Further References
- Repackaged version that is used in this HF implementation: [https://github.com/BramVanroy/CharCut](https://github.com/BramVanroy/CharCut)
- Original version: [https://github.com/alardill/CharCut](https://github.com/alardill/CharCut)
ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
and ChrF++ adds word n-grams as well, which correlates more strongly with direct assessment. We use the implementation
that is already present in sacrebleu.
The implementation here is slightly different from sacrebleu in terms of the required input format. The references and
hypotheses lists need to be the same length, so you may need to transpose your references compared to
sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
See the README.md file at https://github.com/mjpost/sacreBLEU#chrf--chrf for more information.
---
# Metric Card for chrF(++)
## Metric Description
ChrF and ChrF++ are two MT evaluation metrics that use the F-score statistic for character n-gram matches. ChrF++ additionally includes word n-grams, which correlate more strongly with direct assessment. We use the implementation that is already present in sacrebleu.
While this metric is included in sacreBLEU, the implementation here is slightly different from sacreBLEU in terms of the required input format: the references and hypotheses lists need to be the same length, so you may need to transpose your references compared to sacreBLEU's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
See the [sacreBLEU README.md](https://github.com/mjpost/sacreBLEU#chrf--chrf) for more information.
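As a sketch of what that transposition might look like (hypothetical reference strings; sacreBLEU groups references by reference stream, while this implementation groups them per prediction):
```python
# sacreBLEU-style layout: one sub-list per reference stream
# (all first references, then all second references, ...).
sacrebleu_refs = [
    ["first reference for sentence 1", "first reference for sentence 2"],
    ["second reference for sentence 1", "second reference for sentence 2"],
]

# This implementation expects one sub-list of references per prediction,
# i.e. the transpose of the layout above.
hf_refs = [list(refs) for refs in zip(*sacrebleu_refs)]
# -> [["first reference for sentence 1", "second reference for sentence 1"],
#     ["first reference for sentence 2", "second reference for sentence 2"]]
```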
## How to Use
At minimum, this metric requires a `list` of predictions and a `list` of `list`s of references:
```python
>>> import evaluate
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference)
>>> # `results` is a dict with the keys 'score', 'char_order', 'word_order' and 'beta'
```
### Inputs
- **`predictions`** (`list` of `str`): The predicted sentences.
- **`references`** (`list` of `list` of `str`): The references. There should be one reference sub-list for each prediction sentence.
- **`char_order`** (`int`): Character n-gram order. Defaults to `6`.
- **`word_order`** (`int`): Word n-gram order. If set to `2`, the metric is referred to as chrF++. Defaults to `0`.
- **`beta`** (`int`): Determines the importance of recall w.r.t. precision. Defaults to `2`.
- **`lowercase`** (`bool`): If `True`, enables case-insensitivity. Defaults to `False`.
- **`whitespace`** (`bool`): If `True`, includes whitespace when extracting character n-grams. Defaults to `False`.
- **`eps_smoothing`** (`bool`): If `True`, applies epsilon smoothing similar to the reference chrF++.py, NLTK, and Moses implementations. If `False`, takes into account effective match order similar to sacreBLEU < 2.0.0. Defaults to `False`.
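As a rough sketch of how several of these options can be combined in one call (illustrative values, using the `evaluate` interface shown above):
```python
import evaluate

chrf = evaluate.load("chrf")
prediction = ["The cat sat on the mat."]
reference = [["The cat sat on the mat.", "A cat was sitting on the mat."]]

results = chrf.compute(
    predictions=prediction,
    references=reference,
    char_order=6,     # character n-gram order (default)
    word_order=2,     # 2 -> chrF++
    beta=2,           # weight of recall relative to precision (default)
    lowercase=True,   # case-insensitive matching
)
print(results["score"])
```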
### Output Values
The output is a dictionary containing the following fields:
- **`'score'`** (`float`): The chrF (chrF++) score.
- **`'char_order'`** (`int`): The character n-gram order.
- **`'word_order'`** (`int`): The word n-gram order. If equal to `2`, the metric is referred to as chrF++.
- **`'beta'`** (`int`): Determines the importance of recall w.r.t. precision.
The chrF(++) score can be any value between `0.0` and `100.0`, inclusive.
#### Values from Popular Papers
### Examples
A simple example of calculating chrF:
```python
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference)
>>> # results["score"] is a float between 0.0 and 100.0
```
The same example, but with the argument `word_order=2`, to calculate chrF++ instead of chrF:
```python
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference, word_order=2)
```
The same chrF++ example as above, but with `lowercase=True` to normalize all case:
```python
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference, word_order=2, lowercase=True)
```
- According to [Popović 2017](https://www.statmt.org/wmt17/pdf/WMT70.pdf), chrF+ (where `word_order=1`) and chrF++ (where `word_order=2`) produce scores that correlate better with human judgements than chrF (where `word_order=0`) does.
## Citation
```bibtex
@inproceedings{popovic-2015-chrf,
title="chr{F}: character n-gram {F}-score for automatic {MT} evaluation",
author="Popovi{\'c}, Maja",
booktitle="Proceedings of the Tenth Workshop on Statistical Machine Translation",
month=sep,
year="2015",
address="Lisbon, Portugal",
publisher="Association for Computational Linguistics",
url="https://aclanthology.org/W15-3049",
doi="10.18653/v1/W15-3049",
pages="392--395",
}
@inproceedings{popovic-2017-chrf,
title="chr{F}++: words helping character n-grams",
author="Popovi{\'c}, Maja",
booktitle="Proceedings of the Second Conference on Machine Translation",
month=sep,
year="2017",
address="Copenhagen, Denmark",
publisher="Association for Computational Linguistics",
url="https://aclanthology.org/W17-4770",
doi="10.18653/v1/W17-4770",
pages="612--618",
}
@inproceedings{post-2018-call,
title="A Call for Clarity in Reporting {BLEU} Scores",
author="Post, Matt",
booktitle="Proceedings of the Third Conference on Machine Translation: Research Papers",
month=oct,
year="2018",
address="Belgium, Brussels",
publisher="Association for Computational Linguistics",
url="https://www.aclweb.org/anthology/W18-6319",
pages="186--191",
}
```
## Further References
- See the [sacreBLEU README.md](https://github.com/mjpost/sacreBLEU#chrf--chrf) for more information on this implementation.
_DESCRIPTION = """\
ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
and ChrF++ adds word n-grams as well, which correlates more strongly with direct assessment. We use the implementation
that is already present in sacrebleu.
The implementation here is slightly different from sacrebleu in terms of the required input format. The references and
hypotheses lists need to be the same length, so you may need to transpose your references compared to
sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
See the README.md file at https://github.com/mjpost/sacreBLEU#chrf--chrf for more information.
"""
_KWARGS_DESCRIPTION="""
Produces ChrF(++) scores for hypotheses given reference translations.
Args:
predictions (list of str): The predicted sentences.
references (list of list of str): The references. There should be one reference sub-list for each prediction sentence.
char_order (int): Character n-gram order. Defaults to `6`.
word_order (int): Word n-gram order. If equals to `2`, the metric is referred to as chrF++. Defaults to `0`.
beta (int): Determines the importance of recall w.r.t. precision. Defaults to `2`.
lowercase (bool): If `True`, enables case-insensitivity. Defaults to `False`.
whitespace (bool): If `True`, includes whitespace when extracting character n-grams. Defaults to `False`.
eps_smoothing (bool): If `True`, applies epsilon smoothing similar
to reference chrF++.py, NLTK and Moses implementations. If `False`,
it takes into account effective match order similar to sacreBLEU < 2.0.0. Defaults to `False`.
Returns:
'score' (float): The chrF (chrF++) score,
'char_order' (int): The character n-gram order,
'word_order' (int): The word n-gram order. If equal to 2, the metric is referred to as chrF++,
'beta' (int): Determines the importance of recall w.r.t. precision.
Examples:
Example 1--a simple example of calculating chrF:
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference)
Example 2--the same example, but with the argument word_order=2, to calculate chrF++ instead of chrF:
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference, word_order=2)
Example 3--the same chrF++ example as above, but with `lowercase=True` to normalize all case:
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference, word_order=2, lowercase=True)