# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" GLUE benchmark metric. """
import datasets
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
import evaluate
_CITATION = """\
@inproceedings{wang2019glue,
title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.},
note={In the Proceedings of ICLR.},
year={2019}
}
"""
_DESCRIPTION = """\
GLUE, the General Language Understanding Evaluation benchmark
(https://gluebenchmark.com/) is a collection of resources for training,
evaluating, and analyzing natural language understanding systems.
"""
_KWARGS_DESCRIPTION = """
Compute the GLUE evaluation metric associated with each GLUE dataset.
Args:
    predictions: list of predictions to score (as int64),
        except for 'stsb', where each prediction is a float.
    references: list of ground truth labels corresponding to the predictions (as int64),
        except for 'stsb', where each reference is a float.
Returns: depending on the GLUE subset, one or several of:
"accuracy": Accuracy
"f1": F1 score
"pearson": Pearson Correlation
"spearmanr": Spearman Correlation
"matthews_correlation": Matthew Correlation
Examples:
>>> glue_metric = evaluate.load('glue', 'sst2') # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0}
>>> glue_metric = evaluate.load('glue', 'mrpc') # 'mrpc' or 'qqp'
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0, 'f1': 1.0}
>>> glue_metric = evaluate.load('glue', 'stsb')
>>> references = [0., 1., 2., 3., 4., 5.]
>>> predictions = [0., 1., 2., 3., 4., 5.]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print({"pearson": round(results["pearson"], 2), "spearmanr": round(results["spearmanr"], 2)})
{'pearson': 1.0, 'spearmanr': 1.0}
>>> glue_metric = evaluate.load('glue', 'cola')
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'matthews_correlation': 1.0}
"""
def simple_accuracy(preds, labels):
return float((preds == labels).mean())
def acc_and_f1(preds, labels):
acc = simple_accuracy(preds, labels)
f1 = float(f1_score(y_true=labels, y_pred=preds))
return {
"accuracy": acc,
"f1": f1,
}
def pearson_and_spearman(preds, labels):
pearson_corr = float(pearsonr(preds, labels)[0])
spearman_corr = float(spearmanr(preds, labels)[0])
return {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
}
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Glue(evaluate.Metric):
def _info(self):
if self.config_name not in [
"sst2",
"mnli",
"mnli_mismatched",
"mnli_matched",
"cola",
"stsb",
"mrpc",
"qqp",
"qnli",
"rte",
"wnli",
"hans",
]:
raise KeyError(
"You should supply a configuration name selected in "
'["sst2", "mnli", "mnli_mismatched", "mnli_matched", '
'"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
)
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("int64" if self.config_name != "stsb" else "float32"),
"references": datasets.Value("int64" if self.config_name != "stsb" else "float32"),
}
),
codebase_urls=[],
reference_urls=[],
format="numpy",
)
def _compute(self, predictions, references):
if self.config_name == "cola":
return {"matthews_correlation": matthews_corrcoef(references, predictions)}
elif self.config_name == "stsb":
return pearson_and_spearman(predictions, references)
elif self.config_name in ["mrpc", "qqp"]:
return acc_and_f1(predictions, references)
elif self.config_name in ["sst2", "mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]:
return {"accuracy": simple_accuracy(predictions, references)}
else:
raise KeyError(
"You should supply a configuration name selected in "
'["sst2", "mnli", "mnli_mismatched", "mnli_matched", '
'"cola", "stsb", "mrpc", "qqp", "qnli", "rte", "wnli", "hans"]'
)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
scipy
scikit-learn
---
title: Google BLEU
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
The BLEU score has some undesirable properties when used for single
sentences, as it was designed to be a corpus measure. We therefore
use a slightly different score for our RL experiments which we call
the 'GLEU score'. For the GLEU score, we record all sub-sequences of
1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
compute a recall, which is the ratio of the number of matching n-grams
to the number of total n-grams in the target (ground truth) sequence,
and a precision, which is the ratio of the number of matching n-grams
to the number of total n-grams in the generated output sequence. Then
GLEU score is simply the minimum of recall and precision. This GLEU
score's range is always between 0 (no matches) and 1 (all match) and
it is symmetrical when switching output and target. According to
our experiments, GLEU score correlates quite well with the BLEU
metric on a corpus level but does not have its drawbacks for our per
sentence reward objective.
---
# Metric Card for Google BLEU
## Metric Description
The BLEU score has some undesirable properties when used for single sentences, as it was designed to be a corpus measure. The Google BLEU score is designed to limit these undesirable properties when used for single sentences.
To calculate this score, all sub-sequences of 1, 2, 3 or 4 tokens in output and target sequence (n-grams) are recorded. The precision and recall, described below, are then computed.
- **precision:** the ratio of the number of matching n-grams to the number of total n-grams in the generated output sequence
- **recall:** the ratio of the number of matching n-grams to the number of total n-grams in the target (ground truth) sequence
The minimum value of precision and recall is then returned as the score.
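As a rough, hand-rolled illustration of this computation (the metric itself relies on NLTK's `corpus_gleu` and, by default, the `tokenizer_13a` tokenizer), the sketch below counts matching n-grams for the sentence pair used in the How to Use section, assuming plain whitespace tokenization:
```python
from collections import Counter

def ngram_counts(tokens, min_len=1, max_len=4):
    """Count all n-grams of length min_len..max_len in a token list."""
    counts = Counter()
    for n in range(min_len, max_len + 1):
        for i in range(len(tokens) - n + 1):
            counts[tuple(tokens[i:i + n])] += 1
    return counts

prediction = "the cat sat on the mat".split()
reference = "the cat ate the mat".split()

pred_ngrams = ngram_counts(prediction)
ref_ngrams = ngram_counts(reference)
matching = sum((pred_ngrams & ref_ngrams).values())   # clipped n-gram matches

precision = matching / sum(pred_ngrams.values())      # matches / n-grams in the output
recall = matching / sum(ref_ngrams.values())          # matches / n-grams in the target
print(min(precision, recall))                         # 0.3333..., the Google BLEU score
```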
## Intended Uses
This metric is generally used to evaluate machine translation models. It is especially used when scores of individual (prediction, reference) sentence pairs are needed, as opposed to when averaging over the (prediction, reference) scores for a whole corpus. That being said, it can also be used when averaging over the scores for a whole corpus.
Because it performs better on individual sentence pairs as compared to BLEU, Google BLEU has also been used in RL experiments.
## How to Use
This metric takes a list of predicted sentences, as well as a list of references.
```python
sentence1 = "the cat sat on the mat"
sentence2 = "the cat ate the mat"
google_bleu = evaluate.load("google_bleu")
result = google_bleu.compute(predictions=[sentence1], references=[[sentence2]])
print(result)
>>> {'google_bleu': 0.3333333333333333}
```
### Inputs
- **predictions** (list of str): list of translations to score.
- **references** (list of list of str): list of lists of references for each translation.
- **tokenizer**: approach used for tokenizing `predictions` and `references`.
The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT. This can be replaced by any function that takes a string as input and returns a list of tokens as output (see the sketch after this list).
- **min_len** (int): The minimum order of n-gram this function should extract. Defaults to 1.
- **max_len** (int): The maximum order of n-gram this function should extract. Defaults to 4.
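As a hedged sketch of that replacement (the `whitespace_tokenizer` helper below is illustrative, not part of the metric), any string-in, token-list-out callable can be passed via the `tokenizer` argument:
```python
import evaluate

def whitespace_tokenizer(line):
    # any function that maps a string to a list of tokens works here
    return line.split()

google_bleu = evaluate.load("google_bleu")
result = google_bleu.compute(
    predictions=["the cat sat on the mat"],
    references=[["the cat ate the mat"]],
    tokenizer=whitespace_tokenizer,
)
print(result)  # for these sentences the score matches the default tokenizer's 0.333...
```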
### Output Values
This metric returns the following in a dict:
- **google_bleu** (float): google_bleu score
The output format is as follows:
```python
{'google_bleu': google_bleu score}
```
This metric can take on values from 0 to 1, inclusive. Higher scores are better, with 0 indicating no matches, and 1 indicating a perfect match.
Note that this score is symmetrical when switching output and target. This means that, given two sentences, `sentence1` and `sentence2`, whatever score is output when `sentence1` is the predicted sentence and `sentence2` is the reference sentence will be the same as when the sentences are swapped, and `sentence2` is the predicted sentence while `sentence1` is the reference sentence. In code, this looks like:
```python
predictions = "the cat sat on the mat"
references = "the cat ate the mat"
google_bleu = evaluate.load("google_bleu")
result_a = google_bleu.compute(predictions=[predictions], references=[[references]])
result_b = google_bleu.compute(predictions=[references], references=[[predictions]])
print(result_a == result_b)
>>> True
```
#### Values from Popular Papers
### Examples
Example with one reference per sample:
```python
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', 'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat'], ['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions, references=references)
>>> print(round(results["google_bleu"], 2))
0.44
```
Example with multiple references for the first sample:
```python
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', 'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', 'It is a guide to action that ensures that the rubber duck will never heed the cat commands', 'It is the practical guide for the rubber duck army never to heed the directions of the cat'], ['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions, references=references)
>>> print(round(results["google_bleu"], 2))
0.61
```
Example with multiple references for the first sample, and with `min_len` adjusted to `2`, instead of the default `1`, which means that the function extracts n-grams of length `2`:
```python
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', 'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', 'It is a guide to action that ensures that the rubber duck will never heed the cat commands', 'It is the practical guide for the rubber duck army never to heed the directions of the cat'], ['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions, references=references, min_len=2)
>>> print(round(results["google_bleu"], 2))
0.53
```
Example with multiple references for the first sample, with `min_len` adjusted to `2`, instead of the default `1`, and `max_len` adjusted to `6` instead of the default `4`:
```python
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', 'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', 'It is a guide to action that ensures that the rubber duck will never heed the cat commands', 'It is the practical guide for the rubber duck army never to heed the directions of the cat'], ['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions,references=references, min_len=2, max_len=6)
>>> print(round(results["google_bleu"], 2))
0.4
```
## Limitations and Bias
The GoogleBLEU metric does not come with a predefined tokenization function; previous versions simply used `split()` to split the input strings into tokens. Using a tokenizer such as the default one, `tokenizer_13a`, makes results more standardized and reproducible. The BLEU and sacreBLEU metrics also use this default tokenizer.
## Citation
```bibtex
@misc{wu2016googles,
title={Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
author={Yonghui Wu and Mike Schuster and Zhifeng Chen and Quoc V. Le and Mohammad Norouzi and Wolfgang Macherey and Maxim Krikun and Yuan Cao and Qin Gao and Klaus Macherey and Jeff Klingner and Apurva Shah and Melvin Johnson and Xiaobing Liu and Łukasz Kaiser and Stephan Gouws and Yoshikiyo Kato and Taku Kudo and Hideto Kazawa and Keith Stevens and George Kurian and Nishant Patil and Wei Wang and Cliff Young and Jason Smith and Jason Riesa and Alex Rudnick and Oriol Vinyals and Greg Corrado and Macduff Hughes and Jeffrey Dean},
year={2016},
eprint={1609.08144},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
## Further References
- This Hugging Face implementation uses the [nltk.translate.gleu_score implementation](https://www.nltk.org/_modules/nltk/translate/gleu_score.html)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("google_bleu")
launch_gradio_widget(module)
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Google BLEU (aka GLEU) metric. """
from typing import Dict, List
import datasets
from nltk.translate import gleu_score
import evaluate
from evaluate import MetricInfo
from .tokenizer_13a import Tokenizer13a
_CITATION = """\
@misc{wu2016googles,
title={Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation},
author={Yonghui Wu and Mike Schuster and Zhifeng Chen and Quoc V. Le and Mohammad Norouzi and Wolfgang Macherey
and Maxim Krikun and Yuan Cao and Qin Gao and Klaus Macherey and Jeff Klingner and Apurva Shah and Melvin
Johnson and Xiaobing Liu and Łukasz Kaiser and Stephan Gouws and Yoshikiyo Kato and Taku Kudo and Hideto
Kazawa and Keith Stevens and George Kurian and Nishant Patil and Wei Wang and Cliff Young and
Jason Smith and Jason Riesa and Alex Rudnick and Oriol Vinyals and Greg Corrado and Macduff Hughes
and Jeffrey Dean},
year={2016},
eprint={1609.08144},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
_DESCRIPTION = """\
The BLEU score has some undesirable properties when used for single
sentences, as it was designed to be a corpus measure. We therefore
use a slightly different score for our RL experiments which we call
the 'GLEU score'. For the GLEU score, we record all sub-sequences of
1, 2, 3 or 4 tokens in output and target sequence (n-grams). We then
compute a recall, which is the ratio of the number of matching n-grams
to the number of total n-grams in the target (ground truth) sequence,
and a precision, which is the ratio of the number of matching n-grams
to the number of total n-grams in the generated output sequence. Then
GLEU score is simply the minimum of recall and precision. This GLEU
score's range is always between 0 (no matches) and 1 (all match) and
it is symmetrical when switching output and target. According to
our experiments, GLEU score correlates quite well with the BLEU
metric on a corpus level but does not have its drawbacks for our per
sentence reward objective.
"""
_KWARGS_DESCRIPTION = """\
Computes corpus-level Google BLEU (GLEU) score of translated segments against one or more references.
Instead of averaging the sentence level GLEU scores (i.e. macro-average precision), Wu et al. (2016) sum up the matching
tokens and the max of hypothesis and reference tokens for each sentence, then compute using the aggregate values.
Args:
predictions (list of str): list of translations to score.
references (list of list of str): list of lists of references for each translation.
tokenizer : approach used for tokenizing `predictions` and `references`.
The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
This can be replaced by any function that takes a string as input and returns a list of tokens as output.
min_len (int): The minimum order of n-gram this function should extract. Defaults to 1.
max_len (int): The maximum order of n-gram this function should extract. Defaults to 4.
Returns:
'google_bleu': google_bleu score
Examples:
Example 1:
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat'], \
['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions, references=references)
>>> print(round(results["google_bleu"], 2))
0.44
Example 2:
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', \
'It is a guide to action that ensures that the rubber duck will never heed the cat commands', \
'It is the practical guide for the rubber duck army never to heed the directions of the cat'], \
['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions, references=references)
>>> print(round(results["google_bleu"], 2))
0.61
Example 3:
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', \
'It is a guide to action that ensures that the rubber duck will never heed the cat commands', \
'It is the practical guide for the rubber duck army never to heed the directions of the cat'], \
['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions, references=references, min_len=2)
>>> print(round(results["google_bleu"], 2))
0.53
Example 4:
>>> predictions = ['It is a guide to action which ensures that the rubber duck always disobeys the commands of the cat', \
'he read the book because he was interested in world history']
>>> references = [['It is the guiding principle which guarantees the rubber duck forces never being under the command of the cat', \
'It is a guide to action that ensures that the rubber duck will never heed the cat commands', \
'It is the practical guide for the rubber duck army never to heed the directions of the cat'], \
['he was interested in world history because he read the book']]
>>> google_bleu = evaluate.load("google_bleu")
>>> results = google_bleu.compute(predictions=predictions,references=references, min_len=2, max_len=6)
>>> print(round(results["google_bleu"], 2))
0.4
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class GoogleBleu(evaluate.Metric):
def _info(self) -> MetricInfo:
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
}
),
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
],
)
def _compute(
self,
predictions: List[str],
references: List[List[str]],
tokenizer=Tokenizer13a(),
min_len: int = 1,
max_len: int = 4,
) -> Dict[str, float]:
# if only one reference is provided make sure we still use list of lists
if isinstance(references[0], str):
references = [[ref] for ref in references]
references = [[tokenizer(r) for r in ref] for ref in references]
predictions = [tokenizer(p) for p in predictions]
return {
"google_bleu": gleu_score.corpus_gleu(
list_of_references=references, hypotheses=predictions, min_len=min_len, max_len=max_len
)
}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
nltk
# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
# Copyright 2020 SacreBLEU Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from functools import lru_cache
class BaseTokenizer:
"""A base dummy tokenizer to derive from."""
def signature(self):
"""
Returns a signature for the tokenizer.
:return: signature string
"""
return "none"
def __call__(self, line):
"""
Tokenizes an input line with the tokenizer.
:param line: a segment to tokenize
:return: the tokenized line
"""
return line
class TokenizerRegexp(BaseTokenizer):
def signature(self):
return "re"
def __init__(self):
self._re = [
# language-dependent part (assuming Western languages)
(re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
# tokenize period and comma unless preceded by a digit
(re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
# tokenize period and comma unless followed by a digit
(re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
# tokenize dash when preceded by a digit
(re.compile(r"([0-9])(-)"), r"\1 \2 "),
# one space only between words
# NOTE: Doing this in Python (below) is faster
# (re.compile(r'\s+'), r' '),
]
@lru_cache(maxsize=2**16)
def __call__(self, line):
"""Common post-processing tokenizer for `13a` and `zh` tokenizers.
:param line: a segment to tokenize
:return: the tokenized line
"""
for (_re, repl) in self._re:
line = _re.sub(repl, line)
# no leading or trailing spaces, single space within words
# return ' '.join(line.split())
        # This line is changed with regard to the original tokenizer (seen above) to return individual words
return line.split()
class Tokenizer13a(BaseTokenizer):
def signature(self):
return "13a"
def __init__(self):
self._post_tokenizer = TokenizerRegexp()
@lru_cache(maxsize=2**16)
def __call__(self, line):
"""Tokenizes an input line using a relatively minimal tokenization
that is however equivalent to mteval-v13a, used by WMT.
:param line: a segment to tokenize
:return: the tokenized line
"""
# language-independent part:
line = line.replace("<skipped>", "")
line = line.replace("-\n", "")
line = line.replace("\n", " ")
if "&" in line:
line = line.replace("&quot;", '"')
line = line.replace("&amp;", "&")
line = line.replace("&lt;", "<")
line = line.replace("&gt;", ">")
return self._post_tokenizer(f" {line} ")
---
title: IndicGLUE
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
IndicGLUE is a natural language understanding benchmark for Indian languages. It contains a wide
variety of tasks and covers 11 major Indian languages - as, bn, gu, hi, kn, ml, mr, or, pa, ta, te.
---
# Metric Card for IndicGLUE
## Metric description
This metric is used to compute the evaluation metric for the [IndicGLUE dataset](https://huggingface.co/datasets/indic_glue).
IndicGLUE is a natural language understanding benchmark for Indian languages. It contains a wide variety of tasks and covers 11 major Indian languages - Assamese (`as`), Bengali (`bn`), Gujarati (`gu`), Hindi (`hi`), Kannada (`kn`), Malayalam (`ml`), Marathi (`mr`), Oriya (`or`), Panjabi (`pa`), Tamil (`ta`) and Telugu (`te`).
## How to use
There are two steps: (1) loading the IndicGLUE metric relevant to the subset of the dataset being used for evaluation; and (2) calculating the metric.
1. **Loading the relevant IndicGLUE metric**: the subsets of IndicGLUE are the following: `wnli`, `copa`, `sna`, `csqa`, `wstp`, `inltkh`, `bbca`, `cvit-mkb-clsr`, `iitp-mr`, `iitp-pr`, `actsa-sc`, `md`, and `wiki-ner`.
More information about the different subsets of the Indic GLUE dataset can be found on the [IndicGLUE dataset page](https://indicnlp.ai4bharat.org/indic-glue/).
2. **Calculating the metric**: the metric takes two inputs: one list with the predictions of the model to score and one list of reference labels, for all subsets of the dataset except for `cvit-mkb-clsr`, where each prediction and reference is a vector of floats.
```python
indic_glue_metric = evaluate.load('indic_glue', 'wnli')
references = [0, 1]
predictions = [0, 1]
results = indic_glue_metric.compute(predictions=predictions, references=references)
```
## Output values
The output of the metric depends on the IndicGLUE subset chosen, consisting of a dictionary that contains one or several of the following metrics:
`accuracy`: the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information).
`f1`: the harmonic mean of the precision and recall (see [F1 score](https://huggingface.co/metrics/f1) for more information). Its range is 0-1 -- its lowest possible value is 0, if either the precision or the recall is 0, and its highest possible value is 1.0, which means perfect precision and recall.
`precision@10`: the fraction of the true examples among the top 10 predicted examples, with a range between 0 and 1 (see [precision](https://huggingface.co/metrics/precision) for more information).
The `cvit-mkb-clsr` subset returns `precision@10`, the `wiki-ner` subset returns `accuracy` and `f1`, and all other subsets of Indic GLUE return only accuracy.
### Values from popular papers
The [original IndicGLUE paper](https://aclanthology.org/2020.findings-emnlp.445.pdf) reported an average accuracy of 0.766 on the dataset, which varies depending on the subset selected.
## Examples
Maximal values for the WNLI subset (which outputs `accuracy`):
```python
indic_glue_metric = evaluate.load('indic_glue', 'wnli')
references = [0, 1]
predictions = [0, 1]
results = indic_glue_metric.compute(predictions=predictions, references=references)
print(results)
{'accuracy': 1.0}
```
Minimal values for the Wiki-NER subset (which outputs `accuracy` and `f1`):
```python
>>> indic_glue_metric = evaluate.load('indic_glue', 'wiki-ner')
>>> references = [0, 1]
>>> predictions = [1,0]
>>> results = indic_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 0.0, 'f1': 0.0}
```
Example for the CVIT-Mann Ki Baat subset (which outputs `precision@10`):
```python
>>> indic_glue_metric = evaluate.load('indic_glue', 'cvit-mkb-clsr')
>>> references = [[0.5, 0.5, 0.5], [0.1, 0.2, 0.3]]
>>> predictions = [[0.5, 0.5, 0.5], [0.1, 0.2, 0.3]]
>>> results = indic_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'precision@10': 1.0}
```
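Under the hood, the `cvit-mkb-clsr` subset treats each prediction/reference pair as sentence vectors: a pair counts as a hit if the reference vector is among the 10 nearest candidates (by cosine distance) for its prediction vector. The following sketch mirrors the module's `precision_at_10` implementation:
```python
import numpy as np
from scipy.spatial.distance import cdist

def precision_at_10(en_sentvecs, in_sentvecs):
    en = np.array(en_sentvecs, dtype=float)
    ind = np.array(in_sentvecs, dtype=float)
    # mean-center both sets of sentence vectors
    en = en - en.mean(axis=0)
    ind = ind - ind.mean(axis=0)
    sim = cdist(en, ind, "cosine")             # pairwise cosine distances
    top10 = sim.argsort(axis=1)[:, :10]        # indices of the 10 nearest candidates per row
    hits = np.any(top10 == np.arange(len(en))[:, None], axis=1)
    return float(hits.mean())                  # fraction of pairs retrieved correctly
```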
## Limitations and bias
This metric works only with datasets that have the same format as the [IndicGLUE dataset](https://huggingface.co/datasets/indic_glue).
## Citation
```bibtex
@inproceedings{kakwani2020indicnlpsuite,
title={{IndicNLPSuite: Monolingual Corpora, Evaluation Benchmarks and Pre-trained Multilingual Language Models for Indian Languages}},
author={Divyanshu Kakwani and Anoop Kunchukuttan and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar},
year={2020},
booktitle={Findings of EMNLP},
}
```
## Further References
- [IndicNLP website](https://indicnlp.ai4bharat.org/home/)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("indic_glue", "wnli")
launch_gradio_widget(module)
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" IndicGLUE benchmark metric. """
import datasets
import numpy as np
from scipy.spatial.distance import cdist
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score
import evaluate
_CITATION = """\
@inproceedings{kakwani2020indicnlpsuite,
title={{IndicNLPSuite: Monolingual Corpora, Evaluation Benchmarks and Pre-trained Multilingual Language Models for Indian Languages}},
author={Divyanshu Kakwani and Anoop Kunchukuttan and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar},
year={2020},
booktitle={Findings of EMNLP},
}
"""
_DESCRIPTION = """\
IndicGLUE is a natural language understanding benchmark for Indian languages. It contains a wide
variety of tasks and covers 11 major Indian languages - as, bn, gu, hi, kn, ml, mr, or, pa, ta, te.
"""
_KWARGS_DESCRIPTION = """
Compute the IndicGLUE evaluation metric associated with each IndicGLUE dataset.
Args:
predictions: list of predictions to score (as int64),
except for 'cvit-mkb-clsr' where each prediction is a vector (of float32).
references: list of ground truth labels corresponding to the predictions (as int64),
except for 'cvit-mkb-clsr' where each reference is a vector (of float32).
Returns: depending on the IndicGLUE subset, one or several of:
"accuracy": Accuracy
"f1": F1 score
"precision": Precision@10
Examples:
>>> indic_glue_metric = evaluate.load('indic_glue', 'wnli') # 'wnli' or any of ["copa", "sna", "csqa", "wstp", "inltkh", "bbca", "iitp-mr", "iitp-pr", "actsa-sc", "md"]
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = indic_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0}
>>> indic_glue_metric = evaluate.load('indic_glue', 'wiki-ner')
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = indic_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0, 'f1': 1.0}
>>> indic_glue_metric = evaluate.load('indic_glue', 'cvit-mkb-clsr')
>>> references = [[0.5, 0.5, 0.5], [0.1, 0.2, 0.3]]
>>> predictions = [[0.5, 0.5, 0.5], [0.1, 0.2, 0.3]]
>>> results = indic_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'precision@10': 1.0}
"""
def simple_accuracy(preds, labels):
return float((preds == labels).mean())
def acc_and_f1(preds, labels):
acc = simple_accuracy(preds, labels)
f1 = float(f1_score(y_true=labels, y_pred=preds))
return {
"accuracy": acc,
"f1": f1,
}
def precision_at_10(en_sentvecs, in_sentvecs):
en_sentvecs = np.array(en_sentvecs)
in_sentvecs = np.array(in_sentvecs)
n = en_sentvecs.shape[0]
# mean centering
en_sentvecs = en_sentvecs - np.mean(en_sentvecs, axis=0)
in_sentvecs = in_sentvecs - np.mean(in_sentvecs, axis=0)
sim = cdist(en_sentvecs, in_sentvecs, "cosine")
actual = np.array(range(n))
preds = sim.argsort(axis=1)[:, :10]
matches = np.any(preds == actual[:, None], axis=1)
return float(matches.mean())
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class IndicGlue(evaluate.Metric):
def _info(self):
if self.config_name not in [
"wnli",
"copa",
"sna",
"csqa",
"wstp",
"inltkh",
"bbca",
"cvit-mkb-clsr",
"iitp-mr",
"iitp-pr",
"actsa-sc",
"md",
"wiki-ner",
]:
raise KeyError(
"You should supply a configuration name selected in "
'["wnli", "copa", "sna", "csqa", "wstp", "inltkh", "bbca", '
'"cvit-mkb-clsr", "iitp-mr", "iitp-pr", "actsa-sc", "md", '
'"wiki-ner"]'
)
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("int64")
if self.config_name != "cvit-mkb-clsr"
else datasets.Sequence(datasets.Value("float32")),
"references": datasets.Value("int64")
if self.config_name != "cvit-mkb-clsr"
else datasets.Sequence(datasets.Value("float32")),
}
),
codebase_urls=[],
reference_urls=[],
format="numpy" if self.config_name != "cvit-mkb-clsr" else None,
)
def _compute(self, predictions, references):
if self.config_name == "cvit-mkb-clsr":
return {"precision@10": precision_at_10(predictions, references)}
elif self.config_name in ["wiki-ner"]:
return acc_and_f1(predictions, references)
elif self.config_name in [
"wnli",
"copa",
"sna",
"csqa",
"wstp",
"inltkh",
"bbca",
"iitp-mr",
"iitp-pr",
"actsa-sc",
"md",
]:
return {"accuracy": simple_accuracy(predictions, references)}
else:
raise KeyError(
"You should supply a configuration name selected in "
'["wnli", "copa", "sna", "csqa", "wstp", "inltkh", "bbca", '
'"cvit-mkb-clsr", "iitp-mr", "iitp-pr", "actsa-sc", "md", '
'"wiki-ner"]'
)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
scipy
scikit-learn
---
title: MAE
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
Mean Absolute Error (MAE) is the mean of the magnitude of difference between the predicted and actual
values.
---
# Metric Card for MAE
## Metric Description
Mean Absolute Error (MAE) is the mean of the magnitude of difference between the predicted and actual numeric values:
![image](https://user-images.githubusercontent.com/14205986/165824243-e1078dfd-489d-456c-a0da-cbaa28726220.png)
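In symbols (the standard definition, which the image above depicts), with predictions $\hat{y}_i$ and references $y_i$ over $n$ samples:

$$\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n} \left| y_i - \hat{y}_i \right|$$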
## How to Use
At minimum, this metric requires predictions and references as inputs.
```python
>>> mae_metric = evaluate.load("mae")
>>> predictions = [2.5, 0.0, 2, 8]
>>> references = [3, -0.5, 2, 7]
>>> results = mae_metric.compute(predictions=predictions, references=references)
```
### Inputs
Mandatory inputs:
- `predictions`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the estimated target values.
- `references`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the ground truth (correct) target values.
Optional arguments:
- `sample_weight`: numeric array-like of shape (`n_samples,`) representing sample weights. The default is `None`.
- `multioutput`: `raw_values`, `uniform_average` or numeric array-like of shape (`n_outputs,`), which defines the aggregation of multiple output values. The default value is `uniform_average`.
- `raw_values` returns a full set of errors in case of multioutput input.
- `uniform_average` means that the errors of all outputs are averaged with uniform weight.
- the array-like value defines weights used to average errors.
### Output Values
This metric outputs a dictionary, containing the mean absolute error score, which is of type:
- `float`: if multioutput is `uniform_average` or an ndarray of weights, then the weighted average of all output errors is returned.
- numeric array-like of shape (`n_outputs,`): if multioutput is `raw_values`, then the score is returned for each output separately.
Each MAE `float` value ranges from `0.0` to `+inf`, with the best value being 0.0.
Output Example(s):
```python
{'mae': 0.5}
```
If `multioutput="raw_values"`:
```python
{'mae': array([0.5, 1. ])}
```
#### Values from Popular Papers
### Examples
Example with the `uniform_average` config:
```python
>>> mae_metric = evaluate.load("mae")
>>> predictions = [2.5, 0.0, 2, 8]
>>> references = [3, -0.5, 2, 7]
>>> results = mae_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'mae': 0.5}
```
Example with multi-dimensional lists, and the `raw_values` config:
```python
>>> mae_metric = evaluate.load("mae", "multilist")
>>> predictions = [[0.5, 1], [-1, 1], [7, -6]]
>>> references = [[0, 2], [-1, 2], [8, -5]]
>>> results = mae_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'mae': 0.75}
>>> results = mae_metric.compute(predictions=predictions, references=references, multioutput='raw_values')
>>> print(results)
{'mae': array([0.5, 1. ])}
```
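Example with multi-dimensional lists and explicit per-output weights passed to `multioutput` (the weights `[0.3, 0.7]` below are purely illustrative):
```python
>>> mae_metric = evaluate.load("mae", "multilist")
>>> predictions = [[0.5, 1], [-1, 1], [7, -6]]
>>> references = [[0, 2], [-1, 2], [8, -5]]
>>> results = mae_metric.compute(predictions=predictions, references=references, multioutput=[0.3, 0.7])
>>> print(round(results["mae"], 2))
0.85
```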
## Limitations and Bias
One limitation of MAE is that the relative size of the error is not always obvious, meaning that it can be difficult to tell a big error from a smaller one -- metrics such as Mean Absolute Percentage Error (MAPE) have been proposed to calculate MAE in percentage terms.
Also, since it calculates the mean, MAE may underestimate the impact of big, but infrequent, errors -- metrics such as the Root Mean Square Error (RMSE) compensate for this by taking the root of error values.
## Citation(s)
```bibtex
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
```
```bibtex
@article{willmott2005advantages,
title={Advantages of the mean absolute error (MAE) over the root mean square error (RMSE) in assessing average model performance},
author={Willmott, Cort J and Matsuura, Kenji},
journal={Climate research},
volume={30},
number={1},
pages={79--82},
year={2005}
}
```
## Further References
- [Mean Absolute Error - Wikipedia](https://en.wikipedia.org/wiki/Mean_absolute_error)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("mae")
launch_gradio_widget(module)
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MAE - Mean Absolute Error Metric"""
import datasets
from sklearn.metrics import mean_absolute_error
import evaluate
_CITATION = """\
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
"""
_DESCRIPTION = """\
Mean Absolute Error (MAE) is the mean of the magnitude of difference between the predicted and actual
values.
"""
_KWARGS_DESCRIPTION = """
Args:
predictions: array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
references: array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
sample_weight: array-like of shape (n_samples,), default=None
Sample weights.
multioutput: {"raw_values", "uniform_average"} or array-like of shape (n_outputs,), default="uniform_average"
Defines aggregating of multiple output values. Array-like value defines weights used to average errors.
"raw_values" : Returns a full set of errors in case of multioutput input.
"uniform_average" : Errors of all outputs are averaged with uniform weight.
Returns:
mae : mean absolute error.
If multioutput is "raw_values", then mean absolute error is returned for each output separately. If multioutput is "uniform_average" or an ndarray of weights, then the weighted average of all output errors is returned.
MAE output is non-negative floating point. The best value is 0.0.
Examples:
>>> mae_metric = evaluate.load("mae")
>>> predictions = [2.5, 0.0, 2, 8]
>>> references = [3, -0.5, 2, 7]
>>> results = mae_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'mae': 0.5}
If you're using multi-dimensional lists, then set the config as follows :
>>> mae_metric = evaluate.load("mae", "multilist")
>>> predictions = [[0.5, 1], [-1, 1], [7, -6]]
>>> references = [[0, 2], [-1, 2], [8, -5]]
>>> results = mae_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'mae': 0.75}
>>> results = mae_metric.compute(predictions=predictions, references=references, multioutput='raw_values')
>>> print(results)
{'mae': array([0.5, 1. ])}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Mae(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(self._get_feature_types()),
reference_urls=[
"https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html"
],
)
def _get_feature_types(self):
if self.config_name == "multilist":
return {
"predictions": datasets.Sequence(datasets.Value("float")),
"references": datasets.Sequence(datasets.Value("float")),
}
else:
return {
"predictions": datasets.Value("float"),
"references": datasets.Value("float"),
}
def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average"):
mae_score = mean_absolute_error(references, predictions, sample_weight=sample_weight, multioutput=multioutput)
return {"mae": mae_score}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
scikit-learn
---
title: Mahalanobis Distance
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
Compute the Mahalanobis Distance
  Mahalanobis distance is the distance between a point and a distribution,
  not between two distinct points. It is effectively a multivariate equivalent of the Euclidean distance.
It was introduced by Prof. P. C. Mahalanobis in 1936
and has been used in various statistical applications ever since
[source: https://www.machinelearningplus.com/statistics/mahalanobis-distance/]
---
# Metric Card for Mahalanobis Distance
## Metric Description
Mahalanobis distance is the distance between a point and a distribution (as opposed to the distance between two points), making it the multivariate equivalent of the Euclidean distance.
It is often used in multivariate anomaly detection, classification on highly imbalanced datasets and one-class classification.
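As a minimal numpy sketch of the underlying computation (assuming per-feature mean centering and a pseudo-inverse of the sample covariance, which is singular for this toy reference set), the quadratic form below reproduces the `0.5` returned in the examples that follow:
```python
import numpy as np

reference_distribution = np.array([[0, 1], [1, 0]], dtype=float)
x = np.array([0, 1], dtype=float)

mu = reference_distribution.mean(axis=0)   # per-feature mean of the reference points
cov = np.cov(reference_distribution.T)     # sample covariance of the reference points
inv_cov = np.linalg.pinv(cov)              # pseudo-inverse; the covariance here is singular
diff = x - mu
print(round(float(diff @ inv_cov @ diff), 6))  # 0.5, matching the metric's example output
```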
## How to Use
At minimum, this metric requires two `list`s of datapoints:
```python
>>> mahalanobis_metric = evaluate.load("mahalanobis")
>>> results = mahalanobis_metric.compute(reference_distribution=[[0, 1], [1, 0]], X=[[0, 1]])
```
### Inputs
- `X` (`list`): data points to be compared with the `reference_distribution`.
- `reference_distribution` (`list`): data points from the reference distribution that we want to compare to.
### Output Values
`mahalanobis` (`array`): the Mahalanobis distance for each data point in `X`.
```python
>>> print(results)
{'mahalanobis': array([0.5])}
```
#### Values from Popular Papers
*N/A*
### Example
```python
>>> mahalanobis_metric = evaluate.load("mahalanobis")
>>> results = mahalanobis_metric.compute(reference_distribution=[[0, 1], [1, 0]], X=[[0, 1]])
>>> print(results)
{'mahalanobis': array([0.5])}
```
## Limitations and Bias
The Mahalanobis distance is only able to capture linear relationships between the variables, which means it cannot capture all types of outliers. Mahalanobis distance also fails to faithfully represent data that is highly skewed or multimodal.
## Citation
```bibtex
@inproceedings{mahalanobis1936generalized,
title={On the generalized distance in statistics},
author={Mahalanobis, Prasanta Chandra},
year={1936},
organization={National Institute of Science of India}
}
```
```bibtex
@article{de2000mahalanobis,
title={The Mahalanobis distance},
author={De Maesschalck, Roy and Jouan-Rimbaud, Delphine and Massart, D{\'e}sir{\'e} L},
journal={Chemometrics and intelligent laboratory systems},
volume={50},
number={1},
pages={1--18},
year={2000},
publisher={Elsevier}
}
```
## Further References
- [Wikipedia -- Mahalanobis Distance](https://en.wikipedia.org/wiki/Mahalanobis_distance)
- [Machine Learning Plus -- Mahalanobis Distance](https://www.machinelearningplus.com/statistics/mahalanobis-distance/)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("mahalanobis")
launch_gradio_widget(module)
# Copyright 2021 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Mahalanobis metric."""
import datasets
import numpy as np
import evaluate
_DESCRIPTION = """
Compute the Mahalanobis Distance
Mahalanobis distance is the distance between a point and a distribution,
not between two distinct points. It is effectively a multivariate equivalent of the Euclidean distance.
It was introduced by Prof. P. C. Mahalanobis in 1936
and has been used in various statistical applications ever since
[source: https://www.machinelearningplus.com/statistics/mahalanobis-distance/]
"""
_CITATION = """\
@article{de2000mahalanobis,
title={The mahalanobis distance},
author={De Maesschalck, Roy and Jouan-Rimbaud, Delphine and Massart, D{\'e}sir{\'e} L},
journal={Chemometrics and intelligent laboratory systems},
volume={50},
number={1},
pages={1--18},
year={2000},
publisher={Elsevier}
}
"""
_KWARGS_DESCRIPTION = """
Args:
X: List of datapoints to be compared with the `reference_distribution`.
reference_distribution: List of datapoints from the reference distribution we want to compare to.
Returns:
    mahalanobis: The Mahalanobis distance for each datapoint in `X`.
Examples:
>>> mahalanobis_metric = evaluate.load("mahalanobis")
>>> results = mahalanobis_metric.compute(reference_distribution=[[0, 1], [1, 0]], X=[[0, 1]])
>>> print(results)
{'mahalanobis': array([0.5])}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Mahalanobis(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"X": datasets.Sequence(datasets.Value("float", id="sequence"), id="X"),
}
),
)
def _compute(self, X, reference_distribution):
# convert to numpy arrays
X = np.array(X)
reference_distribution = np.array(reference_distribution)
# Assert that arrays are 2D
if len(X.shape) != 2:
raise ValueError("Expected `X` to be a 2D vector")
if len(reference_distribution.shape) != 2:
raise ValueError("Expected `reference_distribution` to be a 2D vector")
if reference_distribution.shape[0] < 2:
raise ValueError(
"Expected `reference_distribution` to be a 2D vector with more than one element in the first dimension"
)
# Get mahalanobis distance for each prediction
        # center X on the per-feature (column-wise) mean of the reference distribution
        X_minus_mu = X - np.mean(reference_distribution, axis=0)
cov = np.cov(reference_distribution.T)
try:
inv_covmat = np.linalg.inv(cov)
except np.linalg.LinAlgError:
inv_covmat = np.linalg.pinv(cov)
left_term = np.dot(X_minus_mu, inv_covmat)
mahal_dist = np.dot(left_term, X_minus_mu.T).diagonal()
return {"mahalanobis": mahal_dist}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
---
title: MAPE
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
  Mean Absolute Percentage Error (MAPE) is the mean of the absolute percentage error between the predicted and actual
  values.
---
# Metric Card for MAPE
## Metric Description
Mean Absolute Percentage Error (MAPE) is the mean of the absolute percentage error between the predicted $x_i$ and actual $y_i$ numeric values:
![image](https://user-images.githubusercontent.com/8100/200005316-c3975d32-8978-40f3-b541-c2ef57ec7c5b.png)
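In symbols (the standard definition, which the image above depicts), with predictions $x_i$ and actual values $y_i$ over $n$ samples:

$$\mathrm{MAPE} = \frac{1}{n}\sum_{i=1}^{n} \left| \frac{y_i - x_i}{y_i} \right|$$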
## How to Use
At minimum, this metric requires predictions and references as inputs.
```python
>>> mape_metric = evaluate.load("mape")
>>> predictions = [2.5, 0.0, 2, 8]
>>> references = [3, -0.5, 2, 7]
>>> results = mape_metric.compute(predictions=predictions, references=references)
```
### Inputs
Mandatory inputs:
- `predictions`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the estimated target values.
- `references`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the ground truth (correct) target values.
Optional arguments:
- `sample_weight`: numeric array-like of shape (`n_samples,`) representing sample weights. The default is `None`.
- `multioutput`: `raw_values`, `uniform_average` or numeric array-like of shape (`n_outputs,`), which defines the aggregation of multiple output values. The default value is `uniform_average`.
- `raw_values` returns a full set of errors in case of multioutput input.
- `uniform_average` means that the errors of all outputs are averaged with uniform weight.
- the array-like value defines weights used to average errors.
### Output Values
This metric outputs a dictionary, containing the mean absolute percentage error score, which is of type:
- `float`: if multioutput is `uniform_average` or an ndarray of weights, then the weighted average of all output errors is returned.
- numeric array-like of shape (`n_outputs,`): if multioutput is `raw_values`, then the score is returned for each output separately.
Each MAPE `float` value ranges from `0.0` to `+inf`, with the best value being `0.0`.
Output Example(s):
```python
{'mape': 0.5}
```
If `multioutput="raw_values"`:
```python
{'mape': array([0.5, 1. ])}
```
#### Values from Popular Papers
### Examples
Example with the `uniform_average` config:
```python
>>> mape_metric = evaluate.load("mape")
>>> predictions = [2.5, 0.0, 2, 8]
>>> references = [3, -0.5, 2, 7]
>>> results = mape_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'mape': 0.3273...}
```
Example with multi-dimensional lists, and the `raw_values` config:
```python
>>> mape_metric = evaluate.load("mape", "multilist")
>>> predictions = [[0.5, 1], [-1, 1], [7, -6]]
>>> references = [[0.1, 2], [-1, 2], [8, -5]]
>>> results = mape_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'mape': 0.8874...}
>>> results = mape_metric.compute(predictions=predictions, references=references, multioutput='raw_values')
>>> print(results)
{'mape': array([1.3749..., 0.4])}
```
## Limitations and Bias
One limitation of MAPE is that it cannot be used if the ground truth is zero or close to zero. This metric is also asymmetric: percentage errors are unbounded for predictions above the ground truth but capped at 100% for (non-negative) predictions below it, so it penalizes over-prediction more heavily than under-prediction, and selecting models via this metric is therefore biased towards methods that under-predict.
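A small sketch of that asymmetry (the numbers are purely illustrative): for a true value of 100, a prediction that is ten times too low contributes a 90% error, whereas one that is ten times too high contributes 900%:
```python
>>> mape_metric = evaluate.load("mape")
>>> results = mape_metric.compute(predictions=[10], references=[100])    # 10x too low
>>> print(round(results["mape"], 2))
0.9
>>> results = mape_metric.compute(predictions=[1000], references=[100])  # 10x too high
>>> print(round(results["mape"], 2))
9.0
```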
## Citation(s)
```bibtex
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
```
```bibtex
@article{DEMYTTENAERE201638,
title = {Mean Absolute Percentage Error for regression models},
journal = {Neurocomputing},
volume = {192},
pages = {38--48},
year = {2016},
note = {Advances in artificial neural networks, machine learning and computational intelligence},
issn = {0925-2312},
doi = {https://doi.org/10.1016/j.neucom.2015.12.114},
url = {https://www.sciencedirect.com/science/article/pii/S0925231216003325},
author = {Arnaud {de Myttenaere} and Boris Golden and Bénédicte {Le Grand} and Fabrice Rossi},
}
```
## Further References
- [Mean absolute percentage error - Wikipedia](https://en.wikipedia.org/wiki/Mean_absolute_percentage_error)