title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
"""
_DESCRIPTION = """\
Mean Absolute Percentage Error (MAPE) is the mean of the absolute percentage differences between the predicted and actual values.
"""
_KWARGS_DESCRIPTION = """
Args:
predictions: array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
references: array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
sample_weight: array-like of shape (n_samples,), default=None
Sample weights.
multioutput: {"raw_values", "uniform_average"} or array-like of shape (n_outputs,), default="uniform_average"
Defines aggregating of multiple output values. Array-like value defines weights used to average errors.
"raw_values" : Returns a full set of errors in case of multioutput input.
"uniform_average" : Errors of all outputs are averaged with uniform weight.
Returns:
mape : mean absolute percentage error.
If multioutput is "raw_values", then mean absolute percentage error is returned for each output separately. If multioutput is "uniform_average" or an ndarray of weights, then the weighted average of all output errors is returned.
MAPE output is non-negative floating point. The best value is 0.0.
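Examples:
    >>> # a minimal illustrative sketch (assumes the metric loads under the name "mape"; toy values)
    >>> mape_metric = evaluate.load("mape")
    >>> predictions = [2.5, 0.0, 2, 8]
    >>> references = [3, -0.5, 2, 7]
    >>> results = mape_metric.compute(predictions=predictions, references=references)
    >>> # results["mape"] is approximately 0.327 for these values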
Mean Absolute Scaled Error (MASE) is the mean absolute error of the forecast values, divided by the mean absolute error of the in-sample one-step naive forecast on the training set.
---
# Metric Card for MASE
## Metric Description
Mean Absolute Scaled Error (MASE) is the mean absolute error of the forecast values, divided by the mean absolute error of the in-sample one-step naive forecast. For prediction $x_i$ and corresponding ground truth $y_i$, as well as training data $z_t$ with seasonality $p$, the metric is given by:

$$\mathrm{MASE} = \frac{\frac{1}{n}\sum_{i=1}^{n} |x_i - y_i|}{\frac{1}{T-p}\sum_{t=p+1}^{T} |z_t - z_{t-p}|},$$

where $n$ is the number of forecasts and $T$ is the length of the training series.

This metric:
* is independent of the scale of the data;
* has predictable behavior when predicted/ground-truth data is near zero;
* is symmetric;
* is interpretable, as values greater than one indicate that in-sample one-step forecasts from the naïve method perform better than the forecast values under consideration.
## How to Use
At minimum, this metric requires predictions, references and training data as inputs.
- `predictions`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the estimated target values.
- `references`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the ground truth (correct) target values.
- `training`: numeric array-like of shape (`n_train_samples,`) or (`n_train_samples`, `n_outputs`), representing the in-sample training data.
Optional arguments:
- `periodicity`: the seasonal periodicity of the training data. The default is 1.
- `sample_weight`: numeric array-like of shape (`n_samples,`) representing sample weights. The default is `None`.
- `multioutput`: `raw_values`, `uniform_average` or numeric array-like of shape (`n_outputs,`), which defines the aggregation of multiple output values. The default value is `uniform_average`.
  - `raw_values` returns a full set of errors in case of multioutput input.
  - `uniform_average` means that the errors of all outputs are averaged with uniform weight.
  - an array-like value defines the weights used to average the errors.
### Output Values
This metric outputs a dictionary containing the mean absolute scaled error score, which is of type:
- `float`: if multioutput is `uniform_average` or an ndarray of weights, then the weighted average of all output errors is returned.
- numeric array-like of shape (`n_outputs,`): if multioutput is `raw_values`, then the score is returned for each output separately.
Each MASE `float` value is non-negative, with the best value being `0.0`; values greater than `1.0` indicate that the forecast performs worse than the naïve in-sample one-step forecast.
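For instance, a minimal sketch (toy values chosen for illustration; the metric is assumed to load under the name `mase`):

```python
>>> mase_metric = evaluate.load("mase")
>>> predictions = [2.5, 0.0, 2, 8, 1.25]
>>> references = [3, -0.5, 2, 7, 2]
>>> training = [5, 0.5, 4, 6, 3, 5, 2]
>>> results = mase_metric.compute(predictions=predictions, references=references, training=training)
>>> print(round(results["mase"], 3))  # MAE(forecast) = 0.55, MAE(naive in-sample) = 3.0
0.183
```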
Mean Absolute Scaled Error (MASE) is the mean absolute error of the forecast values, divided by the mean absolute error of the in-sample one-step naive forecast.
"""
_KWARGS_DESCRIPTION = """
Args:
predictions: array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
references: array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
training: array-like of shape (n_train_samples,) or (n_train_samples, n_outputs)
In-sample training data for the naive forecast.
periodicity: int, default=1
Seasonal periodicity of training data.
sample_weight: array-like of shape (n_samples,), default=None
Sample weights.
multioutput: {"raw_values", "uniform_average"} or array-like of shape (n_outputs,), default="uniform_average"
Defines aggregating of multiple output values. Array-like value defines weights used to average errors.
"raw_values" : Returns a full set of errors in case of multioutput input.
"uniform_average" : Errors of all outputs are averaged with uniform weight.
Returns:
mase : mean absolute scaled error.
If multioutput is "raw_values", then mean absolute percentage error is returned for each output separately. If multioutput is "uniform_average" or an ndarray of weights, then the weighted average of all output errors is returned.
MASE output is non-negative floating point. The best value is 0.0.
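Examples:
    >>> # a minimal illustrative sketch (assumes the metric loads under the name "mase"; toy values)
    >>> mase_metric = evaluate.load("mase")
    >>> predictions = [2.5, 0.0, 2, 8, 1.25]
    >>> references = [3, -0.5, 2, 7, 2]
    >>> training = [5, 0.5, 4, 6, 3, 5, 2]
    >>> results = mase_metric.compute(predictions=predictions, references=references, training=training)
    >>> # results["mase"] is approximately 0.183 for these values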
- **`predictions`** (`list` of `int`s): Predicted class labels.
- **`references`** (`list` of `int`s): Ground truth labels.
- **`sample_weight`** (`list` of `int`s, `float`s, or `bool`s): Sample weights. Defaults to `None`.
- **`average`** (`None` or `macro`): For the multilabel case, whether to return one correlation coefficient per feature (`average=None`), or the average of them (`average='macro'`). Defaults to `None`.
### Output Values
- **`matthews_correlation`** (`float` or `list` of `float`s): Matthews correlation coefficient, or list of them in the multilabel case without averaging.
The metric output takes the following form:
```python
{'matthews_correlation': 0.54}
```
The Matthews correlation coefficient can take any value from -1 to +1, inclusive.
#### Values from Popular Papers
### Examples
A basic example with only predictions and references as inputs:
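The sketch below assumes the metric loads under the name `matthews_correlation`; the labels are toy values chosen so the coefficient is easy to verify by hand:

```python
>>> matthews_metric = evaluate.load("matthews_correlation")
>>> results = matthews_metric.compute(references=[0, 1, 1, 0], predictions=[0, 1, 0, 0])
>>> # TP=1, TN=2, FP=0, FN=1, so MCC = 2 / sqrt(12) ≈ 0.577
>>> print(round(results["matthews_correlation"], 3))
0.577
```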
## Citation
```bibtex
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
```
## Further References
- This Hugging Face implementation uses [this scikit-learn implementation](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)
MAUVE is a measure of the statistical gap between two text distributions, e.g., how far the text written by a model is from the distribution of human text, using samples from both distributions.
MAUVE is obtained by computing Kullback–Leibler (KL) divergences between the two distributions in a quantized embedding space of a large language model. It can quantify differences in the quality of generated text based on the size of the model, the decoding algorithm, and the length of the generated text. MAUVE was found to correlate the strongest with human evaluations over baseline metrics for open-ended text generation.
---
# Metric Card for MAUVE
## Metric description
MAUVE is a measure of the gap between neural text and human text. It is computed using the [Kullback–Leibler (KL) divergences](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) between the two distributions of text in a quantized embedding space of a large language model. MAUVE can identify differences in quality arising from model sizes and decoding algorithms.
This metric is a wrapper around the [official implementation](https://github.com/krishnap25/mauve) of MAUVE.
For more details, consult the [MAUVE paper](https://arxiv.org/abs/2102.01454).
## How to use
The metric takes two lists of strings of tokens separated by spaces: one representing `predictions` (i.e. the text generated by the model) and the second representing `references` (a reference text for each prediction):
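A minimal sketch of the basic call (toy strings shown here; note that computing the score downloads the default `gpt2-large` featurization model and benefits from a GPU):

```python
>>> mauve = evaluate.load('mauve')
>>> predictions = ["hello there", "general kenobi"]
>>> references = ["hello there", "general kenobi"]
>>> mauve_results = mauve.compute(predictions=predictions, references=references)
```

It also has several optional arguments: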
`num_buckets`: the size of the histogram to quantize P and Q. Options: `auto` (default) or an integer.
`pca_max_data`: the number of data points to use for PCA dimensionality reduction prior to clustering. If -1, use all the data. The default is `-1`.
`kmeans_explained_var`: the amount of variance of the data to keep in dimensionality reduction by PCA. The default is `0.9`.
`kmeans_num_redo`: number of times to redo k-means clustering (the best objective is kept). The default is `5`.
`kmeans_max_iter`: maximum number of k-means iterations. The default is `500`.
`featurize_model_name`: name of the model from which features are obtained, from one of the following: `gpt2`, `gpt2-medium`, `gpt2-large`, `gpt2-xl`. The default is `gpt2-large`.
`device_id`: Device for featurization. Supply a GPU id (e.g. `0` or `3`) to use GPU. If no GPU with this id is found, the metric will use CPU.
`max_text_length`: maximum number of tokens to consider. The default is `1024`.
`divergence_curve_discretization_size`: the number of points to consider on the divergence curve. The default is `25`.
`mauve_scaling_factor`: Hyperparameter for scaling. The default is `5`.
`verbose`: If `True` (default), running the metric will print running time updates.
`seed`: random seed to initialize k-means cluster assignments, randomly assigned by default.
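For instance, an illustrative sketch of overriding the featurization model and the seed (argument names as listed above):

```python
>>> mauve_results = mauve.compute(
...     predictions=predictions,
...     references=references,
...     featurize_model_name="gpt2",  # smaller than the default gpt2-large
...     seed=42,                      # fix the k-means initialization for reproducibility
... )
```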
## Output values
This metric outputs a dictionary with 5 key-value pairs:
`mauve`: MAUVE score, which ranges between 0 and 1. **Larger** values indicate that P and Q are closer.
`frontier_integral`: Frontier Integral, which ranges between 0 and 1. **Smaller** values indicate that P and Q are closer.
`divergence_curve`: a numpy.ndarray of shape (m, 2); plot it with `matplotlib` to view the divergence curve.
`p_hist`: a discrete distribution, which is a quantized version of the text distribution `p_text`.
`q_hist`: same as above, but with `q_text`.
### Values from popular papers
The [original MAUVE paper](https://arxiv.org/abs/2102.01454) reported values ranging from 0.88 to 0.94 for open-ended text generation using a text completion task in the web text domain. The authors found that bigger models resulted in higher MAUVE scores and that MAUVE is correlated with human judgments.
## Limitations and bias
The [original MAUVE paper](https://arxiv.org/abs/2102.01454) did not analyze the inductive biases present in different embedding models, but related work has shown different kinds of biases exist in many popular generative language models including GPT-2 (see [Kirk et al., 2021](https://arxiv.org/pdf/2102.04130.pdf), [Abid et al., 2021](https://arxiv.org/abs/2101.05783)). The extent to which these biases can impact the MAUVE score has not been quantified.
Also, calculating the MAUVE metric involves downloading the model from which features are obtained -- the default model, `gpt2-large`, takes over 3GB of storage space and downloading it can take a significant amount of time depending on the speed of your internet connection. If this is an issue, choose a smaller model; for instance, `gpt2` is 523MB.
It is a good idea to use at least 1000 samples for each distribution to compute MAUVE (the original paper uses 5000).
MAUVE is unable to identify very small differences between different settings of generation (e.g., between top-p sampling with p=0.95 versus 0.96). It is important, therefore, to account for the randomness inside the generation (e.g., due to sampling) and within the MAUVE estimation procedure (see the `seed` parameter above). Concretely, it is a good idea to obtain generations using multiple random seeds and/or to rerun MAUVE with multiple values of the parameter `seed`.
For MAUVE to be large, the model distribution must be close to the human text distribution as seen by the embeddings. It is possible to have high-quality model text that still has a small MAUVE score (i.e., large gap) if it contains text about different topics/subjects, or uses a different writing style or vocabulary, or contains texts of a different length distribution. MAUVE summarizes the statistical gap (as measured by the large language model embeddings) --- this includes all these factors in addition to the quality-related aspects such as grammaticality.
See the [official implementation](https://github.com/krishnap25/mauve#best-practices-for-mauve) for more details about best practices.
## Citation
```bibtex
@inproceedings{pillutla-etal:mauve:neurips2021,
title={MAUVE: Measuring the Gap Between Neural Text and Human Text using Divergence Frontiers},
author={Pillutla, Krishna and Swayamdipta, Swabha and Zellers, Rowan and Thickstun, John and Welleck, Sean and Choi, Yejin and Harchaoui, Zaid},
booktitle={NeurIPS},
year={2021}
}
```
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" MAUVE metric from https://github.com/krishnap25/mauve. """
import datasets
import faiss  # Here to have a nice missing dependency error message early on
import numpy  # Here to have a nice missing dependency error message early on
import requests  # Here to have a nice missing dependency error message early on
import sklearn  # Here to have a nice missing dependency error message early on
import tqdm  # Here to have a nice missing dependency error message early on
from mauve import compute_mauve  # From: mauve-text

import evaluate
_CITATION = """\
@inproceedings{pillutla-etal:mauve:neurips2021,
title={{MAUVE: Measuring the Gap Between Neural Text and Human Text using Divergence Frontiers}},
author={Pillutla, Krishna and Swayamdipta, Swabha and Zellers, Rowan and Thickstun, John and Welleck, Sean and Choi, Yejin and Harchaoui, Zaid},
booktitle = {NeurIPS},
year = {2021}
}
@article{pillutla-etal:mauve:arxiv2022,
title={{MAUVE Scores for Generative Models: Theory and Practice}},
author={Pillutla, Krishna and Liu, Lang and Thickstun, John and Welleck, Sean and Swayamdipta, Swabha and Zellers, Rowan and Oh, Sewoong and Choi, Yejin and Harchaoui, Zaid},
journal={arXiv Preprint},
year={2022}
}
"""
_DESCRIPTION = """\
MAUVE is a measure of the statistical gap between two text distributions, e.g., how far the text written by a model is from the distribution of human text, using samples from both distributions.
MAUVE is obtained by computing Kullback–Leibler (KL) divergences between the two distributions in a quantized embedding space of a large language model.
It can quantify differences in the quality of generated text based on the size of the model, the decoding algorithm, and the length of the generated text.
MAUVE was found to correlate the strongest with human evaluations over baseline metrics for open-ended text generation.
This metric is a wrapper around the official implementation of MAUVE:
https://github.com/krishnap25/mauve
"""
_KWARGS_DESCRIPTION = """
Calculates MAUVE scores between two lists of generated text and reference text.
Args:
predictions: list of generated text to score. Each prediction
should be a string with tokens separated by spaces.
references: list of references, one for each prediction. Each
reference should be a string with tokens separated by spaces.
Optional Args:
num_buckets: the size of the histogram to quantize P and Q. Options: 'auto' (default) or an integer
pca_max_data: the number of data points to use for PCA dimensionality reduction prior to clustering. If -1, use all the data. Default -1
kmeans_explained_var: amount of variance of the data to keep in dimensionality reduction by PCA. Default 0.9
kmeans_num_redo: number of times to redo k-means clustering (the best objective is kept). Default 5
kmeans_max_iter: maximum number of k-means iterations. Default 500
featurize_model_name: name of the model from which features are obtained. Default 'gpt2-large' Use one of ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'].
device_id: Device for featurization. Supply a GPU id (e.g. 0 or 3) to use GPU. If no GPU with this id is found, use CPU
max_text_length: maximum number of tokens to consider. Default 1024
divergence_curve_discretization_size: Number of points to consider on the divergence curve. Default 25
mauve_scaling_factor: "c" from the paper. Default 5.
verbose: If True (default), print running time updates
seed: random seed to initialize k-means cluster assignments.
Returns:
mauve: MAUVE score, a number between 0 and 1. Larger values indicate that P and Q are closer,
frontier_integral: Frontier Integral, a number between 0 and 1. Smaller values indicate that P and Q are closer,
divergence_curve: a numpy.ndarray of shape (m, 2); plot it with matplotlib to view the divergence curve,
p_hist: a discrete distribution, which is a quantized version of the text distribution p_text,
q_hist: same as above, but with q_text.
Examples:
>>> # faiss segfaults in doctest for some reason, so the .compute call is not tested with doctest
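>>> # A minimal usage sketch (assumes the mauve-text package is installed; calls are skipped by doctest):
>>> import evaluate
>>> mauve = evaluate.load('mauve')
>>> predictions = ["hello there", "general kenobi"]
>>> references = ["hello there", "general kenobi"]
>>> out = mauve.compute(predictions=predictions, references=references) # doctest: +SKIP
>>> print(out.mauve) # doctest: +SKIP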
IoU is the area of overlap between the predicted segmentation and the ground truth divided by the area of union
between the predicted segmentation and the ground truth. For binary (two classes) or multi-class segmentation,
the mean IoU of the image is calculated by taking the IoU of each class and averaging them.
---
# Metric Card for Mean IoU
## Metric Description
IoU (Intersection over Union) is the area of overlap between the predicted segmentation and the ground truth divided by the area of union between the predicted segmentation and the ground truth.
For binary (two classes) or multi-class segmentation, the *mean IoU* of the image is calculated by taking the IoU of each class and averaging them.
## How to Use
The Mean IoU metric takes two lists of numeric 2D arrays as input corresponding to the predicted and ground truth segmentations:
- `predictions` (`List[ndarray]`): List of predicted segmentation maps, each of shape (height, width). Each segmentation map can be of a different size.
- `references` (`List[ndarray]`): List of ground truth segmentation maps, each of shape (height, width). Each segmentation map can be of a different size.
- `num_labels` (`int`): Number of classes (categories).
- `ignore_index` (`int`): Index that will be ignored during evaluation.
**Optional inputs**
- `nan_to_num` (`int`): If specified, NaN values will be replaced by the number defined by the user.
- `label_map` (`dict`): If specified, dictionary mapping old label indices to new label indices.
- `reduce_labels` (`bool`): Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The background label will be replaced by 255. The default value is `False`.
### Output Values
The metric returns a dictionary with the following elements:
- `mean_iou` (`float`): Mean Intersection-over-Union (IoU averaged over all categories).
- `mean_accuracy` (`float`): Mean accuracy (averaged over all categories).
- `overall_accuracy` (`float`): Overall accuracy on all images.
- `per_category_accuracy` (`ndarray` of shape `(num_labels,)`): Per category accuracy.
- `per_category_iou` (`ndarray` of shape `(num_labels,)`): Per category IoU.
The values of all of the reported scores range from `0.0` (minimum) to `1.0` (maximum).
The [leaderboard for the CityScapes dataset](https://paperswithcode.com/sota/semantic-segmentation-on-cityscapes) reports a Mean IOU ranging from 64 to 84; that of [ADE20k](https://paperswithcode.com/sota/semantic-segmentation-on-ade20k) ranges from 30 to a peak of 59.9, indicating that the dataset is more difficult for current approaches (as of 2022).
### Examples
```python
>>> import numpy as np
>>> mean_iou = evaluate.load("mean_iou")
>>> # suppose one has 3 different segmentation maps predicted
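>>> # an illustrative continuation: only one toy map shown for brevity (values are made up; 10 classes, 255 as ignore index)
>>> predicted = np.array([[2, 2, 3], [8, 2, 4], [3, 255, 2]])
>>> ground_truth = np.array([[1, 2, 2], [8, 2, 1], [3, 255, 1]])
>>> results = mean_iou.compute(predictions=[predicted], references=[ground_truth], num_labels=10, ignore_index=255)
>>> # results is a dictionary with mean_iou, mean_accuracy, overall_accuracy and the per-category arrays
```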
## Limitations and Bias
Mean IoU is an average metric, so it will not show you where model predictions differ from the ground truth (i.e. if there are particular regions or classes that the model does poorly on). Further error analysis is needed to gather actionable insights that can be used to inform model improvements.
METEOR is an automatic metric for machine translation evaluation
that is based on a generalized concept of unigram matching between the
machine-produced translation and human-produced reference translations.
Unigrams can be matched based on their surface forms, stemmed forms,
and meanings; furthermore, METEOR can be easily extended to include more
advanced matching strategies. Once all generalized unigram matches
between the two strings have been found, METEOR computes a score for
this matching using a combination of unigram-precision, unigram-recall, and
a measure of fragmentation that is designed to directly capture how
well-ordered the matched words in the machine translation are in relation
to the reference.
METEOR gets an R correlation value of 0.347 with human evaluation on the Arabic
data and 0.331 on the Chinese data. This is shown to be an improvement on
using simply unigram-precision, unigram-recall and their harmonic F1
combination.
---
# Metric Card for METEOR
## Metric description
METEOR (Metric for Evaluation of Translation with Explicit ORdering) is a machine translation evaluation metric, which is calculated based on the harmonic mean of precision and recall, with recall weighted more than precision.
METEOR is based on a generalized concept of unigram matching between the machine-produced translation and human-produced reference translations. Unigrams can be matched based on their surface forms, stemmed forms, and meanings. Once all generalized unigram matches between the two strings have been found, METEOR computes a score for this matching using a combination of unigram-precision, unigram-recall, and a measure of fragmentation that is designed to directly capture how well-ordered the matched words in the machine translation are in relation to the reference.
## How to use
METEOR has two mandatory arguments:
`predictions`: a `list` of predictions to score. Each prediction should be a string with tokens separated by spaces.
`references`: a `list` of references (in the case of one `reference` per `prediction`), or a `list` of `lists` of references (in the case of multiple `references` per `prediction`). Each reference should be a string with tokens separated by spaces.
It also has several optional parameters:
`alpha`: Parameter for controlling relative weights of precision and recall. The default value is `0.9`.
`beta`: Parameter for controlling shape of penalty as a function of fragmentation. The default value is `3`.
`gamma`: The relative weight assigned to fragmentation penalty. The default is `0.5`.
Refer to the [METEOR paper](https://aclanthology.org/W05-0909.pdf) for more information about parameter values and ranges.
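As a rough sketch of how these parameters enter the score (this follows the parameterization used in NLTK's `meteor_score`, a common implementation; treat it as illustrative rather than as the exact formula of this wrapper):

$$F_{mean} = \frac{P\,R}{\alpha P + (1-\alpha)R}, \qquad \text{Penalty} = \gamma \left(\frac{\#\text{chunks}}{\#\text{matches}}\right)^{\beta}, \qquad \text{METEOR} = F_{mean}\,(1 - \text{Penalty}),$$

where $P$ and $R$ are unigram precision and recall, and a chunk is a maximal run of matched unigrams that appear adjacent and in the same order in both strings.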
```python
>>> meteor = evaluate.load('meteor')
>>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
>>> references = ["It is a guide to action that ensures that the military will forever heed Party commands"]
## Output values
The metric outputs a dictionary containing the METEOR score. Its values range from 0 to 1, e.g.:
```
{'meteor': 0.9999142661179699}
```
### Values from popular papers
The [METEOR paper](https://aclanthology.org/W05-0909.pdf) does not report METEOR score values for different models, but it does report that METEOR gets an R correlation value of 0.347 with human evaluation on the Arabic data and 0.331 on the Chinese data.
## Examples
One `reference` per `prediction`:
```python
>>> meteor = evaluate.load('meteor')
>>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
>>> reference = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
```

Multiple `references` per `prediction`:

```python
>>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
>>> references = [['It is a guide to action that ensures that the military will forever heed Party commands', 'It is the guiding principle which guarantees the military forces always being under the command of the Party', 'It is the practical guide for the army always to heed the directions of the party']]
```
Multiple `references` per `prediction`, partial match:
```python
>>> meteor = evaluate.load('meteor')
>>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
>>> references = [['It is a guide to action that ensures that the military will forever heed Party commands', 'It is the guiding principle which guarantees the military forces always being under the command of the Party', 'It is the practical guide for the army always to heed the directions of the party']]
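>>> # a sketch of the scoring call for the partial-match case (exact value depends on the implementation version)
>>> results = meteor.compute(predictions=predictions, references=references)
```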
## Limitations and bias
While the correlation between METEOR and human judgments was measured for Chinese and Arabic and found to be significant, further experimentation is needed to check its correlation for other languages.
Furthermore, while the alignment and matching done in METEOR is based on unigrams, using multiple word entities (e.g. bigrams) could contribute to improving its accuracy -- this has been proposed in [more recent publications](https://www.cs.cmu.edu/~alavie/METEOR/pdf/meteor-naacl-2010.pdf) on the subject.
## Citation
```bibtex
@inproceedings{banarjee2005,
title={{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments},
author={Banerjee, Satanjeev and Lavie, Alon},
booktitle={Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization},
month=jun,
year={2005},
address={Ann Arbor, Michigan},
publisher={Association for Computational Linguistics},