# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
from datasets import Dataset
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_start_docstrings
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
TASK_DOCUMENTATION_KWARGS = r"""
input_column (`str`, defaults to `"text"`):
The name of the column containing the input text in the dataset specified by `data`.
label_column (`str`, defaults to `"label"`):
The name of the column containing the labels in the dataset specified by `data`.
generation_kwargs (`Dict`, *optional*, defaults to `None`):
The generation kwargs are passed to the pipeline and set the text generation strategy.
"""
TEXT2TEXT_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text2text-generation")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> metric="rouge",
>>> )
```
"""
SUMMARIZATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("summarization")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> )
```
"""
TRANSLATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("translation")
>>> data = load_dataset("wmt19", "fr-de", split="validation[:40]")
>>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]})
>>> results = task_evaluator.compute(
>>> model_or_pipeline="Helsinki-NLP/opus-mt-de-fr",
>>> data=data,
>>> )
```
"""
class Text2TextGenerationEvaluator(Evaluator):
"""
Text2Text generation evaluator.
This Text2Text generation evaluator can currently be loaded from [`evaluator`] using the default task name
`text2text-generation`.
Methods in this class assume a data format compatible with the [`~transformers.Text2TextGenerationPipeline`].
"""
PREDICTION_PREFIX = "generated"
PIPELINE_KWARGS = {"truncation": True}
def __init__(self, task="text2text-generation", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def predictions_processor(self, predictions, label_mapping):
return {"predictions": [pred[f"{self.PREDICTION_PREFIX}_text"] for pred in predictions]}
@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TEXT2TEXT_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "text",
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
if generation_kwargs is not None:
self.PIPELINE_KWARGS.update(generation_kwargs)
result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
subset=subset,
split=split,
metric=metric,
tokenizer=tokenizer,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
device=device,
random_state=random_state,
input_column=input_column,
label_column=label_column,
)
return result
class SummarizationEvaluator(Text2TextGenerationEvaluator):
"""
Text summarization evaluator.
This text summarization evaluator can currently be loaded from [`evaluator`] using the default task name
`summarization`.
Methods in this class assume a data format compatible with the [`~transformers.SummarizationPipeline`].
"""
PREDICTION_PREFIX = "summary"
PIPELINE_KWARGS = {"truncation": True}
def __init__(self, task="summarization", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
SUMMARIZATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "text",
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
subset=subset,
split=split,
metric=metric,
tokenizer=tokenizer,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
device=device,
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)
return result
class TranslationEvaluator(Text2TextGenerationEvaluator):
"""
Translation evaluator.
This translation generation evaluator can currently be loaded from [`evaluator`] using the default task name
`translation`.
Methods in this class assume a data format compatible with the [`~transformers.TranslationPipeline`].
"""
PREDICTION_PREFIX = "translation"
PIPELINE_KWARGS = {"truncation": True}
def __init__(self, task="translation", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TRANSLATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "text",
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
subset=subset,
split=split,
metric=metric,
tokenizer=tokenizer,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
device=device,
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)
return result
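# Illustrative sketch, not part of the original module: the three evaluators above mainly differ in
# `PREDICTION_PREFIX`, which selects the key read from the pipeline output ("generated_text",
# "summary_text" or "translation_text"). Assumes the optional evaluator dependencies
# (e.g. transformers) are installed so the class can be instantiated.
if __name__ == "__main__":
    _t2t_evaluator = Text2TextGenerationEvaluator()
    _raw_pipeline_output = [{"generated_text": "hello world"}]
    assert _t2t_evaluator.predictions_processor(_raw_pipeline_output, label_mapping=None) == {
        "predictions": ["hello world"]
    }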
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from numbers import Number
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
from datasets import Dataset, load_dataset
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_end_docstrings, add_start_docstrings
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
from .utils import DatasetColumnPair
if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
TASK_DOCUMENTATION = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text-classification")
>>> data = load_dataset("imdb", split="test[:2]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="huggingface/prunebert-base-uncased-6-finepruned-w-distil-mnli",
>>> data=data,
>>> metric="accuracy",
>>> label_mapping={"LABEL_0": 0.0, "LABEL_1": 1.0},
>>> strategy="bootstrap",
>>> n_resamples=10,
>>> random_state=0
>>> )
```
"""
class TextClassificationEvaluator(Evaluator):
"""
Text classification evaluator.
This text classification evaluator can currently be loaded from [`evaluator`] using the default task name
`text-classification` or with a `"sentiment-analysis"` alias.
Methods in this class assume a data format compatible with the [`~transformers.TextClassificationPipeline`] - a single textual
feature as input and a categorical label as output.
"""
PIPELINE_KWARGS = {"truncation": True}
def __init__(self, task="text-classification", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def prepare_data(self, data: Union[str, Dataset], input_column: str, second_input_column: str, label_column: str):
if data is None:
raise ValueError(
"Please specify a valid `data` object - either a `str` with a name or a `Dataset` object."
)
self.check_required_columns(data, {"input_column": input_column, "label_column": label_column})
if second_input_column is not None:
self.check_required_columns(data, {"second_input_column": second_input_column})
data = load_dataset(data) if isinstance(data, str) else data
return {"references": data[label_column]}, DatasetColumnPair(
data, input_column, second_input_column, "text", "text_pair"
)
def predictions_processor(self, predictions, label_mapping):
predictions = [
label_mapping[element["label"]] if label_mapping is not None else element["label"]
for element in predictions
]
return {"predictions": predictions}
@add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
@add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "text",
second_input_column: Optional[str] = None,
label_column: str = "label",
label_mapping: Optional[Dict[str, Number]] = None,
) -> Tuple[Dict[str, float], Any]:
"""
input_column (`str`, *optional*, defaults to `"text"`):
The name of the column containing the text feature in the dataset specified by `data`.
second_input_column (`str`, *optional*, defaults to `None`):
The name of the second column containing the text features. This may be useful for classification tasks
such as MNLI, where two columns are used.
label_column (`str`, defaults to `"label"`):
The name of the column containing the labels in the dataset specified by `data`.
label_mapping (`Dict[str, Number]`, *optional*, defaults to `None`):
Used to map the class labels produced by the model in the pipeline to values consistent with those
defined in the `label_column` of the `data` dataset.
"""
result = {}
self.check_for_mismatch_in_device_setup(device, model_or_pipeline)
# Prepare inputs
data = self.load_data(data=data, subset=subset, split=split)
metric_inputs, pipe_inputs = self.prepare_data(
data=data, input_column=input_column, second_input_column=second_input_column, label_column=label_column
)
pipe = self.prepare_pipeline(
model_or_pipeline=model_or_pipeline,
tokenizer=tokenizer,
feature_extractor=feature_extractor,
device=device,
)
metric = self.prepare_metric(metric)
# Compute predictions
predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
predictions = self.predictions_processor(predictions, label_mapping)
metric_inputs.update(predictions)
# Compute metrics from references and predictions
metric_results = self.compute_metric(
metric=metric,
metric_inputs=metric_inputs,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
random_state=random_state,
)
result.update(metric_results)
result.update(perf_results)
return result
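# Illustrative sketch, not part of the original module: how `predictions_processor` applies
# `label_mapping` to raw text-classification pipeline outputs. The label names and scores below
# are made up; assumes the optional evaluator dependencies (e.g. transformers) are installed.
if __name__ == "__main__":
    _clf_evaluator = TextClassificationEvaluator()
    _raw_pipeline_output = [
        {"label": "LABEL_1", "score": 0.9},
        {"label": "LABEL_0", "score": 0.8},
    ]
    assert _clf_evaluator.predictions_processor(
        _raw_pipeline_output, label_mapping={"LABEL_0": 0, "LABEL_1": 1}
    ) == {"predictions": [1, 0]}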
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Tuple
from datasets import Dataset
from .base import Evaluator
from .utils import DatasetColumn
TASK_DOCUMENTATION_KWARGS = r"""
input_column (`str`, defaults to `"text"`):
The name of the column containing the input text in the dataset specified by `data`.
generation_kwargs (`Dict`, *optional*, defaults to `None`):
The generation kwargs are passed to the pipeline and set the text generation strategy.
"""
class TextGenerationEvaluator(Evaluator):
"""
Text generation evaluator.
This Text generation evaluator can currently be loaded from [`evaluator`] using the default task name
`text-generation`.
Methods in this class assume a data format compatible with the [`~transformers.TextGenerationPipeline`].
"""
def predictions_processor(self, predictions, *args, **kwargs):
"""
Args:
predictions: A list of lists of dicts
Returns:
`dict`: All the generated texts are flattened and stored under the "data" key.
"""
return {"data": [pred[f"{self.predictions_prefix}_text"] for pred_list in predictions for pred in pred_list]}
def __init__(self, task="text-generation", default_metric_name=None, predictions_prefix: str = "generated"):
super().__init__(task=task, default_metric_name=default_metric_name)
self.predictions_prefix = predictions_prefix
def prepare_data(self, data: Dataset, input_column: str, *args, **kwargs) -> Tuple[Dict, DatasetColumn]:
"""
Prepare data.
Args:
data ([`Dataset`]):
Specifies the dataset we will run evaluation on.
input_column (`str`, defaults to `"text"`):
The name of the column containing the text feature in the dataset specified by `data`.
Returns:
`dict`: metric inputs.
`list`: pipeline inputs.
"""
self.check_required_columns(data, {"input_column": input_column})
return {}, DatasetColumn(data, input_column)
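# Illustrative sketch, not part of the original module: the text-generation pipeline returns one
# list of candidate generations per input; `predictions_processor` flattens them under the "data"
# key using `predictions_prefix` ("generated" -> "generated_text"). Assumes the optional evaluator
# dependencies (e.g. transformers) are installed.
if __name__ == "__main__":
    _gen_evaluator = TextGenerationEvaluator()
    _raw_pipeline_output = [[{"generated_text": "Hello world"}], [{"generated_text": "Foo bar"}]]
    assert _gen_evaluator.predictions_processor(_raw_pipeline_output) == {
        "data": ["Hello world", "Foo bar"]
    }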
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
from datasets import ClassLabel, Dataset, Sequence
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_end_docstrings, add_start_docstrings
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
from .utils import DatasetColumn
if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
TASK_DOCUMENTATION = r"""
The dataset input and label columns are expected to be formatted as a list of words and a list of labels respectively, following [conll2003 dataset](https://huggingface.co/datasets/conll2003). Datasets whose inputs are single strings and whose labels are a list of offsets are not supported.
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("token-classification")
>>> data = load_dataset("conll2003", split="validation[:2]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="elastic/distilbert-base-uncased-finetuned-conll03-english",
>>> data=data,
>>> metric="seqeval",
>>> )
```
<Tip>
For example, the following dataset format is accepted by the evaluator:
```python
dataset = Dataset.from_dict(
mapping={
"tokens": [["New", "York", "is", "a", "city", "and", "Felix", "a", "person", "."]],
"ner_tags": [[1, 2, 0, 0, 0, 0, 3, 0, 0, 0]],
},
features=Features({
"tokens": Sequence(feature=Value(dtype="string")),
"ner_tags": Sequence(feature=ClassLabel(names=["O", "B-LOC", "I-LOC", "B-PER", "I-PER"])),
}),
)
```
</Tip>
<Tip warning={true}>
For example, the following dataset format is **not** accepted by the evaluator:
```python
dataset = Dataset.from_dict(
mapping={
"tokens": [["New York is a city and Felix a person."]],
"starts": [[0, 23]],
"ends": [[7, 27]],
"ner_tags": [["LOC", "PER"]],
},
features=Features({
"tokens": Value(dtype="string"),
"starts": Sequence(feature=Value(dtype="int32")),
"ends": Sequence(feature=Value(dtype="int32")),
"ner_tags": Sequence(feature=Value(dtype="string")),
}),
)
```
</Tip>
"""
class TokenClassificationEvaluator(Evaluator):
"""
Token classification evaluator.
This token classification evaluator can currently be loaded from [`evaluator`] using the default task name
`token-classification`.
Methods in this class assume a data format compatible with the [`~transformers.TokenClassificationPipeline`].
"""
PIPELINE_KWARGS = {"ignore_labels": []}
def __init__(self, task="token-classification", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def predictions_processor(self, predictions: List[List[Dict]], words: List[List[str]], join_by: str):
"""
Transform the pipeline predictions into a list of predicted labels of the same length as the true labels.
Args:
predictions (`List[List[Dict]]`):
List of pipeline predictions, where each token has been labeled.
words (`List[List[str]]`):
Original input data to the pipeline, used to build predicted labels of the same length.
join_by (`str`):
String to use to join two words. In English, it will typically be " ".
Returns:
`dict`: a dictionary holding the predictions
"""
preds = []
# iterate over the data rows
for i, prediction in enumerate(predictions):
pred_processed = []
# get a list of tuples giving the indexes of the start and end character of each word
words_offsets = self.words_to_offsets(words[i], join_by)
token_index = 0
for word_offset in words_offsets:
# for each word, we keep only the predicted label for the first token and discard the others
while prediction[token_index]["start"] < word_offset[0]:
token_index += 1
if prediction[token_index]["start"] > word_offset[0]: # bad indexing
pred_processed.append("O")
elif prediction[token_index]["start"] == word_offset[0]:
pred_processed.append(prediction[token_index]["entity"])
preds.append(pred_processed)
return {"predictions": preds}
def words_to_offsets(self, words: List[str], join_by: str):
"""
Convert a list of words to a list of offsets, where words are joined by `join_by`.
Args:
words (`List[str]`):
List of words to get offsets from.
join_by (`str`):
String to insert between words.
Returns:
`List[Tuple[int, int]]`: List of character (start index, end index) tuples for each of the words.
"""
offsets = []
start = 0
for word in words:
end = start + len(word) - 1
offsets.append((start, end))
start = end + len(join_by) + 1
return offsets
def prepare_data(self, data: Union[str, Dataset], input_column: str, label_column: str, join_by: str):
super().prepare_data(data, input_column, label_column)
if not isinstance(data.features[input_column], Sequence) or not isinstance(
data.features[label_column], Sequence
):
raise ValueError(
"TokenClassificationEvaluator expects the input and label columns to be provided as lists."
)
# If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere.
# Otherwise, we have to get the list of labels manually.
labels_are_int = isinstance(data.features[label_column].feature, ClassLabel)
if labels_are_int:
label_list = data.features[label_column].feature.names # list of string labels
id_to_label = {i: label for i, label in enumerate(label_list)}
references = [[id_to_label[label_id] for label_id in label_ids] for label_ids in data[label_column]]
elif data.features[label_column].feature.dtype.startswith("int"):
raise NotImplementedError(
"References provided as integers, but the reference column is not a Sequence of ClassLabels."
)
else:
# In the event the labels are not a `Sequence[ClassLabel]`, the labels are already strings
# An example is labels as ["PER", "PER", "O", "LOC", "O", "LOC", "O"], e.g. in polyglot_ner dataset
references = data[label_column]
metric_inputs = {"references": references}
data = data.map(lambda x: {input_column: join_by.join(x[input_column])})
pipeline_inputs = DatasetColumn(data, input_column)
return metric_inputs, pipeline_inputs
def prepare_pipeline(
self,
model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"], # noqa: F821
tokenizer: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None, # noqa: F821
feature_extractor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None, # noqa: F821
device: int = None,
):
pipe = super().prepare_pipeline(model_or_pipeline, tokenizer, feature_extractor, device)
# check that the pipeline outputs the start character index in its predictions
dummy_output = pipe(["2003 New York Gregory"], **self.PIPELINE_KWARGS)
if dummy_output[0][0]["start"] is None:
raise ValueError(
"TokenClassificationEvaluator supports only pipelines giving 'start' index as a pipeline output (got None). "
"Transformers pipelines with a slow tokenizer will raise this error."
)
return pipe
@add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
@add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: str = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: Optional[int] = None,
random_state: Optional[int] = None,
input_column: str = "tokens",
label_column: str = "ner_tags",
join_by: Optional[str] = " ",
) -> Tuple[Dict[str, float], Any]:
"""
input_column (`str`, defaults to `"tokens"`):
The name of the column containing the tokens feature in the dataset specified by `data`.
label_column (`str`, defaults to `"ner_tags"`):
The name of the column containing the labels in the dataset specified by `data`.
join_by (`str`, *optional*, defaults to `" "`):
This evaluator supports datasets whose input column is a list of words. This parameter specifies how to join
the words to generate a string input. This is especially useful for languages that do not separate words by a space.
"""
result = {}
self.check_for_mismatch_in_device_setup(device, model_or_pipeline)
# Prepare inputs
data = self.load_data(data=data, subset=subset, split=split)
metric_inputs, pipe_inputs = self.prepare_data(
data=data, input_column=input_column, label_column=label_column, join_by=join_by
)
pipe = self.prepare_pipeline(model_or_pipeline=model_or_pipeline, tokenizer=tokenizer, device=device)
metric = self.prepare_metric(metric)
# Compute predictions
predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
predictions = self.predictions_processor(predictions, data[input_column], join_by)
metric_inputs.update(predictions)
# Compute metrics from references and predictions
metric_results = self.compute_metric(
metric=metric,
metric_inputs=metric_inputs,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
random_state=random_state,
)
result.update(metric_results)
result.update(perf_results)
return result
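# Illustrative sketch, not part of the original module: `words_to_offsets` computes inclusive
# character spans for words joined by `join_by`; `predictions_processor` then uses these spans to
# align pipeline token predictions with the original words. Assumes the optional evaluator
# dependencies (e.g. transformers) are installed.
if __name__ == "__main__":
    _token_evaluator = TokenClassificationEvaluator()
    # "New York is" -> "New": (0, 2), "York": (4, 7), "is": (9, 10)
    assert _token_evaluator.words_to_offsets(["New", "York", "is"], " ") == [(0, 2), (4, 7), (9, 10)]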
from datasets import Dataset, get_dataset_split_names
class DatasetColumn(list):
"""Helper class to avoid loading a dataset column into memory when accessing it."""
def __init__(self, dataset: Dataset, key: str):
self.dataset = dataset
self.key = key
def __len__(self):
return len(self.dataset)
def __getitem__(self, i):
return self.dataset[i][self.key]
def __iter__(self):
return (self.dataset[i][self.key] for i in range(len(self)))
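# Minimal usage sketch, not part of the original module: `DatasetColumn` proxies a single column
# lazily, pulling one row at a time instead of materializing the whole column in memory.
if __name__ == "__main__":
    _demo_dataset = Dataset.from_dict({"text": ["a", "b"], "label": [0, 1]})
    _demo_column = DatasetColumn(_demo_dataset, "text")
    assert len(_demo_column) == 2 and list(_demo_column) == ["a", "b"]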
def choose_split(data, subset=None):
available_splits = get_dataset_split_names(data, subset)
preferred_split_order = [
"test",
"testing",
"eval",
"evaluation",
"validation",
"val",
"valid",
"dev",
"train",
"training",
]
for split in preferred_split_order:
if split in available_splits:
return split
raise ValueError("No dataset split defined! Pass an explicit value to the `split` kwarg.")
class DatasetColumnPair(list):
"""Helper class to avoid loading two dataset columns into memory when accessing it."""
def __init__(
self,
dataset: Dataset,
first_col: str,
second_col: str,
first_key: str,
second_key: str,
):
"""
Args:
dataset (Dataset): dataset to build an iterator on
first_col (str): first column name to use in the dataset
second_col (str): second column name to use in the dataset
first_key (str): key name used for the first column in the returned dictionary
second_key (str): key name used for the second column in the returned dictionary
"""
self.dataset = dataset
self.first_col = first_col
self.second_col = second_col
self.first_key = first_key
self.second_key = second_key
def __len__(self):
return len(self.dataset)
def __getitem__(self, i):
return {
self.first_key: self.dataset[i][self.first_col],
self.second_key: self.dataset[i][self.second_col] if self.second_col else None,
}
def __iter__(self):
return (
{
self.first_key: self.dataset[i][self.first_col],
self.second_key: self.dataset[i][self.second_col] if self.second_col else None,
}
for i in range(len(self))
)
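# Minimal usage sketch, not part of the original module: `DatasetColumnPair` lazily yields
# {"text": ..., "text_pair": ...} dicts, the input format expected by text-classification
# pipelines for sentence-pair tasks such as MNLI. The column names below are illustrative.
if __name__ == "__main__":
    _pair_dataset = Dataset.from_dict({"premise": ["a cat"], "hypothesis": ["an animal"]})
    _pair_column = DatasetColumnPair(_pair_dataset, "premise", "hypothesis", "text", "text_pair")
    assert _pair_column[0] == {"text": "a cat", "text_pair": "an animal"}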
from typing import Dict
import requests
from huggingface_hub import dataset_info, model_info
from huggingface_hub.repocard import metadata_update
from .config import HF_HUB_ALLOWED_TASKS
from .utils.logging import get_logger
logger = get_logger(__name__)
def push_to_hub(
model_id: str,
task_type: str,
dataset_type: str,
dataset_name: str,
metric_type: str,
metric_name: str,
metric_value: float,
task_name: str = None,
dataset_config: str = None,
dataset_split: str = None,
dataset_revision: str = None,
dataset_args: Dict[str, int] = None,
metric_config: str = None,
metric_args: Dict[str, int] = None,
overwrite: bool = False,
):
r"""
Pushes the result of a metric to the metadata of a model repository in the Hub.
Args:
model_id (`str`):
Model id from https://hf.co/models.
task_type (`str`):
Task id, refer to the [Hub allowed tasks](https://github.com/huggingface/evaluate/blob/main/src/evaluate/config.py#L154) for allowed values.
dataset_type (`str`):
Dataset id from https://hf.co/datasets.
dataset_name (`str`):
Pretty name for the dataset.
metric_type (`str`):
Metric id from https://hf.co/metrics.
metric_name (`str`):
Pretty name for the metric.
metric_value (`float`):
Computed metric value.
task_name (`str`, *optional*):
Pretty name for the task.
dataset_config (`str`, *optional*):
Dataset configuration used in [`~datasets.load_dataset`].
See [`~datasets.load_dataset`] for more info.
dataset_split (`str`, *optional*):
Name of split used for metric computation.
dataset_revision (`str`, *optional*):
Git hash for the specific version of the dataset.
dataset_args (`dict[str, int]`, *optional*):
Additional arguments passed to [`~datasets.load_dataset`].
metric_config (`str`, *optional*):
Configuration for the metric (e.g. the GLUE metric has a configuration for each subset).
metric_args (`dict[str, int]`, *optional*):
Arguments passed during [`~evaluate.EvaluationModule.compute`].
overwrite (`bool`, *optional*, defaults to `False`):
If set to `True` an existing metric field can be overwritten, otherwise
attempting to overwrite any existing fields will cause an error.
Example:
```python
>>> push_to_hub(
... model_id="huggingface/gpt2-wikitext2",
... metric_value=0.5,
... metric_type="bleu",
... metric_name="BLEU",
... dataset_name="WikiText",
... dataset_type="wikitext",
... dataset_split="test",
... task_type="text-generation",
... task_name="Text Generation"
... )
```"""
if task_type not in HF_HUB_ALLOWED_TASKS:
raise ValueError(f"Task type not supported. Task has to be one of {HF_HUB_ALLOWED_TASKS}")
try:
dataset_info(dataset_type)
except requests.exceptions.HTTPError:
logger.warning(f"Dataset {dataset_type} not found on the Hub at hf.co/datasets/{dataset_type}")
try:
model_info(model_id)
except requests.exceptions.HTTPError:
raise ValueError(f"Model {model_id} not found on the Hub at hf.co/{model_id}")
result = {
"task": {
"type": task_type,
},
"dataset": {
"type": dataset_type,
"name": dataset_name,
},
"metrics": [
{
"type": metric_type,
"value": metric_value,
},
],
}
if dataset_config is not None:
result["dataset"]["config"] = dataset_config
if dataset_split is not None:
result["dataset"]["split"] = dataset_split
if dataset_revision is not None:
result["dataset"]["revision"] = dataset_revision
if dataset_args is not None:
result["dataset"]["args"] = dataset_args
if task_name is not None:
result["task"]["name"] = task_name
if metric_name is not None:
result["metrics"][0]["name"] = metric_name
if metric_config is not None:
result["metrics"][0]["config"] = metric_config
if metric_args is not None:
result["metrics"][0]["args"] = metric_args
metadata = {"model-index": [{"results": [result]}]}
return metadata_update(repo_id=model_id, metadata=metadata, overwrite=overwrite)
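# Illustrative sketch, not part of the original module: the `model-index` metadata that the
# docstring example above would assemble and pass to `metadata_update` (values are the
# hypothetical ones from that example).
_EXAMPLE_MODEL_INDEX = {
    "model-index": [
        {
            "results": [
                {
                    "task": {"type": "text-generation", "name": "Text Generation"},
                    "dataset": {"type": "wikitext", "name": "WikiText", "split": "test"},
                    "metrics": [{"type": "bleu", "value": 0.5, "name": "BLEU"}],
                }
            ]
        }
    ]
}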
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
""" EvaluationModuleInfo records information we know about a dataset and a metric.
"""
import dataclasses
import json
import os
from dataclasses import asdict, dataclass, field
from typing import List, Optional, Union
from datasets.features import Features, Value
from . import config
from .utils.logging import get_logger
logger = get_logger(__name__)
@dataclass
class EvaluationModuleInfo:
"""Base class to store information about an evaluation used for `MetricInfo`, `ComparisonInfo`,
and `MeasurementInfo`.
`EvaluationModuleInfo` documents an evaluation, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
"""
# Set in the dataset scripts
description: str
citation: str
features: Union[Features, List[Features]]
inputs_description: str = field(default_factory=str)
homepage: str = field(default_factory=str)
license: str = field(default_factory=str)
codebase_urls: List[str] = field(default_factory=list)
reference_urls: List[str] = field(default_factory=list)
streamable: bool = False
format: Optional[str] = None
module_type: str = "metric" # deprecate this in the future
# Set later by the builder
module_name: Optional[str] = None
config_name: Optional[str] = None
experiment_id: Optional[str] = None
def __post_init__(self):
if self.format is not None:
for key, value in self.features.items():
if not isinstance(value, Value):
raise ValueError(
f"When using 'numpy' format, all features should be a `datasets.Value` feature. "
f"Here {key} is an instance of {value.__class__.__name__}"
)
def write_to_directory(self, metric_info_dir):
"""Write `EvaluationModuleInfo` as JSON to `metric_info_dir`.
Also save the license separately in LICENSE.
Args:
metric_info_dir (`str`):
The directory to save the `EvaluationModuleInfo` JSON file and the license to.
Example:
```py
>>> my_metric.info.write_to_directory("/path/to/directory/")
```
"""
with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), "w", encoding="utf-8") as f:
json.dump(asdict(self), f)
with open(os.path.join(metric_info_dir, config.LICENSE_FILENAME), "w", encoding="utf-8") as f:
f.write(self.license)
@classmethod
def from_directory(cls, metric_info_dir) -> "EvaluationModuleInfo":
"""Create `EvaluationModuleInfo` from the JSON file in `metric_info_dir`.
Args:
metric_info_dir (`str`):
The directory containing the `metric_info` JSON file. This
should be the root directory of a specific metric version.
Example:
```py
>>> my_metric = EvaluationModuleInfo.from_directory("/path/to/directory/")
```
"""
logger.info(f"Loading Metric info from {metric_info_dir}")
if not metric_info_dir:
raise ValueError("Calling EvaluationModuleInfo.from_directory() with undefined metric_info_dir.")
with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), encoding="utf-8") as f:
metric_info_dict = json.load(f)
return cls.from_dict(metric_info_dict)
@classmethod
def from_dict(cls, metric_info_dict: dict) -> "EvaluationModuleInfo":
field_names = {f.name for f in dataclasses.fields(cls)}
return cls(**{k: v for k, v in metric_info_dict.items() if k in field_names})
@dataclass
class MetricInfo(EvaluationModuleInfo):
"""Information about a metric.
`EvaluationModuleInfo` documents a metric, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
"""
module_type: str = "metric"
@dataclass
class ComparisonInfo(EvaluationModuleInfo):
"""Information about a comparison.
`EvaluationModuleInfo` documents a comparison, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
"""
module_type: str = "comparison"
@dataclass
class MeasurementInfo(EvaluationModuleInfo):
"""Information about a measurement.
`EvaluationModuleInfo` documents a measurement, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
"""
module_type: str = "measurement"
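# Minimal construction sketch, not part of the original module: `description`, `citation` and
# `features` are the only required fields; the remaining fields have defaults and are filled in
# later by the builder. The toy values below are made up.
if __name__ == "__main__":
    _toy_info = MetricInfo(
        description="Toy accuracy metric.",
        citation="",
        features=Features({"predictions": Value("int32"), "references": Value("int32")}),
    )
    assert _toy_info.module_type == "metric"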
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
""" List and inspect metrics."""
from typing import Optional
import requests
from datasets import DownloadConfig
from .config import EVALUATION_MODULE_TYPES, HF_LIST_ENDPOINT
from .loading import evaluation_module_factory
from .utils.logging import get_logger
logger = get_logger(__name__)
class SplitsNotFoundError(ValueError):
pass
def list_evaluation_modules(module_type=None, include_community=True, with_details=False):
"""List all evaluation modules available on the Hugging Face Hub.
Args:
module_type (`str`, *optional*, defaults to `None`):
Type of evaluation modules to list. Has to be one of `'metric'`, `'comparison'`, or `'measurement'`. If `None`, all types are listed.
include_community (`bool`, *optional*, defaults to `True`):
Include community modules in the list.
with_details (`bool`, *optional*, defaults to `False`):
Return the full details on the metrics instead of only the ID.
Returns:
`List[Union[str, dict]]`
Example:
```py
>>> from evaluate import list_evaluation_modules
>>> list_evaluation_modules(module_type="metric")
```
"""
if module_type is None:
evaluations_list = []
for module_type in EVALUATION_MODULE_TYPES:
evaluations_list.extend(
_list_evaluation_modules_type(
module_type, include_community=include_community, with_details=with_details
)
)
else:
if module_type not in EVALUATION_MODULE_TYPES:
raise ValueError(f"Invalid module type '{module_type}'. Has to be one of {EVALUATION_MODULE_TYPES}.")
evaluations_list = _list_evaluation_modules_type(
module_type, include_community=include_community, with_details=with_details
)
return evaluations_list
def _list_evaluation_modules_type(module_type, include_community=True, with_details=False):
r = requests.get(HF_LIST_ENDPOINT.format(type=module_type))
r.raise_for_status()
d = r.json()
if not include_community:
d = [element for element in d if element["id"].split("/")[0] == f"evaluate-{module_type}"]
# remove namespace for canonical modules and add community tag
for element in d:
if element["id"].split("/")[0] == f"evaluate-{module_type}":
element["id"] = element["id"].split("/")[1]
element["community"] = False
else:
element["community"] = True
if with_details:
return [
{
"name": element["id"],
"type": module_type,
"community": element["community"],
"likes": element.get("likes", 0),
}
for element in d
]
else:
return [element["id"] for element in d]
def inspect_evaluation_module(
path: str, local_path: str, download_config: Optional[DownloadConfig] = None, **download_kwargs
):
r"""
Allow inspection/modification of an evaluation script by copying it to the local drive at `local_path`.
Args:
path (``str``): path to the evaluation script. Can be either:
- a local path to the script or the directory containing the script (if the script has the same name as the directory),
e.g. ``'./metrics/accuracy'`` or ``'./metrics/accuracy/accuracy.py'``
- an evaluation module identifier on the Hugging Face Hub (list all available modules and ids with ``evaluate.list_evaluation_modules()``)
e.g. ``'accuracy'``, ``'bleu'`` or ``'word_length'``
local_path (``str``): path to the local folder to copy the evaluation script to.
download_config (Optional ``datasets.DownloadConfig``): specific download configuration parameters.
**download_kwargs: optional attributes for DownloadConfig() which will override the attributes in download_config if supplied.
"""
evaluation_module = evaluation_module_factory(
path, download_config=download_config, force_local_path=local_path, **download_kwargs
)
print(
f"The processing scripts for metric {path} can be inspected at {local_path}. "
f"The main class is in {evaluation_module.module_path}. "
f"You can modify this processing scripts and use it with `evaluate.load({local_path})`."
)
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Access datasets."""
import filecmp
import importlib
import inspect
import json
import os
import re
import shutil
import time
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Type, Union
from urllib.parse import urlparse
from datasets import DownloadConfig, DownloadMode
from datasets.builder import DatasetBuilder
from datasets.packaged_modules import _EXTENSION_TO_MODULE, _hash_python_lines
from datasets.utils.filelock import FileLock
from datasets.utils.version import Version
from . import SCRIPTS_VERSION, config
from .module import EvaluationModule
from .utils.file_utils import (
cached_path,
head_hf_s3,
hf_hub_url,
init_hf_modules,
is_relative_path,
relative_to_absolute_path,
url_or_path_join,
)
from .utils.logging import get_logger
logger = get_logger(__name__)
ALL_ALLOWED_EXTENSIONS = list(_EXTENSION_TO_MODULE.keys()) + ["zip"]
def init_dynamic_modules(
name: str = config.MODULE_NAME_FOR_DYNAMIC_MODULES, hf_modules_cache: Optional[Union[Path, str]] = None
):
"""
Create a module with name `name` in which you can add dynamic modules
such as metrics or datasets. The module can be imported using its name.
The module is created in the HF_MODULE_CACHE directory by default (~/.cache/huggingface/modules) but it can
be overridden by specifying a path to another directory in `hf_modules_cache`.
"""
hf_modules_cache = init_hf_modules(hf_modules_cache)
dynamic_modules_path = os.path.join(hf_modules_cache, name)
os.makedirs(dynamic_modules_path, exist_ok=True)
if not os.path.exists(os.path.join(dynamic_modules_path, "__init__.py")):
with open(os.path.join(dynamic_modules_path, "__init__.py"), "w"):
pass
return dynamic_modules_path
def import_main_class(module_path) -> Optional[Union[Type[DatasetBuilder], Type[EvaluationModule]]]:
"""Import a module at module_path and return its main class, a Metric by default"""
module = importlib.import_module(module_path)
main_cls_type = EvaluationModule
# Find the main class in our imported module
module_main_cls = None
for name, obj in module.__dict__.items():
if isinstance(obj, type) and issubclass(obj, main_cls_type):
if inspect.isabstract(obj):
continue
module_main_cls = obj
break
return module_main_cls
def files_to_hash(file_paths: List[str]) -> str:
"""
Convert a list of scripts or text files provided in file_paths into a hashed filename in a repeatable way.
"""
# List all python files in directories if directories are supplied as part of external imports
to_use_files: List[Union[Path, str]] = []
for file_path in file_paths:
if os.path.isdir(file_path):
to_use_files.extend(list(Path(file_path).rglob("*.[pP][yY]")))
else:
to_use_files.append(file_path)
# Get the code from all these files
lines = []
for file_path in to_use_files:
with open(file_path, encoding="utf-8") as f:
lines.extend(f.readlines())
return _hash_python_lines(lines)
def convert_github_url(url_path: str) -> Tuple[str, Optional[str]]:
"""Convert a link to a file on a github repo in a link to the raw github object."""
parsed = urlparse(url_path)
sub_directory = None
if parsed.scheme in ("http", "https", "s3") and parsed.netloc == "github.com":
if "blob" in url_path:
if not url_path.endswith(".py"):
raise ValueError(f"External import from github at {url_path} should point to a file ending with '.py'")
url_path = url_path.replace("blob", "raw") # Point to the raw file
else:
# Parse github url to point to zip
github_path = parsed.path[1:]
repo_info, branch = github_path.split("/tree/") if "/tree/" in github_path else (github_path, "master")
repo_owner, repo_name = repo_info.split("/")
url_path = f"https://github.com/{repo_owner}/{repo_name}/archive/{branch}.zip"
sub_directory = f"{repo_name}-{branch}"
return url_path, sub_directory
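# Illustrative sketch, not part of the original module: a github "blob" URL is rewritten to its
# "raw" counterpart, while a repository URL becomes an archive link plus the sub-directory it
# unpacks to. The user/repo names below are hypothetical.
if __name__ == "__main__":
    assert convert_github_url("https://github.com/user/repo/blob/main/utils.py") == (
        "https://github.com/user/repo/raw/main/utils.py",
        None,
    )
    assert convert_github_url("https://github.com/user/repo/tree/main") == (
        "https://github.com/user/repo/archive/main.zip",
        "repo-main",
    )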
def increase_load_count(name: str, resource_type: str):
"""Update the download count of a dataset or metric."""
if not config.HF_EVALUATE_OFFLINE and config.HF_UPDATE_DOWNLOAD_COUNTS:
try:
head_hf_s3(name, filename=name + ".py", dataset=(resource_type == "dataset"))
except Exception:
pass
def get_imports(file_path: str) -> List[Tuple[str, str, str, Optional[str]]]:
"""Find whether we should import or clone additional files for a given processing script,
and list the imports.
We allow:
- library dependencies,
- local dependencies and
- external dependencies whose url is specified with a comment starting with "# From:" followed by the raw url to a file, an archive or a github repository.
External dependencies will be downloaded (and extracted if needed) in the script's folder.
We also add an `__init__.py` to each sub-folder of a downloaded folder so the user can import from them in the script.
Note that only direct imports in the processing script will be handled.
We don't recursively explore the additional imports to download further files.
Example::
import tensorflow
import .c4_utils
import .clicr.dataset-code.build_json_dataset # From: https://raw.githubusercontent.com/clips/clicr/master/dataset-code/build_json_dataset
"""
lines = []
with open(file_path, encoding="utf-8") as f:
lines.extend(f.readlines())
logger.debug(f"Checking {file_path} for additional imports.")
imports: List[Tuple[str, str, str, Optional[str]]] = []
is_in_docstring = False
for line in lines:
docstr_start_match = re.findall(r'[\s\S]*?"""[\s\S]*?', line)
if len(docstr_start_match) == 1:
# flip True <=> False only if docstring
# starts at line without finishing
is_in_docstring = not is_in_docstring
if is_in_docstring:
# import statements in docstrings should
# not be added as required dependencies
continue
match = re.match(r"^import\s+(\.?)([^\s\.]+)[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*)", line, flags=re.MULTILINE)
if match is None:
match = re.match(
r"^from\s+(\.?)([^\s\.]+)(?:[^\s]*)\s+import\s+[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*)",
line,
flags=re.MULTILINE,
)
if match is None:
continue
if match.group(1):
# The import starts with a '.', we will download the relevant file
if any(imp[1] == match.group(2) for imp in imports):
# We already have this import
continue
if match.group(3):
# The import has a comment with 'From:', we'll retrieve it from the given url
url_path = match.group(3)
url_path, sub_directory = convert_github_url(url_path)
imports.append(("external", match.group(2), url_path, sub_directory))
elif match.group(2):
# The import should be at the same place as the file
imports.append(("internal", match.group(2), match.group(2), None))
else:
if match.group(3):
# The import has a comment with `From: git+https:...`, asks user to pip install from git.
url_path = match.group(3)
imports.append(("library", match.group(2), url_path, None))
else:
imports.append(("library", match.group(2), match.group(2), None))
return imports
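# Illustrative sketch, not part of the original module: parsing the three supported import kinds
# ("library", "internal", "external") from a toy script. The module names and URL are hypothetical.
if __name__ == "__main__":
    import tempfile

    _toy_script = (
        "import numpy\n"
        "import .local_helper\n"
        "import .remote_helper  # From: https://raw.githubusercontent.com/user/repo/main/remote_helper.py\n"
    )
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as _toy_file:
        _toy_file.write(_toy_script)
    print(get_imports(_toy_file.name))
    # Expected:
    # [('library', 'numpy', 'numpy', None),
    #  ('internal', 'local_helper', 'local_helper', None),
    #  ('external', 'remote_helper', 'https://raw.githubusercontent.com/user/repo/main/remote_helper.py', None)]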
def _download_additional_modules(
name: str, base_path: str, imports: List[Tuple[str, str, str, Optional[str]]], download_config: Optional[DownloadConfig]
) -> List[Tuple[str, str]]:
"""
Download additional modules for a module <name>.py at URL (or local path) <base_path>/<name>.py
The imports must have been parsed first using ``get_imports``.
If some modules need to be installed with pip, an error is raised showing how to install them.
This function returns the list of downloaded modules as tuples (import_name, module_file_path).
The downloaded modules can then be moved into an importable directory with ``_copy_script_and_other_resources_in_importable_dir``.
"""
local_imports = []
library_imports = []
download_config = download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = "Downloading extra modules"
for import_type, import_name, import_path, sub_directory in imports:
if import_type == "library":
library_imports.append((import_name, import_path)) # Import from a library
continue
if import_name == name:
raise ValueError(
f"Error in the {name} script, importing relative {import_name} module "
f"but {import_name} is the name of the script. "
f"Please change relative import {import_name} to another name and add a '# From: URL_OR_PATH' "
f"comment pointing to the original relative import file path."
)
if import_type == "internal":
url_or_filename = url_or_path_join(base_path, import_path + ".py")
elif import_type == "external":
url_or_filename = import_path
else:
raise ValueError("Wrong import_type")
local_import_path = cached_path(
url_or_filename,
download_config=download_config,
)
if sub_directory is not None:
local_import_path = os.path.join(local_import_path, sub_directory)
local_imports.append((import_name, local_import_path))
# Check library imports
needs_to_be_installed = set()
for library_import_name, library_import_path in library_imports:
try:
lib = importlib.import_module(library_import_name) # noqa F841
except ImportError:
library_import_name = "scikit-learn" if library_import_name == "sklearn" else library_import_name
needs_to_be_installed.add((library_import_name, library_import_path))
if needs_to_be_installed:
raise ImportError(
f"To be able to use {name}, you need to install the following dependencies"
f"{[lib_name for lib_name, lib_path in needs_to_be_installed]} using 'pip install "
f"{' '.join([lib_path for lib_name, lib_path in needs_to_be_installed])}' for instance'"
)
return local_imports
def _copy_script_and_other_resources_in_importable_dir(
name: str,
importable_directory_path: str,
subdirectory_name: str,
original_local_path: str,
local_imports: List[Tuple[str, str]],
additional_files: List[Tuple[str, str]],
download_mode: Optional[DownloadMode],
) -> str:
"""Copy a script and its required imports to an importable directory
Args:
name (str): name of the resource to load
importable_directory_path (str): path to the loadable folder in the dynamic modules directory
subdirectory_name (str): name of the subdirectory in importable_directory_path in which to place the script
original_local_path (str): local path to the resource script
local_imports (List[Tuple[str, str]]): list of (destination_filename, import_file_to_copy)
additional_files (List[Tuple[str, str]]): list of (destination_filename, additional_file_to_copy)
download_mode (Optional[DownloadMode]): download mode
Return:
importable_local_file: path to an importable module with importlib.import_module
"""
# Define a directory with a unique name in our dataset or metric folder
# path is: ./datasets|metrics/dataset|metric_name/hash_from_code/script.py
# we use a hash as subdirectory_name to be able to have multiple versions of a dataset/metric processing file together
importable_subdirectory = os.path.join(importable_directory_path, subdirectory_name)
importable_local_file = os.path.join(importable_subdirectory, name + ".py")
# Prevent parallel disk operations
lock_path = importable_directory_path + ".lock"
with FileLock(lock_path):
# Create main dataset/metrics folder if needed
if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(importable_directory_path):
shutil.rmtree(importable_directory_path)
os.makedirs(importable_directory_path, exist_ok=True)
# add an __init__ file to the main dataset folder if needed
init_file_path = os.path.join(importable_directory_path, "__init__.py")
if not os.path.exists(init_file_path):
with open(init_file_path, "w"):
pass
# Create hash dataset folder if needed
os.makedirs(importable_subdirectory, exist_ok=True)
# add an __init__ file to the hash dataset folder if needed
init_file_path = os.path.join(importable_subdirectory, "__init__.py")
if not os.path.exists(init_file_path):
with open(init_file_path, "w"):
pass
# Copy dataset.py file in hash folder if needed
if not os.path.exists(importable_local_file):
shutil.copyfile(original_local_path, importable_local_file)
# Record metadata associating original dataset path with local unique folder
meta_path = importable_local_file.split(".py")[0] + ".json"
if not os.path.exists(meta_path):
meta = {"original file path": original_local_path, "local file path": importable_local_file}
# the filename is *.py in our case, so it is better to name the metadata file filename.json instead of filename.py.json
with open(meta_path, "w", encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
# Copy all the additional imports
for import_name, import_path in local_imports:
if os.path.isfile(import_path):
full_path_local_import = os.path.join(importable_subdirectory, import_name + ".py")
if not os.path.exists(full_path_local_import):
shutil.copyfile(import_path, full_path_local_import)
elif os.path.isdir(import_path):
full_path_local_import = os.path.join(importable_subdirectory, import_name)
if not os.path.exists(full_path_local_import):
shutil.copytree(import_path, full_path_local_import)
else:
raise OSError(f"Error with local import at {import_path}")
# Copy additional files like the dataset infos file if needed
for file_name, original_path in additional_files:
destination_additional_path = os.path.join(importable_subdirectory, file_name)
if not os.path.exists(destination_additional_path) or not filecmp.cmp(
original_path, destination_additional_path
):
shutil.copyfile(original_path, destination_additional_path)
return importable_local_file
def _create_importable_file(
local_path: str,
local_imports: List[Tuple[str, str]],
additional_files: List[Tuple[str, str]],
dynamic_modules_path: str,
module_namespace: str,
name: str,
download_mode: DownloadMode,
) -> Tuple[str, str]:
importable_directory_path = os.path.join(dynamic_modules_path, module_namespace, name.replace("/", "--"))
Path(importable_directory_path).mkdir(parents=True, exist_ok=True)
(Path(importable_directory_path).parent / "__init__.py").touch(exist_ok=True)
hash = files_to_hash([local_path] + [loc[1] for loc in local_imports])
importable_local_file = _copy_script_and_other_resources_in_importable_dir(
name=name.split("/")[-1],
importable_directory_path=importable_directory_path,
subdirectory_name=hash,
original_local_path=local_path,
local_imports=local_imports,
additional_files=additional_files,
download_mode=download_mode,
)
logger.debug(f"Created importable dataset file at {importable_local_file}")
module_path = ".".join(
[os.path.basename(dynamic_modules_path), module_namespace, name.replace("/", "--"), hash, name.split("/")[-1]]
)
return module_path, hash
@dataclass
class ImportableModule:
module_path: str
hash: str
class _EvaluationModuleFactory:
def get_module(self) -> ImportableModule:
raise NotImplementedError
class LocalEvaluationModuleFactory(_EvaluationModuleFactory):
"""Get the module of a local metric. The metric script is loaded from a local script."""
def __init__(
self,
path: str,
module_type: str = "metrics",
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[DownloadMode] = None,
dynamic_modules_path: Optional[str] = None,
):
self.path = path
self.module_type = module_type
self.name = Path(path).stem
self.download_config = download_config or DownloadConfig()
self.download_mode = download_mode
self.dynamic_modules_path = dynamic_modules_path
def get_module(self) -> ImportableModule:
# get script and other files
imports = get_imports(self.path)
local_imports = _download_additional_modules(
name=self.name,
base_path=str(Path(self.path).parent),
imports=imports,
download_config=self.download_config,
)
# copy the script and the files in an importable directory
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
module_path, hash = _create_importable_file(
local_path=self.path,
local_imports=local_imports,
additional_files=[],
dynamic_modules_path=dynamic_modules_path,
module_namespace=self.module_type,
name=self.name,
download_mode=self.download_mode,
)
# make sure the import system notices the new module
importlib.invalidate_caches()
return ImportableModule(module_path, hash)
class HubEvaluationModuleFactory(_EvaluationModuleFactory):
"""Get the module of a metric from a metric repository on the Hub."""
def __init__(
self,
name: str,
module_type: str = "metrics",
revision: Optional[Union[str, Version]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[DownloadMode] = None,
dynamic_modules_path: Optional[str] = None,
):
self.name = name
self.module_type = module_type
self.revision = revision
self.download_config = download_config or DownloadConfig()
self.download_mode = download_mode
self.dynamic_modules_path = dynamic_modules_path
assert self.name.count("/") == 1
increase_load_count(name, resource_type="metric")
def download_loading_script(self, revision) -> str:
file_path = hf_hub_url(path=self.name, name=self.name.split("/")[1] + ".py", revision=revision)
download_config = self.download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = "Downloading builder script"
return cached_path(file_path, download_config=download_config)
def get_module(self) -> ImportableModule:
revision = self.revision or os.getenv("HF_SCRIPTS_VERSION", SCRIPTS_VERSION)
if re.match(r"\d*\.\d*\.\d*", revision): # revision is version number (three digits separated by full stops)
revision = "v" + revision # tagging convention on evaluate repository starts with v
# get script and other files
try:
local_path = self.download_loading_script(revision)
except FileNotFoundError as err:
# if no file is found for the current revision tag, try to load from the main branch
if self.revision is None and os.getenv("HF_SCRIPTS_VERSION", SCRIPTS_VERSION) != "main":
revision = "main"
local_path = self.download_loading_script(revision)
else:
raise err
imports = get_imports(local_path)
local_imports = _download_additional_modules(
name=self.name,
base_path=hf_hub_url(path=self.name, name="", revision=revision),
imports=imports,
download_config=self.download_config,
)
# copy the script and the files in an importable directory
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
module_path, hash = _create_importable_file(
local_path=local_path,
local_imports=local_imports,
additional_files=[],
dynamic_modules_path=dynamic_modules_path,
module_namespace=self.module_type,
name=self.name,
download_mode=self.download_mode,
)
# make sure the import system notices the new module
importlib.invalidate_caches()
return ImportableModule(module_path, hash)
class CachedEvaluationModuleFactory(_EvaluationModuleFactory):
"""
Get the module of a metric that has been loaded once already and cached.
The script that is loaded from the cache is the most recent one with a matching name.
"""
def __init__(
self,
name: str,
module_type: str = "metrics",
dynamic_modules_path: Optional[str] = None,
):
self.name = name
self.module_type = module_type
self.dynamic_modules_path = dynamic_modules_path
assert self.name.count("/") == 0
def get_module(self) -> ImportableModule:
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
importable_directory_path = os.path.join(dynamic_modules_path, self.module_type, self.name)
hashes = (
[h for h in os.listdir(importable_directory_path) if len(h) == 64]
if os.path.isdir(importable_directory_path)
else None
)
if not hashes:
raise FileNotFoundError(f"Metric {self.name} is not cached in {dynamic_modules_path}")
# get most recent
def _get_modification_time(module_hash):
return (
(Path(importable_directory_path) / module_hash / (self.name.split("--")[-1] + ".py")).stat().st_mtime
)
hash = sorted(hashes, key=_get_modification_time)[-1]
logger.warning(
f"Using the latest cached version of the module from {os.path.join(importable_directory_path, hash)} "
f"(last modified on {time.ctime(_get_modification_time(hash))}) since it "
f"couldn't be found locally at {self.name}, or remotely on the Hugging Face Hub."
)
# make sure the import system notices the new module
module_path = ".".join(
[os.path.basename(dynamic_modules_path), self.module_type, self.name, hash, self.name.split("--")[-1]]
)
importlib.invalidate_caches()
return ImportableModule(module_path, hash)
def evaluation_module_factory(
path: str,
module_type: Optional[str] = None,
revision: Optional[Union[str, Version]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[DownloadMode] = None,
force_local_path: Optional[str] = None,
dynamic_modules_path: Optional[str] = None,
**download_kwargs,
) -> ImportableModule:
"""
Download/extract/cache a metric module.
Metrics codes are cached inside the dynamic modules cache to allow easy import (avoid ugly sys.path tweaks).
Args:
path (str): Path or name of the metric script.
- if ``path`` is a local metric script or a directory containing a local metric script (if the script has the same name as the directory):
-> load the module from the metric script
e.g. ``'./metrics/accuracy'`` or ``'./metrics/accuracy/accuracy.py'``.
- if ``path`` is a metric on the Hugging Face Hub (ex: ``accuracy``, ``rouge``)
-> load the module from the metric script on the Hugging Face Hub
e.g. ``'accuracy'`` or ``'rouge'``.
revision (Optional ``Union[str, datasets.Version]``):
If specified, the module will be loaded from the repository at this version.
By default:
- it is set to the local version of the lib.
- it will also try to load it from the main branch if it's not available at the local version of the lib.
Specifying a version that is different from your local version of the lib might cause compatibility issues.
download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`DownloadMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
force_local_path (Optional str): Optional path to a local directory in which to download and prepare the script.
Used to inspect or modify the script folder.
dynamic_modules_path (Optional str, defaults to HF_MODULES_CACHE / "datasets_modules", i.e. ~/.cache/huggingface/modules/datasets_modules):
Optional path to the directory in which the dynamic modules are saved. It must have been initialized with :obj:`init_dynamic_modules`.
By default the datasets and metrics are stored inside the `datasets_modules` module.
download_kwargs: optional attributes for DownloadConfig() which will override the attributes in download_config if supplied.
Returns:
ImportableModule
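Example (illustrative sketch; ``accuracy`` is a placeholder for any module name, and this function is assumed to be importable from ``evaluate.loading``):
```py
>>> from evaluate.loading import evaluation_module_factory
>>> module = evaluation_module_factory("accuracy")  # resolved on the Hub as evaluate-metric/accuracy
>>> module.module_path, module.hash
```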
"""
if download_config is None:
download_config = DownloadConfig(**download_kwargs)
download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
download_config.extract_compressed_file = True
download_config.force_extract = True
filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1]
if not filename.endswith(".py"):
filename = filename + ".py"
combined_path = os.path.join(path, filename)
# Try locally
if path.endswith(filename):
if os.path.isfile(path):
return LocalEvaluationModuleFactory(
path, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path
).get_module()
else:
raise FileNotFoundError(f"Couldn't find a metric script at {relative_to_absolute_path(path)}")
elif os.path.isfile(combined_path):
return LocalEvaluationModuleFactory(
combined_path, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path
).get_module()
elif is_relative_path(path) and path.count("/") <= 1 and not force_local_path:
try:
# load a canonical evaluation module from hub
if path.count("/") == 0:
# if no type provided look through all possible modules
if module_type is None:
for current_type in ["metric", "comparison", "measurement"]:
try:
return HubEvaluationModuleFactory(
f"evaluate-{current_type}/{path}",
revision=revision,
download_config=download_config,
download_mode=download_mode,
dynamic_modules_path=dynamic_modules_path,
).get_module()
except ConnectionError:
pass
raise FileNotFoundError
# if module_type provided load specific module_type
else:
return HubEvaluationModuleFactory(
f"evaluate-{module_type}/{path}",
revision=revision,
download_config=download_config,
download_mode=download_mode,
dynamic_modules_path=dynamic_modules_path,
).get_module()
# load community evaluation module from hub
elif path.count("/") == 1:
return HubEvaluationModuleFactory(
path,
revision=revision,
download_config=download_config,
download_mode=download_mode,
dynamic_modules_path=dynamic_modules_path,
).get_module()
except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
# if it's a canonical module we need to check if it's any of the types
if path.count("/") == 0:
for current_type in ["metric", "comparison", "measurement"]:
try:
return CachedEvaluationModuleFactory(
f"evaluate-{current_type}--{path}", dynamic_modules_path=dynamic_modules_path
).get_module()
except Exception as e2: # noqa: if it's not in the cache, then it doesn't exist.
pass
# if it's a community module we just need to check on path
elif path.count("/") == 1:
try:
return CachedEvaluationModuleFactory(
path.replace("/", "--"), dynamic_modules_path=dynamic_modules_path
).get_module()
except Exception as e2: # noqa: if it's not in the cache, then it doesn't exist.
pass
if not isinstance(e1, (ConnectionError, FileNotFoundError)):
raise e1 from None
raise FileNotFoundError(
f"Couldn't find a module script at {relative_to_absolute_path(combined_path)}. "
f"Module '{path}' doesn't exist on the Hugging Face Hub either."
) from None
else:
raise FileNotFoundError(f"Couldn't find a module script at {relative_to_absolute_path(combined_path)}.")
def load(
path: str,
config_name: Optional[str] = None,
module_type: Optional[str] = None,
process_id: int = 0,
num_process: int = 1,
cache_dir: Optional[str] = None,
experiment_id: Optional[str] = None,
keep_in_memory: bool = False,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[DownloadMode] = None,
revision: Optional[Union[str, Version]] = None,
**init_kwargs,
) -> EvaluationModule:
"""Load a [`~evaluate.EvaluationModule`].
Args:
path (`str`):
Path to the evaluation processing script with the evaluation builder. Can be either:
- a local path to a processing script or the directory containing the script (if the script has the same name as the directory),
e.g. `'./metrics/rouge'` or `'./metrics/rouge/rouge.py'`
- an evaluation module identifier on the HuggingFace evaluate repo, e.g. `'rouge'` or `'bleu'`, located in either `'metrics/'`,
`'comparisons/'`, or `'measurements/'` depending on the provided `module_type`
config_name (`str`, *optional*):
Selecting a configuration for the metric (e.g. the GLUE metric has a configuration for each subset).
module_type (`str`, default `'metric'`):
Type of evaluation module, can be one of `'metric'`, `'comparison'`, or `'measurement'`.
process_id (`int`, *optional*):
For distributed evaluation: id of the process.
num_process (`int`, *optional*):
For distributed evaluation: total number of processes.
cache_dir (`str`, *optional*):
Path to store the temporary predictions and references (default to `~/.cache/huggingface/evaluate/`).
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
keep_in_memory (`bool`):
Whether to store the temporary results in memory (defaults to `False`).
download_config ([`~evaluate.DownloadConfig`], *optional*):
Specific download configuration parameters.
download_mode ([`DownloadMode`], defaults to `REUSE_DATASET_IF_EXISTS`):
Download/generate mode.
revision (`Union[str, evaluate.Version]`, *optional*):
If specified, the module will be loaded from the module repository
at this version. By default it is set to the local version of the lib. Specifying a version that is different from
your local version of the lib might cause compatibility issues.
Returns:
[`evaluate.EvaluationModule`]
Example:
```py
>>> from evaluate import load
>>> accuracy = load("accuracy")
```
"""
download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
evaluation_module = evaluation_module_factory(
path, module_type=module_type, revision=revision, download_config=download_config, download_mode=download_mode
)
evaluation_cls = import_main_class(evaluation_module.module_path)
evaluation_instance = evaluation_cls(
config_name=config_name,
process_id=process_id,
num_process=num_process,
cache_dir=cache_dir,
keep_in_memory=keep_in_memory,
experiment_id=experiment_id,
hash=evaluation_module.hash,
**init_kwargs,
)
if module_type and module_type != evaluation_instance.module_type:
raise TypeError(
f"No module of module type '{module_type}' not found for '{path}' locally, or on the Hugging Face Hub. Found module of module type '{evaluation_instance.module_type}' instead."
)
# Download and prepare resources for the metric
evaluation_instance.download_and_prepare(download_config=download_config)
return evaluation_instance
# Copyright 2020 The HuggingFace Datasets Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
""" EvaluationModule base class."""
import collections
import itertools
import os
import types
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import pyarrow as pa
from datasets import DatasetInfo, DownloadConfig, DownloadManager
from datasets.arrow_dataset import Dataset
from datasets.arrow_reader import ArrowReader
from datasets.arrow_writer import ArrowWriter
from datasets.features import Features, Sequence, Value
from datasets.features.features import _check_non_null_non_empty_recursive
from datasets.utils.filelock import BaseFileLock, FileLock, Timeout
from datasets.utils.py_utils import copyfunc, temp_seed, zip_dict
from . import config
from .info import EvaluationModuleInfo
from .naming import camelcase_to_snakecase
from .utils.logging import get_logger
logger = get_logger(__name__)
class FileFreeLock(BaseFileLock):
"""Thread lock until a file **cannot** be locked"""
def __init__(self, lock_file, *args, **kwargs):
self.filelock = FileLock(lock_file)
super().__init__(lock_file, *args, **kwargs)
self._lock_file_fd = None
def _acquire(self):
try:
self.filelock.acquire(timeout=0.01, poll_interval=0.02) # Try to lock once
except Timeout:
# We couldn't acquire the lock, the file is locked!
self._lock_file_fd = self.filelock.lock_file
else:
# We were able to acquire the lock, the file is not yet locked!
self.filelock.release()
self._lock_file_fd = None
def _release(self):
self._lock_file_fd = None
@property
def is_locked(self) -> bool:
return self._lock_file_fd is not None
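# Usage sketch (see _check_all_processes_locks / _check_rendez_vous below): acquiring a FileFreeLock
# succeeds only once another process already holds the FileLock on the same lock file; if the file is
# free, acquire() keeps polling and eventually raises Timeout. The path below is a placeholder.
#
#   nofilelock = FileFreeLock("some-cache-file.arrow.lock")
#   nofilelock.acquire(timeout=10)  # raises Timeout if nobody holds the corresponding FileLock
#   nofilelock.release()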
# lists - summarize long lists similarly to NumPy
# arrays/tensors - let the frameworks control formatting
def summarize_if_long_list(obj):
if type(obj) is not list or len(obj) <= 6:
return f"{obj}"
def format_chunk(chunk):
return ", ".join(repr(x) for x in chunk)
return f"[{format_chunk(obj[:3])}, ..., {format_chunk(obj[-3:])}]"
class EvaluationModuleInfoMixin:
"""This base class exposes some attributes of EvaluationModuleInfo
at the base level of the EvaluationModule for easy access.
"""
def __init__(self, info: EvaluationModuleInfo):
self._module_info = info
@property
def info(self):
""":class:`evaluate.EvaluationModuleInfo` object containing all the metadata in the evaluation module."""
return self._module_info
@property
def name(self) -> str:
return self._module_info.module_name
@property
def experiment_id(self) -> Optional[str]:
return self._module_info.experiment_id
@property
def description(self) -> str:
return self._module_info.description
@property
def citation(self) -> str:
return self._module_info.citation
@property
def features(self) -> Features:
return self._module_info.features
@property
def inputs_description(self) -> str:
return self._module_info.inputs_description
@property
def homepage(self) -> Optional[str]:
return self._module_info.homepage
@property
def license(self) -> str:
return self._module_info.license
@property
def codebase_urls(self) -> Optional[List[str]]:
return self._module_info.codebase_urls
@property
def reference_urls(self) -> Optional[List[str]]:
return self._module_info.reference_urls
@property
def streamable(self) -> bool:
return self._module_info.streamable
@property
def format(self) -> Optional[str]:
return self._module_info.format
@property
def module_type(self) -> str:
return self._module_info.module_type
class EvaluationModule(EvaluationModuleInfoMixin):
"""A `EvaluationModule` is the base class and common API for metrics, comparisons, and measurements.
Args:
config_name (`str`):
This is used to define a hash specific to a module computation script and prevents the module's data
from being overridden when the module loading script is modified.
keep_in_memory (`bool`):
Keep all predictions and references in memory. Not possible in distributed settings.
cache_dir (`str`):
Path to a directory in which temporary prediction/references data will be stored.
The data directory should be located on a shared file-system in distributed setups.
num_process (`int`):
Specify the total number of nodes in a distributed setting.
This is useful to compute the module in distributed setups (in particular non-additive modules like F1).
process_id (`int`):
Specify the id of the current process in a distributed setup (between 0 and num_process-1).
This is useful to compute the module in distributed setups (in particular non-additive metrics like F1).
seed (`int`, optional):
If specified, this will temporarily set numpy's random seed when [`~evaluate.EvaluationModule.compute`] is run.
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute the module in distributed setups (in particular non-additive metrics like F1).
hash (`str`):
Used to identify the evaluation module according to the hashed file contents.
max_concurrent_cache_files (`int`):
Max number of concurrent module cache files (default `10000`).
timeout (`Union[int, float]`):
Timeout in seconds for distributed setting synchronization.
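Example (minimal usage sketch; the ``accuracy`` module stands in for any metric):
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> for ref, pred in zip([0, 1, 1, 0], [0, 1, 0, 0]):
...     accuracy.add(reference=ref, prediction=pred)
>>> accuracy.compute()
{'accuracy': 0.75}
```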
"""
def __init__(
self,
config_name: Optional[str] = None,
keep_in_memory: bool = False,
cache_dir: Optional[str] = None,
num_process: int = 1,
process_id: int = 0,
seed: Optional[int] = None,
experiment_id: Optional[str] = None,
hash: str = None,
max_concurrent_cache_files: int = 10000,
timeout: Union[int, float] = 100,
**kwargs,
):
# prepare info
self.config_name = config_name or "default"
info = self._info()
info.module_name = camelcase_to_snakecase(self.__class__.__name__)
info.config_name = self.config_name
info.experiment_id = experiment_id or "default_experiment"
EvaluationModuleInfoMixin.__init__(self, info) # For easy access on low level
# Safety checks on num_process and process_id
if not isinstance(process_id, int) or process_id < 0:
raise ValueError("'process_id' should be a number greater than 0")
if not isinstance(num_process, int) or num_process <= process_id:
raise ValueError("'num_process' should be a number greater than process_id")
if keep_in_memory and num_process != 1:
raise ValueError("Using 'keep_in_memory' is not possible in distributed setting (num_process > 1).")
self.num_process = num_process
self.process_id = process_id
self.max_concurrent_cache_files = max_concurrent_cache_files
self.keep_in_memory = keep_in_memory
self._data_dir_root = os.path.expanduser(cache_dir or config.HF_METRICS_CACHE)
self.data_dir = self._build_data_dir()
if seed is None:
_, seed, pos, *_ = np.random.get_state()
self.seed: int = seed[pos] if pos < 624 else seed[0]
else:
self.seed: int = seed
self.timeout: Union[int, float] = timeout
# Update 'compute' and 'add' docstring
# methods need to be copied otherwise it changes the docstrings of every instance
self.compute = types.MethodType(copyfunc(self.compute), self)
self.add_batch = types.MethodType(copyfunc(self.add_batch), self)
self.add = types.MethodType(copyfunc(self.add), self)
self.compute.__func__.__doc__ += self.info.inputs_description
self.add_batch.__func__.__doc__ += self.info.inputs_description
self.add.__func__.__doc__ += self.info.inputs_description
# self.arrow_schema = pa.schema(field for field in self.info.features.type)
self.selected_feature_format = None
self.buf_writer = None
self.writer = None
self.writer_batch_size = None
self.data = None
# This is the cache file we store our predictions/references in
# Keep it None for now so we can (cloud)pickle the object
self.cache_file_name = None
self.filelock = None
self.rendez_vous_lock = None
# This is all the cache files on which we have a lock when we are in a distributed setting
self.file_paths = None
self.filelocks = None
# This fingerprints the evaluation module according to the hashed contents of the module code
self._hash = hash
def __len__(self):
"""Return the number of examples (predictions or predictions/references pair)
currently stored in the evaluation module's cache.
"""
return 0 if self.writer is None else len(self.writer)
def __repr__(self):
return (
f'EvaluationModule(name: "{self.name}", module_type: "{self.module_type}", '
f'features: {self.features}, usage: """{self.inputs_description}""", '
f"stored examples: {len(self)})"
)
def _build_data_dir(self):
"""Path of this evaluation module in cache_dir:
Will be:
self._data_dir_root/self.name/self.config_name/self.hash (if not none)/
If any of these elements is missing, the corresponding subfolder is dropped.
"""
builder_data_dir = self._data_dir_root
builder_data_dir = os.path.join(builder_data_dir, self.name, self.config_name)
os.makedirs(builder_data_dir, exist_ok=True)
return builder_data_dir
def _create_cache_file(self, timeout=1) -> Tuple[str, FileLock]:
"""Create a new cache file. If the default cache file is used, we generated a new hash."""
file_path = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{self.process_id}.arrow")
filelock = None
for i in range(self.max_concurrent_cache_files):
filelock = FileLock(file_path + ".lock")
try:
filelock.acquire(timeout=timeout)
except Timeout:
# If we have reached the max number of attempts or we are not allowed to find a free name (distributed setup)
# We raise an error
if self.num_process != 1:
raise ValueError(
f"Error in _create_cache_file: another evaluation module instance is already using the local cache file at {file_path}. "
f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
f"between distributed evaluation module instances."
) from None
if i == self.max_concurrent_cache_files - 1:
raise ValueError(
f"Cannot acquire lock, too many evaluation module instance are operating concurrently on this file system."
f"You should set a larger value of max_concurrent_cache_files when creating the evaluation module "
f"(current value is {self.max_concurrent_cache_files})."
) from None
# In other cases (allowed to find a new file name + not yet at the max number of attempts) we can try to sample a new hashed name.
file_uuid = str(uuid.uuid4())
file_path = os.path.join(
self.data_dir, f"{self.experiment_id}-{file_uuid}-{self.num_process}-{self.process_id}.arrow"
)
else:
break
return file_path, filelock
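# Example of the resulting cache file name (using the default values): a single-process run with
# experiment_id="default_experiment", num_process=1 and process_id=0 writes to
# "<data_dir>/default_experiment-1-0.arrow"; if that file is already locked, a random UUID is
# inserted, e.g. "<data_dir>/default_experiment-<uuid4>-1-0.arrow".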
def _get_all_cache_files(self) -> Tuple[List[str], List[FileLock]]:
"""Get a lock on all the cache files in a distributed setup.
We wait for `timeout` seconds to let all the distributed nodes finish their tasks (default is 100 seconds).
"""
if self.num_process == 1:
if self.cache_file_name is None:
raise ValueError(
"Evaluation module cache file doesn't exist. Please make sure that you call `add` or `add_batch` "
"at least once before calling `compute`."
)
file_paths = [self.cache_file_name]
else:
file_paths = [
os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow")
for process_id in range(self.num_process)
]
# Let's acquire a lock on each process's file to be sure they are finished writing
filelocks = []
for process_id, file_path in enumerate(file_paths):
if process_id == 0: # process 0 already has its lock file
filelocks.append(self.filelock)
else:
filelock = FileLock(file_path + ".lock")
try:
filelock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(
f"Cannot acquire lock on cached file {file_path} for process {process_id}."
) from None
else:
filelocks.append(filelock)
return file_paths, filelocks
def _check_all_processes_locks(self):
expected_lock_file_names = [
os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow.lock")
for process_id in range(self.num_process)
]
for expected_lock_file_name in expected_lock_file_names:
nofilelock = FileFreeLock(expected_lock_file_name)
try:
nofilelock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(
f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
) from None
else:
nofilelock.release()
def _check_rendez_vous(self):
expected_lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-0.arrow.lock")
nofilelock = FileFreeLock(expected_lock_file_name)
try:
nofilelock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(
f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
) from None
else:
nofilelock.release()
lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
rendez_vous_lock = FileLock(lock_file_name)
try:
rendez_vous_lock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(f"Couldn't acquire lock on {lock_file_name} from process {self.process_id}.") from None
else:
rendez_vous_lock.release()
def _finalize(self):
"""Close all the writing process and load/gather the data
from all the nodes if main node or all_process is True.
"""
if self.writer is not None:
self.writer.finalize()
self.writer = None
# release the locks of the processes > 0 so that process 0 can lock them to read + delete the data
if self.filelock is not None and self.process_id > 0:
self.filelock.release()
if self.keep_in_memory:
# Read the predictions and references
reader = ArrowReader(path=self.data_dir, info=DatasetInfo(features=self.selected_feature_format))
self.data = Dataset.from_buffer(self.buf_writer.getvalue())
elif self.process_id == 0:
# Let's acquire a lock on each node's file to be sure they are finished writing
file_paths, filelocks = self._get_all_cache_files()
# Read the predictions and references
try:
reader = ArrowReader(path="", info=DatasetInfo(features=self.selected_feature_format))
self.data = Dataset(**reader.read_files([{"filename": f} for f in file_paths]))
except FileNotFoundError:
raise ValueError(
"Error in finalize: another evaluation module instance is already using the local cache file. "
"Please specify an experiment_id to avoid collision between distributed evaluation module instances."
) from None
# Store file paths and locks and we will release/delete them after the computation.
self.file_paths = file_paths
self.filelocks = filelocks
def compute(self, *, predictions=None, references=None, **kwargs) -> Optional[dict]:
"""Compute the evaluation module.
Usage of positional arguments is not allowed to prevent mistakes.
Args:
predictions (`list/array/tensor`, *optional*):
Predictions.
references (`list/array/tensor`, *optional*):
References.
**kwargs (optional):
Keyword arguments that will be forwarded to the evaluation module [`~evaluate.EvaluationModule.compute`]
method (see details in the docstring).
Return:
`dict` or `None`
- Dictionary with the results if this evaluation module is run on the main process (`process_id == 0`).
- `None` if the evaluation module is not run on the main process (`process_id != 0`).
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> accuracy.compute(predictions=[0, 1, 1, 0], references=[0, 1, 0, 1])
```
"""
all_kwargs = {"predictions": predictions, "references": references, **kwargs}
if predictions is None and references is None:
missing_kwargs = {k: None for k in self._feature_names() if k not in all_kwargs}
all_kwargs.update(missing_kwargs)
else:
missing_inputs = [k for k in self._feature_names() if k not in all_kwargs]
if missing_inputs:
raise ValueError(
f"Evaluation module inputs are missing: {missing_inputs}. All required inputs are {list(self._feature_names())}"
)
inputs = {input_name: all_kwargs[input_name] for input_name in self._feature_names()}
compute_kwargs = {k: kwargs[k] for k in kwargs if k not in self._feature_names()}
if any(v is not None for v in inputs.values()):
self.add_batch(**inputs)
self._finalize()
self.cache_file_name = None
self.filelock = None
self.selected_feature_format = None
if self.process_id == 0:
self.data.set_format(type=self.info.format)
inputs = {input_name: self.data[input_name] for input_name in self._feature_names()}
with temp_seed(self.seed):
output = self._compute(**inputs, **compute_kwargs)
if self.buf_writer is not None:
self.buf_writer = None
del self.data
self.data = None
else:
# Release locks and delete all the cache files. Process 0 is released last.
for filelock, file_path in reversed(list(zip(self.filelocks, self.file_paths))):
logger.info(f"Removing {file_path}")
del self.data
self.data = None
del self.writer
self.writer = None
os.remove(file_path)
filelock.release()
return output
else:
return None
def add_batch(self, *, predictions=None, references=None, **kwargs):
"""Add a batch of predictions and references for the evaluation module's stack.
Args:
predictions (`list/array/tensor`, *optional*):
Predictions.
references (`list/array/tensor`, *optional*):
References.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> for refs, preds in zip([[0,1],[0,1]], [[1,0],[0,1]]):
... accuracy.add_batch(references=refs, predictions=preds)
```
"""
bad_inputs = [input_name for input_name in kwargs if input_name not in self._feature_names()]
if bad_inputs:
raise ValueError(
f"Bad inputs for evaluation module: {bad_inputs}. All required inputs are {list(self._feature_names())}"
)
batch = {"predictions": predictions, "references": references, **kwargs}
batch = {input_name: batch[input_name] for input_name in self._feature_names()}
if self.writer is None:
self.selected_feature_format = self._infer_feature_from_batch(batch)
self._init_writer()
try:
for key, column in batch.items():
if len(column) > 0:
self._enforce_nested_string_type(self.selected_feature_format[key], column[0])
batch = self.selected_feature_format.encode_batch(batch)
self.writer.write_batch(batch)
except (pa.ArrowInvalid, TypeError):
if any(len(batch[c]) != len(next(iter(batch.values()))) for c in batch):
col0 = next(iter(batch))
bad_col = [c for c in batch if len(batch[c]) != len(batch[col0])][0]
error_msg = (
f"Mismatch in the number of {col0} ({len(batch[col0])}) and {bad_col} ({len(batch[bad_col])})"
)
elif set(self.selected_feature_format) != {"references", "predictions"}:
error_msg = (
f"Module inputs don't match the expected format.\n"
f"Expected format: {self.selected_feature_format },\n"
)
error_msg_inputs = ",\n".join(
f"Input {input_name}: {summarize_if_long_list(batch[input_name])}"
for input_name in self.selected_feature_format
)
error_msg += error_msg_inputs
else:
error_msg = (
f"Predictions and/or references don't match the expected format.\n"
f"Expected format: {self.selected_feature_format },\n"
f"Input predictions: {summarize_if_long_list(predictions)},\n"
f"Input references: {summarize_if_long_list(references)}"
)
raise ValueError(error_msg) from None
def add(self, *, prediction=None, reference=None, **kwargs):
"""Add one prediction and reference for the evaluation module's stack.
Args:
prediction (`list/array/tensor`, *optional*):
Predictions.
reference (`list/array/tensor`, *optional*):
References.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> accuracy.add(references=[0,1], predictions=[1,0])
```
"""
bad_inputs = [input_name for input_name in kwargs if input_name not in self._feature_names()]
if bad_inputs:
raise ValueError(
f"Bad inputs for evaluation module: {bad_inputs}. All required inputs are {list(self._feature_names())}"
)
example = {"predictions": prediction, "references": reference, **kwargs}
example = {input_name: example[input_name] for input_name in self._feature_names()}
if self.writer is None:
self.selected_feature_format = self._infer_feature_from_example(example)
self._init_writer()
try:
self._enforce_nested_string_type(self.selected_feature_format, example)
example = self.selected_feature_format.encode_example(example)
self.writer.write(example)
except (pa.ArrowInvalid, TypeError):
error_msg = (
f"Evaluation module inputs don't match the expected format.\n"
f"Expected format: {self.selected_feature_format},\n"
)
error_msg_inputs = ",\n".join(
f"Input {input_name}: {summarize_if_long_list(example[input_name])}"
for input_name in self.selected_feature_format
)
error_msg += error_msg_inputs
raise ValueError(error_msg) from None
def _infer_feature_from_batch(self, batch):
if isinstance(self.features, Features):
return self.features
else:
example = dict([(k, v[0]) for k, v in batch.items()])
return self._infer_feature_from_example(example)
def _infer_feature_from_example(self, example):
if isinstance(self.features, Features):
return self.features
else:
for features in self.features:
try:
self._enforce_nested_string_type(features, example)
features.encode_example(example)
return features
except (ValueError, TypeError):
continue
feature_strings = "\n".join([f"Feature option {i}: {feature}" for i, feature in enumerate(self.features)])
error_msg = (
f"Predictions and/or references don't match the expected format.\n"
f"Expected format:\n{feature_strings},\n"
f"Input predictions: {summarize_if_long_list(example['predictions'])},\n"
f"Input references: {summarize_if_long_list(example['references'])}"
)
raise ValueError(error_msg) from None
def _feature_names(self):
if isinstance(self.features, list):
feature_names = list(self.features[0].keys())
else:
feature_names = list(self.features.keys())
return feature_names
def _init_writer(self, timeout=1):
if self.num_process > 1:
if self.process_id == 0:
file_path = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
self.rendez_vous_lock = FileLock(file_path)
try:
self.rendez_vous_lock.acquire(timeout=timeout)
except TimeoutError:
raise ValueError(
f"Error in _init_writer: another evalution module instance is already using the local cache file at {file_path}. "
f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
f"between distributed evaluation module instances."
) from None
if self.keep_in_memory:
self.buf_writer = pa.BufferOutputStream()
self.writer = ArrowWriter(
features=self.selected_feature_format, stream=self.buf_writer, writer_batch_size=self.writer_batch_size
)
else:
self.buf_writer = None
# Get cache file name and lock it
if self.cache_file_name is None or self.filelock is None:
cache_file_name, filelock = self._create_cache_file() # get ready
self.cache_file_name = cache_file_name
self.filelock = filelock
self.writer = ArrowWriter(
features=self.selected_feature_format,
path=self.cache_file_name,
writer_batch_size=self.writer_batch_size,
)
# Setup rendez-vous here if needed (distributed setting)
if self.num_process > 1:
if self.process_id == 0:
self._check_all_processes_locks() # wait for everyone to be ready
self.rendez_vous_lock.release() # let everyone go
else:
self._check_rendez_vous() # wait for master to be ready and to let everyone go
def _info(self) -> EvaluationModuleInfo:
"""Construct the EvaluationModuleInfo object. See `EvaluationModuleInfo` for details.
Warning: This function is only called once and the result is cached for all
following .info() calls.
Returns:
info: (EvaluationModuleInfo) The EvaluationModule information
"""
raise NotImplementedError
def download_and_prepare(
self,
download_config: Optional[DownloadConfig] = None,
dl_manager: Optional[DownloadManager] = None,
):
"""Downloads and prepares evaluation module for reading.
Args:
download_config ([`DownloadConfig`], *optional*):
Specific download configuration parameters.
dl_manager ([`DownloadManager`], *optional*):
Specific download manager to use.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> accuracy.download_and_prepare()
```
"""
if dl_manager is None:
if download_config is None:
download_config = DownloadConfig()
download_config.cache_dir = os.path.join(self.data_dir, "downloads")
download_config.force_download = False
dl_manager = DownloadManager(
dataset_name=self.name, download_config=download_config, data_dir=self.data_dir
)
self._download_and_prepare(dl_manager)
def _download_and_prepare(self, dl_manager):
"""Downloads and prepares resources for the evaluation module.
This is the internal implementation to overwrite, called when the user calls
`download_and_prepare`. It should download all required resources for the evaluation module.
Args:
dl_manager (:class:`DownloadManager`): `DownloadManager` used to download and cache data.
"""
return None
def _compute(self, *, predictions=None, references=None, **kwargs) -> Dict[str, Any]:
"""This method defines the common API for all the evaluation module in the library"""
raise NotImplementedError
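# Illustrative sketch of a minimal subclass (not part of the library; the name "ExactMatch"
# and its behaviour are made up for demonstration only):
#
#   class ExactMatch(EvaluationModule):
#       def _info(self):
#           return EvaluationModuleInfo(
#               description="Fraction of predictions that equal their reference.",
#               citation="",
#               inputs_description="",
#               features=Features({"predictions": Value("int64"), "references": Value("int64")}),
#           )
#
#       def _compute(self, predictions, references):
#           matches = sum(p == r for p, r in zip(predictions, references))
#           return {"exact_match": matches / len(references)}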
def __del__(self):
if hasattr(self, "filelock") and self.filelock is not None:
self.filelock.release()
if hasattr(self, "rendez_vous_lock") and self.rendez_vous_lock is not None:
self.rendez_vous_lock.release()
if hasattr(self, "writer"): # in case it was already deleted
del self.writer
if hasattr(self, "data"): # in case it was already deleted
del self.data
def _enforce_nested_string_type(self, schema, obj):
"""
Recursively checks if there is any Value feature of type string and throws a TypeError if the corresponding object is not a string.
Since any Python object can be cast to a string, this avoids implicitly casting wrong input types (e.g. lists) to strings without an error.
"""
# Nested structures: we allow dict, list, tuples, sequences
if isinstance(schema, dict):
return [self._enforce_nested_string_type(sub_schema, o) for k, (sub_schema, o) in zip_dict(schema, obj)]
elif isinstance(schema, (list, tuple)):
sub_schema = schema[0]
return [self._enforce_nested_string_type(sub_schema, o) for o in obj]
elif isinstance(schema, Sequence):
# We allow reversing a list of dicts => dict of lists for compatibility with tfds
if isinstance(schema.feature, dict):
if isinstance(obj, (list, tuple)):
# obj is a list of dict
for k, dict_tuples in zip_dict(schema.feature, *obj):
for sub_obj in dict_tuples[1:]:
if _check_non_null_non_empty_recursive(sub_obj, dict_tuples[0]):
self._enforce_nested_string_type(dict_tuples[0], sub_obj)
break
return None
else:
# obj is a single dict
for k, (sub_schema, sub_objs) in zip_dict(schema.feature, obj):
for sub_obj in sub_objs:
if _check_non_null_non_empty_recursive(sub_obj, sub_schema):
self._enforce_nested_string_type(sub_schema, sub_obj)
break
return None
# schema.feature is not a dict
if isinstance(obj, str): # don't interpret a string as a list
raise ValueError(f"Got a string but expected a list instead: '{obj}'")
if obj is None:
return None
else:
if len(obj) > 0:
for first_elmt in obj:
if _check_non_null_non_empty_recursive(first_elmt, schema.feature):
break
if not isinstance(first_elmt, list):
return self._enforce_nested_string_type(schema.feature, first_elmt)
elif isinstance(schema, Value):
if pa.types.is_string(schema.pa_type) and not isinstance(obj, str):
raise TypeError(f"Expected type str but got {type(obj)}.")
class Metric(EvaluationModule):
"""A Metric is the base class and common API for all metrics.
Args:
config_name (`str`):
This is used to define a hash specific to a metric computation script and prevents the metric's data
from being overridden when the metric loading script is modified.
keep_in_memory (`bool`):
Keep all predictions and references in memory. Not possible in distributed settings.
cache_dir (`str`):
Path to a directory in which temporary prediction/references data will be stored.
The data directory should be located on a shared file-system in distributed setups.
num_process (`int`):
Specify the total number of nodes in a distributed setting.
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
process_id (`int`):
Specify the id of the current process in a distributed setup (between 0 and num_process-1)
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
seed (`int`, *optional*):
If specified, this will temporarily set numpy's random seed when [`~evaluate.Metric.compute`] is run.
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
max_concurrent_cache_files (`int`):
Max number of concurrent metric cache files (default `10000`).
timeout (`Union[int, float]`):
Timeout in seconds for distributed setting synchronization.
"""
class Comparison(EvaluationModule):
"""A Comparison is the base class and common API for all comparisons.
Args:
config_name (`str`):
This is used to define a hash specific to a comparison computation script and prevents the comparison's data
from being overridden when the comparison loading script is modified.
keep_in_memory (`bool`):
Keep all predictions and references in memory. Not possible in distributed settings.
cache_dir (`str`):
Path to a directory in which temporary prediction/references data will be stored.
The data directory should be located on a shared file-system in distributed setups.
num_process (`int`):
Specify the total number of nodes in a distributed setting.
This is useful to compute comparisons in distributed setups (in particular non-additive comparisons).
process_id (`int`):
Specify the id of the current process in a distributed setup (between 0 and num_process-1)
This is useful to compute comparisons in distributed setups (in particular non-additive comparisons).
seed (`int`, *optional*):
If specified, this will temporarily set numpy's random seed when [`~evaluate.Comparison.compute`] is run.
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute comparisons in distributed setups (in particular non-additive comparisons).
max_concurrent_cache_files (`int`):
Max number of concurrent comparison cache files (default `10000`).
timeout (`Union[int, float]`):
Timeout in seconds for distributed setting synchronization.
"""
class Measurement(EvaluationModule):
"""A Measurement is the base class and common API for all measurements.
Args:
config_name (`str`):
This is used to define a hash specific to a measurement computation script and prevents the measurement's data
from being overridden when the measurement loading script is modified.
keep_in_memory (`bool`):
Keep all predictions and references in memory. Not possible in distributed settings.
cache_dir (`str`):
Path to a directory in which temporary prediction/references data will be stored.
The data directory should be located on a shared file-system in distributed setups.
num_process (`int`):
Specify the total number of nodes in a distributed setting.
This is useful to compute measurements in distributed setups (in particular non-additive measurements).
process_id (`int`):
Specify the id of the current process in a distributed setup (between 0 and num_process-1)
This is useful to compute measurements in distributed setups (in particular non-additive measurements).
seed (`int`, *optional*):
If specified, this will temporarily set numpy's random seed when [`~evaluate.Measurement.compute`] is run.
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute measurements in distributed setups (in particular non-additive measurements).
max_concurrent_cache_files (`int`):
Max number of concurrent measurement cache files (default `10000`).
timeout (`Union[int, float]`):
Timeout in seconds for distributed setting synchronization.
"""
class CombinedEvaluations:
def __init__(self, evaluation_modules, force_prefix=False):
from .loading import load # avoid circular imports
self.evaluation_module_names = None
if isinstance(evaluation_modules, list):
self.evaluation_modules = evaluation_modules
elif isinstance(evaluation_modules, dict):
self.evaluation_modules = list(evaluation_modules.values())
self.evaluation_module_names = list(evaluation_modules.keys())
loaded_modules = []
for module in self.evaluation_modules:
if isinstance(module, str):
module = load(module)
loaded_modules.append(module)
self.evaluation_modules = loaded_modules
if self.evaluation_module_names is None:
self.evaluation_module_names = [module.name for module in self.evaluation_modules]
self.force_prefix = force_prefix
def add(self, prediction=None, reference=None, **kwargs):
"""Add one prediction and reference for each evaluation module's stack.
Args:
prediction (`list/array/tensor`, *optional*):
Prediction.
reference (`list/array/tensor`, *optional*):
Reference.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> f1 = evaluate.load("f1")
>>> clf_metrics = combine(["accuracy", "f1"])
>>> for ref, pred in zip([0,1,0,1], [1,0,0,1]):
... clf_metrics.add(references=ref, predictions=pred)
```
"""
for evaluation_module in self.evaluation_modules:
batch = {"predictions": prediction, "references": reference, **kwargs}
batch = {input_name: batch[input_name] for input_name in evaluation_module._feature_names()}
evaluation_module.add(**batch)
def add_batch(self, predictions=None, references=None, **kwargs):
"""Add a batch of predictions and references for each evaluation module's stack.
Args:
predictions (`list/array/tensor`, *optional*):
Predictions.
references (`list/array/tensor`, *optional*):
References.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> f1 = evaluate.load("f1")
>>> clf_metrics = combine(["accuracy", "f1"])
>>> for refs, preds in zip([[0,1],[0,1]], [[1,0],[0,1]]):
... clf_metrics.add_batch(references=refs, predictions=preds)
```
"""
for evaluation_module in self.evaluation_modules:
batch = {"predictions": predictions, "references": references, **kwargs}
batch = {input_name: batch[input_name] for input_name in evaluation_module._feature_names()}
evaluation_module.add_batch(**batch)
def compute(self, predictions=None, references=None, **kwargs):
"""Compute each evaluation module.
Usage of positional arguments is not allowed to prevent mistakes.
Args:
predictions (`list/array/tensor`, *optional*):
Predictions.
references (`list/array/tensor`, *optional*):
References.
**kwargs (*optional*):
Keyword arguments that will be forwarded to the evaluation module [`~evaluate.EvaluationModule.compute`]
method (see details in the docstring).
Return:
`dict` or `None`
- Dictionary with the results if this evaluation module is run on the main process (`process_id == 0`).
- `None` if the evaluation module is not run on the main process (`process_id != 0`).
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> f1 = evaluate.load("f1")
>>> clf_metrics = combine(["accuracy", "f1"])
>>> clf_metrics.compute(predictions=[0,1], references=[1,1])
{'accuracy': 0.5, 'f1': 0.6666666666666666}
```
"""
results = []
for evaluation_module in self.evaluation_modules:
batch = {"predictions": predictions, "references": references, **kwargs}
results.append(evaluation_module.compute(**batch))
return self._merge_results(results)
def _merge_results(self, results):
merged_results = {}
results_keys = list(itertools.chain.from_iterable([r.keys() for r in results]))
duplicate_keys = {item for item, count in collections.Counter(results_keys).items() if count > 1}
duplicate_names = [
item for item, count in collections.Counter(self.evaluation_module_names).items() if count > 1
]
duplicate_counter = {name: 0 for name in duplicate_names}
for module_name, result in zip(self.evaluation_module_names, results):
for k, v in result.items():
if k not in duplicate_keys and not self.force_prefix:
merged_results[f"{k}"] = v
elif module_name in duplicate_counter:
merged_results[f"{module_name}_{duplicate_counter[module_name]}_{k}"] = v
else:
merged_results[f"{module_name}_{k}"] = v
if module_name in duplicate_counter:
duplicate_counter[module_name] += 1
return merged_results
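# Merging sketch (names and values are illustrative): with module names ["accuracy", "f1"] and
# results [{"accuracy": 0.5}, {"f1": 0.6}], the merged dict is {"accuracy": 0.5, "f1": 0.6}.
# If both modules returned the key "score", the colliding keys would be prefixed with the module
# names instead: {"accuracy_score": ..., "f1_score": ...}; duplicate module names additionally
# get an integer counter, e.g. "f1_0_score" and "f1_1_score".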
def combine(evaluations, force_prefix=False):
"""Combines several metrics, comparisons, or measurements into a single `CombinedEvaluations` object that
can be used like a single evaluation module.
If two scores have the same name, they are prefixed with their module names.
If two modules have the same name, use a dictionary to give them different names; otherwise an integer id is appended to the prefix.
Args:
evaluations (`Union[list, dict]`):
A list or dictionary of evaluation modules. The modules can either be passed
as strings or loaded `EvaluationModule`s. If a dictionary is passed its keys are the names used and the values the modules.
The names are used as prefix in case there are name overlaps in the returned results of each module or if `force_prefix=True`.
force_prefix (`bool`, *optional*, defaults to `False`):
If `True`, all scores from the modules are prefixed with their name. If
a dictionary is passed, its keys are used as names, otherwise the module's name is used.
Examples:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> f1 = evaluate.load("f1")
>>> clf_metrics = combine(["accuracy", "f1"])
```
"""
return CombinedEvaluations(evaluations, force_prefix=force_prefix)
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Utilities for file names."""
import itertools
import os
import re
_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")
_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
_multiple_underscores_re = re.compile(r"(_{2,})")
_split_re = r"^\w+(\.\w+)*$"
def camelcase_to_snakecase(name):
"""Convert camel-case string to snake-case."""
name = _uppercase_uppercase_re.sub(r"\1_\2", name)
name = _lowercase_uppercase_re.sub(r"\1_\2", name)
return name.lower()
def snakecase_to_camelcase(name):
"""Convert snake-case string to camel-case string."""
name = _single_underscore_re.split(name)
name = [_multiple_underscores_re.split(n) for n in name]
return "".join(n.capitalize() for n in itertools.chain.from_iterable(name) if n != "")
def filename_prefix_for_name(name):
if os.path.basename(name) != name:
raise ValueError(f"Should be a dataset name, not a path: {name}")
return camelcase_to_snakecase(name)
def filename_prefix_for_split(name, split):
if os.path.basename(name) != name:
raise ValueError(f"Should be a dataset name, not a path: {name}")
if not re.match(_split_re, split):
raise ValueError(f"Split name should match '{_split_re}'' but got '{split}'.")
return f"{filename_prefix_for_name(name)}-{split}"
def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
prefix = filename_prefix_for_split(dataset_name, split)
if filetype_suffix:
prefix += f".{filetype_suffix}"
filepath = os.path.join(data_dir, prefix)
return f"{filepath}*"
def filename_for_dataset_split(dataset_name, split, filetype_suffix=None):
prefix = filename_prefix_for_split(dataset_name, split)
if filetype_suffix:
prefix += f".{filetype_suffix}"
return prefix
def filepath_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
filename = filename_for_dataset_split(
dataset_name=dataset_name,
split=split,
filetype_suffix=filetype_suffix,
)
filepath = os.path.join(data_dir, filename)
return filepath
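# Naming sketch (values are illustrative): for dataset_name="squad", split="train",
# data_dir="/data" and filetype_suffix="arrow":
#   filename_for_dataset_split(...)    -> "squad-train.arrow"
#   filepath_for_dataset_split(...)    -> "/data/squad-train.arrow"  (on POSIX)
#   filepattern_for_dataset_split(...) -> "/data/squad-train.arrow*"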
import json
import os
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from datasets.utils.filelock import FileLock
from . import __version__
def save(path_or_file, **data):
"""
Saves results to a JSON file. Also saves system information such as current time, current commit
hash if inside a repository, and Python system information.
Args:
path_or_file (`str`):
Path or file name to store the results in. If only a folder is provided,
the results file will be saved with a name of the format `"result-%Y_%m_%d-%H_%M_%S.json"`.
Example:
```py
>>> import evaluate
>>> result = {"bleu": 0.7}
>>> params = {"model": "gpt-2"}
>>> evaluate.save("./results/", **result, **params)
```
"""
current_time = datetime.now()
file_path = _setup_path(path_or_file, current_time)
data["_timestamp"] = current_time.isoformat()
data["_git_commit_hash"] = _git_commit_hash()
data["_evaluate_version"] = __version__
data["_python_version"] = sys.version
data["_interpreter_path"] = sys.executable
with FileLock(str(file_path) + ".lock"):
with open(file_path, "w") as f:
json.dump(data, f)
# cleanup lock file
try:
os.remove(str(file_path) + ".lock")
except FileNotFoundError:
pass
return file_path
def _setup_path(path_or_file, current_time):
path_or_file = Path(path_or_file)
is_file = len(path_or_file.suffix) > 0
if is_file:
folder = path_or_file.parent
file_name = path_or_file.name
else:
folder = path_or_file
file_name = "result-" + current_time.strftime("%Y_%m_%d-%H_%M_%S") + ".json"
folder.mkdir(parents=True, exist_ok=True)
return folder / file_name
def _git_commit_hash():
res = subprocess.run("git rev-parse --is-inside-work-tree".split(), cwd="./", stdout=subprocess.PIPE)
if res.stdout.decode().strip() == "true":
res = subprocess.run("git rev-parse HEAD".split(), cwd=os.getcwd(), stdout=subprocess.PIPE)
return res.stdout.decode().strip()
else:
return None
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# flake8: noqa
# Lint as: python3
"""Util import."""
__all__ = [
"disable_progress_bar",
"enable_progress_bar",
"is_progress_bar_enabled",
"infer_gradio_input_types",
"json_to_string_type",
"parse_readme",
"parse_gradio_data",
"parse_test_cases",
"launch_gradio_widget",
]
from .gradio import (
infer_gradio_input_types,
json_to_string_type,
launch_gradio_widget,
parse_gradio_data,
parse_readme,
parse_test_cases,
)
from .logging import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
import copy
import io
import json
import os
import posixpath
import re
import shutil
import sys
import tempfile
import time
import urllib
from contextlib import closing, contextmanager
from functools import partial
from hashlib import sha256
from pathlib import Path
from typing import List, Optional, Type, TypeVar, Union
from urllib.parse import urljoin, urlparse
import requests
from datasets import DownloadConfig
from datasets.utils.extract import ExtractManager
from datasets.utils.filelock import FileLock
from .. import __version__, config
from . import logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
INCOMPLETE_SUFFIX = ".incomplete"
T = TypeVar("T", str, Path)
def init_hf_modules(hf_modules_cache: Optional[Union[Path, str]] = None) -> str:
"""
Add hf_modules_cache to the python path.
By default hf_modules_cache='~/.cache/huggingface/modules'.
It can also be set with the environment variable HF_MODULES_CACHE.
This is used to add modules such as `datasets_modules`
"""
hf_modules_cache = hf_modules_cache if hf_modules_cache is not None else config.HF_MODULES_CACHE
hf_modules_cache = str(hf_modules_cache)
if hf_modules_cache not in sys.path:
sys.path.append(hf_modules_cache)
os.makedirs(hf_modules_cache, exist_ok=True)
if not os.path.exists(os.path.join(hf_modules_cache, "__init__.py")):
with open(os.path.join(hf_modules_cache, "__init__.py"), "w"):
pass
return hf_modules_cache
def is_remote_url(url_or_filename: str) -> bool:
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https", "s3", "gs", "hdfs", "ftp")
def is_local_path(url_or_filename: str) -> bool:
# On unix the scheme of a local path is empty (for both absolute and relative),
# while on windows the scheme is the drive name (ex: "c") for absolute paths.
# for details on the windows behavior, see https://bugs.python.org/issue42215
return urlparse(url_or_filename).scheme == "" or os.path.ismount(urlparse(url_or_filename).scheme + ":/")
def is_relative_path(url_or_filename: str) -> bool:
return urlparse(url_or_filename).scheme == "" and not os.path.isabs(url_or_filename)
def relative_to_absolute_path(path: T) -> T:
"""Convert relative path to absolute path."""
abs_path_str = os.path.abspath(os.path.expanduser(os.path.expandvars(str(path))))
return Path(abs_path_str) if isinstance(path, Path) else abs_path_str
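# Illustrative behaviour of the path/URL helpers above (a sketch, not a test suite):
# >>> is_remote_url("https://huggingface.co/datasets/squad/resolve/main/README.md")
# True
# >>> is_local_path("./some/local/metric.py")
# True
# >>> is_relative_path("some/local/metric.py")
# True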
def hf_bucket_url(identifier: str, filename: str, use_cdn=False, dataset=True) -> str:
if dataset:
endpoint = config.CLOUDFRONT_DATASETS_DISTRIB_PREFIX if use_cdn else config.S3_DATASETS_BUCKET_PREFIX
else:
endpoint = config.CLOUDFRONT_METRICS_DISTRIB_PREFIX if use_cdn else config.S3_METRICS_BUCKET_PREFIX
return "/".join((endpoint, identifier, filename))
def head_hf_s3(
identifier: str, filename: str, use_cdn=False, dataset=True, max_retries=0
) -> Union[requests.Response, Exception]:
return http_head(
hf_bucket_url(identifier=identifier, filename=filename, use_cdn=use_cdn, dataset=dataset),
max_retries=max_retries,
)
def hf_hub_url(path: str, name: str, revision: Optional[str] = None) -> str:
revision = revision or config.HUB_DEFAULT_VERSION
return config.HUB_EVALUATE_URL.format(path=path, name=name, revision=revision)
def url_or_path_join(base_name: str, *pathnames: str) -> str:
if is_remote_url(base_name):
return posixpath.join(base_name, *(str(pathname).replace(os.sep, "/").lstrip("/") for pathname in pathnames))
else:
return Path(base_name, *pathnames).as_posix()
def url_or_path_parent(url_or_path: str) -> str:
if is_remote_url(url_or_path):
return url_or_path[: url_or_path.rindex("/")]
else:
return os.path.dirname(url_or_path)
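# Sketch of how the join helpers behave (the remote URL below is only an example):
# >>> url_or_path_join("https://example.com/evaluate-metric/accuracy/resolve/main", "accuracy.py")
# 'https://example.com/evaluate-metric/accuracy/resolve/main/accuracy.py'
# >>> url_or_path_join("local", "folder", "metric.py")
# 'local/folder/metric.py'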
def hash_url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
    If the url ends with .py, '.py' is appended to the hashed name so that the cached
    module file keeps its Python extension (see the check at the end of this function).
"""
url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode("utf-8")
etag_hash = sha256(etag_bytes)
filename += "." + etag_hash.hexdigest()
if url.endswith(".py"):
filename += ".py"
return filename
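# A minimal sketch of the repeatable hashing (the URL is hypothetical):
# >>> name = hash_url_to_filename("https://example.com/metrics/accuracy.py", etag='"abc"')
# >>> name == hash_url_to_filename("https://example.com/metrics/accuracy.py", etag='"abc"')
# True
# >>> name.endswith(".py")  # the .py suffix is preserved so cached modules keep their extension
# True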
def cached_path(
url_or_filename,
download_config=None,
**download_kwargs,
) -> str:
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
Return:
Local path (string)
Raises:
FileNotFoundError: in case of non-recoverable file
(non-existent or no cache on disk)
ConnectionError: in case of unreachable url
and no cache on disk
ValueError: if it couldn't parse the url or filename correctly
requests.exceptions.ConnectionError: in case of internet connection issue
"""
if download_config is None:
download_config = DownloadConfig(**download_kwargs)
cache_dir = download_config.cache_dir or config.DOWNLOADED_EVALUATE_PATH
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if is_remote_url(url_or_filename):
# URL, so get it from the cache (downloading if necessary)
output_path = get_from_cache(
url_or_filename,
cache_dir=cache_dir,
force_download=download_config.force_download,
proxies=download_config.proxies,
resume_download=download_config.resume_download,
user_agent=download_config.user_agent,
local_files_only=download_config.local_files_only,
use_etag=download_config.use_etag,
max_retries=download_config.max_retries,
use_auth_token=download_config.use_auth_token,
ignore_url_params=download_config.ignore_url_params,
download_desc=download_config.download_desc,
)
elif os.path.exists(url_or_filename):
# File, and it exists.
output_path = url_or_filename
elif is_local_path(url_or_filename):
# File, but it doesn't exist.
raise FileNotFoundError(f"Local file {url_or_filename} doesn't exist")
else:
# Something unknown
raise ValueError(f"unable to parse {url_or_filename} as a URL or as a local path")
if output_path is None:
return output_path
if download_config.extract_compressed_file:
output_path = ExtractManager(cache_dir=download_config.cache_dir).extract(
output_path, force_extract=download_config.force_extract
)
return output_path
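# Hedged usage sketch for `cached_path` (the URL is a placeholder, not a real resource):
# >>> local_file = cached_path("https://example.com/metrics/accuracy/accuracy.py")  # downloaded and cached
# >>> local_file = cached_path("./my_metrics/accuracy.py")  # an existing local path is returned unchanged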
def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:
ua = f"datasets/{__version__}; python/{config.PY_VERSION}"
ua += f"; pyarrow/{config.PYARROW_VERSION}"
if config.TORCH_AVAILABLE:
ua += f"; torch/{config.TORCH_VERSION}"
if config.TF_AVAILABLE:
ua += f"; tensorflow/{config.TF_VERSION}"
if config.JAX_AVAILABLE:
ua += f"; jax/{config.JAX_VERSION}"
if isinstance(user_agent, dict):
ua += f"; {'; '.join(f'{k}/{v}' for k, v in user_agent.items())}"
elif isinstance(user_agent, str):
ua += "; " + user_agent
return ua
def get_authentication_headers_for_url(url: str, use_auth_token: Optional[Union[str, bool]] = None) -> dict:
"""Handle the HF authentication"""
headers = {}
if url.startswith(config.HF_ENDPOINT):
token = None
if isinstance(use_auth_token, str):
token = use_auth_token
elif bool(use_auth_token):
from huggingface_hub import hf_api
token = hf_api.HfFolder.get_token()
if token:
headers["authorization"] = f"Bearer {token}"
return headers
class OfflineModeIsEnabled(ConnectionError):
pass
def _raise_if_offline_mode_is_enabled(msg: Optional[str] = None):
"""Raise an OfflineModeIsEnabled error (subclass of ConnectionError) if HF_EVALUATE_OFFLINE is True."""
if config.HF_EVALUATE_OFFLINE:
raise OfflineModeIsEnabled(
"Offline mode is enabled." if msg is None else "Offline mode is enabled. " + str(msg)
)
def _retry(
func,
func_args: Optional[tuple] = None,
func_kwargs: Optional[dict] = None,
exceptions: Type[requests.exceptions.RequestException] = requests.exceptions.RequestException,
status_codes: Optional[List[int]] = None,
max_retries: int = 0,
base_wait_time: float = 0.5,
max_wait_time: float = 2,
):
func_args = func_args or ()
func_kwargs = func_kwargs or {}
retry = 0
while True:
try:
return func(*func_args, **func_kwargs)
except exceptions as err:
if retry >= max_retries or (status_codes and err.response.status_code not in status_codes):
raise err
else:
sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff
logger.info(f"{func} timed out, retrying in {sleep_time}s... [{retry/max_retries}]")
time.sleep(sleep_time)
retry += 1
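# Illustrative use of `_retry` (a sketch; `requests.get` and the URL are stand-ins):
# >>> _retry(
# ...     requests.get,
# ...     func_args=("https://example.com/health",),
# ...     exceptions=requests.exceptions.ConnectTimeout,
# ...     max_retries=3,
# ... )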
def _request_with_retry(
method: str,
url: str,
max_retries: int = 0,
base_wait_time: float = 0.5,
max_wait_time: float = 2,
timeout: float = 10.0,
**params,
) -> requests.Response:
"""Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff.
    Note that if the environment variable HF_EVALUATE_OFFLINE is set to 1, then an OfflineModeIsEnabled error is raised.
Args:
method (str): HTTP method, such as 'GET' or 'HEAD'.
url (str): The URL of the resource to fetch.
max_retries (int): Maximum number of retries, defaults to 0 (no retries).
base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between
retries then grows exponentially, capped by max_wait_time.
max_wait_time (float): Maximum amount of time between two retries, in seconds.
**params: Params to pass to :obj:`requests.request`.
"""
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
tries, success = 0, False
while not success:
tries += 1
try:
response = requests.request(method=method.upper(), url=url, timeout=timeout, **params)
success = True
except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as err:
if tries > max_retries:
raise err
else:
logger.info(f"{method} request to {url} timed out, retrying... [{tries/max_retries}]")
sleep_time = min(max_wait_time, base_wait_time * 2 ** (tries - 1)) # Exponential backoff
time.sleep(sleep_time)
return response
def ftp_head(url, timeout=10.0):
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
try:
with closing(urllib.request.urlopen(url, timeout=timeout)) as r:
r.read(1)
except Exception:
return False
return True
def ftp_get(url, temp_file, timeout=10.0):
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
try:
logger.info(f"Getting through FTP {url} into {temp_file.name}")
with closing(urllib.request.urlopen(url, timeout=timeout)) as r:
shutil.copyfileobj(r, temp_file)
except urllib.error.URLError as e:
raise ConnectionError(e) from None
def http_get(
url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0, desc=None
):
headers = copy.deepcopy(headers) or {}
headers["user-agent"] = get_datasets_user_agent(user_agent=headers.get("user-agent"))
if resume_size > 0:
headers["Range"] = f"bytes={resume_size:d}-"
response = _request_with_retry(
method="GET",
url=url,
stream=True,
proxies=proxies,
headers=headers,
cookies=cookies,
max_retries=max_retries,
timeout=timeout,
)
if response.status_code == 416: # Range not satisfiable
return
content_length = response.headers.get("Content-Length")
total = resume_size + int(content_length) if content_length is not None else None
with logging.tqdm(
unit="B",
unit_scale=True,
total=total,
initial=resume_size,
desc=desc or "Downloading",
disable=not logging.is_progress_bar_enabled(),
) as progress:
for chunk in response.iter_content(chunk_size=1024):
progress.update(len(chunk))
temp_file.write(chunk)
def http_head(
url, proxies=None, headers=None, cookies=None, allow_redirects=True, timeout=10.0, max_retries=0
) -> requests.Response:
headers = copy.deepcopy(headers) or {}
headers["user-agent"] = get_datasets_user_agent(user_agent=headers.get("user-agent"))
response = _request_with_retry(
method="HEAD",
url=url,
proxies=proxies,
headers=headers,
cookies=cookies,
allow_redirects=allow_redirects,
timeout=timeout,
max_retries=max_retries,
)
return response
def request_etag(url: str, use_auth_token: Optional[Union[str, bool]] = None) -> Optional[str]:
headers = get_authentication_headers_for_url(url, use_auth_token=use_auth_token)
response = http_head(url, headers=headers, max_retries=3)
response.raise_for_status()
etag = response.headers.get("ETag") if response.ok else None
return etag
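# Sketch: fetching the ETag of a hosted file (the URL is a placeholder).
# >>> etag = request_etag("https://example.com/some-space/resolve/main/app.py")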
def get_from_cache(
url,
cache_dir=None,
force_download=False,
proxies=None,
etag_timeout=100,
resume_download=False,
user_agent=None,
local_files_only=False,
use_etag=True,
max_retries=0,
use_auth_token=None,
ignore_url_params=False,
download_desc=None,
) -> str:
"""
Given a URL, look for the corresponding file in the local cache.
If it's not there, download it. Then return the path to the cached file.
Return:
Local path (string)
Raises:
FileNotFoundError: in case of non-recoverable file
(non-existent or no cache on disk)
ConnectionError: in case of unreachable url
and no cache on disk
"""
if cache_dir is None:
cache_dir = config.HF_EVALUATE_CACHE
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
os.makedirs(cache_dir, exist_ok=True)
if ignore_url_params:
# strip all query parameters and #fragments from the URL
cached_url = urljoin(url, urlparse(url).path)
else:
cached_url = url # additional parameters may be added to the given URL
connected = False
response = None
cookies = None
etag = None
head_error = None
    # Try a first time to find the file on the local file system without eTag (None)
# if we don't ask for 'force_download' then we spare a request
filename = hash_url_to_filename(cached_url, etag=None)
cache_path = os.path.join(cache_dir, filename)
if os.path.exists(cache_path) and not force_download and not use_etag:
return cache_path
# Prepare headers for authentication
headers = get_authentication_headers_for_url(url, use_auth_token=use_auth_token)
if user_agent is not None:
headers["user-agent"] = user_agent
# We don't have the file locally or we need an eTag
if not local_files_only:
if url.startswith("ftp://"):
connected = ftp_head(url)
try:
response = http_head(
url,
allow_redirects=True,
proxies=proxies,
timeout=etag_timeout,
max_retries=max_retries,
headers=headers,
)
if response.status_code == 200: # ok
etag = response.headers.get("ETag") if use_etag else None
for k, v in response.cookies.items():
# In some edge cases, we need to get a confirmation token
if k.startswith("download_warning") and "drive.google.com" in url:
url += "&confirm=" + v
cookies = response.cookies
connected = True
# Fix Google Drive URL to avoid Virus scan warning
if "drive.google.com" in url and "confirm=" not in url:
url += "&confirm=t"
# In some edge cases, head request returns 400 but the connection is actually ok
elif (
(response.status_code == 400 and "firebasestorage.googleapis.com" in url)
or (response.status_code == 405 and "drive.google.com" in url)
or (
response.status_code == 403
and (
re.match(r"^https?://github.com/.*?/.*?/releases/download/.*?/.*?$", url)
or re.match(r"^https://.*?s3.*?amazonaws.com/.*?$", response.url)
)
)
or (response.status_code == 403 and "ndownloader.figstatic.com" in url)
):
connected = True
logger.info(f"Couldn't get ETag version for url {url}")
elif response.status_code == 401 and config.HF_ENDPOINT in url and use_auth_token is None:
raise ConnectionError(
f"Unauthorized for URL {url}. Please use the parameter ``use_auth_token=True`` after logging in with ``huggingface-cli login``"
)
except (OSError, requests.exceptions.Timeout) as e:
# not connected
head_error = e
pass
    # connected is False: we don't have a connection, the url doesn't exist, or it is otherwise inaccessible.
# try to get the last downloaded one
if not connected:
if os.path.exists(cache_path) and not force_download:
return cache_path
if local_files_only:
raise FileNotFoundError(
f"Cannot find the requested files in the cached path at {cache_path} and outgoing traffic has been"
" disabled. To enable file online look-ups, set 'local_files_only' to False."
)
elif response is not None and response.status_code == 404:
raise FileNotFoundError(f"Couldn't find file at {url}")
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
if head_error is not None:
raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})")
elif response is not None:
raise ConnectionError(f"Couldn't reach {url} (error {response.status_code})")
else:
raise ConnectionError(f"Couldn't reach {url}")
# Try a second time
filename = hash_url_to_filename(cached_url, etag)
cache_path = os.path.join(cache_dir, filename)
if os.path.exists(cache_path) and not force_download:
return cache_path
# From now on, connected is True.
# Prevent parallel downloads of the same file with a lock.
lock_path = cache_path + ".lock"
with FileLock(lock_path):
if resume_download:
incomplete_path = cache_path + ".incomplete"
@contextmanager
def _resumable_file_manager():
with open(incomplete_path, "a+b") as f:
yield f
temp_file_manager = _resumable_file_manager
if os.path.exists(incomplete_path):
resume_size = os.stat(incomplete_path).st_size
else:
resume_size = 0
else:
temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
resume_size = 0
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with temp_file_manager() as temp_file:
logger.info(f"{url} not found in cache or force_download set to True, downloading to {temp_file.name}")
# GET file object
if url.startswith("ftp://"):
ftp_get(url, temp_file)
else:
http_get(
url,
temp_file,
proxies=proxies,
resume_size=resume_size,
headers=headers,
cookies=cookies,
max_retries=max_retries,
desc=download_desc,
)
logger.info(f"storing {url} in cache at {cache_path}")
shutil.move(temp_file.name, cache_path)
logger.info(f"creating metadata file for {cache_path}")
meta = {"url": url, "etag": etag}
meta_path = cache_path + ".json"
with open(meta_path, "w", encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
return cache_path
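# After a successful download, the cache directory holds related entries per URL
# (illustrative layout; <hash> is the filename produced by `hash_url_to_filename`):
#   <hash>        the downloaded file itself
#   <hash>.json   the {"url": ..., "etag": ...} metadata written above
#   <hash>.lock   the FileLock used to serialize concurrent downloads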
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + "\n\n" + (fn.__doc__ if fn.__doc__ is not None else "")
return fn
return docstring_decorator
def add_end_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = (fn.__doc__ if fn.__doc__ is not None else "") + "\n\n" + "".join(docstr)
return fn
return docstring_decorator
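# Minimal sketch of the docstring decorators (the function below is purely illustrative):
# >>> @add_start_docstrings("Shared introduction.")
# ... def my_fn():
# ...     """Specific details."""
# >>> my_fn.__doc__
# 'Shared introduction.\n\nSpecific details.'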
def estimate_dataset_size(paths):
return sum(path.stat().st_size for path in paths)
def readline(f: io.RawIOBase):
# From: https://github.com/python/cpython/blob/d27e2f4d118e7a9909b6a3e5da06c5ff95806a85/Lib/_pyio.py#L525
res = bytearray()
while True:
b = f.read(1)
if not b:
break
res += b
if res.endswith(b"\n"):
break
return bytes(res)
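# Sketch of `readline` on an in-memory buffer:
# >>> buf = io.BytesIO(b"first line\nsecond line")
# >>> readline(buf)
# b'first line\n'
# >>> readline(buf)
# b'second line'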
import json
import os
import re
import sys
from pathlib import Path
import numpy as np
from datasets import Value
from .logging import get_logger
logger = get_logger(__name__)
REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r]")
def infer_gradio_input_types(feature_types):
"""
Maps metric feature types to input types for gradio Dataframes:
        - float/int -> "number"
        - string -> "str"
        - any other -> "json"
    Note that "json" is not a native gradio type; such inputs are treated as strings
    that are then parsed as JSON.
"""
input_types = []
for feature_type in feature_types:
input_type = "json"
if isinstance(feature_type, Value):
if feature_type.dtype.startswith("int") or feature_type.dtype.startswith("float"):
input_type = "number"
elif feature_type.dtype == "string":
input_type = "str"
input_types.append(input_type)
return input_types
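# Illustrative mapping (a sketch using datasets.Value feature types):
# >>> infer_gradio_input_types([Value("int32"), Value("string"), Value("float32")])
# ['number', 'str', 'number']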
def json_to_string_type(input_types):
"""Maps json input type to str."""
return ["str" if i == "json" else i for i in input_types]
def parse_readme(filepath):
"""Parses a repositories README and removes"""
if not os.path.exists(filepath):
return "No README.md found."
with open(filepath, "r") as f:
text = f.read()
match = REGEX_YAML_BLOCK.search(text)
if match:
text = text[match.end() :]
return text
def parse_gradio_data(data, input_types):
"""Parses data from gradio Dataframe for use in metric."""
metric_inputs = {}
data.replace("", np.nan, inplace=True)
data.dropna(inplace=True)
for feature_name, input_type in zip(data, input_types):
if input_type == "json":
metric_inputs[feature_name] = [json.loads(d) for d in data[feature_name].to_list()]
elif input_type == "str":
metric_inputs[feature_name] = [d.strip('"') for d in data[feature_name].to_list()]
else:
metric_inputs[feature_name] = data[feature_name]
return metric_inputs
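# Sketch: gradio passes the Dataframe as a pandas DataFrame (the pandas import and the
# column names below are assumptions made for illustration):
# >>> import pandas as pd
# >>> df = pd.DataFrame({"predictions": ["1", "0"], "references": ["1", "1"]})
# >>> parse_gradio_data(df, ["json", "json"])
# {'predictions': [1, 0], 'references': [1, 1]}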
def parse_test_cases(test_cases, feature_names, input_types):
"""
    Parses test cases to be used in a gradio Dataframe. Note that double quotes are added
    around strings to follow the JSON format.
"""
if len(test_cases) == 0:
return None
examples = []
for test_case in test_cases:
parsed_cases = []
for feat, input_type in zip(feature_names, input_types):
if input_type == "json":
parsed_cases.append([str(element) for element in test_case[feat]])
elif input_type == "str":
parsed_cases.append(['"' + element + '"' for element in test_case[feat]])
else:
parsed_cases.append(test_case[feat])
examples.append([list(i) for i in zip(*parsed_cases)])
return examples
def launch_gradio_widget(metric):
"""Launches `metric` widget with Gradio."""
try:
import gradio as gr
except ImportError as error:
logger.error("To create a metric widget with Gradio make sure gradio is installed.")
raise error
local_path = Path(sys.path[0])
# if there are several input types, use first as default.
if isinstance(metric.features, list):
(feature_names, feature_types) = zip(*metric.features[0].items())
else:
(feature_names, feature_types) = zip(*metric.features.items())
gradio_input_types = infer_gradio_input_types(feature_types)
def compute(data):
return metric.compute(**parse_gradio_data(data, gradio_input_types))
iface = gr.Interface(
fn=compute,
inputs=gr.inputs.Dataframe(
headers=feature_names,
col_count=len(feature_names),
row_count=1,
datatype=json_to_string_type(gradio_input_types),
),
outputs=gr.outputs.Textbox(label=metric.name),
description=(
            metric.info.description + "\nIf this is a text-based metric, make sure to wrap your input in double quotes."
" Alternatively you can use a JSON-formatted list as input."
),
title=f"Metric: {metric.name}",
article=parse_readme(local_path / "README.md"),
# TODO: load test cases and use them to populate examples
# examples=[parse_test_cases(test_cases, feature_names, gradio_input_types)]
)
iface.launch()
# Copyright 2020 Optuna, Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Logging utilities. """
import logging
import os
from logging import CRITICAL # NOQA
from logging import DEBUG # NOQA
from logging import ERROR # NOQA
from logging import FATAL # NOQA
from logging import INFO # NOQA
from logging import NOTSET # NOQA
from logging import WARN # NOQA
from logging import WARNING # NOQA
from typing import Optional
from tqdm import auto as tqdm_lib
log_levels = {
"debug": logging.DEBUG,
"info": logging.INFO,
"warning": logging.WARNING,
"error": logging.ERROR,
"critical": logging.CRITICAL,
}
_default_log_level = logging.WARNING
def _get_default_logging_level():
"""
If EVALUATE_VERBOSITY env var is set to one of the valid choices return that as the new default level.
If it is not - fall back to ``_default_log_level``
"""
env_level_str = os.getenv("EVALUATE_VERBOSITY", None)
if env_level_str:
if env_level_str in log_levels:
return log_levels[env_level_str]
else:
logging.getLogger().warning(
f"Unknown option EVALUATE_VERBOSITY={env_level_str}, "
f"has to be one of: { ', '.join(log_levels.keys()) }"
)
return _default_log_level
def _get_library_name() -> str:
return __name__.split(".")[0]
def _get_library_root_logger() -> logging.Logger:
return logging.getLogger(_get_library_name())
def _configure_library_root_logger() -> None:
# Apply our default configuration to the library root logger.
library_root_logger = _get_library_root_logger()
library_root_logger.setLevel(_get_default_logging_level())
def _reset_library_root_logger() -> None:
library_root_logger = _get_library_root_logger()
library_root_logger.setLevel(logging.NOTSET)
def get_logger(name: Optional[str] = None) -> logging.Logger:
"""Return a logger with the specified name."""
if name is None:
name = _get_library_name()
return logging.getLogger(name)
def get_verbosity() -> int:
"""Return the current level for the Hugging Face Evaluate library's root logger.
Returns:
Logging level, e.g., `evaluate.logging.DEBUG` and `evaluate.logging.INFO`.
<Tip>
    The Hugging Face Evaluate library has the following logging levels:
- `evaluate.logging.CRITICAL`, `evaluate.logging.FATAL`
- `evaluate.logging.ERROR`
- `evaluate.logging.WARNING`, `evaluate.logging.WARN`
- `evaluate.logging.INFO`
- `evaluate.logging.DEBUG`
</Tip>
"""
return _get_library_root_logger().getEffectiveLevel()
def set_verbosity(verbosity: int) -> None:
"""Set the level for the Hugging Face Evaluate library's root logger.
Args:
verbosity:
Logging level, e.g., `evaluate.logging.DEBUG` and `evaluate.logging.INFO`.
"""
_get_library_root_logger().setLevel(verbosity)
def set_verbosity_info():
"""Set the level for the Hugging Face Evaluate library's root logger to `INFO`.
This will display most of the logging information and tqdm bars.
Shortcut to `evaluate.logging.set_verbosity(evaluate.logging.INFO)`.
"""
return set_verbosity(INFO)
def set_verbosity_warning():
"""Set the level for the Hugging Face Evaluate library's root logger to `WARNING`.
This will display only the warning and errors logging information and tqdm bars.
Shortcut to `evaluate.logging.set_verbosity(evaluate.logging.WARNING)`.
"""
return set_verbosity(WARNING)
def set_verbosity_debug():
"""Set the level for the Hugging Face Evaluate library's root logger to `DEBUG`.
This will display all the logging information and tqdm bars.
Shortcut to `evaluate.logging.set_verbosity(evaluate.logging.DEBUG)`.
"""
return set_verbosity(DEBUG)
def set_verbosity_error():
"""Set the level for the Hugging Face Evaluate library's root logger to `ERROR`.
This will display only the errors logging information and tqdm bars.
Shortcut to `evaluate.logging.set_verbosity(evaluate.logging.ERROR)`.
"""
return set_verbosity(ERROR)
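# Typical usage (a sketch following the shortcuts referenced in the docstrings above):
# >>> import evaluate
# >>> evaluate.logging.set_verbosity_info()
# >>> evaluate.logging.get_verbosity() == evaluate.logging.INFO
# True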
def disable_propagation() -> None:
"""Disable propagation of the library log outputs.
Note that log propagation is disabled by default.
"""
_get_library_root_logger().propagate = False
def enable_propagation() -> None:
"""Enable propagation of the library log outputs.
Please disable the Hugging Face Evaluate library's default handler to prevent double logging if the root logger has
been configured.
"""
_get_library_root_logger().propagate = True
# Configure the library root logger at the module level (singleton-like)
_configure_library_root_logger()
class EmptyTqdm:
"""Dummy tqdm which doesn't do anything."""
def __init__(self, *args, **kwargs): # pylint: disable=unused-argument
self._iterator = args[0] if args else None
def __iter__(self):
return iter(self._iterator)
def __getattr__(self, _):
"""Return empty function."""
def empty_fn(*args, **kwargs): # pylint: disable=unused-argument
return
return empty_fn
def __enter__(self):
return self
def __exit__(self, type_, value, traceback):
return
_tqdm_active = True
class _tqdm_cls:
def __call__(self, *args, **kwargs):
if _tqdm_active:
return tqdm_lib.tqdm(*args, **kwargs)
else:
return EmptyTqdm(*args, **kwargs)
def set_lock(self, *args, **kwargs):
self._lock = None
if _tqdm_active:
return tqdm_lib.tqdm.set_lock(*args, **kwargs)
def get_lock(self):
if _tqdm_active:
return tqdm_lib.tqdm.get_lock()
tqdm = _tqdm_cls()
def is_progress_bar_enabled() -> bool:
"""Return a boolean indicating whether tqdm progress bars are enabled."""
global _tqdm_active
return bool(_tqdm_active)
def enable_progress_bar():
"""Enable tqdm progress bar."""
global _tqdm_active
_tqdm_active = True
def disable_progress_bar():
"""Enable tqdm progress bar."""
global _tqdm_active
_tqdm_active = False
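# Sketch: toggling tqdm progress bars from user code via the exported helpers.
# >>> from evaluate.utils import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled
# >>> disable_progress_bar()
# >>> is_progress_bar_enabled()
# False
# >>> enable_progress_bar()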
import textwrap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
class ComplexRadar:
"""Create a complex radar chart with different scales for each variable
Args:
fig (`matplotlib.figure`) : A matplotlib figure object to add the axes on.
        variables (`list`): a list of variables to plot.
        ranges (`list` of `tuple`): a list of ranges (min, max), one for each variable.
        n_ring_levels (`int`): number of ordinate or ring levels to draw.
            Default: 5.
        show_scales (`bool`): indicates whether the ranges for each variable are plotted.
            Default: True.
format_cfg (`dict`): A dictionary with formatting configurations.
Default: None.
Returns:
`matplotlib.figure.Figure`: a radar plot.
"""
def __init__(self, fig, variables, ranges, n_ring_levels=5, show_scales=True, format_cfg=None):
self.format_cfg = format_cfg
        # Calculate angles and create one axes object per variable.
        # Note the trick of creating the first axes element twice (len + 1).
angles = np.arange(0, 360, 360.0 / len(variables))
axes = [
fig.add_axes([0.1, 0.1, 0.9, 0.9], polar=True, label="axes{}".format(i), **self.format_cfg["axes_args"])
for i in range(len(variables) + 1)
]
# Ensure clockwise rotation (first variable at the top N)
for ax in axes:
ax.set_theta_zero_location("N")
ax.set_theta_direction(-1)
ax.set_axisbelow(True)
# Writing the ranges on each axes
for i, ax in enumerate(axes):
# Here we do the trick by repeating the first iteration
j = 0 if (i == 0 or i == 1) else i - 1
ax.set_ylim(*ranges[j])
# Set endpoint to True if you like to have values right before the last circle
grid = np.linspace(*ranges[j], num=n_ring_levels, endpoint=self.format_cfg["incl_endpoint"])
gridlabel = ["{}".format(round(x, 2)) for x in grid]
gridlabel[0] = "" # remove values from the center
lines, labels = ax.set_rgrids(
grid, labels=gridlabel, angle=angles[j], **self.format_cfg["rgrid_tick_lbls_args"]
)
ax.set_ylim(*ranges[j])
ax.spines["polar"].set_visible(False)
ax.grid(visible=False)
if show_scales is False:
ax.set_yticklabels([])
        # Make all axes except the first one invisible
for ax in axes[1:]:
ax.patch.set_visible(False)
ax.xaxis.set_visible(False)
# Setting the attributes
self.angle = np.deg2rad(np.r_[angles, angles[0]])
self.ranges = ranges
self.ax = axes[0]
self.ax1 = axes[1]
self.plot_counter = 0
# Draw (inner) circles and lines
self.ax.yaxis.grid(**self.format_cfg["rad_ln_args"])
# Draw outer circle
self.ax.spines["polar"].set(**self.format_cfg["outer_ring"])
# Draw angle lines
self.ax.xaxis.grid(**self.format_cfg["angle_ln_args"])
# ax1 is the duplicate of axes[0] (self.ax)
# Remove everything from ax1 except the plot itself
self.ax1.axis("off")
self.ax1.set_zorder(9)
# Create the outer labels for each variable
l, text = self.ax.set_thetagrids(angles, labels=variables)
# Beautify them
labels = [t.get_text() for t in self.ax.get_xticklabels()]
labels = [
"\n".join(
textwrap.wrap(
label,
self.format_cfg["theta_tick_lbls_txt_wrap"],
break_long_words=self.format_cfg["theta_tick_lbls_brk_lng_wrds"],
)
)
for label in labels
]
self.ax.set_xticklabels(labels, **self.format_cfg["theta_tick_lbls"])
for t, a in zip(self.ax.get_xticklabels(), angles):
if a == 0:
t.set_ha("center")
elif a > 0 and a < 180:
t.set_ha("left")
elif a == 180:
t.set_ha("center")
else:
t.set_ha("right")
self.ax.tick_params(axis="both", pad=self.format_cfg["theta_tick_lbls_pad"])
def _scale_data(self, data, ranges):
"""Scales data[1:] to ranges[0]"""
for d, (y1, y2) in zip(data[1:], ranges[1:]):
assert (y1 <= d <= y2) or (y2 <= d <= y1)
x1, x2 = ranges[0]
d = data[0]
sdata = [d]
for d, (y1, y2) in zip(data[1:], ranges[1:]):
sdata.append((d - y1) / (y2 - y1) * (x2 - x1) + x1)
return sdata
def plot(self, data, *args, **kwargs):
"""Plots a line"""
sdata = self._scale_data(data, self.ranges)
self.ax1.plot(self.angle, np.r_[sdata, sdata[0]], *args, **kwargs)
self.plot_counter = self.plot_counter + 1
def use_legend(self, *args, **kwargs):
"""Shows a legend"""
self.ax1.legend(*args, **kwargs)
def radar_plot(data, model_names, invert_range=[], config=None, fig=None):
"""Create a complex radar chart with different scales for each variable
Source: https://towardsdatascience.com/how-to-create-and-visualize-complex-radar-charts-f7764d0f3652
Args:
data (`List[dict]`): the results (list of metric + value pairs).
E.g. data = [{"accuracy": 0.9, "precision":0.8},{"accuracy": 0.7, "precision":0.6}]
        model_names (`List[str]`): model names.
            E.g. model_names = ["model1", "model2", ...]
        invert_range (`List[str]`, optional): the metrics to invert (in cases when smaller is better, e.g. speed)
E.g. invert_range=["latency_in_seconds"]
config (`dict`, optional) : a specification of the formatting configurations, namely:
- rad_ln_args (`dict`, default `{"visible": True}`): The visibility of the radial (circle) lines.
- outer_ring (`dict`, default `{"visible": True}`): The visibility of the outer ring.
- angle_ln_args (`dict`, default `{"visible": True}`): The visibility of the angle lines.
- rgrid_tick_lbls_args (`dict`, default `{"fontsize": 12}`): The font size of the tick labels on the scales.
- theta_tick_lbls (`dict`, default `{"fontsize": 12}`): The font size of the variable labels on the plot.
- theta_tick_lbls_pad (`int`, default `3`): The padding of the variable labels on the plot.
- theta_tick_lbls_brk_lng_wrds (`bool`, default `True` ): Whether long words in the label are broken up or not.
- theta_tick_lbls_txt_wrap (`int`, default `15`): Text wrap for tick labels
            - incl_endpoint (`bool`, default `False`): Include value endpoints on the scales.
            - marker (`str`, default `"o"`): the shape of the marker used in the radar plot.
            - markersize (`int`, default `3`): the size of the marker used in the radar plot.
            - legend_loc (`str`, default `"upper right"`): the location of the legend in the radar plot. Must be one of: 'upper left', 'upper right', 'lower left', 'lower right'.
            - bbox_to_anchor (`tuple`, default `(2, 1)`): anchor for the legend.
fig (`matplotlib.figure.Figure`, optional): figure used to plot the radar plot.
Returns:
`matplotlib.figure.Figure`
"""
data = pd.DataFrame(data)
data.index = model_names
variables = data.keys()
if all(x in variables for x in invert_range) is False:
raise ValueError("All of the metrics in `invert_range` should be in the data provided.")
min_max_per_variable = data.describe().T[["min", "max"]]
min_max_per_variable["min"] = min_max_per_variable["min"] - 0.1 * (
min_max_per_variable["max"] - min_max_per_variable["min"]
)
min_max_per_variable["max"] = min_max_per_variable["max"] + 0.1 * (
min_max_per_variable["max"] - min_max_per_variable["min"]
)
ranges = list(min_max_per_variable.itertuples(index=False, name=None))
ranges = [
(max_value, min_value) if var in invert_range else (min_value, max_value)
for var, (min_value, max_value) in zip(variables, ranges)
]
format_cfg = {
"axes_args": {},
"rad_ln_args": {"visible": True},
"outer_ring": {"visible": True},
"angle_ln_args": {"visible": True},
"rgrid_tick_lbls_args": {"fontsize": 12},
"theta_tick_lbls": {"fontsize": 12},
"theta_tick_lbls_pad": 3,
"theta_tick_lbls_brk_lng_wrds": True,
"theta_tick_lbls_txt_wrap": 15,
"incl_endpoint": False,
"marker": "o",
"markersize": 3,
"legend_loc": "upper right",
"bbox_to_anchor": (2, 1),
}
if config is not None:
format_cfg.update(config)
if fig is None:
fig = plt.figure()
radar = ComplexRadar(
fig,
variables,
ranges,
n_ring_levels=3,
show_scales=True,
format_cfg=format_cfg,
)
    for model_name in data.index:
        radar.plot(data.loc[model_name].values, label=model_name, marker=format_cfg["marker"], markersize=format_cfg["markersize"])
radar.use_legend(**{"loc": format_cfg["legend_loc"], "bbox_to_anchor": format_cfg["bbox_to_anchor"]})
return fig
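# A minimal usage sketch for `radar_plot` (the numbers are made up for illustration):
# >>> results = [
# ...     {"accuracy": 0.9, "precision": 0.8, "latency_in_seconds": 48.1},
# ...     {"accuracy": 0.7, "precision": 0.6, "latency_in_seconds": 51.4},
# ... ]
# >>> fig = radar_plot(results, model_names=["model_1", "model_2"], invert_range=["latency_in_seconds"])
# >>> fig.show()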
{
"module_name": "Awesome Module",
"module_type": "module",
"module_description": "This new module is designed to solve this great ML task and is crafted with a lot of care and love.",
"module_slug": "{{ cookiecutter.module_name|lower|replace(' ', '_') }}",
"module_class_name": "{{ cookiecutter.module_name|replace(' ', '') }}",
"namespace": "",
"dataset_name": ""
}
---
title: {{ cookiecutter.module_name }}
datasets:
- {{ cookiecutter.dataset_name }}
tags:
- evaluate
- {{ cookiecutter.module_type }}
description: "TODO: add a description here"
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
---
# {{ cookiecutter.module_type|capitalize }} Card for {{ cookiecutter.module_name }}
***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing {{ cookiecutter.module_type }} cards if you'd like examples.*
## {{ cookiecutter.module_type|capitalize }} Description
*Give a brief overview of this {{ cookiecutter.module_type }}, including what task(s) it is usually used for, if any.*
## How to Use
*Give a general statement of how to use the {{ cookiecutter.module_type }}*
*Provide the simplest possible example of using the {{ cookiecutter.module_type }}*
### Inputs
*List all input arguments in the format below*
- **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
### Output Values
*Explain what this {{ cookiecutter.module_type }} outputs and provide an example of what the {{ cookiecutter.module_type }} output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}*
*State the range of possible values that the {{ cookiecutter.module_type }}'s output can take, as well as what in that range is considered good. For example: "This {{ cookiecutter.module_type }} can take on any value between 0 and 100, inclusive. Higher scores are better."*
#### Values from Popular Papers
*Give examples, preferably with links to leaderboards or publications, of papers that have reported this {{ cookiecutter.module_type }}, along with the values they have reported.*
### Examples
*Give code examples of the {{ cookiecutter.module_type }} being used. Try to include examples that clear up any potential ambiguity left from the {{ cookiecutter.module_type }} description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
## Limitations and Bias
*Note any known limitations or biases that the {{ cookiecutter.module_type }} has, with links and references if possible.*
## Citation
*Cite the source where this {{ cookiecutter.module_type }} was introduced.*
## Further References
*Add any useful further references.*
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("{{ cookiecutter.namespace }}/{{ cookiecutter.module_slug }}")
launch_gradio_widget(module)