# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
from datasets import Dataset
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_start_docstrings
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
TASK_DOCUMENTATION_KWARGS = r"""
input_column (`str`, defaults to `"text"`):
The name of the column containing the input text in the dataset specified by `data`.
label_column (`str`, defaults to `"label"`):
The name of the column containing the labels in the dataset specified by `data`.
generation_kwargs (`Dict`, *optional*, defaults to `None`):
The generation kwargs are passed to the pipeline and set the text generation strategy.
"""
TEXT2TEXT_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text2text-generation")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> metric="rouge",
>>> )
```
"""
SUMMARIZATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("summarization")
>>> data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:40]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="facebook/bart-large-cnn",
>>> data=data,
>>> input_column="article",
>>> label_column="highlights",
>>> )
```
"""
TRANSLATION_TASK_DOCSTRING_EXAMPLE = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("translation")
>>> data = load_dataset("wmt19", "fr-de", split="validation[:40]")
>>> data = data.map(lambda x: {"text": x["translation"]["de"], "label": x["translation"]["fr"]})
>>> results = task_evaluator.compute(
>>> model_or_pipeline="Helsinki-NLP/opus-mt-de-fr",
>>> data=data,
>>> )
```
"""
class Text2TextGenerationEvaluator(Evaluator):
"""
Text2Text generation evaluator.
This Text2Text generation evaluator can currently be loaded from [`evaluator`] using the default task name
`text2text-generation`.
Methods in this class assume a data format compatible with the [`~transformers.Text2TextGenerationPipeline`].
"""
PREDICTION_PREFIX = "generated"
PIPELINE_KWARGS = {"truncation": True}
def __init__(self, task="text2text-generation", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def predictions_processor(self, predictions, label_mapping):
return {"predictions": [pred[f"{self.PREDICTION_PREFIX}_text"] for pred in predictions]}
@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TEXT2TEXT_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "text",
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
if generation_kwargs is not None:
self.PIPELINE_KWARGS.update(generation_kwargs)
result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
subset=subset,
split=split,
metric=metric,
tokenizer=tokenizer,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
device=device,
random_state=random_state,
input_column=input_column,
label_column=label_column,
)
return result
class SummarizationEvaluator(Text2TextGenerationEvaluator):
"""
Text summarization evaluator.
This text summarization evaluator can currently be loaded from [`evaluator`] using the default task name
`summarization`.
Methods in this class assume a data format compatible with the [`~transformers.SummarizationPipeline`].
"""
PREDICTION_PREFIX = "summary"
PIPELINE_KWARGS = {"truncation": True}
def __init__(self, task="summarization", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
SUMMARIZATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "text",
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
subset=subset,
split=split,
metric=metric,
tokenizer=tokenizer,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
device=device,
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)
return result
class TranslationEvaluator(Text2TextGenerationEvaluator):
"""
Translation evaluator.
This translation generation evaluator can currently be loaded from [`evaluator`] using the default task name
`translation`.
Methods in this class assume a data format compatible with the [`~transformers.TranslationPipeline`].
"""
PREDICTION_PREFIX = "translation"
PIPELINE_KWARGS = {"truncation": True}
def __init__(self, task="translation", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
@add_start_docstrings(
EVALUTOR_COMPUTE_START_DOCSTRING,
TASK_DOCUMENTATION_KWARGS,
EVALUATOR_COMPUTE_RETURN_DOCSTRING,
TRANSLATION_TASK_DOCSTRING_EXAMPLE,
)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "text",
label_column: str = "label",
generation_kwargs: dict = None,
) -> Tuple[Dict[str, float], Any]:
result = super().compute(
model_or_pipeline=model_or_pipeline,
data=data,
subset=subset,
split=split,
metric=metric,
tokenizer=tokenizer,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
device=device,
random_state=random_state,
input_column=input_column,
label_column=label_column,
generation_kwargs=generation_kwargs,
)
return result
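# Illustrative sketch, not part of the original module: the three evaluators above mainly differ in
# `PREDICTION_PREFIX`, which selects the key read from the pipeline output ("generated_text",
# "summary_text" or "translation_text"). Assumes the optional evaluator dependencies
# (e.g. transformers) are installed so the class can be instantiated.
if __name__ == "__main__":
    _t2t_evaluator = Text2TextGenerationEvaluator()
    _raw_pipeline_output = [{"generated_text": "hello world"}]
    assert _t2t_evaluator.predictions_processor(_raw_pipeline_output, label_mapping=None) == {
        "predictions": ["hello world"]
    }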
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from numbers import Number
from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Union
from datasets import Dataset, load_dataset
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_end_docstrings, add_start_docstrings
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
from .utils import DatasetColumnPair
if TYPE_CHECKING:
from transformers import FeatureExtractionMixin, Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
TASK_DOCUMENTATION = r"""
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("text-classification")
>>> data = load_dataset("imdb", split="test[:2]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="huggingface/prunebert-base-uncased-6-finepruned-w-distil-mnli",
>>> data=data,
>>> metric="accuracy",
>>> label_mapping={"LABEL_0": 0.0, "LABEL_1": 1.0},
>>> strategy="bootstrap",
>>> n_resamples=10,
>>> random_state=0
>>> )
```
"""
class TextClassificationEvaluator(Evaluator):
"""
Text classification evaluator.
This text classification evaluator can currently be loaded from [`evaluator`] using the default task name
`text-classification` or with a `"sentiment-analysis"` alias.
Methods in this class assume a data format compatible with the [`~transformers.TextClassificationPipeline`] - a single textual
feature as input and a categorical label as output.
"""
PIPELINE_KWARGS = {"truncation": True}
def __init__(self, task="text-classification", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def prepare_data(self, data: Union[str, Dataset], input_column: str, second_input_column: str, label_column: str):
if data is None:
raise ValueError(
"Please specify a valid `data` object - either a `str` with a name or a `Dataset` object."
)
self.check_required_columns(data, {"input_column": input_column, "label_column": label_column})
if second_input_column is not None:
self.check_required_columns(data, {"second_input_column": second_input_column})
data = load_dataset(data) if isinstance(data, str) else data
return {"references": data[label_column]}, DatasetColumnPair(
data, input_column, second_input_column, "text", "text_pair"
)
def predictions_processor(self, predictions, label_mapping):
predictions = [
label_mapping[element["label"]] if label_mapping is not None else element["label"]
for element in predictions
]
return {"predictions": predictions}
@add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
@add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: Optional[str] = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
feature_extractor: Optional[Union[str, "FeatureExtractionMixin"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: int = None,
random_state: Optional[int] = None,
input_column: str = "text",
second_input_column: Optional[str] = None,
label_column: str = "label",
label_mapping: Optional[Dict[str, Number]] = None,
) -> Tuple[Dict[str, float], Any]:
"""
input_column (`str`, *optional*, defaults to `"text"`):
The name of the column containing the text feature in the dataset specified by `data`.
second_input_column (`str`, *optional*, defaults to `None`):
The name of the second column containing the text features. This may be useful for classification tasks
such as MNLI, where two columns are used.
label_column (`str`, defaults to `"label"`):
The name of the column containing the labels in the dataset specified by `data`.
label_mapping (`Dict[str, Number]`, *optional*, defaults to `None`):
Used to map the class labels produced by the model in the pipeline to values consistent with those
defined in the `label_column` of the `data` dataset.
"""
result = {}
self.check_for_mismatch_in_device_setup(device, model_or_pipeline)
# Prepare inputs
data = self.load_data(data=data, subset=subset, split=split)
metric_inputs, pipe_inputs = self.prepare_data(
data=data, input_column=input_column, second_input_column=second_input_column, label_column=label_column
)
pipe = self.prepare_pipeline(
model_or_pipeline=model_or_pipeline,
tokenizer=tokenizer,
feature_extractor=feature_extractor,
device=device,
)
metric = self.prepare_metric(metric)
# Compute predictions
predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
predictions = self.predictions_processor(predictions, label_mapping)
metric_inputs.update(predictions)
# Compute metrics from references and predictions
metric_results = self.compute_metric(
metric=metric,
metric_inputs=metric_inputs,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
random_state=random_state,
)
result.update(metric_results)
result.update(perf_results)
return result
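# Illustrative sketch, not part of the original module: how `predictions_processor` applies
# `label_mapping` to raw text-classification pipeline outputs. The label names and scores below
# are made up; assumes the optional evaluator dependencies (e.g. transformers) are installed.
if __name__ == "__main__":
    _clf_evaluator = TextClassificationEvaluator()
    _raw_pipeline_output = [
        {"label": "LABEL_1", "score": 0.9},
        {"label": "LABEL_0", "score": 0.8},
    ]
    assert _clf_evaluator.predictions_processor(
        _raw_pipeline_output, label_mapping={"LABEL_0": 0, "LABEL_1": 1}
    ) == {"predictions": [1, 0]}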
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Tuple
from datasets import Dataset
from .base import Evaluator
from .utils import DatasetColumn
TASK_DOCUMENTATION_KWARGS = r"""
input_column (`str`, defaults to `"text"`):
The name of the column containing the input text in the dataset specified by `data`.
generation_kwargs (`Dict`, *optional*, defaults to `None`):
The generation kwargs are passed to the pipeline and set the text generation strategy.
"""
class TextGenerationEvaluator(Evaluator):
"""
Text generation evaluator.
This Text generation evaluator can currently be loaded from [`evaluator`] using the default task name
`text-generation`.
Methods in this class assume a data format compatible with the [`~transformers.TextGenerationPipeline`].
"""
def predictions_processor(self, predictions, *args, **kwargs):
"""
Args:
predictions: A list of lists of dicts
Returns:
`dict`: All the generated texts are flattened and stored under the "data" key.
"""
return {"data": [pred[f"{self.predictions_prefix}_text"] for pred_list in predictions for pred in pred_list]}
def __init__(self, task="text-generation", default_metric_name=None, predictions_prefix: str = "generated"):
super().__init__(task=task, default_metric_name=default_metric_name)
self.predictions_prefix = predictions_prefix
def prepare_data(self, data: Dataset, input_column: str, *args, **kwargs) -> Tuple[Dict, DatasetColumn]:
"""
Prepare data.
Args:
data ([`Dataset`]):
Specifies the dataset we will run evaluation on.
input_column (`str`, defaults to `"text"`):
The name of the column containing the text feature in the dataset specified by `data`.
Returns:
`dict`: metric inputs.
`list`: pipeline inputs.
"""
self.check_required_columns(data, {"input_column": input_column})
return {}, DatasetColumn(data, input_column)
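# Illustrative sketch, not part of the original module: the text-generation pipeline returns one
# list of candidate generations per input; `predictions_processor` flattens them under the "data"
# key using `predictions_prefix` ("generated" -> "generated_text"). Assumes the optional evaluator
# dependencies (e.g. transformers) are installed.
if __name__ == "__main__":
    _gen_evaluator = TextGenerationEvaluator()
    _raw_pipeline_output = [[{"generated_text": "Hello world"}], [{"generated_text": "Foo bar"}]]
    assert _gen_evaluator.predictions_processor(_raw_pipeline_output) == {
        "data": ["Hello world", "Foo bar"]
    }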
# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
from datasets import ClassLabel, Dataset, Sequence
from typing_extensions import Literal
from ..module import EvaluationModule
from ..utils.file_utils import add_end_docstrings, add_start_docstrings
from .base import EVALUATOR_COMPUTE_RETURN_DOCSTRING, EVALUTOR_COMPUTE_START_DOCSTRING, Evaluator
from .utils import DatasetColumn
if TYPE_CHECKING:
from transformers import Pipeline, PreTrainedModel, PreTrainedTokenizer, TFPreTrainedModel
TASK_DOCUMENTATION = r"""
The dataset input and label columns are expected to be formatted as a list of words and a list of labels respectively, following [conll2003 dataset](https://huggingface.co/datasets/conll2003). Datasets whose inputs are single strings and whose labels are a list of offsets are not supported.
Examples:
```python
>>> from evaluate import evaluator
>>> from datasets import load_dataset
>>> task_evaluator = evaluator("token-classification")
>>> data = load_dataset("conll2003", split="validation[:2]")
>>> results = task_evaluator.compute(
>>> model_or_pipeline="elastic/distilbert-base-uncased-finetuned-conll03-english",
>>> data=data,
>>> metric="seqeval",
>>> )
```
<Tip>
For example, the following dataset format is accepted by the evaluator:
```python
dataset = Dataset.from_dict(
mapping={
"tokens": [["New", "York", "is", "a", "city", "and", "Felix", "a", "person", "."]],
"ner_tags": [[1, 2, 0, 0, 0, 0, 3, 0, 0, 0]],
},
features=Features({
"tokens": Sequence(feature=Value(dtype="string")),
"ner_tags": Sequence(feature=ClassLabel(names=["O", "B-LOC", "I-LOC", "B-PER", "I-PER"])),
}),
)
```
</Tip>
<Tip warning={true}>
For example, the following dataset format is **not** accepted by the evaluator:
```python
dataset = Dataset.from_dict(
mapping={
"tokens": [["New York is a city and Felix a person."]],
"starts": [[0, 23]],
"ends": [[7, 27]],
"ner_tags": [["LOC", "PER"]],
},
features=Features({
"tokens": Value(dtype="string"),
"starts": Sequence(feature=Value(dtype="int32")),
"ends": Sequence(feature=Value(dtype="int32")),
"ner_tags": Sequence(feature=Value(dtype="string")),
}),
)
```
</Tip>
"""
class TokenClassificationEvaluator(Evaluator):
"""
Token classification evaluator.
This token classification evaluator can currently be loaded from [`evaluator`] using the default task name
`token-classification`.
Methods in this class assume a data format compatible with the [`~transformers.TokenClassificationPipeline`].
"""
PIPELINE_KWARGS = {"ignore_labels": []}
def __init__(self, task="token-classification", default_metric_name=None):
super().__init__(task, default_metric_name=default_metric_name)
def predictions_processor(self, predictions: List[List[Dict]], words: List[List[str]], join_by: str):
"""
Transform the pipeline predictions into a list of predicted labels of the same length as the true labels.
Args:
predictions (`List[List[Dict]]`):
List of pipeline predictions, where each token has been labeled.
words (`List[List[str]]`):
Original input data to the pipeline, used to build predicted labels of the same length.
join_by (`str`):
String to use to join two words. In English, it will typically be " ".
Returns:
`dict`: a dictionary holding the predictions
"""
preds = []
# iterate over the data rows
for i, prediction in enumerate(predictions):
pred_processed = []
# get a list of tuples giving the indexes of the start and end character of each word
words_offsets = self.words_to_offsets(words[i], join_by)
token_index = 0
for word_offset in words_offsets:
# for each word, we keep only the predicted label for the first token and discard the others
while prediction[token_index]["start"] < word_offset[0]:
token_index += 1
if prediction[token_index]["start"] > word_offset[0]: # bad indexing
pred_processed.append("O")
elif prediction[token_index]["start"] == word_offset[0]:
pred_processed.append(prediction[token_index]["entity"])
preds.append(pred_processed)
return {"predictions": preds}
def words_to_offsets(self, words: List[str], join_by: str):
"""
Convert a list of words to a list of offsets, where words are joined by `join_by`.
Args:
words (`List[str]`):
List of words to get offsets from.
join_by (`str`):
String to insert between words.
Returns:
`List[Tuple[int, int]]`: List of character (start index, end index) tuples for each of the words.
"""
offsets = []
start = 0
for word in words:
end = start + len(word) - 1
offsets.append((start, end))
start = end + len(join_by) + 1
return offsets
def prepare_data(self, data: Union[str, Dataset], input_column: str, label_column: str, join_by: str):
super().prepare_data(data, input_column, label_column)
if not isinstance(data.features[input_column], Sequence) or not isinstance(
data.features[label_column], Sequence
):
raise ValueError(
"TokenClassificationEvaluator expects the input and label columns to be provided as lists."
)
# If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere.
# Otherwise, we have to get the list of labels manually.
labels_are_int = isinstance(data.features[label_column].feature, ClassLabel)
if labels_are_int:
label_list = data.features[label_column].feature.names # list of string labels
id_to_label = {i: label for i, label in enumerate(label_list)}
references = [[id_to_label[label_id] for label_id in label_ids] for label_ids in data[label_column]]
elif data.features[label_column].feature.dtype.startswith("int"):
raise NotImplementedError(
"References provided as integers, but the reference column is not a Sequence of ClassLabels."
)
else:
# In the event the labels are not a `Sequence[ClassLabel]`, the labels are already strings
# An example is labels as ["PER", "PER", "O", "LOC", "O", "LOC", "O"], e.g. in polyglot_ner dataset
references = data[label_column]
metric_inputs = {"references": references}
data = data.map(lambda x: {input_column: join_by.join(x[input_column])})
pipeline_inputs = DatasetColumn(data, input_column)
return metric_inputs, pipeline_inputs
def prepare_pipeline(
self,
model_or_pipeline: Union[str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel"], # noqa: F821
tokenizer: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None, # noqa: F821
feature_extractor: Union["PreTrainedTokenizerBase", "FeatureExtractionMixin"] = None, # noqa: F821
device: int = None,
):
pipe = super().prepare_pipeline(model_or_pipeline, tokenizer, feature_extractor, device)
# check that the pipeline outputs the start character index in its predictions
dummy_output = pipe(["2003 New York Gregory"], **self.PIPELINE_KWARGS)
if dummy_output[0][0]["start"] is None:
raise ValueError(
"TokenClassificationEvaluator supports only pipelines giving 'start' index as a pipeline output (got None). "
"Transformers pipelines with a slow tokenizer will raise this error."
)
return pipe
@add_start_docstrings(EVALUTOR_COMPUTE_START_DOCSTRING)
@add_end_docstrings(EVALUATOR_COMPUTE_RETURN_DOCSTRING, TASK_DOCUMENTATION)
def compute(
self,
model_or_pipeline: Union[
str, "Pipeline", Callable, "PreTrainedModel", "TFPreTrainedModel" # noqa: F821
] = None,
data: Union[str, Dataset] = None,
subset: Optional[str] = None,
split: str = None,
metric: Union[str, EvaluationModule] = None,
tokenizer: Optional[Union[str, "PreTrainedTokenizer"]] = None, # noqa: F821
strategy: Literal["simple", "bootstrap"] = "simple",
confidence_level: float = 0.95,
n_resamples: int = 9999,
device: Optional[int] = None,
random_state: Optional[int] = None,
input_column: str = "tokens",
label_column: str = "ner_tags",
join_by: Optional[str] = " ",
) -> Tuple[Dict[str, float], Any]:
"""
input_column (`str`, defaults to `"tokens"`):
The name of the column containing the tokens feature in the dataset specified by `data`.
label_column (`str`, defaults to `"ner_tags"`):
The name of the column containing the labels in the dataset specified by `data`.
join_by (`str`, *optional*, defaults to `" "`):
This evaluator supports datasets whose input column is a list of words. This parameter specifies how to join
the words to generate a string input. This is especially useful for languages that do not separate words by a space.
"""
result = {}
self.check_for_mismatch_in_device_setup(device, model_or_pipeline)
# Prepare inputs
data = self.load_data(data=data, subset=subset, split=split)
metric_inputs, pipe_inputs = self.prepare_data(
data=data, input_column=input_column, label_column=label_column, join_by=join_by
)
pipe = self.prepare_pipeline(model_or_pipeline=model_or_pipeline, tokenizer=tokenizer, device=device)
metric = self.prepare_metric(metric)
# Compute predictions
predictions, perf_results = self.call_pipeline(pipe, pipe_inputs)
predictions = self.predictions_processor(predictions, data[input_column], join_by)
metric_inputs.update(predictions)
# Compute metrics from references and predictions
metric_results = self.compute_metric(
metric=metric,
metric_inputs=metric_inputs,
strategy=strategy,
confidence_level=confidence_level,
n_resamples=n_resamples,
random_state=random_state,
)
result.update(metric_results)
result.update(perf_results)
return result
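# Illustrative sketch, not part of the original module: `words_to_offsets` computes inclusive
# character spans for words joined by `join_by`; `predictions_processor` then uses these spans to
# align pipeline token predictions with the original words. Assumes the optional evaluator
# dependencies (e.g. transformers) are installed.
if __name__ == "__main__":
    _token_evaluator = TokenClassificationEvaluator()
    # "New York is" -> "New": (0, 2), "York": (4, 7), "is": (9, 10)
    assert _token_evaluator.words_to_offsets(["New", "York", "is"], " ") == [(0, 2), (4, 7), (9, 10)]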
from datasets import Dataset, get_dataset_split_names
class DatasetColumn(list):
"""Helper class to avoid loading a dataset column into memory when accessing it."""
def __init__(self, dataset: Dataset, key: str):
self.dataset = dataset
self.key = key
def __len__(self):
return len(self.dataset)
def __getitem__(self, i):
return self.dataset[i][self.key]
def __iter__(self):
return (self.dataset[i][self.key] for i in range(len(self)))
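# Minimal usage sketch, not part of the original module: `DatasetColumn` proxies a single column
# lazily, pulling one row at a time instead of materializing the whole column in memory.
if __name__ == "__main__":
    _demo_dataset = Dataset.from_dict({"text": ["a", "b"], "label": [0, 1]})
    _demo_column = DatasetColumn(_demo_dataset, "text")
    assert len(_demo_column) == 2 and list(_demo_column) == ["a", "b"]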
def choose_split(data, subset=None):
available_splits = get_dataset_split_names(data, subset)
preferred_split_order = [
"test",
"testing",
"eval",
"evaluation",
"validation",
"val",
"valid",
"dev",
"train",
"training",
]
for split in preferred_split_order:
if split in available_splits:
return split
raise ValueError("No dataset split defined! Pass an explicit value to the `split` kwarg.")
class DatasetColumnPair(list):
"""Helper class to avoid loading two dataset columns into memory when accessing it."""
def __init__(
self,
dataset: Dataset,
first_col: str,
second_col: str,
first_key: str,
second_key: str,
):
"""
Args:
dataset (Dataset): dataset to build an iterator on
first_col (str): first column name to use in the dataset
second_col (str): second column name to use in the dataset
first_key (str): key name used for the first column in the returned dictionary
second_key (str): key name used for the second column in the returned dictionary
"""
self.dataset = dataset
self.first_col = first_col
self.second_col = second_col
self.first_key = first_key
self.second_key = second_key
def __len__(self):
return len(self.dataset)
def __getitem__(self, i):
return {
self.first_key: self.dataset[i][self.first_col],
self.second_key: self.dataset[i][self.second_col] if self.second_col else None,
}
def __iter__(self):
return (
{
self.first_key: self.dataset[i][self.first_col],
self.second_key: self.dataset[i][self.second_col] if self.second_col else None,
}
for i in range(len(self))
)
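# Minimal usage sketch, not part of the original module: `DatasetColumnPair` lazily yields
# {"text": ..., "text_pair": ...} dicts, the input format expected by text-classification
# pipelines for sentence-pair tasks such as MNLI. The column names below are illustrative.
if __name__ == "__main__":
    _pair_dataset = Dataset.from_dict({"premise": ["a cat"], "hypothesis": ["an animal"]})
    _pair_column = DatasetColumnPair(_pair_dataset, "premise", "hypothesis", "text", "text_pair")
    assert _pair_column[0] == {"text": "a cat", "text_pair": "an animal"}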
from typing import Dict
import requests
from huggingface_hub import dataset_info, model_info
from huggingface_hub.repocard import metadata_update
from .config import HF_HUB_ALLOWED_TASKS
from .utils.logging import get_logger
logger = get_logger(__name__)
def push_to_hub(
model_id: str,
task_type: str,
dataset_type: str,
dataset_name: str,
metric_type: str,
metric_name: str,
metric_value: float,
task_name: str = None,
dataset_config: str = None,
dataset_split: str = None,
dataset_revision: str = None,
dataset_args: Dict[str, int] = None,
metric_config: str = None,
metric_args: Dict[str, int] = None,
overwrite: bool = False,
):
r"""
Pushes the result of a metric to the metadata of a model repository in the Hub.
Args:
model_id (`str`):
Model id from https://hf.co/models.
task_type (`str`):
Task id, refer to the [Hub allowed tasks](https://github.com/huggingface/evaluate/blob/main/src/evaluate/config.py#L154) for allowed values.
dataset_type (`str`):
Dataset id from https://hf.co/datasets.
dataset_name (`str`):
Pretty name for the dataset.
metric_type (`str`):
Metric id from https://hf.co/metrics.
metric_name (`str`):
Pretty name for the metric.
metric_value (`float`):
Computed metric value.
task_name (`str`, *optional*):
Pretty name for the task.
dataset_config (`str`, *optional*):
Dataset configuration used in [`~datasets.load_dataset`].
See [`~datasets.load_dataset`] for more info.
dataset_split (`str`, *optional*):
Name of split used for metric computation.
dataset_revision (`str`, *optional*):
Git hash for the specific version of the dataset.
dataset_args (`dict[str, int]`, *optional*):
Additional arguments passed to [`~datasets.load_dataset`].
metric_config (`str`, *optional*):
Configuration for the metric (e.g. the GLUE metric has a configuration for each subset).
metric_args (`dict[str, int]`, *optional*):
Arguments passed during [`~evaluate.EvaluationModule.compute`].
overwrite (`bool`, *optional*, defaults to `False`):
If set to `True` an existing metric field can be overwritten, otherwise
attempting to overwrite any existing fields will cause an error.
Example:
```python
>>> push_to_hub(
... model_id="huggingface/gpt2-wikitext2",
... metric_value=0.5,
... metric_type="bleu",
... metric_name="BLEU",
... dataset_name="WikiText",
... dataset_type="wikitext",
... dataset_split="test",
... task_type="text-generation",
... task_name="Text Generation"
... )
```"""
if task_type not in HF_HUB_ALLOWED_TASKS:
raise ValueError(f"Task type not supported. Task has to be one of {HF_HUB_ALLOWED_TASKS}")
try:
dataset_info(dataset_type)
except requests.exceptions.HTTPError:
logger.warning(f"Dataset {dataset_type} not found on the Hub at hf.co/datasets/{dataset_type}")
try:
model_info(model_id)
except requests.exceptions.HTTPError:
raise ValueError(f"Model {model_id} not found on the Hub at hf.co/{model_id}")
result = {
"task": {
"type": task_type,
},
"dataset": {
"type": dataset_type,
"name": dataset_name,
},
"metrics": [
{
"type": metric_type,
"value": metric_value,
},
],
}
if dataset_config is not None:
result["dataset"]["config"] = dataset_config
if dataset_split is not None:
result["dataset"]["split"] = dataset_split
if dataset_revision is not None:
result["dataset"]["revision"] = dataset_revision
if dataset_args is not None:
result["dataset"]["args"] = dataset_args
if task_name is not None:
result["task"]["name"] = task_name
if metric_name is not None:
result["metrics"][0]["name"] = metric_name
if metric_config is not None:
result["metrics"][0]["config"] = metric_config
if metric_args is not None:
result["metrics"][0]["args"] = metric_args
metadata = {"model-index": [{"results": [result]}]}
return metadata_update(repo_id=model_id, metadata=metadata, overwrite=overwrite)
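# Illustrative sketch, not part of the original module: the `model-index` metadata that the
# docstring example above would assemble and pass to `metadata_update` (values are the
# hypothetical ones from that example).
_EXAMPLE_MODEL_INDEX = {
    "model-index": [
        {
            "results": [
                {
                    "task": {"type": "text-generation", "name": "Text Generation"},
                    "dataset": {"type": "wikitext", "name": "WikiText", "split": "test"},
                    "metrics": [{"type": "bleu", "value": 0.5, "name": "BLEU"}],
                }
            ]
        }
    ]
}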
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
""" EvaluationModuleInfo records information we know about a dataset and a metric.
"""
import dataclasses
import json
import os
from dataclasses import asdict, dataclass, field
from typing import List, Optional, Union
from datasets.features import Features, Value
from . import config
from .utils.logging import get_logger
logger = get_logger(__name__)
@dataclass
class EvaluationModuleInfo:
"""Base class to store information about an evaluation used for `MetricInfo`, `ComparisonInfo`,
and `MeasurementInfo`.
`EvaluationModuleInfo` documents an evaluation, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
"""
# Set in the dataset scripts
description: str
citation: str
features: Union[Features, List[Features]]
inputs_description: str = field(default_factory=str)
homepage: str = field(default_factory=str)
license: str = field(default_factory=str)
codebase_urls: List[str] = field(default_factory=list)
reference_urls: List[str] = field(default_factory=list)
streamable: bool = False
format: Optional[str] = None
module_type: str = "metric" # deprecate this in the future
# Set later by the builder
module_name: Optional[str] = None
config_name: Optional[str] = None
experiment_id: Optional[str] = None
def __post_init__(self):
if self.format is not None:
for key, value in self.features.items():
if not isinstance(value, Value):
raise ValueError(
f"When using 'numpy' format, all features should be a `datasets.Value` feature. "
f"Here {key} is an instance of {value.__class__.__name__}"
)
def write_to_directory(self, metric_info_dir):
"""Write `EvaluationModuleInfo` as JSON to `metric_info_dir`.
Also save the license separately in LICENSE.
Args:
metric_info_dir (`str`):
The directory to save the `EvaluationModuleInfo` JSON file and the license to.
Example:
```py
>>> my_metric.info.write_to_directory("/path/to/directory/")
```
"""
with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), "w", encoding="utf-8") as f:
json.dump(asdict(self), f)
with open(os.path.join(metric_info_dir, config.LICENSE_FILENAME), "w", encoding="utf-8") as f:
f.write(self.license)
@classmethod
def from_directory(cls, metric_info_dir) -> "EvaluationModuleInfo":
"""Create `EvaluationModuleInfo` from the JSON file in `metric_info_dir`.
Args:
metric_info_dir (`str`):
The directory containing the `metric_info` JSON file. This
should be the root directory of a specific metric version.
Example:
```py
>>> my_metric = EvaluationModuleInfo.from_directory("/path/to/directory/")
```
"""
logger.info(f"Loading Metric info from {metric_info_dir}")
if not metric_info_dir:
raise ValueError("Calling EvaluationModuleInfo.from_directory() with undefined metric_info_dir.")
with open(os.path.join(metric_info_dir, config.METRIC_INFO_FILENAME), encoding="utf-8") as f:
metric_info_dict = json.load(f)
return cls.from_dict(metric_info_dict)
@classmethod
def from_dict(cls, metric_info_dict: dict) -> "EvaluationModuleInfo":
field_names = {f.name for f in dataclasses.fields(cls)}
return cls(**{k: v for k, v in metric_info_dict.items() if k in field_names})
@dataclass
class MetricInfo(EvaluationModuleInfo):
"""Information about a metric.
`EvaluationModuleInfo` documents a metric, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
"""
module_type: str = "metric"
@dataclass
class ComparisonInfo(EvaluationModuleInfo):
"""Information about a comparison.
`EvaluationModuleInfo` documents a comparison, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
"""
module_type: str = "comparison"
@dataclass
class MeasurementInfo(EvaluationModuleInfo):
"""Information about a measurement.
`EvaluationModuleInfo` documents a measurement, including its name, version, and features.
See the constructor arguments and properties for a full list.
Note: Not all fields are known on construction and may be updated later.
"""
module_type: str = "measurement"
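# Minimal construction sketch, not part of the original module: `description`, `citation` and
# `features` are the only required fields; the remaining fields have defaults and are filled in
# later by the builder. The toy values below are made up.
if __name__ == "__main__":
    _toy_info = MetricInfo(
        description="Toy accuracy metric.",
        citation="",
        features=Features({"predictions": Value("int32"), "references": Value("int32")}),
    )
    assert _toy_info.module_type == "metric"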
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
""" List and inspect metrics."""
from typing import Optional
import requests
from datasets import DownloadConfig
from .config import EVALUATION_MODULE_TYPES, HF_LIST_ENDPOINT
from .loading import evaluation_module_factory
from .utils.logging import get_logger
logger = get_logger(__name__)
class SplitsNotFoundError(ValueError):
pass
def list_evaluation_modules(module_type=None, include_community=True, with_details=False):
"""List all evaluation modules available on the Hugging Face Hub.
Args:
module_type (`str`, *optional*, defaults to `None`):
Type of evaluation modules to list. Has to be one of `'metric'`, `'comparison'`, or `'measurement'`. If `None`, all types are listed.
include_community (`bool`, *optional*, defaults to `True`):
Include community modules in the list.
with_details (`bool`, *optional*, defaults to `False`):
Return the full details on the metrics instead of only the ID.
Returns:
`List[Union[str, dict]]`
Example:
```py
>>> from evaluate import list_evaluation_modules
>>> list_evaluation_modules(module_type="metric")
```
"""
if module_type is None:
evaluations_list = []
for module_type in EVALUATION_MODULE_TYPES:
evaluations_list.extend(
_list_evaluation_modules_type(
module_type, include_community=include_community, with_details=with_details
)
)
else:
if module_type not in EVALUATION_MODULE_TYPES:
raise ValueError(f"Invalid module type '{module_type}'. Has to be one of {EVALUATION_MODULE_TYPES}.")
evaluations_list = _list_evaluation_modules_type(
module_type, include_community=include_community, with_details=with_details
)
return evaluations_list
def _list_evaluation_modules_type(module_type, include_community=True, with_details=False):
r = requests.get(HF_LIST_ENDPOINT.format(type=module_type))
r.raise_for_status()
d = r.json()
if not include_community:
d = [element for element in d if element["id"].split("/")[0] == f"evaluate-{module_type}"]
# remove namespace for canonical modules and add community tag
for element in d:
if element["id"].split("/")[0] == f"evaluate-{module_type}":
element["id"] = element["id"].split("/")[1]
element["community"] = False
else:
element["community"] = True
if with_details:
return [
{
"name": element["id"],
"type": module_type,
"community": element["community"],
"likes": element.get("likes", 0),
}
for element in d
]
else:
return [element["id"] for element in d]
def inspect_evaluation_module(
path: str, local_path: str, download_config: Optional[DownloadConfig] = None, **download_kwargs
):
r"""
Allow inspection/modification of an evaluation script by copying it to the local drive at `local_path`.
Args:
path (``str``): path to the evaluation script. Can be either:
- a local path to the script or the directory containing the script (if the script has the same name as the directory),
e.g. ``'./metrics/accuracy'`` or ``'./metrics/accuracy/accuracy.py'``
- an evaluation module identifier on the Hugging Face Hub (list all available modules and ids with ``evaluate.list_evaluation_modules()``)
e.g. ``'accuracy'``, ``'bleu'`` or ``'word_length'``
local_path (``str``): path to the local folder to copy the evaluation script to.
download_config (Optional ``datasets.DownloadConfig``): specific download configuration parameters.
**download_kwargs: optional attributes for DownloadConfig() which will override the attributes in download_config if supplied.
"""
evaluation_module = evaluation_module_factory(
path, download_config=download_config, force_local_path=local_path, **download_kwargs
)
print(
f"The processing scripts for metric {path} can be inspected at {local_path}. "
f"The main class is in {evaluation_module.module_path}. "
f"You can modify this processing scripts and use it with `evaluate.load({local_path})`."
)
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Access datasets."""
import filecmp
import importlib
import inspect
import json
import os
import re
import shutil
import time
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Type, Union
from urllib.parse import urlparse
from datasets import DownloadConfig, DownloadMode
from datasets.builder import DatasetBuilder
from datasets.packaged_modules import _EXTENSION_TO_MODULE, _hash_python_lines
from datasets.utils.filelock import FileLock
from datasets.utils.version import Version
from . import SCRIPTS_VERSION, config
from .module import EvaluationModule
from .utils.file_utils import (
cached_path,
head_hf_s3,
hf_hub_url,
init_hf_modules,
is_relative_path,
relative_to_absolute_path,
url_or_path_join,
)
from .utils.logging import get_logger
logger = get_logger(__name__)
ALL_ALLOWED_EXTENSIONS = list(_EXTENSION_TO_MODULE.keys()) + ["zip"]
def init_dynamic_modules(
name: str = config.MODULE_NAME_FOR_DYNAMIC_MODULES, hf_modules_cache: Optional[Union[Path, str]] = None
):
"""
Create a module with name `name` in which you can add dynamic modules
such as metrics or datasets. The module can be imported using its name.
The module is created in the HF_MODULE_CACHE directory by default (~/.cache/huggingface/modules) but it can
be overridden by specifying a path to another directory in `hf_modules_cache`.
"""
hf_modules_cache = init_hf_modules(hf_modules_cache)
dynamic_modules_path = os.path.join(hf_modules_cache, name)
os.makedirs(dynamic_modules_path, exist_ok=True)
if not os.path.exists(os.path.join(dynamic_modules_path, "__init__.py")):
with open(os.path.join(dynamic_modules_path, "__init__.py"), "w"):
pass
return dynamic_modules_path
def import_main_class(module_path) -> Optional[Union[Type[DatasetBuilder], Type[EvaluationModule]]]:
"""Import a module at module_path and return its main class, a Metric by default"""
module = importlib.import_module(module_path)
main_cls_type = EvaluationModule
# Find the main class in our imported module
module_main_cls = None
for name, obj in module.__dict__.items():
if isinstance(obj, type) and issubclass(obj, main_cls_type):
if inspect.isabstract(obj):
continue
module_main_cls = obj
break
return module_main_cls
def files_to_hash(file_paths: List[str]) -> str:
"""
Convert a list of scripts or text files provided in file_paths into a hashed filename in a repeatable way.
"""
# List all python files in directories if directories are supplied as part of external imports
to_use_files: List[Union[Path, str]] = []
for file_path in file_paths:
if os.path.isdir(file_path):
to_use_files.extend(list(Path(file_path).rglob("*.[pP][yY]")))
else:
to_use_files.append(file_path)
# Get the code from all these files
lines = []
for file_path in to_use_files:
with open(file_path, encoding="utf-8") as f:
lines.extend(f.readlines())
return _hash_python_lines(lines)
def convert_github_url(url_path: str) -> Tuple[str, Optional[str]]:
"""Convert a link to a file on a github repo in a link to the raw github object."""
parsed = urlparse(url_path)
sub_directory = None
if parsed.scheme in ("http", "https", "s3") and parsed.netloc == "github.com":
if "blob" in url_path:
if not url_path.endswith(".py"):
raise ValueError(f"External import from github at {url_path} should point to a file ending with '.py'")
url_path = url_path.replace("blob", "raw") # Point to the raw file
else:
# Parse github url to point to zip
github_path = parsed.path[1:]
repo_info, branch = github_path.split("/tree/") if "/tree/" in github_path else (github_path, "master")
repo_owner, repo_name = repo_info.split("/")
url_path = f"https://github.com/{repo_owner}/{repo_name}/archive/{branch}.zip"
sub_directory = f"{repo_name}-{branch}"
return url_path, sub_directory
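# Illustrative sketch, not part of the original module: a github "blob" URL is rewritten to its
# "raw" counterpart, while a repository URL becomes an archive link plus the sub-directory it
# unpacks to. The user/repo names below are hypothetical.
if __name__ == "__main__":
    assert convert_github_url("https://github.com/user/repo/blob/main/utils.py") == (
        "https://github.com/user/repo/raw/main/utils.py",
        None,
    )
    assert convert_github_url("https://github.com/user/repo/tree/main") == (
        "https://github.com/user/repo/archive/main.zip",
        "repo-main",
    )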
def increase_load_count(name: str, resource_type: str):
"""Update the download count of a dataset or metric."""
if not config.HF_EVALUATE_OFFLINE and config.HF_UPDATE_DOWNLOAD_COUNTS:
try:
head_hf_s3(name, filename=name + ".py", dataset=(resource_type == "dataset"))
except Exception:
pass
def get_imports(file_path: str) -> List[Tuple[str, str, str, Optional[str]]]:
"""Find whether we should import or clone additional files for a given processing script,
and list the imports.
We allow:
- library dependencies,
- local dependencies and
- external dependencies whose url is specified with a comment starting with "# From:" followed by the raw url to a file, an archive or a github repository.
External dependencies will be downloaded (and extracted if needed) in the script's folder.
We also add an `__init__.py` to each sub-folder of a downloaded folder so the user can import from them in the script.
Note that only direct imports in the processing script will be handled.
We don't recursively explore the additional imports to download further files.
Example::
import tensorflow
import .c4_utils
import .clicr.dataset-code.build_json_dataset # From: https://raw.githubusercontent.com/clips/clicr/master/dataset-code/build_json_dataset
"""
lines = []
with open(file_path, encoding="utf-8") as f:
lines.extend(f.readlines())
logger.debug(f"Checking {file_path} for additional imports.")
imports: List[Tuple[str, str, str, Optional[str]]] = []
is_in_docstring = False
for line in lines:
docstr_start_match = re.findall(r'[\s\S]*?"""[\s\S]*?', line)
if len(docstr_start_match) == 1:
# flip True <=> False only if docstring
# starts at line without finishing
is_in_docstring = not is_in_docstring
if is_in_docstring:
# import statements in docstrings should
# not be added as required dependencies
continue
match = re.match(r"^import\s+(\.?)([^\s\.]+)[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*)", line, flags=re.MULTILINE)
if match is None:
match = re.match(
r"^from\s+(\.?)([^\s\.]+)(?:[^\s]*)\s+import\s+[^#\r\n]*(?:#\s+From:\s+)?([^\r\n]*)",
line,
flags=re.MULTILINE,
)
if match is None:
continue
if match.group(1):
# The import starts with a '.', we will download the relevant file
if any(imp[1] == match.group(2) for imp in imports):
# We already have this import
continue
if match.group(3):
# The import has a comment with 'From:', we'll retrieve it from the given url
url_path = match.group(3)
url_path, sub_directory = convert_github_url(url_path)
imports.append(("external", match.group(2), url_path, sub_directory))
elif match.group(2):
# The import should be at the same place as the file
imports.append(("internal", match.group(2), match.group(2), None))
else:
if match.group(3):
# The import has a comment with `From: git+https:...`, asks user to pip install from git.
url_path = match.group(3)
imports.append(("library", match.group(2), url_path, None))
else:
imports.append(("library", match.group(2), match.group(2), None))
return imports
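# Illustrative sketch, not part of the original module: parsing the three supported import kinds
# ("library", "internal", "external") from a toy script. The module names and URL are hypothetical.
if __name__ == "__main__":
    import tempfile

    _toy_script = (
        "import numpy\n"
        "import .local_helper\n"
        "import .remote_helper  # From: https://raw.githubusercontent.com/user/repo/main/remote_helper.py\n"
    )
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as _toy_file:
        _toy_file.write(_toy_script)
    print(get_imports(_toy_file.name))
    # Expected:
    # [('library', 'numpy', 'numpy', None),
    #  ('internal', 'local_helper', 'local_helper', None),
    #  ('external', 'remote_helper', 'https://raw.githubusercontent.com/user/repo/main/remote_helper.py', None)]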
def _download_additional_modules(
name: str, base_path: str, imports: List[Tuple[str, str, str, Optional[str]]], download_config: Optional[DownloadConfig]
) -> List[Tuple[str, str]]:
"""
Download additional modules for a module <name>.py at URL (or local path) <base_path>/<name>.py
The imports must have been parsed first using ``get_imports``.
If some modules need to be installed with pip, an error is raised showing how to install them.
This function returns the list of downloaded modules as tuples (import_name, module_file_path).
The downloaded modules can then be moved into an importable directory with ``_copy_script_and_other_resources_in_importable_dir``.
"""
local_imports = []
library_imports = []
download_config = download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = "Downloading extra modules"
for import_type, import_name, import_path, sub_directory in imports:
if import_type == "library":
library_imports.append((import_name, import_path)) # Import from a library
continue
if import_name == name:
raise ValueError(
f"Error in the {name} script, importing relative {import_name} module "
f"but {import_name} is the name of the script. "
f"Please change relative import {import_name} to another name and add a '# From: URL_OR_PATH' "
f"comment pointing to the original relative import file path."
)
if import_type == "internal":
url_or_filename = url_or_path_join(base_path, import_path + ".py")
elif import_type == "external":
url_or_filename = import_path
else:
raise ValueError("Wrong import_type")
local_import_path = cached_path(
url_or_filename,
download_config=download_config,
)
if sub_directory is not None:
local_import_path = os.path.join(local_import_path, sub_directory)
local_imports.append((import_name, local_import_path))
# Check library imports
needs_to_be_installed = set()
for library_import_name, library_import_path in library_imports:
try:
lib = importlib.import_module(library_import_name) # noqa F841
except ImportError:
library_import_name = "scikit-learn" if library_import_name == "sklearn" else library_import_name
needs_to_be_installed.add((library_import_name, library_import_path))
if needs_to_be_installed:
raise ImportError(
f"To be able to use {name}, you need to install the following dependencies"
f"{[lib_name for lib_name, lib_path in needs_to_be_installed]} using 'pip install "
f"{' '.join([lib_path for lib_name, lib_path in needs_to_be_installed])}' for instance'"
)
return local_imports
def _copy_script_and_other_resources_in_importable_dir(
name: str,
importable_directory_path: str,
subdirectory_name: str,
original_local_path: str,
local_imports: List[Tuple[str, str]],
additional_files: List[Tuple[str, str]],
download_mode: Optional[DownloadMode],
) -> str:
"""Copy a script and its required imports to an importable directory
Args:
name (str): name of the resource to load
importable_directory_path (str): path to the loadable folder in the dynamic modules directory
subdirectory_name (str): name of the subdirectory in importable_directory_path in which to place the script
original_local_path (str): local path to the resource script
local_imports (List[Tuple[str, str]]): list of (destination_filename, import_file_to_copy)
additional_files (List[Tuple[str, str]]): list of (destination_filename, additional_file_to_copy)
download_mode (Optional[DownloadMode]): download mode
Return:
importable_local_file: path to an importable module with importlib.import_module
"""
# Define a directory with a unique name in our dataset or metric folder
# path is: ./datasets|metrics/dataset|metric_name/hash_from_code/script.py
# we use a hash as subdirectory_name to be able to have multiple versions of a dataset/metric processing file together
importable_subdirectory = os.path.join(importable_directory_path, subdirectory_name)
importable_local_file = os.path.join(importable_subdirectory, name + ".py")
# Prevent parallel disk operations
lock_path = importable_directory_path + ".lock"
with FileLock(lock_path):
# Create main dataset/metrics folder if needed
if download_mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(importable_directory_path):
shutil.rmtree(importable_directory_path)
os.makedirs(importable_directory_path, exist_ok=True)
# add an __init__ file to the main dataset folder if needed
init_file_path = os.path.join(importable_directory_path, "__init__.py")
if not os.path.exists(init_file_path):
with open(init_file_path, "w"):
pass
# Create hash dataset folder if needed
os.makedirs(importable_subdirectory, exist_ok=True)
# add an __init__ file to the hash dataset folder if needed
init_file_path = os.path.join(importable_subdirectory, "__init__.py")
if not os.path.exists(init_file_path):
with open(init_file_path, "w"):
pass
# Copy dataset.py file in hash folder if needed
if not os.path.exists(importable_local_file):
shutil.copyfile(original_local_path, importable_local_file)
# Record metadata associating original dataset path with local unique folder
meta_path = importable_local_file.split(".py")[0] + ".json"
if not os.path.exists(meta_path):
meta = {"original file path": original_local_path, "local file path": importable_local_file}
# the filename is *.py in our case, so it is better to name the metadata file filename.json instead of filename.py.json
with open(meta_path, "w", encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
# Copy all the additional imports
for import_name, import_path in local_imports:
if os.path.isfile(import_path):
full_path_local_import = os.path.join(importable_subdirectory, import_name + ".py")
if not os.path.exists(full_path_local_import):
shutil.copyfile(import_path, full_path_local_import)
elif os.path.isdir(import_path):
full_path_local_import = os.path.join(importable_subdirectory, import_name)
if not os.path.exists(full_path_local_import):
shutil.copytree(import_path, full_path_local_import)
else:
raise OSError(f"Error with local import at {import_path}")
# Copy additional files like the dataset infos file if needed
for file_name, original_path in additional_files:
destination_additional_path = os.path.join(importable_subdirectory, file_name)
if not os.path.exists(destination_additional_path) or not filecmp.cmp(
original_path, destination_additional_path
):
shutil.copyfile(original_path, destination_additional_path)
return importable_local_file
def _create_importable_file(
local_path: str,
local_imports: List[Tuple[str, str]],
additional_files: List[Tuple[str, str]],
dynamic_modules_path: str,
module_namespace: str,
name: str,
download_mode: DownloadMode,
) -> Tuple[str, str]:
importable_directory_path = os.path.join(dynamic_modules_path, module_namespace, name.replace("/", "--"))
Path(importable_directory_path).mkdir(parents=True, exist_ok=True)
(Path(importable_directory_path).parent / "__init__.py").touch(exist_ok=True)
hash = files_to_hash([local_path] + [loc[1] for loc in local_imports])
importable_local_file = _copy_script_and_other_resources_in_importable_dir(
name=name.split("/")[-1],
importable_directory_path=importable_directory_path,
subdirectory_name=hash,
original_local_path=local_path,
local_imports=local_imports,
additional_files=additional_files,
download_mode=download_mode,
)
logger.debug(f"Created importable dataset file at {importable_local_file}")
module_path = ".".join(
[os.path.basename(dynamic_modules_path), module_namespace, name.replace("/", "--"), hash, name.split("/")[-1]]
)
return module_path, hash
@dataclass
class ImportableModule:
module_path: str
hash: str
class _EvaluationModuleFactory:
def get_module(self) -> ImportableModule:
raise NotImplementedError
class LocalEvaluationModuleFactory(_EvaluationModuleFactory):
"""Get the module of a local metric. The metric script is loaded from a local script."""
def __init__(
self,
path: str,
module_type: str = "metrics",
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[DownloadMode] = None,
dynamic_modules_path: Optional[str] = None,
):
self.path = path
self.module_type = module_type
self.name = Path(path).stem
self.download_config = download_config or DownloadConfig()
self.download_mode = download_mode
self.dynamic_modules_path = dynamic_modules_path
def get_module(self) -> ImportableModule:
# get script and other files
imports = get_imports(self.path)
local_imports = _download_additional_modules(
name=self.name,
base_path=str(Path(self.path).parent),
imports=imports,
download_config=self.download_config,
)
# copy the script and the files in an importable directory
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
module_path, hash = _create_importable_file(
local_path=self.path,
local_imports=local_imports,
additional_files=[],
dynamic_modules_path=dynamic_modules_path,
module_namespace=self.module_type,
name=self.name,
download_mode=self.download_mode,
)
# make sure the import system notices the new module
importlib.invalidate_caches()
return ImportableModule(module_path, hash)
class HubEvaluationModuleFactory(_EvaluationModuleFactory):
"""Get the module of a metric from a metric repository on the Hub."""
def __init__(
self,
name: str,
module_type: str = "metrics",
revision: Optional[Union[str, Version]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[DownloadMode] = None,
dynamic_modules_path: Optional[str] = None,
):
self.name = name
self.module_type = module_type
self.revision = revision
self.download_config = download_config or DownloadConfig()
self.download_mode = download_mode
self.dynamic_modules_path = dynamic_modules_path
assert self.name.count("/") == 1
increase_load_count(name, resource_type="metric")
def download_loading_script(self, revision) -> str:
file_path = hf_hub_url(path=self.name, name=self.name.split("/")[1] + ".py", revision=revision)
download_config = self.download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = "Downloading builder script"
return cached_path(file_path, download_config=download_config)
def get_module(self) -> ImportableModule:
revision = self.revision or os.getenv("HF_SCRIPTS_VERSION", SCRIPTS_VERSION)
if re.match(r"\d*\.\d*\.\d*", revision): # revision is version number (three digits separated by full stops)
revision = "v" + revision # tagging convention on evaluate repository starts with v
# get script and other files
try:
local_path = self.download_loading_script(revision)
except FileNotFoundError as err:
# if no file is found for the current revision tag, try to load from the main branch
if self.revision is None and os.getenv("HF_SCRIPTS_VERSION", SCRIPTS_VERSION) != "main":
revision = "main"
local_path = self.download_loading_script(revision)
else:
raise err
imports = get_imports(local_path)
local_imports = _download_additional_modules(
name=self.name,
base_path=hf_hub_url(path=self.name, name="", revision=revision),
imports=imports,
download_config=self.download_config,
)
# copy the script and the files in an importable directory
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
module_path, hash = _create_importable_file(
local_path=local_path,
local_imports=local_imports,
additional_files=[],
dynamic_modules_path=dynamic_modules_path,
module_namespace=self.module_type,
name=self.name,
download_mode=self.download_mode,
)
# make sure the import system notices the new module
importlib.invalidate_caches()
return ImportableModule(module_path, hash)
class CachedEvaluationModuleFactory(_EvaluationModuleFactory):
"""
Get the module of a metric that has been loaded once already and cached.
The script that is loaded from the cache is the most recent one with a matching name.
"""
def __init__(
self,
name: str,
module_type: str = "metrics",
dynamic_modules_path: Optional[str] = None,
):
self.name = name
self.module_type = module_type
self.dynamic_modules_path = dynamic_modules_path
assert self.name.count("/") == 0
def get_module(self) -> ImportableModule:
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
importable_directory_path = os.path.join(dynamic_modules_path, self.module_type, self.name)
hashes = (
[h for h in os.listdir(importable_directory_path) if len(h) == 64]
if os.path.isdir(importable_directory_path)
else None
)
if not hashes:
raise FileNotFoundError(f"Metric {self.name} is not cached in {dynamic_modules_path}")
# get most recent
def _get_modification_time(module_hash):
return (
(Path(importable_directory_path) / module_hash / (self.name.split("--")[-1] + ".py")).stat().st_mtime
)
hash = sorted(hashes, key=_get_modification_time)[-1]
logger.warning(
f"Using the latest cached version of the module from {os.path.join(importable_directory_path, hash)} "
f"(last modified on {time.ctime(_get_modification_time(hash))}) since it "
f"couldn't be found locally at {self.name}, or remotely on the Hugging Face Hub."
)
# make sure the import system notices the new module
module_path = ".".join(
[os.path.basename(dynamic_modules_path), self.module_type, self.name, hash, self.name.split("--")[-1]]
)
importlib.invalidate_caches()
return ImportableModule(module_path, hash)
def evaluation_module_factory(
path: str,
module_type: Optional[str] = None,
revision: Optional[Union[str, Version]] = None,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[DownloadMode] = None,
force_local_path: Optional[str] = None,
dynamic_modules_path: Optional[str] = None,
**download_kwargs,
) -> ImportableModule:
"""
Download/extract/cache a metric module.
Metrics codes are cached inside the dynamic modules cache to allow easy import (avoid ugly sys.path tweaks).
Args:
path (str): Path or name of the metric script.
- if ``path`` is a local metric script or a directory containing a local metric script (if the script has the same name as the directory):
-> load the module from the metric script
e.g. ``'./metrics/accuracy'`` or ``'./metrics/accuracy/accuracy.py'``.
- if ``path`` is a metric on the Hugging Face Hub (ex: ``accuracy``, ``rouge``)
-> load the module from the metric script on the Hugging Face Hub
e.g. ``'accuracy'`` or ``'rouge'``.
revision (Optional ``Union[str, datasets.Version]``):
If specified, the module will be loaded from the repository at this version.
By default:
- it is set to the local version of the lib.
- it will also try to load it from the main branch if it's not available at the local version of the lib.
Specifying a version that is different from your local version of the lib might cause compatibility issues.
download_config (:class:`DownloadConfig`, optional): Specific download configuration parameters.
download_mode (:class:`DownloadMode`, default ``REUSE_DATASET_IF_EXISTS``): Download/generate mode.
force_local_path (Optional str): Optional path to a local directory in which to download and prepare the script.
Used to inspect or modify the script folder.
dynamic_modules_path (Optional str, defaults to HF_MODULES_CACHE / "datasets_modules", i.e. ~/.cache/huggingface/modules/datasets_modules):
Optional path to the directory in which the dynamic modules are saved. It must have been initialized with :obj:`init_dynamic_modules`.
By default the datasets and metrics are stored inside the `datasets_modules` module.
download_kwargs: optional attributes for DownloadConfig() which will override the attributes in download_config if supplied.
Returns:
ImportableModule
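Example (illustrative sketch; ``accuracy`` is a placeholder for any module name, and this function is assumed to be importable from ``evaluate.loading``):
```py
>>> from evaluate.loading import evaluation_module_factory
>>> module = evaluation_module_factory("accuracy")  # resolved on the Hub as evaluate-metric/accuracy
>>> module.module_path, module.hash
```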
"""
if download_config is None:
download_config = DownloadConfig(**download_kwargs)
download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
download_config.extract_compressed_file = True
download_config.force_extract = True
filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1]
if not filename.endswith(".py"):
filename = filename + ".py"
combined_path = os.path.join(path, filename)
# Try locally
if path.endswith(filename):
if os.path.isfile(path):
return LocalEvaluationModuleFactory(
path, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path
).get_module()
else:
raise FileNotFoundError(f"Couldn't find a metric script at {relative_to_absolute_path(path)}")
elif os.path.isfile(combined_path):
return LocalEvaluationModuleFactory(
combined_path, download_mode=download_mode, dynamic_modules_path=dynamic_modules_path
).get_module()
elif is_relative_path(path) and path.count("/") <= 1 and not force_local_path:
try:
# load a canonical evaluation module from hub
if path.count("/") == 0:
# if no type provided look through all possible modules
if module_type is None:
for current_type in ["metric", "comparison", "measurement"]:
try:
return HubEvaluationModuleFactory(
f"evaluate-{current_type}/{path}",
revision=revision,
download_config=download_config,
download_mode=download_mode,
dynamic_modules_path=dynamic_modules_path,
).get_module()
except ConnectionError:
pass
raise FileNotFoundError
# if module_type provided load specific module_type
else:
return HubEvaluationModuleFactory(
f"evaluate-{module_type}/{path}",
revision=revision,
download_config=download_config,
download_mode=download_mode,
dynamic_modules_path=dynamic_modules_path,
).get_module()
# load community evaluation module from hub
elif path.count("/") == 1:
return HubEvaluationModuleFactory(
path,
revision=revision,
download_config=download_config,
download_mode=download_mode,
dynamic_modules_path=dynamic_modules_path,
).get_module()
except Exception as e1: # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
# if it's a canonical module we need to check if it's any of the types
if path.count("/") == 0:
for current_type in ["metric", "comparison", "measurement"]:
try:
return CachedEvaluationModuleFactory(
f"evaluate-{current_type}--{path}", dynamic_modules_path=dynamic_modules_path
).get_module()
except Exception as e2: # noqa: if it's not in the cache, then it doesn't exist.
pass
# if it's a community module we just need to check on path
elif path.count("/") == 1:
try:
return CachedEvaluationModuleFactory(
path.replace("/", "--"), dynamic_modules_path=dynamic_modules_path
).get_module()
except Exception as e2: # noqa: if it's not in the cache, then it doesn't exist.
pass
if not isinstance(e1, (ConnectionError, FileNotFoundError)):
raise e1 from None
raise FileNotFoundError(
f"Couldn't find a module script at {relative_to_absolute_path(combined_path)}. "
f"Module '{path}' doesn't exist on the Hugging Face Hub either."
) from None
else:
raise FileNotFoundError(f"Couldn't find a module script at {relative_to_absolute_path(combined_path)}.")
def load(
path: str,
config_name: Optional[str] = None,
module_type: Optional[str] = None,
process_id: int = 0,
num_process: int = 1,
cache_dir: Optional[str] = None,
experiment_id: Optional[str] = None,
keep_in_memory: bool = False,
download_config: Optional[DownloadConfig] = None,
download_mode: Optional[DownloadMode] = None,
revision: Optional[Union[str, Version]] = None,
**init_kwargs,
) -> EvaluationModule:
"""Load a [`~evaluate.EvaluationModule`].
Args:
path (`str`):
Path to the evaluation processing script with the evaluation builder. Can be either:
- a local path to a processing script or the directory containing the script (if the script has the same name as the directory),
e.g. `'./metrics/rouge'` or `'./metrics/rouge/rouge.py'`
- an evaluation module identifier on the HuggingFace evaluate repo, e.g. `'rouge'` or `'bleu'`, located in either `'metrics/'`,
`'comparisons/'`, or `'measurements/'` depending on the provided `module_type`
config_name (`str`, *optional*):
Selecting a configuration for the metric (e.g. the GLUE metric has a configuration for each subset).
module_type (`str`, default `'metric'`):
Type of evaluation module, can be one of `'metric'`, `'comparison'`, or `'measurement'`.
process_id (`int`, *optional*):
For distributed evaluation: id of the process.
num_process (`int`, *optional*):
For distributed evaluation: total number of processes.
cache_dir (`str`, *optional*):
Path to store the temporary predictions and references (default to `~/.cache/huggingface/evaluate/`).
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
keep_in_memory (`bool`):
Whether to store the temporary results in memory (defaults to `False`).
download_config ([`~evaluate.DownloadConfig`], *optional*):
Specific download configuration parameters.
download_mode ([`DownloadMode`], defaults to `REUSE_DATASET_IF_EXISTS`):
Download/generate mode.
revision (`Union[str, evaluate.Version]`, *optional*):
If specified, the module will be loaded from the module repository
at this version. By default it is set to the local version of the lib. Specifying a version that is different from
your local version of the lib might cause compatibility issues.
Returns:
[`evaluate.EvaluationModule`]
Example:
```py
>>> from evaluate import load
>>> accuracy = load("accuracy")
```
"""
download_mode = DownloadMode(download_mode or DownloadMode.REUSE_DATASET_IF_EXISTS)
evaluation_module = evaluation_module_factory(
path, module_type=module_type, revision=revision, download_config=download_config, download_mode=download_mode
)
evaluation_cls = import_main_class(evaluation_module.module_path)
evaluation_instance = evaluation_cls(
config_name=config_name,
process_id=process_id,
num_process=num_process,
cache_dir=cache_dir,
keep_in_memory=keep_in_memory,
experiment_id=experiment_id,
hash=evaluation_module.hash,
**init_kwargs,
)
if module_type and module_type != evaluation_instance.module_type:
raise TypeError(
f"No module of module type '{module_type}' not found for '{path}' locally, or on the Hugging Face Hub. Found module of module type '{evaluation_instance.module_type}' instead."
)
# Download and prepare resources for the metric
evaluation_instance.download_and_prepare(download_config=download_config)
return evaluation_instance
# Copyright 2020 The HuggingFace Datasets Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
""" EvaluationModule base class."""
import collections
import itertools
import os
import types
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import pyarrow as pa
from datasets import DatasetInfo, DownloadConfig, DownloadManager
from datasets.arrow_dataset import Dataset
from datasets.arrow_reader import ArrowReader
from datasets.arrow_writer import ArrowWriter
from datasets.features import Features, Sequence, Value
from datasets.features.features import _check_non_null_non_empty_recursive
from datasets.utils.filelock import BaseFileLock, FileLock, Timeout
from datasets.utils.py_utils import copyfunc, temp_seed, zip_dict
from . import config
from .info import EvaluationModuleInfo
from .naming import camelcase_to_snakecase
from .utils.logging import get_logger
logger = get_logger(__name__)
class FileFreeLock(BaseFileLock):
"""Thread lock until a file **cannot** be locked"""
def __init__(self, lock_file, *args, **kwargs):
self.filelock = FileLock(lock_file)
super().__init__(lock_file, *args, **kwargs)
self._lock_file_fd = None
def _acquire(self):
try:
self.filelock.acquire(timeout=0.01, poll_interval=0.02) # Try to lock once
except Timeout:
# We couldn't acquire the lock, the file is locked!
self._lock_file_fd = self.filelock.lock_file
else:
# We were able to acquire the lock, the file is not yet locked!
self.filelock.release()
self._lock_file_fd = None
def _release(self):
self._lock_file_fd = None
@property
def is_locked(self) -> bool:
return self._lock_file_fd is not None
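# Usage sketch (see _check_all_processes_locks / _check_rendez_vous below): acquiring a FileFreeLock
# succeeds only once another process already holds the FileLock on the same lock file; if the file is
# free, acquire() keeps polling and eventually raises Timeout. The path below is a placeholder.
#
#   nofilelock = FileFreeLock("some-cache-file.arrow.lock")
#   nofilelock.acquire(timeout=10)  # raises Timeout if nobody holds the corresponding FileLock
#   nofilelock.release()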
# lists - summarize long lists similarly to NumPy
# arrays/tensors - let the frameworks control formatting
def summarize_if_long_list(obj):
if type(obj) is not list or len(obj) <= 6:
return f"{obj}"
def format_chunk(chunk):
return ", ".join(repr(x) for x in chunk)
return f"[{format_chunk(obj[:3])}, ..., {format_chunk(obj[-3:])}]"
class EvaluationModuleInfoMixin:
"""This base class exposes some attributes of EvaluationModuleInfo
at the base level of the EvaluationModule for easy access.
"""
def __init__(self, info: EvaluationModuleInfo):
self._module_info = info
@property
def info(self):
""":class:`evaluate.EvaluationModuleInfo` object containing all the metadata in the evaluation module."""
return self._module_info
@property
def name(self) -> str:
return self._module_info.module_name
@property
def experiment_id(self) -> Optional[str]:
return self._module_info.experiment_id
@property
def description(self) -> str:
return self._module_info.description
@property
def citation(self) -> str:
return self._module_info.citation
@property
def features(self) -> Features:
return self._module_info.features
@property
def inputs_description(self) -> str:
return self._module_info.inputs_description
@property
def homepage(self) -> Optional[str]:
return self._module_info.homepage
@property
def license(self) -> str:
return self._module_info.license
@property
def codebase_urls(self) -> Optional[List[str]]:
return self._module_info.codebase_urls
@property
def reference_urls(self) -> Optional[List[str]]:
return self._module_info.reference_urls
@property
def streamable(self) -> bool:
return self._module_info.streamable
@property
def format(self) -> Optional[str]:
return self._module_info.format
@property
def module_type(self) -> str:
return self._module_info.module_type
class EvaluationModule(EvaluationModuleInfoMixin):
"""A `EvaluationModule` is the base class and common API for metrics, comparisons, and measurements.
Args:
config_name (`str`):
This is used to define a hash specific to a module computation script and prevents the module's data
from being overridden when the module loading script is modified.
keep_in_memory (`bool`):
Keep all predictions and references in memory. Not possible in distributed settings.
cache_dir (`str`):
Path to a directory in which temporary prediction/references data will be stored.
The data directory should be located on a shared file-system in distributed setups.
num_process (`int`):
Specify the total number of nodes in a distributed setting.
This is useful to compute the module in distributed setups (in particular non-additive modules like F1).
process_id (`int`):
Specify the id of the current process in a distributed setup (between 0 and num_process-1).
This is useful to compute the module in distributed setups (in particular non-additive metrics like F1).
seed (`int`, optional):
If specified, this will temporarily set numpy's random seed when [`~evaluate.EvaluationModule.compute`] is run.
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute the module in distributed setups (in particular non-additive metrics like F1).
hash (`str`):
Used to identify the evaluation module according to the hashed file contents.
max_concurrent_cache_files (`int`):
Max number of concurrent module cache files (default `10000`).
timeout (`Union[int, float]`):
Timeout in seconds for distributed setting synchronization.
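Example (minimal usage sketch; the ``accuracy`` module stands in for any metric):
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> for ref, pred in zip([0, 1, 1, 0], [0, 1, 0, 0]):
...     accuracy.add(reference=ref, prediction=pred)
>>> accuracy.compute()
{'accuracy': 0.75}
```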
"""
def __init__(
self,
config_name: Optional[str] = None,
keep_in_memory: bool = False,
cache_dir: Optional[str] = None,
num_process: int = 1,
process_id: int = 0,
seed: Optional[int] = None,
experiment_id: Optional[str] = None,
hash: str = None,
max_concurrent_cache_files: int = 10000,
timeout: Union[int, float] = 100,
**kwargs,
):
# prepare info
self.config_name = config_name or "default"
info = self._info()
info.module_name = camelcase_to_snakecase(self.__class__.__name__)
info.config_name = self.config_name
info.experiment_id = experiment_id or "default_experiment"
EvaluationModuleInfoMixin.__init__(self, info) # For easy access on low level
# Safety checks on num_process and process_id
if not isinstance(process_id, int) or process_id < 0:
raise ValueError("'process_id' should be a number greater than 0")
if not isinstance(num_process, int) or num_process <= process_id:
raise ValueError("'num_process' should be a number greater than process_id")
if keep_in_memory and num_process != 1:
raise ValueError("Using 'keep_in_memory' is not possible in distributed setting (num_process > 1).")
self.num_process = num_process
self.process_id = process_id
self.max_concurrent_cache_files = max_concurrent_cache_files
self.keep_in_memory = keep_in_memory
self._data_dir_root = os.path.expanduser(cache_dir or config.HF_METRICS_CACHE)
self.data_dir = self._build_data_dir()
if seed is None:
_, seed, pos, *_ = np.random.get_state()
self.seed: int = seed[pos] if pos < 624 else seed[0]
else:
self.seed: int = seed
self.timeout: Union[int, float] = timeout
# Update 'compute' and 'add' docstring
# methods need to be copied otherwise it changes the docstrings of every instance
self.compute = types.MethodType(copyfunc(self.compute), self)
self.add_batch = types.MethodType(copyfunc(self.add_batch), self)
self.add = types.MethodType(copyfunc(self.add), self)
self.compute.__func__.__doc__ += self.info.inputs_description
self.add_batch.__func__.__doc__ += self.info.inputs_description
self.add.__func__.__doc__ += self.info.inputs_description
# self.arrow_schema = pa.schema(field for field in self.info.features.type)
self.selected_feature_format = None
self.buf_writer = None
self.writer = None
self.writer_batch_size = None
self.data = None
# This is the cache file we store our predictions/references in
# Keep it None for now so we can (cloud)pickle the object
self.cache_file_name = None
self.filelock = None
self.rendez_vous_lock = None
# This is all the cache files on which we have a lock when we are in a distributed setting
self.file_paths = None
self.filelocks = None
# This fingerprints the evaluation module according to the hashed contents of the module code
self._hash = hash
def __len__(self):
"""Return the number of examples (predictions or predictions/references pair)
currently stored in the evaluation module's cache.
"""
return 0 if self.writer is None else len(self.writer)
def __repr__(self):
return (
f'EvaluationModule(name: "{self.name}", module_type: "{self.module_type}", '
f'features: {self.features}, usage: """{self.inputs_description}""", '
f"stored examples: {len(self)})"
)
def _build_data_dir(self):
"""Path of this evaluation module in cache_dir:
Will be:
self._data_dir_root/self.name/self.config_name/self.hash (if not none)/
If any of these elements is missing, the corresponding subfolder is dropped.
"""
builder_data_dir = self._data_dir_root
builder_data_dir = os.path.join(builder_data_dir, self.name, self.config_name)
os.makedirs(builder_data_dir, exist_ok=True)
return builder_data_dir
def _create_cache_file(self, timeout=1) -> Tuple[str, FileLock]:
"""Create a new cache file. If the default cache file is used, we generated a new hash."""
file_path = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{self.process_id}.arrow")
filelock = None
for i in range(self.max_concurrent_cache_files):
filelock = FileLock(file_path + ".lock")
try:
filelock.acquire(timeout=timeout)
except Timeout:
# If we have reached the max number of attempts or we are not allowed to find a free name (distributed setup)
# We raise an error
if self.num_process != 1:
raise ValueError(
f"Error in _create_cache_file: another evaluation module instance is already using the local cache file at {file_path}. "
f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
f"between distributed evaluation module instances."
) from None
if i == self.max_concurrent_cache_files - 1:
raise ValueError(
f"Cannot acquire lock, too many evaluation module instance are operating concurrently on this file system."
f"You should set a larger value of max_concurrent_cache_files when creating the evaluation module "
f"(current value is {self.max_concurrent_cache_files})."
) from None
# In other cases (allowed to find a new file name + not yet at the max number of attempts) we can try to sample a new hashed name.
file_uuid = str(uuid.uuid4())
file_path = os.path.join(
self.data_dir, f"{self.experiment_id}-{file_uuid}-{self.num_process}-{self.process_id}.arrow"
)
else:
break
return file_path, filelock
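# Example of the resulting cache file name (using the default values): a single-process run with
# experiment_id="default_experiment", num_process=1 and process_id=0 writes to
# "<data_dir>/default_experiment-1-0.arrow"; if that file is already locked, a random UUID is
# inserted, e.g. "<data_dir>/default_experiment-<uuid4>-1-0.arrow".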
def _get_all_cache_files(self) -> Tuple[List[str], List[FileLock]]:
"""Get a lock on all the cache files in a distributed setup.
We wait for `timeout` seconds to let all the distributed nodes finish their tasks (default is 100 seconds).
"""
if self.num_process == 1:
if self.cache_file_name is None:
raise ValueError(
"Evaluation module cache file doesn't exist. Please make sure that you call `add` or `add_batch` "
"at least once before calling `compute`."
)
file_paths = [self.cache_file_name]
else:
file_paths = [
os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow")
for process_id in range(self.num_process)
]
# Let's acquire a lock on each process's file to be sure they are finished writing
filelocks = []
for process_id, file_path in enumerate(file_paths):
if process_id == 0: # process 0 already has its lock file
filelocks.append(self.filelock)
else:
filelock = FileLock(file_path + ".lock")
try:
filelock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(
f"Cannot acquire lock on cached file {file_path} for process {process_id}."
) from None
else:
filelocks.append(filelock)
return file_paths, filelocks
def _check_all_processes_locks(self):
expected_lock_file_names = [
os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-{process_id}.arrow.lock")
for process_id in range(self.num_process)
]
for expected_lock_file_name in expected_lock_file_names:
nofilelock = FileFreeLock(expected_lock_file_name)
try:
nofilelock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(
f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
) from None
else:
nofilelock.release()
def _check_rendez_vous(self):
expected_lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-0.arrow.lock")
nofilelock = FileFreeLock(expected_lock_file_name)
try:
nofilelock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(
f"Expected to find locked file {expected_lock_file_name} from process {self.process_id} but it doesn't exist."
) from None
else:
nofilelock.release()
lock_file_name = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
rendez_vous_lock = FileLock(lock_file_name)
try:
rendez_vous_lock.acquire(timeout=self.timeout)
except Timeout:
raise ValueError(f"Couldn't acquire lock on {lock_file_name} from process {self.process_id}.") from None
else:
rendez_vous_lock.release()
def _finalize(self):
"""Close all the writing process and load/gather the data
from all the nodes if main node or all_process is True.
"""
if self.writer is not None:
self.writer.finalize()
self.writer = None
# release the locks of the processes > 0 so that process 0 can lock them to read + delete the data
if self.filelock is not None and self.process_id > 0:
self.filelock.release()
if self.keep_in_memory:
# Read the predictions and references
reader = ArrowReader(path=self.data_dir, info=DatasetInfo(features=self.selected_feature_format))
self.data = Dataset.from_buffer(self.buf_writer.getvalue())
elif self.process_id == 0:
# Let's acquire a lock on each node's file to be sure they are finished writing
file_paths, filelocks = self._get_all_cache_files()
# Read the predictions and references
try:
reader = ArrowReader(path="", info=DatasetInfo(features=self.selected_feature_format))
self.data = Dataset(**reader.read_files([{"filename": f} for f in file_paths]))
except FileNotFoundError:
raise ValueError(
"Error in finalize: another evaluation module instance is already using the local cache file. "
"Please specify an experiment_id to avoid collision between distributed evaluation module instances."
) from None
# Store file paths and locks and we will release/delete them after the computation.
self.file_paths = file_paths
self.filelocks = filelocks
def compute(self, *, predictions=None, references=None, **kwargs) -> Optional[dict]:
"""Compute the evaluation module.
Usage of positional arguments is not allowed to prevent mistakes.
Args:
predictions (`list/array/tensor`, *optional*):
Predictions.
references (`list/array/tensor`, *optional*):
References.
**kwargs (optional):
Keyword arguments that will be forwarded to the evaluation module [`~evaluate.EvaluationModule.compute`]
method (see details in the docstring).
Return:
`dict` or `None`
- Dictionary with the results if this evaluation module is run on the main process (`process_id == 0`).
- `None` if the evaluation module is not run on the main process (`process_id != 0`).
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> accuracy.compute(predictions=[0, 1, 1, 0], references=[0, 1, 0, 1])
```
"""
all_kwargs = {"predictions": predictions, "references": references, **kwargs}
if predictions is None and references is None:
missing_kwargs = {k: None for k in self._feature_names() if k not in all_kwargs}
all_kwargs.update(missing_kwargs)
else:
missing_inputs = [k for k in self._feature_names() if k not in all_kwargs]
if missing_inputs:
raise ValueError(
f"Evaluation module inputs are missing: {missing_inputs}. All required inputs are {list(self._feature_names())}"
)
inputs = {input_name: all_kwargs[input_name] for input_name in self._feature_names()}
compute_kwargs = {k: kwargs[k] for k in kwargs if k not in self._feature_names()}
if any(v is not None for v in inputs.values()):
self.add_batch(**inputs)
self._finalize()
self.cache_file_name = None
self.filelock = None
self.selected_feature_format = None
if self.process_id == 0:
self.data.set_format(type=self.info.format)
inputs = {input_name: self.data[input_name] for input_name in self._feature_names()}
with temp_seed(self.seed):
output = self._compute(**inputs, **compute_kwargs)
if self.buf_writer is not None:
self.buf_writer = None
del self.data
self.data = None
else:
# Release locks and delete all the cache files. Process 0 is released last.
for filelock, file_path in reversed(list(zip(self.filelocks, self.file_paths))):
logger.info(f"Removing {file_path}")
del self.data
self.data = None
del self.writer
self.writer = None
os.remove(file_path)
filelock.release()
return output
else:
return None
def add_batch(self, *, predictions=None, references=None, **kwargs):
"""Add a batch of predictions and references for the evaluation module's stack.
Args:
predictions (`list/array/tensor`, *optional*):
Predictions.
references (`list/array/tensor`, *optional*):
References.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> for refs, preds in zip([[0,1],[0,1]], [[1,0],[0,1]]):
... accuracy.add_batch(references=refs, predictions=preds)
```
"""
bad_inputs = [input_name for input_name in kwargs if input_name not in self._feature_names()]
if bad_inputs:
raise ValueError(
f"Bad inputs for evaluation module: {bad_inputs}. All required inputs are {list(self._feature_names())}"
)
batch = {"predictions": predictions, "references": references, **kwargs}
batch = {input_name: batch[input_name] for input_name in self._feature_names()}
if self.writer is None:
self.selected_feature_format = self._infer_feature_from_batch(batch)
self._init_writer()
try:
for key, column in batch.items():
if len(column) > 0:
self._enforce_nested_string_type(self.selected_feature_format[key], column[0])
batch = self.selected_feature_format.encode_batch(batch)
self.writer.write_batch(batch)
except (pa.ArrowInvalid, TypeError):
if any(len(batch[c]) != len(next(iter(batch.values()))) for c in batch):
col0 = next(iter(batch))
bad_col = [c for c in batch if len(batch[c]) != len(batch[col0])][0]
error_msg = (
f"Mismatch in the number of {col0} ({len(batch[col0])}) and {bad_col} ({len(batch[bad_col])})"
)
elif set(self.selected_feature_format) != {"references", "predictions"}:
error_msg = (
f"Module inputs don't match the expected format.\n"
f"Expected format: {self.selected_feature_format },\n"
)
error_msg_inputs = ",\n".join(
f"Input {input_name}: {summarize_if_long_list(batch[input_name])}"
for input_name in self.selected_feature_format
)
error_msg += error_msg_inputs
else:
error_msg = (
f"Predictions and/or references don't match the expected format.\n"
f"Expected format: {self.selected_feature_format },\n"
f"Input predictions: {summarize_if_long_list(predictions)},\n"
f"Input references: {summarize_if_long_list(references)}"
)
raise ValueError(error_msg) from None
def add(self, *, prediction=None, reference=None, **kwargs):
"""Add one prediction and reference for the evaluation module's stack.
Args:
prediction (`list/array/tensor`, *optional*):
Predictions.
reference (`list/array/tensor`, *optional*):
References.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> accuracy.add(references=[0,1], predictions=[1,0])
```
"""
bad_inputs = [input_name for input_name in kwargs if input_name not in self._feature_names()]
if bad_inputs:
raise ValueError(
f"Bad inputs for evaluation module: {bad_inputs}. All required inputs are {list(self._feature_names())}"
)
example = {"predictions": prediction, "references": reference, **kwargs}
example = {input_name: example[input_name] for input_name in self._feature_names()}
if self.writer is None:
self.selected_feature_format = self._infer_feature_from_example(example)
self._init_writer()
try:
self._enforce_nested_string_type(self.selected_feature_format, example)
example = self.selected_feature_format.encode_example(example)
self.writer.write(example)
except (pa.ArrowInvalid, TypeError):
error_msg = (
f"Evaluation module inputs don't match the expected format.\n"
f"Expected format: {self.selected_feature_format},\n"
)
error_msg_inputs = ",\n".join(
f"Input {input_name}: {summarize_if_long_list(example[input_name])}"
for input_name in self.selected_feature_format
)
error_msg += error_msg_inputs
raise ValueError(error_msg) from None
def _infer_feature_from_batch(self, batch):
if isinstance(self.features, Features):
return self.features
else:
example = dict([(k, v[0]) for k, v in batch.items()])
return self._infer_feature_from_example(example)
def _infer_feature_from_example(self, example):
if isinstance(self.features, Features):
return self.features
else:
for features in self.features:
try:
self._enforce_nested_string_type(features, example)
features.encode_example(example)
return features
except (ValueError, TypeError):
continue
feature_strings = "\n".join([f"Feature option {i}: {feature}" for i, feature in enumerate(self.features)])
error_msg = (
f"Predictions and/or references don't match the expected format.\n"
f"Expected format:\n{feature_strings},\n"
f"Input predictions: {summarize_if_long_list(example['predictions'])},\n"
f"Input references: {summarize_if_long_list(example['references'])}"
)
raise ValueError(error_msg) from None
def _feature_names(self):
if isinstance(self.features, list):
feature_names = list(self.features[0].keys())
else:
feature_names = list(self.features.keys())
return feature_names
def _init_writer(self, timeout=1):
if self.num_process > 1:
if self.process_id == 0:
file_path = os.path.join(self.data_dir, f"{self.experiment_id}-{self.num_process}-rdv.lock")
self.rendez_vous_lock = FileLock(file_path)
try:
self.rendez_vous_lock.acquire(timeout=timeout)
except TimeoutError:
raise ValueError(
f"Error in _init_writer: another evalution module instance is already using the local cache file at {file_path}. "
f"Please specify an experiment_id (currently: {self.experiment_id}) to avoid collision "
f"between distributed evaluation module instances."
) from None
if self.keep_in_memory:
self.buf_writer = pa.BufferOutputStream()
self.writer = ArrowWriter(
features=self.selected_feature_format, stream=self.buf_writer, writer_batch_size=self.writer_batch_size
)
else:
self.buf_writer = None
# Get cache file name and lock it
if self.cache_file_name is None or self.filelock is None:
cache_file_name, filelock = self._create_cache_file() # get ready
self.cache_file_name = cache_file_name
self.filelock = filelock
self.writer = ArrowWriter(
features=self.selected_feature_format,
path=self.cache_file_name,
writer_batch_size=self.writer_batch_size,
)
# Setup rendez-vous here if needed (distributed setting)
if self.num_process > 1:
if self.process_id == 0:
self._check_all_processes_locks() # wait for everyone to be ready
self.rendez_vous_lock.release() # let everyone go
else:
self._check_rendez_vous() # wait for master to be ready and to let everyone go
def _info(self) -> EvaluationModuleInfo:
"""Construct the EvaluationModuleInfo object. See `EvaluationModuleInfo` for details.
Warning: This function is only called once and the result is cached for all
following .info() calls.
Returns:
info: (EvaluationModuleInfo) The EvaluationModule information
"""
raise NotImplementedError
def download_and_prepare(
self,
download_config: Optional[DownloadConfig] = None,
dl_manager: Optional[DownloadManager] = None,
):
"""Downloads and prepares evaluation module for reading.
Args:
download_config ([`DownloadConfig`], *optional*):
Specific download configuration parameters.
dl_manager ([`DownloadManager`], *optional*):
Specific download manager to use.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> accuracy.download_and_prepare()
```
"""
if dl_manager is None:
if download_config is None:
download_config = DownloadConfig()
download_config.cache_dir = os.path.join(self.data_dir, "downloads")
download_config.force_download = False
dl_manager = DownloadManager(
dataset_name=self.name, download_config=download_config, data_dir=self.data_dir
)
self._download_and_prepare(dl_manager)
def _download_and_prepare(self, dl_manager):
"""Downloads and prepares resources for the evaluation module.
This is the internal implementation to overwrite, called when the user calls
`download_and_prepare`. It should download all required resources for the evaluation module.
Args:
dl_manager (:class:`DownloadManager`): `DownloadManager` used to download and cache data.
"""
return None
def _compute(self, *, predictions=None, references=None, **kwargs) -> Dict[str, Any]:
"""This method defines the common API for all the evaluation module in the library"""
raise NotImplementedError
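# Illustrative sketch of a minimal subclass (not part of the library; the name "ExactMatch"
# and its behaviour are made up for demonstration only):
#
#   class ExactMatch(EvaluationModule):
#       def _info(self):
#           return EvaluationModuleInfo(
#               description="Fraction of predictions that equal their reference.",
#               citation="",
#               inputs_description="",
#               features=Features({"predictions": Value("int64"), "references": Value("int64")}),
#           )
#
#       def _compute(self, predictions, references):
#           matches = sum(p == r for p, r in zip(predictions, references))
#           return {"exact_match": matches / len(references)}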
def __del__(self):
if hasattr(self, "filelock") and self.filelock is not None:
self.filelock.release()
if hasattr(self, "rendez_vous_lock") and self.rendez_vous_lock is not None:
self.rendez_vous_lock.release()
if hasattr(self, "writer"): # in case it was already deleted
del self.writer
if hasattr(self, "data"): # in case it was already deleted
del self.data
def _enforce_nested_string_type(self, schema, obj):
"""
Recursively checks if there is any Value feature of type string and throws a TypeError if the corresponding object is not a string.
Since any Python object can be cast to a string, this avoids implicitly casting wrong input types (e.g. lists) to strings without an error.
"""
# Nested structures: we allow dict, list, tuples, sequences
if isinstance(schema, dict):
return [self._enforce_nested_string_type(sub_schema, o) for k, (sub_schema, o) in zip_dict(schema, obj)]
elif isinstance(schema, (list, tuple)):
sub_schema = schema[0]
return [self._enforce_nested_string_type(sub_schema, o) for o in obj]
elif isinstance(schema, Sequence):
# We allow reversing a list of dicts => dict of lists for compatibility with tfds
if isinstance(schema.feature, dict):
if isinstance(obj, (list, tuple)):
# obj is a list of dict
for k, dict_tuples in zip_dict(schema.feature, *obj):
for sub_obj in dict_tuples[1:]:
if _check_non_null_non_empty_recursive(sub_obj, dict_tuples[0]):
self._enforce_nested_string_type(dict_tuples[0], sub_obj)
break
return None
else:
# obj is a single dict
for k, (sub_schema, sub_objs) in zip_dict(schema.feature, obj):
for sub_obj in sub_objs:
if _check_non_null_non_empty_recursive(sub_obj, sub_schema):
self._enforce_nested_string_type(sub_schema, sub_obj)
break
return None
# schema.feature is not a dict
if isinstance(obj, str): # don't interpret a string as a list
raise ValueError(f"Got a string but expected a list instead: '{obj}'")
if obj is None:
return None
else:
if len(obj) > 0:
for first_elmt in obj:
if _check_non_null_non_empty_recursive(first_elmt, schema.feature):
break
if not isinstance(first_elmt, list):
return self._enforce_nested_string_type(schema.feature, first_elmt)
elif isinstance(schema, Value):
if pa.types.is_string(schema.pa_type) and not isinstance(obj, str):
raise TypeError(f"Expected type str but got {type(obj)}.")
class Metric(EvaluationModule):
"""A Metric is the base class and common API for all metrics.
Args:
config_name (`str`):
This is used to define a hash specific to a metric computation script and prevents the metric's data
from being overridden when the metric loading script is modified.
keep_in_memory (`bool`):
Keep all predictions and references in memory. Not possible in distributed settings.
cache_dir (`str`):
Path to a directory in which temporary prediction/references data will be stored.
The data directory should be located on a shared file-system in distributed setups.
num_process (`int`):
Specify the total number of nodes in a distributed setting.
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
process_id (`int`):
Specify the id of the current process in a distributed setup (between 0 and num_process-1)
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
seed (`int`, *optional*):
If specified, this will temporarily set numpy's random seed when [`~evaluate.Metric.compute`] is run.
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute metrics in distributed setups (in particular non-additive metrics like F1).
max_concurrent_cache_files (`int`):
Max number of concurrent metric cache files (default `10000`).
timeout (`Union[int, float]`):
Timeout in seconds for distributed setting synchronization.
"""
class Comparison(EvaluationModule):
"""A Comparison is the base class and common API for all comparisons.
Args:
config_name (`str`):
This is used to define a hash specific to a comparison computation script and prevents the comparison's data
from being overridden when the comparison loading script is modified.
keep_in_memory (`bool`):
Keep all predictions and references in memory. Not possible in distributed settings.
cache_dir (`str`):
Path to a directory in which temporary prediction/references data will be stored.
The data directory should be located on a shared file-system in distributed setups.
num_process (`int`):
Specify the total number of nodes in a distributed setting.
This is useful to compute comparisons in distributed setups (in particular non-additive comparisons).
process_id (`int`):
Specify the id of the current process in a distributed setup (between 0 and num_process-1)
This is useful to compute comparisons in distributed setups (in particular non-additive comparisons).
seed (`int`, *optional*):
If specified, this will temporarily set numpy's random seed when [`~evaluate.Comparison.compute`] is run.
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute comparisons in distributed setups (in particular non-additive comparisons).
max_concurrent_cache_files (`int`):
Max number of concurrent comparison cache files (default `10000`).
timeout (`Union[int, float]`):
Timeout in seconds for distributed setting synchronization.
"""
class Measurement(EvaluationModule):
"""A Measurement is the base class and common API for all measurements.
Args:
config_name (`str`):
This is used to define a hash specific to a measurement computation script and prevents the measurement's data
from being overridden when the measurement loading script is modified.
keep_in_memory (`bool`):
Keep all predictions and references in memory. Not possible in distributed settings.
cache_dir (`str`):
Path to a directory in which temporary prediction/references data will be stored.
The data directory should be located on a shared file-system in distributed setups.
num_process (`int`):
Specify the total number of nodes in a distributed setting.
This is useful to compute measurements in distributed setups (in particular non-additive measurements).
process_id (`int`):
Specify the id of the current process in a distributed setup (between 0 and num_process-1)
This is useful to compute measurements in distributed setups (in particular non-additive measurements).
seed (`int`, *optional*):
If specified, this will temporarily set numpy's random seed when [`~evaluate.Measurement.compute`] is run.
experiment_id (`str`):
A specific experiment id. This is used if several distributed evaluations share the same file system.
This is useful to compute measurements in distributed setups (in particular non-additive measurements).
max_concurrent_cache_files (`int`):
Max number of concurrent measurement cache files (default `10000`).
timeout (`Union[int, float]`):
Timeout in seconds for distributed setting synchronization.
"""
class CombinedEvaluations:
def __init__(self, evaluation_modules, force_prefix=False):
from .loading import load # avoid circular imports
self.evaluation_module_names = None
if isinstance(evaluation_modules, list):
self.evaluation_modules = evaluation_modules
elif isinstance(evaluation_modules, dict):
self.evaluation_modules = list(evaluation_modules.values())
self.evaluation_module_names = list(evaluation_modules.keys())
loaded_modules = []
for module in self.evaluation_modules:
if isinstance(module, str):
module = load(module)
loaded_modules.append(module)
self.evaluation_modules = loaded_modules
if self.evaluation_module_names is None:
self.evaluation_module_names = [module.name for module in self.evaluation_modules]
self.force_prefix = force_prefix
def add(self, prediction=None, reference=None, **kwargs):
"""Add one prediction and reference for each evaluation module's stack.
Args:
prediction (`list/array/tensor`, *optional*):
Prediction.
reference (`list/array/tensor`, *optional*):
Reference.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> f1 = evaluate.load("f1")
>>> clf_metrics = combine(["accuracy", "f1"])
>>> for ref, pred in zip([0,1,0,1], [1,0,0,1]):
... clf_metrics.add(references=ref, predictions=pred)
```
"""
for evaluation_module in self.evaluation_modules:
batch = {"predictions": prediction, "references": reference, **kwargs}
batch = {input_name: batch[input_name] for input_name in evaluation_module._feature_names()}
evaluation_module.add(**batch)
def add_batch(self, predictions=None, references=None, **kwargs):
"""Add a batch of predictions and references for each evaluation module's stack.
Args:
predictions (`list/array/tensor`, *optional*):
Predictions.
references (`list/array/tensor`, *optional*):
References.
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> f1 = evaluate.load("f1")
>>> clf_metrics = combine(["accuracy", "f1"])
>>> for refs, preds in zip([[0,1],[0,1]], [[1,0],[0,1]]):
... clf_metrics.add_batch(references=refs, predictions=preds)
```
"""
for evaluation_module in self.evaluation_modules:
batch = {"predictions": predictions, "references": references, **kwargs}
batch = {input_name: batch[input_name] for input_name in evaluation_module._feature_names()}
evaluation_module.add_batch(**batch)
def compute(self, predictions=None, references=None, **kwargs):
"""Compute each evaluation module.
Usage of positional arguments is not allowed to prevent mistakes.
Args:
predictions (`list/array/tensor`, *optional*):
Predictions.
references (`list/array/tensor`, *optional*):
References.
**kwargs (*optional*):
Keyword arguments that will be forwarded to the evaluation module [`~evaluate.EvaluationModule.compute`]
method (see details in the docstring).
Return:
`dict` or `None`
- Dictionary with the results if this evaluation module is run on the main process (`process_id == 0`).
- `None` if the evaluation module is not run on the main process (`process_id != 0`).
Example:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> f1 = evaluate.load("f1")
>>> clf_metrics = combine(["accuracy", "f1"])
>>> clf_metrics.compute(predictions=[0,1], references=[1,1])
{'accuracy': 0.5, 'f1': 0.6666666666666666}
```
"""
results = []
for evaluation_module in self.evaluation_modules:
batch = {"predictions": predictions, "references": references, **kwargs}
results.append(evaluation_module.compute(**batch))
return self._merge_results(results)
def _merge_results(self, results):
merged_results = {}
results_keys = list(itertools.chain.from_iterable([r.keys() for r in results]))
duplicate_keys = {item for item, count in collections.Counter(results_keys).items() if count > 1}
duplicate_names = [
item for item, count in collections.Counter(self.evaluation_module_names).items() if count > 1
]
duplicate_counter = {name: 0 for name in duplicate_names}
for module_name, result in zip(self.evaluation_module_names, results):
for k, v in result.items():
if k not in duplicate_keys and not self.force_prefix:
merged_results[f"{k}"] = v
elif module_name in duplicate_counter:
merged_results[f"{module_name}_{duplicate_counter[module_name]}_{k}"] = v
else:
merged_results[f"{module_name}_{k}"] = v
if module_name in duplicate_counter:
duplicate_counter[module_name] += 1
return merged_results
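# Merging sketch (names and values are illustrative): with module names ["accuracy", "f1"] and
# results [{"accuracy": 0.5}, {"f1": 0.6}], the merged dict is {"accuracy": 0.5, "f1": 0.6}.
# If both modules returned the key "score", the colliding keys would be prefixed with the module
# names instead: {"accuracy_score": ..., "f1_score": ...}; duplicate module names additionally
# get an integer counter, e.g. "f1_0_score" and "f1_1_score".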
def combine(evaluations, force_prefix=False):
"""Combines several metrics, comparisons, or measurements into a single `CombinedEvaluations` object that
can be used like a single evaluation module.
If two scores have the same name, they are prefixed with their module names.
If two modules have the same name, use a dictionary to give them different names; otherwise an integer id is appended to the prefix.
Args:
evaluations (`Union[list, dict]`):
A list or dictionary of evaluation modules. The modules can either be passed
as strings or loaded `EvaluationModule`s. If a dictionary is passed its keys are the names used and the values the modules.
The names are used as prefix in case there are name overlaps in the returned results of each module or if `force_prefix=True`.
force_prefix (`bool`, *optional*, defaults to `False`):
If `True`, all scores from the modules are prefixed with their name. If
a dictionary is passed, its keys are used as names, otherwise the module's name is used.
Examples:
```py
>>> import evaluate
>>> accuracy = evaluate.load("accuracy")
>>> f1 = evaluate.load("f1")
>>> clf_metrics = combine(["accuracy", "f1"])
```
"""
return CombinedEvaluations(evaluations, force_prefix=force_prefix)
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""Utilities for file names."""
import itertools
import os
import re
_uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
_lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")
_single_underscore_re = re.compile(r"(?<!_)_(?!_)")
_multiple_underscores_re = re.compile(r"(_{2,})")
_split_re = r"^\w+(\.\w+)*$"
def camelcase_to_snakecase(name):
"""Convert camel-case string to snake-case."""
name = _uppercase_uppercase_re.sub(r"\1_\2", name)
name = _lowercase_uppercase_re.sub(r"\1_\2", name)
return name.lower()
def snakecase_to_camelcase(name):
"""Convert snake-case string to camel-case string."""
name = _single_underscore_re.split(name)
name = [_multiple_underscores_re.split(n) for n in name]
return "".join(n.capitalize() for n in itertools.chain.from_iterable(name) if n != "")
def filename_prefix_for_name(name):
if os.path.basename(name) != name:
raise ValueError(f"Should be a dataset name, not a path: {name}")
return camelcase_to_snakecase(name)
def filename_prefix_for_split(name, split):
if os.path.basename(name) != name:
raise ValueError(f"Should be a dataset name, not a path: {name}")
if not re.match(_split_re, split):
raise ValueError(f"Split name should match '{_split_re}'' but got '{split}'.")
return f"{filename_prefix_for_name(name)}-{split}"
def filepattern_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
prefix = filename_prefix_for_split(dataset_name, split)
if filetype_suffix:
prefix += f".{filetype_suffix}"
filepath = os.path.join(data_dir, prefix)
return f"{filepath}*"
def filename_for_dataset_split(dataset_name, split, filetype_suffix=None):
prefix = filename_prefix_for_split(dataset_name, split)
if filetype_suffix:
prefix += f".{filetype_suffix}"
return prefix
def filepath_for_dataset_split(dataset_name, split, data_dir, filetype_suffix=None):
filename = filename_for_dataset_split(
dataset_name=dataset_name,
split=split,
filetype_suffix=filetype_suffix,
)
filepath = os.path.join(data_dir, filename)
return filepath
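# Naming sketch (values are illustrative): for dataset_name="squad", split="train",
# data_dir="/data" and filetype_suffix="arrow":
#   filename_for_dataset_split(...)    -> "squad-train.arrow"
#   filepath_for_dataset_split(...)    -> "/data/squad-train.arrow"  (on POSIX)
#   filepattern_for_dataset_split(...) -> "/data/squad-train.arrow*"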
import json
import os
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from datasets.utils.filelock import FileLock
from . import __version__
def save(path_or_file, **data):
"""
Saves results to a JSON file. Also saves system information such as current time, current commit
hash if inside a repository, and Python system information.
Args:
path_or_file (`str`):
Path or file name to store the results in. If only a folder is provided,
the results file will be saved with a name of the format `"result-%Y_%m_%d-%H_%M_%S.json"`.
Example:
```py
>>> import evaluate
>>> result = {"bleu": 0.7}
>>> params = {"model": "gpt-2"}
>>> evaluate.save("./results/", **result, **params)
```
"""
current_time = datetime.now()
file_path = _setup_path(path_or_file, current_time)
data["_timestamp"] = current_time.isoformat()
data["_git_commit_hash"] = _git_commit_hash()
data["_evaluate_version"] = __version__
data["_python_version"] = sys.version
data["_interpreter_path"] = sys.executable
with FileLock(str(file_path) + ".lock"):
with open(file_path, "w") as f:
json.dump(data, f)
# cleanup lock file
try:
os.remove(str(file_path) + ".lock")
except FileNotFoundError:
pass
return file_path
def _setup_path(path_or_file, current_time):
path_or_file = Path(path_or_file)
is_file = len(path_or_file.suffix) > 0
if is_file:
folder = path_or_file.parent
file_name = path_or_file.name
else:
folder = path_or_file
file_name = "result-" + current_time.strftime("%Y_%m_%d-%H_%M_%S") + ".json"
folder.mkdir(parents=True, exist_ok=True)
return folder / file_name
def _git_commit_hash():
res = subprocess.run("git rev-parse --is-inside-work-tree".split(), cwd="./", stdout=subprocess.PIPE)
if res.stdout.decode().strip() == "true":
res = subprocess.run("git rev-parse HEAD".split(), cwd=os.getcwd(), stdout=subprocess.PIPE)
return res.stdout.decode().strip()
else:
return None
# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# flake8: noqa
# Lint as: python3
"""Util import."""
__all__ = [
"disable_progress_bar",
"enable_progress_bar",
"is_progress_bar_enabled",
"infer_gradio_input_types",
"json_to_string_type",
"parse_readme",
"parse_gradio_data",
"parse_test_cases",
"launch_gradio_widget",
]
from .gradio import (
infer_gradio_input_types,
json_to_string_type,
launch_gradio_widget,
parse_gradio_data,
parse_readme,
parse_test_cases,
)
from .logging import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled
"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
import copy
import io
import json
import os
import posixpath
import re
import shutil
import sys
import tempfile
import time
import urllib
from contextlib import closing, contextmanager
from functools import partial
from hashlib import sha256
from pathlib import Path
from typing import List, Optional, Type, TypeVar, Union
from urllib.parse import urljoin, urlparse
import requests
from datasets import DownloadConfig
from datasets.utils.extract import ExtractManager
from datasets.utils.filelock import FileLock
from .. import __version__, config
from . import logging
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
INCOMPLETE_SUFFIX = ".incomplete"
T = TypeVar("T", str, Path)
def init_hf_modules(hf_modules_cache: Optional[Union[Path, str]] = None) -> str:
"""
Add hf_modules_cache to the python path.
By default hf_modules_cache='~/.cache/huggingface/modules'.
It can also be set with the environment variable HF_MODULES_CACHE.
This is used to add modules such as `datasets_modules`
"""
hf_modules_cache = hf_modules_cache if hf_modules_cache is not None else config.HF_MODULES_CACHE
hf_modules_cache = str(hf_modules_cache)
if hf_modules_cache not in sys.path:
sys.path.append(hf_modules_cache)
os.makedirs(hf_modules_cache, exist_ok=True)
if not os.path.exists(os.path.join(hf_modules_cache, "__init__.py")):
with open(os.path.join(hf_modules_cache, "__init__.py"), "w"):
pass
return hf_modules_cache
def is_remote_url(url_or_filename: str) -> bool:
parsed = urlparse(url_or_filename)
return parsed.scheme in ("http", "https", "s3", "gs", "hdfs", "ftp")
def is_local_path(url_or_filename: str) -> bool:
# On unix the scheme of a local path is empty (for both absolute and relative),
# while on windows the scheme is the drive name (ex: "c") for absolute paths.
# for details on the windows behavior, see https://bugs.python.org/issue42215
return urlparse(url_or_filename).scheme == "" or os.path.ismount(urlparse(url_or_filename).scheme + ":/")
def is_relative_path(url_or_filename: str) -> bool:
return urlparse(url_or_filename).scheme == "" and not os.path.isabs(url_or_filename)
def relative_to_absolute_path(path: T) -> T:
"""Convert relative path to absolute path."""
abs_path_str = os.path.abspath(os.path.expanduser(os.path.expandvars(str(path))))
return Path(abs_path_str) if isinstance(path, Path) else abs_path_str
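# Illustrative behaviour of the path/URL helpers above (a sketch, not a test suite):
# >>> is_remote_url("https://huggingface.co/datasets/squad/resolve/main/README.md")
# True
# >>> is_local_path("./some/local/metric.py")
# True
# >>> is_relative_path("some/local/metric.py")
# True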
def hf_bucket_url(identifier: str, filename: str, use_cdn=False, dataset=True) -> str:
if dataset:
endpoint = config.CLOUDFRONT_DATASETS_DISTRIB_PREFIX if use_cdn else config.S3_DATASETS_BUCKET_PREFIX
else:
endpoint = config.CLOUDFRONT_METRICS_DISTRIB_PREFIX if use_cdn else config.S3_METRICS_BUCKET_PREFIX
return "/".join((endpoint, identifier, filename))
def head_hf_s3(
identifier: str, filename: str, use_cdn=False, dataset=True, max_retries=0
) -> Union[requests.Response, Exception]:
return http_head(
hf_bucket_url(identifier=identifier, filename=filename, use_cdn=use_cdn, dataset=dataset),
max_retries=max_retries,
)
def hf_hub_url(path: str, name: str, revision: Optional[str] = None) -> str:
revision = revision or config.HUB_DEFAULT_VERSION
return config.HUB_EVALUATE_URL.format(path=path, name=name, revision=revision)
def url_or_path_join(base_name: str, *pathnames: str) -> str:
if is_remote_url(base_name):
return posixpath.join(base_name, *(str(pathname).replace(os.sep, "/").lstrip("/") for pathname in pathnames))
else:
return Path(base_name, *pathnames).as_posix()
def url_or_path_parent(url_or_path: str) -> str:
if is_remote_url(url_or_path):
return url_or_path[: url_or_path.rindex("/")]
else:
return os.path.dirname(url_or_path)
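# Sketch of how the join helpers behave (the remote URL below is only an example):
# >>> url_or_path_join("https://example.com/evaluate-metric/accuracy/resolve/main", "accuracy.py")
# 'https://example.com/evaluate-metric/accuracy/resolve/main/accuracy.py'
# >>> url_or_path_join("local", "folder", "metric.py")
# 'local/folder/metric.py'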
def hash_url_to_filename(url, etag=None):
"""
Convert `url` into a hashed filename in a repeatable way.
If `etag` is specified, append its hash to the url's, delimited
by a period.
    If the url ends with .py, '.py' is appended to the hashed name so that the cached
    module file keeps its Python extension (see the check at the end of this function).
"""
url_bytes = url.encode("utf-8")
url_hash = sha256(url_bytes)
filename = url_hash.hexdigest()
if etag:
etag_bytes = etag.encode("utf-8")
etag_hash = sha256(etag_bytes)
filename += "." + etag_hash.hexdigest()
if url.endswith(".py"):
filename += ".py"
return filename
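# A minimal sketch of the repeatable hashing (the URL is hypothetical):
# >>> name = hash_url_to_filename("https://example.com/metrics/accuracy.py", etag='"abc"')
# >>> name == hash_url_to_filename("https://example.com/metrics/accuracy.py", etag='"abc"')
# True
# >>> name.endswith(".py")  # the .py suffix is preserved so cached modules keep their extension
# True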
def cached_path(
url_or_filename,
download_config=None,
**download_kwargs,
) -> str:
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
return the path to the cached file. If it's already a local path,
make sure the file exists and then return the path.
Return:
Local path (string)
Raises:
FileNotFoundError: in case of non-recoverable file
(non-existent or no cache on disk)
ConnectionError: in case of unreachable url
and no cache on disk
ValueError: if it couldn't parse the url or filename correctly
requests.exceptions.ConnectionError: in case of internet connection issue
"""
if download_config is None:
download_config = DownloadConfig(**download_kwargs)
cache_dir = download_config.cache_dir or config.DOWNLOADED_EVALUATE_PATH
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
if isinstance(url_or_filename, Path):
url_or_filename = str(url_or_filename)
if is_remote_url(url_or_filename):
# URL, so get it from the cache (downloading if necessary)
output_path = get_from_cache(
url_or_filename,
cache_dir=cache_dir,
force_download=download_config.force_download,
proxies=download_config.proxies,
resume_download=download_config.resume_download,
user_agent=download_config.user_agent,
local_files_only=download_config.local_files_only,
use_etag=download_config.use_etag,
max_retries=download_config.max_retries,
use_auth_token=download_config.use_auth_token,
ignore_url_params=download_config.ignore_url_params,
download_desc=download_config.download_desc,
)
elif os.path.exists(url_or_filename):
# File, and it exists.
output_path = url_or_filename
elif is_local_path(url_or_filename):
# File, but it doesn't exist.
raise FileNotFoundError(f"Local file {url_or_filename} doesn't exist")
else:
# Something unknown
raise ValueError(f"unable to parse {url_or_filename} as a URL or as a local path")
if output_path is None:
return output_path
if download_config.extract_compressed_file:
output_path = ExtractManager(cache_dir=download_config.cache_dir).extract(
output_path, force_extract=download_config.force_extract
)
return output_path
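# Hedged usage sketch for `cached_path` (the URL is a placeholder, not a real resource):
# >>> local_file = cached_path("https://example.com/metrics/accuracy/accuracy.py")  # downloaded and cached
# >>> local_file = cached_path("./my_metrics/accuracy.py")  # an existing local path is returned unchanged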
def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str:
ua = f"datasets/{__version__}; python/{config.PY_VERSION}"
ua += f"; pyarrow/{config.PYARROW_VERSION}"
if config.TORCH_AVAILABLE:
ua += f"; torch/{config.TORCH_VERSION}"
if config.TF_AVAILABLE:
ua += f"; tensorflow/{config.TF_VERSION}"
if config.JAX_AVAILABLE:
ua += f"; jax/{config.JAX_VERSION}"
if isinstance(user_agent, dict):
ua += f"; {'; '.join(f'{k}/{v}' for k, v in user_agent.items())}"
elif isinstance(user_agent, str):
ua += "; " + user_agent
return ua
def get_authentication_headers_for_url(url: str, use_auth_token: Optional[Union[str, bool]] = None) -> dict:
"""Handle the HF authentication"""
headers = {}
if url.startswith(config.HF_ENDPOINT):
token = None
if isinstance(use_auth_token, str):
token = use_auth_token
elif bool(use_auth_token):
from huggingface_hub import hf_api
token = hf_api.HfFolder.get_token()
if token:
headers["authorization"] = f"Bearer {token}"
return headers
class OfflineModeIsEnabled(ConnectionError):
pass
def _raise_if_offline_mode_is_enabled(msg: Optional[str] = None):
"""Raise an OfflineModeIsEnabled error (subclass of ConnectionError) if HF_EVALUATE_OFFLINE is True."""
if config.HF_EVALUATE_OFFLINE:
raise OfflineModeIsEnabled(
"Offline mode is enabled." if msg is None else "Offline mode is enabled. " + str(msg)
)
def _retry(
func,
func_args: Optional[tuple] = None,
func_kwargs: Optional[dict] = None,
exceptions: Type[requests.exceptions.RequestException] = requests.exceptions.RequestException,
status_codes: Optional[List[int]] = None,
max_retries: int = 0,
base_wait_time: float = 0.5,
max_wait_time: float = 2,
):
func_args = func_args or ()
func_kwargs = func_kwargs or {}
retry = 0
while True:
try:
return func(*func_args, **func_kwargs)
except exceptions as err:
if retry >= max_retries or (status_codes and err.response.status_code not in status_codes):
raise err
else:
sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff
logger.info(f"{func} timed out, retrying in {sleep_time}s... [{retry/max_retries}]")
time.sleep(sleep_time)
retry += 1
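# Illustrative use of `_retry` (a sketch; `requests.get` and the URL are stand-ins):
# >>> _retry(
# ...     requests.get,
# ...     func_args=("https://example.com/health",),
# ...     exceptions=requests.exceptions.ConnectTimeout,
# ...     max_retries=3,
# ... )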
def _request_with_retry(
method: str,
url: str,
max_retries: int = 0,
base_wait_time: float = 0.5,
max_wait_time: float = 2,
timeout: float = 10.0,
**params,
) -> requests.Response:
"""Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff.
    Note that if the environment variable HF_EVALUATE_OFFLINE is set to 1, then an OfflineModeIsEnabled error is raised.
Args:
method (str): HTTP method, such as 'GET' or 'HEAD'.
url (str): The URL of the resource to fetch.
max_retries (int): Maximum number of retries, defaults to 0 (no retries).
base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between
retries then grows exponentially, capped by max_wait_time.
max_wait_time (float): Maximum amount of time between two retries, in seconds.
**params: Params to pass to :obj:`requests.request`.
"""
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
tries, success = 0, False
while not success:
tries += 1
try:
response = requests.request(method=method.upper(), url=url, timeout=timeout, **params)
success = True
except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as err:
if tries > max_retries:
raise err
else:
logger.info(f"{method} request to {url} timed out, retrying... [{tries/max_retries}]")
sleep_time = min(max_wait_time, base_wait_time * 2 ** (tries - 1)) # Exponential backoff
time.sleep(sleep_time)
return response
def ftp_head(url, timeout=10.0):
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
try:
with closing(urllib.request.urlopen(url, timeout=timeout)) as r:
r.read(1)
except Exception:
return False
return True
def ftp_get(url, temp_file, timeout=10.0):
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
try:
logger.info(f"Getting through FTP {url} into {temp_file.name}")
with closing(urllib.request.urlopen(url, timeout=timeout)) as r:
shutil.copyfileobj(r, temp_file)
except urllib.error.URLError as e:
raise ConnectionError(e) from None
def http_get(
url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0, desc=None
):
headers = copy.deepcopy(headers) or {}
headers["user-agent"] = get_datasets_user_agent(user_agent=headers.get("user-agent"))
if resume_size > 0:
headers["Range"] = f"bytes={resume_size:d}-"
response = _request_with_retry(
method="GET",
url=url,
stream=True,
proxies=proxies,
headers=headers,
cookies=cookies,
max_retries=max_retries,
timeout=timeout,
)
if response.status_code == 416: # Range not satisfiable
return
content_length = response.headers.get("Content-Length")
total = resume_size + int(content_length) if content_length is not None else None
with logging.tqdm(
unit="B",
unit_scale=True,
total=total,
initial=resume_size,
desc=desc or "Downloading",
disable=not logging.is_progress_bar_enabled(),
) as progress:
for chunk in response.iter_content(chunk_size=1024):
progress.update(len(chunk))
temp_file.write(chunk)
def http_head(
url, proxies=None, headers=None, cookies=None, allow_redirects=True, timeout=10.0, max_retries=0
) -> requests.Response:
headers = copy.deepcopy(headers) or {}
headers["user-agent"] = get_datasets_user_agent(user_agent=headers.get("user-agent"))
response = _request_with_retry(
method="HEAD",
url=url,
proxies=proxies,
headers=headers,
cookies=cookies,
allow_redirects=allow_redirects,
timeout=timeout,
max_retries=max_retries,
)
return response
def request_etag(url: str, use_auth_token: Optional[Union[str, bool]] = None) -> Optional[str]:
headers = get_authentication_headers_for_url(url, use_auth_token=use_auth_token)
response = http_head(url, headers=headers, max_retries=3)
response.raise_for_status()
etag = response.headers.get("ETag") if response.ok else None
return etag
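# Sketch: fetching the ETag of a hosted file (the URL is a placeholder).
# >>> etag = request_etag("https://example.com/some-space/resolve/main/app.py")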
def get_from_cache(
url,
cache_dir=None,
force_download=False,
proxies=None,
etag_timeout=100,
resume_download=False,
user_agent=None,
local_files_only=False,
use_etag=True,
max_retries=0,
use_auth_token=None,
ignore_url_params=False,
download_desc=None,
) -> str:
"""
Given a URL, look for the corresponding file in the local cache.
If it's not there, download it. Then return the path to the cached file.
Return:
Local path (string)
Raises:
FileNotFoundError: in case of non-recoverable file
(non-existent or no cache on disk)
ConnectionError: in case of unreachable url
and no cache on disk
"""
if cache_dir is None:
cache_dir = config.HF_EVALUATE_CACHE
if isinstance(cache_dir, Path):
cache_dir = str(cache_dir)
os.makedirs(cache_dir, exist_ok=True)
if ignore_url_params:
# strip all query parameters and #fragments from the URL
cached_url = urljoin(url, urlparse(url).path)
else:
cached_url = url # additional parameters may be added to the given URL
connected = False
response = None
cookies = None
etag = None
head_error = None
    # Try a first time to find the file on the local file system without eTag (None)
# if we don't ask for 'force_download' then we spare a request
filename = hash_url_to_filename(cached_url, etag=None)
cache_path = os.path.join(cache_dir, filename)
if os.path.exists(cache_path) and not force_download and not use_etag:
return cache_path
# Prepare headers for authentication
headers = get_authentication_headers_for_url(url, use_auth_token=use_auth_token)
if user_agent is not None:
headers["user-agent"] = user_agent
# We don't have the file locally or we need an eTag
if not local_files_only:
if url.startswith("ftp://"):
connected = ftp_head(url)
try:
response = http_head(
url,
allow_redirects=True,
proxies=proxies,
timeout=etag_timeout,
max_retries=max_retries,
headers=headers,
)
if response.status_code == 200: # ok
etag = response.headers.get("ETag") if use_etag else None
for k, v in response.cookies.items():
# In some edge cases, we need to get a confirmation token
if k.startswith("download_warning") and "drive.google.com" in url:
url += "&confirm=" + v
cookies = response.cookies
connected = True
# Fix Google Drive URL to avoid Virus scan warning
if "drive.google.com" in url and "confirm=" not in url:
url += "&confirm=t"
# In some edge cases, head request returns 400 but the connection is actually ok
elif (
(response.status_code == 400 and "firebasestorage.googleapis.com" in url)
or (response.status_code == 405 and "drive.google.com" in url)
or (
response.status_code == 403
and (
re.match(r"^https?://github.com/.*?/.*?/releases/download/.*?/.*?$", url)
or re.match(r"^https://.*?s3.*?amazonaws.com/.*?$", response.url)
)
)
or (response.status_code == 403 and "ndownloader.figstatic.com" in url)
):
connected = True
logger.info(f"Couldn't get ETag version for url {url}")
elif response.status_code == 401 and config.HF_ENDPOINT in url and use_auth_token is None:
raise ConnectionError(
f"Unauthorized for URL {url}. Please use the parameter ``use_auth_token=True`` after logging in with ``huggingface-cli login``"
)
except (OSError, requests.exceptions.Timeout) as e:
# not connected
head_error = e
pass
    # connected is False: we don't have a connection, the url doesn't exist, or it is otherwise inaccessible.
# try to get the last downloaded one
if not connected:
if os.path.exists(cache_path) and not force_download:
return cache_path
if local_files_only:
raise FileNotFoundError(
f"Cannot find the requested files in the cached path at {cache_path} and outgoing traffic has been"
" disabled. To enable file online look-ups, set 'local_files_only' to False."
)
elif response is not None and response.status_code == 404:
raise FileNotFoundError(f"Couldn't find file at {url}")
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
if head_error is not None:
raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})")
elif response is not None:
raise ConnectionError(f"Couldn't reach {url} (error {response.status_code})")
else:
raise ConnectionError(f"Couldn't reach {url}")
# Try a second time
filename = hash_url_to_filename(cached_url, etag)
cache_path = os.path.join(cache_dir, filename)
if os.path.exists(cache_path) and not force_download:
return cache_path
# From now on, connected is True.
# Prevent parallel downloads of the same file with a lock.
lock_path = cache_path + ".lock"
with FileLock(lock_path):
if resume_download:
incomplete_path = cache_path + ".incomplete"
@contextmanager
def _resumable_file_manager():
with open(incomplete_path, "a+b") as f:
yield f
temp_file_manager = _resumable_file_manager
if os.path.exists(incomplete_path):
resume_size = os.stat(incomplete_path).st_size
else:
resume_size = 0
else:
temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
resume_size = 0
# Download to temporary file, then copy to cache dir once finished.
# Otherwise you get corrupt cache entries if the download gets interrupted.
with temp_file_manager() as temp_file:
logger.info(f"{url} not found in cache or force_download set to True, downloading to {temp_file.name}")
# GET file object
if url.startswith("ftp://"):
ftp_get(url, temp_file)
else:
http_get(
url,
temp_file,
proxies=proxies,
resume_size=resume_size,
headers=headers,
cookies=cookies,
max_retries=max_retries,
desc=download_desc,
)
logger.info(f"storing {url} in cache at {cache_path}")
shutil.move(temp_file.name, cache_path)
logger.info(f"creating metadata file for {cache_path}")
meta = {"url": url, "etag": etag}
meta_path = cache_path + ".json"
with open(meta_path, "w", encoding="utf-8") as meta_file:
json.dump(meta, meta_file)
return cache_path
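# After a successful download, the cache directory holds related entries per URL
# (illustrative layout; <hash> is the filename produced by `hash_url_to_filename`):
#   <hash>        the downloaded file itself
#   <hash>.json   the {"url": ..., "etag": ...} metadata written above
#   <hash>.lock   the FileLock used to serialize concurrent downloads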
def add_start_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = "".join(docstr) + "\n\n" + (fn.__doc__ if fn.__doc__ is not None else "")
return fn
return docstring_decorator
def add_end_docstrings(*docstr):
def docstring_decorator(fn):
fn.__doc__ = (fn.__doc__ if fn.__doc__ is not None else "") + "\n\n" + "".join(docstr)
return fn
return docstring_decorator
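# Minimal sketch of the docstring decorators (the function below is purely illustrative):
# >>> @add_start_docstrings("Shared introduction.")
# ... def my_fn():
# ...     """Specific details."""
# >>> my_fn.__doc__
# 'Shared introduction.\n\nSpecific details.'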
def estimate_dataset_size(paths):
return sum(path.stat().st_size for path in paths)
def readline(f: io.RawIOBase):
# From: https://github.com/python/cpython/blob/d27e2f4d118e7a9909b6a3e5da06c5ff95806a85/Lib/_pyio.py#L525
res = bytearray()
while True:
b = f.read(1)
if not b:
break
res += b
if res.endswith(b"\n"):
break
return bytes(res)
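# Sketch of `readline` on an in-memory buffer:
# >>> buf = io.BytesIO(b"first line\nsecond line")
# >>> readline(buf)
# b'first line\n'
# >>> readline(buf)
# b'second line'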
import json
import os
import re
import sys
from pathlib import Path
import numpy as np
from datasets import Value
from .logging import get_logger
logger = get_logger(__name__)
REGEX_YAML_BLOCK = re.compile(r"---[\n\r]+([\S\s]*?)[\n\r]+---[\n\r]")
def infer_gradio_input_types(feature_types):
"""
Maps metric feature types to input types for gradio Dataframes:
        - float/int -> "number"
        - string -> "str"
        - any other -> "json"
    Note that "json" is not a native gradio type; such inputs are treated as strings
    that are then parsed as JSON.
"""
input_types = []
for feature_type in feature_types:
input_type = "json"
if isinstance(feature_type, Value):
if feature_type.dtype.startswith("int") or feature_type.dtype.startswith("float"):
input_type = "number"
elif feature_type.dtype == "string":
input_type = "str"
input_types.append(input_type)
return input_types
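# Illustrative mapping (a sketch using datasets.Value feature types):
# >>> infer_gradio_input_types([Value("int32"), Value("string"), Value("float32")])
# ['number', 'str', 'number']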
def json_to_string_type(input_types):
"""Maps json input type to str."""
return ["str" if i == "json" else i for i in input_types]
def parse_readme(filepath):
"""Parses a repositories README and removes"""
if not os.path.exists(filepath):
return "No README.md found."
with open(filepath, "r") as f:
text = f.read()
match = REGEX_YAML_BLOCK.search(text)
if match:
text = text[match.end() :]
return text
def parse_gradio_data(data, input_types):
"""Parses data from gradio Dataframe for use in metric."""
metric_inputs = {}
data.replace("", np.nan, inplace=True)
data.dropna(inplace=True)
for feature_name, input_type in zip(data, input_types):
if input_type == "json":
metric_inputs[feature_name] = [json.loads(d) for d in data[feature_name].to_list()]
elif input_type == "str":
metric_inputs[feature_name] = [d.strip('"') for d in data[feature_name].to_list()]
else:
metric_inputs[feature_name] = data[feature_name]
return metric_inputs
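# Sketch: gradio passes the Dataframe as a pandas DataFrame (the pandas import and the
# column names below are assumptions made for illustration):
# >>> import pandas as pd
# >>> df = pd.DataFrame({"predictions": ["1", "0"], "references": ["1", "1"]})
# >>> parse_gradio_data(df, ["json", "json"])
# {'predictions': [1, 0], 'references': [1, 1]}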
def parse_test_cases(test_cases, feature_names, input_types):
"""
    Parses test cases to be used in a gradio Dataframe. Note that double quotes are added
    around strings to follow the JSON format.
"""
if len(test_cases) == 0:
return None
examples = []
for test_case in test_cases:
parsed_cases = []
for feat, input_type in zip(feature_names, input_types):
if input_type == "json":
parsed_cases.append([str(element) for element in test_case[feat]])
elif input_type == "str":
parsed_cases.append(['"' + element + '"' for element in test_case[feat]])
else:
parsed_cases.append(test_case[feat])
examples.append([list(i) for i in zip(*parsed_cases)])
return examples
def launch_gradio_widget(metric):
"""Launches `metric` widget with Gradio."""
try:
import gradio as gr
except ImportError as error:
logger.error("To create a metric widget with Gradio make sure gradio is installed.")
raise error
local_path = Path(sys.path[0])
# if there are several input types, use first as default.
if isinstance(metric.features, list):
(feature_names, feature_types) = zip(*metric.features[0].items())
else:
(feature_names, feature_types) = zip(*metric.features.items())
gradio_input_types = infer_gradio_input_types(feature_types)
def compute(data):
return metric.compute(**parse_gradio_data(data, gradio_input_types))
iface = gr.Interface(
fn=compute,
inputs=gr.inputs.Dataframe(
headers=feature_names,
col_count=len(feature_names),
row_count=1,
datatype=json_to_string_type(gradio_input_types),
),
outputs=gr.outputs.Textbox(label=metric.name),
description=(
            metric.info.description + "\nIf this is a text-based metric, make sure to wrap your input in double quotes."
" Alternatively you can use a JSON-formatted list as input."
),
title=f"Metric: {metric.name}",
article=parse_readme(local_path / "README.md"),
# TODO: load test cases and use them to populate examples
# examples=[parse_test_cases(test_cases, feature_names, gradio_input_types)]
)
iface.launch()
# Copyright 2020 Optuna, Hugging Face
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Logging utilities. """
import logging
import os
from logging import CRITICAL # NOQA
from logging import DEBUG # NOQA
from logging import ERROR # NOQA
from logging import FATAL # NOQA
from logging import INFO # NOQA
from logging import NOTSET # NOQA
from logging import WARN # NOQA
from logging import WARNING # NOQA
from typing import Optional
from tqdm import auto as tqdm_lib
log_levels = {
"debug": logging.DEBUG,
"info": logging.INFO,
"warning": logging.WARNING,
"error": logging.ERROR,
"critical": logging.CRITICAL,
}
_default_log_level = logging.WARNING
def _get_default_logging_level():
"""
If EVALUATE_VERBOSITY env var is set to one of the valid choices return that as the new default level.
If it is not - fall back to ``_default_log_level``
"""
env_level_str = os.getenv("EVALUATE_VERBOSITY", None)
if env_level_str:
if env_level_str in log_levels:
return log_levels[env_level_str]
else:
logging.getLogger().warning(
f"Unknown option EVALUATE_VERBOSITY={env_level_str}, "
f"has to be one of: { ', '.join(log_levels.keys()) }"
)
return _default_log_level
def _get_library_name() -> str:
return __name__.split(".")[0]
def _get_library_root_logger() -> logging.Logger:
return logging.getLogger(_get_library_name())
def _configure_library_root_logger() -> None:
# Apply our default configuration to the library root logger.
library_root_logger = _get_library_root_logger()
library_root_logger.setLevel(_get_default_logging_level())
def _reset_library_root_logger() -> None:
library_root_logger = _get_library_root_logger()
library_root_logger.setLevel(logging.NOTSET)
def get_logger(name: Optional[str] = None) -> logging.Logger:
"""Return a logger with the specified name."""
if name is None:
name = _get_library_name()
return logging.getLogger(name)
def get_verbosity() -> int:
"""Return the current level for the Hugging Face Evaluate library's root logger.
Returns:
Logging level, e.g., `evaluate.logging.DEBUG` and `evaluate.logging.INFO`.
<Tip>
    The Hugging Face Evaluate library has the following logging levels:
- `evaluate.logging.CRITICAL`, `evaluate.logging.FATAL`
- `evaluate.logging.ERROR`
- `evaluate.logging.WARNING`, `evaluate.logging.WARN`
- `evaluate.logging.INFO`
- `evaluate.logging.DEBUG`
</Tip>
"""
return _get_library_root_logger().getEffectiveLevel()
def set_verbosity(verbosity: int) -> None:
"""Set the level for the Hugging Face Evaluate library's root logger.
Args:
verbosity:
Logging level, e.g., `evaluate.logging.DEBUG` and `evaluate.logging.INFO`.
"""
_get_library_root_logger().setLevel(verbosity)
def set_verbosity_info():
"""Set the level for the Hugging Face Evaluate library's root logger to `INFO`.
This will display most of the logging information and tqdm bars.
Shortcut to `evaluate.logging.set_verbosity(evaluate.logging.INFO)`.
"""
return set_verbosity(INFO)
def set_verbosity_warning():
"""Set the level for the Hugging Face Evaluate library's root logger to `WARNING`.
This will display only the warning and errors logging information and tqdm bars.
Shortcut to `evaluate.logging.set_verbosity(evaluate.logging.WARNING)`.
"""
return set_verbosity(WARNING)
def set_verbosity_debug():
"""Set the level for the Hugging Face Evaluate library's root logger to `DEBUG`.
This will display all the logging information and tqdm bars.
Shortcut to `evaluate.logging.set_verbosity(evaluate.logging.DEBUG)`.
"""
return set_verbosity(DEBUG)
def set_verbosity_error():
"""Set the level for the Hugging Face Evaluate library's root logger to `ERROR`.
This will display only the errors logging information and tqdm bars.
Shortcut to `evaluate.logging.set_verbosity(evaluate.logging.ERROR)`.
"""
return set_verbosity(ERROR)
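# Typical usage (a sketch following the shortcuts referenced in the docstrings above):
# >>> import evaluate
# >>> evaluate.logging.set_verbosity_info()
# >>> evaluate.logging.get_verbosity() == evaluate.logging.INFO
# True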
def disable_propagation() -> None:
"""Disable propagation of the library log outputs.
Note that log propagation is disabled by default.
"""
_get_library_root_logger().propagate = False
def enable_propagation() -> None:
"""Enable propagation of the library log outputs.
Please disable the Hugging Face Evaluate library's default handler to prevent double logging if the root logger has
been configured.
"""
_get_library_root_logger().propagate = True
# Configure the library root logger at the module level (singleton-like)
_configure_library_root_logger()
class EmptyTqdm:
"""Dummy tqdm which doesn't do anything."""
def __init__(self, *args, **kwargs): # pylint: disable=unused-argument
self._iterator = args[0] if args else None
def __iter__(self):
return iter(self._iterator)
def __getattr__(self, _):
"""Return empty function."""
def empty_fn(*args, **kwargs): # pylint: disable=unused-argument
return
return empty_fn
def __enter__(self):
return self
def __exit__(self, type_, value, traceback):
return
_tqdm_active = True
class _tqdm_cls:
def __call__(self, *args, **kwargs):
if _tqdm_active:
return tqdm_lib.tqdm(*args, **kwargs)
else:
return EmptyTqdm(*args, **kwargs)
def set_lock(self, *args, **kwargs):
self._lock = None
if _tqdm_active:
return tqdm_lib.tqdm.set_lock(*args, **kwargs)
def get_lock(self):
if _tqdm_active:
return tqdm_lib.tqdm.get_lock()
tqdm = _tqdm_cls()
def is_progress_bar_enabled() -> bool:
"""Return a boolean indicating whether tqdm progress bars are enabled."""
global _tqdm_active
return bool(_tqdm_active)
def enable_progress_bar():
"""Enable tqdm progress bar."""
global _tqdm_active
_tqdm_active = True
def disable_progress_bar():
"""Enable tqdm progress bar."""
global _tqdm_active
_tqdm_active = False
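# Sketch: toggling tqdm progress bars from user code via the exported helpers.
# >>> from evaluate.utils import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled
# >>> disable_progress_bar()
# >>> is_progress_bar_enabled()
# False
# >>> enable_progress_bar()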
import textwrap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
class ComplexRadar:
"""Create a complex radar chart with different scales for each variable
Args:
fig (`matplotlib.figure`) : A matplotlib figure object to add the axes on.
        variables (`list`): a list of variables to plot.
        ranges (`list` of `tuple`): a list of ranges (min, max), one for each variable.
        n_ring_levels (`int`): number of ordinate or ring levels to draw.
            Default: 5.
        show_scales (`bool`): indicates whether the ranges for each variable are plotted.
            Default: True.
format_cfg (`dict`): A dictionary with formatting configurations.
Default: None.
Returns:
`matplotlib.figure.Figure`: a radar plot.
"""
def __init__(self, fig, variables, ranges, n_ring_levels=5, show_scales=True, format_cfg=None):
self.format_cfg = format_cfg
        # Calculate angles and create one axes object per variable.
        # Note the trick of creating the first axes element twice (len + 1).
angles = np.arange(0, 360, 360.0 / len(variables))
axes = [
fig.add_axes([0.1, 0.1, 0.9, 0.9], polar=True, label="axes{}".format(i), **self.format_cfg["axes_args"])
for i in range(len(variables) + 1)
]
# Ensure clockwise rotation (first variable at the top N)
for ax in axes:
ax.set_theta_zero_location("N")
ax.set_theta_direction(-1)
ax.set_axisbelow(True)
# Writing the ranges on each axes
for i, ax in enumerate(axes):
# Here we do the trick by repeating the first iteration
j = 0 if (i == 0 or i == 1) else i - 1
ax.set_ylim(*ranges[j])
# Set endpoint to True if you like to have values right before the last circle
grid = np.linspace(*ranges[j], num=n_ring_levels, endpoint=self.format_cfg["incl_endpoint"])
gridlabel = ["{}".format(round(x, 2)) for x in grid]
gridlabel[0] = "" # remove values from the center
lines, labels = ax.set_rgrids(
grid, labels=gridlabel, angle=angles[j], **self.format_cfg["rgrid_tick_lbls_args"]
)
ax.set_ylim(*ranges[j])
ax.spines["polar"].set_visible(False)
ax.grid(visible=False)
if show_scales is False:
ax.set_yticklabels([])
        # Make all axes except the first one invisible
for ax in axes[1:]:
ax.patch.set_visible(False)
ax.xaxis.set_visible(False)
# Setting the attributes
self.angle = np.deg2rad(np.r_[angles, angles[0]])
self.ranges = ranges
self.ax = axes[0]
self.ax1 = axes[1]
self.plot_counter = 0
# Draw (inner) circles and lines
self.ax.yaxis.grid(**self.format_cfg["rad_ln_args"])
# Draw outer circle
self.ax.spines["polar"].set(**self.format_cfg["outer_ring"])
# Draw angle lines
self.ax.xaxis.grid(**self.format_cfg["angle_ln_args"])
# ax1 is the duplicate of axes[0] (self.ax)
# Remove everything from ax1 except the plot itself
self.ax1.axis("off")
self.ax1.set_zorder(9)
# Create the outer labels for each variable
l, text = self.ax.set_thetagrids(angles, labels=variables)
# Beautify them
labels = [t.get_text() for t in self.ax.get_xticklabels()]
labels = [
"\n".join(
textwrap.wrap(
label,
self.format_cfg["theta_tick_lbls_txt_wrap"],
break_long_words=self.format_cfg["theta_tick_lbls_brk_lng_wrds"],
)
)
for label in labels
]
self.ax.set_xticklabels(labels, **self.format_cfg["theta_tick_lbls"])
for t, a in zip(self.ax.get_xticklabels(), angles):
if a == 0:
t.set_ha("center")
elif a > 0 and a < 180:
t.set_ha("left")
elif a == 180:
t.set_ha("center")
else:
t.set_ha("right")
self.ax.tick_params(axis="both", pad=self.format_cfg["theta_tick_lbls_pad"])
def _scale_data(self, data, ranges):
"""Scales data[1:] to ranges[0]"""
for d, (y1, y2) in zip(data[1:], ranges[1:]):
assert (y1 <= d <= y2) or (y2 <= d <= y1)
x1, x2 = ranges[0]
d = data[0]
sdata = [d]
for d, (y1, y2) in zip(data[1:], ranges[1:]):
sdata.append((d - y1) / (y2 - y1) * (x2 - x1) + x1)
return sdata
def plot(self, data, *args, **kwargs):
"""Plots a line"""
sdata = self._scale_data(data, self.ranges)
self.ax1.plot(self.angle, np.r_[sdata, sdata[0]], *args, **kwargs)
self.plot_counter = self.plot_counter + 1
def use_legend(self, *args, **kwargs):
"""Shows a legend"""
self.ax1.legend(*args, **kwargs)
def radar_plot(data, model_names, invert_range=[], config=None, fig=None):
"""Create a complex radar chart with different scales for each variable
Source: https://towardsdatascience.com/how-to-create-and-visualize-complex-radar-charts-f7764d0f3652
Args:
data (`List[dict]`): the results (list of metric + value pairs).
E.g. data = [{"accuracy": 0.9, "precision":0.8},{"accuracy": 0.7, "precision":0.6}]
        model_names (`List[str]`): model names.
            E.g. model_names = ["model1", "model2", ...]
        invert_range (`List[str]`, optional): the metrics to invert (in cases when smaller is better, e.g. speed)
E.g. invert_range=["latency_in_seconds"]
config (`dict`, optional) : a specification of the formatting configurations, namely:
- rad_ln_args (`dict`, default `{"visible": True}`): The visibility of the radial (circle) lines.
- outer_ring (`dict`, default `{"visible": True}`): The visibility of the outer ring.
- angle_ln_args (`dict`, default `{"visible": True}`): The visibility of the angle lines.
- rgrid_tick_lbls_args (`dict`, default `{"fontsize": 12}`): The font size of the tick labels on the scales.
- theta_tick_lbls (`dict`, default `{"fontsize": 12}`): The font size of the variable labels on the plot.
- theta_tick_lbls_pad (`int`, default `3`): The padding of the variable labels on the plot.
- theta_tick_lbls_brk_lng_wrds (`bool`, default `True` ): Whether long words in the label are broken up or not.
- theta_tick_lbls_txt_wrap (`int`, default `15`): Text wrap for tick labels
            - incl_endpoint (`bool`, default `False`): Include value endpoints on the scales.
            - marker (`str`, default `"o"`): the shape of the marker used in the radar plot.
            - markersize (`int`, default `3`): the size of the marker used in the radar plot.
            - legend_loc (`str`, default `"upper right"`): the location of the legend in the radar plot. Must be one of: 'upper left', 'upper right', 'lower left', 'lower right'.
            - bbox_to_anchor (`tuple`, default `(2, 1)`): anchor for the legend.
fig (`matplotlib.figure.Figure`, optional): figure used to plot the radar plot.
Returns:
`matplotlib.figure.Figure`
"""
data = pd.DataFrame(data)
data.index = model_names
variables = data.keys()
if all(x in variables for x in invert_range) is False:
raise ValueError("All of the metrics in `invert_range` should be in the data provided.")
min_max_per_variable = data.describe().T[["min", "max"]]
min_max_per_variable["min"] = min_max_per_variable["min"] - 0.1 * (
min_max_per_variable["max"] - min_max_per_variable["min"]
)
min_max_per_variable["max"] = min_max_per_variable["max"] + 0.1 * (
min_max_per_variable["max"] - min_max_per_variable["min"]
)
ranges = list(min_max_per_variable.itertuples(index=False, name=None))
ranges = [
(max_value, min_value) if var in invert_range else (min_value, max_value)
for var, (min_value, max_value) in zip(variables, ranges)
]
format_cfg = {
"axes_args": {},
"rad_ln_args": {"visible": True},
"outer_ring": {"visible": True},
"angle_ln_args": {"visible": True},
"rgrid_tick_lbls_args": {"fontsize": 12},
"theta_tick_lbls": {"fontsize": 12},
"theta_tick_lbls_pad": 3,
"theta_tick_lbls_brk_lng_wrds": True,
"theta_tick_lbls_txt_wrap": 15,
"incl_endpoint": False,
"marker": "o",
"markersize": 3,
"legend_loc": "upper right",
"bbox_to_anchor": (2, 1),
}
if config is not None:
format_cfg.update(config)
if fig is None:
fig = plt.figure()
radar = ComplexRadar(
fig,
variables,
ranges,
n_ring_levels=3,
show_scales=True,
format_cfg=format_cfg,
)
    for model_name in data.index:
        radar.plot(data.loc[model_name].values, label=model_name, marker=format_cfg["marker"], markersize=format_cfg["markersize"])
radar.use_legend(**{"loc": format_cfg["legend_loc"], "bbox_to_anchor": format_cfg["bbox_to_anchor"]})
return fig
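# A minimal usage sketch for `radar_plot` (the numbers are made up for illustration):
# >>> results = [
# ...     {"accuracy": 0.9, "precision": 0.8, "latency_in_seconds": 48.1},
# ...     {"accuracy": 0.7, "precision": 0.6, "latency_in_seconds": 51.4},
# ... ]
# >>> fig = radar_plot(results, model_names=["model_1", "model_2"], invert_range=["latency_in_seconds"])
# >>> fig.show()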
{
"module_name": "Awesome Module",
"module_type": "module",
"module_description": "This new module is designed to solve this great ML task and is crafted with a lot of care and love.",
"module_slug": "{{ cookiecutter.module_name|lower|replace(' ', '_') }}",
"module_class_name": "{{ cookiecutter.module_name|replace(' ', '') }}",
"namespace": "",
"dataset_name": ""
}
---
title: {{ cookiecutter.module_name }}
datasets:
- {{ cookiecutter.dataset_name }}
tags:
- evaluate
- {{ cookiecutter.module_type }}
description: "TODO: add a description here"
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
---
# {{ cookiecutter.module_type|capitalize }} Card for {{ cookiecutter.module_name }}
***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing {{ cookiecutter.module_type }} cards if you'd like examples.*
## {{ cookiecutter.module_type|capitalize }} Description
*Give a brief overview of this {{ cookiecutter.module_type }}, including what task(s) it is usually used for, if any.*
## How to Use
*Give a general statement of how to use the {{ cookiecutter.module_type }}*
*Provide the simplest possible example of using the {{ cookiecutter.module_type }}*
### Inputs
*List all input arguments in the format below*
- **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
### Output Values
*Explain what this {{ cookiecutter.module_type }} outputs and provide an example of what the {{ cookiecutter.module_type }} output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}*
*State the range of possible values that the {{ cookiecutter.module_type }}'s output can take, as well as what in that range is considered good. For example: "This {{ cookiecutter.module_type }} can take on any value between 0 and 100, inclusive. Higher scores are better."*
#### Values from Popular Papers
*Give examples, preferably with links to leaderboards or publications, of papers that have reported this {{ cookiecutter.module_type }}, along with the values they have reported.*
### Examples
*Give code examples of the {{ cookiecutter.module_type }} being used. Try to include examples that clear up any potential ambiguity left from the {{ cookiecutter.module_type }} description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
## Limitations and Bias
*Note any known limitations or biases that the {{ cookiecutter.module_type }} has, with links and references if possible.*
## Citation
*Cite the source where this {{ cookiecutter.module_type }} was introduced.*
## Further References
*Add any useful further references.*
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("{{ cookiecutter.namespace }}/{{ cookiecutter.module_slug }}")
launch_gradio_widget(module)