Commit 25991f98 authored by hepj's avatar hepj
Browse files

修改readme

parent ac192496
Pipeline #1415 failed with stages
in 0 seconds
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("meteor")
launch_gradio_widget(module)
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" METEOR metric. """
import datasets
import numpy as np
from nltk.translate import meteor_score
from packaging import version
import evaluate
if evaluate.config.PY_VERSION < version.parse("3.8"):
import importlib_metadata
else:
import importlib.metadata as importlib_metadata
NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
if NLTK_VERSION >= version.Version("3.6.4"):
from nltk import word_tokenize
_CITATION = """\
@inproceedings{banarjee2005,
title = {{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments},
author = {Banerjee, Satanjeev and Lavie, Alon},
booktitle = {Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization},
month = jun,
year = {2005},
address = {Ann Arbor, Michigan},
publisher = {Association for Computational Linguistics},
url = {https://www.aclweb.org/anthology/W05-0909},
pages = {65--72},
}
"""
_DESCRIPTION = """\
METEOR, an automatic metric for machine translation evaluation
that is based on a generalized concept of unigram matching between the
machine-produced translation and human-produced reference translations.
Unigrams can be matched based on their surface forms, stemmed forms,
and meanings; furthermore, METEOR can be easily extended to include more
advanced matching strategies. Once all generalized unigram matches
between the two strings have been found, METEOR computes a score for
this matching using a combination of unigram-precision, unigram-recall, and
a measure of fragmentation that is designed to directly capture how
well-ordered the matched words in the machine translation are in relation
to the reference.
METEOR gets an R correlation value of 0.347 with human evaluation on the Arabic
data and 0.331 on the Chinese data. This is shown to be an improvement on
using simply unigram-precision, unigram-recall and their harmonic F1
combination.
"""
_KWARGS_DESCRIPTION = """
Computes METEOR score of translated segments against one or more references.
Args:
predictions: list of predictions to score. Each prediction
should be a string with tokens separated by spaces.
references: list of reference for each prediction. Each
reference should be a string with tokens separated by spaces.
alpha: Parameter for controlling relative weights of precision and recall. default: 0.9
beta: Parameter for controlling shape of penalty as a function of fragmentation. default: 3
gamma: Relative weight assigned to fragmentation penalty. default: 0.5
Returns:
'meteor': meteor score.
Examples:
>>> meteor = evaluate.load('meteor')
>>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
>>> references = ["It is a guide to action that ensures that the military will forever heed Party commands"]
>>> results = meteor.compute(predictions=predictions, references=references)
>>> print(round(results["meteor"], 4))
0.6944
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Meteor(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
}
),
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
],
codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/meteor_score.py"],
reference_urls=[
"https://www.nltk.org/api/nltk.translate.html#module-nltk.translate.meteor_score",
"https://en.wikipedia.org/wiki/METEOR",
],
)
def _download_and_prepare(self, dl_manager):
import nltk
nltk.download("wordnet")
if NLTK_VERSION >= version.Version("3.6.5"):
nltk.download("punkt")
if NLTK_VERSION >= version.Version("3.6.6"):
nltk.download("omw-1.4")
def _compute(self, predictions, references, alpha=0.9, beta=3, gamma=0.5):
multiple_refs = isinstance(references[0], list)
if NLTK_VERSION >= version.Version("3.6.5"):
# the version of METEOR in NLTK version 3.6.5 and earlier expect tokenized inputs
if multiple_refs:
scores = [
meteor_score.meteor_score(
[word_tokenize(ref) for ref in refs],
word_tokenize(pred),
alpha=alpha,
beta=beta,
gamma=gamma,
)
for refs, pred in zip(references, predictions)
]
else:
scores = [
meteor_score.single_meteor_score(
word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
)
for ref, pred in zip(references, predictions)
]
else:
if multiple_refs:
scores = [
meteor_score.meteor_score(
[[word_tokenize(ref) for ref in group] for group in references][0],
word_tokenize(pred),
alpha=alpha,
beta=beta,
gamma=gamma,
)
for ref, pred in zip(references, predictions)
]
else:
scores = [
meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
for ref, pred in zip(references, predictions)
]
return {"meteor": np.mean(scores)}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
nltk
\ No newline at end of file
---
title: MSE
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
Mean Squared Error(MSE) is the average of the square of difference between the predicted
and actual values.
---
# Metric Card for MSE
## Metric Description
Mean Squared Error(MSE) represents the average of the squares of errors -- i.e. the average squared difference between the estimated values and the actual values.
![image](https://user-images.githubusercontent.com/14205986/165999302-eba3702d-81e3-4363-9c0e-d3bfceb7ec5a.png)
## How to Use
At minimum, this metric requires predictions and references as inputs.
```python
>>> mse_metric = evaluate.load("mse")
>>> predictions = [2.5, 0.0, 2, 8]
>>> references = [3, -0.5, 2, 7]
>>> results = mse_metric.compute(predictions=predictions, references=references)
```
### Inputs
Mandatory inputs:
- `predictions`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the estimated target values.
- `references`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the ground truth (correct) target values.
Optional arguments:
- `sample_weight`: numeric array-like of shape (`n_samples,`) representing sample weights. The default is `None`.
- `multioutput`: `raw_values`, `uniform_average` or numeric array-like of shape (`n_outputs,`), which defines the aggregation of multiple output values. The default value is `uniform_average`.
- `raw_values` returns a full set of errors in case of multioutput input.
- `uniform_average` means that the errors of all outputs are averaged with uniform weight.
- the array-like value defines weights used to average errors.
- `squared` (`bool`): If `True` returns MSE value, if `False` returns RMSE (Root Mean Squared Error). The default value is `True`.
### Output Values
This metric outputs a dictionary, containing the mean squared error score, which is of type:
- `float`: if multioutput is `uniform_average` or an ndarray of weights, then the weighted average of all output errors is returned.
- numeric array-like of shape (`n_outputs,`): if multioutput is `raw_values`, then the score is returned for each output separately.
Each MSE `float` value ranges from `0.0` to `1.0`, with the best value being `0.0`.
Output Example(s):
```python
{'mse': 0.5}
```
If `multioutput="raw_values"`:
```python
{'mse': array([0.41666667, 1. ])}
```
#### Values from Popular Papers
### Examples
Example with the `uniform_average` config:
```python
>>> mse_metric = evaluate.load("mse")
>>> predictions = [2.5, 0.0, 2, 8]
>>> references = [3, -0.5, 2, 7]
>>> results = mse_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'mse': 0.375}
```
Example with `squared = True`, which returns the RMSE:
```python
>>> mse_metric = evaluate.load("mse")
>>> predictions = [2.5, 0.0, 2, 8]
>>> references = [3, -0.5, 2, 7]
>>> rmse_result = mse_metric.compute(predictions=predictions, references=references, squared=False)
>>> print(rmse_result)
{'mse': 0.6123724356957945}
```
Example with multi-dimensional lists, and the `raw_values` config:
```python
>>> mse_metric = evaluate.load("mse", "multilist")
>>> predictions = [[0.5, 1], [-1, 1], [7, -6]]
>>> references = [[0, 2], [-1, 2], [8, -5]]
>>> results = mse_metric.compute(predictions=predictions, references=references, multioutput='raw_values')
>>> print(results)
{'mse': array([0.41666667, 1. ])}
"""
```
## Limitations and Bias
MSE has the disadvantage of heavily weighting outliers -- given that it squares them, this results in large errors weighing more heavily than small ones. It can be used alongside [MAE](https://huggingface.co/metrics/mae), which is complementary given that it does not square the errors.
## Citation(s)
```bibtex
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
```
```bibtex
@article{willmott2005advantages,
title={Advantages of the mean absolute error (MAE) over the root mean square error (RMSE) in assessing average model performance},
author={Willmott, Cort J and Matsuura, Kenji},
journal={Climate research},
volume={30},
number={1},
pages={79--82},
year={2005}
}
```
## Further References
- [Mean Squared Error - Wikipedia](https://en.wikipedia.org/wiki/Mean_squared_error)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("mse")
launch_gradio_widget(module)
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MSE - Mean Squared Error Metric"""
import datasets
from sklearn.metrics import mean_squared_error
import evaluate
_CITATION = """\
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
"""
_DESCRIPTION = """\
Mean Squared Error(MSE) is the average of the square of difference between the predicted
and actual values.
"""
_KWARGS_DESCRIPTION = """
Args:
predictions: array-like of shape (n_samples,) or (n_samples, n_outputs)
Estimated target values.
references: array-like of shape (n_samples,) or (n_samples, n_outputs)
Ground truth (correct) target values.
sample_weight: array-like of shape (n_samples,), default=None
Sample weights.
multioutput: {"raw_values", "uniform_average"} or array-like of shape (n_outputs,), default="uniform_average"
Defines aggregating of multiple output values. Array-like value defines weights used to average errors.
"raw_values" : Returns a full set of errors in case of multioutput input.
"uniform_average" : Errors of all outputs are averaged with uniform weight.
squared : bool, default=True
If True returns MSE value, if False returns RMSE (Root Mean Squared Error) value.
Returns:
mse : mean squared error.
Examples:
>>> mse_metric = evaluate.load("mse")
>>> predictions = [2.5, 0.0, 2, 8]
>>> references = [3, -0.5, 2, 7]
>>> results = mse_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'mse': 0.375}
>>> rmse_result = mse_metric.compute(predictions=predictions, references=references, squared=False)
>>> print(rmse_result)
{'mse': 0.6123724356957945}
If you're using multi-dimensional lists, then set the config as follows :
>>> mse_metric = evaluate.load("mse", "multilist")
>>> predictions = [[0.5, 1], [-1, 1], [7, -6]]
>>> references = [[0, 2], [-1, 2], [8, -5]]
>>> results = mse_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'mse': 0.7083333333333334}
>>> results = mse_metric.compute(predictions=predictions, references=references, multioutput='raw_values')
>>> print(results) # doctest: +NORMALIZE_WHITESPACE
{'mse': array([0.41666667, 1. ])}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Mse(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(self._get_feature_types()),
reference_urls=[
"https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html"
],
)
def _get_feature_types(self):
if self.config_name == "multilist":
return {
"predictions": datasets.Sequence(datasets.Value("float")),
"references": datasets.Sequence(datasets.Value("float")),
}
else:
return {
"predictions": datasets.Value("float"),
"references": datasets.Value("float"),
}
def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average", squared=True):
mse = mean_squared_error(
references, predictions, sample_weight=sample_weight, multioutput=multioutput, squared=squared
)
return {"mse": mse}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
scikit-learn
\ No newline at end of file
---
title: NIST_MT
emoji: 🤗
colorFrom: purple
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
- machine-translation
description:
DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU score.
---
# Metric Card for NIST's MT metric
## Metric Description
DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
score. The official script used by NIST to compute BLEU and NIST score is
mteval-14.pl. The main differences are:
- BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean.
- NIST has a different brevity penalty
- NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's
implementation of the NIST-specific tokenizer)
## Intended Uses
NIST was developed for machine translation evaluation.
## How to Use
```python
import evaluate
nist_mt = evaluate.load("nist_mt")
hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party"
reference1 = "It is a guide to action that ensures that the military will forever heed Party commands"
reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party"
nist_mt.compute(hypothesis1, [reference1, reference2])
# {'nist_mt': 3.3709935957649324}
```
### Inputs
- **predictions**: tokenized predictions to score. For sentence-level NIST, a list of tokens (str);
for corpus-level NIST, a list (sentences) of lists of tokens (str)
- **references**: potentially multiple tokenized references for each prediction. For sentence-level NIST, a
list (multiple potential references) of list of tokens (str); for corpus-level NIST, a list (corpus) of lists
(multiple potential references) of lists of tokens (str)
- **n**: highest n-gram order
- **tokenize_kwargs**: arguments passed to the tokenizer (see: https://github.com/nltk/nltk/blob/90fa546ea600194f2799ee51eaf1b729c128711e/nltk/tokenize/nist.py#L139)
### Output Values
- **nist_mt** (`float`): NIST score
Output Example:
```python
{'nist_mt': 3.3709935957649324}
```
## Citation
```bibtex
@inproceedings{10.5555/1289189.1289273,
author = {Doddington, George},
title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics},
year = {2002},
publisher = {Morgan Kaufmann Publishers Inc.},
address = {San Francisco, CA, USA},
booktitle = {Proceedings of the Second International Conference on Human Language Technology Research},
pages = {138–145},
numpages = {8},
location = {San Diego, California},
series = {HLT '02}
}
```
## Further References
This Hugging Face implementation uses [the NLTK implementation](https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("nist_mt")
launch_gradio_widget(module)
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""NLTK's NIST implementation on both the sentence and corpus level"""
from typing import Dict, Optional
import datasets
import nltk
from datasets import Sequence, Value
try:
nltk.data.find("perluniprops")
except LookupError:
nltk.download("perluniprops", quiet=True) # NISTTokenizer requirement
from nltk.tokenize.nist import NISTTokenizer
from nltk.translate.nist_score import corpus_nist, sentence_nist
import evaluate
_CITATION = """\
@inproceedings{10.5555/1289189.1289273,
author = {Doddington, George},
title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics},
year = {2002},
publisher = {Morgan Kaufmann Publishers Inc.},
address = {San Francisco, CA, USA},
booktitle = {Proceedings of the Second International Conference on Human Language Technology Research},
pages = {138–145},
numpages = {8},
location = {San Diego, California},
series = {HLT '02}
}
"""
_DESCRIPTION = """\
DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
score. The official script used by NIST to compute BLEU and NIST score is
mteval-14.pl. The main differences are:
- BLEU uses geometric mean of the ngram precisions, NIST uses arithmetic mean.
- NIST has a different brevity penalty
- NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's
implementation of the NIST-specific tokenizer)
"""
_KWARGS_DESCRIPTION = """
Computes NIST score of translated segments against one or more references.
Args:
predictions: predictions to score (list of str)
references: potentially multiple references for each prediction (list of str or list of list of str)
n: highest n-gram order
lowercase: whether to lowercase the data (only applicable if 'western_lang' is True)
western_lang: whether the current language is a Western language, which will enable some specific tokenization
rules with respect to, e.g., punctuation
Returns:
'nist_mt': nist_mt score
Examples:
>>> nist_mt = evaluate.load("nist_mt")
>>> hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party"
>>> reference1 = "It is a guide to action that ensures that the military will forever heed Party commands"
>>> reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party"
>>> reference3 = "It is the practical guide for the army always to heed the directions of the party"
>>> nist_mt.compute(predictions=[hypothesis], references=[[reference1, reference2, reference3]])
{'nist_mt': 3.3709935957649324}
>>> nist_mt.compute(predictions=[hypothesis], references=[reference1])
{'nist_mt': 2.4477124183006533}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class NistMt(evaluate.Metric):
"""A wrapper around NLTK's NIST implementation."""
def _info(self):
return evaluate.MetricInfo(
module_type="metric",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{
"predictions": Value("string", id="prediction"),
"references": Sequence(Value("string", id="reference"), id="references"),
}
),
datasets.Features(
{
"predictions": Value("string", id="prediction"),
"references": Value("string", id="reference"),
}
),
],
homepage="https://www.nltk.org/api/nltk.translate.nist_score.html",
codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py"],
reference_urls=["https://en.wikipedia.org/wiki/NIST_(metric)"],
)
def _compute(self, predictions, references, n: int = 5, lowercase=False, western_lang=True):
tokenizer = NISTTokenizer()
# Account for single reference cases: references always need to have one more dimension than predictions
if isinstance(references[0], str):
references = [[ref] for ref in references]
predictions = [
tokenizer.tokenize(pred, return_str=False, lowercase=lowercase, western_lang=western_lang)
for pred in predictions
]
references = [
[
tokenizer.tokenize(ref, return_str=False, lowercase=lowercase, western_lang=western_lang)
for ref in ref_sentences
]
for ref_sentences in references
]
return {"nist_mt": corpus_nist(list_of_references=references, hypotheses=predictions, n=n)}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
nltk
from _pytest.fixtures import fixture
from nist_mt import Nist_mt
nist = Nist_mt()
@fixture
def hypothesis_sent():
return "It is a guide to action which ensures that the military always obeys the commands of the party"
@fixture
def reference_sent1():
return "It is a guide to action that ensures that the military will forever heed Party commands"
@fixture
def reference_sent2():
return (
"It is the guiding principle which guarantees the military forces always being under the command of the Party"
)
@fixture
def reference_sent3():
return "It is the practical guide for the army always to heed the directions of the party"
def test_nist_sentence(hypothesis_sent, reference_sent1, reference_sent2, reference_sent3):
nist_score = nist.compute(
predictions=[hypothesis_sent], references=[[reference_sent1, reference_sent2, reference_sent3]]
)
assert abs(nist_score["nist_mt"] - 3.3709935957649324) < 1e-6
---
title: Pearson Correlation Coefficient
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
Pearson correlation coefficient and p-value for testing non-correlation.
The Pearson correlation coefficient measures the linear relationship between two datasets. The calculation of the p-value relies on the assumption that each dataset is normally distributed. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets.
---
# Metric Card for Pearson Correlation Coefficient (pearsonr)
## Metric Description
Pearson correlation coefficient and p-value for testing non-correlation.
The Pearson correlation coefficient measures the linear relationship between two datasets. The calculation of the p-value relies on the assumption that each dataset is normally distributed. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets.
## How to Use
This metric takes a list of predictions and a list of references as input
```python
>>> pearsonr_metric = evaluate.load("pearsonr")
>>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5])
>>> print(round(results['pearsonr']), 2)
['-0.74']
```
### Inputs
- **predictions** (`list` of `int`): Predicted class labels, as returned by a model.
- **references** (`list` of `int`): Ground truth labels.
- **return_pvalue** (`boolean`): If `True`, returns the p-value, along with the correlation coefficient. If `False`, returns only the correlation coefficient. Defaults to `False`.
### Output Values
- **pearsonr**(`float`): Pearson correlation coefficient. Minimum possible value is -1. Maximum possible value is 1. Values of 1 and -1 indicate exact linear positive and negative relationships, respectively. A value of 0 implies no correlation.
- **p-value**(`float`): P-value, which roughly indicates the probability of an The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets. Minimum possible value is 0. Maximum possible value is 1. Higher values indicate higher probabilities.
Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
Output Example(s):
```python
{'pearsonr': -0.7}
```
```python
{'p-value': 0.15}
```
#### Values from Popular Papers
### Examples
Example 1-A simple example using only predictions and references.
```python
>>> pearsonr_metric = evaluate.load("pearsonr")
>>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5])
>>> print(round(results['pearsonr'], 2))
-0.74
```
Example 2-The same as Example 1, but that also returns the `p-value`.
```python
>>> pearsonr_metric = evaluate.load("pearsonr")
>>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5], return_pvalue=True)
>>> print(sorted(list(results.keys())))
['p-value', 'pearsonr']
>>> print(round(results['pearsonr'], 2))
-0.74
>>> print(round(results['p-value'], 2))
0.15
```
## Limitations and Bias
As stated above, the calculation of the p-value relies on the assumption that each data set is normally distributed. This is not always the case, so verifying the true distribution of datasets is recommended.
## Citation(s)
```bibtex
@article{2020SciPy-NMeth,
author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and
Haberland, Matt and Reddy, Tyler and Cournapeau, David and
Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and
Bright, Jonathan and {van der Walt}, St{\'e}fan J. and
Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and
Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and
Kern, Robert and Larson, Eric and Carey, C J and
Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and
{VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and
Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and
Harris, Charles R. and Archibald, Anne M. and
Ribeiro, Ant{\^o}nio H. and Pedregosa, Fabian and
{van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific
Computing in Python}},
journal = {Nature Methods},
year = {2020},
volume = {17},
pages = {261--272},
adsurl = {https://rdcu.be/b08Wh},
doi = {10.1038/s41592-019-0686-2},
}
```
## Further References
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("pearsonr")
launch_gradio_widget(module)
# Copyright 2021 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pearson correlation coefficient metric."""
import datasets
from scipy.stats import pearsonr
import evaluate
_DESCRIPTION = """
Pearson correlation coefficient and p-value for testing non-correlation.
The Pearson correlation coefficient measures the linear relationship between two datasets. The calculation of the p-value relies on the assumption that each dataset is normally distributed. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets.
"""
_KWARGS_DESCRIPTION = """
Args:
predictions (`list` of `int`): Predicted class labels, as returned by a model.
references (`list` of `int`): Ground truth labels.
return_pvalue (`boolean`): If `True`, returns the p-value, along with the correlation coefficient. If `False`, returns only the correlation coefficient. Defaults to `False`.
Returns:
pearsonr (`float`): Pearson correlation coefficient. Minimum possible value is -1. Maximum possible value is 1. Values of 1 and -1 indicate exact linear positive and negative relationships, respectively. A value of 0 implies no correlation.
p-value (`float`): P-value, which roughly indicates the probability of an The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets. Minimum possible value is 0. Maximum possible value is 1. Higher values indicate higher probabilities.
Examples:
Example 1-A simple example using only predictions and references.
>>> pearsonr_metric = evaluate.load("pearsonr")
>>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5])
>>> print(round(results['pearsonr'], 2))
-0.74
Example 2-The same as Example 1, but that also returns the `p-value`.
>>> pearsonr_metric = evaluate.load("pearsonr")
>>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5], return_pvalue=True)
>>> print(sorted(list(results.keys())))
['p-value', 'pearsonr']
>>> print(round(results['pearsonr'], 2))
-0.74
>>> print(round(results['p-value'], 2))
0.15
"""
_CITATION = """
@article{2020SciPy-NMeth,
author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and
Haberland, Matt and Reddy, Tyler and Cournapeau, David and
Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and
Bright, Jonathan and {van der Walt}, St{\'e}fan J. and
Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and
Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and
Kern, Robert and Larson, Eric and Carey, C J and
Polat, Ilhan and Feng, Yu and Moore, Eric W. and
{VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and
Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and
Harris, Charles R. and Archibald, Anne M. and
Ribeiro, Antonio H. and Pedregosa, Fabian and
{van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific
Computing in Python}},
journal = {Nature Methods},
year = {2020},
volume = {17},
pages = {261--272},
adsurl = {https://rdcu.be/b08Wh},
doi = {10.1038/s41592-019-0686-2},
}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Pearsonr(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("float"),
"references": datasets.Value("float"),
}
),
reference_urls=["https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html"],
)
def _compute(self, predictions, references, return_pvalue=False):
if return_pvalue:
results = pearsonr(references, predictions)
return {"pearsonr": results[0], "p-value": results[1]}
else:
return {"pearsonr": float(pearsonr(references, predictions)[0])}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
scipy
\ No newline at end of file
---
title: Perplexity
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
Perplexity (PPL) is one of the most common metrics for evaluating language models.
It is defined as the exponentiated average negative log-likelihood of a sequence, calculated with exponent base `e`.
For more information on perplexity, see [this tutorial](https://huggingface.co/docs/transformers/perplexity).
---
# Metric Card for Perplexity
## Metric Description
Given a model and an input text sequence, perplexity measures how likely the model is to generate the input text sequence.
As a metric, it can be used to evaluate how well the model has learned the distribution of the text it was trained on.
In this case, `model_id` should be the trained model to be evaluated, and the input texts should be the text that the model was trained on.
This implementation of perplexity is calculated with log base `e`, as in `perplexity = e**(sum(losses) / num_tokenized_tokens)`, following recent convention in deep learning frameworks.
## Intended Uses
Any language generation task.
## How to Use
The metric takes a list of text as input, as well as the name of the model used to compute the metric:
```python
from evaluate import load
perplexity = load("perplexity", module_type="metric")
results = perplexity.compute(predictions=predictions, model_id='gpt2')
```
### Inputs
- **model_id** (str): model used for calculating Perplexity. NOTE: Perplexity can only be calculated for causal language models.
- This includes models such as gpt2, causal variations of bert, causal versions of t5, and more (the full list can be found in the AutoModelForCausalLM documentation here: https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )
- **predictions** (list of str): input text, where each separate text snippet is one list entry.
- **batch_size** (int): the batch size to run texts through the model. Defaults to 16.
- **add_start_token** (bool): whether to add the start token to the texts, so the perplexity can include the probability of the first word. Defaults to True.
- **device** (str): device to run on, defaults to `cuda` when available
### Output Values
This metric outputs a dictionary with the perplexity scores for the text input in the list, and the average perplexity.
If one of the input texts is longer than the max input length of the model, then it is truncated to the max length for the perplexity computation.
```
{'perplexities': [8.182524681091309, 33.42122268676758, 27.012239456176758], 'mean_perplexity': 22.871995608011883}
```
The range of this metric is [0, inf). A lower score is better.
#### Values from Popular Papers
### Examples
Calculating perplexity on predictions defined here:
```python
perplexity = evaluate.load("perplexity", module_type="metric")
input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]
results = perplexity.compute(model_id='gpt2',
add_start_token=False,
predictions=input_texts)
print(list(results.keys()))
>>>['perplexities', 'mean_perplexity']
print(round(results["mean_perplexity"], 2))
>>>646.75
print(round(results["perplexities"][0], 2))
>>>32.25
```
Calculating perplexity on predictions loaded in from a dataset:
```python
perplexity = evaluate.load("perplexity", module_type="metric")
input_texts = datasets.load_dataset("wikitext",
"wikitext-2-raw-v1",
split="test")["text"][:50]
input_texts = [s for s in input_texts if s!='']
results = perplexity.compute(model_id='gpt2',
predictions=input_texts)
print(list(results.keys()))
>>>['perplexities', 'mean_perplexity']
print(round(results["mean_perplexity"], 2))
>>>576.76
print(round(results["perplexities"][0], 2))
>>>889.28
```
## Limitations and Bias
Note that the output value is based heavily on what text the model was trained on. This means that perplexity scores are not comparable between models or datasets.
See Meister and Cotterell, ["Language Model Evaluation Beyond Perplexity"]( https://arxiv.org/abs/2106.00085) (2021) for more information about alternative model evaluation strategies.
## Citation
```bibtex
@article{jelinek1977perplexity,
title={Perplexity—a measure of the difficulty of speech recognition tasks},
author={Jelinek, Fred and Mercer, Robert L and Bahl, Lalit R and Baker, James K},
journal={The Journal of the Acoustical Society of America},
volume={62},
number={S1},
pages={S63--S63},
year={1977},
publisher={Acoustical Society of America}
}
```
## Further References
- [Hugging Face Perplexity Blog Post](https://huggingface.co/docs/transformers/perplexity)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("perplexity", module_type="metric")
launch_gradio_widget(module)
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perplexity Metric."""
import datasets
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer
import evaluate
from evaluate import logging
_CITATION = """\
"""
_DESCRIPTION = """
Perplexity (PPL) is one of the most common metrics for evaluating language models.
It is defined as the exponentiated average negative log-likelihood of a sequence, calculated with exponent base `e`.
For more information, see https://huggingface.co/docs/transformers/perplexity
"""
_KWARGS_DESCRIPTION = """
Args:
model_id (str): model used for calculating Perplexity
NOTE: Perplexity can only be calculated for causal language models.
This includes models such as gpt2, causal variations of bert,
causal versions of t5, and more (the full list can be found
in the AutoModelForCausalLM documentation here:
https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )
predictions (list of str): input text, each separate text snippet
is one list entry.
batch_size (int): the batch size to run texts through the model. Defaults to 16.
add_start_token (bool): whether to add the start token to the texts,
so the perplexity can include the probability of the first word. Defaults to True.
device (str): device to run on, defaults to 'cuda' when available
Returns:
perplexity: dictionary containing the perplexity scores for the texts
in the input list, as well as the mean perplexity. If one of the input texts is
longer than the max input length of the model, then it is truncated to the
max length for the perplexity computation.
Examples:
Example 1:
>>> perplexity = evaluate.load("perplexity", module_type="metric")
>>> input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]
>>> results = perplexity.compute(model_id='gpt2',
... add_start_token=False,
... predictions=input_texts) # doctest:+ELLIPSIS
>>> print(list(results.keys()))
['perplexities', 'mean_perplexity']
>>> print(round(results["mean_perplexity"], 0))
647.0
>>> print(round(results["perplexities"][0], 0))
32.0
Example 2:
>>> from datasets import load_dataset
>>> perplexity = evaluate.load("perplexity", module_type="metric")
>>> input_texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:10] # doctest: +SKIP
>>> input_texts = [s for s in input_texts if s!='']
>>> results = perplexity.compute(model_id='gpt2',
... predictions=input_texts)
>>> print(list(results.keys()))
['perplexities', 'mean_perplexity']
>>> print(round(results["mean_perplexity"], 2)) # doctest: +SKIP
576.76
>>> print(round(results["perplexities"][0], 2)) # doctest: +SKIP
889.28
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Perplexity(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
module_type="metric",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string"),
}
),
reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
)
def _compute(
self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
):
if device is not None:
assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
if device == "gpu":
device = "cuda"
else:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# if batch_size > 1 (which generally leads to padding being required), and
# if there is not an already assigned pad_token, assign an existing
# special token to also be the padding token
if tokenizer.pad_token is None and batch_size > 1:
existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
# check that the model already has at least one special token defined
assert (
len(existing_special_tokens) > 0
), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
# assign one of the special tokens to also be the pad token
tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
if add_start_token and max_length:
# leave room for <BOS> token to be added:
assert (
tokenizer.bos_token is not None
), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
max_tokenized_len = max_length - 1
else:
max_tokenized_len = max_length
encodings = tokenizer(
predictions,
add_special_tokens=False,
padding=True,
truncation=True if max_tokenized_len else False,
max_length=max_tokenized_len,
return_tensors="pt",
return_attention_mask=True,
).to(device)
encoded_texts = encodings["input_ids"]
attn_masks = encodings["attention_mask"]
# check that each input is long enough:
if add_start_token:
assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
else:
assert torch.all(
torch.ge(attn_masks.sum(1), 2)
), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."
ppls = []
loss_fct = CrossEntropyLoss(reduction="none")
for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
end_index = min(start_index + batch_size, len(encoded_texts))
encoded_batch = encoded_texts[start_index:end_index]
attn_mask = attn_masks[start_index:end_index]
if add_start_token:
bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
attn_mask = torch.cat(
[torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1
)
labels = encoded_batch
with torch.no_grad():
out_logits = model(encoded_batch, attention_mask=attn_mask).logits
shift_logits = out_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
shift_attention_mask_batch = attn_mask[..., 1:].contiguous()
perplexity_batch = torch.exp(
(loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
/ shift_attention_mask_batch.sum(1)
)
ppls += perplexity_batch.tolist()
return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
torch
torch
transformers
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment