修改readme

25991f98 · hepj · ac192496 · 25991f98 · 25991f98 · 25991f98
Commit 25991f98 authored Jul 25, 2024 by hepj
20 changed files
--- a/evaluate-0.4.2/metrics/meteor/app.py
+++ b/evaluate-0.4.2/metrics/meteor/app.py
+# Demo entry point: load the "meteor" metric and hand it to the Gradio widget launcher.
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# The metric is resolved by name through evaluate.load.
+module = evaluate.load("meteor")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/meteor/meteor.py
+++ b/evaluate-0.4.2/metrics/meteor/meteor.py
+# Copyright 2020 The HuggingFace Evaluate Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" METEOR metric. """
+import datasets
+import numpy as np
+from nltk.translate import meteor_score
+from packaging import version
+import evaluate
+# importlib.metadata is only available in the standard library from Python 3.8 on;
+# older interpreters import the separately installed importlib_metadata package instead.
+if evaluate.config.PY_VERSION < version.parse("3.8"):
+    import importlib_metadata
+else:
+    import importlib.metadata as importlib_metadata
+# Parsed version of the installed NLTK distribution, used to gate version-dependent behaviour.
+NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
+# word_tokenize is only imported for NLTK >= 3.6.4, so the name is undefined on older releases.
+if NLTK_VERSION >= version.Version("3.6.4"):
+    from nltk import word_tokenize
+_CITATION = """\
+@inproceedings{banarjee2005,
+  title     = {{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments},
+  author    = {Banerjee, Satanjeev  and Lavie, Alon},
+  booktitle = {Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization},
+  month     = jun,
+  year      = {2005},
+  address   = {Ann Arbor, Michigan},
+  publisher = {Association for Computational Linguistics},
+  url       = {https://www.aclweb.org/anthology/W05-0909},
+  pages     = {65--72},
+}
+"""
+_DESCRIPTION = """\
+METEOR, an automatic metric for machine translation evaluation
+that is based on a generalized concept of unigram matching between the
+machine-produced translation and human-produced reference translations.
+Unigrams can be matched based on their surface forms, stemmed forms,
+and meanings; furthermore, METEOR can be easily extended to include more
+advanced matching strategies. Once all generalized unigram matches
+between the two strings have been found, METEOR computes a score for
+this matching using a combination of unigram-precision, unigram-recall, and
+a measure of fragmentation that is designed to directly capture how
+well-ordered the matched words in the machine translation are in relation
+to the reference.
+METEOR gets an R correlation value of 0.347 with human evaluation on the Arabic
+data and 0.331 on the Chinese data. This is shown to be an improvement on
+using simply unigram-precision, unigram-recall and their harmonic F1
+combination.
+"""
+_KWARGS_DESCRIPTION = """
+Computes METEOR score of translated segments against one or more references.
+Args:
+    predictions: list of predictions to score. Each prediction
+        should be a string with tokens separated by spaces.
+    references: list of reference for each prediction. Each
+        reference should be a string with tokens separated by spaces.
+    alpha: Parameter for controlling relative weights of precision and recall. default: 0.9
+    beta: Parameter for controlling shape of penalty as a function of fragmentation. default: 3
+    gamma: Relative weight assigned to fragmentation penalty. default: 0.5
+Returns:
+    'meteor': meteor score.
+Examples:
+    >>> meteor = evaluate.load('meteor')
+    >>> predictions = ["It is a guide to action which ensures that the military always obeys the commands of the party"]
+    >>> references = ["It is a guide to action that ensures that the military will forever heed Party commands"]
+    >>> results = meteor.compute(predictions=predictions, references=references)
+    >>> print(round(results["meteor"], 4))
+    0.6944
+"""
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Meteor(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": datasets.Value("string", id="sequence"),
+                        "references": datasets.Value("string", id="sequence"),
+                    }
+                ),
+            ],
+            codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/meteor_score.py"],
+            reference_urls=[
+                "https://www.nltk.org/api/nltk.translate.html#module-nltk.translate.meteor_score",
+                "https://en.wikipedia.org/wiki/METEOR",
+            ],
+        )
+    def _download_and_prepare(self, dl_manager):
+        import nltk
+        nltk.download("wordnet")
+        if NLTK_VERSION >= version.Version("3.6.5"):
+            nltk.download("punkt")
+        if NLTK_VERSION >= version.Version("3.6.6"):
+            nltk.download("omw-1.4")
+    def _compute(self, predictions, references, alpha=0.9, beta=3, gamma=0.5):
+        multiple_refs = isinstance(references[0], list)
+        if NLTK_VERSION >= version.Version("3.6.5"):
+            # the version of METEOR in NLTK version 3.6.5 and earlier expect tokenized inputs
+            if multiple_refs:
+                scores = [
+                    meteor_score.meteor_score(
+                        [word_tokenize(ref) for ref in refs],
+                        word_tokenize(pred),
+                        alpha=alpha,
+                        beta=beta,
+                        gamma=gamma,
+                    )
+                    for refs, pred in zip(references, predictions)
+                ]
+            else:
+                scores = [
+                    meteor_score.single_meteor_score(
+                        word_tokenize(ref), word_tokenize(pred), alpha=alpha, beta=beta, gamma=gamma
+                    )
+                    for ref, pred in zip(references, predictions)
+                ]
+        else:
+            if multiple_refs:
+                scores = [
+                    meteor_score.meteor_score(
+                        [[word_tokenize(ref) for ref in group] for group in references][0],
+                        word_tokenize(pred),
+                        alpha=alpha,
+                        beta=beta,
+                        gamma=gamma,
+                    )
+                    for ref, pred in zip(references, predictions)
+                ]
+            else:
+                scores = [
+                    meteor_score.single_meteor_score(ref, pred, alpha=alpha, beta=beta, gamma=gamma)
+                    for ref, pred in zip(references, predictions)
+                ]
+        return {"meteor": np.mean(scores)}
--- a/evaluate-0.4.2/metrics/meteor/requirements.txt
+++ b/evaluate-0.4.2/metrics/meteor/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+nltk
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/mse/README.md
+++ b/evaluate-0.4.2/metrics/mse/README.md
+---
+title: MSE
+emoji: 🤗 
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  Mean Squared Error(MSE) is the average of the square of difference between the predicted
+  and actual values.
+---
+# Metric Card for MSE
+## Metric Description
+Mean Squared Error(MSE) represents the average of the squares of errors -- i.e. the average squared difference between the estimated values and the actual values.
+![image](https://user-images.githubusercontent.com/14205986/165999302-eba3702d-81e3-4363-9c0e-d3bfceb7ec5a.png)
+## How to Use
+At minimum, this metric requires predictions and references as inputs.
+```python
+>>> mse_metric = evaluate.load("mse")
+>>> predictions = [2.5, 0.0, 2, 8]
+>>> references = [3, -0.5, 2, 7]
+>>> results = mse_metric.compute(predictions=predictions, references=references)
+```
+### Inputs
+Mandatory inputs: 
+- `predictions`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the estimated target values.
+- `references`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the ground truth (correct) target values.
+Optional arguments:
+- `sample_weight`: numeric array-like of shape (`n_samples,`) representing sample weights. The default is `None`.
+- `multioutput`: `raw_values`, `uniform_average` or numeric array-like of shape (`n_outputs,`), which defines the aggregation of multiple output values. The default value is `uniform_average`.
+  - `raw_values` returns a full set of errors in case of multioutput input.
+  - `uniform_average` means that the errors of all outputs are averaged with uniform weight. 
+  - the array-like value defines weights used to average errors.
+- `squared` (`bool`): If `True` returns MSE value, if `False` returns RMSE (Root Mean Squared Error). The default value is `True`.
+### Output Values
+This metric outputs a dictionary, containing the mean squared error score, which is of type:
+- `float`: if multioutput is `uniform_average` or an ndarray of weights, then the weighted average of all output errors is returned.
+- numeric array-like of shape (`n_outputs,`): if multioutput is `raw_values`, then the score is returned for each output separately. 
+Each MSE `float` value is non-negative: it ranges from `0.0` (the best value, meaning a perfect fit) upward, with no upper bound.
+Output Example(s):
+```python
+{'mse': 0.5}
+```
+If `multioutput="raw_values"`:
+```python
+{'mse': array([0.41666667, 1. ])}
+```
+#### Values from Popular Papers
+### Examples
+Example with the `uniform_average` config:
+```python
+>>> mse_metric = evaluate.load("mse")
+>>> predictions = [2.5, 0.0, 2, 8]
+>>> references = [3, -0.5, 2, 7]
+>>> results = mse_metric.compute(predictions=predictions, references=references)
+>>> print(results)
+{'mse': 0.375}
+```
+Example with `squared = False`, which returns the RMSE:
+```python
+>>> mse_metric = evaluate.load("mse")
+>>> predictions = [2.5, 0.0, 2, 8]
+>>> references = [3, -0.5, 2, 7]
+>>> rmse_result = mse_metric.compute(predictions=predictions, references=references, squared=False)
+>>> print(rmse_result)
+{'mse': 0.6123724356957945}
+```
+Example with multi-dimensional lists, and the `raw_values` config:
+```python
+>>> mse_metric = evaluate.load("mse", "multilist")
+>>> predictions = [[0.5, 1], [-1, 1], [7, -6]]
+>>> references = [[0, 2], [-1, 2], [8, -5]]
+>>> results = mse_metric.compute(predictions=predictions, references=references, multioutput='raw_values')
+>>> print(results) 
+{'mse': array([0.41666667, 1. ])}
+```
+## Limitations and Bias
+MSE has the disadvantage of heavily weighting outliers -- given that it squares them, this results in large errors weighing more heavily than small ones. It can be used alongside [MAE](https://huggingface.co/metrics/mae), which is complementary given that it does not square the errors. 
+## Citation(s)
+```bibtex
+@article{scikit-learn,
+  title={Scikit-learn: Machine Learning in {P}ython},
+  author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+  journal={Journal of Machine Learning Research},
+  volume={12},
+  pages={2825--2830},
+  year={2011}
+}
+```
+```bibtex
+@article{willmott2005advantages,
+  title={Advantages of the mean absolute error (MAE) over the root mean square error (RMSE) in assessing average model performance},
+  author={Willmott, Cort J and Matsuura, Kenji},
+  journal={Climate research},
+  volume={30},
+  number={1},
+  pages={79--82},
+  year={2005}
+}
+```
+## Further References
+- [Mean Squared Error - Wikipedia](https://en.wikipedia.org/wiki/Mean_squared_error)
--- a/evaluate-0.4.2/metrics/mse/app.py
+++ b/evaluate-0.4.2/metrics/mse/app.py
+# Demo entry point: load the "mse" metric and hand it to the Gradio widget launcher.
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# The metric is resolved by name through evaluate.load.
+module = evaluate.load("mse")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/mse/mse.py
+++ b/evaluate-0.4.2/metrics/mse/mse.py
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MSE - Mean Squared Error Metric"""
+import datasets
+from sklearn.metrics import mean_squared_error
+import evaluate
+_CITATION = """\
+@article{scikit-learn,
+ title={Scikit-learn: Machine Learning in {P}ython},
+ author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+         and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+         and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+         Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+ journal={Journal of Machine Learning Research},
+ volume={12},
+ pages={2825--2830},
+ year={2011}
+}
+"""
+_DESCRIPTION = """\
+Mean Squared Error(MSE) is the average of the square of difference between the predicted
+and actual values.
+"""
+_KWARGS_DESCRIPTION = """
+Args:
+    predictions: array-like of shape (n_samples,) or (n_samples, n_outputs)
+        Estimated target values.
+    references: array-like of shape (n_samples,) or (n_samples, n_outputs)
+        Ground truth (correct) target values.
+    sample_weight: array-like of shape (n_samples,), default=None
+        Sample weights.
+    multioutput: {"raw_values", "uniform_average"} or array-like of shape (n_outputs,), default="uniform_average"
+        Defines aggregating of multiple output values. Array-like value defines weights used to average errors.
+        "raw_values" : Returns a full set of errors in case of multioutput input.
+        "uniform_average" : Errors of all outputs are averaged with uniform weight.
+    squared : bool, default=True
+        If True returns MSE value, if False returns RMSE (Root Mean Squared Error) value.
+Returns:
+    mse : mean squared error.
+Examples:
+    >>> mse_metric = evaluate.load("mse")
+    >>> predictions = [2.5, 0.0, 2, 8]
+    >>> references = [3, -0.5, 2, 7]
+    >>> results = mse_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'mse': 0.375}
+    >>> rmse_result = mse_metric.compute(predictions=predictions, references=references, squared=False)
+    >>> print(rmse_result)
+    {'mse': 0.6123724356957945}
+    If you're using multi-dimensional lists, then set the config as follows :
+    >>> mse_metric = evaluate.load("mse", "multilist")
+    >>> predictions = [[0.5, 1], [-1, 1], [7, -6]]
+    >>> references = [[0, 2], [-1, 2], [8, -5]]
+    >>> results = mse_metric.compute(predictions=predictions, references=references)
+    >>> print(results)
+    {'mse': 0.7083333333333334}
+    >>> results = mse_metric.compute(predictions=predictions, references=references, multioutput='raw_values')
+    >>> print(results) # doctest: +NORMALIZE_WHITESPACE
+    {'mse': array([0.41666667, 1. ])}
+"""
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Mse(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(self._get_feature_types()),
+            reference_urls=[
+                "https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html"
+            ],
+        )
+    def _get_feature_types(self):
+        if self.config_name == "multilist":
+            return {
+                "predictions": datasets.Sequence(datasets.Value("float")),
+                "references": datasets.Sequence(datasets.Value("float")),
+            }
+        else:
+            return {
+                "predictions": datasets.Value("float"),
+                "references": datasets.Value("float"),
+            }
+    def _compute(self, predictions, references, sample_weight=None, multioutput="uniform_average", squared=True):
+        mse = mean_squared_error(
+            references, predictions, sample_weight=sample_weight, multioutput=multioutput, squared=squared
+        )
+        return {"mse": mse}
--- a/evaluate-0.4.2/metrics/mse/requirements.txt
+++ b/evaluate-0.4.2/metrics/mse/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+scikit-learn
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/nist_mt/README.md
+++ b/evaluate-0.4.2/metrics/nist_mt/README.md
+---
+title: NIST_MT
+emoji: 🤗 
+colorFrom: purple
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+- machine-translation
+description: 
+  DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU score.
+---
+# Metric Card for NIST's MT metric
+## Metric Description
+DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
+score. The official script used by NIST to compute BLEU and NIST score is
+mteval-14.pl. The main differences are:
+ - BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean.
+ - NIST has a different brevity penalty
+ - NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's 
+implementation of the NIST-specific tokenizer)
+## Intended Uses
+NIST was developed for machine translation evaluation.
+## How to Use
+```python
+import evaluate
+nist_mt = evaluate.load("nist_mt")
+hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party"
+reference1 = "It is a guide to action that ensures that the military will forever heed Party commands"
+reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party"
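+reference3 = "It is the practical guide for the army always to heed the directions of the party"
+nist_mt.compute(predictions=[hypothesis1], references=[[reference1, reference2, reference3]])
+# {'nist_mt': 3.3709935957649324}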
+```
+### Inputs
+- **predictions**: predictions to score (`list` of `str`). Each prediction is an untokenized sentence; tokenization is done internally with NLTK's NIST tokenizer.
+- **references**: potentially multiple references for each prediction (`list` of `str`, or `list` of `list` of `str`).
+- **n**: highest n-gram order (defaults to `5`)
+- **lowercase**: whether to lowercase the data (only applicable if `western_lang` is `True`; defaults to `False`)
+- **western_lang**: whether the current language is a Western language, which enables some specific tokenization rules with respect to, e.g., punctuation (defaults to `True`); see the NLTK tokenizer: https://github.com/nltk/nltk/blob/90fa546ea600194f2799ee51eaf1b729c128711e/nltk/tokenize/nist.py#L139
+### Output Values
+- **nist_mt** (`float`): NIST score
+Output Example:
+```python
+{'nist_mt': 3.3709935957649324}
+```
+## Citation
+```bibtex
+@inproceedings{10.5555/1289189.1289273,
+    author = {Doddington, George},
+    title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics},
+    year = {2002},
+    publisher = {Morgan Kaufmann Publishers Inc.},
+    address = {San Francisco, CA, USA},
+    booktitle = {Proceedings of the Second International Conference on Human Language Technology Research},
+    pages = {138–145},
+    numpages = {8},
+    location = {San Diego, California},
+    series = {HLT '02}
+}
+```
+## Further References
+This Hugging Face implementation uses [the NLTK implementation](https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py)
--- a/evaluate-0.4.2/metrics/nist_mt/app.py
+++ b/evaluate-0.4.2/metrics/nist_mt/app.py
+# Demo entry point: load the "nist_mt" metric and hand it to the Gradio widget launcher.
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# The metric is resolved by name through evaluate.load.
+module = evaluate.load("nist_mt")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/nist_mt/nist_mt.py
+++ b/evaluate-0.4.2/metrics/nist_mt/nist_mt.py
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""NLTK's NIST implementation on both the sentence and corpus level"""
+from typing import Dict, Optional
+import datasets
+import nltk
+from datasets import Sequence, Value
+try:
+    nltk.data.find("perluniprops")
+except LookupError:
+    nltk.download("perluniprops", quiet=True)  # NISTTokenizer requirement
+from nltk.tokenize.nist import NISTTokenizer
+from nltk.translate.nist_score import corpus_nist, sentence_nist
+import evaluate
+_CITATION = """\
+@inproceedings{10.5555/1289189.1289273,
+    author = {Doddington, George},
+    title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics},
+    year = {2002},
+    publisher = {Morgan Kaufmann Publishers Inc.},
+    address = {San Francisco, CA, USA},
+    booktitle = {Proceedings of the Second International Conference on Human Language Technology Research},
+    pages = {138–145},
+    numpages = {8},
+    location = {San Diego, California},
+    series = {HLT '02}
+}
+"""
+_DESCRIPTION = """\
+DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
+score. The official script used by NIST to compute BLEU and NIST score is
+mteval-14.pl. The main differences are:
+ - BLEU uses geometric mean of the ngram precisions, NIST uses arithmetic mean.
+ - NIST has a different brevity penalty
+ - NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's
+implementation of the NIST-specific tokenizer)
+"""
+_KWARGS_DESCRIPTION = """
+Computes NIST score of translated segments against one or more references.
+Args:
+    predictions: predictions to score (list of str)
+    references: potentially multiple references for each prediction (list of str or list of list of str)
+    n: highest n-gram order
+    lowercase: whether to lowercase the data (only applicable if 'western_lang' is True)
+    western_lang: whether the current language is a Western language, which will enable some specific tokenization
+ rules with respect to, e.g., punctuation
+Returns:
+    'nist_mt': nist_mt score
+Examples:
+    >>> nist_mt = evaluate.load("nist_mt")
+    >>> hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party"
+    >>> reference1 = "It is a guide to action that ensures that the military will forever heed Party commands"
+    >>> reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party"
+    >>> reference3 = "It is the practical guide for the army always to heed the directions of the party"
+    >>> nist_mt.compute(predictions=[hypothesis], references=[[reference1, reference2, reference3]])
+    {'nist_mt': 3.3709935957649324}
+    >>> nist_mt.compute(predictions=[hypothesis], references=[reference1])
+    {'nist_mt': 2.4477124183006533}
+"""
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class NistMt(evaluate.Metric):
+    """A wrapper around NLTK's NIST implementation."""
+    def _info(self):
+        return evaluate.MetricInfo(
+            module_type="metric",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=[
+                datasets.Features(
+                    {
+                        "predictions": Value("string", id="prediction"),
+                        "references": Sequence(Value("string", id="reference"), id="references"),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "predictions": Value("string", id="prediction"),
+                        "references": Value("string", id="reference"),
+                    }
+                ),
+            ],
+            homepage="https://www.nltk.org/api/nltk.translate.nist_score.html",
+            codebase_urls=["https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py"],
+            reference_urls=["https://en.wikipedia.org/wiki/NIST_(metric)"],
+        )
+    def _compute(self, predictions, references, n: int = 5, lowercase=False, western_lang=True):
+        tokenizer = NISTTokenizer()
+        # Account for single reference cases: references always need to have one more dimension than predictions
+        if isinstance(references[0], str):
+            references = [[ref] for ref in references]
+        predictions = [
+            tokenizer.tokenize(pred, return_str=False, lowercase=lowercase, western_lang=western_lang)
+            for pred in predictions
+        ]
+        references = [
+            [
+                tokenizer.tokenize(ref, return_str=False, lowercase=lowercase, western_lang=western_lang)
+                for ref in ref_sentences
+            ]
+            for ref_sentences in references
+        ]
+        return {"nist_mt": corpus_nist(list_of_references=references, hypotheses=predictions, n=n)}
--- a/evaluate-0.4.2/metrics/nist_mt/requirements.txt
+++ b/evaluate-0.4.2/metrics/nist_mt/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+nltk
--- a/evaluate-0.4.2/metrics/nist_mt/tests.py
+++ b/evaluate-0.4.2/metrics/nist_mt/tests.py
+from _pytest.fixtures import fixture
+from nist_mt import Nist_mt
+nist = Nist_mt()
+@fixture
+def hypothesis_sent():
+    return "It is a guide to action which ensures that the military always obeys the commands of the party"
+@fixture
+def reference_sent1():
+    return "It is a guide to action that ensures that the military will forever heed Party commands"
+@fixture
+def reference_sent2():
+    return (
+        "It is the guiding principle which guarantees the military forces always being under the command of the Party"
+    )
+@fixture
+def reference_sent3():
+    return "It is the practical guide for the army always to heed the directions of the party"
+def test_nist_sentence(hypothesis_sent, reference_sent1, reference_sent2, reference_sent3):
+    nist_score = nist.compute(
+        predictions=[hypothesis_sent], references=[[reference_sent1, reference_sent2, reference_sent3]]
+    )
+    assert abs(nist_score["nist_mt"] - 3.3709935957649324) < 1e-6
--- a/evaluate-0.4.2/metrics/pearsonr/README.md
+++ b/evaluate-0.4.2/metrics/pearsonr/README.md
+---
+title: Pearson Correlation Coefficient 
+emoji: 🤗 
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  Pearson correlation coefficient and p-value for testing non-correlation.
+  The Pearson correlation coefficient measures the linear relationship between two datasets. The calculation of the p-value relies on the assumption that each dataset is normally distributed. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
+  The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets.
+---
+# Metric Card for Pearson Correlation Coefficient (pearsonr)
+## Metric Description
+Pearson correlation coefficient and p-value for testing non-correlation.
+The Pearson correlation coefficient measures the linear relationship between two datasets. The calculation of the p-value relies on the assumption that each dataset is normally distributed. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
+The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets.
+## How to Use
+This metric takes a list of predictions and a list of references as input
+```python
+>>> pearsonr_metric = evaluate.load("pearsonr")
+>>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5])
+>>> print(round(results['pearsonr'], 2))
+-0.74
+```
+### Inputs
+- **predictions** (`list` of `float`): Predicted values, as returned by a model.
+- **references** (`list` of `float`): Ground truth values.
+- **return_pvalue** (`boolean`): If `True`, returns the p-value, along with the correlation coefficient. If `False`, returns only the correlation coefficient. Defaults to `False`.
+### Output Values
+- **pearsonr**(`float`): Pearson correlation coefficient. Minimum possible value is -1. Maximum possible value is 1. Values of 1 and -1 indicate exact linear positive and negative relationships, respectively. A value of 0 implies no correlation.
+- **p-value**(`float`): P-value, which roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets. Minimum possible value is 0. Maximum possible value is 1. Higher values indicate higher probabilities.
+Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
+Output Example(s):
+```python
+{'pearsonr': -0.7}
+```
+```python
+{'p-value': 0.15}
+```
+#### Values from Popular Papers
+### Examples
+Example 1-A simple example using only predictions and references.
+```python
+>>> pearsonr_metric = evaluate.load("pearsonr")
+>>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5])
+>>> print(round(results['pearsonr'], 2))
+-0.74
+```
+Example 2-The same as Example 1, but that also returns the `p-value`.
+```python
+>>> pearsonr_metric = evaluate.load("pearsonr")
+>>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5], return_pvalue=True)
+>>> print(sorted(list(results.keys())))
+['p-value', 'pearsonr']
+>>> print(round(results['pearsonr'], 2))
+-0.74
+>>> print(round(results['p-value'], 2))
+0.15
+```
+## Limitations and Bias
+As stated above, the calculation of the p-value relies on the assumption that each data set is normally distributed. This is not always the case, so verifying the true distribution of datasets is recommended.
+## Citation(s)
+```bibtex
+@article{2020SciPy-NMeth,
+author  = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and
+      Haberland, Matt and Reddy, Tyler and Cournapeau, David and
+      Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and
+      Bright, Jonathan and {van der Walt}, St{\'e}fan J. and
+      Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and
+      Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and
+      Kern, Robert and Larson, Eric and Carey, C J and
+      Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and
+      {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and
+      Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and
+      Harris, Charles R. and Archibald, Anne M. and
+      Ribeiro, Ant{\^o}nio H. and Pedregosa, Fabian and
+      {van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
+title   = {{{SciPy} 1.0: Fundamental Algorithms for Scientific
+      Computing in Python}},
+journal = {Nature Methods},
+year    = {2020},
+volume  = {17},
+pages   = {261--272},
+adsurl  = {https://rdcu.be/b08Wh},
+doi = {10.1038/s41592-019-0686-2},
+}
+```
+## Further References
--- a/evaluate-0.4.2/metrics/pearsonr/app.py
+++ b/evaluate-0.4.2/metrics/pearsonr/app.py
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# Load the "pearsonr" metric module and hand it to evaluate's Gradio widget launcher.
+module = evaluate.load("pearsonr")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/pearsonr/pearsonr.py
+++ b/evaluate-0.4.2/metrics/pearsonr/pearsonr.py
+# Copyright 2021 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pearson correlation coefficient metric."""
+import datasets
+from scipy.stats import pearsonr
+import evaluate
+# Human-readable summary of the metric; used as MetricInfo(description=...) and
+# passed to add_start_docstrings on the Pearsonr class.
+_DESCRIPTION = """
+Pearson correlation coefficient and p-value for testing non-correlation.
+The Pearson correlation coefficient measures the linear relationship between two datasets. The calculation of the p-value relies on the assumption that each dataset is normally distributed. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.
+The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets.
+"""
+_KWARGS_DESCRIPTION = """
+Args:
+    predictions (`list` of `int`): Predicted class labels, as returned by a model.
+    references (`list` of `int`): Ground truth labels.
+    return_pvalue (`boolean`): If `True`, returns the p-value, along with the correlation coefficient. If `False`, returns only the correlation coefficient. Defaults to `False`.
+Returns:
+    pearsonr (`float`): Pearson correlation coefficient. Minimum possible value is -1. Maximum possible value is 1. Values of 1 and -1 indicate exact linear positive and negative relationships, respectively. A value of 0 implies no correlation.
+    p-value (`float`): P-value, which roughly indicates the probability of an The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets. Minimum possible value is 0. Maximum possible value is 1. Higher values indicate higher probabilities.
+Examples:
+    Example 1-A simple example using only predictions and references.
+        >>> pearsonr_metric = evaluate.load("pearsonr")
+        >>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5])
+        >>> print(round(results['pearsonr'], 2))
+        -0.74
+    Example 2-The same as Example 1, but that also returns the `p-value`.
+        >>> pearsonr_metric = evaluate.load("pearsonr")
+        >>> results = pearsonr_metric.compute(predictions=[10, 9, 2.5, 6, 4], references=[1, 2, 3, 4, 5], return_pvalue=True)
+        >>> print(sorted(list(results.keys())))
+        ['p-value', 'pearsonr']
+        >>> print(round(results['pearsonr'], 2))
+        -0.74
+        >>> print(round(results['p-value'], 2))
+        0.15
+"""
+_CITATION = """
+@article{2020SciPy-NMeth,
+author  = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and
+      Haberland, Matt and Reddy, Tyler and Cournapeau, David and
+      Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and
+      Bright, Jonathan and {van der Walt}, St{\'e}fan J. and
+      Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and
+      Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and
+      Kern, Robert and Larson, Eric and Carey, C J and
+      Polat, Ilhan and Feng, Yu and Moore, Eric W. and
+      {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and
+      Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and
+      Harris, Charles R. and Archibald, Anne M. and
+      Ribeiro, Antonio H. and Pedregosa, Fabian and
+      {van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
+title   = {{{SciPy} 1.0: Fundamental Algorithms for Scientific
+      Computing in Python}},
+journal = {Nature Methods},
+year    = {2020},
+volume  = {17},
+pages   = {261--272},
+adsurl  = {https://rdcu.be/b08Wh},
+doi = {10.1038/s41592-019-0686-2},
+}
+"""
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Pearsonr(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Value("float"),
+                    "references": datasets.Value("float"),
+                }
+            ),
+            reference_urls=["https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html"],
+        )
+    def _compute(self, predictions, references, return_pvalue=False):
+        if return_pvalue:
+            results = pearsonr(references, predictions)
+            return {"pearsonr": results[0], "p-value": results[1]}
+        else:
+            return {"pearsonr": float(pearsonr(references, predictions)[0])}
--- a/evaluate-0.4.2/metrics/pearsonr/requirements.txt
+++ b/evaluate-0.4.2/metrics/pearsonr/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+scipy
\ No newline at end of file
--- a/evaluate-0.4.2/metrics/perplexity/README.md
+++ b/evaluate-0.4.2/metrics/perplexity/README.md
+---
+title: Perplexity
+emoji: 🤗
+colorFrom: blue
+colorTo: red
+sdk: gradio
+sdk_version: 3.19.1
+app_file: app.py
+pinned: false
+tags:
+- evaluate
+- metric
+description: >-
+  Perplexity (PPL) is one of the most common metrics for evaluating language models.
+  It is defined as the exponentiated average negative log-likelihood of a sequence, calculated with exponent base `e`.
+  For more information on perplexity, see [this tutorial](https://huggingface.co/docs/transformers/perplexity).
+---
+# Metric Card for Perplexity
+## Metric Description
+Given a model and an input text sequence, perplexity measures how likely the model is to generate the input text sequence.
+As a metric, it can be used to evaluate how well the model has learned the distribution of the text it was trained on.
+In this case, `model_id` should be the trained model to be evaluated, and the input texts should be the text that the model was trained on.
+This implementation of perplexity is calculated with log base `e`, as in `perplexity = e**(sum(losses) / num_tokenized_tokens)`, following recent convention in deep learning frameworks.
+## Intended Uses
+Any language generation task.
+## How to Use
+The metric takes a list of text as input, as well as the name of the model used to compute the metric:
+```python
+from evaluate import load
+perplexity = load("perplexity", module_type="metric")
+results = perplexity.compute(predictions=predictions, model_id='gpt2')
+```
+### Inputs
+- **model_id** (str): model used for calculating Perplexity. NOTE: Perplexity can only be calculated for causal language models.
+    - This includes models such as gpt2, causal variations of bert, causal versions of t5, and more (the full list can be found in the AutoModelForCausalLM documentation here: https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )
+- **predictions** (list of str): input text, where each separate text snippet is one list entry.
+- **batch_size** (int): the batch size to run texts through the model. Defaults to 16.
+- **add_start_token** (bool): whether to add the start token to the texts, so the perplexity can include the probability of the first word. Defaults to True.
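+- **device** (str): device to run on; one of `cpu`, `gpu` or `cuda` (`gpu` is treated as `cuda`). Defaults to `cuda` when available, otherwise `cpu`.
+- **max_length** (int): if set, each input text is truncated to this many tokens, counting the start token when `add_start_token` is True. Defaults to `None` (no truncation).
+### Output Values
+This metric outputs a dictionary with the perplexity scores for the text input in the list, and the average perplexity.
+If `max_length` is set, any input text longer than `max_length` tokens is truncated to that length for the perplexity computation.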
+```
+{'perplexities': [8.182524681091309, 33.42122268676758, 27.012239456176758], 'mean_perplexity': 22.871995608011883}
+```
+The range of this metric is [1, inf): perplexity is the exponential of a non-negative average loss, so it cannot fall below 1. A lower score is better.
+#### Values from Popular Papers
+### Examples
+Calculating perplexity on predictions defined here:
+```python
+perplexity = evaluate.load("perplexity", module_type="metric")
+input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]
+results = perplexity.compute(model_id='gpt2',
+                             add_start_token=False,
+                             predictions=input_texts)
+print(list(results.keys()))
+>>>['perplexities', 'mean_perplexity']
+print(round(results["mean_perplexity"], 2))
+>>>646.75
+print(round(results["perplexities"][0], 2))
+>>>32.25
+```
+Calculating perplexity on predictions loaded in from a dataset:
+```python
+perplexity = evaluate.load("perplexity", module_type="metric")
+input_texts = datasets.load_dataset("wikitext",
+                                    "wikitext-2-raw-v1",
+                                    split="test")["text"][:50]
+input_texts = [s for s in input_texts if s!='']
+results = perplexity.compute(model_id='gpt2',
+                             predictions=input_texts)
+print(list(results.keys()))
+>>>['perplexities', 'mean_perplexity']
+print(round(results["mean_perplexity"], 2))
+>>>576.76
+print(round(results["perplexities"][0], 2))
+>>>889.28
+```
+## Limitations and Bias
+Note that the output value is based heavily on what text the model was trained on. This means that perplexity scores are not comparable between models or datasets.
+See Meister and Cotterell, ["Language Model Evaluation Beyond Perplexity"](https://arxiv.org/abs/2106.00085) (2021) for more information about alternative model evaluation strategies.
+## Citation
+```bibtex
+@article{jelinek1977perplexity,
+title={Perplexity—a measure of the difficulty of speech recognition tasks},
+author={Jelinek, Fred and Mercer, Robert L and Bahl, Lalit R and Baker, James K},
+journal={The Journal of the Acoustical Society of America},
+volume={62},
+number={S1},
+pages={S63--S63},
+year={1977},
+publisher={Acoustical Society of America}
+}
+```
+## Further References
+- [Hugging Face Perplexity Blog Post](https://huggingface.co/docs/transformers/perplexity)
--- a/evaluate-0.4.2/metrics/perplexity/app.py
+++ b/evaluate-0.4.2/metrics/perplexity/app.py
+import evaluate
+from evaluate.utils import launch_gradio_widget
+# Load the "perplexity" metric (module_type pinned to "metric") and hand it to
+# evaluate's Gradio widget launcher.
+module = evaluate.load("perplexity", module_type="metric")
+launch_gradio_widget(module)
--- a/evaluate-0.4.2/metrics/perplexity/perplexity.py
+++ b/evaluate-0.4.2/metrics/perplexity/perplexity.py
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Perplexity Metric."""
+import datasets
+import numpy as np
+import torch
+from torch.nn import CrossEntropyLoss
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import evaluate
+from evaluate import logging
+_CITATION = """\
+"""
+# Human-readable summary of the metric; used as MetricInfo(description=...) and
+# passed to add_start_docstrings on the Perplexity class.
+_DESCRIPTION = """
+Perplexity (PPL) is one of the most common metrics for evaluating language models.
+It is defined as the exponentiated average negative log-likelihood of a sequence, calculated with exponent base `e`.
+For more information, see https://huggingface.co/docs/transformers/perplexity
+"""
+_KWARGS_DESCRIPTION = """
+Args:
+    model_id (str): model used for calculating Perplexity
+            NOTE: Perplexity can only be calculated for causal language models.
+                    This includes models such as gpt2, causal variations of bert,
+                    causal versions of t5, and more (the full list can be found
+                    in the AutoModelForCausalLM documentation here:
+                    https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )
+    predictions (list of str): input text, each separate text snippet
+        is one list entry.
+    batch_size (int): the batch size to run texts through the model. Defaults to 16.
+    add_start_token (bool): whether to add the start token to the texts,
+        so the perplexity can include the probability of the first word. Defaults to True.
+    device (str): device to run on, defaults to 'cuda' when available
+Returns:
+    perplexity: dictionary containing the perplexity scores for the texts
+        in the input list, as well as the mean perplexity. If one of the input texts is
+        longer than the max input length of the model, then it is truncated to the
+        max length for the perplexity computation.
+Examples:
+    Example 1:
+        >>> perplexity = evaluate.load("perplexity", module_type="metric")
+        >>> input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]
+        >>> results = perplexity.compute(model_id='gpt2',
+        ...                              add_start_token=False,
+        ...                              predictions=input_texts) # doctest:+ELLIPSIS
+        >>> print(list(results.keys()))
+        ['perplexities', 'mean_perplexity']
+        >>> print(round(results["mean_perplexity"], 0))
+        647.0
+        >>> print(round(results["perplexities"][0], 0))
+        32.0
+    Example 2:
+        >>> from datasets import load_dataset
+        >>> perplexity = evaluate.load("perplexity", module_type="metric")
+        >>> input_texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:10] # doctest: +SKIP
+        >>> input_texts = [s for s in input_texts if s!='']
+        >>> results = perplexity.compute(model_id='gpt2',
+        ...                              predictions=input_texts)
+        >>> print(list(results.keys()))
+        ['perplexities', 'mean_perplexity']
+        >>> print(round(results["mean_perplexity"], 2)) # doctest: +SKIP
+        576.76
+        >>> print(round(results["perplexities"][0], 2)) # doctest: +SKIP
+        889.28
+"""
+@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+class Perplexity(evaluate.Metric):
+    def _info(self):
+        return evaluate.MetricInfo(
+            module_type="metric",
+            description=_DESCRIPTION,
+            citation=_CITATION,
+            inputs_description=_KWARGS_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "predictions": datasets.Value("string"),
+                }
+            ),
+            reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
+        )
+    def _compute(
+        self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
+    ):
+        if device is not None:
+            assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
+            if device == "gpu":
+                device = "cuda"
+        else:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+        model = AutoModelForCausalLM.from_pretrained(model_id)
+        model = model.to(device)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        # if batch_size > 1 (which generally leads to padding being required), and
+        # if there is not an already assigned pad_token, assign an existing
+        # special token to also be the padding token
+        if tokenizer.pad_token is None and batch_size > 1:
+            existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
+            # check that the model already has at least one special token defined
+            assert (
+                len(existing_special_tokens) > 0
+            ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
+            # assign one of the special tokens to also be the pad token
+            tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
+        if add_start_token and max_length:
+            # leave room for <BOS> token to be added:
+            assert (
+                tokenizer.bos_token is not None
+            ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
+            max_tokenized_len = max_length - 1
+        else:
+            max_tokenized_len = max_length
+        encodings = tokenizer(
+            predictions,
+            add_special_tokens=False,
+            padding=True,
+            truncation=True if max_tokenized_len else False,
+            max_length=max_tokenized_len,
+            return_tensors="pt",
+            return_attention_mask=True,
+        ).to(device)
+        encoded_texts = encodings["input_ids"]
+        attn_masks = encodings["attention_mask"]
+        # check that each input is long enough:
+        if add_start_token:
+            assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
+        else:
+            assert torch.all(
+                torch.ge(attn_masks.sum(1), 2)
+            ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."
+        ppls = []
+        loss_fct = CrossEntropyLoss(reduction="none")
+        for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
+            end_index = min(start_index + batch_size, len(encoded_texts))
+            encoded_batch = encoded_texts[start_index:end_index]
+            attn_mask = attn_masks[start_index:end_index]
+            if add_start_token:
+                bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
+                encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
+                attn_mask = torch.cat(
+                    [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1
+                )
+            labels = encoded_batch
+            with torch.no_grad():
+                out_logits = model(encoded_batch, attention_mask=attn_mask).logits
+            shift_logits = out_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            shift_attention_mask_batch = attn_mask[..., 1:].contiguous()
+            perplexity_batch = torch.exp(
+                (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
+                / shift_attention_mask_batch.sum(1)
+            )
+            ppls += perplexity_batch.tolist()
+        return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}
--- a/evaluate-0.4.2/metrics/perplexity/requirements.txt
+++ b/evaluate-0.4.2/metrics/perplexity/requirements.txt
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+torch
+transformers
\ No newline at end of file