),"Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
max_tokenized_len=max_length-1
else:
max_tokenized_len=max_length
encodings=tokenizer(
data,
add_special_tokens=False,
padding=True,
truncation=Trueifmax_tokenized_lenelseFalse,
max_length=max_tokenized_len,
return_tensors="pt",
return_attention_mask=True,
).to(device)
encoded_texts=encodings["input_ids"]
attn_masks=encodings["attention_mask"]
# check that each input is long enough:
ifadd_start_token:
asserttorch.all(torch.ge(attn_masks.sum(1),1)),"Each input text must be at least one token long."
else:
asserttorch.all(
torch.ge(attn_masks.sum(1),2)
),"When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."
Regard aims to measure language polarity towards and social perceptions of a demographic (e.g. gender, race, sexual orientation).
---
# Measurement Card for Regard
## Measurement Description
The `regard` measurement returns the estimated language polarity towards and social perceptions of a demographic (e.g. gender, race, sexual orientation).
It uses a model trained on labelled data from the paper ["The Woman Worked as a Babysitter: On Biases in Language Generation" (EMNLP 2019)](https://arxiv.org/abs/1909.01326).
## How to Use
This measurement requires two lists of strings as input, enabling a comparison of the estimated polarity between the two groups.

```python
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> regard.compute(data=group1, references=group2)
```
### Inputs
- **data** (list of `str`): prediction/candidate sentences, e.g. sentences describing a given demographic group.
- **references** (list of `str`) (optional): reference/comparison sentences, e.g. sentences describing a different demographic group to compare against.
- **aggregation** (`str`) (optional): determines the type of aggregation performed (illustrated in the sketch after this list).
  If set to `None`, the difference between the regard scores for the two groups is returned.
  Otherwise:
    - `average`: returns the average regard for each category (negative, positive, neutral, other) for each group
    - `maximum`: returns the maximum regard for each group
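As an illustration of the `aggregation` parameter, here is a sketch reusing `group1` and `group2` from the example above with the `compare` configuration (exact scores depend on the underlying classifier, so outputs are omitted):

```python
>>> regard = evaluate.load("regard", "compare")
>>> # average regard per category (negative, positive, neutral, other) for each group;
>>> # returns a dict with 'average_data_regard' and 'average_references_regard'
>>> results = regard.compute(data=group1, references=group2, aggregation="average")
>>> # maximum regard for each group;
>>> # returns a dict with 'max_data_regard' and 'max_references_regard'
>>> results = regard.compute(data=group1, references=group2, aggregation="maximum")
```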
### Output Values
**With a single input** (`data` only):

`regard`: the regard scores of each string in the input list (if no aggregation is specified)

**With two lists of inputs** (`data` and `references`, i.e. the `compare` config):

By default, this measurement outputs a dictionary (`regard_difference`) containing the difference in regard between the two groups, with one score for each category (negative, positive, neutral, other).
author = {Sheng, Emily and Chang, Kai-Wei and Natarajan, Premkumar and Peng, Nanyun},
title = {The Woman Worked as a Babysitter: On Biases in Language Generation},
publisher = {arXiv},
year = {2019}
}
"""
_DESCRIPTION="""\
Regard aims to measure language polarity towards and social perceptions of a demographic (e.g. gender, race, sexual orientation).
"""
_KWARGS_DESCRIPTION = """
Compute the regard of the input sentences.
Args:
`data` (list of str): prediction/candidate sentences, e.g. sentences describing a given demographic group.
`references` (list of str) (optional): reference/comparison sentences, e.g. sentences describing a different demographic group to compare against.
`aggregation` (str) (optional): determines the type of aggregation performed.
If set to `None`, the difference between the regard scores for the two groups is returned.
Otherwise:
- 'average' : returns the average regard for each category (negative, positive, neutral, other) for each group
- 'maximum': returns the maximum regard for each group
Returns:
With only `data` as input (default config):
`regard` : the regard scores of each string in the input list (if no aggregation is specified)
`average_regard`: the average regard for each category (negative, positive, neutral, other) (if `aggregation` = `average`)
`max_regard`: the maximum regard across all input strings (if `aggregation` = `maximum`)
With `data` and `references` as input (`compare` config):
`regard_difference`: the difference between the regard scores for the two groups (if no aggregation is specified)
`average_data_regard` and `average_references_regard`: the average regard for each category (negative, positive, neutral, other) (if `aggregation` = `average`)
`max_data_regard` and `max_references_regard`: the maximum regard for each group (if `aggregation` = `maximum`)
Examples:
Example 1 (single input):
>>> regard = evaluate.load("regard")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> results = regard.compute(data = group1)
>>> for d in results['regard']:
... print({l['label']: round(l['score'],2) for l in d})
The toxicity measurement aims to quantify the toxicity of the input texts using a pretrained hate speech classification model.
---
# Measurement Card for Toxicity
## Measurement description
The toxicity measurement aims to quantify the toxicity of the input texts using a pretrained hate speech classification model.
## How to use
The default model used is [roberta-hate-speech-dynabench-r4](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target). In this model, ‘hate’ is defined as “abusive speech targeting specific group characteristics, such as ethnic origin, religion, gender, or sexual orientation.” Definitions used by other classifiers may vary.
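For example, a minimal usage sketch with the default model (the example sentences are made up, and the scores depend on the classifier, so outputs are omitted):

```python
>>> import evaluate
>>> toxicity = evaluate.load("toxicity", module_type="measurement")
>>> results = toxicity.compute(predictions=["she went to the library", "he played with his dog"])
>>> results["toxicity"]  # list of toxicity scores, one per sentence in `predictions`
```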
When loading the measurement, you can also specify another model:
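For example (a sketch; `"path/to/alternative-hate-speech-model"` below is a placeholder for whichever Hub checkpoint you want to use):

```python
>>> toxicity = evaluate.load("toxicity", "path/to/alternative-hate-speech-model", module_type="measurement")
```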
The model should be compatible with the AutoModelForSequenceClassification class.
For more information, see [the AutoModelForSequenceClassification documentation](https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForSequenceClassification).
Args:
`predictions` (list of str): prediction/candidate sentences
`toxic_label` (str) (optional): the toxic label that you want to detect, depending on the labels that the model has been trained on.
This can be found by inspecting the model's `id2label` mapping (part of its configuration), e.g.:
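For instance (a sketch; the checkpoint name is a placeholder, and the labels in the comment are only an example of what such a mapping might contain):

```python
>>> from transformers import AutoModelForSequenceClassification
>>> model = AutoModelForSequenceClassification.from_pretrained("path/to/alternative-hate-speech-model")  # placeholder checkpoint
>>> model.config.id2label  # e.g. {0: 'not offensive', 1: 'offensive'}
```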
In this case, the `toxic_label` would be `offensive`.
`aggregation` (optional): determines the type of aggregation performed on the data. If set to `None`, the scores for each prediction are returned.
Otherwise:
- 'maximum': returns the maximum toxicity over all predictions
- 'ratio': the percentage of predictions with toxicity above a certain threshold.
`threshold` (`float`) (optional): the toxicity detection threshold used for calculating the 'ratio' aggregation, described above. The default threshold is 0.5, based on the one established by [RealToxicityPrompts](https://arxiv.org/abs/2009.11462).
## Output values
`toxicity`: a list of toxicity scores, one for each sentence in `predictions` (default behavior)
`max_toxicity`: the maximum toxicity over all scores (if `aggregation` = `maximum`)
`toxicity_ratio`: the percentage of predictions with toxicity at or above the threshold (0.5 by default) (if `aggregation` = `ratio`)
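A sketch of the two aggregation modes (reusing the example sentences from above; scores depend on the classifier, so outputs are omitted):

```python
>>> preds = ["she went to the library", "he played with his dog"]
>>> toxicity = evaluate.load("toxicity", module_type="measurement")
>>> toxicity.compute(predictions=preds, aggregation="maximum")               # {'max_toxicity': ...}
>>> toxicity.compute(predictions=preds, aggregation="ratio", threshold=0.5)  # {'toxicity_ratio': ...}
```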
Returns the total number of words, and the number of unique words in the input data.
---
# Measurement Card for Word Count
## Measurement Description
The `word_count` measurement returns the total number of words and the number of unique words in the input string(s), using scikit-learn's [`CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html).
## How to Use
This measurement requires a list of strings as input:
```python
>>>data=["hello world and hello moon"]
>>>wordcount=evaluate.load("word_count")
>>>results=wordcount.compute(data=data)
```
### Inputs
- **data** (list of `str`): the input list of strings for which the word count is calculated.
- **max_vocab** (`int`) (optional): the maximum number of (most frequent) words to keep in the vocabulary; can be specified if the dataset is large (see the sketch after this list).
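For illustration, a sketch of `max_vocab` (the data is made up; with `max_vocab=2`, only the two most frequent words are counted):

```python
>>> data = ["hello world and hello moon"]
>>> wordcount = evaluate.load("word_count")
>>> results = wordcount.compute(data=data, max_vocab=2)
```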
### Output Values
- **total_word_count** (`int`): the total number of words in the input string(s).
- **unique_words** (`int`): the number of unique words in the input string(s).

Output Example(s):

```python
{'total_word_count': 5, 'unique_words': 4}
```
### Examples
Example for a single string
```python
>>> data = ["hello sun and goodbye moon"]
>>> wordcount = evaluate.load("word_count")
>>> results = wordcount.compute(data=data)
>>> print(results)
{'total_word_count': 5, 'unique_words': 5}
```
Example for multiple strings
```python
>>> data = ["hello sun and goodbye moon", "foo bar foo bar"]
>>> wordcount = evaluate.load("word_count")
>>> results = wordcount.compute(data=data)
>>> print(results)
{'total_word_count': 9, 'unique_words': 7}
```
Returns the average length (in terms of the number of words) of the input data.
---
# Measurement Card for Word Length
## Measurement Description
The `word_length` measurement returns the average word count of the input strings, based on tokenization using [NLTK word_tokenize](https://www.nltk.org/api/nltk.tokenize.html).
## How to Use
This measurement requires a list of strings as input:
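For example, a minimal sketch (the input strings are illustrative):

```python
>>> data = ["hello world", "the quick brown fox jumps over the lazy dog"]
>>> wordlength = evaluate.load("word_length", module_type="measurement")
>>> results = wordlength.compute(data=data)
>>> results["average_word_length"]  # average number of words per input string
```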
### Inputs
- **data** (list of `str`): the input list of strings for which the average word length is calculated.
- **tokenizer** (`Callable`) (optional): the approach used for tokenizing `data`. The default tokenizer is [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html). This can be replaced by any function that takes a string as input and returns a list of tokens as output (see the sketch after this list).
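For instance, a sketch with a custom tokenizer (plain `str.split`, which simply splits on whitespace, reusing the `wordlength` object loaded above):

```python
>>> results = wordlength.compute(data=data, tokenizer=str.split)
```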
### Output Values
- **average_word_length** (`float`): the average number of words in the input string(s).
Output Example(s):
```python
{"average_word_length":245}
```
This measurement outputs a dictionary containing the average number of words in the input string(s) (`average_word_length`).