# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perplexity Metric."""
import datasets
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer
import evaluate
from evaluate import logging
_CITATION = """\
"""
_DESCRIPTION = """
Perplexity (PPL) can be used for evaluating to what extent a dataset is similar to the distribution of text that a given model was trained on.
It is defined as the exponentiated average negative log-likelihood of a sequence, calculated with exponent base `e`.
For more information, see https://huggingface.co/docs/transformers/perplexity
"""
_KWARGS_DESCRIPTION = """
Args:
model_id (str): model used for calculating Perplexity
NOTE: Perplexity can only be calculated for causal language models.
This includes models such as gpt2, causal variations of bert,
causal versions of t5, and more (the full list can be found
in the AutoModelForCausalLM documentation here:
https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )
data (list of str): input data, each separate text snippet
is one list entry.
batch_size (int): the batch size to run texts through the model. Defaults to 16.
add_start_token (bool): whether to add the start token to the texts,
so the perplexity can include the probability of the first word. Defaults to True.
device (str): device to run on, defaults to 'cuda' when available
max_length (int): the maximum length to truncate input texts to. Should be set to the maximum length the model supports. Defaults to None.
Returns:
perplexity: dictionary containing the perplexity scores for the texts
in the input list, as well as the mean perplexity. If one of the input texts is
longer than the max input length of the model, then it is truncated to the
max length for the perplexity computation.
Examples:
Example 1:
>>> perplexity = evaluate.load("perplexity", module_type="measurement")
>>> data = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]
>>> results = perplexity.compute(model_id='gpt2',
... add_start_token=False,
... data=data) # doctest:+ELLIPSIS
>>> print(list(results.keys()))
['perplexities', 'mean_perplexity']
>>> print(round(results["mean_perplexity"], 0))
647.0
>>> print(round(results["perplexities"][0], 0))
32.0
Example 2:
>>> from datasets import load_dataset
>>> perplexity = evaluate.load("perplexity", module_type="measurement")
>>> data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:10] # doctest: +SKIP
>>> data = [s for s in data if s!='']
>>> results = perplexity.compute(model_id='gpt2',
... data=data)
>>> print(list(results.keys()))
['perplexities', 'mean_perplexity']
>>> print(round(results["mean_perplexity"], 2)) # doctest: +SKIP
576.76
>>> print(round(results["perplexities"][0], 2)) # doctest: +SKIP
889.28
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Perplexity(evaluate.Measurement):
def _info(self):
return evaluate.MeasurementInfo(
module_type="measurement",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"data": datasets.Value("string"),
}
),
reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
)
def _compute(
self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
):
if device is not None:
assert device in ["gpu", "cpu", "cuda"], "device should be either cpu, cuda, or gpu."
if device == "gpu":
device = "cuda"
else:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForCausalLM.from_pretrained(model_id)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# if batch_size > 1 (which generally leads to padding being required), and
# if there is not an already assigned pad_token, assign an existing
# special token to also be the padding token
if tokenizer.pad_token is None and batch_size > 1:
existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
# check that the model already has at least one special token defined
assert (
len(existing_special_tokens) > 0
), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
# assign one of the special tokens to also be the pad token
tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
if add_start_token and max_length:
# leave room for <BOS> token to be added:
assert (
tokenizer.bos_token is not None
), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
max_tokenized_len = max_length - 1
else:
max_tokenized_len = max_length
encodings = tokenizer(
data,
add_special_tokens=False,
padding=True,
truncation=True if max_tokenized_len else False,
max_length=max_tokenized_len,
return_tensors="pt",
return_attention_mask=True,
).to(device)
encoded_texts = encodings["input_ids"]
attn_masks = encodings["attention_mask"]
# check that each input is long enough:
if add_start_token:
assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
else:
assert torch.all(
torch.ge(attn_masks.sum(1), 2)
), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."
ppls = []
loss_fct = CrossEntropyLoss(reduction="none")
for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
end_index = min(start_index + batch_size, len(encoded_texts))
encoded_batch = encoded_texts[start_index:end_index]
attn_mask = attn_masks[start_index:end_index]
if add_start_token:
bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
attn_mask = torch.cat(
[torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1
)
labels = encoded_batch
with torch.no_grad():
out_logits = model(encoded_batch, attention_mask=attn_mask).logits
shift_logits = out_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
shift_attention_mask_batch = attn_mask[..., 1:].contiguous()
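# per-sequence perplexity: exponentiate the cross-entropy averaged over non-padded target positions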
perplexity_batch = torch.exp(
(loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
/ shift_attention_mask_batch.sum(1)
)
ppls += perplexity_batch.tolist()
return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
torch
transformers
---
title: Regard
emoji: 🤗
colorFrom: green
colorTo: purple
sdk: gradio
sdk_version: 3.0.2
app_file: app.py
pinned: false
tags:
- evaluate
- measurement
description: >-
Regard aims to measure language polarity towards and social perceptions of a demographic (e.g. gender, race, sexual orientation).
---
# Measurement Card for Regard
## Measurement Description
The `regard` measurement returns the estimated language polarity towards and social perceptions of a demographic (e.g. gender, race, sexual orientation).
It uses a model trained on labelled data from the paper ["The Woman Worked as a Babysitter: On Biases in Language Generation" (EMNLP 2019)](https://arxiv.org/abs/1909.01326)
## How to Use
This measurement requires two lists of strings as input, enabling a comparison of the estimated polarity between the two groups.
```python
>>> regard = evaluate.load("regard", module_type="measurement")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> regard.compute(data = group1, references = group2)
```
### Inputs
- **data** (list of `str`): prediction/candidate sentences, e.g. sentences describing a given demographic group.
- **references** (list of `str`) (optional): reference/comparison sentences, e.g. sentences describing a different demographic group to compare against.
- **aggregation** (`str`) (optional): determines the type of aggregation performed.
If set to `None`, the difference between the regard scores for the two groups is returned.
Otherwise:
- `average` : returns the average regard for each category (negative, positive, neutral, other) for each group
- `maximum`: returns the maximum regard for each group
### Output Values
**With a single input**:
`regard` : the regard scores of each string in the input list (if no aggregation is specified)
```python
{'neutral': 0.95, 'positive': 0.02, 'negative': 0.02, 'other': 0.01}
{'negative': 0.97, 'other': 0.02, 'neutral': 0.01, 'positive': 0.0}
```
`average_regard`: the average regard for each category (negative, positive, neutral, other) (if `aggregation` = `average`)
```python
{'neutral': 0.48, 'positive': 0.01, 'negative': 0.5, 'other': 0.01}
```
`max_regard`: the maximum regard across all input strings (if `aggregation` = `maximum`)
```python
{'neutral': 0.95, 'positive': 0.024, 'negative': 0.972, 'other': 0.019}
```
**With two lists of inputs**:
By default, this measurement outputs a dictionary containing the difference in regard between the two groups for each category (negative, positive, neutral, other):
```python
{'neutral': 0.35, 'negative': -0.36, 'other': 0.01, 'positive': 0.01}
```
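This difference is a per-category subtraction: the mean regard of the `data` group minus the mean regard of the `references` group. A minimal sketch, using the per-group averages from the examples below (variable names are illustrative):
```python
data_mean = {"neutral": 0.48, "positive": 0.01, "negative": 0.5, "other": 0.01}
references_mean = {"neutral": 0.02, "positive": 0.0, "negative": 0.96, "other": 0.02}

# per-category difference: mean regard of `data` minus mean regard of `references`
regard_difference = {k: round(data_mean[k] - references_mean.get(k, 0), 2) for k in data_mean}
print(regard_difference)
# {'neutral': 0.46, 'positive': 0.01, 'negative': -0.46, 'other': -0.01}
```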
With the `aggregation='maximum'` option, this measurement will output the maximum regard for each group:
```python
{'negative': 0.98, 'other': 0.04, 'neutral': 0.03, 'positive': 0.0}
```
With the `aggregation='average'` option, this measurement will output the average regard for each category (negative, positive, neutral, other):
```python
{'neutral': 0.37, 'negative': 0.57, 'other': 0.05, 'positive': 0.01}
```
### Examples
Example 1 (single input):
```python
>>> regard = evaluate.load("regard")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> results = regard.compute(data = group1)
>>> for d in results['regard']:
... print({l['label']: round(l['score'],2) for l in d})
{'neutral': 0.95, 'positive': 0.02, 'negative': 0.02, 'other': 0.01}
{'negative': 0.97, 'other': 0.02, 'neutral': 0.01, 'positive': 0.0}
```
Example 2 (comparison mode):
```python
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> results = regard.compute(data = group1, references = group2)
>>> print({k: round(v, 2) for k, v in results['regard_difference'].items()})
{'neutral': 0.46, 'positive': 0.01, 'negative': -0.46, 'other': -0.01}
```
Example 3 (returns the maximum regard score):
```python
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> results = regard.compute(data = group1, references = group2, aggregation = "maximum")
>>> print({k: round(v, 2) for k, v in results['max_data_regard'].items()})
{'neutral': 0.95, 'positive': 0.02, 'negative': 0.97, 'other': 0.02}
>>> print({k: round(v, 2) for k, v in results['max_references_regard'].items()})
{'negative': 0.98, 'other': 0.04, 'neutral': 0.03, 'positive': 0.0}
```
Example 4 (returns the average regard score):
```python
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> results = regard.compute(data = group1, references = group2, aggregation = "average")
>>> print({k: round(v, 2) for k, v in results['average_data_regard'].items()})
{'neutral': 0.48, 'positive': 0.01, 'negative': 0.5, 'other': 0.01}
>>> print({k: round(v, 2) for k, v in results['average_references_regard'].items()})
{'negative': 0.96, 'other': 0.02, 'neutral': 0.02, 'positive': 0.0}
```
## Citation(s)
@article{https://doi.org/10.48550/arxiv.1909.01326,
doi = {10.48550/ARXIV.1909.01326},
url = {https://arxiv.org/abs/1909.01326},
author = {Sheng, Emily and Chang, Kai-Wei and Natarajan, Premkumar and Peng, Nanyun},
title = {The Woman Worked as a Babysitter: On Biases in Language Generation},
publisher = {arXiv},
year = {2019}
}
## Further References
- [`nlg-bias` library](https://github.com/ewsheng/nlg-bias/)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("regard")
launch_gradio_widget(module)
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Regard measurement. """
from collections import defaultdict
from operator import itemgetter
from statistics import mean
import datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import evaluate
logger = evaluate.logging.get_logger(__name__)
_CITATION = """
@article{https://doi.org/10.48550/arxiv.1909.01326,
doi = {10.48550/ARXIV.1909.01326},
url = {https://arxiv.org/abs/1909.01326},
author = {Sheng, Emily and Chang, Kai-Wei and Natarajan, Premkumar and Peng, Nanyun},
title = {The Woman Worked as a Babysitter: On Biases in Language Generation},
publisher = {arXiv},
year = {2019}
}
"""
_DESCRIPTION = """\
Regard aims to measure language polarity towards and social perceptions of a demographic (e.g. gender, race, sexual orientation).
"""
_KWARGS_DESCRIPTION = """
Compute the regard of the input sentences.
Args:
`data` (list of str): prediction/candidate sentences, e.g. sentences describing a given demographic group.
`references` (list of str) (optional): reference/comparison sentences, e.g. sentences describing a different demographic group to compare against.
`aggregation` (str) (optional): determines the type of aggregation performed.
If set to `None`, the difference between the regard scores for the two groups is returned.
Otherwise:
- 'average' : returns the average regard for each category (negative, positive, neutral, other) for each group
- 'maximum': returns the maximum regard for each group
Returns:
With only `data` as input (default config):
`regard` : the regard scores of each string in the input list (if no aggregation is specified)
`average_regard`: the average regard for each category (negative, positive, neutral, other) (if `aggregation` = `average`)
`max_regard`: the maximum regard across all input strings (if `aggregation` = `maximum`)
With `data` and `references` as input (`compare` config):
`regard_difference`: the difference between the regard scores for the two groups (if no aggregation is specified)
`average_data_regard` and `average_references_regard`: the average regard for each category (negative, positive, neutral, other) (if `aggregation` = `average`)
`max_data_regard` and `max_references_regard`: the maximum regard for each group (if `aggregation` = `maximum`)
Examples:
Example 1 (single input):
>>> regard = evaluate.load("regard")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> results = regard.compute(data = group1)
>>> for d in results['regard']:
... print({l['label']: round(l['score'],2) for l in d})
{'neutral': 0.95, 'positive': 0.02, 'negative': 0.02, 'other': 0.01}
{'negative': 0.97, 'other': 0.02, 'neutral': 0.01, 'positive': 0.0}
Example 2 (comparison mode):
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> results = regard.compute(data = group1, references = group2)
>>> print({k: round(v, 2) for k, v in results['regard_difference'].items()})
{'neutral': 0.46, 'positive': 0.01, 'negative': -0.46, 'other': -0.01}
Example 3 (returns the maximum regard score per category):
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> results = regard.compute(data = group1, references = group2, aggregation = "maximum")
>>> print({k: round(v, 2) for k, v in results['max_data_regard'].items()})
{'neutral': 0.95, 'positive': 0.02, 'negative': 0.97, 'other': 0.02}
>>> print({k: round(v, 2) for k, v in results['max_references_regard'].items()})
{'negative': 0.98, 'other': 0.04, 'neutral': 0.03, 'positive': 0.0}
Example 4 (returns the average regard score):
>>> regard = evaluate.load("regard", "compare")
>>> group1 = ['xyz are described as mean', 'xyz are thought of as being too ambitious']
>>> group2 = ['xyz are known for making too much noise', 'xyz are described as often violent']
>>> results = regard.compute(data = group1, references = group2, aggregation = "average")
>>> print({k: round(v, 2) for k, v in results['average_data_regard'].items()})
{'neutral': 0.48, 'positive': 0.01, 'negative': 0.5, 'other': 0.01}
>>> print({k: round(v, 2) for k, v in results['average_references_regard'].items()})
{'negative': 0.96, 'other': 0.02, 'neutral': 0.02, 'positive': 0.0}
"""
def regard(group, regard_classifier):
group_scores = defaultdict(list)
group_regard = regard_classifier(group)
for pred in group_regard:
for pred_score in pred:
group_scores[pred_score["label"]].append(pred_score["score"])
return group_regard, dict(group_scores)
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Regard(evaluate.Measurement):
def _info(self):
if self.config_name not in ["compare", "default"]:
raise KeyError("You should supply a configuration name selected in " '["compare", "default"]')
return evaluate.MeasurementInfo(
module_type="measurement",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"data": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
if self.config_name == "compare"
else {
"data": datasets.Value("string", id="sequence"),
}
),
codebase_urls=[],
reference_urls=[],
)
def _download_and_prepare(self, dl_manager):
regard_tokenizer = AutoTokenizer.from_pretrained("sasha/regardv3")
regard_model = AutoModelForSequenceClassification.from_pretrained("sasha/regardv3")
self.regard_classifier = pipeline(
"text-classification", model=regard_model, top_k=4, tokenizer=regard_tokenizer, truncation=True
)
def _compute(
self,
data,
references=None,
aggregation=None,
):
if self.config_name == "compare":
pred_scores, pred_regard = regard(data, self.regard_classifier)
ref_scores, ref_regard = regard(references, self.regard_classifier)
pred_mean = {k: mean(v) for k, v in pred_regard.items()}
pred_max = {k: max(v) for k, v in pred_regard.items()}
ref_mean = {k: mean(v) for k, v in ref_regard.items()}
ref_max = {k: max(v) for k, v in ref_regard.items()}
if aggregation == "maximum":
return {
"max_data_regard": pred_max,
"max_references_regard": ref_max,
}
elif aggregation == "average":
return {"average_data_regard": pred_mean, "average_references_regard": ref_mean}
else:
return {"regard_difference": {key: pred_mean[key] - ref_mean.get(key, 0) for key in pred_mean}}
else:
pred_scores, pred_regard = regard(data, self.regard_classifier)
pred_mean = {k: mean(v) for k, v in pred_regard.items()}
pred_max = {k: max(v) for k, v in pred_regard.items()}
if aggregation == "maximum":
return {"max_regard": pred_max}
elif aggregation == "average":
return {"average_regard": pred_mean}
else:
return {"regard": pred_scores}
git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER}
transformers
torch
---
title: Text Duplicates
emoji: 🤗
colorFrom: green
colorTo: purple
sdk: gradio
sdk_version: 3.0.2
app_file: app.py
pinned: false
tags:
- evaluate
- measurement
description: >-
Returns the fraction of duplicated strings in the input.
---
# Measurement Card for Text Duplicates
## Measurement Description
The `text_duplicates` measurement returns the fraction of duplicated strings in the input data.
## How to Use
This measurement requires a list of strings as input:
```python
>>> data = ["hello sun","hello moon", "hello sun"]
>>> duplicates = evaluate.load("text_duplicates")
>>> results = duplicates.compute(data=data)
```
### Inputs
- **data** (list of `str`): The input list of strings for which the duplicates are calculated.
### Output Values
- **duplicate_fraction** (`float`): the fraction of duplicated strings in the input.
- **duplicates_dict** (`dict`): (optional) a dictionary mapping each duplicated string to the number of times it appears.
By default, this measurement outputs a dictionary containing the fraction of duplicates in the input string(s) (`duplicate_fraction`):
```python
{'duplicate_fraction': 0.33333333333333337}
```
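The fraction is one minus the ratio of unique strings to total strings; the measurement hashes each stripped string with `hashlib.md5` before comparing. A minimal sketch of that computation:
```python
import hashlib

data = ["hello sun", "hello moon", "hello sun"]
unique = len({hashlib.md5(d.strip().encode("utf-8")).hexdigest() for d in data})
duplicate_fraction = 1 - unique / len(data)  # 1 - 2/3 ≈ 0.33
```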
With the `list_duplicates=True` option, this measurement will also output a dictionary mapping each duplicated string to its count (`duplicates_dict`):
```python
{'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}}
```
Warning: the `list_duplicates=True` option can be memory-intensive for large datasets.
### Examples
Example with no duplicates
```python
>>> data = ["foo", "bar", "foobar"]
>>> duplicates = evaluate.load("text_duplicates")
>>> results = duplicates.compute(data=data)
>>> print(results)
{'duplicate_fraction': 0.0}
```
Example with multiple duplicates and `list_duplicates=True`:
```python
>>> data = ["hello sun", "goodbye moon", "hello sun", "foo bar", "foo bar"]
>>> duplicates = evaluate.load("text_duplicates")
>>> results = duplicates.compute(data=data, list_duplicates=True)
>>> print(results)
{'duplicate_fraction': 0.4, 'duplicates_dict': {'hello sun': 2, 'foo bar': 2}}
```
## Citation(s)
## Further References
- [`hashlib` library](https://docs.python.org/3/library/hashlib.html)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("text_duplicates")
launch_gradio_widget(module)
git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER}
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
from collections import Counter
import datasets
import evaluate
logger = evaluate.logging.get_logger(__name__)
_DESCRIPTION = """
Returns the fraction of duplicated strings in the input.
"""
_KWARGS_DESCRIPTION = """
Args:
`data`: a list of `str` to be checked for duplicates.
Returns:
`duplicate_fraction` (`float`) : the fraction of strings that are duplicated.
`duplicates_dict` (`dict`) (optional): a dictionary mapping each duplicated string to the number of times it appears.
Examples:
>>> data = ["hello sun","hello moon", "hello sun"]
>>> duplicates = evaluate.load("text_duplicates")
>>> results = duplicates.compute(data=data)
>>> print(results)
{'duplicate_fraction': 0.33333333333333337}
>>> data = ["hello sun","hello moon", "hello sun"]
>>> duplicates = evaluate.load("text_duplicates")
>>> results = duplicates.compute(data=data, list_duplicates=True)
>>> print(results)
{'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}}
"""
# TODO: Add BibTeX citation
_CITATION = ""
def get_hash(example):
"""Get the hash of a string"""
return hashlib.md5(example.strip().encode("utf-8")).hexdigest()
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TextDuplicates(evaluate.Measurement):
"""This measurement returns the duplicate strings contained in the input(s)."""
def _info(self):
# TODO: Specifies the evaluate.MeasurementInfo object
return evaluate.MeasurementInfo(
# This is the description that will appear on the modules page.
module_type="measurement",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features(
{
"data": datasets.Value("string"),
}
),
)
def _compute(self, data, list_duplicates=False):
"""Returns the duplicates contained in the input data and the number of times they are repeated."""
if list_duplicates:
logger.warning("This functionality can be memory-intensive for large datasets!")
n_dedup = len(set([get_hash(d) for d in data]))
c = Counter(data)
duplicates = {k: v for k, v in c.items() if v > 1}
return {"duplicate_fraction": 1 - (n_dedup / len(data)), "duplicates_dict": duplicates}
else:
n_dedup = len(set([get_hash(d) for d in data]))
return {"duplicate_fraction": 1 - (n_dedup / len(data))}
---
title: Toxicity
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.0.2
app_file: app.py
pinned: false
tags:
- evaluate
- measurement
description: >-
The toxicity measurement aims to quantify the toxicity of the input texts using a pretrained hate speech classification model.
---
# Measurement Card for Toxicity
## Measurement description
The toxicity measurement aims to quantify the toxicity of the input texts using a pretrained hate speech classification model.
## How to use
The default model used is [roberta-hate-speech-dynabench-r4](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target). In this model, ‘hate’ is defined as “abusive speech targeting specific group characteristics, such as ethnic origin, religion, gender, or sexual orientation.” Definitions used by other classifiers may vary.
When loading the measurement, you can also specify another model:
```python
toxicity = evaluate.load("toxicity", 'DaNLP/da-electra-hatespeech-detection', module_type="measurement")
```
The model should be compatible with the AutoModelForSequenceClassification class.
For more information, see [the AutoModelForSequenceClassification documentation]( https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForSequenceClassification).
Args:
`predictions` (list of str): prediction/candidate sentences
`toxic_label` (str) (optional): the toxic label that you want to detect, depending on the labels that the model has been trained on.
This can be found using the model's `id2label` config attribute, e.g.:
```python
>>> model = AutoModelForSequenceClassification.from_pretrained("DaNLP/da-electra-hatespeech-detection")
>>> model.config.id2label
{0: 'not offensive', 1: 'offensive'}
```
In this case, the `toxic_label` would be `offensive`.
`aggregation` (optional): determines the type of aggregation performed on the data. If set to `None`, the scores for each prediction are returned.
Otherwise:
- 'maximum': returns the maximum toxicity over all predictions
- 'ratio': the percentage of predictions with toxicity above a certain threshold.
`threshold` (`float`) (optional): the toxicity detection threshold used for calculating the 'ratio' aggregation, described above. The default threshold is 0.5, based on the one established by [RealToxicityPrompts](https://arxiv.org/abs/2009.11462).
## Output values
`toxicity`: a list of toxicity scores, one for each sentence in `predictions` (default behavior)
`max_toxicity`: the maximum toxicity over all scores (if `aggregation` = `maximum`)
`toxicity_ratio`: the fraction of predictions with toxicity >= the threshold, 0.5 by default (if `aggregation` = `ratio`)
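Both aggregations are simple reductions over the per-sentence scores. A minimal sketch, assuming `scores` is the list returned under the default behavior (values taken from the examples below):
```python
scores = [0.0002, 0.8564]  # per-sentence toxicity scores
threshold = 0.5            # default threshold

max_toxicity = max(scores)                                           # 0.8564
toxicity_ratio = sum(s >= threshold for s in scores) / len(scores)   # 0.5
```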
### Values from popular papers
## Examples
Example 1 (default behavior):
```python
>>> toxicity = evaluate.load("toxicity", module_type="measurement")
>>> input_texts = ["she went to the library", "he is a douchebag"]
>>> results = toxicity.compute(predictions=input_texts)
>>> print([round(s, 4) for s in results["toxicity"]])
[0.0002, 0.8564]
```
Example 2 (returns ratio of toxic sentences):
```python
>>> toxicity = evaluate.load("toxicity", module_type="measurement")
>>> input_texts = ["she went to the library", "he is a douchebag"]
>>> results = toxicity.compute(predictions=input_texts, aggregation="ratio")
>>> print(results['toxicity_ratio'])
0.5
```
Example 3 (returns the maximum toxicity score):
```python
>>> toxicity = evaluate.load("toxicity", module_type="measurement")
>>> input_texts = ["she went to the library", "he is a douchebag"]
>>> results = toxicity.compute(predictions=input_texts, aggregation="maximum")
>>> print(round(results['max_toxicity'], 4))
0.8564
```
Example 4 (uses a custom model):
```python
>>> toxicity = evaluate.load("toxicity", 'DaNLP/da-electra-hatespeech-detection')
>>> input_texts = ["she went to the library", "he is a douchebag"]
>>> results = toxicity.compute(predictions=input_texts, toxic_label='offensive')
>>> print([round(s, 4) for s in results["toxicity"]])
[0.0176, 0.0203]
```
## Citation
```bibtex
@inproceedings{vidgen2021lftw,
title={Learning from the Worst: Dynamically Generated Datasets to Improve Online Hate Detection},
author={Bertie Vidgen and Tristan Thrush and Zeerak Waseem and Douwe Kiela},
booktitle={ACL},
year={2021}
}
```
```bibtex
@article{gehman2020realtoxicityprompts,
title={Realtoxicityprompts: Evaluating neural toxic degeneration in language models},
author={Gehman, Samuel and Gururangan, Suchin and Sap, Maarten and Choi, Yejin and Smith, Noah A},
journal={arXiv preprint arXiv:2009.11462},
year={2020}
}
```
## Further References
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("toxicity")
launch_gradio_widget(module)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
transformers
torch
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Toxicity detection measurement. """
import datasets
from transformers import pipeline
import evaluate
logger = evaluate.logging.get_logger(__name__)
_CITATION = """
@inproceedings{vidgen2021lftw,
title={Learning from the Worst: Dynamically Generated Datasets to Improve Online Hate Detection},
author={Bertie Vidgen and Tristan Thrush and Zeerak Waseem and Douwe Kiela},
booktitle={ACL},
year={2021}
}
"""
_DESCRIPTION = """\
The toxicity measurement aims to quantify the toxicity of the input texts using a pretrained hate speech classification model.
"""
_KWARGS_DESCRIPTION = """
Compute the toxicity of the input sentences.
Args:
`predictions` (list of str): prediction/candidate sentences
`toxic_label` (str) (optional): the toxic label that you want to detect, depending on the labels that the model has been trained on.
This can be found using the model's `id2label` config attribute, e.g.:
model = AutoModelForSequenceClassification.from_pretrained("DaNLP/da-electra-hatespeech-detection")
print(model.config.id2label)
{0: 'not offensive', 1: 'offensive'}
In this case, the `toxic_label` would be 'offensive'.
`aggregation` (optional): determines the type of aggregation performed on the data. If set to `None`, the scores for each prediction are returned.
Otherwise:
- 'maximum': returns the maximum toxicity over all predictions
- 'ratio': the percentage of predictions with toxicity above a certain threshold.
`threshold` (`float`) (optional): the toxicity detection threshold used for calculating the 'ratio' aggregation, described above.
The default threshold is 0.5, based on the one established by [RealToxicityPrompts](https://arxiv.org/abs/2009.11462).
Returns:
`toxicity`: a list of toxicity scores, one for each sentence in `predictions` (default behavior)
`max_toxicity`: the maximum toxicity over all scores (if `aggregation` = `maximum`)
`toxicity_ratio`: the fraction of predictions with toxicity >= the threshold, 0.5 by default (if `aggregation` = `ratio`)
Examples:
Example 1 (default behavior):
>>> toxicity = evaluate.load("toxicity", module_type="measurement")
>>> input_texts = ["she went to the library", "he is a douchebag"]
>>> results = toxicity.compute(predictions=input_texts)
>>> print([round(s, 4) for s in results["toxicity"]])
[0.0002, 0.8564]
Example 2 (returns ratio of toxic sentences):
>>> toxicity = evaluate.load("toxicity", module_type="measurement")
>>> input_texts = ["she went to the library", "he is a douchebag"]
>>> results = toxicity.compute(predictions=input_texts, aggregation="ratio")
>>> print(results['toxicity_ratio'])
0.5
Example 3 (returns the maximum toxicity score):
>>> toxicity = evaluate.load("toxicity", module_type="measurement")
>>> input_texts = ["she went to the library", "he is a douchebag"]
>>> results = toxicity.compute(predictions=input_texts, aggregation="maximum")
>>> print(round(results['max_toxicity'], 4))
0.8564
Example 4 (uses a custom model):
>>> toxicity = evaluate.load("toxicity", 'DaNLP/da-electra-hatespeech-detection')
>>> input_texts = ["she went to the library", "he is a douchebag"]
>>> results = toxicity.compute(predictions=input_texts, toxic_label='offensive')
>>> print([round(s, 4) for s in results["toxicity"]])
[0.0176, 0.0203]
"""
def toxicity(preds, toxic_classifier, toxic_label):
toxic_scores = []
if toxic_label not in toxic_classifier.model.config.id2label.values():
raise ValueError(
"The `toxic_label` that you specified is not part of the model labels. Run `model.config.id2label` to see what labels your model outputs."
)
for pred_toxic in toxic_classifier(preds):
hate_toxic = [r["score"] for r in pred_toxic if r["label"] == toxic_label][0]
toxic_scores.append(hate_toxic)
return toxic_scores
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Toxicity(evaluate.Measurement):
def _info(self):
return evaluate.MeasurementInfo(
module_type="measurement",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
}
),
codebase_urls=[],
reference_urls=[],
)
def _download_and_prepare(self, dl_manager):
if self.config_name == "default":
logger.warning("Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint")
model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
else:
model_name = self.config_name
self.toxic_classifier = pipeline("text-classification", model=model_name, top_k=99999, truncation=True)
def _compute(self, predictions, aggregation="all", toxic_label="hate", threshold=0.5):
scores = toxicity(predictions, self.toxic_classifier, toxic_label)
if aggregation == "ratio":
return {"toxicity_ratio": sum(i >= threshold for i in scores) / len(scores)}
elif aggregation == "maximum":
return {"max_toxicity": max(scores)}
else:
return {"toxicity": scores}
---
title: Word Count
emoji: 🤗
colorFrom: green
colorTo: purple
sdk: gradio
sdk_version: 3.0.2
app_file: app.py
pinned: false
tags:
- evaluate
- measurement
description: >-
Returns the total number of words, and the number of unique words in the input data.
---
# Measurement Card for Word Count
## Measurement Description
The `word_count` measurement returns the total number of words and the number of unique words in the input string(s), using scikit-learn's [`CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html).
## How to Use
This measurement requires a list of strings as input:
```python
>>> data = ["hello world and hello moon"]
>>> wordcount= evaluate.load("word_count")
>>> results = wordcount.compute(data=data)
```
### Inputs
- **data** (list of `str`): The input list of strings for which the words are counted.
- **max_vocab** (`int`): (optional) the maximum vocabulary size, i.e. the number of most frequent words to consider (useful if the dataset is very large).
### Output Values
- **total_word_count** (`int`): the total number of words in the input string(s).
- **unique_words** (`int`): the number of unique words in the input string(s).
Output Example(s):
```python
{'total_word_count': 5, 'unique_words': 4}
```
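Both values are read off the `CountVectorizer` document-term matrix: the total word count is the sum of all matrix entries, and the unique word count is the number of columns. A minimal sketch mirroring that computation:
```python
from sklearn.feature_extraction.text import CountVectorizer

data = ["hello world and hello moon"]
matrix = CountVectorizer().fit_transform(data)

total_word_count = int(matrix.sum())  # 5
unique_words = matrix.shape[1]        # 4
```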
### Examples
Example for a single string
```python
>>> data = ["hello sun and goodbye moon"]
>>> wordcount = evaluate.load("word_count")
>>> results = wordcount.compute(data=data)
>>> print(results)
{'total_word_count': 5, 'unique_words': 5}
```
Example for multiple strings
```python
>>> data = ["hello sun and goodbye moon", "foo bar foo bar"]
>>> wordcount = evaluate.load("word_count")
>>> results = wordcount.compute(data=data)
>>> print(results)
{'total_word_count': 9, 'unique_words': 7}
```
Example for a dataset from 🤗 Datasets:
```python
>>> imdb = datasets.load_dataset('imdb', split = 'train')
>>> wordcount = evaluate.load("word_count")
>>> results = wordcount.compute(data=imdb['text'])
>>> print(results)
{'total_word_count': 5678573, 'unique_words': 74849}
```
## Citation(s)
## Further References
- [Sklearn `CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("word_count")
launch_gradio_widget(module)
git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER}
scikit-learn~=0.0
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datasets
from sklearn.feature_extraction.text import CountVectorizer
import evaluate
_DESCRIPTION = """
Returns the total number of words, and the number of unique words in the input data.
"""
_KWARGS_DESCRIPTION = """
Args:
`data`: a list of `str` for which the words are counted.
`max_vocab` (optional): the maximum vocabulary size, i.e. the number of most frequent words to consider (useful if the dataset is very large)
Returns:
`total_word_count` (`int`) : the total number of words in the input string(s)
`unique_words` (`int`) : the number of unique words in the input list of strings.
Examples:
>>> data = ["hello world and hello moon"]
>>> wordcount= evaluate.load("word_count")
>>> results = wordcount.compute(data=data)
>>> print(results)
{'total_word_count': 5, 'unique_words': 4}
"""
_CITATION = ""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class WordCount(evaluate.Measurement):
"""This measurement returns the total number of words and the number of unique words
in the input string(s)."""
def _info(self):
return evaluate.MeasurementInfo(
# This is the description that will appear on the modules page.
module_type="measurement",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"data": datasets.Value("string"),
}
),
)
def _compute(self, data, max_vocab=None):
"""Returns the number of unique words in the input data"""
count_vectorizer = CountVectorizer(max_features=max_vocab)
document_matrix = count_vectorizer.fit_transform(data)
word_count = document_matrix.sum()
unique_words = document_matrix.shape[1]
return {"total_word_count": word_count, "unique_words": unique_words}
---
title: Word Length
emoji: 🤗
colorFrom: green
colorTo: purple
sdk: gradio
sdk_version: 3.0.2
app_file: app.py
pinned: false
tags:
- evaluate
- measurement
description: >-
Returns the average length (in terms of the number of words) of the input data.
---
# Measurement Card for Word Length
## Measurement Description
The `word_length` measurement returns the average length of the input strings, measured in number of words, based on tokenization using [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html).
## How to Use
This measurement requires a list of strings as input:
```python
>>> data = ["hello world"]
>>> wordlength = evaluate.load("word_length", module_type="measurement")
>>> results = wordlength.compute(data=data)
```
### Inputs
- **data** (list of `str`): The input list of strings for which the word length is calculated.
- **tokenizer** (`Callable`) (optional): the function used to tokenize `data`. The default tokenizer is [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html). This can be replaced by any function that takes a string as input and returns a list of tokens as output.
### Output Values
- **average_word_length**(`float`): the average number of words in the input string(s).
Output Example(s):
```python
{"average_word_length": 245}
```
This measurement outputs a dictionary containing the average number of words in the input string(s) (`average_word_length`).
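The underlying computation is simply the mean number of tokens per input string. A minimal sketch under that assumption, with NLTK's `word_tokenize` standing in as the default tokenizer described above:
```python
from statistics import mean

import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt")  # tokenizer data, if not already available

data = ["hello sun and goodbye moon", "foo bar foo bar"]
average_word_length = mean(len(word_tokenize(d)) for d in data)  # (5 + 4) / 2 = 4.5
```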
### Examples
Example for a single string
```python
>>> data = ["hello sun and goodbye moon"]
>>> wordlength = evaluate.load("word_length", module_type="measurement")
>>> results = wordlength.compute(data=data)
>>> print(results)
{'average_word_length': 5}
```
Example for multiple strings
```python
>>> data = ["hello sun and goodbye moon", "foo bar foo bar"]
>>> wordlength = evaluate.load("word_length", module_type="measurement")
>>> results = wordlength.compute(data=data)
>>> print(results)
{'average_word_length': 4.5}
```
## Citation(s)
## Further References
- [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("word_length", module_type="measurement")
launch_gradio_widget(module)