Commit 25991f98 authored by hepj's avatar hepj
Browse files

修改readme

parent ac192496
Pipeline #1415 failed with stages
in 0 seconds
"""
Official evaluation script for ReCoRD v1.0.
(Some functions are adopted from the SQuAD evaluation script.)
"""
import argparse
import json
import re
import string
import sys
from collections import Counter
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
return re.sub(r"\b(a|an|the)\b", " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def f1_score(prediction, ground_truth):
prediction_tokens = normalize_answer(prediction).split()
ground_truth_tokens = normalize_answer(ground_truth).split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
return 0
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
return f1
def exact_match_score(prediction, ground_truth):
return normalize_answer(prediction) == normalize_answer(ground_truth)
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def evaluate(dataset, predictions):
f1 = exact_match = total = 0
correct_ids = []
for passage in dataset:
for qa in passage["qas"]:
total += 1
if qa["id"] not in predictions:
message = f'Unanswered question {qa["id"]} will receive score 0.'
print(message, file=sys.stderr)
continue
ground_truths = list(map(lambda x: x["text"], qa["answers"]))
prediction = predictions[qa["id"]]
_exact_match = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
if int(_exact_match) == 1:
correct_ids.append(qa["id"])
exact_match += _exact_match
f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)
exact_match = exact_match / total
f1 = f1 / total
return {"exact_match": exact_match, "f1": f1}, correct_ids
if __name__ == "__main__":
expected_version = "1.0"
parser = argparse.ArgumentParser("Official evaluation script for ReCoRD v1.0.")
parser.add_argument("data_file", help="The dataset file in JSON format.")
parser.add_argument("pred_file", help="The model prediction file in JSON format.")
parser.add_argument("--output_correct_ids", action="store_true", help="Output the correctly answered query IDs.")
args = parser.parse_args()
with open(args.data_file) as data_file:
dataset_json = json.load(data_file)
if dataset_json["version"] != expected_version:
print(
f'Evaluation expects v-{expected_version}, but got dataset with v-{dataset_json["version"]}',
file=sys.stderr,
)
dataset = dataset_json["data"]
with open(args.pred_file) as pred_file:
predictions = json.load(pred_file)
metrics, correct_ids = evaluate(dataset, predictions)
if args.output_correct_ids:
print(f"Output {len(correct_ids)} correctly answered question IDs.")
with open("correct_ids.json", "w") as f:
json.dump(correct_ids, f)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
scikit-learn
\ No newline at end of file
# Copyright 2020 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The SuperGLUE benchmark metric."""
import datasets
from sklearn.metrics import f1_score, matthews_corrcoef
import evaluate
from .record_evaluation import evaluate as evaluate_record
_CITATION = """\
@article{wang2019superglue,
title={SuperGLUE: A Stickier Benchmark for General-Purpose Language Understanding Systems},
author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},
journal={arXiv preprint arXiv:1905.00537},
year={2019}
}
"""
_DESCRIPTION = """\
SuperGLUE (https://super.gluebenchmark.com/) is a new benchmark styled after
GLUE with a new set of more difficult language understanding tasks, improved
resources, and a new public leaderboard.
"""
_KWARGS_DESCRIPTION = """
Compute SuperGLUE evaluation metric associated to each SuperGLUE dataset.
Args:
predictions: list of predictions to score. Depending on the SuperGlUE subset:
- for 'record': list of question-answer dictionaries with the following keys:
- 'idx': index of the question as specified by the dataset
- 'prediction_text': the predicted answer text
- for 'multirc': list of question-answer dictionaries with the following keys:
- 'idx': index of the question-answer pair as specified by the dataset
- 'prediction': the predicted answer label
- otherwise: list of predicted labels
references: list of reference labels. Depending on the SuperGLUE subset:
- for 'record': list of question-answers dictionaries with the following keys:
- 'idx': index of the question as specified by the dataset
- 'answers': list of possible answers
- otherwise: list of reference labels
Returns: depending on the SuperGLUE subset:
- for 'record':
- 'exact_match': Exact match between answer and gold answer
- 'f1': F1 score
- for 'multirc':
- 'exact_match': Exact match between answer and gold answer
- 'f1_m': Per-question macro-F1 score
- 'f1_a': Average F1 score over all answers
- for 'axb':
'matthews_correlation': Matthew Correlation
- for 'cb':
- 'accuracy': Accuracy
- 'f1': F1 score
- for all others:
- 'accuracy': Accuracy
Examples:
>>> super_glue_metric = evaluate.load('super_glue', 'copa') # any of ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]
>>> predictions = [0, 1]
>>> references = [0, 1]
>>> results = super_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0}
>>> super_glue_metric = evaluate.load('super_glue', 'cb')
>>> predictions = [0, 1]
>>> references = [0, 1]
>>> results = super_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0, 'f1': 1.0}
>>> super_glue_metric = evaluate.load('super_glue', 'record')
>>> predictions = [{'idx': {'passage': 0, 'query': 0}, 'prediction_text': 'answer'}]
>>> references = [{'idx': {'passage': 0, 'query': 0}, 'answers': ['answer', 'another_answer']}]
>>> results = super_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'exact_match': 1.0, 'f1': 1.0}
>>> super_glue_metric = evaluate.load('super_glue', 'multirc')
>>> predictions = [{'idx': {'answer': 0, 'paragraph': 0, 'question': 0}, 'prediction': 0}, {'idx': {'answer': 1, 'paragraph': 2, 'question': 3}, 'prediction': 1}]
>>> references = [0, 1]
>>> results = super_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'exact_match': 1.0, 'f1_m': 1.0, 'f1_a': 1.0}
>>> super_glue_metric = evaluate.load('super_glue', 'axb')
>>> references = [0, 1]
>>> predictions = [0, 1]
>>> results = super_glue_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'matthews_correlation': 1.0}
"""
def simple_accuracy(preds, labels):
return float((preds == labels).mean())
def acc_and_f1(preds, labels, f1_avg="binary"):
acc = simple_accuracy(preds, labels)
f1 = float(f1_score(y_true=labels, y_pred=preds, average=f1_avg))
return {
"accuracy": acc,
"f1": f1,
}
def evaluate_multirc(ids_preds, labels):
"""
Computes F1 score and Exact Match for MultiRC predictions.
"""
question_map = {}
for id_pred, label in zip(ids_preds, labels):
question_id = f'{id_pred["idx"]["paragraph"]}-{id_pred["idx"]["question"]}'
pred = id_pred["prediction"]
if question_id in question_map:
question_map[question_id].append((pred, label))
else:
question_map[question_id] = [(pred, label)]
f1s, ems = [], []
for question, preds_labels in question_map.items():
question_preds, question_labels = zip(*preds_labels)
f1 = f1_score(y_true=question_labels, y_pred=question_preds, average="macro")
f1s.append(f1)
em = int(sum(p == l for p, l in preds_labels) == len(preds_labels))
ems.append(em)
f1_m = float(sum(f1s) / len(f1s))
em = sum(ems) / len(ems)
f1_a = float(f1_score(y_true=labels, y_pred=[id_pred["prediction"] for id_pred in ids_preds]))
return {"exact_match": em, "f1_m": f1_m, "f1_a": f1_a}
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class SuperGlue(evaluate.Metric):
def _info(self):
if self.config_name not in [
"boolq",
"cb",
"copa",
"multirc",
"record",
"rte",
"wic",
"wsc",
"wsc.fixed",
"axb",
"axg",
]:
raise KeyError(
"You should supply a configuration name selected in "
'["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]'
)
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(self._get_feature_types()),
codebase_urls=[],
reference_urls=[],
format="numpy" if not self.config_name == "record" and not self.config_name == "multirc" else None,
)
def _get_feature_types(self):
if self.config_name == "record":
return {
"predictions": {
"idx": {
"passage": datasets.Value("int64"),
"query": datasets.Value("int64"),
},
"prediction_text": datasets.Value("string"),
},
"references": {
"idx": {
"passage": datasets.Value("int64"),
"query": datasets.Value("int64"),
},
"answers": datasets.Sequence(datasets.Value("string")),
},
}
elif self.config_name == "multirc":
return {
"predictions": {
"idx": {
"answer": datasets.Value("int64"),
"paragraph": datasets.Value("int64"),
"question": datasets.Value("int64"),
},
"prediction": datasets.Value("int64"),
},
"references": datasets.Value("int64"),
}
else:
return {
"predictions": datasets.Value("int64"),
"references": datasets.Value("int64"),
}
def _compute(self, predictions, references):
if self.config_name == "axb":
return {"matthews_correlation": matthews_corrcoef(references, predictions)}
elif self.config_name == "cb":
return acc_and_f1(predictions, references, f1_avg="macro")
elif self.config_name == "record":
dataset = [
{
"qas": [
{"id": ref["idx"]["query"], "answers": [{"text": ans} for ans in ref["answers"]]}
for ref in references
]
}
]
predictions = {pred["idx"]["query"]: pred["prediction_text"] for pred in predictions}
return evaluate_record(dataset, predictions)[0]
elif self.config_name == "multirc":
return evaluate_multirc(predictions, references)
elif self.config_name in ["copa", "rte", "wic", "wsc", "wsc.fixed", "boolq", "axg"]:
return {"accuracy": simple_accuracy(predictions, references)}
else:
raise KeyError(
"You should supply a configuration name selected in "
'["boolq", "cb", "copa", "multirc", "record", "rte", "wic", "wsc", "wsc.fixed", "axb", "axg",]'
)
---
title: TER
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a
hypothesis requires to match a reference translation. We use the implementation that is already present in sacrebleu
(https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the TERCOM implementation, which can be found
here: https://github.com/jhclark/tercom.
The implementation here is slightly different from sacrebleu in terms of the required input format. The length of
the references and hypotheses lists need to be the same, so you may need to transpose your references compared to
sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information.
---
# Metric Card for TER
## Metric Description
TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a hypothesis requires to match a reference translation. We use the implementation that is already present in [sacrebleu](https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the [TERCOM implementation](https://github.com/jhclark/tercom).
The implementation here is slightly different from sacrebleu in terms of the required input format. The length of the references and hypotheses lists need to be the same, so you may need to transpose your references compared to sacrebleu's required input format. See [this github issue](https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534).
See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information.
## How to Use
This metric takes, at minimum, predicted sentences and reference sentences:
```python
>>> predictions = ["does this sentence match??",
... "what about this sentence?",
... "What did the TER metric user say to the developer?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
... ["Your jokes are...", "...TERrible"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... case_sensitive=True)
>>> print(results)
{'score': 150.0, 'num_edits': 15, 'ref_length': 10.0}
```
### Inputs
This metric takes the following as input:
- **`predictions`** (`list` of `str`): The system stream (a sequence of segments).
- **`references`** (`list` of `list` of `str`): A list of one or more reference streams (each a sequence of segments).
- **`normalized`** (`boolean`): If `True`, applies basic tokenization and normalization to sentences. Defaults to `False`.
- **`ignore_punct`** (`boolean`): If `True`, applies basic tokenization and normalization to sentences. Defaults to `False`.
- **`support_zh_ja_chars`** (`boolean`): If `True`, tokenization/normalization supports processing of Chinese characters, as well as Japanese Kanji, Hiragana, Katakana, and Phonetic Extensions of Katakana. Only applies if `normalized = True`. Defaults to `False`.
- **`case_sensitive`** (`boolean`): If `False`, makes all predictions and references lowercase to ignore differences in case. Defaults to `False`.
### Output Values
This metric returns the following:
- **`score`** (`float`): TER score (num_edits / sum_ref_lengths * 100)
- **`num_edits`** (`int`): The cumulative number of edits
- **`ref_length`** (`float`): The cumulative average reference length
The output takes the following form:
```python
{'score': ter_score, 'num_edits': num_edits, 'ref_length': ref_length}
```
The metric can take on any value `0` and above. `0` is a perfect score, meaning the predictions exactly match the references and no edits were necessary. Higher scores are worse. Scores above 100 mean that the cumulative number of edits, `num_edits`, is higher than the cumulative length of the references, `ref_length`.
#### Values from Popular Papers
### Examples
Basic example with only predictions and references as inputs:
```python
>>> predictions = ["does this sentence match??",
... "what about this sentence?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... case_sensitive=True)
>>> print(results)
{'score': 62.5, 'num_edits': 5, 'ref_length': 8.0}
```
Example with `normalization = True`:
```python
>>> predictions = ["does this sentence match??",
... "what about this sentence?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... normalized=True,
... case_sensitive=True)
>>> print(results)
{'score': 57.14285714285714, 'num_edits': 6, 'ref_length': 10.5}
```
Example ignoring punctuation and capitalization, and everything matches:
```python
>>> predictions = ["does this sentence match??",
... "what about this sentence?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... ignore_punct=True,
... case_sensitive=False)
>>> print(results)
{'score': 0.0, 'num_edits': 0, 'ref_length': 8.0}
```
Example ignoring punctuation and capitalization, but with an extra (incorrect) sample:
```python
>>> predictions = ["does this sentence match??",
... "what about this sentence?",
... "What did the TER metric user say to the developer?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
... ["Your jokes are...", "...TERrible"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... ignore_punct=True,
... case_sensitive=False)
>>> print(results)
{'score': 100.0, 'num_edits': 10, 'ref_length': 10.0}
```
## Limitations and Bias
## Citation
```bibtex
@inproceedings{snover-etal-2006-study,
title = "A Study of Translation Edit Rate with Targeted Human Annotation",
author = "Snover, Matthew and
Dorr, Bonnie and
Schwartz, Rich and
Micciulla, Linnea and
Makhoul, John",
booktitle = "Proceedings of the 7th Conference of the Association for Machine Translation in the Americas: Technical Papers",
month = aug # " 8-12",
year = "2006",
address = "Cambridge, Massachusetts, USA",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2006.amta-papers.25",
pages = "223--231",
}
@inproceedings{post-2018-call,
title = "A Call for Clarity in Reporting {BLEU} Scores",
author = "Post, Matt",
booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-6319",
pages = "186--191",
}
```
## Further References
- See [the sacreBLEU github repo](https://github.com/mjpost/sacreBLEU#ter) for more information.
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("ter")
launch_gradio_widget(module)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
sacrebleu
\ No newline at end of file
# Copyright 2021 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" TER metric as available in sacrebleu. """
import datasets
import sacrebleu as scb
from packaging import version
from sacrebleu import TER
import evaluate
_CITATION = """\
@inproceedings{snover-etal-2006-study,
title = "A Study of Translation Edit Rate with Targeted Human Annotation",
author = "Snover, Matthew and
Dorr, Bonnie and
Schwartz, Rich and
Micciulla, Linnea and
Makhoul, John",
booktitle = "Proceedings of the 7th Conference of the Association for Machine Translation in the Americas: Technical Papers",
month = aug # " 8-12",
year = "2006",
address = "Cambridge, Massachusetts, USA",
publisher = "Association for Machine Translation in the Americas",
url = "https://aclanthology.org/2006.amta-papers.25",
pages = "223--231",
}
@inproceedings{post-2018-call,
title = "A Call for Clarity in Reporting {BLEU} Scores",
author = "Post, Matt",
booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-6319",
pages = "186--191",
}
"""
_DESCRIPTION = """\
TER (Translation Edit Rate, also called Translation Error Rate) is a metric to quantify the edit operations that a
hypothesis requires to match a reference translation. We use the implementation that is already present in sacrebleu
(https://github.com/mjpost/sacreBLEU#ter), which in turn is inspired by the TERCOM implementation, which can be found
here: https://github.com/jhclark/tercom.
The implementation here is slightly different from sacrebleu in terms of the required input format. The length of
the references and hypotheses lists need to be the same, so you may need to transpose your references compared to
sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
See the README.md file at https://github.com/mjpost/sacreBLEU#ter for more information.
"""
_KWARGS_DESCRIPTION = """
Produces TER scores alongside the number of edits and reference length.
Args:
predictions (list of str): The system stream (a sequence of segments).
references (list of list of str): A list of one or more reference streams (each a sequence of segments).
normalized (boolean): If `True`, applies basic tokenization and normalization to sentences. Defaults to `False`.
ignore_punct (boolean): If `True`, applies basic tokenization and normalization to sentences. Defaults to `False`.
support_zh_ja_chars (boolean): If `True`, tokenization/normalization supports processing of Chinese characters,
as well as Japanese Kanji, Hiragana, Katakana, and Phonetic Extensions of Katakana.
Only applies if `normalized = True`. Defaults to `False`.
case_sensitive (boolean): If `False`, makes all predictions and references lowercase to ignore differences in case. Defaults to `False`.
Returns:
'score' (float): TER score (num_edits / sum_ref_lengths * 100)
'num_edits' (int): The cumulative number of edits
'ref_length' (float): The cumulative average reference length
Examples:
Example 1:
>>> predictions = ["does this sentence match??",
... "what about this sentence?",
... "What did the TER metric user say to the developer?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
... ["Your jokes are...", "...TERrible"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... case_sensitive=True)
>>> print(results)
{'score': 150.0, 'num_edits': 15, 'ref_length': 10.0}
Example 2:
>>> predictions = ["does this sentence match??",
... "what about this sentence?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... case_sensitive=True)
>>> print(results)
{'score': 62.5, 'num_edits': 5, 'ref_length': 8.0}
Example 3:
>>> predictions = ["does this sentence match??",
... "what about this sentence?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... normalized=True,
... case_sensitive=True)
>>> print(results)
{'score': 57.14285714285714, 'num_edits': 6, 'ref_length': 10.5}
Example 4:
>>> predictions = ["does this sentence match??",
... "what about this sentence?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... ignore_punct=True,
... case_sensitive=False)
>>> print(results)
{'score': 0.0, 'num_edits': 0, 'ref_length': 8.0}
Example 5:
>>> predictions = ["does this sentence match??",
... "what about this sentence?",
... "What did the TER metric user say to the developer?"]
>>> references = [["does this sentence match", "does this sentence match!?!"],
... ["wHaT aBoUt ThIs SeNtEnCe?", "wHaT aBoUt ThIs SeNtEnCe?"],
... ["Your jokes are...", "...TERrible"]]
>>> ter = evaluate.load("ter")
>>> results = ter.compute(predictions=predictions,
... references=references,
... ignore_punct=True,
... case_sensitive=False)
>>> print(results)
{'score': 100.0, 'num_edits': 10, 'ref_length': 10.0}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Ter(evaluate.Metric):
def _info(self):
if version.parse(scb.__version__) < version.parse("1.4.12"):
raise ImportWarning(
"To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
'You can install it with `pip install "sacrebleu>=1.4.12"`.'
)
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
homepage="http://www.cs.umd.edu/~snover/tercom/",
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
}
),
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
],
codebase_urls=["https://github.com/mjpost/sacreBLEU#ter"],
reference_urls=[
"https://github.com/jhclark/tercom",
],
)
def _compute(
self,
predictions,
references,
normalized: bool = False,
ignore_punct: bool = False,
support_zh_ja_chars: bool = False,
case_sensitive: bool = False,
):
# if only one reference is provided make sure we still use list of lists
if isinstance(references[0], str):
references = [[ref] for ref in references]
references_per_prediction = len(references[0])
if any(len(refs) != references_per_prediction for refs in references):
raise ValueError("Sacrebleu requires the same number of references for each prediction")
transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]
sb_ter = TER(
normalized=normalized,
no_punct=ignore_punct,
asian_support=support_zh_ja_chars,
case_sensitive=case_sensitive,
)
output = sb_ter.corpus_score(predictions, transformed_references)
return {"score": output.score, "num_edits": output.num_edits, "ref_length": output.ref_length}
---
title: TREC Eval
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
The TREC Eval metric combines a number of information retrieval metrics such as precision and nDCG. It is used to score rankings of retrieved documents with reference values.
---
# Metric Card for TREC Eval
## Metric Description
The TREC Eval metric combines a number of information retrieval metrics such as precision and normalized Discounted Cumulative Gain (nDCG). It is used to score rankings of retrieved documents with reference values.
## How to Use
```Python
from evaluate import load
trec_eval = load("trec_eval")
results = trec_eval.compute(predictions=[run], references=[qrel])
```
### Inputs
- **predictions** *(dict): a single retrieval run.*
- **query** *(int): Query ID.*
- **q0** *(str): Literal `"q0"`.*
- **docid** *(str): Document ID.*
- **rank** *(int): Rank of document.*
- **score** *(float): Score of document.*
- **system** *(str): Tag for current run.*
- **references** *(dict): a single qrel.*
- **query** *(int): Query ID.*
- **q0** *(str): Literal `"q0"`.*
- **docid** *(str): Document ID.*
- **rel** *(int): Relevance of document.*
### Output Values
- **runid** *(str): Run name.*
- **num_ret** *(int): Number of retrieved documents.*
- **num_rel** *(int): Number of relevant documents.*
- **num_rel_ret** *(int): Number of retrieved relevant documents.*
- **num_q** *(int): Number of queries.*
- **map** *(float): Mean average precision.*
- **gm_map** *(float): geometric mean average precision.*
- **bpref** *(float): binary preference score.*
- **Rprec** *(float): precision@R, where R is number of relevant documents.*
- **recip_rank** *(float): reciprocal rank*
- **P@k** *(float): precision@k (k in [5, 10, 15, 20, 30, 100, 200, 500, 1000]).*
- **NDCG@k** *(float): nDCG@k (k in [5, 10, 15, 20, 30, 100, 200, 500, 1000]).*
### Examples
A minimal example of looks as follows:
```Python
qrel = {
"query": [0],
"q0": ["q0"],
"docid": ["doc_1"],
"rel": [2]
}
run = {
"query": [0, 0],
"q0": ["q0", "q0"],
"docid": ["doc_2", "doc_1"],
"rank": [0, 1],
"score": [1.5, 1.2],
"system": ["test", "test"]
}
trec_eval = evaluate.load("trec_eval")
results = trec_eval.compute(references=[qrel], predictions=[run])
results["P@5"]
0.2
```
A more realistic use case with an examples from [`trectools`](https://github.com/joaopalotti/trectools):
```python
qrel = pd.read_csv("robust03_qrels.txt", sep="\s+", names=["query", "q0", "docid", "rel"])
qrel["q0"] = qrel["q0"].astype(str)
qrel = qrel.to_dict(orient="list")
run = pd.read_csv("input.InexpC2", sep="\s+", names=["query", "q0", "docid", "rank", "score", "system"])
run = run.to_dict(orient="list")
trec_eval = evaluate.load("trec_eval")
result = trec_eval.compute(run=[run], qrel=[qrel])
```
```python
result
{'runid': 'InexpC2',
'num_ret': 100000,
'num_rel': 6074,
'num_rel_ret': 3198,
'num_q': 100,
'map': 0.22485930431817494,
'gm_map': 0.10411523825735523,
'bpref': 0.217511695914079,
'Rprec': 0.2502547201167236,
'recip_rank': 0.6646545943335417,
'P@5': 0.44,
'P@10': 0.37,
'P@15': 0.34600000000000003,
'P@20': 0.30999999999999994,
'P@30': 0.2563333333333333,
'P@100': 0.1428,
'P@200': 0.09510000000000002,
'P@500': 0.05242,
'P@1000': 0.03198,
'NDCG@5': 0.4101480395089769,
'NDCG@10': 0.3806761417784469,
'NDCG@15': 0.37819463408955706,
'NDCG@20': 0.3686080836061317,
'NDCG@30': 0.352474353427451,
'NDCG@100': 0.3778329431025776,
'NDCG@200': 0.4119129817248979,
'NDCG@500': 0.4585354576461375,
'NDCG@1000': 0.49092149290805653}
```
## Limitations and Bias
The `trec_eval` metric requires the inputs to be in the TREC run and qrel formats for predictions and references.
## Citation
```bibtex
@inproceedings{palotti2019,
author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
series = {SIGIR'19},
year = {2019},
location = {Paris, France},
publisher = {ACM}
}
```
## Further References
- Homepage: https://github.com/joaopalotti/trectools
\ No newline at end of file
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("trec_eval")
launch_gradio_widget(module)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
trectools
\ No newline at end of file
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module to compute TREC evaluation scores."""
import datasets
import pandas as pd
from trectools import TrecEval, TrecQrel, TrecRun
import evaluate
_CITATION = """\
@inproceedings{palotti2019,
author = {Palotti, Joao and Scells, Harrisen and Zuccon, Guido},
title = {TrecTools: an open-source Python library for Information Retrieval practitioners involved in TREC-like campaigns},
series = {SIGIR'19},
year = {2019},
location = {Paris, France},
publisher = {ACM}
}
"""
# TODO: Add description of the module here
_DESCRIPTION = """\
The TREC Eval metric combines a number of information retrieval metrics such as \
precision and nDCG. It is used to score rankings of retrieved documents with reference values."""
# TODO: Add description of the arguments of the module here
_KWARGS_DESCRIPTION = """
Calculates TREC evaluation scores based on a run and qrel.
Args:
predictions: list containing a single run.
references: list containing a single qrel.
Returns:
dict: TREC evaluation scores.
Examples:
>>> trec = evaluate.load("trec_eval")
>>> qrel = {
... "query": [0],
... "q0": ["0"],
... "docid": ["doc_1"],
... "rel": [2]
... }
>>> run = {
... "query": [0, 0],
... "q0": ["q0", "q0"],
... "docid": ["doc_2", "doc_1"],
... "rank": [0, 1],
... "score": [1.5, 1.2],
... "system": ["test", "test"]
... }
>>> results = trec.compute(references=[qrel], predictions=[run])
>>> print(results["P@5"])
0.2
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class TRECEval(evaluate.Metric):
"""Compute TREC evaluation scores."""
def _info(self):
return evaluate.MetricInfo(
module_type="metric",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": {
"query": datasets.Sequence(datasets.Value("int64")),
"q0": datasets.Sequence(datasets.Value("string")),
"docid": datasets.Sequence(datasets.Value("string")),
"rank": datasets.Sequence(datasets.Value("int64")),
"score": datasets.Sequence(datasets.Value("float")),
"system": datasets.Sequence(datasets.Value("string")),
},
"references": {
"query": datasets.Sequence(datasets.Value("int64")),
"q0": datasets.Sequence(datasets.Value("string")),
"docid": datasets.Sequence(datasets.Value("string")),
"rel": datasets.Sequence(datasets.Value("int64")),
},
}
),
homepage="https://github.com/joaopalotti/trectools",
)
def _compute(self, references, predictions):
"""Returns the TREC evaluation scores."""
if len(predictions) > 1 or len(references) > 1:
raise ValueError(
f"You can only pass one prediction and reference per evaluation. You passed {len(predictions)} prediction(s) and {len(references)} reference(s)."
)
df_run = pd.DataFrame(predictions[0])
df_qrel = pd.DataFrame(references[0])
trec_run = TrecRun()
trec_run.filename = "placeholder.file"
trec_run.run_data = df_run
trec_qrel = TrecQrel()
trec_qrel.filename = "placeholder.file"
trec_qrel.qrels_data = df_qrel
trec_eval = TrecEval(trec_run, trec_qrel)
result = {}
result["runid"] = trec_eval.run.get_runid()
result["num_ret"] = trec_eval.get_retrieved_documents(per_query=False)
result["num_rel"] = trec_eval.get_relevant_documents(per_query=False)
result["num_rel_ret"] = trec_eval.get_relevant_retrieved_documents(per_query=False)
result["num_q"] = len(trec_eval.run.topics())
result["map"] = trec_eval.get_map(depth=10000, per_query=False, trec_eval=True)
result["gm_map"] = trec_eval.get_geometric_map(depth=10000, trec_eval=True)
result["bpref"] = trec_eval.get_bpref(depth=1000, per_query=False, trec_eval=True)
result["Rprec"] = trec_eval.get_rprec(depth=1000, per_query=False, trec_eval=True)
result["recip_rank"] = trec_eval.get_reciprocal_rank(depth=1000, per_query=False, trec_eval=True)
for v in [5, 10, 15, 20, 30, 100, 200, 500, 1000]:
result[f"P@{v}"] = trec_eval.get_precision(depth=v, per_query=False, trec_eval=True)
for v in [5, 10, 15, 20, 30, 100, 200, 500, 1000]:
result[f"NDCG@{v}"] = trec_eval.get_ndcg(depth=v, per_query=False, trec_eval=True)
return result
---
title: WER
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
Word error rate (WER) is a common metric of the performance of an automatic speech recognition system.
The general difficulty of measuring performance lies in the fact that the recognized word sequence can have a different length from the reference word sequence (supposedly the correct one). The WER is derived from the Levenshtein distance, working at the word level instead of the phoneme level. The WER is a valuable tool for comparing different systems as well as for evaluating improvements within one system. This kind of measurement, however, provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.
This problem is solved by first aligning the recognized word sequence with the reference (spoken) word sequence using dynamic string alignment. Examination of this issue is seen through a theory called the power law that states the correlation between perplexity and word error rate.
Word error rate can then be computed as:
WER = (S + D + I) / N = (S + D + I) / (S + D + C)
where
S is the number of substitutions,
D is the number of deletions,
I is the number of insertions,
C is the number of correct words,
N is the number of words in the reference (N=S+D+C).
This value indicates the average number of errors per reference word. The lower the value, the better the
performance of the ASR system with a WER of 0 being a perfect score.
---
# Metric Card for WER
## Metric description
Word error rate (WER) is a common metric of the performance of an automatic speech recognition (ASR) system.
The general difficulty of measuring the performance of ASR systems lies in the fact that the recognized word sequence can have a different length from the reference word sequence (supposedly the correct one). The WER is derived from the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance), working at the word level.
This problem is solved by first aligning the recognized word sequence with the reference (spoken) word sequence using dynamic string alignment. Examination of this issue is seen through a theory called the power law that states the correlation between [perplexity](https://huggingface.co/metrics/perplexity) and word error rate (see [this article](https://www.cs.cmu.edu/~roni/papers/eval-metrics-bntuw-9802.pdf) for further information).
Word error rate can then be computed as:
`WER = (S + D + I) / N = (S + D + I) / (S + D + C)`
where
`S` is the number of substitutions,
`D` is the number of deletions,
`I` is the number of insertions,
`C` is the number of correct words,
`N` is the number of words in the reference (`N=S+D+C`).
## How to use
The metric takes two inputs: references (a list of references for each speech input) and predictions (a list of transcriptions to score).
```python
from evaluate import load
wer = load("wer")
wer_score = wer.compute(predictions=predictions, references=references)
```
## Output values
This metric outputs a float representing the word error rate.
```
print(wer_score)
0.5
```
This value indicates the average number of errors per reference word.
The **lower** the value, the **better** the performance of the ASR system, with a WER of 0 being a perfect score.
### Values from popular papers
This metric is highly dependent on the content and quality of the dataset, and therefore users can expect very different values for the same model but on different datasets.
For example, datasets such as [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) report a WER in the 1.8-3.3 range, whereas ASR models evaluated on [Timit](https://huggingface.co/datasets/timit_asr) report a WER in the 8.3-20.4 range.
See the leaderboards for [LibriSpeech](https://paperswithcode.com/sota/speech-recognition-on-librispeech-test-clean) and [Timit](https://paperswithcode.com/sota/speech-recognition-on-timit) for the most recent values.
## Examples
Perfect match between prediction and reference:
```python
from evaluate import load
wer = load("wer")
predictions = ["hello world", "good night moon"]
references = ["hello world", "good night moon"]
wer_score = wer.compute(predictions=predictions, references=references)
print(wer_score)
0.0
```
Partial match between prediction and reference:
```python
from evaluate import load
wer = load("wer")
predictions = ["this is the prediction", "there is an other sample"]
references = ["this is the reference", "there is another one"]
wer_score = wer.compute(predictions=predictions, references=references)
print(wer_score)
0.5
```
No match between prediction and reference:
```python
from evaluate import load
wer = load("wer")
predictions = ["hello world", "good night moon"]
references = ["hi everyone", "have a great day"]
wer_score = wer.compute(predictions=predictions, references=references)
print(wer_score)
1.0
```
## Limitations and bias
WER is a valuable tool for comparing different systems as well as for evaluating improvements within one system. This kind of measurement, however, provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.
## Citation
```bibtex
@inproceedings{woodard1982,
author = {Woodard, J.P. and Nelson, J.T.,
year = {1982},
journal = {Workshop on standardisation for speech I/O technology, Naval Air Development Center, Warminster, PA},
title = {An information theoretic measure of speech recognition performance}
}
```
```bibtex
@inproceedings{morris2004,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}
```
## Further References
- [Word Error Rate -- Wikipedia](https://en.wikipedia.org/wiki/Word_error_rate)
- [Hugging Face Tasks -- Automatic Speech Recognition](https://huggingface.co/tasks/automatic-speech-recognition)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("wer")
launch_gradio_widget(module)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
jiwer
\ No newline at end of file
# Copyright 2021 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Word Error Ratio (WER) metric. """
import datasets
from jiwer import compute_measures
import evaluate
_CITATION = """\
@inproceedings{inproceedings,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}
"""
_DESCRIPTION = """\
Word error rate (WER) is a common metric of the performance of an automatic speech recognition system.
The general difficulty of measuring performance lies in the fact that the recognized word sequence can have a different length from the reference word sequence (supposedly the correct one). The WER is derived from the Levenshtein distance, working at the word level instead of the phoneme level. The WER is a valuable tool for comparing different systems as well as for evaluating improvements within one system. This kind of measurement, however, provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.
This problem is solved by first aligning the recognized word sequence with the reference (spoken) word sequence using dynamic string alignment. Examination of this issue is seen through a theory called the power law that states the correlation between perplexity and word error rate.
Word error rate can then be computed as:
WER = (S + D + I) / N = (S + D + I) / (S + D + C)
where
S is the number of substitutions,
D is the number of deletions,
I is the number of insertions,
C is the number of correct words,
N is the number of words in the reference (N=S+D+C).
This value indicates the average number of errors per reference word. The lower the value, the better the
performance of the ASR system with a WER of 0 being a perfect score.
"""
_KWARGS_DESCRIPTION = """
Compute WER score of transcribed segments against references.
Args:
references: List of references for each speech input.
predictions: List of transcriptions to score.
concatenate_texts (bool, default=False): Whether to concatenate all input texts or compute WER iteratively.
Returns:
(float): the word error rate
Examples:
>>> predictions = ["this is the prediction", "there is an other sample"]
>>> references = ["this is the reference", "there is another one"]
>>> wer = evaluate.load("wer")
>>> wer_score = wer.compute(predictions=predictions, references=references)
>>> print(wer_score)
0.5
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class WER(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
codebase_urls=["https://github.com/jitsi/jiwer/"],
reference_urls=[
"https://en.wikipedia.org/wiki/Word_error_rate",
],
)
def _compute(self, predictions=None, references=None, concatenate_texts=False):
if concatenate_texts:
return compute_measures(references, predictions)["wer"]
else:
incorrect = 0
total = 0
for prediction, reference in zip(predictions, references):
measures = compute_measures(reference, prediction)
incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
total += measures["substitutions"] + measures["deletions"] + measures["hits"]
return incorrect / total
---
title: WikiSplit
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
WIKI_SPLIT is the combination of three metrics SARI, EXACT and SACREBLEU
It can be used to evaluate the quality of machine-generated texts.
---
# Metric Card for WikiSplit
## Metric description
WikiSplit is the combination of three metrics: [SARI](https://huggingface.co/metrics/sari), [exact match](https://huggingface.co/metrics/exact_match) and [SacreBLEU](https://huggingface.co/metrics/sacrebleu).
It can be used to evaluate the quality of sentence splitting approaches, which require rewriting a long sentence into two or more coherent short sentences, e.g. based on the [WikiSplit dataset](https://huggingface.co/datasets/wiki_split).
## How to use
The WIKI_SPLIT metric takes three inputs:
`sources`: a list of source sentences, where each sentence should be a string.
`predictions`: a list of predicted sentences, where each sentence should be a string.
`references`: a list of lists of reference sentences, where each sentence should be a string.
```python
>>> wiki_split = evaluate.load("wiki_split")
>>> sources = ["About 95 species are currently accepted ."]
>>> predictions = ["About 95 you now get in ."]
>>> references= [["About 95 species are currently known ."]]
>>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
```
## Output values
This metric outputs a dictionary containing three scores:
`sari`: the [SARI](https://huggingface.co/metrics/sari) score, whose range is between `0.0` and `100.0` -- the higher the value, the better the performance of the model being evaluated, with a SARI of 100 being a perfect score.
`sacrebleu`: the [SacreBLEU](https://huggingface.co/metrics/sacrebleu) score, which can take any value between `0.0` and `100.0`, inclusive.
`exact`: the [exact match](https://huggingface.co/metrics/exact_match) score, which represents the sum of all of the individual exact match scores in the set, divided by the total number of predictions in the set. It ranges from `0.0` to `100`, inclusive. Here, `0.0` means no prediction/reference pairs were matches, while `100.0` means they all were.
```python
>>> print(results)
{'sari': 21.805555555555557, 'sacrebleu': 14.535768424205482, 'exact': 0.0}
```
### Values from popular papers
This metric was initially used by [Rothe et al.(2020)](https://arxiv.org/pdf/1907.12461.pdf) to evaluate the performance of different split-and-rephrase approaches on the [WikiSplit dataset](https://huggingface.co/datasets/wiki_split). They reported a SARI score of 63.5, a SacreBLEU score of 77.2, and an EXACT_MATCH score of 16.3.
## Examples
Perfect match between prediction and reference:
```python
>>> wiki_split = evaluate.load("wiki_split")
>>> sources = ["About 95 species are currently accepted ."]
>>> predictions = ["About 95 species are currently accepted ."]
>>> references= [["About 95 species are currently accepted ."]]
>>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
>>> print(results)
{'sari': 100.0, 'sacrebleu': 100.00000000000004, 'exact': 100.0
```
Partial match between prediction and reference:
```python
>>> wiki_split = evaluate.load("wiki_split")
>>> sources = ["About 95 species are currently accepted ."]
>>> predictions = ["About 95 you now get in ."]
>>> references= [["About 95 species are currently known ."]]
>>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
>>> print(results)
{'sari': 21.805555555555557, 'sacrebleu': 14.535768424205482, 'exact': 0.0}
```
No match between prediction and reference:
```python
>>> wiki_split = evaluate.load("wiki_split")
>>> sources = ["About 95 species are currently accepted ."]
>>> predictions = ["Hello world ."]
>>> references= [["About 95 species are currently known ."]]
>>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
>>> print(results)
{'sari': 14.047619047619046, 'sacrebleu': 0.0, 'exact': 0.0}
```
## Limitations and bias
This metric is not the official metric to evaluate models on the [WikiSplit dataset](https://huggingface.co/datasets/wiki_split). It was initially proposed by [Rothe et al.(2020)](https://arxiv.org/pdf/1907.12461.pdf), whereas the [original paper introducing the WikiSplit dataset (2018)](https://aclanthology.org/D18-1080.pdf) uses different metrics to evaluate performance, such as corpus-level [BLEU](https://huggingface.co/metrics/bleu) and sentence-level BLEU.
## Citation
```bibtex
@article{rothe2020leveraging,
title={Leveraging pre-trained checkpoints for sequence generation tasks},
author={Rothe, Sascha and Narayan, Shashi and Severyn, Aliaksei},
journal={Transactions of the Association for Computational Linguistics},
volume={8},
pages={264--280},
year={2020},
publisher={MIT Press}
}
```
## Further References
- [WikiSplit dataset](https://huggingface.co/datasets/wiki_split)
- [WikiSplit paper (Botha et al., 2018)](https://aclanthology.org/D18-1080.pdf)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("wiki_split")
launch_gradio_widget(module)
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
sacrebleu
sacremoses
\ No newline at end of file
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" WIKI_SPLIT metric."""
import re
import string
from collections import Counter
import datasets
import sacrebleu
import sacremoses
from packaging import version
import evaluate
_CITATION = """
@inproceedings{xu-etal-2016-optimizing,
title = {Optimizing Statistical Machine Translation for Text Simplification},
authors={Xu, Wei and Napoles, Courtney and Pavlick, Ellie and Chen, Quanze and Callison-Burch, Chris},
journal = {Transactions of the Association for Computational Linguistics},
volume = {4},
year={2016},
url = {https://www.aclweb.org/anthology/Q16-1029},
pages = {401--415
},
@inproceedings{post-2018-call,
title = "A Call for Clarity in Reporting {BLEU} Scores",
author = "Post, Matt",
booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-6319",
pages = "186--191",
}
"""
_DESCRIPTION = """\
WIKI_SPLIT is the combination of three metrics SARI, EXACT and SACREBLEU
It can be used to evaluate the quality of machine-generated texts.
"""
_KWARGS_DESCRIPTION = """
Calculates sari score (between 0 and 100) given a list of source and predicted
sentences, and a list of lists of reference sentences. It also computes the BLEU score as well as the exact match score.
Args:
sources: list of source sentences where each sentence should be a string.
predictions: list of predicted sentences where each sentence should be a string.
references: list of lists of reference sentences where each sentence should be a string.
Returns:
sari: sari score
sacrebleu: sacrebleu score
exact: exact score
Examples:
>>> sources=["About 95 species are currently accepted ."]
>>> predictions=["About 95 you now get in ."]
>>> references=[["About 95 species are currently known ."]]
>>> wiki_split = evaluate.load("wiki_split")
>>> results = wiki_split.compute(sources=sources, predictions=predictions, references=references)
>>> print(results)
{'sari': 21.805555555555557, 'sacrebleu': 14.535768424205482, 'exact': 0.0}
"""
def normalize_answer(s):
"""Lower text and remove punctuation, articles and extra whitespace."""
def remove_articles(text):
regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
return re.sub(regex, " ", text)
def white_space_fix(text):
return " ".join(text.split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def compute_exact(a_gold, a_pred):
return int(normalize_answer(a_gold) == normalize_answer(a_pred))
def compute_em(predictions, references):
scores = [any([compute_exact(ref, pred) for ref in refs]) for pred, refs in zip(predictions, references)]
return (sum(scores) / len(scores)) * 100
def SARIngram(sgrams, cgrams, rgramslist, numref):
rgramsall = [rgram for rgrams in rgramslist for rgram in rgrams]
rgramcounter = Counter(rgramsall)
sgramcounter = Counter(sgrams)
sgramcounter_rep = Counter()
for sgram, scount in sgramcounter.items():
sgramcounter_rep[sgram] = scount * numref
cgramcounter = Counter(cgrams)
cgramcounter_rep = Counter()
for cgram, ccount in cgramcounter.items():
cgramcounter_rep[cgram] = ccount * numref
# KEEP
keepgramcounter_rep = sgramcounter_rep & cgramcounter_rep
keepgramcountergood_rep = keepgramcounter_rep & rgramcounter
keepgramcounterall_rep = sgramcounter_rep & rgramcounter
keeptmpscore1 = 0
keeptmpscore2 = 0
for keepgram in keepgramcountergood_rep:
keeptmpscore1 += keepgramcountergood_rep[keepgram] / keepgramcounter_rep[keepgram]
# Fix an alleged bug [2] in the keep score computation.
# keeptmpscore2 += keepgramcountergood_rep[keepgram] / keepgramcounterall_rep[keepgram]
keeptmpscore2 += keepgramcountergood_rep[keepgram]
# Define 0/0=1 instead of 0 to give higher scores for predictions that match
# a target exactly.
keepscore_precision = 1
keepscore_recall = 1
if len(keepgramcounter_rep) > 0:
keepscore_precision = keeptmpscore1 / len(keepgramcounter_rep)
if len(keepgramcounterall_rep) > 0:
# Fix an alleged bug [2] in the keep score computation.
# keepscore_recall = keeptmpscore2 / len(keepgramcounterall_rep)
keepscore_recall = keeptmpscore2 / sum(keepgramcounterall_rep.values())
keepscore = 0
if keepscore_precision > 0 or keepscore_recall > 0:
keepscore = 2 * keepscore_precision * keepscore_recall / (keepscore_precision + keepscore_recall)
# DELETION
delgramcounter_rep = sgramcounter_rep - cgramcounter_rep
delgramcountergood_rep = delgramcounter_rep - rgramcounter
delgramcounterall_rep = sgramcounter_rep - rgramcounter
deltmpscore1 = 0
deltmpscore2 = 0
for delgram in delgramcountergood_rep:
deltmpscore1 += delgramcountergood_rep[delgram] / delgramcounter_rep[delgram]
deltmpscore2 += delgramcountergood_rep[delgram] / delgramcounterall_rep[delgram]
# Define 0/0=1 instead of 0 to give higher scores for predictions that match
# a target exactly.
delscore_precision = 1
if len(delgramcounter_rep) > 0:
delscore_precision = deltmpscore1 / len(delgramcounter_rep)
# ADDITION
addgramcounter = set(cgramcounter) - set(sgramcounter)
addgramcountergood = set(addgramcounter) & set(rgramcounter)
addgramcounterall = set(rgramcounter) - set(sgramcounter)
addtmpscore = 0
for addgram in addgramcountergood:
addtmpscore += 1
# Define 0/0=1 instead of 0 to give higher scores for predictions that match
# a target exactly.
addscore_precision = 1
addscore_recall = 1
if len(addgramcounter) > 0:
addscore_precision = addtmpscore / len(addgramcounter)
if len(addgramcounterall) > 0:
addscore_recall = addtmpscore / len(addgramcounterall)
addscore = 0
if addscore_precision > 0 or addscore_recall > 0:
addscore = 2 * addscore_precision * addscore_recall / (addscore_precision + addscore_recall)
return (keepscore, delscore_precision, addscore)
def SARIsent(ssent, csent, rsents):
numref = len(rsents)
s1grams = ssent.split(" ")
c1grams = csent.split(" ")
s2grams = []
c2grams = []
s3grams = []
c3grams = []
s4grams = []
c4grams = []
r1gramslist = []
r2gramslist = []
r3gramslist = []
r4gramslist = []
for rsent in rsents:
r1grams = rsent.split(" ")
r2grams = []
r3grams = []
r4grams = []
r1gramslist.append(r1grams)
for i in range(0, len(r1grams) - 1):
if i < len(r1grams) - 1:
r2gram = r1grams[i] + " " + r1grams[i + 1]
r2grams.append(r2gram)
if i < len(r1grams) - 2:
r3gram = r1grams[i] + " " + r1grams[i + 1] + " " + r1grams[i + 2]
r3grams.append(r3gram)
if i < len(r1grams) - 3:
r4gram = r1grams[i] + " " + r1grams[i + 1] + " " + r1grams[i + 2] + " " + r1grams[i + 3]
r4grams.append(r4gram)
r2gramslist.append(r2grams)
r3gramslist.append(r3grams)
r4gramslist.append(r4grams)
for i in range(0, len(s1grams) - 1):
if i < len(s1grams) - 1:
s2gram = s1grams[i] + " " + s1grams[i + 1]
s2grams.append(s2gram)
if i < len(s1grams) - 2:
s3gram = s1grams[i] + " " + s1grams[i + 1] + " " + s1grams[i + 2]
s3grams.append(s3gram)
if i < len(s1grams) - 3:
s4gram = s1grams[i] + " " + s1grams[i + 1] + " " + s1grams[i + 2] + " " + s1grams[i + 3]
s4grams.append(s4gram)
for i in range(0, len(c1grams) - 1):
if i < len(c1grams) - 1:
c2gram = c1grams[i] + " " + c1grams[i + 1]
c2grams.append(c2gram)
if i < len(c1grams) - 2:
c3gram = c1grams[i] + " " + c1grams[i + 1] + " " + c1grams[i + 2]
c3grams.append(c3gram)
if i < len(c1grams) - 3:
c4gram = c1grams[i] + " " + c1grams[i + 1] + " " + c1grams[i + 2] + " " + c1grams[i + 3]
c4grams.append(c4gram)
(keep1score, del1score, add1score) = SARIngram(s1grams, c1grams, r1gramslist, numref)
(keep2score, del2score, add2score) = SARIngram(s2grams, c2grams, r2gramslist, numref)
(keep3score, del3score, add3score) = SARIngram(s3grams, c3grams, r3gramslist, numref)
(keep4score, del4score, add4score) = SARIngram(s4grams, c4grams, r4gramslist, numref)
avgkeepscore = sum([keep1score, keep2score, keep3score, keep4score]) / 4
avgdelscore = sum([del1score, del2score, del3score, del4score]) / 4
avgaddscore = sum([add1score, add2score, add3score, add4score]) / 4
finalscore = (avgkeepscore + avgdelscore + avgaddscore) / 3
return finalscore
def normalize(sentence, lowercase: bool = True, tokenizer: str = "13a", return_str: bool = True):
# Normalization is requried for the ASSET dataset (one of the primary
# datasets in sentence simplification) to allow using space
# to split the sentence. Even though Wiki-Auto and TURK datasets,
# do not require normalization, we do it for consistency.
# Code adapted from the EASSE library [1] written by the authors of the ASSET dataset.
# [1] https://github.com/feralvam/easse/blob/580bba7e1378fc8289c663f864e0487188fe8067/easse/utils/preprocessing.py#L7
if lowercase:
sentence = sentence.lower()
if tokenizer in ["13a", "intl"]:
if version.parse(sacrebleu.__version__).major >= 2:
normalized_sent = sacrebleu.metrics.bleu._get_tokenizer(tokenizer)()(sentence)
else:
normalized_sent = sacrebleu.TOKENIZERS[tokenizer]()(sentence)
elif tokenizer == "moses":
normalized_sent = sacremoses.MosesTokenizer().tokenize(sentence, return_str=True, escape=False)
elif tokenizer == "penn":
normalized_sent = sacremoses.MosesTokenizer().penn_tokenize(sentence, return_str=True)
else:
normalized_sent = sentence
if not return_str:
normalized_sent = normalized_sent.split()
return normalized_sent
def compute_sari(sources, predictions, references):
if not (len(sources) == len(predictions) == len(references)):
raise ValueError("Sources length must match predictions and references lengths.")
sari_score = 0
for src, pred, refs in zip(sources, predictions, references):
sari_score += SARIsent(normalize(src), normalize(pred), [normalize(sent) for sent in refs])
sari_score = sari_score / len(predictions)
return 100 * sari_score
def compute_sacrebleu(
predictions,
references,
smooth_method="exp",
smooth_value=None,
force=False,
lowercase=False,
use_effective_order=False,
):
references_per_prediction = len(references[0])
if any(len(refs) != references_per_prediction for refs in references):
raise ValueError("Sacrebleu requires the same number of references for each prediction")
transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]
output = sacrebleu.corpus_bleu(
predictions,
transformed_references,
smooth_method=smooth_method,
smooth_value=smooth_value,
force=force,
lowercase=lowercase,
use_effective_order=use_effective_order,
)
return output.score
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class WikiSplit(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
}
),
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
],
codebase_urls=[
"https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py",
"https://github.com/cocoxu/simplification/blob/master/SARI.py",
"https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/sari_hook.py",
"https://github.com/mjpost/sacreBLEU",
],
reference_urls=[
"https://www.aclweb.org/anthology/Q16-1029.pdf",
"https://github.com/mjpost/sacreBLEU",
"https://en.wikipedia.org/wiki/BLEU",
"https://towardsdatascience.com/evaluating-text-output-in-nlp-bleu-at-your-own-risk-e8609665a213",
],
)
def _compute(self, sources, predictions, references):
# if only one reference is provided make sure we still use list of lists
if isinstance(references[0], str):
references = [[ref] for ref in references]
result = {}
result.update({"sari": compute_sari(sources=sources, predictions=predictions, references=references)})
result.update({"sacrebleu": compute_sacrebleu(predictions=predictions, references=references)})
result.update({"exact": compute_em(predictions=predictions, references=references)})
return result
---
title: XNLI
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
XNLI is a subset of a few thousand examples from MNLI which has been translated
into a 14 different languages (some low-ish resource). As with MNLI, the goal is
to predict textual entailment (does sentence A imply/contradict/neither sentence
B) and is a classification task (given two sentences, predict one of three
labels).
---
# Metric Card for XNLI
## Metric description
The XNLI metric allows to evaluate a model's score on the [XNLI dataset](https://huggingface.co/datasets/xnli), which is a subset of a few thousand examples from the [MNLI dataset](https://huggingface.co/datasets/glue/viewer/mnli) that have been translated into a 14 different languages, some of which are relatively low resource such as Swahili and Urdu.
As with MNLI, the task is to predict textual entailment (does sentence A imply/contradict/neither sentence B) and is a classification task (given two sentences, predict one of three labels).
## How to use
The XNLI metric is computed based on the `predictions` (a list of predicted labels) and the `references` (a list of ground truth labels).
```python
from evaluate import load
xnli_metric = load("xnli")
predictions = [0, 1]
references = [0, 1]
results = xnli_metric.compute(predictions=predictions, references=references)
```
## Output values
The output of the XNLI metric is simply the `accuracy`, i.e. the proportion of correct predictions among the total number of cases processed, with a range between 0 and 1 (see [accuracy](https://huggingface.co/metrics/accuracy) for more information).
### Values from popular papers
The [original XNLI paper](https://arxiv.org/pdf/1809.05053.pdf) reported accuracies ranging from 59.3 (for `ur`) to 73.7 (for `en`) for the BiLSTM-max model.
For more recent model performance, see the [dataset leaderboard](https://paperswithcode.com/dataset/xnli).
## Examples
Maximal values:
```python
>>> from evaluate import load
>>> xnli_metric = load("xnli")
>>> predictions = [0, 1]
>>> references = [0, 1]
>>> results = xnli_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 1.0}
```
Minimal values:
```python
>>> from evaluate import load
>>> xnli_metric = load("xnli")
>>> predictions = [1, 0]
>>> references = [0, 1]
>>> results = xnli_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 0.0}
```
Partial match:
```python
>>> from evaluate import load
>>> xnli_metric = load("xnli")
>>> predictions = [1, 0, 1]
>>> references = [1, 0, 0]
>>> results = xnli_metric.compute(predictions=predictions, references=references)
>>> print(results)
{'accuracy': 0.6666666666666666}
```
## Limitations and bias
While accuracy alone does give a certain indication of performance, it can be supplemented by error analysis and a better understanding of the model's mistakes on each of the categories represented in the dataset, especially if they are unbalanced.
While the XNLI dataset is multilingual and represents a diversity of languages, in reality, cross-lingual sentence understanding goes beyond translation, given that there are many cultural differences that have an impact on human sentiment annotations. Since the XNLI dataset was obtained by translation based on English sentences, it does not capture these cultural differences.
## Citation
```bibtex
@InProceedings{conneau2018xnli,
author = "Conneau, Alexis
and Rinott, Ruty
and Lample, Guillaume
and Williams, Adina
and Bowman, Samuel R.
and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods
in Natural Language Processing",
year = "2018",
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium",
}
```
## Further References
- [XNI Dataset GitHub](https://github.com/facebookresearch/XNLI)
- [HuggingFace Tasks -- Text Classification](https://huggingface.co/tasks/text-classification)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment