Commit 25991f98 authored by hepj's avatar hepj
Browse files

修改readme

parent ac192496
Pipeline #1415 failed with stages
in 0 seconds
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("brier_score")
launch_gradio_widget(module)
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Brier Score Metric"""
import datasets
from sklearn.metrics import brier_score_loss
import evaluate
_CITATION = """\
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
"""
_DESCRIPTION = """\
Brier score is a type of evaluation metric for classification tasks, where you predict outcomes such as win/lose, spam/ham, click/no-click etc.
`BrierScore = 1/N * sum( (p_i - o_i)^2 )`
"""
_KWARGS_DESCRIPTION = """
Args:
y_true : array of shape (n_samples,)
True targets.
y_prob : array of shape (n_samples,)
Probabilities of the positive class.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
pos_label : int or str, default=None
Label of the positive class. `pos_label` will be inferred in the
following manner:
* if `y_true` in {-1, 1} or {0, 1}, `pos_label` defaults to 1;
* else if `y_true` contains string, an error will be raised and
`pos_label` should be explicitly specified;
* otherwise, `pos_label` defaults to the greater label,
i.e. `np.unique(y_true)[-1]`.
Returns
score : float
Brier score loss.
Examples:
Example-1: if y_true in {-1, 1} or {0, 1}, pos_label defaults to 1.
>>> import numpy as np
>>> brier_score = evaluate.load("brier_score")
>>> references = np.array([0, 0, 1, 1])
>>> predictions = np.array([0.1, 0.9, 0.8, 0.3])
>>> results = brier_score.compute(references=references, predictions=predictions)
>>> print(round(results["brier_score"], 4))
0.3375
Example-2: if y_true contains string, an error will be raised and pos_label should be explicitly specified.
>>> import numpy as np
>>> brier_score = evaluate.load("brier_score")
>>> references = np.array(["spam", "ham", "ham", "spam"])
>>> predictions = np.array([0.1, 0.9, 0.8, 0.3])
>>> results = brier_score.compute(references=references, predictions=predictions, pos_label="ham")
>>> print(round(results["brier_score"], 4))
0.0375
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class BrierScore(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=self._get_feature_types(),
reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.brier_score_loss.html"],
)
def _get_feature_types(self):
if self.config_name == "multilist":
return [
datasets.Features(
{
"references": datasets.Sequence(datasets.Value("float")),
"predictions": datasets.Sequence(datasets.Value("float")),
}
),
datasets.Features(
{
"references": datasets.Sequence(datasets.Value("string")),
"predictions": datasets.Sequence(datasets.Value("float")),
}
),
]
else:
return [
datasets.Features(
{
"references": datasets.Value("float"),
"predictions": datasets.Value("float"),
}
),
datasets.Features(
{
"references": datasets.Value("string"),
"predictions": datasets.Value("float"),
}
),
]
def _compute(self, references, predictions, sample_weight=None, pos_label=1):
brier_score = brier_score_loss(references, predictions, sample_weight=sample_weight, pos_label=pos_label)
return {"brier_score": brier_score}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
scikit-learn
\ No newline at end of file
---
title: CER
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
Character error rate (CER) is a common metric of the performance of an automatic speech recognition system.
CER is similar to Word Error Rate (WER), but operates on character instead of word. Please refer to docs of WER for further information.
Character error rate can be computed as:
CER = (S + D + I) / N = (S + D + I) / (S + D + C)
where
S is the number of substitutions,
D is the number of deletions,
I is the number of insertions,
C is the number of correct characters,
N is the number of characters in the reference (N=S+D+C).
CER's output is not always a number between 0 and 1, in particular when there is a high number of insertions. This value is often associated to the percentage of characters that were incorrectly predicted. The lower the value, the better the
performance of the ASR system with a CER of 0 being a perfect score.
---
# Metric Card for CER
## Metric description
Character error rate (CER) is a common metric of the performance of an automatic speech recognition (ASR) system. CER is similar to Word Error Rate (WER), but operates on character instead of word.
Character error rate can be computed as:
`CER = (S + D + I) / N = (S + D + I) / (S + D + C)`
where
`S` is the number of substitutions,
`D` is the number of deletions,
`I` is the number of insertions,
`C` is the number of correct characters,
`N` is the number of characters in the reference (`N=S+D+C`).
## How to use
The metric takes two inputs: references (a list of references for each speech input) and predictions (a list of transcriptions to score).
```python
from evaluate import load
cer = load("cer")
cer_score = cer.compute(predictions=predictions, references=references)
```
## Output values
This metric outputs a float representing the character error rate.
```
print(cer_score)
0.34146341463414637
```
The **lower** the CER value, the **better** the performance of the ASR system, with a CER of 0 being a perfect score.
However, CER's output is not always a number between 0 and 1, in particular when there is a high number of insertions (see [Examples](#Examples) below).
### Values from popular papers
This metric is highly dependent on the content and quality of the dataset, and therefore users can expect very different values for the same model but on different datasets.
Multilingual datasets such as [Common Voice](https://huggingface.co/datasets/common_voice) report different CERs depending on the language, ranging from 0.02-0.03 for languages such as French and Italian, to 0.05-0.07 for English (see [here](https://github.com/speechbrain/speechbrain/tree/develop/recipes/CommonVoice/ASR/CTC) for more values).
## Examples
Perfect match between prediction and reference:
```python
from evaluate import load
cer = load("cer")
predictions = ["hello world", "good night moon"]
references = ["hello world", "good night moon"]
cer_score = cer.compute(predictions=predictions, references=references)
print(cer_score)
0.0
```
Partial match between prediction and reference:
```python
from evaluate import load
cer = load("cer")
predictions = ["this is the prediction", "there is an other sample"]
references = ["this is the reference", "there is another one"]
cer_score = cer.compute(predictions=predictions, references=references)
print(cer_score)
0.34146341463414637
```
No match between prediction and reference:
```python
from evaluate import load
cer = load("cer")
predictions = ["hello"]
references = ["gracias"]
cer_score = cer.compute(predictions=predictions, references=references)
print(cer_score)
1.0
```
CER above 1 due to insertion errors:
```python
from evaluate import load
cer = load("cer")
predictions = ["hello world"]
references = ["hello"]
cer_score = cer.compute(predictions=predictions, references=references)
print(cer_score)
1.2
```
## Limitations and bias
CER is useful for comparing different models for tasks such as automatic speech recognition (ASR) and optic character recognition (OCR), especially for multilingual datasets where WER is not suitable given the diversity of languages. However, CER provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.
Also, in some cases, instead of reporting the raw CER, a normalized CER is reported where the number of mistakes is divided by the sum of the number of edit operations (`I` + `S` + `D`) and `C` (the number of correct characters), which results in CER values that fall within the range of 0–100%.
## Citation
```bibtex
@inproceedings{morris2004,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}
```
## Further References
- [Hugging Face Tasks -- Automatic Speech Recognition](https://huggingface.co/tasks/automatic-speech-recognition)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("cer")
launch_gradio_widget(module)
# Copyright 2021 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Character Error Ratio (CER) metric. """
from typing import List
import datasets
import jiwer
import jiwer.transforms as tr
from datasets.config import PY_VERSION
from packaging import version
import evaluate
if PY_VERSION < version.parse("3.8"):
import importlib_metadata
else:
import importlib.metadata as importlib_metadata
SENTENCE_DELIMITER = ""
if version.parse(importlib_metadata.version("jiwer")) < version.parse("2.3.0"):
class SentencesToListOfCharacters(tr.AbstractTransform):
def __init__(self, sentence_delimiter: str = " "):
self.sentence_delimiter = sentence_delimiter
def process_string(self, s: str):
return list(s)
def process_list(self, inp: List[str]):
chars = []
for sent_idx, sentence in enumerate(inp):
chars.extend(self.process_string(sentence))
if self.sentence_delimiter is not None and self.sentence_delimiter != "" and sent_idx < len(inp) - 1:
chars.append(self.sentence_delimiter)
return chars
cer_transform = tr.Compose(
[tr.RemoveMultipleSpaces(), tr.Strip(), SentencesToListOfCharacters(SENTENCE_DELIMITER)]
)
else:
cer_transform = tr.Compose(
[
tr.RemoveMultipleSpaces(),
tr.Strip(),
tr.ReduceToSingleSentence(SENTENCE_DELIMITER),
tr.ReduceToListOfListOfChars(),
]
)
_CITATION = """\
@inproceedings{inproceedings,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}
"""
_DESCRIPTION = """\
Character error rate (CER) is a common metric of the performance of an automatic speech recognition system.
CER is similar to Word Error Rate (WER), but operates on character instead of word. Please refer to docs of WER for further information.
Character error rate can be computed as:
CER = (S + D + I) / N = (S + D + I) / (S + D + C)
where
S is the number of substitutions,
D is the number of deletions,
I is the number of insertions,
C is the number of correct characters,
N is the number of characters in the reference (N=S+D+C).
CER's output is not always a number between 0 and 1, in particular when there is a high number of insertions. This value is often associated to the percentage of characters that were incorrectly predicted. The lower the value, the better the
performance of the ASR system with a CER of 0 being a perfect score.
"""
_KWARGS_DESCRIPTION = """
Computes CER score of transcribed segments against references.
Args:
references: list of references for each speech input.
predictions: list of transcribtions to score.
concatenate_texts: Whether or not to concatenate sentences before evaluation, set to True for more accurate result.
Returns:
(float): the character error rate
Examples:
>>> predictions = ["this is the prediction", "there is an other sample"]
>>> references = ["this is the reference", "there is another one"]
>>> cer = evaluate.load("cer")
>>> cer_score = cer.compute(predictions=predictions, references=references)
>>> print(cer_score)
0.34146341463414637
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CER(evaluate.Metric):
def _info(self):
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
codebase_urls=["https://github.com/jitsi/jiwer/"],
reference_urls=[
"https://en.wikipedia.org/wiki/Word_error_rate",
"https://sites.google.com/site/textdigitisation/qualitymeasures/computingerrorrates",
],
)
def _compute(self, predictions, references, concatenate_texts=False):
if concatenate_texts:
return jiwer.compute_measures(
references,
predictions,
truth_transform=cer_transform,
hypothesis_transform=cer_transform,
)["wer"]
incorrect = 0
total = 0
for prediction, reference in zip(predictions, references):
measures = jiwer.compute_measures(
reference,
prediction,
truth_transform=cer_transform,
hypothesis_transform=cer_transform,
)
incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
total += measures["substitutions"] + measures["deletions"] + measures["hits"]
return incorrect / total
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
jiwer
\ No newline at end of file
# Copyright 2021 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from cer import CER
cer = CER()
class TestCER(unittest.TestCase):
def test_cer_case_sensitive(self):
refs = ["White House"]
preds = ["white house"]
# S = 2, D = 0, I = 0, N = 11, CER = 2 / 11
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.1818181818) < 1e-6)
def test_cer_whitespace(self):
refs = ["were wolf"]
preds = ["werewolf"]
# S = 0, D = 0, I = 1, N = 9, CER = 1 / 9
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.1111111) < 1e-6)
refs = ["werewolf"]
preds = ["weae wolf"]
# S = 1, D = 1, I = 0, N = 8, CER = 0.25
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.25) < 1e-6)
# consecutive whitespaces case 1
refs = ["were wolf"]
preds = ["were wolf"]
# S = 0, D = 0, I = 0, N = 9, CER = 0
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.0) < 1e-6)
# consecutive whitespaces case 2
refs = ["were wolf"]
preds = ["were wolf"]
# S = 0, D = 0, I = 0, N = 9, CER = 0
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.0) < 1e-6)
def test_cer_sub(self):
refs = ["werewolf"]
preds = ["weaewolf"]
# S = 1, D = 0, I = 0, N = 8, CER = 0.125
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.125) < 1e-6)
def test_cer_del(self):
refs = ["werewolf"]
preds = ["wereawolf"]
# S = 0, D = 1, I = 0, N = 8, CER = 0.125
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.125) < 1e-6)
def test_cer_insert(self):
refs = ["werewolf"]
preds = ["wereolf"]
# S = 0, D = 0, I = 1, N = 8, CER = 0.125
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.125) < 1e-6)
def test_cer_equal(self):
refs = ["werewolf"]
char_error_rate = cer.compute(predictions=refs, references=refs)
self.assertEqual(char_error_rate, 0.0)
def test_cer_list_of_seqs(self):
refs = ["werewolf", "I am your father"]
char_error_rate = cer.compute(predictions=refs, references=refs)
self.assertEqual(char_error_rate, 0.0)
refs = ["werewolf", "I am your father", "doge"]
preds = ["werxwolf", "I am your father", "doge"]
# S = 1, D = 0, I = 0, N = 28, CER = 1 / 28
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.03571428) < 1e-6)
def test_correlated_sentences(self):
refs = ["My hovercraft", "is full of eels"]
preds = ["My hovercraft is full", " of eels"]
# S = 0, D = 0, I = 2, N = 28, CER = 2 / 28
# whitespace at the front of " of eels" will be strip during preporcessing
# so need to insert 2 whitespaces
char_error_rate = cer.compute(predictions=preds, references=refs, concatenate_texts=True)
self.assertTrue(abs(char_error_rate - 0.071428) < 1e-6)
def test_cer_unicode(self):
refs = ["我能吞下玻璃而不伤身体"]
preds = [" 能吞虾玻璃而 不霜身体啦"]
# S = 3, D = 2, I = 0, N = 11, CER = 5 / 11
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.4545454545) < 1e-6)
refs = ["我能吞下玻璃", "而不伤身体"]
preds = ["我 能 吞 下 玻 璃", "而不伤身体"]
# S = 0, D = 5, I = 0, N = 11, CER = 5 / 11
char_error_rate = cer.compute(predictions=preds, references=refs)
self.assertTrue(abs(char_error_rate - 0.454545454545) < 1e-6)
refs = ["我能吞下玻璃而不伤身体"]
char_error_rate = cer.compute(predictions=refs, references=refs)
self.assertFalse(char_error_rate, 0.0)
def test_cer_empty(self):
refs = [""]
preds = ["Hypothesis"]
with self.assertRaises(ValueError):
cer.compute(predictions=preds, references=refs)
if __name__ == "__main__":
unittest.main()
---
title: CharacTER
emoji: 🔤
colorFrom: orange
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
- machine-translation
description: >-
CharacTer is a character-level metric inspired by the commonly applied translation edit rate (TER).
---
# Metric Card for CharacTER
## Metric Description
CharacTer is a character-level metric inspired by the translation edit rate (TER) metric. It is
defined as the minimum number of character edits required to adjust a hypothesis, until it completely matches the
reference, normalized by the length of the hypothesis sentence. CharacTer calculates the character level edit
distance while performing the shift edit on word level. Unlike the strict matching criterion in TER, a hypothesis
word is considered to match a reference word and could be shifted, if the edit distance between them is below a
threshold value. The Levenshtein distance between the reference and the shifted hypothesis sequence is computed on the
character level. In addition, the lengths of hypothesis sequences instead of reference sequences are used for
normalizing the edit distance, which effectively counters the issue that shorter translations normally achieve lower
TER.
## Intended Uses
CharacTER was developed for machine translation evaluation.
## How to Use
```python
import evaluate
character = evaluate.load("character")
# Single hyp/ref
preds = ["this week the saudis denied information published in the new york times"]
refs = ["saudi arabia denied this week information published in the american new york times"]
results = character.compute(references=refs, predictions=preds)
# Corpus example
preds = ["this week the saudis denied information published in the new york times",
"this is in fact an estimate"]
refs = ["saudi arabia denied this week information published in the american new york times",
"this is actually an estimate"]
results = character.compute(references=refs, predictions=preds)
```
### Inputs
- **predictions**: a single prediction or a list of predictions to score. Each prediction should be a string with
tokens separated by spaces.
- **references**: a single reference or a list of reference for each prediction. Each reference should be a string with
tokens separated by spaces.
### Output Values
*=only when a list of references/hypotheses are given
- **count** (*): how many parallel sentences were processed
- **mean** (*): the mean CharacTER score
- **median** (*): the median score
- **std** (*): standard deviation of the score
- **min** (*): smallest score
- **max** (*): largest score
- **cer_scores**: all scores, one per ref/hyp pair
### Output Example
```python
{
'count': 2,
'mean': 0.3127282211789254,
'median': 0.3127282211789254,
'std': 0.07561653111280243,
'min': 0.25925925925925924,
'max': 0.36619718309859156,
'cer_scores': [0.36619718309859156, 0.25925925925925924]
}
```
## Citation
```bibtex
@inproceedings{wang-etal-2016-character,
title = "{C}harac{T}er: Translation Edit Rate on Character Level",
author = "Wang, Weiyue and
Peter, Jan-Thorsten and
Rosendahl, Hendrik and
Ney, Hermann",
booktitle = "Proceedings of the First Conference on Machine Translation: Volume 2, Shared Task Papers",
month = aug,
year = "2016",
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W16-2342",
doi = "10.18653/v1/W16-2342",
pages = "505--510",
}
```
## Further References
- Repackaged version that is used in this HF implementation: [https://github.com/bramvanroy/CharacTER](https://github.com/bramvanroy/CharacTER)
- Original version: [https://github.com/rwth-i6/CharacTER](https://github.com/rwth-i6/CharacTER)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("character")
launch_gradio_widget(module)
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""CharacTER metric, a character-based TER variant, for machine translation."""
import math
from statistics import mean, median
from typing import Iterable, List, Union
import cer
import datasets
from cer import calculate_cer
from datasets import Sequence, Value
import evaluate
_CITATION = """\
@inproceedings{wang-etal-2016-character,
title = "{C}harac{T}er: Translation Edit Rate on Character Level",
author = "Wang, Weiyue and
Peter, Jan-Thorsten and
Rosendahl, Hendrik and
Ney, Hermann",
booktitle = "Proceedings of the First Conference on Machine Translation: Volume 2, Shared Task Papers",
month = aug,
year = "2016",
address = "Berlin, Germany",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W16-2342",
doi = "10.18653/v1/W16-2342",
pages = "505--510",
}
"""
_DESCRIPTION = """\
CharacTer is a character-level metric inspired by the commonly applied translation edit rate (TER). It is
defined as the minimum number of character edits required to adjust a hypothesis, until it completely matches the
reference, normalized by the length of the hypothesis sentence. CharacTer calculates the character level edit
distance while performing the shift edit on word level. Unlike the strict matching criterion in TER, a hypothesis
word is considered to match a reference word and could be shifted, if the edit distance between them is below a
threshold value. The Levenshtein distance between the reference and the shifted hypothesis sequence is computed on the
character level. In addition, the lengths of hypothesis sequences instead of reference sequences are used for
normalizing the edit distance, which effectively counters the issue that shorter translations normally achieve lower
TER."""
_KWARGS_DESCRIPTION = """
Calculates how good the predictions are in terms of the CharacTER metric given some references.
Args:
predictions: a list of predictions to score. Each prediction should be a string with
tokens separated by spaces.
references: a list of references for each prediction. You can also pass multiple references for each prediction,
so a list and in that list a sublist for each prediction for its related references. When multiple references are
given, the lowest (best) score is returned for that prediction-references pair.
Each reference should be a string with tokens separated by spaces.
aggregate: one of "mean", "sum", "median" to indicate how the scores of individual sentences should be
aggregated
return_all_scores: a boolean, indicating whether in addition to the aggregated score, also all individual
scores should be returned
Returns:
cer_score: an aggregated score across all the items, based on 'aggregate'
cer_scores: (optionally, if 'return_all_scores' evaluates to True) a list of all scores, one per ref/hyp pair
Examples:
>>> character_mt = evaluate.load("character")
>>> preds = ["this week the saudis denied information published in the new york times"]
>>> refs = ["saudi arabia denied this week information published in the american new york times"]
>>> character_mt.compute(references=refs, predictions=preds)
{'cer_score': 0.36619718309859156}
>>> preds = ["this week the saudis denied information published in the new york times",
... "this is in fact an estimate"]
>>> refs = ["saudi arabia denied this week information published in the american new york times",
... "this is actually an estimate"]
>>> character_mt.compute(references=refs, predictions=preds, aggregate="sum", return_all_scores=True)
{'cer_score': 0.6254564423578508, 'cer_scores': [0.36619718309859156, 0.25925925925925924]}
>>> preds = ["this week the saudis denied information published in the new york times"]
>>> refs = [["saudi arabia denied this week information published in the american new york times",
... "the saudis have denied new information published in the ny times"]]
>>> character_mt.compute(references=refs, predictions=preds, aggregate="median", return_all_scores=True)
{'cer_score': 0.36619718309859156, 'cer_scores': [0.36619718309859156]}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Character(evaluate.Metric):
"""CharacTer is a character-level metric inspired by the commonly applied translation edit rate (TER)."""
def _info(self):
return evaluate.MetricInfo(
module_type="metric",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{"predictions": Value("string", id="prediction"), "references": Value("string", id="reference")}
),
datasets.Features(
{
"predictions": Value("string", id="prediction"),
"references": Sequence(Value("string", id="reference"), id="references"),
}
),
],
homepage="https://github.com/bramvanroy/CharacTER",
codebase_urls=["https://github.com/bramvanroy/CharacTER", "https://github.com/rwth-i6/CharacTER"],
)
def _compute(
self,
predictions: Iterable[str],
references: Union[Iterable[str], Iterable[Iterable[str]]],
aggregate: str = "mean",
return_all_scores: bool = False,
):
if aggregate not in ("mean", "sum", "median"):
raise ValueError("'aggregate' must be one of 'sum', 'mean', 'median'")
predictions = [p.split() for p in predictions]
# Predictions and references have the same internal types (both lists of strings),
# so only one reference per prediction
if isinstance(references[0], str):
references = [r.split() for r in references]
scores_d = cer.calculate_cer_corpus(predictions, references)
cer_scores: List[float] = scores_d["cer_scores"]
if aggregate == "sum":
score = sum(cer_scores)
elif aggregate == "mean":
score = scores_d["mean"]
else:
score = scores_d["median"]
else:
# In the case of multiple references, we just find the "best score",
# i.e., the reference that the prediction is closest to, i.e. the lowest characTER score
references = [[r.split() for r in refs] for refs in references]
cer_scores = []
for pred, refs in zip(predictions, references):
min_score = math.inf
for ref in refs:
score = calculate_cer(pred, ref)
if score < min_score:
min_score = score
cer_scores.append(min_score)
if aggregate == "sum":
score = sum(cer_scores)
elif aggregate == "mean":
score = mean(cer_scores)
else:
score = median(cer_scores)
# Return scores
if return_all_scores:
return {"cer_score": score, "cer_scores": cer_scores}
else:
return {"cer_score": score}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
cer>=1.2.0
---
title: CharCut
emoji: 🔤
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
- machine-translation
description: >-
CharCut is a character-based machine translation evaluation metric.
---
# Metric Card for CharacTER
## Metric Description
CharCut compares outputs of MT systems with reference translations. The matching algorithm is based on an iterative
search for longest common substrings, combined with a length-based threshold that limits short and noisy character
matches. As a similarity metric this is not new, but to the best of our knowledge it was never applied to highlighting
and scoring of MT outputs. It has the neat effect of keeping character-based differences readable by humans.
## Intended Uses
CharCut was developed for machine translation evaluation.
## How to Use
```python
import evaluate
charcut = evaluate.load("charcut")
preds = ["this week the saudis denied information published in the new york times",
"this is in fact an estimate"]
refs = ["saudi arabia denied this week information published in the american new york times",
"this is actually an estimate"]
results = charcut.compute(references=refs, predictions=preds)
print(results)
# {'charcut_mt': 0.1971153846153846}
```
### Inputs
- **predictions**: a single prediction or a list of predictions to score. Each prediction should be a string with
tokens separated by spaces.
- **references**: a single reference or a list of reference for each prediction. Each reference should be a string with
tokens separated by spaces.
### Output Values
- **charcut_mt**: the CharCut evaluation score (lower is better)
### Output Example
```python
{'charcut_mt': 0.1971153846153846}
```
## Citation
```bibtex
@inproceedings{lardilleux-lepage-2017-charcut,
title = "{CHARCUT}: Human-Targeted Character-Based {MT} Evaluation with Loose Differences",
author = "Lardilleux, Adrien and
Lepage, Yves",
booktitle = "Proceedings of the 14th International Conference on Spoken Language Translation",
month = dec # " 14-15",
year = "2017",
address = "Tokyo, Japan",
publisher = "International Workshop on Spoken Language Translation",
url = "https://aclanthology.org/2017.iwslt-1.20",
pages = "146--153",
abstract = "We present CHARCUT, a character-based machine translation evaluation metric derived from a human-targeted segment difference visualisation algorithm. It combines an iterative search for longest common substrings between the candidate and the reference translation with a simple length-based threshold, enabling loose differences that limit noisy character matches. Its main advantage is to produce scores that directly reflect human-readable string differences, making it a useful support tool for the manual analysis of MT output and its display to end users. Experiments on WMT16 metrics task data show that it is on par with the best {``}un-trained{''} metrics in terms of correlation with human judgement, well above BLEU and TER baselines, on both system and segment tasks.",
}
```
## Further References
- Repackaged version that is used in this HF implementation: [https://github.com/BramVanroy/CharCut](https://github.com/BramVanroy/CharCut)
- Original version: [https://github.com/alardill/CharCut](https://github.com/alardill/CharCut)
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("charcut_mt")
launch_gradio_widget(module)
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""An implementation for calculating CharCut, a character-based machine translation evaluation metric."""
from typing import Iterable, Union
import datasets
from charcut import calculate_charcut
from datasets import Sequence, Value
import evaluate
_CITATION = """\
@inproceedings{lardilleux-lepage-2017-charcut,
title = "{CHARCUT}: Human-Targeted Character-Based {MT} Evaluation with Loose Differences",
author = "Lardilleux, Adrien and
Lepage, Yves",
booktitle = "Proceedings of the 14th International Conference on Spoken Language Translation",
month = dec # " 14-15",
year = "2017",
address = "Tokyo, Japan",
publisher = "International Workshop on Spoken Language Translation",
url = "https://aclanthology.org/2017.iwslt-1.20",
pages = "146--153"
}
"""
_DESCRIPTION = """\
CharCut compares outputs of MT systems with reference translations. The matching algorithm is based on an iterative
search for longest common substrings, combined with a length-based threshold that limits short and noisy character
matches. As a similarity metric this is not new, but to the best of our knowledge it was never applied to highlighting
and scoring of MT outputs. It has the neat effect of keeping character-based differences readable by humans."""
_KWARGS_DESCRIPTION = """
Calculates how good predictions are given some references.
Args:
predictions: a list of predictions to score. Each prediction should be a string with
tokens separated by spaces.
references: a list of reference for each prediction. Each reference should be a string with
tokens separated by spaces.
Returns:
charcut_mt: the CharCut score
Examples:
>>> charcut_mt = evaluate.load("charcut_mt")
>>> preds = ["this week the saudis denied information published in the new york times",
... "this is in fact an estimate"]
>>> refs = ["saudi arabia denied this week information published in the american new york times",
... "this is actually an estimate"]
>>> charcut_mt.compute(references=refs, predictions=preds)
{'charcut_mt': 0.1971153846153846}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Charcut(evaluate.Metric):
"""Character-based MT evaluation."""
def _info(self):
return evaluate.MetricInfo(
# This is the description that will appear on the modules page.
module_type="metric",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=[
datasets.Features(
{"predictions": Value("string", id="prediction"), "references": Value("string", id="reference")}
),
],
# Homepage of the module for documentation
homepage="https://github.com/BramVanroy/CharCut",
# Additional links to the codebase or references
codebase_urls=["https://github.com/BramVanroy/CharCut", "https://github.com/alardill/CharCut"],
)
def _compute(self, predictions: Iterable[str], references: Iterable[str]):
return {"charcut_mt": calculate_charcut(predictions, references)[0]}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
charcut>=1.1.1
---
title: chrF
emoji: 🤗
colorFrom: blue
colorTo: red
sdk: gradio
sdk_version: 3.19.1
app_file: app.py
pinned: false
tags:
- evaluate
- metric
description: >-
ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
and ChrF++ adds word n-grams as well which correlates more strongly with direct assessment. We use the implementation
that is already present in sacrebleu.
The implementation here is slightly different from sacrebleu in terms of the required input format. The length of
the references and hypotheses lists need to be the same, so you may need to transpose your references compared to
sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
See the README.md file at https://github.com/mjpost/sacreBLEU#chrf--chrf for more information.
---
# Metric Card for chrF(++)
## Metric Description
ChrF and ChrF++ are two MT evaluation metrics that use the F-score statistic for character n-gram matches. ChrF++ additionally includes word n-grams, which correlate more strongly with direct assessment. We use the implementation that is already present in sacrebleu.
While this metric is included in sacreBLEU, the implementation here is slightly different from sacreBLEU in terms of the required input format. Here, the length of the references and hypotheses lists need to be the same, so you may need to transpose your references compared to sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
See the [sacreBLEU README.md](https://github.com/mjpost/sacreBLEU#chrf--chrf) for more information.
## How to Use
At minimum, this metric requires a `list` of predictions and a `list` of `list`s of references:
```python
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly.", ], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference)
>>> print(results)
{'score': 84.64214891738334, 'char_order': 6, 'word_order': 0, 'beta': 2}
```
### Inputs
- **`predictions`** (`list` of `str`): The predicted sentences.
- **`references`** (`list` of `list` of `str`): The references. There should be one reference sub-list for each prediction sentence.
- **`char_order`** (`int`): Character n-gram order. Defaults to `6`.
- **`word_order`** (`int`): Word n-gram order. If equals to 2, the metric is referred to as chrF++. Defaults to `0`.
- **`beta`** (`int`): Determine the importance of recall w.r.t precision. Defaults to `2`.
- **`lowercase`** (`bool`): If `True`, enables case-insensitivity. Defaults to `False`.
- **`whitespace`** (`bool`): If `True`, include whitespaces when extracting character n-grams. Defaults to `False`.
- **`eps_smoothing`** (`bool`): If `True`, applies epsilon smoothing similar to reference chrF++.py, NLTK, and Moses implementations. If `False`, takes into account effective match order similar to sacreBLEU < 2.0.0. Defaults to `False`.
### Output Values
The output is a dictionary containing the following fields:
- **`'score'`** (`float`): The chrF (chrF++) score.
- **`'char_order'`** (`int`): The character n-gram order.
- **`'word_order'`** (`int`): The word n-gram order. If equals to `2`, the metric is referred to as chrF++.
- **`'beta'`** (`int`): Determine the importance of recall w.r.t precision.
The output is formatted as below:
```python
{'score': 61.576379378113785, 'char_order': 6, 'word_order': 0, 'beta': 2}
```
The chrF(++) score can be any value between `0.0` and `100.0`, inclusive.
#### Values from Popular Papers
### Examples
A simple example of calculating chrF:
```python
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly.", ], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference)
>>> print(results)
{'score': 84.64214891738334, 'char_order': 6, 'word_order': 0, 'beta': 2}
```
The same example, but with the argument `word_order=2`, to calculate chrF++ instead of chrF:
```python
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly.", ], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction,
... references=reference,
... word_order=2)
>>> print(results)
{'score': 82.87263732906315, 'char_order': 6, 'word_order': 2, 'beta': 2}
```
The same chrF++ example as above, but with `lowercase=True` to normalize all case:
```python
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly.", ], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction,
... references=reference,
... word_order=2,
... lowercase=True)
>>> print(results)
{'score': 92.12853119829202, 'char_order': 6, 'word_order': 2, 'beta': 2}
```
## Limitations and Bias
- According to [Popović 2017](https://www.statmt.org/wmt17/pdf/WMT70.pdf), chrF+ (where `word_order=1`) and chrF++ (where `word_order=2`) produce scores that correlate better with human judgements than chrF (where `word_order=0`) does.
## Citation
```bibtex
@inproceedings{popovic-2015-chrf,
title = "chr{F}: character n-gram {F}-score for automatic {MT} evaluation",
author = "Popovi{\'c}, Maja",
booktitle = "Proceedings of the Tenth Workshop on Statistical Machine Translation",
month = sep,
year = "2015",
address = "Lisbon, Portugal",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W15-3049",
doi = "10.18653/v1/W15-3049",
pages = "392--395",
}
@inproceedings{popovic-2017-chrf,
title = "chr{F}++: words helping character n-grams",
author = "Popovi{\'c}, Maja",
booktitle = "Proceedings of the Second Conference on Machine Translation",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W17-4770",
doi = "10.18653/v1/W17-4770",
pages = "612--618",
}
@inproceedings{post-2018-call,
title = "A Call for Clarity in Reporting {BLEU} Scores",
author = "Post, Matt",
booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-6319",
pages = "186--191",
}
```
## Further References
- See the [sacreBLEU README.md](https://github.com/mjpost/sacreBLEU#chrf--chrf) for more information on this implementation.
\ No newline at end of file
import evaluate
from evaluate.utils import launch_gradio_widget
module = evaluate.load("chrf")
launch_gradio_widget(module)
# Copyright 2021 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Chrf(++) metric as available in sacrebleu. """
import datasets
import sacrebleu as scb
from packaging import version
from sacrebleu import CHRF
import evaluate
_CITATION = """\
@inproceedings{popovic-2015-chrf,
title = "chr{F}: character n-gram {F}-score for automatic {MT} evaluation",
author = "Popovi{\'c}, Maja",
booktitle = "Proceedings of the Tenth Workshop on Statistical Machine Translation",
month = sep,
year = "2015",
address = "Lisbon, Portugal",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W15-3049",
doi = "10.18653/v1/W15-3049",
pages = "392--395",
}
@inproceedings{popovic-2017-chrf,
title = "chr{F}++: words helping character n-grams",
author = "Popovi{\'c}, Maja",
booktitle = "Proceedings of the Second Conference on Machine Translation",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W17-4770",
doi = "10.18653/v1/W17-4770",
pages = "612--618",
}
@inproceedings{post-2018-call,
title = "A Call for Clarity in Reporting {BLEU} Scores",
author = "Post, Matt",
booktitle = "Proceedings of the Third Conference on Machine Translation: Research Papers",
month = oct,
year = "2018",
address = "Belgium, Brussels",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W18-6319",
pages = "186--191",
}
"""
_DESCRIPTION = """\
ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
and ChrF++ adds word n-grams as well which correlates more strongly with direct assessment. We use the implementation
that is already present in sacrebleu.
The implementation here is slightly different from sacrebleu in terms of the required input format. The length of
the references and hypotheses lists need to be the same, so you may need to transpose your references compared to
sacrebleu's required input format. See https://github.com/huggingface/datasets/issues/3154#issuecomment-950746534
See the README.md file at https://github.com/mjpost/sacreBLEU#chrf--chrf for more information.
"""
_KWARGS_DESCRIPTION = """
Produces ChrF(++) scores for hypotheses given reference translations.
Args:
predictions (list of str): The predicted sentences.
references (list of list of str): The references. There should be one reference sub-list for each prediction sentence.
char_order (int): Character n-gram order. Defaults to `6`.
word_order (int): Word n-gram order. If equals to `2`, the metric is referred to as chrF++. Defaults to `0`.
beta (int): Determine the importance of recall w.r.t precision. Defaults to `2`.
lowercase (bool): if `True`, enables case-insensitivity. Defaults to `False`.
whitespace (bool): If `True`, include whitespaces when extracting character n-grams.
eps_smoothing (bool): If `True`, applies epsilon smoothing similar
to reference chrF++.py, NLTK and Moses implementations. If `False`,
it takes into account effective match order similar to sacreBLEU < 2.0.0. Defaults to `False`.
Returns:
'score' (float): The chrF (chrF++) score,
'char_order' (int): The character n-gram order,
'word_order' (int): The word n-gram order. If equals to 2, the metric is referred to as chrF++,
'beta' (int): Determine the importance of recall w.r.t precision
Examples:
Example 1--a simple example of calculating chrF:
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction, references=reference)
>>> print(results)
{'score': 84.64214891738334, 'char_order': 6, 'word_order': 0, 'beta': 2}
Example 2--the same example, but with the argument word_order=2, to calculate chrF++ instead of chrF:
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction,
... references=reference,
... word_order=2)
>>> print(results)
{'score': 82.87263732906315, 'char_order': 6, 'word_order': 2, 'beta': 2}
Example 3--the same chrF++ example as above, but with `lowercase=True` to normalize all case:
>>> prediction = ["The relationship between cats and dogs is not exactly friendly.", "a good bookshop is just a genteel black hole that knows how to read."]
>>> reference = [["The relationship between dogs and cats is not exactly friendly."], ["A good bookshop is just a genteel Black Hole that knows how to read."]]
>>> chrf = evaluate.load("chrf")
>>> results = chrf.compute(predictions=prediction,
... references=reference,
... word_order=2,
... lowercase=True)
>>> print(results)
{'score': 92.12853119829202, 'char_order': 6, 'word_order': 2, 'beta': 2}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ChrF(evaluate.Metric):
def _info(self):
if version.parse(scb.__version__) < version.parse("1.4.12"):
raise ImportWarning(
"To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
'You can install it with `pip install "sacrebleu>=1.4.12"`.'
)
return evaluate.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
homepage="https://github.com/mjpost/sacreBLEU#chrf--chrf",
inputs_description=_KWARGS_DESCRIPTION,
features=[
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Sequence(datasets.Value("string", id="sequence"), id="references"),
}
),
datasets.Features(
{
"predictions": datasets.Value("string", id="sequence"),
"references": datasets.Value("string", id="sequence"),
}
),
],
codebase_urls=["https://github.com/mjpost/sacreBLEU#chrf--chrf"],
reference_urls=[
"https://github.com/m-popovic/chrF",
],
)
def _compute(
self,
predictions,
references,
char_order: int = CHRF.CHAR_ORDER,
word_order: int = CHRF.WORD_ORDER,
beta: int = CHRF.BETA,
lowercase: bool = False,
whitespace: bool = False,
eps_smoothing: bool = False,
):
# if only one reference is provided make sure we still use list of lists
if isinstance(references[0], str):
references = [[ref] for ref in references]
references_per_prediction = len(references[0])
if any(len(refs) != references_per_prediction for refs in references):
raise ValueError(
"ChrF, as implemented by sacrebleu, requires the same number of references for each prediction"
)
transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]
sb_chrf = CHRF(char_order, word_order, beta, lowercase, whitespace, eps_smoothing)
output = sb_chrf.corpus_score(predictions, transformed_references)
return {
"score": output.score,
"char_order": output.char_order,
"word_order": output.word_order,
"beta": output.beta,
}
git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
sacrebleu
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment