metric.py 1.69 KB
Newer Older
lintangsutawika's avatar
lintangsutawika committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import evaluate
from functools import partial


# Lazily-initialised handle to the loaded "squad_v2" metric. It is shared by
# every call so the metric is loaded at most once per process.
_SQUAD_V2_METRIC = None


def _squad_metric(predictions, references):
    """Score predictions against references with the ``squad_v2`` metric.

    The metric object is obtained through ``evaluate.load`` on first use and
    reused afterwards. Reloading it on every call is wasteful because each
    per-key helper in this module triggers its own call.

    Args:
        predictions: Passed through unchanged as ``predictions=`` to the
            metric's ``compute`` method.
        references: Passed through unchanged as ``references=`` to the
            metric's ``compute`` method.

    Returns:
        Whatever ``compute`` returns; callers in this module read individual
        scores from it with ``.get``.
    """
    global _SQUAD_V2_METRIC
    if _SQUAD_V2_METRIC is None:
        _SQUAD_V2_METRIC = evaluate.load("squad_v2")
    return _SQUAD_V2_METRIC.compute(predictions=predictions, references=references)

# Exact match (the normalized answer exactly matches the gold answer)
def exact(predictions, references):
    """Return the ``"exact"`` score reported by the SQuAD v2 metric.

    Args:
        predictions: Forwarded as-is to ``_squad_metric``.
        references: Forwarded as-is to ``_squad_metric``.

    Returns:
        The value stored under ``"exact"`` in the metric result, or ``0``
        when the result has no such key.
    """
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get("exact", 0)

# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    """Return the ``"f1"`` score reported by the SQuAD v2 metric.

    Args:
        predictions: Forwarded as-is to ``_squad_metric``.
        references: Forwarded as-is to ``_squad_metric``.

    Returns:
        The value stored under ``"f1"`` in the metric result, or ``0`` when
        the result has no such key.
    """
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get("f1", 0)

# Exact match (the normalized answer exactly matches the gold answer), restricted to questions that have an answer
def HasAns_exact(predictions, references):
    """Return the ``"HasAns_exact"`` score reported by the SQuAD v2 metric.

    Args:
        predictions: Forwarded as-is to ``_squad_metric``.
        references: Forwarded as-is to ``_squad_metric``.

    Returns:
        The value stored under ``"HasAns_exact"`` in the metric result, or
        ``0`` when the result has no such key.
        NOTE(review): the key is presumably absent when no reference has an
        answer, which is what the ``0`` fallback covers - confirm against
        the metric card.
    """
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get("HasAns_exact", 0)

# The F-score of predicted tokens versus the gold answer, restricted to questions that have an answer
def HasAns_f1(predictions, references):
    """Return the ``"HasAns_f1"`` score reported by the SQuAD v2 metric.

    Args:
        predictions: Forwarded as-is to ``_squad_metric``.
        references: Forwarded as-is to ``_squad_metric``.

    Returns:
        The value stored under ``"HasAns_f1"`` in the metric result, or ``0``
        when the result has no such key.
        NOTE(review): the key is presumably absent when no reference has an
        answer, which is what the ``0`` fallback covers - confirm against
        the metric card.
    """
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get("HasAns_f1", 0)

# Exact match (the normalized answer exactly matches the gold answer), restricted to questions that have no answer
def NoAns_exact(predictions, references):
    """Return the ``"NoAns_exact"`` score reported by the SQuAD v2 metric.

    Args:
        predictions: Forwarded as-is to ``_squad_metric``.
        references: Forwarded as-is to ``_squad_metric``.

    Returns:
        The value stored under ``"NoAns_exact"`` in the metric result, or
        ``0`` when the result has no such key.
        NOTE(review): the key is presumably absent when every reference has
        an answer, which is what the ``0`` fallback covers - confirm against
        the metric card.
    """
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get("NoAns_exact", 0)

# The F-score of predicted tokens versus the gold answer, restricted to questions that have no answer
def NoAns_f1(predictions, references):
    """Return the ``"NoAns_f1"`` score reported by the SQuAD v2 metric.

    Args:
        predictions: Forwarded as-is to ``_squad_metric``.
        references: Forwarded as-is to ``_squad_metric``.

    Returns:
        The value stored under ``"NoAns_f1"`` in the metric result, or ``0``
        when the result has no such key.
        NOTE(review): the key is presumably absent when every reference has
        an answer, which is what the ``0`` fallback covers - confirm against
        the metric card.
    """
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get("NoAns_f1", 0)

# Best exact match (with varying threshold)
def best_exact(predictions, references):
    """Return the ``"best_exact"`` score reported by the SQuAD v2 metric.

    Args:
        predictions: Forwarded as-is to ``_squad_metric``.
        references: Forwarded as-is to ``_squad_metric``.

    Returns:
        The value stored under ``"best_exact"`` in the metric result, or
        ``0`` when the result has no such key.
    """
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get("best_exact", 0)

# Best F1 (with varying threshold)
def best_f1(predictions, references):
    """Return the ``"best_f1"`` score reported by the SQuAD v2 metric.

    Args:
        predictions: Forwarded as-is to ``_squad_metric``.
        references: Forwarded as-is to ``_squad_metric``.

    Returns:
        The value stored under ``"best_f1"`` in the metric result, or ``0``
        when the result has no such key.
    """
    scores = _squad_metric(predictions=predictions, references=references)
    return scores.get("best_f1", 0)