import torch
from sklearn.metrics import f1_score


# Use ``torch.inference_mode`` on PyTorch >= 2.0, otherwise fall back to
# ``torch.no_grad``. The major version is parsed explicitly because raw
# string comparison orders versions lexicographically ("10.0.0" < "2.0.0").
inference_decorator = (
    torch.inference_mode
    if int(torch.__version__.split(".")[0]) >= 2
    else torch.no_grad
)
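
# Usage sketch (the ``generate`` function below is hypothetical): both
# ``torch.inference_mode`` and ``torch.no_grad`` can be applied as decorators
# that disable gradient tracking during evaluation, e.g.
#
#     @inference_decorator()
#     def generate(model, batch):
#         return model(batch)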


def _aggreg_ls(predictions):
    """
    Custom aggregation to compute corpus-level metrics for the lexical
    substitution task.

    ``predictions`` is a list of tuples ``(prec, has_answ, has_annotation)``:
    - ``prec`` is the per-sample precision before dividing by |A|;
    - ``has_answ`` is 0 if the model did not produce any answer;
    - ``has_annotation`` is 0 if the gold answer is empty (no synonyms from
      the annotators).
    """
    # |A| = number of samples with an answer, |T| = number of samples with a
    # gold annotation; both are needed for the final precision and recall
    A = sum(p[1] for p in predictions)
    T = sum(p[2] for p in predictions)
    total = sum(p[0] for p in predictions)
    # compute the final precision and recall, guarding against division by
    # zero when there are no answers or no annotations
    prec = total / max(A, 1)
    rec = total / max(T, 1)
    # compute the final F1 score as the harmonic mean of precision and recall
    f1 = 0
    if prec + rec != 0:
        f1 = (2 * prec * rec) / (prec + rec)
    return f1
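
# A minimal, hypothetical example of the expected input format:
#   _aggreg_ls([(1.0, 1, 1), (0.5, 1, 1), (0.0, 0, 1)])
# gives |A| = 2, |T| = 3, precision = 1.5 / 2 = 0.75, recall = 1.5 / 3 = 0.5,
# and F1 = 2 * 0.75 * 0.5 / (0.75 + 0.5) = 0.6.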


def _aggreg_sa_v2(predictions):
    """
    This aggregation treats the sentiment analysis task as a multiple-choice
    one with four classes. The F1 score is computed as the average of the
    per-class F1 scores, weighted by the number of samples in each class.
    See sklearn.metrics.f1_score for more details.
    """
    # split the (prediction, reference) tuples into two lists
    predictions, references = zip(*predictions)
    f1 = f1_score(references, predictions, average="weighted")
    return f1
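
# A small worked example: with references (0, 1, 3) and predictions (0, 1, 2),
# classes 0 and 1 are predicted perfectly (F1 = 1 each), class 3 is always
# missed (F1 = 0), and class 2 has no support in the references, so
#   _aggreg_sa_v2([(0, 0), (1, 1), (2, 3)])  ->  (1 + 1 + 0) / 3 ≈ 0.667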


def _aggreg_sa(predictions):
    """
    Custom aggregation function for the sentiment analysis task.
    The original task computes the F1 score for each class and then averages
    them. Since the prompt casts the task as a multiple-choice one, we need
    to aggregate the results in a different way.
    """
    # split the (prediction, reference) tuples into two lists
    predictions, references = zip(*predictions)
    # Mapping from the four multiple-choice classes back to the original
    # pair of binary labels:
    #   class 0: positive -> 'opos': 1, 'oneg': 0
    #   class 1: negative -> 'opos': 0, 'oneg': 1
    #   class 2: neutral  -> 'opos': 0, 'oneg': 0
    #   class 3: mixed    -> 'opos': 1, 'oneg': 1

    def _map_to_original_labels(x):
        """
        Return two separate lists of binary labels, one for opos and one for
        oneg. ``x`` is a list of integers in {0, 1, 2, 3}.
        """
        opos = []
        oneg = []
        for i in x:
            if i == 0:  # positive
                opos.append(1)
                oneg.append(0)
            elif i == 1:  # negative
                opos.append(0)
                oneg.append(1)
            elif i == 2:  # neutral
                opos.append(0)
                oneg.append(0)
            elif i == 3:  # mixed
                opos.append(1)
                oneg.append(1)
        return opos, oneg

    pred_opos, pred_oneg = _map_to_original_labels(predictions)
    ref_opos, ref_oneg = _map_to_original_labels(references)

    # Passing labels=[0, 1] keeps both classes in the output even when one of
    # them never occurs (its F1 is then 0), so the unpacking below is always
    # valid and each score refers to the intended class.
    opos_f1_c0, opos_f1_c1 = f1_score(
        ref_opos, pred_opos, labels=[0, 1], average=None
    )
    oneg_f1_c0, oneg_f1_c1 = f1_score(
        ref_oneg, pred_oneg, labels=[0, 1], average=None
    )

    # average F1 score over the two classes of each binary task (opos, oneg)
    f1_score_opos = (opos_f1_c0 + opos_f1_c1) / 2
    f1_score_oneg = (oneg_f1_c0 + oneg_f1_c1) / 2
    # average F1 score over the two binary tasks
    f1_final = (f1_score_opos + f1_score_oneg) / 2

    return f1_final
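
# A small worked example: _aggreg_sa([(0, 0), (1, 3)]) maps the predictions
# (0, 1) to opos = [1, 0], oneg = [0, 1] and the references (0, 3) to
# opos = [1, 1], oneg = [0, 1]. The oneg task is predicted perfectly
# (per-class F1 = 1), while opos gets per-class F1 scores of 0 and 2/3, so
# the final score is ((0 + 2/3) / 2 + 1) / 2 ≈ 0.667.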


def _aggreg_ner(predictions):
    pred, ref = zip(*predictions)
    # flatten the per-sample label sequences into two flat lists
    all_pred = [label for p in pred for label in p]
    all_ref = [label for r in ref for label in r]
    # compute the per-class F1 scores
    f1 = f1_score(all_ref, all_pred, average=None)
    # macro-average over all classes except the last one (presumably the
    # "no entity" class), unless only a single class is present
    if len(f1) > 1:
        f1_avg = sum(f1[:-1]) / (len(f1) - 1)
    else:
        f1_avg = f1[0]

    return f1_avg
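
# A small worked example with one (pred_seq, ref_seq) pair of token labels:
#   _aggreg_ner([([0, 1, 2], [0, 2, 2])])
# flattens to y_pred = [0, 1, 2], y_true = [0, 2, 2]; the per-class F1 scores
# are [1.0, 0.0, 2/3], and dropping the last class gives (1.0 + 0.0) / 2 = 0.5.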


def _aggreg_rel(predictions):
    pred, ref = zip(*predictions)
    # flatten the per-sample label sequences into two flat lists
    all_pred = [label for p in pred for label in p]
    all_ref = [label for r in ref for label in r]
    # compute the macro-averaged F1 score over all classes
    f1 = f1_score(all_ref, all_pred, average="macro")
    return f1


# ------------------------ DOCUMENT DATING ---------------------------


def _aggreg_dd(items):
    # split the (gold, prediction) tuples into two lists
    golds, preds = zip(*items)
    fscore = f1_score(golds, preds, average="macro")
    return fscore