# overall_indicator.py
# Computes overall evaluation indicators for PDF-extraction results
# by comparing a test run against a standard (ground-truth) dataset.
import json
import pandas as pd
import numpy as np
import re
from nltk.translate.bleu_score import sentence_bleu
import time
import argparse
import os
from sklearn.metrics import classification_report,confusion_matrix
from collections import Counter
from sklearn import metrics
from pandas import isnull


def _parse_mid_json_fields(mid_json_series):
    """Flatten the per-page fields out of a series of mid_json documents.

    Each element of *mid_json_series* is one document's mid_json payload
    (a mapping that pandas turns into a row-labelled frame). Returns a
    dict of parallel lists:
      - 'inline_equations' / 'interline_equations': per page, latex strings
      - 'dropped_text_bboxes' / 'dropped_text_tag': per page, aligned lists
      - 'dropped_image_bboxes' / 'dropped_table_bboxes': per page
      - 'preproc_num': per page, reading-order numbers
      - 'para_num': per page, paragraph count
      - 'para_text': per document, list of paragraph texts
    """
    fields = {
        'inline_equations': [],
        'interline_equations': [],
        'dropped_text_bboxes': [],
        'dropped_text_tag': [],
        'dropped_image_bboxes': [],
        'dropped_table_bboxes': [],
        'preproc_num': [],  # reading order
        'para_num': [],
        'para_text': [],
    }
    for doc in mid_json_series:
        mid_json = pd.DataFrame(doc)
        # drop the trailing column (not page content) as the original did
        mid_json = mid_json.iloc[:, :-1]
        for page in mid_json.loc['inline_equations', :]:
            fields['inline_equations'].append([eq['latex_text'] for eq in page])
        for page in mid_json.loc['interline_equations', :]:
            fields['interline_equations'].append([eq['latex_text'] for eq in page])
        for page in mid_json.loc['droped_text_block', :]:
            fields['dropped_text_bboxes'].append([blk['bbox'] for blk in page])
            # blocks without a 'tag' key get the placeholder string 'None'
            fields['dropped_text_tag'].append([blk.get('tag', 'None') for blk in page])
        for page in mid_json.loc['droped_image_block', :]:
            fields['dropped_image_bboxes'].append(page)
        for page in mid_json.loc['droped_table_block', :]:
            fields['dropped_table_bboxes'].append(page)
        for page in mid_json.loc['preproc_blocks', :]:
            fields['preproc_num'].append([blk['number'] for blk in page])
        doc_text = []
        for page in mid_json.loc['para_blocks', :]:
            fields['para_num'].append(len(page))
            doc_text.extend(blk['text'] for blk in page)
        fields['para_text'].append(doc_text)
    return fields


def _classification_metrics(y_true, y_pred):
    """Accuracy/precision/recall/F1 for parallel binary indicator lists."""
    return {
        'accuracy': metrics.accuracy_score(y_true, y_pred),
        'precision': metrics.precision_score(y_true, y_pred),
        'recall': metrics.recall_score(y_true, y_pred),
        'f1_score': metrics.f1_score(y_true, y_pred),
    }


def _equation_metrics(test_pages, standard_pages):
    """Mean edit distance and BLEU between per-page concatenated equations.

    The standard text is the BLEU reference and the test text the
    hypothesis. Pages where both sides are empty are skipped; identical
    pages count as distance 0 / BLEU 1 without invoking sentence_bleu.
    """
    dists, bleus = [], []
    test_joined = [''.join(page) for page in test_pages]
    standard_joined = [''.join(page) for page in standard_pages]
    for t, s in zip(test_joined, standard_joined):
        if not t and not s:
            continue
        if t == s:
            dists.append(0)
            bleus.append(1)
        else:
            dists.append(Levenshtein_Distance(t, s))
            bleus.append(sentence_bleu([s], t))
    return np.mean(dists), np.mean(bleus)


def _bbox_match_flags(test_pages, standard_pages):
    """Match dropped-block bboxes page by page (images/tables).

    Returns ``(standard_flags, test_flags)``: parallel 0/1 lists where
    each standard box contributes a (1, matched?) pair and each surplus
    test box a (0, 1) pair. Malformed standard boxes (len != 4 — data
    format inconsistencies exist in the corpus) are skipped. Pages empty
    on both sides are ignored.
    """
    test_flags, standard_flags = [], []
    for t_page, s_page in zip(test_pages, standard_pages):
        if len(t_page) == 0 and len(s_page) == 0:
            continue
        page_test, page_standard = [], []
        for s_box in s_page:
            if len(s_box) != 4:
                continue
            page_standard.append(1)
            matched = any(bbox_offset(s_box, t_box) for t_box in t_page)
            page_test.append(1 if matched else 0)
        # the test run deleted more blocks than the standard: count the
        # surplus as false positives
        surplus = len(t_page) + page_test.count(0) - len(s_page)
        if surplus > 0:
            page_test.extend([1] * surplus)
            page_standard.extend([0] * surplus)
        test_flags.extend(page_test)
        standard_flags.extend(page_standard)
    return standard_flags, test_flags


def indicator_cal(json_standard, json_test):
    """Compute overall quality indicators of a test extraction run against
    the standard (ground-truth) dataset.

    Parameters
    ----------
    json_standard : list[dict]
        Ground-truth records, each with 'id', 'mid_json' and 'pass_label'.
    json_test : list[dict]
        Records from the pipeline under test, each with 'id' and 'mid_json'.

    Returns
    -------
    pandas.DataFrame
        A single-row frame; each column holds one indicator value or a
        metrics dict (column names kept in Chinese to preserve the
        existing output format).

    Fixes vs. the original:
    - The merged mid_json columns were labelled with test/standard
      swapped (``mid_json_x`` comes from the test frame), inverting the
      dataset-level precision/recall.
    - ``x = x.remove('None')`` rebound the tag list to ``None`` (crashing
      ``Counter``) and removed only one placeholder.
    """
    json_standard = pd.DataFrame(json_standard)
    json_test = pd.DataFrame(json_test)

    # ---------- dataset-level detection metrics ----------
    a = json_test[['id', 'mid_json']]
    b = json_standard[['id', 'mid_json', 'pass_label']]

    # NOTE(fix): after the merge the FIRST mid_json column comes from `a`
    # (the test set); it must be labelled test_mid_json, not standard.
    outer_merge = pd.merge(a, b, on='id', how='outer')
    outer_merge.columns = ['id', 'test_mid_json', 'standard_mid_json', 'pass_label']

    standard_exist = outer_merge.standard_mid_json.apply(lambda x: not isnull(x))
    test_exist = outer_merge.test_mid_json.apply(lambda x: not isnull(x))

    overall_report = _classification_metrics(standard_exist, test_exist)

    # Only ids present in both sets are compared in detail below.
    inner_merge = pd.merge(a, b, on='id', how='inner')
    inner_merge.columns = ['id', 'test_mid_json', 'standard_mid_json', 'pass_label']

    # ---------- flatten both sides' mid_json payloads ----------
    test = _parse_mid_json_fields(inner_merge['test_mid_json'])
    standard = _parse_mid_json_fields(inner_merge['standard_mid_json'])

    # ---------- document-level edit distance / BLEU ----------
    # only computed over positively-labelled documents
    pass_mask = (inner_merge['pass_label'] == 'yes').values
    test_para_text = np.asarray(test['para_text'], dtype=object)[pass_mask]
    standard_para_text = np.asarray(standard['para_text'], dtype=object)[pass_mask]

    pdf_dis, pdf_bleu = [], []
    for t_doc, s_doc in zip(test_para_text, standard_para_text):
        t_paras = [''.join(p) for p in t_doc]
        s_paras = [''.join(p) for p in s_doc]
        pdf_dis.append(Levenshtein_Distance(t_paras, s_paras))
        # standard paragraphs are the BLEU reference
        pdf_bleu.append(sentence_bleu([s_paras], t_paras))
    overall_report['pdf间的平均编辑距离'] = np.mean(pdf_dis)
    overall_report['pdf间的平均bleu'] = np.mean(pdf_bleu)

    # ---------- inline / interline equation metrics ----------
    inline_equations_edit, inline_equations_bleu = _equation_metrics(
        test['inline_equations'], standard['inline_equations'])
    interline_equations_edit, interline_equations_bleu = _equation_metrics(
        test['interline_equations'], standard['interline_equations'])

    # ---------- dropped text_block bbox and tag matching ----------
    test_text_bbox, standard_text_bbox = [], []
    test_tag, standard_tag = [], []
    for index, (t_boxes, s_boxes) in enumerate(
            zip(test['dropped_text_bboxes'], standard['dropped_text_bboxes'])):
        if len(t_boxes) == 0 and len(s_boxes) == 0:
            continue
        t_tags = test['dropped_text_tag'][index]
        s_tags = standard['dropped_text_tag'][index]
        page_test_tag, page_standard_tag = [], []
        page_test_bbox, page_standard_bbox = [], []
        for i, s_box in enumerate(s_boxes):
            page_standard_tag.append(s_tags[i])
            page_standard_bbox.append(1)
            matched = False
            for j, t_box in enumerate(t_boxes):
                if bbox_offset(s_box, t_box):
                    matched = True
                    page_test_tag.append(t_tags[j])
                    page_test_bbox.append(1)
                    break
            if not matched:
                page_test_tag.append('None')
                page_test_bbox.append(0)

        # the test run deleted more blocks than the standard has
        if len(t_tags) + page_test_tag.count('None') > len(s_tags):
            # NOTE(fix): the original wrote `x = x.remove('None')`, which
            # rebinds x to None (list.remove returns None) and would crash
            # Counter(); it also dropped only a single 'None' placeholder.
            matched_tags = [tag for tag in page_test_tag if tag != 'None']
            surplus = list((Counter(t_tags) - Counter(matched_tags)).elements())
            page_test_tag.extend(surplus)
            page_standard_tag.extend(['None'] * len(surplus))
            page_test_bbox.extend([1] * len(surplus))
            page_standard_bbox.extend([0] * len(surplus))

        test_tag.extend(page_test_tag)
        standard_tag.extend(page_standard_tag)
        test_text_bbox.extend(page_test_bbox)
        standard_text_bbox.extend(page_standard_bbox)

    text_block_report = _classification_metrics(standard_text_bbox, test_text_bbox)

    # per-tag precision/recall/F1 for deleted text blocks; drop the
    # placeholder class and the aggregate rows. pop() tolerates absence
    # where `del` would raise KeyError.
    text_block_tag_report = classification_report(
        y_true=standard_tag, y_pred=test_tag, output_dict=True)
    text_block_tag_report.pop('None', None)
    text_block_tag_report.pop('macro avg', None)
    text_block_tag_report.pop('weighted avg', None)

    # ---------- dropped image_block / table_block bbox matching ----------
    standard_image_bbox, test_image_bbox = _bbox_match_flags(
        test['dropped_image_bboxes'], standard['dropped_image_bboxes'])
    image_block_report = _classification_metrics(standard_image_bbox, test_image_bbox)

    standard_table_bbox, test_table_bbox = _bbox_match_flags(
        test['dropped_table_bboxes'], standard['dropped_table_bboxes'])
    table_block_report = _classification_metrics(standard_table_bbox, test_table_bbox)

    # ---------- reading-order edit distance ----------
    preproc_num_edit = np.mean([
        Levenshtein_Distance(t, s)
        for t, s in zip(test['preproc_num'], standard['preproc_num'])
    ])

    # ---------- per-page paragraph-count accuracy ----------
    acc_para = np.mean(np.array(test['para_num']) == np.array(standard['para_num']))

    # ---------- assemble the single-row output frame ----------
    output = pd.DataFrame()
    output['总体指标'] = [overall_report]
    output['行内公式平均编辑距离'] = [inline_equations_edit]
    output['行间公式平均编辑距离'] = [interline_equations_edit]
    output['行内公式平均bleu'] = [inline_equations_bleu]
    output['行间公式平均bleu'] = [interline_equations_bleu]
    output['阅读顺序平均编辑距离'] = [preproc_num_edit]
    output['分段准确率'] = [acc_para]
    output['删除的text block的相关指标'] = [text_block_report]
    output['删除的image block的相关指标'] = [image_block_report]
    output['删除的table block的相关指标'] = [table_block_report]
    output['删除的text block的tag相关指标'] = [text_block_tag_report]

    return output

def Levenshtein_Distance(str1, str2):
    """Compute the Levenshtein (edit) distance between two sequences.

    Works on any indexable sequences (strings or lists) whose elements
    support ``==``. Uses a rolling two-row DP instead of the original
    full (len1+1) x (len2+1) matrix, cutting memory from O(n*m) to O(m)
    while producing identical results.

    Returns the minimum number of insertions, deletions and
    substitutions needed to turn *str1* into *str2*.
    """
    # prev[j] = distance between str1[:i-1] and str2[:j]
    prev = list(range(len(str2) + 1))
    for i in range(1, len(str1) + 1):
        curr = [i] + [0] * len(str2)
        for j in range(1, len(str2) + 1):
            cost = 0 if str1[i - 1] == str2[j - 1] else 1
            curr[j] = min(prev[j] + 1,        # deletion
                          curr[j - 1] + 1,    # insertion
                          prev[j - 1] + cost)  # substitution / match
        prev = curr
    return prev[len(str2)]


def bbox_offset(b_t, b_s):
    """Return True when two bboxes overlap closely enough to be a match.

    Parameters
    ----------
    b_t : sequence
        (x1, y1, x2, y2) bbox from the test doc.
    b_s : sequence
        (x1, y1, x2, y2) bbox from the standard doc.

    A match means the boxes are identical, or the intersection area is
    more than 95% of the remaining (non-intersecting) part of the union.

    Fix vs. the original: when the boxes do not intersect, (x2-x1) and
    (y2-y1) can both be negative, so their product was a positive bogus
    "overlap" that could report a match for disjoint boxes (e.g.
    (0,0,1,1) vs (2,2,3,3) returned True). Disjoint boxes now return
    False immediately.
    """
    x1_t, y1_t, x2_t, y2_t = b_t
    x1_s, y1_s, x2_s, y2_s = b_s
    x1 = max(x1_t, x1_s)
    x2 = min(x2_t, x2_s)
    y1 = max(y1_t, y1_s)
    y2 = min(y2_t, y2_s)
    if x2 <= x1 or y2 <= y1:
        # empty intersection: cannot be a match
        return False
    area_overlap = (x2 - x1) * (y2 - y1)
    # inclusion-exclusion: union = area_t + area_s - overlap
    area_union = ((x2_t - x1_t) * (y2_t - y1_t)
                  + (x2_s - x1_s) * (y2_s - y1_s)
                  - area_overlap)
    remainder = area_union - area_overlap
    return remainder == 0 or area_overlap / remainder > 0.95




   
parser = argparse.ArgumentParser()
parser.add_argument('--test', type=str)
parser.add_argument('--standard', type=str)
args = parser.parse_args()
pdf_json_test = args.test
pdf_json_standard = args.standard



if __name__ == '__main__':
    
   pdf_json_test = [json.loads(line) 
                        for line in open(pdf_json_test, 'r', encoding='utf-8')]
   pdf_json_standard = [json.loads(line) 
                    for line in open(pdf_json_standard, 'r', encoding='utf-8')]
   
   overall_indicator=indicator_cal(pdf_json_standard,pdf_json_test)

   '''计算的指标输出到overall_indicator_output.json中'''
   overall_indicator.to_json('overall_indicator_output.json',orient='records',lines=True,force_ascii=False)