# pdf2text_recogFootnoteLine.py
# (Web-viewer residue removed here: the "31.4 KB" size banner, the "Newer / Older"
#  blame controls, the committer caption for 赵小蒙, and the line-number gutter 1-673.)
import io
import re
import os
import json
from libs.boxbase import _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
from libs.commons import fitz
from fitz import Point
from pprint import pprint
import pickle
import collections
from typing import List


def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float, L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """
    Return the overlap area of two rectangles as a fraction of each rectangle's own area.

    Each rectangle is given as left, upper, right, lower (L, U, R, D).

    :return: (overlap / area of rect 1, overlap / area of rect 2).
             (0, 0) when the rectangles are disjoint or either has zero area.
    """
    # Edges of the intersection rectangle (if any).
    inter_left, inter_right = max(L1, L2), min(R1, R2)
    inter_top, inter_bottom = max(U1, U2), min(D1, D2)
    if inter_right < inter_left or inter_bottom < inter_top:
        return 0, 0

    area_1 = (R1 - L1) * (D1 - U1)
    area_2 = (R2 - L2) * (D2 - U2)
    if area_1 == 0 or area_2 == 0:
        return 0, 0  # degenerate rectangle: avoid dividing by zero

    inter_area = (inter_right - inter_left) * (inter_bottom - inter_top)
    return inter_area / area_1, inter_area / area_2

def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    """
    Return the overlap of two 1-D intervals as a fraction of each interval's own length.

    :return: (overlap / length of interval 1, overlap / length of interval 2).
             (0, 0) when the intervals are disjoint or either has zero length.
    """
    overlap_start = max(L1, L2)
    overlap_end = min(R1, R2)
    # Disjoint intervals, or a zero-length interval that would cause a division by zero.
    if overlap_start > overlap_end or L1 == R1 or L2 == R2:
        return 0, 0
    shared = overlap_end - overlap_start
    return shared / (R1 - L1), shared / (R2 - L2)


def parse_footnoteLine(page_ID: int, page: fitz.Page, json_from_DocXchain_obj, exclude_bboxes):
    """
    Find footnote separator lines on one page and the text boxes that hang below them.

    The search runs in two passes over the page's vector drawings:
      1. thin horizontal rects in the lower half of the page that survive a
         series of "is this part of a table / an underline" filters;
      2. thin horizontal rects in the bottom 15% of the page.
    Surviving lines are then checked against ``exclude_bboxes``, against the
    left edge of the surrounding text, and finally expanded into the text
    block boxes that lie under them.

    :param page_ID: index of the page in the document. Not used by this function.
    :param page: page object; ``get_pixmap``, ``get_text`` and ``get_cdrawings`` are called on it.
    :param json_from_DocXchain_obj: layout-model output for this page. Not used by this
        function (an earlier, now removed, variant read column boxes from it).
    :param exclude_bboxes: iterable of (L, U, R, D) boxes; a candidate line lying inside
        one of them is discarded (presumably figure/table/equation boxes - confirm with caller).
    :return: de-duplicated list of (L, U, R, D) tuples. For every accepted line it contains
        the line itself, each text-block bbox attributed to the footnote, and the union bbox
        of those blocks. Empty list when nothing is found.
    """
    DPI = 72  # resolution used to render the pixmap the page size is read from
    pix = page.get_pixmap(dpi=DPI)
    pageL = 0
    pageR = int(pix.w)
    pageU = 0
    pageD = int(pix.h)

    #---------------------- text spans (PyMuPDF) --------------------#
    textSize_freq = collections.defaultdict(float)        # font size -> number of characters set in it
    textLine_bboxs = []                                   # one normalised bbox per span
    text_blocks = page.get_text(
            "dict",
            flags=fitz.TEXTFLAGS_TEXT,
        )["blocks"]
    for block in text_blocks:
        for line in block['lines']:
            for span in line['spans']:
                L, U, R, D = span['bbox']
                L, R = min(L, R), max(L, R)
                U, D = min(U, D), max(U, D)
                textLine_bboxs.append((L, U, R, D))
                textSize_freq[span['size']] += len(span['text'])

    textLine_bboxs.sort(key = lambda LURD: (LURD[0], LURD[1]))

    # Font size that covers the most characters; stays 0 when the page has no text.
    max_sizeFreq = 0
    textSize_withMaxFreq = 0
    for size, freq in textSize_freq.items():
        if freq > max_sizeFreq:
            max_sizeFreq = freq
            textSize_withMaxFreq = size
    #**********************************************************#

    #------------------ thin, long horizontal drawings -----------------#
    # Read the drawings once; both passes below filter this shared candidate list.
    candidate_rects = []
    for drawing in page.get_cdrawings():
        try:
            L, U, R, D = drawing['rect']
            # At most 3 units tall and at least 1/15 of the page width long.
            is_long_horizontal = U <= D and D - U <= 3 and (pageR - pageL) / 15 <= R - L
        except (KeyError, TypeError, ValueError):
            continue                    # malformed drawing entry: skip it, keep scanning
        if is_long_horizontal:
            candidate_rects.append((L, U, R, D))

    # Pass 1 ignores lines in the top 10% / bottom 6.25% of the page:
    # those are very likely header or footer rules.
    horizon_lines = [LURD for LURD in candidate_rects
                     if 80/800 * pageD <= LURD[1] <= 750/800 * pageD]
    horizon_lines.sort(key = lambda LURD: (LURD[1]))
    #********************************************************#

    def has_text_below_line(L: float, U: float, R: float, D: float, inLowerArea: bool) -> bool:
        """
        Decide whether the line has text directly beneath it that looks like footnote text.

        Spans are examined in a band one dominant-font-size tall below the line and in
        an equal band above it. Returns False when the line sits closer to the text
        above than to the text below (it is then treated as an underline).

        :param inLowerArea: True applies the character-mix rule used for lines near the page
            bottom; False additionally rejects lines that have a lot of text to their left
            and requires at least 10 counted characters below the line.
        """
        Uu, Du = U - textSize_withMaxFreq, U        # band just above the line
        Ud, Dd = U, U + textSize_withMaxFreq        # band just below the line
        leftTextCnt = 0                 # area of spans lying clearly to the left of the line
        # NOTE(review): str.isalpha() is also True for CJK characters, so this counter is not
        # limited to English letters despite its name. Left unchanged on purpose.
        English_alpha_cnt = 0
        nonEnglish_alpha_cnt = 0        # everything that is not space/digit/punctuation/alpha
        digit_cnt = 0

        distance_nearest_up_line = None
        distance_nearest_down_line = None

        for block in text_blocks:
            L0, U0, R0, D0 = block['bbox']
            if 0 < (R0 - L0) < pageR / 6 and (D0 - U0) / (R0 - L0) > 10:
                continue                # very narrow vertical strip, e.g. the arXiv stamp in the left margin
            for line in block['lines']:
                for span in line['spans']:
                    L2, U2, R2, D2 = span['bbox']
                    L2, R2 = min(L2, R2), max(L2, R2)
                    U2, D2 = min(U2, D2), max(U2, D2)
                    if L > 0 and L2 < L and (L - L2) / L > 0.2:
                        leftTextCnt += abs(R2 - L2) * abs(D2 - U2)
                        continue

                    span_mid_y = (U2 + D2) / 2
                    ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)
                    horizontally_matched = max(ratio_3, ratio_4) > 0.8

                    ## text below the line
                    ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(Ud, Dd, U2, D2)
                    if U < span_mid_y and ratio_1 > 0 and ratio_2 > 0 and horizontally_matched:
                        distance_nearest_down_line = span_mid_y - U
                        for c in span['text']:
                            if c == ' ':
                                continue
                            elif c.isdigit():
                                digit_cnt += 1
                            elif c in ',.:!?[]()%,。、!?:【】()《》-':
                                pass                    # common punctuation is not counted
                            elif c.isalpha():
                                English_alpha_cnt += 1
                            else:
                                nonEnglish_alpha_cnt += 1

                    ## text above the line
                    ratio_5, ratio_6 = calculate_overlapRatio_between_line1_and_line2(Uu, Du, U2, D2)
                    if span_mid_y < U and ratio_5 > 0 and ratio_6 > 0 and horizontally_matched:
                        distance_nearest_up_line = U - span_mid_y

        if distance_nearest_up_line is not None and distance_nearest_down_line is not None:
            if distance_nearest_up_line * 1.5 < distance_nearest_down_line:
                return False            # closer to the text above: an underline, not a footnote rule

        counted_chars = English_alpha_cnt + nonEnglish_alpha_cnt + digit_cnt
        ## Lines higher up also take text to their left into account; lines near the bottom do not.
        if not inLowerArea:
            if leftTextCnt >= 2000/500000 * pageR * pageD:
                return False
            return counted_chars >= 10
        ## Rule for lines near the bottom of the page.
        if counted_chars == 0:
            return False
        if (English_alpha_cnt + digit_cnt) / counted_chars > 0.5:
            if nonEnglish_alpha_cnt / counted_chars > 0.4:
                return False
            else:
                return True
        return True

    #----------------- pairs of lines that probably belong to a table ------------------#
    visited = [False for _ in range(len(horizon_lines))]
    for i in range(len(horizon_lines)):
        for j in range(i + 1, len(horizon_lines)):
            L1, U1, R1, D1 = horizon_lines[i]
            L2, U2, R2, D2 = horizon_lines[j]

            ## Same height and horizontally close: drop both.
            if L1 > L2:
                L1, U1, R1, D1, L2, U2, R2, D2 = L2, U2, R2, D2, L1, U1, R1, D1
            in_horizontal_line_flag = (max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5) and (L2 - R1 <= pageR/10)
            if in_horizontal_line_flag:
                visited[i] = True
                visited[j] = True

            ## Stacked with matching horizontal extent (a table, or a document that rules a lot):
            ## drop the upper one. The lower one is deliberately left alone.
            L1, U1, R1, D1 = horizon_lines[i]
            L2, U2, R2, D2 = horizon_lines[j]
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            in_vertical_line_flag = (ratio_1 > 0.9 and ratio_2 > 0.9) or (max(ratio_1, ratio_2) > 0.95)
            if in_vertical_line_flag:
                visited[i] = True
            else:
                # ratio_1 > 0 guarantees R1 != L1, so the division is safe.
                if ratio_1 > 0 and (R2 - L2) / (R1 - L1) > 1:
                    visited[i] = True
    horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if not visited[i]]
    #*****************************************************************#

    #------- lines in the upper half of the page are never footnote rules -------#
    THRESHOLD = (pageD - pageU) * 0.5
    horizon_lines = [LURD for LURD in horizon_lines if not (LURD[1] < THRESHOLD)]
    #******************************************************#

    #--------------- of two lines that still overlap horizontally, drop the upper ---------------#
    visited = [False for _ in range(len(horizon_lines))]
    for i, (L1, U1, R1, D1) in enumerate(horizon_lines):
        for j in range(i + 1, len(horizon_lines)):
            L2, U2, R2, D2 = horizon_lines[j]
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
            if (ratio_1 > 0.2 and ratio_2 > 0.2) or max(ratio_1, ratio_2) > 0.7:
                visited[i] = True
    horizon_lines = [horizon_lines[i] for i in range(len(horizon_lines)) if not visited[i]]
    #********************************************************#

    ## Keep only lines with text right below them, then at most the two lowest.
    horizon_lines = [LURD for LURD in horizon_lines if has_text_below_line(*(LURD), True)]
    horizon_lines = horizon_lines[-2:]

    #----------------------------------------------------- pass 2 -----------------------------------------------------------#
    #------------- lines at the very bottom of the page, selected by position only (covers right-column cases) --------------#
    down_horizon_lines = [LURD for LURD in candidate_rects if LURD[1] > pageD * 0.85]
    down_horizon_lines.sort(key = lambda LURD: (LURD[0], LURD[2], LURD[1]))
    visited = [False for _ in range(len(down_horizon_lines))]
    for i in range(len(down_horizon_lines) - 1):
        L1, U1, R1, D1 = down_horizon_lines[i]
        L2, U2, R2, D2 = down_horizon_lines[i + 1]
        ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
        # Neighbouring, (almost) non-overlapping segments less than a third of the page apart: drop both.
        if ratio_1 <= 0.1 and ratio_2 <= 0.1:
            if L2 - R1 <= pageR / 3:
                visited[i] = True
                visited[i + 1] = True
    down_horizon_lines = [down_horizon_lines[i] for i in range(len(down_horizon_lines)) if not visited[i]]
    down_horizon_lines = [LURD for LURD in down_horizon_lines if has_text_below_line(*(LURD), True)]

    # Merge both passes. Sort by vertical position before keeping the last two so the
    # selection is deterministic (the two lowest lines) instead of depending on set order.
    footnoteLines = sorted(set(horizon_lines + down_horizon_lines), key = lambda LURD: (LURD[1], LURD[0]))
    footnoteLines = footnoteLines[-2:]

    #-------------------------- final check: is the line inside an excluded box? ------------------------------#
    def line_in_specialBboxes(L: float, U: float, R: float, D: float, specialBboxes) -> bool:
        """Return True when the line lies vertically inside a box and more than 60% of its length overlaps it."""
        L2, U2, R2, D2 = L, U, R, D     # the line under test
        for L1, U1, R1, D1 in specialBboxes:
            if U1 <= U2 <= D2 < D1:
                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
                if ratio_1 > 0 and ratio_2 > 0.6:
                    return True
        return False

    footnoteLines = [LURD for LURD in footnoteLines if not line_in_specialBboxes(*(LURD), exclude_bboxes)]

    #--------------- is the line aligned with the left edge of the surrounding text? ------------------#
    def check_isOnTheLeftOfColumn(L: float, U: float, R: float, D: float) -> bool:
        """
        Return True when at least 4 spans near the line start close to the line's left end.

        A span counts when it vertically intersects the window from 100/800 of the page
        height above the line to 50/800 below it (clamped to 2%..98% of the page height),
        horizontally overlaps the line by more than 0.8 (of either length), and its left
        edge is within 20/700 of the page width of ``L - textSize_withMaxFreq``.
        """
        LL = L - textSize_withMaxFreq
        UU = max(pageD * 0.02, U - 100/800 * pageD)
        DD = min(U + 50/800 * pageD, pageD * 0.98)

        cnt = 0
        for L2, U2, R2, D2 in textLine_bboxs:
            ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(UU, DD, U2, D2)
            ratio_3, ratio_4 = calculate_overlapRatio_between_line1_and_line2(L, R, L2, R2)
            if ratio_1 > 0 and ratio_2 > 0:
                if max(ratio_3, ratio_4) > 0.8:
                    # Spans that are not left-aligned are simply not counted; rejecting on them
                    # would be wrong because a lone footnote marker can be its own span.
                    if abs(LL - L2) <= 20/700 * pageR:
                        cnt += 1
        return cnt >= 4

    footnoteLines = [LURD for LURD in footnoteLines if check_isOnTheLeftOfColumn(*(LURD))]

    #--------------------------------- text boxes belonging to a footnote line -------------------------------#
    def get_footnoteBbox(L: float, U: float, R: float, D: float) -> list:
        """
        Collect the text-block bboxes that form the footnote under the given line.

        :return: the block bboxes sorted top-to-bottom followed by their union bbox,
                 or [] when the blocks do not look like a footnote.
        """
        L1, U1, R1, D1 = L, U, R, D
        # Blocks must start above this limit: 20 dominant font sizes below the line, capped at 98% of the page.
        under_THRESHOLD = min(D1 + textSize_withMaxFreq * 20, pageD * 0.98)
        raw_bboxes = []
        for block in text_blocks:
            L2, U2, R2, D2 = block['bbox']
            # Width guard first so a zero-width block cannot cause a division by zero
            # (same form as the check in has_text_below_line).
            if 0 < (R2 - L2) < pageR / 6 and (D2 - U2) / (R2 - L2) > 10:
                continue                # very narrow vertical strip, e.g. the arXiv stamp in the left margin
            if U2 < D2 < U1:
                continue                # entirely above the line
            if U2 < under_THRESHOLD:
                ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R1, L2, R2)
                if max(ratio_1, ratio_2) > 0.8:
                    raw_bboxes.append((L2, U2, R2, D2))
        if not raw_bboxes:
            return []

        raw_bboxes.sort(key = lambda LURD: (LURD[1], LURD[0]))
        # Footnote blocks start near the line's left end, or to its right.
        raw_bboxes = [LURD for LURD in raw_bboxes if (abs(LURD[0] - L1) < textSize_withMaxFreq * 6 or L1 < LURD[0])]
        if not raw_bboxes:
            return []

        #------------------ a mix of full-width and sub-column blocks is rejected ------------------#
        LL = min(box[0] for box in raw_bboxes)
        UU = min(box[1] for box in raw_bboxes)
        RR = max(box[2] for box in raw_bboxes)
        DD = max(box[3] for box in raw_bboxes)
        for bL, bU, bR, bD in raw_bboxes:
            if (RR - LL) > pageR*0.8 and (bR - bL) > pageR * 0.15 and (RR - LL) / (bR - bL) > 2:
                return []
            if abs(LL - bL) > textSize_withMaxFreq * 3:
                return []               # blocks are not left-aligned with each other

        #-------------------- a wide union box that starts too high is rejected ----------------------#
        if UU < 650/800 * pageD and (RR - LL) > 0.5 * pageR:
            return []

        #-------------- a tiny first block followed by much larger ones is rejected ----------------#
        if len(raw_bboxes) > 1:
            bbox_square = [abs(bR - bL) * abs(bD - bU) for bL, bU, bR, bD in raw_bboxes]
            s0 = bbox_square[0]
            s1n = sum(bbox_square[1: ]) / len(bbox_square[1: ])
            # Cross-multiplied form of "s1n / s0 > 10 or max / s0 > 15": same result for
            # s0 > 0, and no ZeroDivisionError when the first block has zero area.
            if s1n > 10 * s0 or max(bbox_square) > 15 * s0:
                return []

        raw_bboxes += [(LL, UU, RR, DD)]
        return raw_bboxes

    footnoteBboxes = []
    for L, U, R, D in footnoteLines:
        cur = get_footnoteBbox(L, U, R, D)
        if len(cur) > 0:
            footnoteBboxes.append((L, U, R, D))
            footnoteBboxes += cur

    # De-duplicate while keeping first-seen order so the output is reproducible.
    footnoteBboxes = list(dict.fromkeys(footnoteBboxes))
    return footnoteBboxes
    

def __bbox_in(box1, box2):
    """
    Return True when box1 lies inside box2 (borders included).

    Every coordinate is truncated with int() before comparing, so boxes that
    differ only by a sub-unit amount still count as contained.
    """
    inner_l, inner_u, inner_r, inner_d = box1
    outer_l, outer_u, outer_r, outer_d = box2
    return (
        int(outer_l) <= int(inner_l)
        and int(outer_u) <= int(inner_u)
        and int(inner_r) <= int(outer_r)
        and int(inner_d) <= int(outer_d)
    )
    
def remove_footnote_text(raw_text_block, footnote_bboxes):
    """
    Move the text blocks that touch a footnote bbox out of the page's block list.

    :param raw_text_block: list of text-block dicts for the current page (each with a 'bbox');
        it is modified in place.
    :param footnote_bboxes: list of footnote bboxes for the current page.
    :return: (raw_text_block without the footnote blocks, list of removed blocks).
        Each removed block gets block['tag'] = 'footnote'.
    """
    # TODO: doing this at line level would be more precise.
    footnote_text_blocks = []
    for candidate in raw_text_block:
        candidate_bbox = candidate['bbox']
        overlaps = [_is_in_or_part_overlap(candidate_bbox, fn_bbox) for fn_bbox in footnote_bboxes]
        if not any(overlaps):
            continue
        candidate['tag'] = 'footnote'
        footnote_text_blocks.append(candidate)

    # Removal is a separate pass: deleting while iterating the same list would skip elements.
    for matched in footnote_text_blocks:
        raw_text_block.remove(matched)

    return raw_text_block, footnote_text_blocks

def remove_footnote_image(image_blocks, footnote_bboxes):
    """
    Move the image blocks that lie inside a footnote bbox out of the page's image list.

    :param image_blocks: list of image-block dicts for the current page (each with a 'bbox');
        it is modified in place.
    :param footnote_bboxes: list of footnote bboxes for the current page.
    :return: (image_blocks without the footnote images, list of removed image blocks).
    """
    footnote_imgs_blocks = [
        img for img in image_blocks
        if any([__bbox_in(img['bbox'], fn_bbox) for fn_bbox in footnote_bboxes])
    ]

    # Removal happens after the scan so the list is never mutated while being iterated.
    for matched in footnote_imgs_blocks:
        image_blocks.remove(matched)

    return image_blocks, footnote_imgs_blocks


def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, page_no_bboxs, page_w, page_h):
    """
    Remove header, footer and page-number content from one page.

    Text is deleted at span level: spans in the header/footer bands are dropped,
    lines left without spans are dropped, and blocks left without lines are tagged
    'in-foot-header-area' and moved to the removal list.

    :param text_raw_blocks: list of text-block dicts ('bbox', 'lines' -> 'spans'); modified in place.
    :param image_bboxes: list of image bboxes.
    :param table_bboxes: list of table bboxes.
    :param header_bboxs: list of (x0, y0, x1, y1) header boxes, may be empty.
    :param footer_bboxs: list of (x0, y0, x1, y1) footer boxes, may be empty.
    :param page_no_bboxs: list of page-number boxes; when empty, the lowest text block is
        tested with a "digits but no letters" rule instead.
    :param page_w: page width.
    :param page_h: page height.
    :return: (image_bbox_remain, table_bbox_remain, text_block_remain,
              text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove)
    """
    def _union(bboxes):
        """Smallest box covering every (x0, y0, x1, y1) box in bboxes."""
        return [
            min(x0 for x0, _, _, _ in bboxes),
            min(y0 for _, y0, _, _ in bboxes),
            max(x1 for _, _, x1, _ in bboxes),
            max(y1 for _, _, _, y1 in bboxes),
        ]

    header = _union(header_bboxs) if header_bboxs else []
    footer = _union(footer_bboxs) if footer_bboxs else []

    header_y0 = 0 if len(header) == 0 else header[3]
    footer_y0 = page_h if len(footer) == 0 else footer[1]
    if page_no_bboxs:
        top_part = [b for b in page_no_bboxs if b[3] < page_h/2]
        btn_part = [b for b in page_no_bboxs if b[1] > page_h/2]

        # NOTE(review): the top edge (b[1]) of upper page numbers and the bottom edge (b[3])
        # of lower ones are used here, so the page-number boxes themselves stay inside the
        # content band; they are handled at span level further down. Confirm this is intended.
        top_max_y0 = max([b[1] for b in top_part]) if top_part else 0
        btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h

        header_y0 = max(header_y0, top_max_y0)
        footer_y0 = min(footer_y0, btn_min_y1)

    content_boundry = [0, header_y0, page_w, footer_y0]

    header = [0,0, page_w, header_y0]
    footer = [0, footer_y0, page_w, page_h]

    # The header/footer bands are known; now delete what falls inside them.
    def _in_header_or_footer(span):
        """True when the span is above the header limit or overlaps the header/footer band."""
        if span['bbox'][3] < header_y0:
            return True
        return _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer)

    text_block_to_remove = []
    for blk in text_raw_blocks:
        for line in blk['lines']:
            # Decide for every span, not only the last one, and edit the list in place
            # so other references to it see the result.
            line['spans'][:] = [span for span in line['spans'] if not _in_header_or_footer(span)]
        blk['lines'][:] = [line for line in blk['lines'] if line['spans']]
        # A block that has (or had from the start) no lines left is header/footer only.
        if not blk['lines']:
            blk['tag'] = 'in-foot-header-area'
            text_block_to_remove.append(blk)

    # A small page number often overlaps content_boundry slightly and would otherwise be
    # kept as body text, so page numbers are additionally handled at span granularity.
    page_no_block_2_remove = []
    if page_no_bboxs:
        for pagenobox in page_no_bboxs:
            for block in text_raw_blocks:
                if _is_in_or_part_overlap(pagenobox, block['bbox']):
                    for line in block['lines']:
                        for span in line['spans']:
                            if _is_in_or_part_overlap(pagenobox, span['bbox']):
                                span['tag'] = "page-no"
                                # The block consists of this single span only: drop the whole block.
                                if len(line['spans']) == 1 and len(block['lines'])==1:
                                    page_no_block_2_remove.append(block)
    else:
        # Fallback: treat the lowest block as a page number when it has exactly one line with
        # one span whose text is non-blank, contains a digit and contains no ASCII letter.
        if len(text_raw_blocks) > 0:
            # NOTE: this re-orders the caller's list (descending by top edge); kept as before.
            text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True)
            last_block = text_raw_blocks[0]
            if len(last_block['lines']) == 1:
                last_line = last_block['lines'][0]
                if len(last_line['spans']) == 1:
                    last_span = last_line['spans'][0]
                    if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]', last_span['text']):
                        last_span['tag'] = "page-no"
                        page_no_block_2_remove.append(last_block)

    # Merge both removal lists without listing the same block object twice.
    removed_ids = {id(blk) for blk in text_block_to_remove}
    for blk in page_no_block_2_remove:
        if id(blk) not in removed_ids:
            removed_ids.add(id(blk))
            text_block_to_remove.append(blk)

    # Drop exactly the collected block objects (by identity), keeping the same list object.
    text_raw_blocks[:] = [blk for blk in text_raw_blocks if id(blk) not in removed_ids]
    text_block_remain = text_raw_blocks

    # Images and tables are kept when they touch the content band, removed otherwise.
    image_bbox_remain, image_bbox_to_remove = [], []
    for bbox in image_bboxes:
        (image_bbox_remain if _is_in_or_part_overlap(bbox, content_boundry) else image_bbox_to_remove).append(bbox)

    table_bbox_remain, table_bbox_to_remove = [], []
    for bbox in table_bboxes:
        (table_bbox_remain if _is_in_or_part_overlap(bbox, content_boundry) else table_bbox_to_remove).append(bbox)

    return image_bbox_remain, table_bbox_remain, text_block_remain, text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove