"alphafold/vscode:/vscode.git/clone" did not exist on "98caef21efa959e44ed01ab33cfb15ab04a39418"
pdf2text_getNumberOfColumn.py 13.5 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
from typing import Dict, List, Tuple

from libs.commons import fitz


def show_image(item, title=""):
    """Render ``item`` to a pixmap and draw it with matplotlib.

    Args:
        item: any object exposing a ``get_pixmap(dpi=...)`` method
            (e.g. a PyMuPDF page).
        title: text used as the title of the plotted image.

    The pixmap is rendered at a fixed resolution of 150 DPI, wrapped in a
    NumPy array without copying, and handed to ``plt.imshow``. numpy and
    matplotlib are imported lazily so they are only required when this debug
    helper is actually called.
    """
    import numpy as np
    import matplotlib.pyplot as plt

    resolution = 150  # rendering DPI, reused for the matplotlib figure

    pixmap = item.get_pixmap(dpi=resolution)
    height, width = pixmap.h, pixmap.w
    # View the sample buffer as height x width x 3 bytes.
    # NOTE(review): the channel count is hard-coded to 3 — assumes an RGB
    # pixmap without alpha; confirm against the callers.
    pixels = np.ndarray([height, width, 3], dtype=np.uint8, buffer=pixmap.samples_mv)

    plt.figure(dpi=resolution)
    plt.title(title)
    # Axis extent is pixels * 72 / DPI (presumably PDF points), y growing downwards.
    plt.imshow(pixels, extent=(0, width * 72 / resolution, height * 72 / resolution, 0))


def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float):
    """Return how much of each 1-D segment is covered by their overlap.

    Args:
        L1, R1: left and right end of the first segment.
        L2, R2: left and right end of the second segment.

    Returns:
        ``(overlap / len1, overlap / len2)``. ``(0, 0)`` is returned when the
        segments do not intersect or when either segment has zero length.
    """
    overlap_start = max(L1, L2)
    overlap_end = min(R1, R2)

    segments_are_disjoint = overlap_start > overlap_end
    has_empty_segment = L1 == R1 or L2 == R2
    if segments_are_disjoint or has_empty_segment:
        return 0, 0

    shared_length = overlap_end - overlap_start
    ratio_of_first = shared_length / (R1 - L1)
    ratio_of_second = shared_length / (R2 - L2)
    return ratio_of_first, ratio_of_second


def get_targetAxis_and_splitAxis(page_ID: int, page: "fitz.Page", columnNumber: int, textBboxs: List[Tuple[float, float, float, float]]) -> Tuple[List[float], List[float]]:
    """Lay out evenly spaced column axes for a hypothesised column count.

    The horizontal extent covered by all text boxes is padded by 10% of its
    width on each side (an empirical margin) and then cut into
    ``columnNumber`` columns of equal width.

    Args:
        page_ID: page index; not used by the computation.
        page: page object as parsed by fitz; not used by the computation.
        columnNumber: number of text columns to lay out.
        textBboxs: text block boxes as ``(L, U, R, D)`` tuples.

    Returns:
        ``(x_targetAxis, x_splitAxis)``: the centre x coordinate of every
        column (``columnNumber`` values) and the column boundaries
        (``columnNumber + 1`` values).

    Raises:
        AssertionError: if a box has ``L > R`` or ``U > D``.
    """
    INF = 10 ** 9
    pageL, pageR = INF, 0
    for L, U, R, D in textBboxs:
        assert L <= R and U <= D
        pageL = min(pageL, L)
        pageR = max(pageR, R)

    # Widen the text extent by 10% on both sides (10 is an empirical value).
    pageWidth = pageR - pageL
    pageL -= pageWidth / 10
    pageR += pageWidth / 10
    pageWidth = pageR - pageL

    # The padded width is cut into 2 * columnNumber half-columns: odd
    # multiples of the half-column width are column centres, even multiples
    # are column boundaries.
    halfColumnWidth = pageWidth / (2 * columnNumber)
    x_targetAxis = [pageL + halfColumnWidth * i for i in range(1, columnNumber * 2, 2)]
    x_splitAxis = [pageL + halfColumnWidth * i for i in range(0, columnNumber * 2 + 1, 2)]

    return x_targetAxis, x_splitAxis


def _locate_columns(L, R, x_targetAxis, x_splitAxis):
    """Return the column indices a horizontal span ``[L, R]`` belongs to.

    Two criteria are evaluated independently:
      * mid: the column's target (centre) axis lies inside ``[L, R]``;
      * LR: the span covers at least 20% of the column's own width.

    With a single column the span is assigned to column 0 under both criteria.

    Returns:
        ``(located_cols_mid, located_cols_LR)`` as ascending index lists.
    """
    col_N = len(x_targetAxis)
    if col_N == 1:
        return [0], [0]

    located_cols_mid = [col_ID for col_ID in range(col_N) if L <= x_targetAxis[col_ID] <= R]
    located_cols_LR = [
        col_ID
        for col_ID in range(col_N)
        if calculate_overlapRatio_between_line1_and_line2(x_splitAxis[col_ID], x_splitAxis[col_ID + 1], L, R)[0] >= 0.2
    ]
    return located_cols_mid, located_cols_LR


def _partial_blocks_precede_full_width(partial_bboxs, full_width_bboxs):
    """Tell whether partial-width blocks sit above full-width blocks.

    This is the "title-like small blocks mixed into a one-column page" test.
    For every partial-width box the full-width boxes that start at or below
    its bottom edge are counted; boxes with at least one such follower
    contribute to ``before_square``.

    NOTE(review): the arithmetic below reproduces the long-standing behaviour
    exactly. The weight is the area of the *last* full-width box (ordered by
    top, then left), not of the partial box itself, and ``under_square`` never
    grows because it used to be multiplied by a zero count. As a result the
    0.2 ratio test is effectively "any partial block lies above a full-width
    block". Changing this alters which column hypotheses are rejected, so it
    needs to be validated against real pages first.
    """
    ordered_full = sorted(full_width_bboxs, key=lambda LURD: (LURD[1], LURD[0]))

    before_square = 0.0
    under_square = 0.0  # see NOTE(review) above: stays 0.0
    for _, _, _, partial_bottom in partial_bboxs:
        cnt = sum(1 for _, full_top, _, _ in ordered_full if partial_bottom <= full_top)
        if cnt >= 1:
            L, U, R, D = ordered_full[-1]
            before_square += (R - L) * (D - U) * cnt

    total_square = before_square + under_square
    return total_square != 0 and before_square / total_square >= 0.2


def calculate_loss(page_ID: int, x_targetAxis: List[float], x_splitAxis: List[float], textBboxs: List[Tuple[float, float, float, float]]) -> Tuple[float, bool]:
    """Score how well the text boxes fit a given column layout.

    Args:
        page_ID: page index; not used by the computation.
        x_targetAxis: centre x coordinate of every hypothesised column.
        x_splitAxis: column boundaries (one more value than ``x_targetAxis``).
        textBboxs: text block boxes as ``(L, U, R, D)`` tuples.

    Returns:
        ``(loss, isSimpleCondition)``. A lower loss means a better fit.
        ``isSimpleCondition`` is True only when every considered box falls
        into exactly one column under both location criteria. When the page
        looks like a one-column page with small title-like blocks, the result
        is ``(10 ** 9, False)`` for a multi-column layout and ``(0, True)``
        for a one-column layout.

    Raises:
        AssertionError: if a box has ``L > R`` or ``U > D``; or, once the
            one-column checks did not return early, if a box has ``L >= R``
            or ``U >= D``.
    """
    INF = 10 ** 9

    # Horizontal text extent, padded by 10% on each side.
    pageL, pageR = INF, 0
    for L, U, R, D in textBboxs:
        assert L <= R and U <= D
        pageL = min(pageL, L)
        pageR = max(pageR, R)
    pageWidth = pageR - pageL
    pageL -= pageWidth / 10
    pageR += pageWidth / 10
    pageCenter = (pageR + pageL) / 2

    col_N = len(x_targetAxis)  # number of columns

    # Locate every sufficiently large box and split the boxes into those
    # spanning all columns and the rest, once per location criterion.
    located_bboxs = []
    allLocateBboxs_mid, non_allLocateBboxs_mid = [], []
    allLocateBboxs_LR, non_allLocateBboxs_LR = [], []
    for L, U, R, D in textBboxs:
        # Crude filter for page headers and tiny blocks (boxes are not merged
        # yet at this stage); it also sacrifices some very narrow strips.
        if D - U < 40 or R - L < 40:
            continue
        located_cols_mid, located_cols_LR = _locate_columns(L, R, x_targetAxis, x_splitAxis)
        located_bboxs.append(((L, U, R, D), located_cols_mid, located_cols_LR))
        if len(located_cols_mid) == col_N:
            allLocateBboxs_mid.append((L, U, R, D))
        else:
            non_allLocateBboxs_mid.append((L, U, R, D))
        if len(located_cols_LR) == col_N:
            allLocateBboxs_LR.append((L, U, R, D))
        else:
            non_allLocateBboxs_LR.append((L, U, R, D))

    # One-column page with title-like small blocks? Checked with the mid
    # criterion first, then again with the boundary (LR) criterion.
    for partial_bboxs, full_width_bboxs in (
        (non_allLocateBboxs_mid, allLocateBboxs_mid),
        (non_allLocateBboxs_LR, allLocateBboxs_LR),
    ):
        if _partial_blocks_precede_full_width(partial_bboxs, full_width_bboxs):
            return (0, True) if col_N == 1 else (INF, False)

    for L, U, R, D in textBboxs:
        assert L < R and U < D, 'There is an error on bbox of text when calculate loss!'

    isSimpleCondition = True  # a single irregular box under either criterion clears it
    oneLocateLoss_mid = 0
    multiLocateLoss_mid = 0
    allLocateLoss_mid = 0
    oneLocateCnt_LR = 0
    multiLocateCnt_LR = 0
    allLocateLoss_LR = 0

    for (L, U, R, D), located_cols_mid, located_cols_LR in located_bboxs:
        mid = (L + R) / 2

        # One-column layout: distance of the box centre to the column centre,
        # weighted by the box area.
        if col_N == 1:
            oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
            continue

        # Multi-column layout, judged by the mid criterion.
        if len(located_cols_mid) == 1:
            oneLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
        elif 1 < len(located_cols_mid) < col_N:
            # Spans several (but not all) columns: measured against the
            # left-most column it touches.
            multiLocateLoss_mid += abs(mid - x_targetAxis[located_cols_mid[0]]) * (D - U) * (R - L)
            isSimpleCondition = False
        else:
            # Spans all columns. A box located in no column at all also ends
            # up here.
            allLocateLoss_mid += abs(mid - pageCenter) * (D - U) * (R - L)
            isSimpleCondition = False

        # Multi-column layout, judged by the boundary (LR) criterion.
        if len(located_cols_LR) == 1:
            oneLocateCnt_LR += 1
        elif 1 < len(located_cols_LR) < col_N:
            multiLocateCnt_LR += 1
            isSimpleCondition = False
        else:
            allLocateLoss_LR += abs(mid - pageCenter) * (D - U) * (R - L)
            isSimpleCondition = False

    totLoss_mid = oneLocateLoss_mid + multiLocateLoss_mid + allLocateLoss_mid
    # NOTE(review): the LR total adds box *counts* for the one/multi-column
    # cases to an area-weighted loss. This looks like a typo for the matching
    # losses, but the one-column score currently has no LR component at all,
    # so "fixing" it shifts the ranking between column hypotheses. Kept as is
    # until that can be validated.
    totLoss_LR = oneLocateCnt_LR + multiLocateCnt_LR + allLocateLoss_LR
    return totLoss_mid + totLoss_LR, isSimpleCondition


def get_columnNumber(page_ID: int, page: "fitz.Page", textBboxs, maxColumnNumber: int = 4) -> Tuple[List[int], Dict[int, float], Dict[int, bool]]:
    """Rank candidate column counts for a page by their layout loss.

    Every column count from 1 to ``maxColumnNumber`` is tried: the column
    axes are laid out with ``get_targetAxis_and_splitAxis`` and scored with
    ``calculate_loss``.

    Args:
        page_ID: page index, forwarded to the helpers.
        page: page object as parsed by fitz, forwarded to the axis helper.
        textBboxs: text block boxes as ``(L, U, R, D)`` tuples.
        maxColumnNumber: largest column count to try (default 4).

    Returns:
        ``(col_idxs, columnNumber_loss, columnNumber_isSimpleCondition)``:
        the candidate column counts ordered by ascending loss (ties broken by
        the smaller count), the loss per column count, and the
        "simple condition" flag per column count.
    """
    columnNumber_loss = {}
    columnNumber_isSimpleCondition = {}
    # Enumerate the candidate column counts.
    for columnNumber in range(1, maxColumnNumber + 1):
        x_targetAxis, x_splitAxis = get_targetAxis_and_splitAxis(page_ID, page, columnNumber, textBboxs)
        loss, isSimpleCondition = calculate_loss(page_ID, x_targetAxis, x_splitAxis, textBboxs)
        columnNumber_loss[columnNumber] = loss
        columnNumber_isSimpleCondition[columnNumber] = isSimpleCondition

    col_idxs = sorted(columnNumber_loss, key=lambda candidate: (columnNumber_loss[candidate], candidate))

    return col_idxs, columnNumber_loss, columnNumber_isSimpleCondition