"""Detect the layout of the boxes on a PDF page and sort the boxes inside each layout."""

from loguru import logger

drunkpig's avatar
drunkpig committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
from magic_pdf.layout.bbox_sort import (CONTENT_IDX, CONTENT_TYPE_IDX,
                                        X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX,
                                        Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX,
                                        paper_bbox_sort)
from magic_pdf.layout.layout_det_utils import (
    find_all_bottom_bbox_direct, find_all_left_bbox_direct,
    find_all_right_bbox_direct, find_all_top_bbox_direct,
    find_bottom_bbox_direct_from_left_edge,
    find_bottom_bbox_direct_from_right_edge,
    find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge,
    get_left_edge_bboxes, get_right_edge_bboxes)
from magic_pdf.libs.boxbase import get_bbox_in_boundary

# Labels stored under the 'layout_label' key of the layout blocks built below.
LAYOUT_V = 'V'  # vertical layout
LAYOUT_H = 'H'  # horizontal layout
LAYOUT_UNPROC = 'U'  # region that has not been processed / split yet
LAYOUT_BAD = 'B'  # bad layout
赵小蒙's avatar
赵小蒙 committed
22
23
24


def _is_single_line_text(bbox):
drunkpig's avatar
drunkpig committed
25
26
    """检查bbox里面的文字是否只有一行."""
    return True  # TODO
赵小蒙's avatar
赵小蒙 committed
27
28
29
    box_type = bbox[CONTENT_TYPE_IDX]
    if box_type != 'text':
        return False
drunkpig's avatar
drunkpig committed
30
31
    paras = bbox[CONTENT_IDX]['paras']
    text_content = ''
赵小蒙's avatar
赵小蒙 committed
32
33
    for para_id, para in paras.items():  # 拼装内部的段落文本
        is_title = para['is_title']
drunkpig's avatar
drunkpig committed
34
        if is_title != 0:
赵小蒙's avatar
赵小蒙 committed
35
36
            text_content += f"## {para['text']}"
        else:
drunkpig's avatar
drunkpig committed
37
38
            text_content += para['text']
        text_content += '\n\n'
赵小蒙's avatar
赵小蒙 committed
39

drunkpig's avatar
drunkpig committed
40
    return bbox[CONTENT_TYPE_IDX] == 'text' and len(text_content.split('\n\n')) <= 1
赵小蒙's avatar
赵小蒙 committed
41

drunkpig's avatar
drunkpig committed
42
43

def _is_alone_in_row(bbox, candidates) -> bool:
    """Return True when no box in ``candidates`` sits directly left or right of ``bbox``."""
    return (
        find_all_left_bbox_direct(bbox, candidates) is None
        and find_all_right_bbox_direct(bbox, candidates) is None
    )


def _belongs_to_column_above(
    bbox, bboxes, all_bboxes, bound_x0, bound_x1, band_top_y
) -> bool:
    """Return True when a row-alone ``bbox`` should stay part of the column above it.

    The search area is [bound_x0, band_top_y, bound_x1, bbox[Y0_IDX]], i.e.
    everything between the previous horizontal split line and the top of
    ``bbox``.
    """
    search_bottom_y = bbox[Y0_IDX]
    candidates = get_bbox_in_boundary(
        all_bboxes, [bound_x0, band_top_y, bound_x1, search_bottom_y]
    )

    # Walk upwards from bbox, widening a virtual box with every box found
    # directly above it.
    stacked = []
    virtual_box = bbox
    while True:
        above = find_all_top_bbox_direct(virtual_box, candidates)
        if above is None:
            break
        stacked.append(above)
        virtual_box = [
            min(virtual_box[X0_IDX], above[X0_IDX]),
            min(virtual_box[Y0_IDX], above[Y0_IDX]),
            max(virtual_box[X1_IDX], above[X1_IDX]),
            search_bottom_y,
        ]

    # Nothing above, or the virtual box swallowed every candidate: in the
    # latter case it may have grown to fill the whole area, which cannot be
    # regarded as one column.
    if not stacked or len(stacked) == len(candidates):
        return False

    # Rough test: the column's left/right edges (pushed out by 1) must not
    # fall inside any box of the search area.
    min_x0, max_x1 = virtual_box[X0_IDX], virtual_box[X1_IDX]
    if any(
        b[X0_IDX] <= min_x0 - 1 <= b[X1_IDX] or b[X0_IDX] <= max_x1 + 1 <= b[X1_IDX]
        for b in candidates
    ):
        return False

    # The direct neighbours above and below must both exist and neither may
    # itself be alone in its row. Note these lookups use the caller's full
    # ``bboxes`` list, not the boundary-filtered one.
    top_nearest_bbox = find_all_top_bbox_direct(bbox, bboxes)
    bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, bboxes)
    if top_nearest_bbox is None or bottom_nearest_bbox is None:
        return False
    return not (
        _is_alone_in_row(top_nearest_bbox, bboxes)
        or _is_alone_in_row(bottom_nearest_bbox, bboxes)
    )


def _horizontal_split(bboxes: list, boundary: tuple, avg_font_size=20) -> list:
    """Cut the boxes inside ``boundary`` into horizontal bands.

    Boxes that have no direct neighbour on their left or right are stretched
    to the full boundary width (their ``*_EXT_IDX`` fields are written in
    place) unless they are judged to belong to the column above them and do
    not cross the centre line. Consecutive stretched boxes form LAYOUT_H
    bands; the boxes lying between those bands form LAYOUT_UNPROC bands.

    Args:
        bboxes: all box records; only those returned by
            ``get_bbox_in_boundary(bboxes, boundary)`` are processed.
        boundary: ``(x0, y0, x1, y1)`` of the area to split.
        avg_font_size: a box must reach more than ``2 * avg_font_size`` past
            the centre line on both sides to count as crossing it.

    Returns:
        A list of ``{'layout_bbox': [x0, y0, x1, y1], 'layout_label': ...,
        'sub_layout': []}`` dicts ordered by ``y0`` (top to bottom).
    """
    bound_x0, bound_y0, bound_x1, bound_y1 = boundary
    all_bboxes = get_bbox_in_boundary(bboxes, boundary)

    # The page content may not be centred inside ``boundary`` (the first call
    # receives the page boundary), so the centre line is taken from the
    # tightest x-range of the boxes themselves. It does not change while the
    # loop below runs, hence it is computed once.
    if all_bboxes:
        tight_x0 = min(b[X0_IDX] for b in all_bboxes)
        tight_x1 = max(b[X1_IDX] for b in all_bboxes)
        mid_x = (tight_x0 + tight_x1) / 2
    else:
        mid_x = None  # never read: the loop below has nothing to visit

    # Step 1: stretch the boxes that own a whole row.
    last_h_split_line_y1 = bound_y0  # bottom of the most recent stretched box
    for bbox in all_bboxes:
        if not _is_alone_in_row(bbox, all_bboxes):
            continue

        # The box must extend more than two characters past the centre line
        # on both sides to count as crossing it.
        is_cross_boundary_mid_line = (
            min(mid_x - bbox[X0_IDX], bbox[X1_IDX] - mid_x) > avg_font_size * 2
        )
        # A box is stretched when it crosses the centre line or cannot be
        # absorbed by the column above. (Some magazines have asymmetric
        # columns, so crossing the centre line alone is not required.)
        if is_cross_boundary_mid_line or not _belongs_to_column_above(
            bbox, bboxes, all_bboxes, bound_x0, bound_x1, last_h_split_line_y1
        ):
            bbox[X0_EXT_IDX] = bound_x0
            bbox[Y0_EXT_IDX] = bbox[Y0_IDX]
            bbox[X1_EXT_IDX] = bound_x1
            bbox[Y1_EXT_IDX] = bbox[Y1_IDX]
            last_h_split_line_y1 = bbox[Y1_IDX]

    # Step 2: group consecutive stretched boxes (top to bottom).
    all_bboxes.sort(key=lambda x: x[Y0_IDX])
    h_bboxes = []
    h_bbox_group = []
    for bbox in all_bboxes:
        if bbox[X0_EXT_IDX] == bound_x0 and bbox[X1_EXT_IDX] == bound_x1:
            h_bbox_group.append(bbox)
        elif h_bbox_group:
            h_bboxes.append(h_bbox_group)
            h_bbox_group = []
    if h_bbox_group:  # trailing group
        h_bboxes.append(h_bbox_group)

    # Step 3: one LAYOUT_H band per group, plus the y-coordinates that
    # delimit the gaps between the bands. Each group is already ordered by
    # y0 because it is a subsequence of the sorted ``all_bboxes``.
    h_layouts = []
    h_split_lines = [bound_y0]
    for gp in h_bboxes:
        first, last = gp[0], gp[-1]
        h_layouts.append(
            [
                first[X0_EXT_IDX],
                first[Y0_EXT_IDX],
                last[X1_EXT_IDX],
                last[Y1_EXT_IDX],
                LAYOUT_H,
            ]
        )
        h_split_lines.append(first[Y0_IDX])
        h_split_lines.append(last[Y1_IDX])
    h_split_lines.append(bound_y1)

    # Step 4: the boxes fully inside each gap form an unprocessed band.
    # ``h_split_lines`` always holds an even number of entries, pairing up
    # as (gap_top, gap_bottom).
    for start_y0, start_y1 in zip(h_split_lines[0::2], h_split_lines[1::2]):
        bboxes_in_block = [
            b for b in all_bboxes if b[Y0_IDX] >= start_y0 and b[Y1_IDX] <= start_y1
        ]
        if not bboxes_in_block:
            continue
        h_layouts.append(
            [
                bound_x0,
                min(b[Y0_IDX] for b in bboxes_in_block),
                bound_x1,
                max(b[Y1_IDX] for b in bboxes_in_block),
                LAYOUT_UNPROC,
            ]
        )

    h_layouts.sort(key=lambda x: x[1])  # by y0: top to bottom

    return [
        {
            'layout_bbox': layout[:4],
            'layout_label': layout[4],
            'sub_layout': [],
        }
        for layout in h_layouts
    ]
drunkpig's avatar
drunkpig committed
249
250


赵小蒙's avatar
赵小蒙 committed
251
252
253
254
255
###############################################################################################
#
#  Vertical-direction processing
#
#
###############################################################################################
def _vertical_align_split_v1(bboxes: list, boundary: tuple) -> list:
    """Split the boxes inside ``boundary`` into columns using vertical cut lines.

    Columns are peeled off from the left first, then from the right. When a
    cut line from the right would run through a box, the remaining area is
    returned as a single LAYOUT_UNPROC block.

        -----------------------
        |     |           |
        |     |           |
        |     |           |
        -----------------------
        The layout above is cut into 2 columns.

    Args:
        bboxes: all box records.
        boundary: ``(x0, y0, x1, y1)`` of the area to split.

    Returns:
        A list of ``{'layout_bbox': [x0, y0, x1, y1], 'layout_label': ...,
        'sub_layout': []}`` dicts ordered by ``x0`` (left to right).
    """
    new_boundary = list(boundary[:4])  # shrinks as columns are peeled off
    v_blocks = []

    # Pass 1: peel columns off the left side.
    while True:
        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
        left_edge_bboxes = get_left_edge_bboxes(all_bboxes)
        if len(left_edge_bboxes) == 0:
            break
        right_split_line_x1 = max(b[X1_IDX] for b in left_edge_bboxes) + 1
        # A cut line that touches or crosses any box means no clean vertical
        # split is possible from this side.
        if any(b[X0_IDX] <= right_split_line_x1 <= b[X1_IDX] for b in all_bboxes):
            break
        # The leftmost x0 of the edge boxes is used as the column's x0, which
        # leaves some spacing when the layout is drawn.
        layout_x0 = min(b[X0_IDX] for b in left_edge_bboxes)
        v_blocks.append(
            [
                layout_x0,
                new_boundary[1],
                right_split_line_x1,
                new_boundary[3],
                LAYOUT_V,
            ]
        )
        new_boundary[0] = right_split_line_x1  # move the left boundary inwards

    # Pass 2: peel columns off the right side; whatever cannot be split is
    # kept as one unprocessed block.
    unsplited_block = []
    while True:
        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
        right_edge_bboxes = get_right_edge_bboxes(all_bboxes)
        if len(right_edge_bboxes) == 0:
            break
        left_split_line_x0 = min(b[X0_IDX] for b in right_edge_bboxes) - 1
        if any(b[X0_IDX] <= left_split_line_x0 <= b[X1_IDX] for b in all_bboxes):
            unsplited_block.append(
                [
                    new_boundary[0],
                    new_boundary[1],
                    new_boundary[2],
                    new_boundary[3],
                    LAYOUT_UNPROC,
                ]
            )
            break
        # The rightmost x1 of the edge boxes is used as the column's x1.
        layout_x1 = max(b[X1_IDX] for b in right_edge_bboxes)
        v_blocks.append(
            [
                left_split_line_x0,
                new_boundary[1],
                layout_x1,
                new_boundary[3],
                LAYOUT_V,
            ]
        )
        new_boundary[2] = left_split_line_x0  # move the right boundary inwards

    # Assemble the result: split columns first, then the unprocessed rest,
    # finally ordered by x0 (the sort is stable).
    sorted_layout_blocks = [
        {
            'layout_bbox': block[:4],
            'layout_label': block[4],
            'sub_layout': [],
        }
        for block in v_blocks + unsplited_block
    ]
    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
    return sorted_layout_blocks
drunkpig's avatar
drunkpig committed
364
365
366
367
368
369
370
371
372


def _grow_column_extent(seed_box, all_bboxes, find_below, find_above):
    """Return the ``(x0, x1)`` extent of the column that contains ``seed_box``.

    Starting from ``seed_box`` the column is followed downwards with
    ``find_below`` and then upwards with ``find_above``; both are called as
    ``finder(virtual_box, all_bboxes)`` with a ``[x0, y0, x1, y1]`` virtual
    box and must return the next box or ``None``.
    """
    w_x0, w_x1 = seed_box[X0_IDX], seed_box[X1_IDX]

    # Downwards: widen the extent with every box found below.
    current = seed_box
    while current is not None:
        current = find_below(
            [w_x0, current[Y0_IDX], w_x1, current[Y1_IDX]], all_bboxes
        )
        if current:
            w_x0 = min(w_x0, current[X0_IDX])
            w_x1 = max(w_x1, current[X1_IDX])

    # Upwards: the seed may sit in the middle of its column. The seed is
    # first widened to the extent found so far, which makes the search more
    # robust.
    # NOTE(review): the first box found above the seed is only used as the
    # next search origin; its own x-range is not merged into the extent.
    # This matches the previous behaviour - confirm whether it is intended.
    current = find_above(
        [w_x0, seed_box[Y0_IDX], w_x1, seed_box[Y1_IDX]], all_bboxes
    )
    while current is not None:
        current = find_above(
            [w_x0, current[Y0_IDX], w_x1, current[Y1_IDX]], all_bboxes
        )
        if current:
            w_x0 = min(w_x0, current[X0_IDX])
            w_x1 = max(w_x1, current[X1_IDX])

    return w_x0, w_x1


def _vertical_align_split_v2(bboxes: list, boundary: tuple) -> list:
    """Split the boxes inside ``boundary`` into columns (improved algorithm).

    The v1 algorithm treated a second-column box with nothing on its left as
    part of the left edge, so a multi-column layout could be recognised as a
    single column. This version starts from the top-left box, follows its
    column down and up while widening the column extent, and then tests a
    cut line just right of that extent. Once no further column can be cut
    from the left, the same is done from the right-most box; whatever is
    left over is returned as a single LAYOUT_UNPROC block.

    Args:
        bboxes: all box records.
        boundary: ``(x0, y0, x1, y1)`` of the area to split.

    Returns:
        A list of ``{'layout_bbox': [x0, y0, x1, y1], 'layout_label': ...,
        'sub_layout': []}`` dicts ordered by ``x0``. The LAYOUT_UNPROC block,
        if any, additionally carries ``'bad_boxes'``: the ``[x0, y0, x1,
        y1]`` of every box hit by a failed cut line (from either side).
    """
    new_boundary = list(boundary[:4])  # shrinks as columns are peeled off
    bad_boxes = []  # boxes that were hit by a cut line
    v_blocks = []

    # Pass 1: peel columns off the left side.
    while True:
        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
        if len(all_bboxes) == 0:
            break
        # TODO: strengthen this - the chosen box should be verified to lie
        # in the first column.
        left_top_box = min(all_bboxes, key=lambda x: (x[X0_IDX], x[Y0_IDX]))
        w_x0, w_x1 = _grow_column_extent(
            left_top_box,
            all_bboxes,
            find_bottom_bbox_direct_from_left_edge,
            find_top_bbox_direct_from_left_edge,
        )

        cut_x = w_x1 + 1
        hit = [b for b in all_bboxes if b[X0_IDX] <= cut_x <= b[X1_IDX]]
        if hit:
            # The cut line runs through boxes: stop cutting from the left.
            bad_boxes.extend(
                [b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]] for b in hit
            )
            break
        v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
        new_boundary[0] = w_x1  # move the left boundary inwards

    # Pass 2: peel columns off the right side.
    unsplited_block = []
    while True:
        all_bboxes = get_bbox_in_boundary(bboxes, new_boundary)
        if len(all_bboxes) == 0:
            break
        # Among the boxes with the largest x1 pick the topmost one (the
        # first such box in list order on ties).
        bigest_x1 = max(b[X1_IDX] for b in all_bboxes)
        right_top_box = min(
            (b for b in all_bboxes if b[X1_IDX] == bigest_x1),
            key=lambda b: b[Y0_IDX],
        )
        w_x0, w_x1 = _grow_column_extent(
            right_top_box,
            all_bboxes,
            find_bottom_bbox_direct_from_right_edge,
            find_top_bbox_direct_from_right_edge,
        )

        cut_x = w_x0 - 1
        hit = [b for b in all_bboxes if b[X0_IDX] <= cut_x <= b[X1_IDX]]
        if hit:
            # No clean vertical cut is possible: keep the rest as one block.
            unsplited_block.append(
                [
                    new_boundary[0],
                    new_boundary[1],
                    new_boundary[2],
                    new_boundary[3],
                    LAYOUT_UNPROC,
                ]
            )
            bad_boxes.extend(
                [b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]] for b in hit
            )
            break
        v_blocks.append([w_x0, new_boundary[1], w_x1, new_boundary[3], LAYOUT_V])
        new_boundary[2] = w_x0  # move the right boundary inwards

    # Convert to the layout-block structure.
    sorted_layout_blocks = [
        {
            'layout_bbox': block[:4],
            'layout_label': block[4],
            'sub_layout': [],
        }
        for block in v_blocks
    ]
    sorted_layout_blocks.extend(
        {
            'layout_bbox': block[:4],
            'layout_label': block[4],
            'sub_layout': [],
            'bad_boxes': bad_boxes,  # the boxes that were hit by a cut line
        }
        for block in unsplited_block
    )

    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])  # by x0
    return sorted_layout_blocks


drunkpig's avatar
drunkpig committed
528
def _try_horizontal_mult_column_split(bboxes: list, boundary: tuple) -> list:
赵小蒙's avatar
赵小蒙 committed
529
530
531
532
533
534
535
536
537
538
539
540
541
    """
    尝试水平切分,如果切分不动,那就当一个BAD_LAYOUT返回
    ------------------
    |        |       |
    ------------------
    |    |       |   |   <-  这里是此函数要切分的场景
    ------------------
    |        |       |
    |        |       |
    """
    pass


drunkpig's avatar
drunkpig committed
542
def _vertical_split(bboxes: list, boundary: tuple) -> list:
    """Cut the boxes inside ``boundary`` into vertical blocks.

                                    --------------------------
                                        |        |       |
                                        |        |       |
                                    | |
        columns like this one  ->   | |
        are what gets cut out       | |
                                        |        |       |
                                        |        |       |
                                    -------------------------

    Boxes with no direct neighbour above or below, and whose left/right edge
    does not fall strictly inside another box, are stretched to the full
    boundary height (their ``*_EXT_IDX`` fields are written in place); each
    becomes its own LAYOUT_V block. The boxes between those columns are
    handed to ``_vertical_align_split_v2`` and wrapped in a LAYOUT_H block.

    Args:
        bboxes: all box records.
        boundary: ``(x0, y0, x1, y1)`` of the area to split.

    Returns:
        A list of ``{'layout_bbox': [x0, y0, x1, y1], 'layout_label': ...,
        'sub_layout': [...]}`` dicts ordered by ``x0`` (left to right).
    """
    bound_x0, bound_y0, bound_x1, bound_y1 = boundary
    all_bboxes = get_bbox_in_boundary(bboxes, boundary)

    # NOTE: un-overlapping the boxes first (fix_vertical_bbox_pos /
    # fix_hor_bbox_pos, which shrink the smaller of two overlapping boxes)
    # would help the split, but it is deliberately not run: formula and table
    # detection are not mature yet, so very many text blocks would take part
    # and the time cost is too high.

    # Step 1: stretch the boxes that own a whole column.
    for bbox in all_bboxes:
        top_nearest_bbox = find_all_top_bbox_direct(bbox, all_bboxes)
        bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, all_bboxes)
        if top_nearest_bbox is not None or bottom_nearest_bbox is not None:
            continue
        # Either vertical edge of bbox lying strictly inside another box's
        # x-range means the boxes overlap horizontally.
        overlaps_other = any(
            b[X0_IDX] < bbox[X1_IDX] < b[X1_IDX]
            or b[X0_IDX] < bbox[X0_IDX] < b[X1_IDX]
            for b in all_bboxes
        )
        if overlaps_other:
            continue
        bbox[X0_EXT_IDX] = bbox[X0_IDX]
        bbox[Y0_EXT_IDX] = bound_y0
        bbox[X1_EXT_IDX] = bbox[X1_IDX]
        bbox[Y1_EXT_IDX] = bound_y1

    # Step 2: collect the stretched boxes, left to right. Adjacent columns
    # are intentionally NOT merged: every block handed downstream must be
    # readable straight from top to bottom.
    all_bboxes.sort(key=lambda x: x[X0_IDX])
    v_bboxes = [
        box
        for box in all_bboxes
        if box[Y0_EXT_IDX] == bound_y0 and box[Y1_EXT_IDX] == bound_y1
    ]

    # Step 3: one LAYOUT_V block per stretched box, plus the x-coordinates
    # that delimit the gaps between them.
    v_layouts = []
    v_split_lines = [bound_x0]
    for vbox in v_bboxes:
        v_layouts.append(
            [
                vbox[X0_EXT_IDX],
                vbox[Y0_EXT_IDX],
                vbox[X1_EXT_IDX],
                vbox[Y1_EXT_IDX],
                LAYOUT_V,
            ]
        )
        v_split_lines.append(vbox[X0_IDX])
        v_split_lines.append(vbox[X1_IDX])
    v_split_lines.append(bound_x1)

    # Step 4: the boxes fully inside each gap form an unprocessed region.
    # ``v_split_lines`` always holds an even number of entries, pairing up
    # as (gap_left, gap_right).
    for start_x0, start_x1 in zip(v_split_lines[0::2], v_split_lines[1::2]):
        bboxes_in_block = [
            b for b in all_bboxes if b[X0_IDX] >= start_x0 and b[X1_IDX] <= start_x1
        ]
        if not bboxes_in_block:
            continue
        v_layouts.append(
            [
                min(b[X0_IDX] for b in bboxes_in_block),
                bound_y0,
                max(b[X1_IDX] for b in bboxes_in_block),
                bound_y1,
                LAYOUT_UNPROC,  # no reliable layout found for this region yet
            ]
        )

    v_layouts.sort(key=lambda x: x[0])  # by x0: left to right

    # Step 5: build the result. Unprocessed regions are split further into
    # columns and, being cut by vertical lines, become horizontal layouts.
    sorted_layout_blocks = []
    for layout in v_layouts:
        layout_bbox, layout_label = layout[:4], layout[4]
        if layout_label == LAYOUT_UNPROC:
            x0, y0, x1, y1 = layout_bbox
            sorted_layout_blocks.append(
                {
                    'layout_bbox': [x0, y0, x1, y1],
                    'layout_label': LAYOUT_H,
                    'sub_layout': _vertical_align_split_v2(bboxes, [x0, y0, x1, y1]),
                }
            )
        else:
            sorted_layout_blocks.append(
                {
                    'layout_bbox': layout_bbox,
                    'layout_label': layout_label,
                    'sub_layout': [],
                }
            )

    return sorted_layout_blocks

drunkpig's avatar
drunkpig committed
679
680

def split_layout(bboxes: list, boundary: tuple, page_num: int) -> list:
    """Cut the boxes of one page into a tree of layout regions.

    The page is first cut horizontally with ``_horizontal_split``; every
    resulting region still labelled ``LAYOUT_UNPROC`` (i.e. not a row of its
    own) is then cut vertically with ``_vertical_split``.

    Args:
        bboxes: the boxes to lay out. They are re-ordered with
            ``paper_bbox_sort`` before splitting.
        boundary: ``(x0, y0, x1, y1)`` of the area being split.
        page_num: page number; currently unused by the splitting logic.

    Returns:
        A list of layout nodes, each a dict of the form::

            {
                'layout_bbox': [x0, y0, x1, y1],
                'layout_label': <one of the LAYOUT_* constants>,
                'sub_layout': [...],  # nested layout nodes, [] for a leaf
            }

        When ``bboxes`` holds at most one box, a single ``LAYOUT_V`` leaf
        covering the whole ``boundary`` is returned.
    """
    boundary_x0, boundary_y0, boundary_x1, boundary_y1 = boundary
    if len(bboxes) <= 1:
        # Nothing to split: the whole boundary is one leaf layout.
        return [
            {
                'layout_bbox': [boundary_x0, boundary_y0, boundary_x1, boundary_y1],
                'layout_label': LAYOUT_V,
                'sub_layout': [],
            }
        ]

    # Split in order: horizontally first, then vertically.
    bboxes = paper_bbox_sort(
        bboxes, boundary_x1 - boundary_x0, boundary_y1 - boundary_y0
    )
    sorted_layouts = _horizontal_split(bboxes, boundary)  # layouts from the horizontal cut

    for i, layout in enumerate(sorted_layouts):
        if layout['layout_label'] != LAYOUT_UNPROC:
            continue

        # Not a row of its own, so it needs a vertical split.
        x0, y0, x1, y1 = layout['layout_bbox']
        v_split_layouts = _vertical_split(bboxes, [x0, y0, x1, y1])

        # The rows are assumed to have been stripped off already, so a region
        # reaching this point should break into several columns. If the
        # vertical split yields exactly one column and that column has no
        # children of its own, the region is beyond what the algorithm can
        # split and stays LAYOUT_UNPROC.
        layout_label = LAYOUT_V
        if len(v_split_layouts) == 1 and len(v_split_layouts[0]['sub_layout']) == 0:
            layout_label = LAYOUT_UNPROC

        # Assemble the final node for this region. The node produced by the
        # horizontal split is discarded, so it is not updated any further.
        sorted_layouts[i] = {
            'layout_bbox': [x0, y0, x1, y1],
            'layout_label': layout_label,
            'sub_layout': v_split_layouts,
        }

    # TODO: regions that could be split neither horizontally nor vertically are
    # still unprocessed here. They should get one more attempt through a joint
    # multi-block horizontal split (_try_horizontal_mult_block_split) and be
    # returned as BAD_LAYOUT if that fails as well.
    return sorted_layouts


drunkpig's avatar
drunkpig committed
781
def get_bboxes_layout(all_boxes: list, boundary: tuple, page_id: int):
    """Build the layout tree of a page and list its leaf layouts in order.

    Args:
        all_boxes: the boxes of the page, passed on to ``split_layout``.
        boundary: ``(x0, y0, x1, y1)`` of the area to split.
        page_id: page identifier, passed on to ``split_layout``.

    Returns:
        A 2-tuple ``(leaf_layouts, layout_tree)``:

        * ``leaf_layouts`` - the nodes of the tree whose ``'sub_layout'`` is
          empty, in pre-order; each is a dict such as
          ``{'layout_bbox': [x0, y0, x1, y1], 'layout_label': ..., 'sub_layout': []}``.
        * ``layout_tree`` - the full tree exactly as returned by
          ``split_layout``.
    """

    def _preorder_traversal(layouts):
        """Collect the leaf nodes (empty 'sub_layout') of ``layouts``.

        Nodes are visited in pre-order, i.e. in list order with each node's
        children expanded in place.
        """
        leaves = []
        for node in layouts:
            children = node['sub_layout']
            if len(children) == 0:
                leaves.append(node)
            else:
                leaves.extend(_preorder_traversal(children))
        return leaves

    # Split into layouts first, which yields a tree.
    sorted_layouts = split_layout(all_boxes, boundary, page_id)
    total_sorted_layout_blocks = _preorder_traversal(sorted_layouts)
    return total_sorted_layout_blocks, sorted_layouts


def get_columns_cnt_of_layout(layout_tree):
    """Return the largest column count among the nodes of ``layout_tree``.

    Each top-level node is scored separately and the maximum is returned:

    * a leaf (empty ``'sub_layout'``) or a ``LAYOUT_H`` node counts as one
      column;
    * any other node counts one column per leaf child, plus the recursive
      count of every grandchild found under its non-leaf children.

    Args:
        layout_tree: list of layout dicts carrying ``'layout_label'`` and
            ``'sub_layout'``.

    Returns:
        The maximum column count, or 0 when ``layout_tree`` is empty.
    """
    widest = 0  # result for an empty tree

    for node in layout_tree:  # score every horizontal slice on its own
        layout_type = node['layout_label']
        children = node['sub_layout']

        # Leaf check comes first; horizontal layouts also count as one column.
        if len(children) == 0 or layout_type == LAYOUT_H:
            width = 1
        else:
            width = 0
            for child in children:
                grandchildren = child['sub_layout']
                if len(grandchildren) == 0:
                    width += 1
                else:
                    for lay in grandchildren:
                        width += get_columns_cnt_of_layout([lay])

        widest = max(widest, width)

    return widest

drunkpig's avatar
drunkpig committed
838
839
840
841

def sort_with_layout(bboxes: list, page_width, page_height) -> (list, list):
    """Split a page into layouts and sort its boxes layout by layout.

    Args:
        bboxes: list of boxes; each must be indexable up to position 4
            (``box[0]``..``box[3]`` are copied as the coordinates and
            ``box[4]`` is carried through unchanged).
        page_width: page width, used as ``x1`` of the page boundary.
        page_height: page height, used as ``y1`` of the page boundary.

    Returns:
        ``(sorted_bboxes, layout_bboxes)`` where ``sorted_bboxes`` holds the
        13-slot rows built from ``bboxes``, grouped per leaf layout and
        ordered by ``paper_bbox_sort`` inside each layout, and
        ``layout_bboxes`` is the list of leaf layouts.
        ``(None, None)`` when any leaf layout is still ``LAYOUT_UNPROC``.
    """
    # Expand every box into the 13-slot row the layout code works on: the
    # coordinates, the literal 'text' in slot 7, box[4] in the last slot and
    # None everywhere else.
    new_bboxes = [
        [
            box[0],
            box[1],
            box[2],
            box[3],
            None,
            None,
            None,
            'text',
            None,
            None,
            None,
            None,
            box[4],
        ]
        for box in bboxes
    ]

    layout_bboxes, _ = get_bboxes_layout(
        new_bboxes, (0, 0, page_width, page_height), 0
    )
    if any(lay['layout_label'] == LAYOUT_UNPROC for lay in layout_bboxes):
        # At least one region could not be analysed: give up on this page.
        logger.warning('drop this pdf, reason: 复杂版面')
        return None, None

    sorted_bboxes = []
    # Use each layout bbox to select the boxes it contains, then sort them.
    for layout in layout_bboxes:
        lbox = layout['layout_bbox']
        bbox_in_layout = get_bbox_in_boundary(new_bboxes, lbox)
        sorted_bbox = paper_bbox_sort(
            bbox_in_layout, lbox[2] - lbox[0], lbox[3] - lbox[1]
        )
        sorted_bboxes.extend(sorted_bbox)

    return sorted_bboxes, layout_bboxes


def sort_text_block(text_block, layout_bboxes):
    """Order the text blocks of one page by layout.

    Layouts are visited in the order given by ``layout_bboxes``; inside each
    layout the blocks are ordered top to bottom by ``y0``.

    Args:
        text_block: list of dicts, each with a ``'bbox'`` entry indexable as
            ``[x0, y0, x1, y1]``.
        layout_bboxes: list of layout dicts, each with a ``'layout_bbox'``
            entry ``[x0, y0, x1, y1]``.

    Returns:
        The block dicts of ``text_block`` in the resulting order. Blocks
        that are selected by no layout are left out.

    NOTE(review): blocks are looked up by their bbox coordinates, so blocks
    sharing an identical bbox resolve to the last one seen, and a block
    selected by more than one layout is emitted once per layout - confirm
    that both are acceptable to callers.
    """
    # Map bbox -> text block so blocks can be recovered after filtering.
    box_to_text = {}
    all_text_bbox = []
    for blk in text_block:
        box = blk['bbox']
        box_to_text[(box[0], box[1], box[2], box[3])] = blk
        all_text_bbox.append(box)

    sorted_text_bbox = []
    # Walk the layouts in the given order.
    for layout in layout_bboxes:
        layout_box = layout['layout_bbox']
        # Grow the layout by 1 on every side before selecting boxes.
        text_bbox_in_layout = get_bbox_in_boundary(
            all_text_bbox,
            [
                layout_box[0] - 1,
                layout_box[1] - 1,
                layout_box[2] + 1,
                layout_box[3] + 1,
            ],
        )
        # Within a layout, order the boxes top to bottom by y0.
        text_bbox_in_layout.sort(key=lambda x: x[1])
        for sb in text_bbox_in_layout:
            sorted_text_bbox.append(box_to_text[(sb[0], sb[1], sb[2], sb[3])])

    return sorted_text_bbox