draw_bbox.py 15.4 KB
Newer Older
1
2
import time

3
from magic_pdf.libs.commons import fitz  # PyMuPDF
4
5
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
6
from magic_pdf.model.magic_model import MagicModel
赵小蒙's avatar
赵小蒙 committed
7

赵小蒙's avatar
赵小蒙 committed
8

赵小蒙's avatar
赵小蒙 committed
9
def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every bounding box recorded for page ``i`` onto ``page``.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of this page.
        bbox_list: Sequence of per-page lists; every entry of a page list
            unpacks into ``(x0, y0, x1, y1)``.
        page: Object exposing ``draw_rect`` (callers in this file pass a
            page of a ``fitz`` document).
        rgb_config: Colour components on a 0-255 scale (callers in this
            file pass ``[r, g, b]``).
        fill_config: Truthy draws a filled rectangle at 0.3 opacity with no
            outline; falsy draws an unfilled outline.

    Raises:
        IndexError: If ``bbox_list`` has no entry at index ``i``.
    """
    # Scale the 0-255 components down to 0-1 before handing them to draw_rect.
    new_rgb = [float(channel) / 255 for channel in rgb_config]

    # The two modes only differ in these three arguments.
    if fill_config:
        style = {'color': None, 'fill': new_rgb, 'fill_opacity': 0.3}
    else:
        style = {'color': new_rgb, 'fill': None, 'fill_opacity': 1}

    for x0, y0, x1, y1 in bbox_list[i]:
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            width=0.5,
            overlay=True,
            **style,
        )
赵小蒙's avatar
赵小蒙 committed
36

37

赵小蒙's avatar
赵小蒙 committed
38
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw the boxes of page ``i`` onto ``page`` and label each with its rank.

    Boxes are numbered from 1 in the order they appear in ``bbox_list[i]``;
    the number is written at ``(x0, y0 + 10)`` in the same colour as the box.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of this page.
        bbox_list: Sequence of per-page lists; every entry of a page list
            unpacks into ``(x0, y0, x1, y1)``.
        page: Object exposing ``draw_rect`` and ``insert_text`` (callers in
            this file pass a page of a ``fitz`` document).
        rgb_config: Colour components on a 0-255 scale (callers in this
            file pass ``[r, g, b]``).
        fill_config: Truthy draws a filled rectangle at 0.3 opacity with no
            outline; falsy draws an unfilled outline.

    Raises:
        IndexError: If ``bbox_list`` has no entry at index ``i``.
    """
    # Scale the 0-255 components down to 0-1 before handing them to the page.
    new_rgb = [float(channel) / 255 for channel in rgb_config]

    # The two modes only differ in these three arguments.
    if fill_config:
        style = {'color': None, 'fill': new_rgb, 'fill_opacity': 0.3}
    else:
        style = {'color': new_rgb, 'fill': None, 'fill_opacity': 1}

    for number, (x0, y0, x1, y1) in enumerate(bbox_list[i], start=1):
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            width=0.5,
            overlay=True,
            **style,
        )
        # Put the 1-based index just inside the top-left corner of the box.
        page.insert_text(
            (x0, y0 + 10), str(number), fontsize=10, color=new_rgb
        )
赵小蒙's avatar
赵小蒙 committed
68
69


70
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Draw the layout analysis of a PDF as coloured boxes and save a copy.

    For every entry of ``pdf_info`` the layout boxes, the discarded blocks
    and the paragraph blocks (tables, images, titles, texts, interline
    equations, plus the nested body/caption/footnote blocks of tables and
    images) are collected. They are then drawn on the page with the same
    index of the PDF opened from ``pdf_bytes`` and the result is saved as
    ``{out_path}/{filename}_layout.pdf``.

    Args:
        pdf_info: Iterable of per-page dicts providing the keys
            ``layout_bboxes``, ``discarded_blocks`` and ``para_blocks``.
        pdf_bytes: Content of the PDF to annotate, passed to ``fitz.open``.
        out_path: Directory part of the output path.
        filename: Base name of the output file.

    Raises:
        IndexError: If the PDF has more pages than ``pdf_info`` has entries
            (the draw helpers index the per-page lists by page number).
    """
    layout_bbox_list = []
    dropped_bbox_list = []
    tables_list, tables_body_list = [], []
    tables_caption_list, tables_footnote_list = [], []
    imgs_list, imgs_body_list = [], []
    imgs_caption_list, imgs_footnote_list = [], []
    titles_list, texts_list, interequations_list = [], [], []

    for page in pdf_info:
        layout_bbox_list.append(
            [layout['layout_bbox'] for layout in page['layout_bboxes']])
        dropped_bbox_list.append(
            [dropped['bbox'] for dropped in page['discarded_blocks']])

        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
        titles, texts, interequations = [], [], []

        for block in page['para_blocks']:
            bbox = block['bbox']
            block_type = block['type']
            if block_type == BlockType.Table:
                tables.append(bbox)
                for nested_block in block['blocks']:
                    nested_bbox = nested_block['bbox']
                    nested_type = nested_block['type']
                    if nested_type == BlockType.TableBody:
                        tables_body.append(nested_bbox)
                    elif nested_type == BlockType.TableCaption:
                        tables_caption.append(nested_bbox)
                    elif nested_type == BlockType.TableFootnote:
                        tables_footnote.append(nested_bbox)
            elif block_type == BlockType.Image:
                imgs.append(bbox)
                for nested_block in block['blocks']:
                    nested_bbox = nested_block['bbox']
                    nested_type = nested_block['type']
                    if nested_type == BlockType.ImageBody:
                        imgs_body.append(nested_bbox)
                    elif nested_type == BlockType.ImageCaption:
                        imgs_caption.append(nested_bbox)
                    elif nested_type == BlockType.ImageFootnote:
                        imgs_footnote.append(nested_bbox)
            elif block_type == BlockType.Title:
                titles.append(bbox)
            elif block_type == BlockType.Text:
                texts.append(bbox)
            elif block_type == BlockType.InterlineEquation:
                interequations.append(bbox)

        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        imgs_footnote_list.append(imgs_footnote)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)

    # (draw function, per-page boxes, RGB colour, fill) in drawing order;
    # every layer is drawn with overlay=True, so the order is kept as is.
    layers = (
        (draw_bbox_with_number, layout_bbox_list, [255, 0, 0], False),
        (draw_bbox_without_number, dropped_bbox_list, [158, 158, 158], True),
        (draw_bbox_without_number, tables_list, [153, 153, 0], True),
        (draw_bbox_without_number, tables_body_list, [204, 204, 0], True),
        (draw_bbox_without_number, tables_caption_list, [255, 255, 102],
         True),
        (draw_bbox_without_number, tables_footnote_list, [229, 255, 204],
         True),
        (draw_bbox_without_number, imgs_list, [51, 102, 0], True),
        (draw_bbox_without_number, imgs_body_list, [153, 255, 51], True),
        (draw_bbox_without_number, imgs_caption_list, [102, 178, 255], True),
        # NOTE(review): image footnotes are the only nested layer drawn with
        # numbers; kept as found - confirm whether this is intended.
        (draw_bbox_with_number, imgs_footnote_list, [255, 178, 102], True),
        (draw_bbox_without_number, titles_list, [102, 102, 255], True),
        (draw_bbox_without_number, texts_list, [153, 0, 76], True),
        (draw_bbox_without_number, interequations_list, [0, 255, 0], True),
    )

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            for draw, boxes, rgb, fill in layers:
                draw(i, boxes, page, rgb, fill)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
    finally:
        # Release the document even when drawing or saving fails.
        pdf_docs.close()
160

许瑞's avatar
许瑞 committed
161

162
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Outline every span of a PDF by content type and save a copy.

    Spans are gathered per page from ``discarded_blocks`` and ``para_blocks``
    and sorted into text, inline equation, interline equation, image, table
    and dropped groups. Text and inline-equation spans flagged with
    ``CROSS_PAGE`` are drawn on the following page instead of the page they
    were found on. The annotated document is saved as
    ``{out_path}/{filename}_spans.pdf``.

    Args:
        pdf_info: Iterable of per-page dicts providing the keys
            ``discarded_blocks`` and ``para_blocks``.
        pdf_bytes: Content of the PDF to annotate, passed to ``fitz.open``.
        out_path: Directory part of the output path.
        filename: Base name of the output file.

    Raises:
        IndexError: If the PDF has more pages than ``pdf_info`` has entries
            (the draw helper indexes the per-page lists by page number).
    """
    text_list, inline_equation_list, interline_equation_list = [], [], []
    image_list, table_list, dropped_list = [], [], []
    # Cross-page spans wait here until the following page is processed.
    # Spans still waiting after the last page are never drawn.
    next_page_text_list = []
    next_page_inline_equation_list = []

    def iter_spans(block):
        """Yield the spans of a paragraph block, descending into the
        sub-blocks of image and table blocks."""
        if block['type'] in (
                BlockType.Text,
                BlockType.Title,
                BlockType.InterlineEquation,
        ):
            containers = (block,)
        elif block['type'] in (BlockType.Image, BlockType.Table):
            containers = block['blocks']
        else:
            return
        for container in containers:
            for line in container['lines']:
                yield from line['spans']

    for page in pdf_info:
        # Move the spans that crossed over from the previous page into this
        # page's lists.
        page_text_list = list(next_page_text_list)
        next_page_text_list.clear()
        page_inline_equation_list = list(next_page_inline_equation_list)
        next_page_inline_equation_list.clear()
        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []

        # Build dropped_list
        dropped_list.append([
            span['bbox']
            for block in page['discarded_blocks']
            if block['type'] == BlockType.Discarded
            for line in block['lines']
            for span in line['spans']
        ])

        # Build the remaining lists
        for block in page['para_blocks']:
            for span in iter_spans(block):
                span_type = span['type']
                if span_type == ContentType.Text:
                    if span.get(CROSS_PAGE, False):
                        target = next_page_text_list
                    else:
                        target = page_text_list
                elif span_type == ContentType.InlineEquation:
                    if span.get(CROSS_PAGE, False):
                        target = next_page_inline_equation_list
                    else:
                        target = page_inline_equation_list
                elif span_type == ContentType.InterlineEquation:
                    target = page_interline_equation_list
                elif span_type == ContentType.Image:
                    target = page_image_list
                elif span_type == ContentType.Table:
                    target = page_table_list
                else:
                    # Other span types are not drawn.
                    continue
                target.append(span['bbox'])

        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)

    # (per-page boxes, RGB colour) in drawing order; all drawn as outlines.
    layers = (
        (text_list, [255, 0, 0]),
        (inline_equation_list, [0, 255, 0]),
        (interline_equation_list, [0, 0, 255]),
        (image_list, [255, 204, 0]),
        (table_list, [204, 0, 255]),
        (dropped_list, [158, 158, 158]),
    )

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            for boxes, rgb in layers:
                draw_bbox_without_number(i, boxes, page, rgb, False)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
    finally:
        # Release the document even when drawing or saving fails.
        pdf_docs.close()
247
248


249
def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
    """Draw the raw model detections of a PDF as numbered boxes and save it.

    ``model_list`` is wrapped in a ``MagicModel`` together with the opened
    document. For each index of ``model_list`` the ``layout_dets`` returned
    by ``get_model_list`` are grouped by ``category_id`` and drawn, filled
    and numbered, on the page with the same index. The annotated document is
    saved as ``{out_path}/{filename}_model.pdf``.

    Args:
        model_list: Per-page model output; only its length and the
            ``MagicModel`` built from it are used here.
        pdf_bytes: Content of the PDF to annotate, passed to ``fitz.open``.
        out_path: Directory part of the output path.
        filename: Base name of the output file.

    Raises:
        IndexError: If the PDF has more pages than ``model_list`` has
            entries (the draw helper indexes the per-page lists by page
            number).
    """
    dropped_bbox_list = []
    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
    titles_list, texts_list, interequations_list = [], [], []

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        magic_model = MagicModel(model_list, pdf_docs)
        for page_index in range(len(model_list)):
            page_dropped_list = []
            tables_body, tables_caption, tables_footnote = [], [], []
            imgs_body, imgs_caption, imgs_footnote = [], [], []
            titles, texts, interequations = [], [], []

            page_info = magic_model.get_model_list(page_index)
            for layout_det in page_info['layout_dets']:
                bbox = layout_det['bbox']
                category = layout_det['category_id']
                if category == CategoryId.Text:
                    texts.append(bbox)
                elif category == CategoryId.Title:
                    titles.append(bbox)
                elif category == CategoryId.TableBody:
                    tables_body.append(bbox)
                elif category == CategoryId.TableCaption:
                    tables_caption.append(bbox)
                elif category == CategoryId.TableFootnote:
                    tables_footnote.append(bbox)
                elif category == CategoryId.ImageBody:
                    imgs_body.append(bbox)
                elif category == CategoryId.ImageCaption:
                    imgs_caption.append(bbox)
                elif category == CategoryId.InterlineEquation_YOLO:
                    interequations.append(bbox)
                elif category == CategoryId.Abandon:
                    page_dropped_list.append(bbox)
                elif category == CategoryId.ImageFootnote:
                    imgs_footnote.append(bbox)

            tables_body_list.append(tables_body)
            tables_caption_list.append(tables_caption)
            tables_footnote_list.append(tables_footnote)
            imgs_body_list.append(imgs_body)
            imgs_caption_list.append(imgs_caption)
            imgs_footnote_list.append(imgs_footnote)
            titles_list.append(titles)
            texts_list.append(texts)
            interequations_list.append(interequations)
            dropped_bbox_list.append(page_dropped_list)

        # (per-page boxes, RGB colour) in drawing order; every layer is
        # drawn filled and numbered.
        layers = (
            (dropped_bbox_list, [158, 158, 158]),
            (tables_body_list, [204, 204, 0]),
            (tables_caption_list, [255, 255, 102]),
            (tables_footnote_list, [229, 255, 204]),
            (imgs_body_list, [153, 255, 51]),
            (imgs_caption_list, [102, 178, 255]),
            (imgs_footnote_list, [255, 178, 102]),
            (titles_list, [102, 102, 255]),
            (texts_list, [153, 0, 76]),
            (interequations_list, [0, 255, 0]),
        )
        for i, page in enumerate(pdf_docs):
            for boxes, rgb in layers:
                draw_bbox_with_number(i, boxes, page, rgb, True)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_model.pdf')
    finally:
        # Release the document even when grouping, drawing or saving fails.
        pdf_docs.close()
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372


from typing import List


# Loaded on first use by _get_layoutreader_model and reused afterwards.
_LAYOUTREADER_MODEL = None


def _get_layoutreader_model():
    """Return the "hantian/layoutreader" model, loading it only once."""
    global _LAYOUTREADER_MODEL
    if _LAYOUTREADER_MODEL is None:
        from transformers import LayoutLMv3ForTokenClassification
        _LAYOUTREADER_MODEL = LayoutLMv3ForTokenClassification.from_pretrained(
            "hantian/layoutreader")
    return _LAYOUTREADER_MODEL


def do_predict(boxes: List[List[int]]) -> List[int]:
    """Run the layoutreader model on ``boxes`` and return the parsed result.

    The checkpoint is loaded on the first call and kept for later calls, so
    callers that predict page by page do not reload it for every page.

    Args:
        boxes: One ``[left, top, right, bottom]`` list per box, as built by
            the caller in this file.

    Returns:
        The value of ``parse_logits(logits, len(boxes))``; the caller in
        this file uses it as a list of indices into ``boxes``.
    """
    from magic_pdf.v3.helpers import prepare_inputs, boxes2inputs, parse_logits
    model = _get_layoutreader_model()
    inputs = boxes2inputs(boxes)
    inputs = prepare_inputs(inputs, model)
    # Drop the batch dimension before parsing the logits.
    logits = model(**inputs).logits.cpu().squeeze(0)
    return parse_logits(logits, len(boxes))


def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Draw the paragraph blocks of a PDF numbered in layoutreader order.

    For every page the ``para_blocks`` boxes are scaled from page
    coordinates to a 0-1000 grid, ordered with ``do_predict`` and drawn as
    numbered outlines in that order. The annotated document is saved as
    ``{out_path}/{filename}_layout_sort.pdf``.

    Args:
        pdf_info: Iterable of per-page dicts providing the keys
            ``para_blocks`` and ``page_size`` (indexed as width, height).
        pdf_bytes: Content of the PDF to annotate, passed to ``fitz.open``.
        out_path: Directory part of the output path.
        filename: Base name of the output file.

    Raises:
        AssertionError: If a scaled box falls outside the 0-1000 grid or has
            its corners in the wrong order.
        IndexError: If the PDF has more pages than ``pdf_info`` has entries
            (the draw helper indexes the per-page lists by page number).
    """
    layout_bbox_list = []

    from loguru import logger
    for page in pdf_info:
        page_layout_list = [block['bbox'] for block in page['para_blocks']]

        # Sort with layoutreader
        page_size = page['page_size']
        x_scale = 1000.0 / page_size[0]
        y_scale = 1000.0 / page_size[1]
        boxes = []
        logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_layout_list)}")
        for left, top, right, bottom in page_layout_list:
            left = round(left * x_scale)
            top = round(top * y_scale)
            right = round(right * x_scale)
            bottom = round(bottom * y_scale)
            assert (
                    1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
            ), f"Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}"
            boxes.append([left, top, right, bottom])

        if boxes:
            logger.info("layoutreader start")
            start = time.time()
            orders = do_predict(boxes)
            logger.debug(f"layoutreader orders: {orders}")
            logger.info(f"layoutreader end, cost time {time.time() - start}")
        else:
            # Nothing to order on this page, so the model is not run.
            orders = []

        layout_bbox_list.append([page_layout_list[i] for i in orders])

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            draw_bbox_with_number(i, layout_bbox_list, page, [102, 102, 255], False)

        pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')
    finally:
        # Release the document even when drawing or saving fails.
        pdf_docs.close()