import time

import torch

from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
from magic_pdf.model.magic_model import MagicModel
赵小蒙's avatar
赵小蒙 committed
9

赵小蒙's avatar
赵小蒙 committed
10

赵小蒙's avatar
赵小蒙 committed
11
def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every bounding box recorded for page ``i`` onto ``page``.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: Per-page lists of boxes; each box unpacks to
            ``(x0, y0, x1, y1)``.
        page: Object exposing ``draw_rect``; the callers in this module pass
            pages obtained by iterating a ``fitz.open`` document.
        rgb_config: Colour components on a 0-255 scale.
        fill_config: Truthy draws a translucent filled rectangle without a
            stroke colour; falsy draws an outline-only rectangle.

    Raises:
        IndexError: If ``bbox_list`` has no entry at index ``i``.
    """
    # The rectangles are drawn with colour components scaled to 0-1.
    new_rgb = [float(channel) / 255 for channel in rgb_config]

    for x0, y0, x1, y1 in bbox_list[i]:
        rect_coords = fitz.Rect(x0, y0, x1, y1)
        if fill_config:
            # Filled variant: no stroke colour, 30 % opaque fill.
            page.draw_rect(
                rect_coords,
                color=None,
                fill=new_rgb,
                fill_opacity=0.3,
                width=0.5,
                overlay=True,
            )
        else:
            # Outline variant: stroke only, no fill.
            page.draw_rect(
                rect_coords,
                color=new_rgb,
                fill=None,
                fill_opacity=1,
                width=0.5,
                overlay=True,
            )
赵小蒙's avatar
赵小蒙 committed
38

39

赵小蒙's avatar
赵小蒙 committed
40
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw the boxes of page ``i`` and label each with its 1-based position.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: Per-page lists of boxes; each box unpacks to
            ``(x0, y0, x1, y1)``.
        page: Object exposing ``draw_rect`` and ``insert_text``; the callers
            in this module pass pages obtained by iterating a ``fitz.open``
            document.
        rgb_config: Colour components on a 0-255 scale, used for both the
            rectangle and the label text.
        fill_config: Truthy draws a translucent filled rectangle without a
            stroke colour; falsy draws an outline-only rectangle.

    Raises:
        IndexError: If ``bbox_list`` has no entry at index ``i``.
    """
    # The rectangles and labels are drawn with colour components scaled to 0-1.
    new_rgb = [float(channel) / 255 for channel in rgb_config]

    for j, (x0, y0, x1, y1) in enumerate(bbox_list[i]):
        rect_coords = fitz.Rect(x0, y0, x1, y1)
        if fill_config:
            # Filled variant: no stroke colour, 30 % opaque fill.
            page.draw_rect(
                rect_coords,
                color=None,
                fill=new_rgb,
                fill_opacity=0.3,
                width=0.5,
                overlay=True,
            )
        else:
            # Outline variant: stroke only, no fill.
            page.draw_rect(
                rect_coords,
                color=new_rgb,
                fill=None,
                fill_opacity=1,
                width=0.5,
                overlay=True,
            )
        # The label is anchored 2 units to the right of the box's right edge
        # and 10 units below its top edge, i.e. just outside the top-right
        # corner.
        page.insert_text(
            (x1 + 2, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
        )
赵小蒙's avatar
赵小蒙 committed
70
71


72
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Overlay layout and block boxes on a PDF and save ``<filename>_layout.pdf``.

    Args:
        pdf_info: Iterable of per-page dicts. Each page is read for
            ``'layout_bboxes'`` (items with ``'layout_bbox'``),
            ``'discarded_blocks'`` (items with ``'bbox'``) and
            ``'para_blocks'`` (items with ``'type'`` and ``'bbox'``; table and
            image blocks additionally carry nested ``'blocks'``).
        pdf_bytes: Raw PDF content handed to ``fitz.open('pdf', ...)``.
        out_path: Directory part of the output path.
        filename: Stem of the output file name.
    """
    layout_bbox_list = []
    dropped_bbox_list = []
    tables_list, tables_body_list = [], []
    tables_caption_list, tables_footnote_list = [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    imgs_footnote_list = []
    titles_list = []
    texts_list = []
    interequations_list = []

    for page in pdf_info:
        layout_bbox_list.append(
            [layout['layout_bbox'] for layout in page['layout_bboxes']]
        )
        dropped_bbox_list.append(
            [dropped['bbox'] for dropped in page['discarded_blocks']]
        )

        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
        titles, texts, interequations = [], [], []

        for block in page['para_blocks']:
            block_type = block['type']
            bbox = block['bbox']
            if block_type == BlockType.Table:
                tables.append(bbox)
                # A table block groups body / caption / footnote sub-blocks.
                for nested_block in block['blocks']:
                    nested_type = nested_block['type']
                    nested_bbox = nested_block['bbox']
                    if nested_type == BlockType.TableBody:
                        tables_body.append(nested_bbox)
                    elif nested_type == BlockType.TableCaption:
                        tables_caption.append(nested_bbox)
                    elif nested_type == BlockType.TableFootnote:
                        tables_footnote.append(nested_bbox)
            elif block_type == BlockType.Image:
                imgs.append(bbox)
                # An image block groups body / caption / footnote sub-blocks.
                for nested_block in block['blocks']:
                    nested_type = nested_block['type']
                    nested_bbox = nested_block['bbox']
                    if nested_type == BlockType.ImageBody:
                        imgs_body.append(nested_bbox)
                    elif nested_type == BlockType.ImageCaption:
                        imgs_caption.append(nested_bbox)
                    elif nested_type == BlockType.ImageFootnote:
                        imgs_footnote.append(nested_bbox)
            elif block_type == BlockType.Title:
                titles.append(bbox)
            elif block_type == BlockType.Text:
                texts.append(bbox)
            elif block_type == BlockType.InterlineEquation:
                interequations.append(bbox)

        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        imgs_footnote_list.append(imgs_footnote)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)

    # Filled layers in drawing order (later entries are drawn on top).
    filled_layers = (
        (dropped_bbox_list, [158, 158, 158]),
        (tables_list, [153, 153, 0]),
        (tables_body_list, [204, 204, 0]),
        (tables_caption_list, [255, 255, 102]),
        (tables_footnote_list, [229, 255, 204]),
        (imgs_list, [51, 102, 0]),
        (imgs_body_list, [153, 255, 51]),
        (imgs_caption_list, [102, 178, 255]),
        (imgs_footnote_list, [255, 178, 102]),
        (titles_list, [102, 102, 255]),
        (texts_list, [153, 0, 76]),
        (interequations_list, [0, 255, 0]),
    )

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            # Only the layout boxes are numbered; they are drawn as outlines.
            draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
            for boxes, rgb in filled_layers:
                draw_bbox_without_number(i, boxes, page, rgb, True)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
    finally:
        # Release the document even when drawing or saving fails.
        pdf_docs.close()
162

许瑞's avatar
许瑞 committed
163

164
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Outline span-level boxes on a PDF and save ``<filename>_spans.pdf``.

    Args:
        pdf_info: Iterable of per-page dicts. Each page is read for
            ``'discarded_blocks'`` and ``'para_blocks'``; blocks carry
            ``'lines'`` -> ``'spans'`` (image/table blocks through nested
            ``'blocks'``), and every span has ``'type'`` and ``'bbox'``.
        pdf_bytes: Raw PDF content handed to ``fitz.open('pdf', ...)``.
        out_path: Directory part of the output path.
        filename: Stem of the output file name.
    """
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []
    # Spans flagged with CROSS_PAGE are held here until the next page starts.
    next_page_text_list = []
    next_page_inline_equation_list = []

    def get_span_info(span):
        # The page_* lists are rebound at the top of every page iteration
        # below; this closure always appends to the current page's lists.
        span_type = span['type']
        if span_type == ContentType.Text:
            if span.get(CROSS_PAGE, False):
                next_page_text_list.append(span['bbox'])
            else:
                page_text_list.append(span['bbox'])
        elif span_type == ContentType.InlineEquation:
            if span.get(CROSS_PAGE, False):
                next_page_inline_equation_list.append(span['bbox'])
            else:
                page_inline_equation_list.append(span['bbox'])
        elif span_type == ContentType.InterlineEquation:
            page_interline_equation_list.append(span['bbox'])
        elif span_type == ContentType.Image:
            page_image_list.append(span['bbox'])
        elif span_type == ContentType.Table:
            page_table_list.append(span['bbox'])

    for page in pdf_info:
        # Start each page with the cross-page spans carried over from the
        # previous page, then reset the carry-over buffers.
        page_text_list = list(next_page_text_list)
        next_page_text_list.clear()
        page_inline_equation_list = list(next_page_inline_equation_list)
        next_page_inline_equation_list.clear()

        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []

        # Build the dropped list from the spans of discarded blocks.
        page_dropped_list = [
            span['bbox']
            for block in page['discarded_blocks']
            if block['type'] == BlockType.Discarded
            for line in block['lines']
            for span in line['spans']
        ]
        dropped_list.append(page_dropped_list)

        # Build the remaining per-type lists from the paragraph blocks.
        for block in page['para_blocks']:
            if block['type'] in [
                BlockType.Text,
                BlockType.Title,
                BlockType.InterlineEquation,
            ]:
                for line in block['lines']:
                    for span in line['spans']:
                        get_span_info(span)
            elif block['type'] in [BlockType.Image, BlockType.Table]:
                for sub_block in block['blocks']:
                    for line in sub_block['lines']:
                        for span in line['spans']:
                            get_span_info(span)

        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)

    # Outline-only layers in drawing order.
    outline_layers = (
        (text_list, [255, 0, 0]),
        (inline_equation_list, [0, 255, 0]),
        (interline_equation_list, [0, 0, 255]),
        (image_list, [255, 204, 0]),
        (table_list, [204, 0, 255]),
        (dropped_list, [158, 158, 158]),
    )

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            for boxes, rgb in outline_layers:
                draw_bbox_without_number(i, boxes, page, rgb, False)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
    finally:
        # Release the document even when drawing or saving fails.
        pdf_docs.close()
247
248


249
def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
    """Overlay model detections on a PDF and save ``<filename>_model.pdf``.

    Args:
        model_list: Per-page model output; wrapped in ``MagicModel`` together
            with the opened document, and its length decides how many pages
            are read back through ``get_model_list``.
        pdf_bytes: Raw PDF content handed to ``fitz.open('pdf', ...)``.
        out_path: Directory part of the output path.
        filename: Stem of the output file name.
    """
    dropped_bbox_list = []
    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        magic_model = MagicModel(model_list, pdf_docs)
        for i in range(len(model_list)):
            page_dropped_list = []
            tables_body, tables_caption, tables_footnote = [], [], []
            imgs_body, imgs_caption, imgs_footnote = [], [], []
            titles, texts, interequations = [], [], []

            page_info = magic_model.get_model_list(i)
            # Sort every detection into the bucket of its category id.
            for layout_det in page_info['layout_dets']:
                bbox = layout_det['bbox']
                category_id = layout_det['category_id']
                if category_id == CategoryId.Text:
                    texts.append(bbox)
                elif category_id == CategoryId.Title:
                    titles.append(bbox)
                elif category_id == CategoryId.TableBody:
                    tables_body.append(bbox)
                elif category_id == CategoryId.TableCaption:
                    tables_caption.append(bbox)
                elif category_id == CategoryId.TableFootnote:
                    tables_footnote.append(bbox)
                elif category_id == CategoryId.ImageBody:
                    imgs_body.append(bbox)
                elif category_id == CategoryId.ImageCaption:
                    imgs_caption.append(bbox)
                elif category_id == CategoryId.InterlineEquation_YOLO:
                    interequations.append(bbox)
                elif category_id == CategoryId.Abandon:
                    page_dropped_list.append(bbox)
                elif category_id == CategoryId.ImageFootnote:
                    imgs_footnote.append(bbox)

            tables_body_list.append(tables_body)
            tables_caption_list.append(tables_caption)
            tables_footnote_list.append(tables_footnote)
            imgs_body_list.append(imgs_body)
            imgs_caption_list.append(imgs_caption)
            titles_list.append(titles)
            texts_list.append(texts)
            interequations_list.append(interequations)
            dropped_bbox_list.append(page_dropped_list)
            imgs_footnote_list.append(imgs_footnote)

        # Filled, numbered layers in drawing order.
        numbered_layers = (
            (dropped_bbox_list, [158, 158, 158]),
            (tables_body_list, [204, 204, 0]),
            (tables_caption_list, [255, 255, 102]),
            (tables_footnote_list, [229, 255, 204]),
            (imgs_body_list, [153, 255, 51]),
            (imgs_caption_list, [102, 178, 255]),
            (imgs_footnote_list, [255, 178, 102]),
            (titles_list, [102, 102, 255]),
            (texts_list, [153, 0, 76]),
            (interequations_list, [0, 255, 0]),
        )

        for i, page in enumerate(pdf_docs):
            for boxes, rgb in numbered_layers:
                draw_bbox_with_number(i, boxes, page, rgb, True)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_model.pdf')
    finally:
        # Release the document even when model parsing, drawing or saving
        # fails.
        pdf_docs.close()
321
322
323
324
325
326
327
328
329


from typing import List


def do_predict(boxes: List[List[int]]) -> List[int]:
    """Run the ``hantian/layoutreader`` model over ``boxes``.

    Args:
        boxes: Boxes as ``[left, top, right, bottom]`` lists; the caller in
            this module passes coordinates scaled to the 0-1000 range.

    Returns:
        The result of ``parse_logits`` for the model output; the caller in
        this module uses the values as indices into its box list. An empty
        ``boxes`` yields an empty list without loading the model.
    """
    if not boxes:
        # Nothing to order; avoid loading the model for an empty page.
        return []

    from transformers import LayoutLMv3ForTokenClassification
    from magic_pdf.v3.helpers import prepare_inputs, boxes2inputs, parse_logits

    model = LayoutLMv3ForTokenClassification.from_pretrained("hantian/layoutreader")
    # Fall back to the CPU instead of failing on hosts without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    inputs = boxes2inputs(boxes)
    inputs = prepare_inputs(inputs, model)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**inputs).logits.cpu().squeeze(0)
    return parse_logits(logits, len(boxes))


def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Draw boxes numbered in predicted order; save ``<filename>_layout_sort.pdf``.

    For every page the line boxes of text / title / interline-equation blocks
    and the block boxes of tables and images are collected, scaled to a
    0-1000 grid, ordered through ``do_predict`` and drawn as numbered
    outlines.

    Args:
        pdf_info: Iterable of per-page dicts providing ``'preproc_blocks'``
            and ``'page_size'`` (indexed as ``[0]`` for x and ``[1]`` for y).
        pdf_bytes: Raw PDF content handed to ``fitz.open('pdf', ...)``.
        out_path: Directory part of the output path.
        filename: Stem of the output file name.
    """
    from loguru import logger

    layout_bbox_list = []
    for page in pdf_info:
        page_line_list = []
        for block in page['preproc_blocks']:
            if block['type'] in ('text', 'title', 'interline_equation'):
                page_line_list.extend(line['bbox'] for line in block['lines'])
            elif block['type'] in ('table', 'image'):
                page_line_list.append(block['bbox'])

        # Sort with layoutreader: boxes are first scaled to a 0-1000 grid.
        page_size = page['page_size']
        x_scale = 1000.0 / page_size[0]
        y_scale = 1000.0 / page_size[1]
        boxes = []
        logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}")
        for left, top, right, bottom in page_line_list:
            scaled = [
                round(left * x_scale),
                round(top * y_scale),
                round(right * x_scale),
                round(bottom * y_scale),
            ]
            # A box reaching past the page edge would leave the 0-1000 grid;
            # clamp it instead of aborting the whole visualisation.
            clamped = [min(max(value, 0), 1000) for value in scaled]
            if clamped != scaled:
                logger.warning(
                    f"Box {scaled} is outside the 0-1000 range, clamped to {clamped}"
                )
            left, top, right, bottom = clamped
            # After clamping this can only fail for an inverted box.
            assert (
                    1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0
            ), f"Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}"
            boxes.append(clamped)

        logger.info("layoutreader start")
        start = time.time()
        orders = do_predict(boxes)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        logger.debug(f"layoutreader orders: {orders}")
        logger.info(f"layoutreader end, cost time {time.time() - start}")

        # The unscaled boxes are drawn, in the order returned by do_predict.
        sorted_bboxes = [page_line_list[i] for i in orders]
        layout_bbox_list.append(sorted_bboxes)

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)

        pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')
    finally:
        # Release the document even when drawing or saving fails.
        pdf_docs.close()