from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
from magic_pdf.model.magic_model import MagicModel


def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every box recorded for page ``i`` onto ``page``, without labels.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: Per-page collections of ``(x0, y0, x1, y1)`` boxes.
        page: Page object whose ``draw_rect`` method receives each rectangle.
        rgb_config: Color channels on a 0-255 scale; each one is divided by
            255 before being passed to ``draw_rect``.
        fill_config: Truthy to shade the interior at 30 % opacity with no
            stroke color; falsy to stroke the outline only.
    """
    scaled_color = [float(channel) / 255 for channel in rgb_config]

    # The two modes differ only in which of stroke/fill carries the color.
    if fill_config:
        rect_style = {'color': None, 'fill': scaled_color, 'fill_opacity': 0.3}
    else:
        rect_style = {'color': scaled_color, 'fill': None, 'fill_opacity': 1}

    for x0, y0, x1, y1 in bbox_list[i]:
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            width=0.5,
            overlay=True,
            **rect_style,
        )
赵小蒙's avatar
赵小蒙 committed
35

36

37
def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config, draw_bbox=True):
    """Label (and optionally draw) every box recorded for page ``i``.

    Boxes are numbered from 1 in the order they appear in ``bbox_list[i]``.

    Args:
        i: Index into ``bbox_list`` selecting the boxes of the current page.
        bbox_list: Per-page collections of ``(x0, y0, x1, y1)`` boxes.
        page: Page object whose ``draw_rect`` / ``insert_text`` are called.
        rgb_config: Color channels on a 0-255 scale; each one is divided by
            255 and used for both the rectangle and the number.
        fill_config: Truthy to shade the interior at 30 % opacity with no
            stroke color; falsy to stroke the outline only.
        draw_bbox: When falsy, only the numbers are written; no rectangle is
            drawn.
    """
    scaled_color = [float(channel) / 255 for channel in rgb_config]

    if fill_config:
        rect_style = {'color': None, 'fill': scaled_color, 'fill_opacity': 0.3}
    else:
        rect_style = {'color': scaled_color, 'fill': None, 'fill_opacity': 1}

    for number, (x0, y0, x1, y1) in enumerate(bbox_list[i], start=1):
        # The rectangle is built even when it is not drawn, so the
        # coordinates are always passed through fitz.Rect.
        rect = fitz.Rect(x0, y0, x1, y1)
        if draw_bbox:
            page.draw_rect(rect, width=0.5, overlay=True, **rect_style)

        # The number is anchored just past the box's x1 edge, offset by 10
        # from y0 -- i.e. beside the box, not inside it.
        page.insert_text(
            (x1 + 2, y0 + 10), str(number), fontsize=10, color=scaled_color
        )
赵小蒙's avatar
赵小蒙 committed
68
69


70
def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug PDF with every layout block shaded by category.

    For each page the bboxes of the discarded blocks and of the paragraph
    blocks (table/image parts, titles, texts, interline equations, lists and
    indexes) are filled with a per-category color. A running number is then
    written next to each box of the numbered sequence. The annotated document
    is saved as ``{out_path}/{filename}_layout.pdf``.

    Args:
        pdf_info: Per-page dicts. Each must provide ``'discarded_blocks'``
            and ``'para_blocks'``; their entries carry ``'bbox'``, and para
            blocks also carry ``'type'`` (plus nested ``'blocks'`` for
            tables and images).
        pdf_bytes: Raw bytes of the PDF the boxes are drawn onto.
        out_path: Directory the annotated PDF is saved in.
        filename: Stem of the output file name.

    Raises:
        KeyError: If a required key is missing, or a table sub-block has a
            ``type`` other than caption/body/footnote.
        IndexError: If the PDF has more pages than ``pdf_info`` has entries.
    """
    # Order in which the parts of one table are numbered. The keys are the
    # raw ``type`` strings carried by the table's sub-blocks.
    table_type_order = {
        'table_caption': 1,
        'table_body': 2,
        'table_footnote': 3
    }
    # Top-level block types that contribute exactly one numbered box.
    single_box_types = [
        BlockType.Text,
        BlockType.Title,
        BlockType.InterlineEquation,
        BlockType.List,
        BlockType.Index,
    ]

    dropped_bbox_list = []
    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
    imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
    titles_list, texts_list, interequations_list = [], [], []
    lists_list, indices_list = [], []
    layout_bbox_list = []

    for page in pdf_info:
        dropped_bbox_list.append(
            [dropped['bbox'] for dropped in page['discarded_blocks']]
        )

        tables_body, tables_caption, tables_footnote = [], [], []
        imgs_body, imgs_caption, imgs_footnote = [], [], []
        titles, texts, interequations, lists, indices = [], [], [], [], []
        page_block_list = []

        for block in page['para_blocks']:
            bbox = block['bbox']
            block_type = block['type']

            # Step 1: file the block (or its parts) under its fill color.
            # Tables and images are shaded part by part, not as a whole.
            if block_type == BlockType.Table:
                for nested_block in block['blocks']:
                    nested_bbox = nested_block['bbox']
                    nested_type = nested_block['type']
                    if nested_type == BlockType.TableBody:
                        tables_body.append(nested_bbox)
                    elif nested_type == BlockType.TableCaption:
                        tables_caption.append(nested_bbox)
                    elif nested_type == BlockType.TableFootnote:
                        tables_footnote.append(nested_bbox)
            elif block_type == BlockType.Image:
                for nested_block in block['blocks']:
                    nested_bbox = nested_block['bbox']
                    nested_type = nested_block['type']
                    if nested_type == BlockType.ImageBody:
                        imgs_body.append(nested_bbox)
                    elif nested_type == BlockType.ImageCaption:
                        imgs_caption.append(nested_bbox)
                    elif nested_type == BlockType.ImageFootnote:
                        imgs_footnote.append(nested_bbox)
            elif block_type == BlockType.Title:
                titles.append(bbox)
            elif block_type == BlockType.Text:
                texts.append(bbox)
            elif block_type == BlockType.InterlineEquation:
                interequations.append(bbox)
            elif block_type == BlockType.List:
                lists.append(bbox)
            elif block_type == BlockType.Index:
                indices.append(bbox)

            # Step 2: add the block to the numbered sequence.
            if block_type in single_box_types:
                page_block_list.append(bbox)
            elif block_type == BlockType.Image:
                # Image parts keep the order they are stored in.
                for sub_block in block['blocks']:
                    page_block_list.append(sub_block['bbox'])
            elif block_type == BlockType.Table:
                ordered_parts = sorted(
                    block['blocks'],
                    key=lambda part: table_type_order[part['type']],
                )
                for sub_block in ordered_parts:
                    page_block_list.append(sub_block['bbox'])

        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        imgs_footnote_list.append(imgs_footnote)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)
        lists_list.append(lists)
        indices_list.append(indices)
        layout_bbox_list.append(page_block_list)

    # (per-page boxes, RGB fill) in the order the layers are painted.
    fill_layers = [
        (dropped_bbox_list, [158, 158, 158]),
        (tables_body_list, [204, 204, 0]),
        (tables_caption_list, [255, 255, 102]),
        (tables_footnote_list, [229, 255, 204]),
        (imgs_body_list, [153, 255, 51]),
        (imgs_caption_list, [102, 178, 255]),
        (imgs_footnote_list, [255, 178, 102]),
        (titles_list, [102, 102, 255]),
        (texts_list, [153, 0, 76]),
        (interequations_list, [0, 255, 0]),
        (lists_list, [40, 169, 92]),
        (indices_list, [40, 169, 92]),
    ]

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            for layer_boxes, rgb in fill_layers:
                draw_bbox_without_number(i, layer_boxes, page, rgb, True)

            # Numbers only: the boxes themselves were shaded above.
            draw_bbox_with_number(
                i, layout_bbox_list, page, [255, 0, 0], False, draw_bbox=False
            )

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_layout.pdf')
    finally:
        # Release the document even if drawing or saving fails.
        pdf_docs.close()
198

许瑞's avatar
许瑞 committed
199

200
def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug PDF outlining every span, colored by content type.

    Spans are collected from the lines of each page's paragraph blocks (and
    from the lines of the sub-blocks of images and tables) and grouped into
    text, inline equation, interline equation, image and table spans. Spans
    of discarded blocks form a sixth group. The annotated document is saved
    as ``{out_path}/{filename}_spans.pdf``.

    Text and inline-equation spans whose ``CROSS_PAGE`` entry is truthy are
    not drawn on the page they are stored under; they are drawn on the next
    page instead.

    NOTE: spans carried over from the last entry of ``pdf_info`` have no
    following page and are therefore never drawn.

    Args:
        pdf_info: Per-page dicts providing ``'discarded_blocks'`` and
            ``'para_blocks'``.
        pdf_bytes: Raw bytes of the PDF the boxes are drawn onto.
        out_path: Directory the annotated PDF is saved in.
        filename: Stem of the output file name.

    Raises:
        KeyError: If a block, line or span lacks a key that is read here.
        IndexError: If the PDF has more pages than ``pdf_info`` has entries.
    """
    span_kinds = ('text', 'inline_equation', 'interline_equation', 'image', 'table')
    line_block_types = [
        BlockType.Text,
        BlockType.Title,
        BlockType.InterlineEquation,
        BlockType.List,
        BlockType.Index,
    ]
    nested_block_types = [BlockType.Image, BlockType.Table]

    def file_span(span, page_boxes, carried_boxes):
        """Append the span's bbox to the list matching its content type."""
        span_type = span['type']
        if span_type == ContentType.Text:
            kind = 'text'
        elif span_type == ContentType.InlineEquation:
            kind = 'inline_equation'
        elif span_type == ContentType.InterlineEquation:
            kind = 'interline_equation'
        elif span_type == ContentType.Image:
            kind = 'image'
        elif span_type == ContentType.Table:
            kind = 'table'
        else:
            return  # any other span type is not drawn

        # Only kinds present in carried_boxes can be deferred to the next page.
        if kind in carried_boxes and span.get(CROSS_PAGE, False):
            carried_boxes[kind].append(span['bbox'])
        else:
            page_boxes[kind].append(span['bbox'])

    pages_by_kind = {kind: [] for kind in span_kinds}
    dropped_list = []
    # Cross-page spans waiting to be drawn on the following page.
    carried = {'text': [], 'inline_equation': []}

    for page in pdf_info:
        page_boxes = {kind: [] for kind in span_kinds}

        # Move the spans carried over from the previous page into this
        # page's lists, ahead of the page's own spans.
        for kind, waiting in carried.items():
            page_boxes[kind].extend(waiting)
            waiting.clear()

        # Build the list of dropped spans.
        page_dropped_list = []
        for block in page['discarded_blocks']:
            if block['type'] == BlockType.Discarded:
                for line in block['lines']:
                    for span in line['spans']:
                        page_dropped_list.append(span['bbox'])
        dropped_list.append(page_dropped_list)

        # Build the lists of the remaining (kept) spans.
        for block in page['para_blocks']:
            if block['type'] in line_block_types:
                for line in block['lines']:
                    for span in line['spans']:
                        file_span(span, page_boxes, carried)
            elif block['type'] in nested_block_types:
                for sub_block in block['blocks']:
                    for line in sub_block['lines']:
                        for span in line['spans']:
                            file_span(span, page_boxes, carried)

        for kind in span_kinds:
            pages_by_kind[kind].append(page_boxes[kind])

    # (per-page boxes, RGB outline) in the order the layers are painted.
    outline_layers = [
        (pages_by_kind['text'], [255, 0, 0]),
        (pages_by_kind['inline_equation'], [0, 255, 0]),
        (pages_by_kind['interline_equation'], [0, 0, 255]),
        (pages_by_kind['image'], [255, 204, 0]),
        (pages_by_kind['table'], [204, 0, 255]),
        (dropped_list, [158, 158, 158]),
    ]

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            for layer_boxes, rgb in outline_layers:
                draw_bbox_without_number(i, layer_boxes, page, rgb, False)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_spans.pdf')
    finally:
        # Release the document even if drawing or saving fails.
        pdf_docs.close()
285
286


287
def draw_model_bbox(model_list: list, pdf_bytes, out_path, filename):
    """Write a debug PDF showing the layout detections, shaded and numbered.

    ``model_list`` is wrapped in a ``MagicModel`` and, for every page index,
    the ``'layout_dets'`` returned by ``get_model_list`` are grouped by
    ``'category_id'``. Each group is filled with its own color and numbered
    from 1 per page and per group. Detections whose category is not listed
    below are ignored. The annotated document is saved as
    ``{out_path}/{filename}_model.pdf``.

    Args:
        model_list: Model output with one entry per page; it is passed to
            ``MagicModel`` together with a ``PymuDocDataset`` built from
            ``pdf_bytes``.
        pdf_bytes: Raw bytes of the PDF the boxes are drawn onto.
        out_path: Directory the annotated PDF is saved in.
        filename: Stem of the output file name.

    Raises:
        KeyError: If a page lacks ``'layout_dets'`` or a detection lacks
            ``'bbox'`` / ``'category_id'``.
        IndexError: If the PDF has more pages than ``model_list`` has entries.
    """
    # (category id, RGB fill) in the order the layers are painted.
    layer_specs = [
        (CategoryId.Abandon, [158, 158, 158]),
        (CategoryId.TableBody, [204, 204, 0]),
        (CategoryId.TableCaption, [255, 255, 102]),
        (CategoryId.TableFootnote, [229, 255, 204]),
        (CategoryId.ImageBody, [153, 255, 51]),
        (CategoryId.ImageCaption, [102, 178, 255]),
        (CategoryId.ImageFootnote, [255, 178, 102]),
        (CategoryId.Title, [102, 102, 255]),
        (CategoryId.Text, [153, 0, 76]),
        (CategoryId.InterlineEquation_YOLO, [0, 255, 0]),
    ]
    # layer_pages[k][i] holds the boxes of layer k on page i.
    layer_pages = [[] for _ in layer_specs]

    magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
    for i in range(len(model_list)):
        page_boxes = [[] for _ in layer_specs]
        page_info = magic_model.get_model_list(i)
        for layout_det in page_info['layout_dets']:
            bbox = layout_det['bbox']
            category = layout_det['category_id']
            for (category_id, _), boxes in zip(layer_specs, page_boxes):
                if category == category_id:
                    boxes.append(bbox)
                    break
        for pages, boxes in zip(layer_pages, page_boxes):
            pages.append(boxes)

    # The document is opened only once the boxes are ready, so a failure
    # while collecting them leaves nothing to clean up.
    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            for (_, rgb), pages in zip(layer_specs, layer_pages):
                draw_bbox_with_number(i, pages, page, rgb, True)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_model.pdf')
    finally:
        # Release the document even if drawing or saving fails.
        pdf_docs.close()
355
356


357
def draw_line_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug PDF with every line outlined and numbered by its index.

    Lines are gathered from each page's ``'preproc_blocks'``:

    * text, title and interline-equation blocks contribute ``'lines'``;
    * image/table body sub-blocks contribute ``'virtual_lines'``;
    * image/table caption and footnote sub-blocks contribute ``'lines'``.

    The lines of a page are sorted by their ``'index'`` value and drawn in
    that order, so the number next to a box is its rank in the sort. The
    annotated document is saved as ``{out_path}/{filename}_line_sort.pdf``.

    Args:
        pdf_info: Per-page dicts providing ``'preproc_blocks'``.
        pdf_bytes: Raw bytes of the PDF the boxes are drawn onto.
        out_path: Directory the annotated PDF is saved in.
        filename: Stem of the output file name.

    Raises:
        KeyError: If a block or line lacks a key that is read here.
        IndexError: If the PDF has more pages than ``pdf_info`` has entries.
    """
    line_block_types = [BlockType.Text, BlockType.Title, BlockType.InterlineEquation]
    container_types = [BlockType.Image, BlockType.Table]
    body_types = [BlockType.ImageBody, BlockType.TableBody]
    caption_footnote_types = [
        BlockType.ImageCaption,
        BlockType.TableCaption,
        BlockType.ImageFootnote,
        BlockType.TableFootnote,
    ]

    layout_bbox_list = []
    for page in pdf_info:
        page_line_list = []
        for block in page['preproc_blocks']:
            block_type = block['type']
            if block_type in line_block_types:
                source_lines = block['lines']
                page_line_list.extend(
                    {'index': line['index'], 'bbox': line['bbox']}
                    for line in source_lines
                )
            elif block_type in container_types:
                for sub_block in block['blocks']:
                    sub_type = sub_block['type']
                    if sub_type in body_types:
                        source_lines = sub_block['virtual_lines']
                    elif sub_type in caption_footnote_types:
                        source_lines = sub_block['lines']
                    else:
                        continue
                    page_line_list.extend(
                        {'index': line['index'], 'bbox': line['bbox']}
                        for line in source_lines
                    )

        sorted_lines = sorted(page_line_list, key=lambda entry: entry['index'])
        # Store a real list (not a one-shot generator) so the page entry can
        # be iterated, indexed or measured more than once.
        layout_bbox_list.append([entry['bbox'] for entry in sorted_lines])

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)

        pdf_docs.save(f'{out_path}/{filename}_line_sort.pdf')
    finally:
        # Release the document even if drawing or saving fails.
        pdf_docs.close()


def draw_layout_sort_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Write a debug PDF with every paragraph block outlined and numbered.

    Blocks are numbered from 1 per page, in the order they are stored in the
    page's ``'para_blocks'``. The annotated document is saved as
    ``{out_path}/{filename}_layout_sort.pdf``.

    Args:
        pdf_info: Per-page dicts providing ``'para_blocks'``, whose entries
            carry ``'bbox'``.
        pdf_bytes: Raw bytes of the PDF the boxes are drawn onto.
        out_path: Directory the annotated PDF is saved in.
        filename: Stem of the output file name.

    Raises:
        KeyError: If a page lacks ``'para_blocks'`` or a block lacks ``'bbox'``.
        IndexError: If the PDF has more pages than ``pdf_info`` has entries.
    """
    layout_bbox_list = [
        [block['bbox'] for block in page['para_blocks']]
        for page in pdf_info
    ]

    pdf_docs = fitz.open('pdf', pdf_bytes)
    try:
        for i, page in enumerate(pdf_docs):
            draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)

        pdf_docs.save(f'{out_path}/{filename}_layout_sort.pdf')
    finally:
        # Release the document even if drawing or saving fails.
        pdf_docs.close()