# draw_bbox.py
from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.ocr_content_type import BlockType, CategoryId, ContentType
from magic_pdf.model.magic_model import MagicModel


def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every bounding box stored for index ``i`` onto ``page``.

    Args:
        i: Index into ``bbox_list`` selecting which group of boxes to draw.
        bbox_list: Indexable collection; ``bbox_list[i]`` is an iterable of
            ``(x0, y0, x1, y1)`` boxes.
        page: Object exposing ``draw_rect``; the callers in this file pass
            pages of a document opened with ``fitz``.
        rgb_config: Colour components on a 0-255 scale. Each one is divided
            by 255 before being passed to ``draw_rect``.
        fill_config: If truthy, each box is filled with the colour at 0.3
            opacity and gets no outline colour; otherwise only the outline
            is drawn in the colour.

    Raises:
        IndexError: Propagated from ``bbox_list[i]`` when the index is out
            of range for a list-like ``bbox_list``.
    """
    new_rgb = [float(channel) / 255 for channel in rgb_config]

    # The styling does not depend on the individual box, so build it once.
    if fill_config:
        rect_style = {'color': None, 'fill': new_rgb, 'fill_opacity': 0.3}
    else:
        rect_style = {'color': new_rgb, 'fill': None, 'fill_opacity': 1}

    for x0, y0, x1, y1 in bbox_list[i]:
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            width=0.5,
            overlay=True,
            **rect_style,
        )


def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw the boxes stored for index ``i`` onto ``page`` and number them.

    Each box is drawn as a rectangle and then labelled with its 1-based
    position within ``bbox_list[i]``.

    Args:
        i: Index into ``bbox_list`` selecting which group of boxes to draw.
        bbox_list: Indexable collection; ``bbox_list[i]`` is an iterable of
            ``(x0, y0, x1, y1)`` boxes.
        page: Object exposing ``draw_rect`` and ``insert_text``; the callers
            in this file pass pages of a document opened with ``fitz``.
        rgb_config: Colour components on a 0-255 scale. Each one is divided
            by 255 before use; the result colours both the rectangle and
            the label text.
        fill_config: If truthy, each box is filled with the colour at 0.3
            opacity and gets no outline colour; otherwise only the outline
            is drawn in the colour.

    Raises:
        IndexError: Propagated from ``bbox_list[i]`` when the index is out
            of range for a list-like ``bbox_list``.
    """
    new_rgb = [float(channel) / 255 for channel in rgb_config]

    # The styling does not depend on the individual box, so build it once.
    if fill_config:
        rect_style = {'color': None, 'fill': new_rgb, 'fill_opacity': 0.3}
    else:
        rect_style = {'color': new_rgb, 'fill': None, 'fill_opacity': 1}

    for number, (x0, y0, x1, y1) in enumerate(bbox_list[i], start=1):
        page.draw_rect(
            fitz.Rect(x0, y0, x1, y1),
            width=0.5,
            overlay=True,
            **rect_style,
        )
        # Place the label 10 units below the box's (x0, y0) corner.
        page.insert_text(
            (x0, y0 + 10), str(number), fontsize=10, color=new_rgb
        )


def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Draw layout-level bounding boxes on the PDF and save an annotated copy.

    For every page dict in ``pdf_info`` this collects the boxes of the layout
    regions, the discarded blocks and the paragraph blocks (tables, images,
    titles, texts and interline equations, plus the nested body / caption /
    footnote parts of tables and images). The boxes are then drawn on the
    corresponding pages of the document opened from ``pdf_bytes``, and the
    result is saved as ``{out_path}/{filename}_layout.pdf``.

    Layout regions are drawn as numbered red outlines; every other category
    is drawn as a filled rectangle in its own colour.

    Args:
        pdf_info: Iterable of per-page dicts providing the keys
            ``'layout_bboxes'``, ``'discarded_blocks'`` and ``'para_blocks'``.
        pdf_bytes: PDF content passed to ``fitz.open('pdf', ...)``.
        out_path: Directory part of the output path.
        filename: Base name of the output file; ``_layout.pdf`` is appended.

    Note:
        Pages are matched by position, and the draw helpers index the
        collected lists by page number, so ``pdf_info`` needs one entry for
        every page of the document.
    """
    layout_bbox_list = []
    dropped_bbox_list = []
    tables_list, tables_body_list = [], []
    tables_caption_list, tables_footnote_list = [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []

    for page in pdf_info:
        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption = [], [], []
        titles = []
        texts = []
        interequations = []

        layout_bbox_list.append(
            [layout['layout_bbox'] for layout in page['layout_bboxes']])
        dropped_bbox_list.append(
            [dropped['bbox'] for dropped in page['discarded_blocks']])

        for block in page['para_blocks']:
            bbox = block['bbox']
            block_type = block['type']
            if block_type == BlockType.Table:
                tables.append(bbox)
                # A table block carries its body/caption/footnote as
                # nested blocks, each with its own bbox.
                for nested_block in block['blocks']:
                    nested_bbox = nested_block['bbox']
                    nested_type = nested_block['type']
                    if nested_type == BlockType.TableBody:
                        tables_body.append(nested_bbox)
                    elif nested_type == BlockType.TableCaption:
                        tables_caption.append(nested_bbox)
                    elif nested_type == BlockType.TableFootnote:
                        tables_footnote.append(nested_bbox)
            elif block_type == BlockType.Image:
                imgs.append(bbox)
                # An image block carries its body/caption as nested blocks.
                for nested_block in block['blocks']:
                    nested_bbox = nested_block['bbox']
                    nested_type = nested_block['type']
                    if nested_type == BlockType.ImageBody:
                        imgs_body.append(nested_bbox)
                    elif nested_type == BlockType.ImageCaption:
                        imgs_caption.append(nested_bbox)
            elif block_type == BlockType.Title:
                titles.append(bbox)
            elif block_type == BlockType.Text:
                texts.append(bbox)
            elif block_type == BlockType.InterlineEquation:
                interequations.append(bbox)

        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)

    # Filled layers in drawing order (later entries are drawn on top).
    filled_layers = (
        (dropped_bbox_list, [158, 158, 158]),
        (tables_list, [153, 153, 0]),
        (tables_body_list, [204, 204, 0]),
        (tables_caption_list, [255, 255, 102]),
        (tables_footnote_list, [229, 255, 204]),
        (imgs_list, [51, 102, 0]),
        (imgs_body_list, [153, 255, 51]),
        (imgs_caption_list, [102, 178, 255]),
        (titles_list, [102, 102, 255]),
        (texts_list, [153, 0, 76]),
        (interequations_list, [0, 255, 0]),
    )

    # Use the document as a context manager so it is closed even when
    # drawing or saving raises.
    with fitz.open('pdf', pdf_bytes) as pdf_docs:
        for i, page in enumerate(pdf_docs):
            draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0],
                                  False)
            for layer_bboxes, rgb in filled_layers:
                draw_bbox_without_number(i, layer_bboxes, page, rgb, True)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_layout.pdf')


def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
    """Draw span-level bounding boxes on the PDF and save an annotated copy.

    Spans are gathered per page from ``pdf_info`` and grouped by content type
    (text, inline equation, interline equation, image, table), together with
    the spans of discarded blocks. Text and inline-equation spans flagged
    with ``CROSS_PAGE`` are held back and drawn on the following page
    instead. All groups are drawn as unfilled outlines, one colour per
    group, and the result is saved as ``{out_path}/{filename}_spans.pdf``.

    Args:
        pdf_info: Iterable of per-page dicts providing the keys
            ``'discarded_blocks'`` and ``'para_blocks'``.
        pdf_bytes: PDF content passed to ``fitz.open('pdf', ...)``.
        out_path: Directory part of the output path.
        filename: Base name of the output file; ``_spans.pdf`` is appended.

    Note:
        Pages are matched by position, and the draw helper indexes the
        collected lists by page number, so ``pdf_info`` needs one entry for
        every page of the document. Spans flagged ``CROSS_PAGE`` on the last
        page have no following page and are therefore not drawn.
    """
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []

    # Spans flagged CROSS_PAGE wait here until the next page is processed.
    next_page_text_list = []
    next_page_inline_equation_list = []

    def iter_para_spans(page):
        """Yield every span of the page's supported paragraph blocks."""
        for block in page['para_blocks']:
            if block['type'] in (
                    BlockType.Text,
                    BlockType.Title,
                    BlockType.InterlineEquation,
            ):
                line_holders = [block]
            elif block['type'] in (BlockType.Image, BlockType.Table):
                # Images and tables keep their lines in nested sub-blocks.
                line_holders = block['blocks']
            else:
                continue
            for holder in line_holders:
                for line in holder['lines']:
                    yield from line['spans']

    for page in pdf_info:
        # Start with the spans carried over from the previous page.
        page_text_list = list(next_page_text_list)
        next_page_text_list.clear()
        page_inline_equation_list = list(next_page_inline_equation_list)
        next_page_inline_equation_list.clear()
        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []
        page_dropped_list = []

        # Build dropped_list
        for block in page['discarded_blocks']:
            if block['type'] == BlockType.Discarded:
                for line in block['lines']:
                    for span in line['spans']:
                        page_dropped_list.append(span['bbox'])
        dropped_list.append(page_dropped_list)

        # Build the remaining lists of useful spans
        for span in iter_para_spans(page):
            span_type = span['type']
            if span_type == ContentType.Text:
                if span.get(CROSS_PAGE, False):
                    target = next_page_text_list
                else:
                    target = page_text_list
            elif span_type == ContentType.InlineEquation:
                if span.get(CROSS_PAGE, False):
                    target = next_page_inline_equation_list
                else:
                    target = page_inline_equation_list
            elif span_type == ContentType.InterlineEquation:
                target = page_interline_equation_list
            elif span_type == ContentType.Image:
                target = page_image_list
            elif span_type == ContentType.Table:
                target = page_table_list
            else:
                continue
            target.append(span['bbox'])

        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)

    # Outline layers in drawing order (later entries are drawn on top).
    outline_layers = (
        (text_list, [255, 0, 0]),
        (inline_equation_list, [0, 255, 0]),
        (interline_equation_list, [0, 0, 255]),
        (image_list, [255, 204, 0]),
        (table_list, [204, 0, 255]),
        (dropped_list, [158, 158, 158]),
    )

    # Use the document as a context manager so it is closed even when
    # drawing or saving raises.
    with fitz.open('pdf', pdf_bytes) as pdf_docs:
        for i, page in enumerate(pdf_docs):
            for layer_bboxes, rgb in outline_layers:
                draw_bbox_without_number(i, layer_bboxes, page, rgb, False)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_spans.pdf')


def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
    """Draw the model's raw layout detections on the PDF and save a copy.

    ``model_list`` is wrapped in a ``MagicModel`` together with the document
    opened from ``pdf_bytes``. For each page index the ``'layout_dets'`` of
    ``MagicModel.get_model_list(i)`` are grouped by ``'category_id'``. Each
    group is drawn as numbered, filled rectangles in its own colour, and the
    result is saved as ``{out_path}/{filename}_model.pdf``.

    The function name keeps its historical spelling ("drow") so existing
    callers continue to work.

    Args:
        model_list: Per-page model output; its length determines how many
            pages are collected.
        pdf_bytes: PDF content passed to ``fitz.open('pdf', ...)``.
        out_path: Directory part of the output path.
        filename: Base name of the output file; ``_model.pdf`` is appended.

    Note:
        Pages are matched by position, and the draw helper indexes the
        collected lists by page number, so ``model_list`` needs one entry
        for every page of the document.
    """
    dropped_bbox_list = []
    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
    imgs_body_list, imgs_caption_list = [], []
    titles_list = []
    texts_list = []
    interequations_list = []

    # Use the document as a context manager so it is closed even when
    # collecting, drawing or saving raises.
    with fitz.open('pdf', pdf_bytes) as pdf_docs:
        magic_model = MagicModel(model_list, pdf_docs)
        for i in range(len(model_list)):
            page_dropped_list = []
            tables_body, tables_caption, tables_footnote = [], [], []
            imgs_body, imgs_caption = [], []
            titles = []
            texts = []
            interequations = []

            # Category -> destination list, checked in order; the first
            # category that compares equal receives the bbox.
            buckets = (
                (CategoryId.Text, texts),
                (CategoryId.Title, titles),
                (CategoryId.TableBody, tables_body),
                (CategoryId.TableCaption, tables_caption),
                (CategoryId.TableFootnote, tables_footnote),
                (CategoryId.ImageBody, imgs_body),
                (CategoryId.ImageCaption, imgs_caption),
                (CategoryId.InterlineEquation_YOLO, interequations),
                (CategoryId.Abandon, page_dropped_list),
            )

            page_info = magic_model.get_model_list(i)
            for layout_det in page_info['layout_dets']:
                bbox = layout_det['bbox']
                category_id = layout_det['category_id']
                for wanted_category, bucket in buckets:
                    if category_id == wanted_category:
                        bucket.append(bbox)
                        break

            tables_body_list.append(tables_body)
            tables_caption_list.append(tables_caption)
            tables_footnote_list.append(tables_footnote)
            imgs_body_list.append(imgs_body)
            imgs_caption_list.append(imgs_caption)
            titles_list.append(titles)
            texts_list.append(texts)
            interequations_list.append(interequations)
            dropped_bbox_list.append(page_dropped_list)

        # Filled, numbered layers in drawing order (later entries on top).
        numbered_layers = (
            (dropped_bbox_list, [158, 158, 158]),
            (tables_body_list, [204, 204, 0]),
            (tables_caption_list, [255, 255, 102]),
            (tables_footnote_list, [229, 255, 204]),
            (imgs_body_list, [153, 255, 51]),
            (imgs_caption_list, [102, 178, 255]),
            (titles_list, [102, 102, 255]),
            (texts_list, [153, 0, 76]),
            (interequations_list, [0, 255, 0]),
        )
        for i, page in enumerate(pdf_docs):
            for layer_bboxes, rgb in numbered_layers:
                draw_bbox_with_number(i, layer_bboxes, page, rgb, True)

        # Save the PDF
        pdf_docs.save(f'{out_path}/{filename}_model.pdf')