# pdf_parse_by_ocr.py
import json
import os
import time

from loguru import logger

from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_text_bbox
from magic_pdf.libs.commons import (
    read_file,
    join_path,
    fitz,
    get_img_s3_client,
    get_delta_time,
    get_docx_model_output,
)
from magic_pdf.libs.coordinate_transform import get_scale_ratio
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.safe_filename import sanitize_filename
from magic_pdf.para.para_split import para_split
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
from magic_pdf.pre_proc.detect_header import parse_headers
from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import (
    merge_spans_to_line_by_layout, merge_lines_to_block,
)
from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
    adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
    remove_spans_by_bboxes_dict
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox


def construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                             images, tables, interline_equations, inline_equations,
                             dropped_text_block, dropped_image_block, dropped_table_block,
                             need_remove_spans_bboxes_dict):
    """Bundle the parsing results of one page into a single dict.

    Every argument is stored as-is (no copying or validation) under a fixed
    key; ``page_w`` and ``page_h`` are combined into a two-element list.

    Args:
        blocks: Blocks produced before paragraph splitting.
        para_blocks: Blocks after they have been split into paragraphs.
        layout_bboxes: Layout bounding boxes of the page.
        page_id: Index of the page.
        page_w: Page width.
        page_h: Page height.
        layout_tree: Layout tree of the page.
        images: Image entries of the page.
        tables: Table entries of the page.
        interline_equations: Interline-equation entries of the page.
        inline_equations: Inline-equation entries of the page.
        dropped_text_block: Text blocks that were dropped from the page.
        dropped_image_block: Image blocks that were dropped from the page.
        dropped_table_block: Table blocks that were dropped from the page.
        need_remove_spans_bboxes_dict: Bounding boxes that were used to drop spans.

    Returns:
        dict: The page component dictionary.
    """
    return {
        'preproc_blocks': blocks,
        "para_blocks": para_blocks,  # blocks already split into paragraphs
        'layout_bboxes': layout_bboxes,
        'page_idx': page_id,
        'page_size': [page_w, page_h],
        '_layout_tree': layout_tree,
        'images': images,
        'tables': tables,
        'interline_equations': interline_equations,
        'inline_equations': inline_equations,
        # NOTE(review): the "droped" spelling is kept on purpose; these keys are
        # part of the emitted data and presumably read by downstream consumers.
        'droped_text_block': dropped_text_block,
        'droped_image_block': dropped_image_block,
        'droped_table_block': dropped_table_block,
        'droped_bboxes': need_remove_spans_bboxes_dict,
    }


# Model category ids that are turned into spans:
#   1: image, 7: table, 13: inline equation, 14: interline equation, 15: OCR text.
# Ids that are not turned into spans here include 3 (header), 4 (page number),
# 5 (footnote), 6 (footer), 11 (full column) and 12 (sub column).
_SPAN_CATEGORY_IDS = (1, 7, 13, 14, 15)


def _build_spans_from_layout_dets(layout_dets, horizontal_scale_ratio, vertical_scale_ratio):
    """Convert the model's layout detections into span dicts with rescaled bboxes.

    Detections whose ``category_id`` is not in ``_SPAN_CATEGORY_IDS`` are
    skipped, as are detections whose rescaled bbox has zero width or height.
    """
    spans = []
    for layout_det in layout_dets:
        category_id = layout_det["category_id"]
        if category_id not in _SPAN_CATEGORY_IDS:
            continue

        # "poly" holds eight numbers; only the 1st/2nd and 5th/6th are used,
        # as (x0, y0) and (x1, y1) of the bbox.
        x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
        bbox = [
            int(x0 / horizontal_scale_ratio),
            int(y0 / vertical_scale_ratio),
            int(x1 / horizontal_scale_ratio),
            int(y1 / vertical_scale_ratio),
        ]
        # Drop spans whose width or height collapsed to 0.
        if bbox[2] - bbox[0] == 0 or bbox[3] - bbox[1] == 0:
            continue

        span = {
            "bbox": bbox,
        }
        if category_id == 1:
            span["type"] = ContentType.Image
        elif category_id == 7:
            span["type"] = ContentType.Table
        elif category_id == 13:
            span["content"] = layout_det["latex"]
            span["type"] = ContentType.InlineEquation
        elif category_id == 14:
            span["content"] = layout_det["latex"]
            span["type"] = ContentType.InterlineEquation
        elif category_id == 15:
            span["content"] = layout_det["text"]
            span["type"] = ContentType.Text
        spans.append(span)
    return spans


def parse_pdf_by_ocr(
    pdf_path,
    s3_pdf_profile,
    pdf_model_output,
    save_path,
    book_name,
    pdf_model_profile=None,
    image_s3_config=None,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Parse a PDF page by page from OCR/layout model output.

    Args:
        pdf_path: Location of the PDF; passed to ``read_file``.
        s3_pdf_profile: Profile passed to ``read_file`` together with ``pdf_path``.
        pdf_model_output: Model output location/handle; passed to
            ``get_docx_model_output`` for every page.
        save_path: Where cut images/tables are saved. Overridden with a
            temporary directory when ``debug_mode`` is true.
        book_name: Name of the document; sanitized with ``sanitize_filename``
            before use.
        pdf_model_profile: Profile passed to ``get_docx_model_output``.
        image_s3_config: Config passed to ``get_img_s3_client``.
        start_page_id: Index of the first page to parse.
        end_page_id: Index of the last page to parse (inclusive). ``None``
            means "up to the last page"; ``0`` means "only up to page 0".
        debug_mode: When true, the PDF, the resulting dict (as JSON) and bbox
            drawings are written below ``tmp/unittest`` and per-page timing
            is logged.

    Returns:
        dict: Maps ``"page_<page_id>"`` to the dict built by
        ``construct_page_component`` for that page.
    """
    pdf_bytes = read_file(pdf_path, s3_pdf_profile)
    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
    book_name = sanitize_filename(book_name)
    md_bookname_save_path = ""
    if debug_mode:
        save_path = join_path(save_tmp_path, "md")
        pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)

        # Create the target directories if they do not exist yet.
        os.makedirs(os.path.dirname(pdf_local_path), exist_ok=True)

        md_bookname_save_path = join_path(save_tmp_path, "md", book_name)
        os.makedirs(md_bookname_save_path, exist_ok=True)

        with open(pdf_local_path + ".pdf", "wb") as pdf_file:
            pdf_file.write(pdf_bytes)

    pdf_docs = fitz.open("pdf", pdf_bytes)
    # Result container, filled page by page.
    pdf_info_dict = {}
    img_s3_client = get_img_s3_client(save_path, image_s3_config)

    start_time = time.time()

    # Only fall back to the last page when no end page was given; a plain
    # truthiness test would wrongly treat end_page_id=0 as "not given".
    if end_page_id is None:
        end_page_id = len(pdf_docs) - 1

    for page_id in range(start_page_id, end_page_id + 1):
        # Page object and its size.
        page = pdf_docs[page_id]
        page_w = page.rect.width
        page_h = page.rect.height

        if debug_mode:
            time_now = time.time()
            logger.info(
                f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
            )
            start_time = time_now

        # Model data of the current page.
        ocr_page_info = get_docx_model_output(
            pdf_model_output, pdf_model_profile, page_id
        )

        # Bboxes of page numbers, headers, footers and footnotes taken from the model output.
        page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
        header_bboxes = parse_headers(page_id, page, ocr_page_info)
        footer_bboxes = parse_footers(page_id, page, ocr_page_info)
        footnote_bboxes = parse_footnotes_by_model(
            page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode
        )

        # Bboxes whose spans must be removed, grouped by the reason for removal.
        need_remove_spans_bboxes_dict = {
            "page_no": page_no_bboxes,
            "header": header_bboxes,
            "footer": footer_bboxes,
            "footnote": footnote_bboxes,
        }

        layout_dets = ocr_page_info["layout_dets"]

        # Scale ratio between the model coordinates and the PyMuPDF page coordinates.
        horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
            ocr_page_info, page
        )

        spans = _build_spans_from_layout_dets(
            layout_dets, horizontal_scale_ratio, vertical_scale_ratio
        )

        # Of overlapping spans, drop the smaller ones.
        spans = remove_overlaps_min_spans(spans)

        # Remove the spans covered by the bboxes above and keep the dropped
        # data, which QA asked to have in the output.
        spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(
            spans, need_remove_spans_bboxes_dict
        )

        # Cut out images and tables.
        spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)

        # Inline equations: align their height with the text on the same line
        # (left neighbour preferred, then right).
        displayed_list = []
        text_inline_lines = []
        modify_y_axis(spans, displayed_list, text_inline_lines)
        # Interline equations misdetected by the model are converted to inline equations.
        spans = modify_inline_equation(spans, displayed_list, text_inline_lines)

        # Separate bboxes that stick together.
        spans = remove_overlap_between_bbox(spans)

        # For type in ["interline_equation", "image", "table"]: if there is text
        # to the left, adjust the span's y0 so that it is not above the text's y0.
        spans = adjust_bbox_for_standalone_block(spans)

        # Parse the layout from ocr_page_info (sorted in natural reading order,
        # with overlapping/interleaved bad cases repaired).
        layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)

        # Merge spans into lines (inside each layout: top to bottom, left to right).
        lines = merge_spans_to_line_by_layout(spans, layout_bboxes)

        # Merge lines into blocks.
        blocks = merge_lines_to_block(lines)

        # Split the blocks into paragraphs.
        para_blocks = para_split(blocks, layout_bboxes)

        # Lists that QA needs exposed separately.
        images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)

        # Assemble the page entry of pdf_info_dict.
        page_info = construct_page_component(blocks, para_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                             images, tables, interline_equations, inline_equations,
                                             dropped_text_block, dropped_image_block, dropped_table_block,
                                             need_remove_spans_bboxes_dict)
        pdf_info_dict[f"page_{page_id}"] = page_info

    # When testing, persist the debugging information.
    if debug_mode:
        params_file_save_path = join_path(
            save_tmp_path, "md", book_name, "preproc_out.json"
        )
        with open(params_file_save_path, "w", encoding="utf-8") as f:
            json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)

        # Draw the layout and text bboxes.
        draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
        draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)

    return pdf_info_dict