equations_replace.py.bak 20 KB
Newer Older
1
"""对pymupdf返回的结构里的公式进行替换,替换为模型识别的公式结果."""
许瑞's avatar
许瑞 committed
2

赵小蒙's avatar
赵小蒙 committed
3
4
5
import json
import os
from pathlib import Path
6

赵小蒙's avatar
赵小蒙 committed
7
from loguru import logger
8
9
10

from magic_pdf.config.ocr_content_type import ContentType
from magic_pdf.libs.commons import fitz
赵小蒙's avatar
赵小蒙 committed
11

12
13
# Tags for inline and interline equation spans, aliased from ContentType.
# Used below as the 'type' and 'font' values of the synthetic equation spans.
TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
赵小蒙's avatar
赵小蒙 committed
14
15
16


def combine_chars_to_pymudict(block_dict, char_dict):
    """Attach per-character data to a block-level pymupdf structure.

    Blocks and lines of ``block_dict`` are matched to their counterparts in
    ``char_dict`` by bounding box, and every span receives the ``'chars'``
    list of the span at the same position in the matched line.

    Args:
        block_dict: list of block dicts (``'bbox'``, ``'lines'`` -> ``'spans'``);
            mutated in place.
        char_dict: list of block dicts with the same layout whose spans also
            carry a ``'chars'`` list.

    Returns:
        ``block_dict`` itself, with ``'chars'`` set on every span.

    Raises:
        KeyError: if a block or line bbox has no counterpart in ``char_dict``.
    """
    # block_dict has been trimmed, so it cannot be aligned with char_dict by
    # position; align by bbox instead.
    char_blocks_by_bbox = {tuple(item['bbox']): item for item in char_dict}

    for block in block_dict:
        char_block = char_blocks_by_bbox[tuple(block['bbox'])]
        char_lines_by_bbox = {
            tuple(item['bbox']): item for item in char_block['lines']
        }
        for line in block['lines']:
            # The map is keyed by tuples; normalise the lookup key as well so
            # that list-typed bboxes (e.g. after a JSON round trip) still match.
            char_line = char_lines_by_bbox[tuple(line['bbox'])]
            for k, span in enumerate(line['spans']):
                try:
                    chars = char_line['spans'][k]['chars']
                except (IndexError, KeyError, TypeError):
                    # Log the matched char-level line and fall back to an empty
                    # list instead of reusing the previous span's chars.
                    logger.error(char_line)
                    chars = []
                span['chars'] = chars

    return block_dict


def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
    """Return the overlap area of the two boxes divided by the area of ``min_bbox``.

    Boxes are ``(x0, y0, x1, y1)`` sequences. The result is ``0.0`` when the
    boxes do not intersect and ``0`` when ``min_bbox`` has zero area.
    """
    left = max(bbox1[0], min_bbox[0])
    top = max(bbox1[1], min_bbox[1])
    right = min(bbox1[2], min_bbox[2])
    bottom = min(bbox1[3], min_bbox[3])

    # An inverted intersection rectangle means the boxes are disjoint.
    if right < left or bottom < top:
        return 0.0

    denominator = (min_bbox[3] - min_bbox[1]) * (min_bbox[2] - min_bbox[0])
    if denominator == 0:
        return 0
    return (right - left) * (bottom - top) / denominator
许瑞's avatar
许瑞 committed
59

赵小蒙's avatar
赵小蒙 committed
60
61

def _is_xin(bbox1, bbox2):
    """Return True when more than 60% of the smaller box is covered by the larger one.

    The smaller box (by area) is used as the denominator of the overlap
    ratio; on equal areas ``bbox2`` plays that role.
    """

    def _area(box):
        return abs(box[2] - box[0]) * abs(box[3] - box[1])

    if _area(bbox1) < _area(bbox2):
        larger, smaller = bbox2, bbox1
    else:
        larger, smaller = bbox1, bbox2

    return calculate_overlap_area_2_minbox_area_ratio(larger, smaller) > 0.6
赵小蒙's avatar
赵小蒙 committed
70
71
72


def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks):
    """Drop text blocks that lie (almost) entirely inside an interline-equation box.

    A block is removed when at least 70% of its own area overlaps the
    ``'bbox'`` of any equation. ``text_blocks`` is modified in place and also
    returned.
    """
    for equation in interline_bboxes:
        swallowed = [
            candidate
            for candidate in text_blocks
            if calculate_overlap_area_2_minbox_area_ratio(
                equation['bbox'], candidate['bbox']
            )
            >= 0.7
        ]
        # Remove after the scan so the list is never mutated while iterated.
        for candidate in swallowed:
            text_blocks.remove(candidate)

    return text_blocks


def _is_in_or_part_overlap(box1, box2) -> bool:
    """Return True when the two boxes intersect or one contains the other.

    Boxes that merely touch along an edge count as overlapping. ``None`` for
    either argument yields False.
    """
    if box1 is None or box2 is None:
        return False

    left1, top1, right1, bottom1 = box1
    left2, top2, right2, bottom2 = box2

    if right1 < left2:  # box1 is entirely to the left of box2
        return False
    if left1 > right2:  # box1 is entirely to the right of box2
        return False
    if bottom1 < top2:  # box1 is entirely above box2
        return False
    if top1 > bottom2:  # box1 is entirely below box2
        return False
    return True
赵小蒙's avatar
赵小蒙 committed
103

104

许瑞's avatar
许瑞 committed
105
106
107
def remove_text_block_overlap_interline_equation_bbox(
    interline_eq_bboxes, pymu_block_list
):
    """Strip chars that overlap interline equations and shrink the enclosing boxes.

    A char is dropped when more than half of its area overlaps the ``'bbox'``
    of any equation. Spans, lines and blocks left empty are removed; the
    surviving ones get their ``'bbox'`` recomputed as the union of their
    children. ``pymu_block_list`` is modified in place; a new empty list is
    returned when nothing survives.
    """

    def _union(children):
        # Smallest box enclosing the bboxes of all children.
        return (
            min(child['bbox'][0] for child in children),
            min(child['bbox'][1] for child in children),
            max(child['bbox'][2] for child in children),
            max(child['bbox'][3] for child in children),
        )

    def _covered(char):
        return any(
            calculate_overlap_area_2_minbox_area_ratio(eq['bbox'], char['bbox'])
            > 0.5
            for eq in interline_eq_bboxes
        )

    emptied_blocks = []
    for text_block in pymu_block_list:
        emptied_lines = []
        for line in text_block['lines']:
            emptied_spans = []
            for span in line['spans']:
                chars = span['chars']
                for doomed in [ch for ch in chars if _covered(ch)]:
                    chars.remove(doomed)
                if chars:
                    span['bbox'] = _union(chars)
                else:
                    # No chars left: the span goes away.
                    emptied_spans.append(span)

            for span in emptied_spans:
                line['spans'].remove(span)
            if line['spans']:
                line['bbox'] = _union(line['spans'])
            else:
                emptied_lines.append(line)

        for line in emptied_lines:
            text_block['lines'].remove(line)
        if text_block['lines']:
            text_block['bbox'] = _union(text_block['lines'])
        else:
            emptied_blocks.append(text_block)

    for text_block in emptied_blocks:
        pymu_block_list.remove(text_block)

    if not pymu_block_list:
        return []
    return pymu_block_list


def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list):
    """Append one synthetic text block per interline equation.

    Each equation dict must provide ``'bbox'`` and ``'latex'``. The generated
    block holds a single line with a single span tagged
    ``TYPE_INTERLINE_EQUATION``; block, line and span share the equation's
    bbox object. ``pymu_block_list`` is extended in place; nothing is
    returned.
    """
    for equation in interline_eq_bboxes:
        box = equation['bbox']
        latex = equation['latex']

        fake_span = {
            'size': 9.962599754333496,
            'type': TYPE_INTERLINE_EQUATION,
            'flags': 4,
            'font': TYPE_INTERLINE_EQUATION,
            'color': 0,
            'ascender': 0.9409999847412109,
            'descender': -0.3050000071525574,
            'latex': latex,
            'origin': [box[0], box[1]],
            'bbox': box,
        }
        fake_line = {
            'spans': [fake_span],
            'wmode': 0,
            'dir': [1.0, 0.0],
            'bbox': box,
        }
        pymu_block_list.append(
            {
                # Index the block will occupy once appended.
                'number': len(pymu_block_list),
                'type': 0,
                'bbox': box,
                'lines': [fake_line],
            }
        )
许瑞's avatar
许瑞 committed
210
211


赵小蒙's avatar
赵小蒙 committed
212
213
214
215
216
217
def x_overlap_ratio(box1, box2):
    """Return the fraction of ``box2``'s width that overlaps ``box1`` along x.

    Only the x-extents of the ``(x0, y0, x1, y1)`` boxes are used. The result
    is 0 when ``box2`` has zero width.
    """
    left1, right1 = box1[0], box1[2]
    left2, right2 = box2[0], box2[2]
    # Unpack in full so malformed boxes fail exactly like a 4-way unpack.
    _, _, _, _ = box1
    _, _, _, _ = box2

    shared = max(min(right1, right2) - max(left1, left2), 0)
    # The denominator is the width of box2, not box1.
    width2 = right2 - left2
    if width2 != 0:
        return shared / width2
    return 0

许瑞's avatar
许瑞 committed
227

赵小蒙's avatar
赵小蒙 committed
228
def __is_x_dir_overlap(bbox1, bbox2):
    """Return True when the x-extents of the two boxes intersect or touch."""
    if bbox1[2] < bbox2[0]:  # bbox1 ends before bbox2 starts
        return False
    if bbox1[0] > bbox2[2]:  # bbox1 starts after bbox2 ends
        return False
    return True

赵小蒙's avatar
赵小蒙 committed
231
232
233
234
235
236
237
238

def __y_overlap_ratio(box1, box2):
    """Return the fraction of ``box1``'s height that overlaps ``box2`` along y.

    Only the y-extents of the ``(x0, y0, x1, y1)`` boxes are used. The result
    is 0 when ``box1`` has zero height.
    """
    _, top1, _, bottom1 = box1
    _, top2, _, bottom2 = box2

    shared = max(min(bottom1, bottom2) - max(top1, top2), 0)
    height1 = bottom1 - top1
    if height1 == 0:
        return 0
    return shared / height1
许瑞's avatar
许瑞 committed
247
248


赵小蒙's avatar
赵小蒙 committed
249
def replace_line_v2(eqinfo, line):
    """Replace the chars of ``line`` covered by an inline equation with one equation span.

    Every char of the line that overlaps the equation box along x is
    collected, and the x-extent ``[x0, x1]`` of those chars is computed. Spans
    lying entirely inside that extent are removed, a synthetic equation span
    is inserted, and the first and last overlapping spans are cut down to the
    chars outside the extent.

    Args:
        eqinfo: equation dict providing ``'bbox'`` and ``'latex'``.
        line: line dict with a ``'spans'`` list; modified in place.

    Returns:
        False when no char of the line ends up assigned to the equation (the
        caller may then try another line), True after a replacement.
    """
    first_overlap_span = None
    first_overlap_span_idx = -1
    last_overlap_span = None
    delete_chars = []
    for idx, span in enumerate(line['spans']):
        if 'chars' not in span:
            continue
        if span.get('_type', None) is not None:
            continue  # skip: already an inserted synthetic equation span

        for char in span['chars']:
            if __is_x_dir_overlap(eqinfo['bbox'], char['bbox']):
                if first_overlap_span_idx == -1:
                    first_overlap_span = span
                    first_overlap_span_idx = idx
                last_overlap_span = span
                delete_chars.append(char)

    # The first and last candidate chars may only graze the equation box; keep
    # them in the text when less than 51% of their width is covered by it.
    if delete_chars and x_overlap_ratio(eqinfo['bbox'], delete_chars[0]['bbox']) < 0.51:
        delete_chars.pop(0)
    if delete_chars and x_overlap_ratio(eqinfo['bbox'], delete_chars[-1]['bbox']) < 0.51:
        delete_chars.pop()

    if not delete_chars:
        # Nothing to replace on this line.
        return False

    # Real x-extent of the chars that are being replaced.
    x0 = min(ch['bbox'][0] for ch in delete_chars)
    x1 = max(ch['bbox'][2] for ch in delete_chars)

    # Remove the spans that lie completely inside [x0, x1].
    delete_span = [
        span
        for span in line['spans']
        if x0 <= span['bbox'][0] and span['bbox'][2] <= x1
    ]
    for span in delete_span:
        line['spans'].remove(span)

    eq_bbox = eqinfo['bbox']
    equation_span = {
        'size': 9.962599754333496,
        'type': TYPE_INLINE_EQUATION,
        'flags': 4,
        'font': TYPE_INLINE_EQUATION,
        'color': 0,
        'ascender': 0.9409999847412109,
        'descender': -0.3050000071525574,
        'latex': eqinfo['latex'],
        'origin': [x0, eq_bbox[1]],
        # Horizontal extent from the replaced chars, vertical extent from the
        # equation box.
        'bbox': [x0, eq_bbox[1], x1, eq_bbox[3]],
        'chars': delete_chars,
        '_eq_bbox': eq_bbox,
    }
    # NOTE(review): the index was taken before the deletion above, so when
    # spans at or before it were removed the equation lands one or more slots
    # later than its textual position. Left unchanged; confirm whether later
    # stages re-sort spans.
    line['spans'].insert(first_overlap_span_idx + 1, equation_span)

    # Split the first and last overlapping spans: keep only the chars whose
    # horizontal midpoint lies outside [x0, x1].
    first_span_chars = [
        char
        for char in first_overlap_span['chars']
        if (char['bbox'][2] + char['bbox'][0]) / 2 < x0
    ]
    tail_span_chars = [
        char
        for char in last_overlap_span['chars']
        if (char['bbox'][0] + char['bbox'][2]) / 2 > x1
    ]

    if len(first_span_chars) > 0:
        first_overlap_span['chars'] = first_span_chars
        first_overlap_span['text'] = ''.join([char['c'] for char in first_span_chars])
        first_overlap_span['bbox'] = (
            first_overlap_span['bbox'][0],
            first_overlap_span['bbox'][1],
            max(ch['bbox'][2] for ch in first_span_chars),
            first_overlap_span['bbox'][3],
        )
    else:
        # Nothing of the first span remains in front of the equation.
        if first_overlap_span not in delete_span:
            line['spans'].remove(first_overlap_span)

    if len(tail_span_chars) > 0:
        min_of_tail_span_x0 = min(ch['bbox'][0] for ch in tail_span_chars)
        min_of_tail_span_y0 = min(ch['bbox'][1] for ch in tail_span_chars)
        max_of_tail_span_x1 = max(ch['bbox'][2] for ch in tail_span_chars)
        max_of_tail_span_y1 = max(ch['bbox'][3] for ch in tail_span_chars)

        # NOTE(review): spans are compared by equality here (and in the
        # membership/remove/index calls), not by identity; kept as in the
        # original.
        if last_overlap_span == first_overlap_span:
            # The equation sits inside a single span, so the trailing text
            # needs a new span of its own.
            last_span_to_insert = last_overlap_span.copy()
            last_span_to_insert['chars'] = tail_span_chars
            last_span_to_insert['text'] = ''.join(
                [char['c'] for char in tail_span_chars]
            )
            if equation_span['bbox'][2] >= last_overlap_span['bbox'][2]:
                last_span_to_insert['bbox'] = (
                    min_of_tail_span_x0,
                    min_of_tail_span_y0,
                    max_of_tail_span_x1,
                    max_of_tail_span_y1,
                )
            else:
                last_span_to_insert['bbox'] = (
                    min_of_tail_span_x0,
                    last_overlap_span['bbox'][1],
                    last_overlap_span['bbox'][2],
                    last_overlap_span['bbox'][3],
                )
            # Place the trailing text right after the equation span.
            equation_idx = line['spans'].index(equation_span)
            line['spans'].insert(equation_idx + 1, last_span_to_insert)
        else:
            # A different span holds the tail: shrink it in place.
            last_overlap_span['chars'] = tail_span_chars
            last_overlap_span['text'] = ''.join([char['c'] for char in tail_span_chars])
            last_overlap_span['bbox'] = (
                min_of_tail_span_x0,
                last_overlap_span['bbox'][1],
                last_overlap_span['bbox'][2],
                last_overlap_span['bbox'][3],
            )
    else:
        # Nothing of the last span remains behind the equation.
        if (
            last_overlap_span not in delete_span
            and last_overlap_span != first_overlap_span
        ):
            line['spans'].remove(last_overlap_span)

    return True


def replace_eq_blk(eqinfo, text_block):
    """Replace an inline equation inside the first suitable line of ``text_block``.

    Returns True as soon as one line accepts the replacement, False when no
    line does.
    """
    for line in text_block['lines']:
        line_bbox = line['bbox']
        # Locate the line either by area overlap or by y-overlap alone; the
        # latter covers a tall line next to a narrow equation box.
        located = (
            _is_xin(eqinfo['bbox'], line_bbox)
            or __y_overlap_ratio(eqinfo['bbox'], line_bbox) > 0.6
        )
        # Line heights reported for a PDF can be off, so a located line may
        # still fail at span level; in that case keep trying the next lines.
        if located and replace_line_v2(eqinfo, line):
            return True
    return False


def replace_inline_equations(inline_equation_bboxes, raw_text_blocks):
    """Replace every inline equation inside the text block that contains it.

    For each equation the blocks are scanned in order; the scan stops at the
    first block where the replacement succeeds, and a warning is logged for
    every matching block where it fails. Returns ``raw_text_blocks``.
    """
    for eqinfo in inline_equation_bboxes:
        eqbox = eqinfo['bbox']
        for blk in raw_text_blocks:
            if not _is_xin(eqbox, blk['bbox']):
                continue
            if replace_eq_blk(eqinfo, blk):
                break
            logger.warning(f'行内公式没有替换成功:{eqinfo} ')

    return raw_text_blocks

许瑞's avatar
许瑞 committed
453

赵小蒙's avatar
赵小蒙 committed
454
def remove_chars_in_text_blocks(text_blocks):
    """Delete the ``'chars'`` entry from every span and return ``text_blocks``.

    Spans without a ``'chars'`` key are left untouched.
    """
    every_span = (
        span
        for blk in text_blocks
        for line in blk['lines']
        for span in line['spans']
    )
    for span in every_span:
        span.pop('chars', None)
    return text_blocks


许瑞's avatar
许瑞 committed
463
464
465
def replace_equations_in_textblock(
    raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes
):
    """Replace interline and inline equations in the text blocks with LaTeX.

    Steps, in order: drop text blocks lying inside interline equations, strip
    chars that overlap interline equations, append synthetic blocks for the
    interline equations, then replace the inline equations. Returns the
    resulting block list.
    """
    # De-overlap, step 1: blocks inside an equation box.
    blocks = remove_text_block_in_interline_equation_bbox(
        interline_equation_bboxes, raw_text_blocks
    )
    # De-overlap, step 2: chars covered by an equation box.
    blocks = remove_text_block_overlap_interline_equation_bbox(
        interline_equation_bboxes, blocks
    )
    insert_interline_equations_textblock(interline_equation_bboxes, blocks)
    return replace_inline_equations(inline_equation_bboxes, blocks)
许瑞's avatar
许瑞 committed
478

赵小蒙's avatar
赵小蒙 committed
479
480

def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path):
    """Write a debug copy of a PDF with selected spans highlighted after equation replacement.

    For every page the equation-replacement pipeline is run on the page's
    ``preproc_blocks`` and a rectangle is drawn for each span. Only spans
    whose ``'_type'`` is ``'first'`` (blue), ``'tail'`` (green) or
    ``TYPE_INLINE_EQUATION`` (black) get a fill; all others are drawn with
    no fill.

    Args:
        json_path: UTF-8 JSON file holding a dict keyed ``page_<index>``;
            each value provides ``inline_equations``,
            ``interline_equations`` and ``preproc_blocks``.
        pdf_path: path of the source PDF.

    Returns:
        Path of the written PDF, placed next to ``pdf_path``.

    Side effects:
        Replaces an existing output PDF and writes the processed JSON to
        ``equations_test/final_json.json`` relative to the working directory.
    """
    new_pdf = f'{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf'
    with open(json_path, 'r', encoding='utf-8') as f:
        obj = json.load(f)

    if os.path.exists(new_pdf):
        os.remove(new_pdf)

    new_doc = fitz.open(pdf_path)
    try:
        for page_idx in range(len(new_doc)):
            page = new_doc[page_idx]
            page_info = obj[f'page_{page_idx}']
            inline_equation_bboxes = page_info['inline_equations']
            interline_equation_bboxes = page_info['interline_equations']
            raw_text_blocks = replace_equations_in_textblock(
                page_info['preproc_blocks'],
                inline_equation_bboxes,
                interline_equation_bboxes,
            )

            # Highlight spans so duplicated equations are easy to spot.
            for blk in raw_text_blocks:
                for line in blk['lines']:
                    for span in line['spans']:
                        shape_page = page.new_shape()
                        span_type = span.get('_type')
                        if span_type == 'first':
                            color = fitz.pdfcolor['blue']
                        elif span_type == 'tail':
                            color = fitz.pdfcolor['green']
                        elif span_type == TYPE_INLINE_EQUATION:
                            color = fitz.pdfcolor['black']
                        else:
                            color = None

                        shape_page.draw_rect(span['bbox'])
                        shape_page.finish(color=None, fill=color, fill_opacity=0.3)
                        shape_page.commit()

        new_doc.save(new_pdf)
    finally:
        # Release the document handle even when processing fails.
        new_doc.close()
    logger.info(f'save ok {new_pdf}')

    final_json = json.dumps(obj, ensure_ascii=False, indent=2)
    # The dump keeps non-ASCII text verbatim, so the file must be written as
    # UTF-8 regardless of the platform default encoding.
    os.makedirs('equations_test', exist_ok=True)
    with open('equations_test/final_json.json', 'w', encoding='utf-8') as f:
        f.write(final_json)

    return new_pdf


548
if __name__ == '__main__':
    # Script entry point: does nothing; the debug call below is commented out.
    # draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf)
    pass