remove_rotate_bbox.py.bak 8.6 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
import math
2
import re
赵小蒙's avatar
赵小蒙 committed
3

4
5
from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
                                       VERTICAL_TEXT)
赵小蒙's avatar
赵小蒙 committed
6
from magic_pdf.libs.boxbase import is_vbox_on_side
赵小蒙's avatar
赵小蒙 committed
7
8
9


def detect_non_horizontal_texts(result_dict):
    """Flag repeated non-horizontal blocks as watermarks or vertical margin
    notes.

    A block is identified by the pair ``(bbox, text)``. Every block that
    carries a ``'dir'`` vector is classified by the absolute angle of that
    vector:

    * strictly between 5 and 85 degrees  -> watermark candidate
    * strictly between 85 and 105 degrees -> vertical margin note candidate

    A candidate is confirmed when its ``(bbox, text)`` pair is counted on
    more than half of the pages (integer division). Headers and footers
    usually change from page to page, so they do not reach that count.

    Parameters
    ----------
    result_dict : dict
        Mapping whose keys starting with ``'page_'`` hold per-page dicts.
        Inside a page, keys starting with ``'block_'`` hold block dicts with
        ``'bbox'`` and ``'text'`` entries and, optionally, a ``'dir'``
        entry (an ``(x, y)`` direction vector). All other keys are ignored.

    Returns
    -------
    result_dict : dict
        The same object. Every ``'block_*'`` dict gains the integer flags
        ``'is_watermark'`` and ``'is_vertical_margin_note'`` (0 or 1).
    """

    def block_key(block_data):
        # A bbox loaded from JSON is a list, which cannot be used inside a
        # dict key; convert it so the lookup does not raise TypeError.
        bbox = block_data['bbox']
        if isinstance(bbox, list):
            bbox = tuple(bbox)
        return bbox, block_data['text']

    # Only real pages take part, both in the scan and in the threshold.
    pages = [
        page_content
        for page_id, page_content in result_dict.items()
        if page_id.startswith('page_')
    ]

    watermark_counts = {}
    margin_note_counts = {}

    for page_content in pages:
        for block_id, block_data in page_content.items():
            if not block_id.startswith('block_') or 'dir' not in block_data:
                continue

            key = block_key(block_data)
            direction = block_data['dir']
            angle = abs(math.degrees(math.atan2(direction[1], direction[0])))

            if 5 < angle < 85:  # slanted text: watermark candidate
                watermark_counts[key] = watermark_counts.get(key, 0) + 1
            elif 85 < angle < 105:  # roughly vertical: margin note candidate
                margin_note_counts[key] = margin_note_counts.get(key, 0) + 1

    # "More than half of the pages": non-page keys must not inflate the bar.
    threshold = len(pages) // 2
    watermarks = {k for k, v in watermark_counts.items() if v > threshold}
    margin_notes = {k for k, v in margin_note_counts.items() if v > threshold}

    # Write the verdict back onto every block (same filter as the scan above,
    # so non-block page entries are left untouched).
    for page_content in pages:
        for block_id, block_data in page_content.items():
            if not block_id.startswith('block_'):
                continue

            key = block_key(block_data)
            block_data['is_watermark'] = 1 if key in watermarks else 0
            block_data['is_vertical_margin_note'] = (
                1 if key in margin_notes else 0
            )

    return result_dict


"""
1. 当一个block里全部文字都不是dir=(1,0),这个block整体去掉
2. 当一个block里全部文字都是dir=(1,0),但是每行只有一个字,这个block整体去掉。这个block必须出现在页面的四周,否则不去掉
"""
99

赵小蒙's avatar
赵小蒙 committed
100
101
102
103
104
105

def __is_a_word(sentence):
    """Tell whether ``sentence`` is one short, character-like token.

    Returns True when ``sentence`` is exactly one character in the range
    U+4E00..U+9FA5, or when it consists solely of ASCII letters/digits and
    is at most two characters long. Everything else (including the empty
    string and punctuation) yields False.
    """
    # Exactly one CJK character.
    if re.fullmatch(r'[\u4e00-\u9fa5]', sentence) is not None:
        return True

    # One or two ASCII letters/digits.
    is_alnum_only = re.fullmatch(r'[a-zA-Z0-9]+', sentence) is not None
    return is_alnum_only and len(sentence) <= 2


def __get_text_color(num):
    """Unpack an integer colour into its ``(red, green, blue)`` bytes.

    Red is taken from bits 16-23, green from bits 8-15 and blue from
    bits 0-7 of ``num``.
    """
    return (num >> 16) & 255, (num >> 8) & 255, num & 255


def __is_empty_side_box(text_block):
    """Tell whether a text block shows no visible content.

    The block counts as empty when every span is either whitespace-only or
    coloured pure white (red, green and blue bytes all 255). A single span
    with non-blank text in any other colour makes the block non-empty.
    """
    white = 0xFFFFFF
    for line in text_block['lines']:
        for span in line['spans']:
            # The low 24 bits hold the RGB bytes; all three being 255 is the
            # same as those bits equalling 0xFFFFFF.
            is_white = (span['color'] & white) == white
            if span['text'].strip() and not is_white:
                return False

    return True


def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
    """Remove vertical and rotated text blocks from the sides of a page.

    Only blocks for which ``is_vbox_on_side(bbox, page_width, page_height,
    0.2)`` is true are examined (per the original author's note this keeps
    the check to boxes at the page sides -- the exact meaning of ``0.2`` is
    defined by that helper). Such a block is removed when either:

    * it is vertical text: it has at least two lines, every line holds
      exactly one span, every span text passes ``__is_a_word`` and all spans
      start at the same integer x0 -> tagged ``VERTICAL_TEXT``; or
    * at least one of its lines has a ``'dir'`` other than ``(1, 0)``
      -> tagged ``ROTATE_TEXT``.

    Parameters
    ----------
    pymu_text_block : list of dict
        Text blocks (layout as in test/assets/papre/pymu_textblocks.json),
        each with ``'bbox'`` and ``'lines'``; lines carry ``'dir'`` and
        ``'spans'``. The list is modified in place.
    page_width, page_height :
        Page dimensions, forwarded to ``is_vbox_on_side``.

    Returns
    -------
    tuple
        ``(pymu_text_block, removed_text_block)``: the same input list with
        the removed blocks taken out, and the list of removed blocks, each
        carrying its ``'tag'``.
    """
    removed_text_block = []

    for block in pymu_text_block:
        lines = block['lines']
        block_bbox = block['bbox']

        # Blocks away from the page sides are never touched.
        if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2):
            continue

        # Vertical text shows up as a column of one-span lines, each holding a
        # single character/short token, all aligned on the same x0.
        is_single_span_column = len(lines) > 1 and all(
            len(line['spans']) == 1 for line in lines
        )
        if is_single_span_column and all(
            __is_a_word(line['spans'][0]['text']) for line in lines
        ):
            left_edges = {int(line['spans'][0]['bbox'][0]) for line in lines}
            if len(left_edges) == 1:
                block['tag'] = VERTICAL_TEXT
                removed_text_block.append(block)
                continue

        # One non-horizontal line is enough to drop the whole block.
        # tuple() lets a list-valued dir such as [1, 0] (e.g. data loaded from
        # JSON) compare equal to (1, 0) instead of always looking rotated.
        if any(tuple(line['dir']) != (1, 0) for line in lines):
            block['tag'] = ROTATE_TEXT
            removed_text_block.append(block)

    for block in removed_text_block:
        pymu_text_block.remove(block)

    return pymu_text_block, removed_text_block

200

赵小蒙's avatar
赵小蒙 committed
201
def get_side_boundry(rotate_bbox, page_width, page_height):
    """Derive the left and right body-text boundaries from ``rotate_bbox``.

    Each item's ``'bbox'`` is assigned to the left side when its x1
    (index 2) lies left of the page's vertical centre line, otherwise to
    the right side. The left boundary is the largest x1 on the left side
    (at least 0) plus one; the right boundary is the smallest x0 (index 0)
    on the right side (at most ``page_width``) minus one.
    ``page_height`` is not used.
    """
    centre_x = page_width / 2
    left_limit, right_limit = 0, page_width

    for item in rotate_bbox:
        bbox = item['bbox']
        if bbox[2] < centre_x:
            # Left-hand box: push the boundary right, up to its right edge.
            if bbox[2] > left_limit:
                left_limit = bbox[2]
        elif bbox[0] < right_limit:
            # Right-hand box: pull the boundary left, down to its left edge.
            right_limit = bbox[0]

    return left_limit + 1, right_limit - 1
赵小蒙's avatar
赵小蒙 committed
213
214
215


def remove_side_blank_block(pymu_text_block, page_width, page_height):
    """Remove blank text blocks that sit at the sides of a page.

    A block is removed when ``is_vbox_on_side(bbox, page_width, page_height,
    0.2)`` holds for it and ``__is_empty_side_box`` reports it as empty.
    Removed blocks are tagged ``EMPTY_SIDE_BLOCK`` and taken out of
    ``pymu_text_block`` in place (block layout as in
    test/assets/papre/pymu_textblocks.json).

    Returns the tuple ``(pymu_text_block, removed_text_block)``.
    """
    dropped = []

    for candidate in pymu_text_block:
        candidate_bbox = candidate['bbox']
        # Only blocks at the page sides that are also empty are dropped.
        at_side = is_vbox_on_side(candidate_bbox, page_width, page_height, 0.2)
        if at_side and __is_empty_side_box(candidate):
            candidate['tag'] = EMPTY_SIDE_BLOCK
            dropped.append(candidate)

    for candidate in dropped:
        pymu_text_block.remove(candidate)

    return pymu_text_block, dropped