citationmarker_remove.py 6.48 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
5
"""
去掉正文的引文引用marker
https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
"""
import re
赵小蒙's avatar
赵小蒙 committed
6
# from magic_pdf.libs.nlp_utils import NLPModels
赵小蒙's avatar
赵小蒙 committed
7
8


赵小蒙's avatar
赵小蒙 committed
9
# __NLP_MODEL = NLPModels()
赵小蒙's avatar
赵小蒙 committed
10
11
12
13
14
15
16
17
18
19
20
21
22

def check_1(spans, cur_span_i):
    """寻找前一个char,如果是句号,逗号,那么就是角标"""
    if cur_span_i==0:
        return False # 不是角标
    pre_span = spans[cur_span_i-1]
    pre_char = pre_span['chars'][-1]['c']
    if pre_char in ['。', ',', '.', ',']:
        return True
    
    return False


赵小蒙's avatar
赵小蒙 committed
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# def check_2(spans, cur_span_i):
#     """检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标"""
#     pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写
#
#     if cur_span_i==0 and len(spans)>1:
#         next_span = spans[cur_span_i+1]
#         next_txt = "".join([c['c'] for c in next_span['chars']])
#         result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
#         if result in ["PERSON", "GPE", "ORG"]:
#             return True
#
#         if re.findall(pattern, next_txt):
#             return True
#
#         return False # 不是角标
#     elif cur_span_i==0 and len(spans)==1: # 角标占用了整行?谨慎删除
#         return False
#
#     # 如果这个span是最后一个span,
#     if cur_span_i==len(spans)-1:
#         pre_span = spans[cur_span_i-1]
#         pre_txt = "".join([c['c'] for c in pre_span['chars']])
#         pre_word = pre_txt.split(' ')[-1]
#         result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
#         if result in ["PERSON", "GPE", "ORG"]:
#             return True
#
#         if re.findall(pattern, pre_txt):
#             return True
#
#         return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
#     else: # 既不是第一个span,也不是最后一个span,那么此时检查一下这个角标距离前后哪个单词更近就属于谁的角标
#         pre_span = spans[cur_span_i-1]
#         next_span = spans[cur_span_i+1]
#         cur_span = spans[cur_span_i]
#         # 找到前一个和后一个span里的距离最近的单词
#         pre_distance = 10000 # 一个很大的数
#         next_distance = 10000 # 一个很大的数
#         for c in pre_span['chars'][::-1]:
#             if c['c'].isalpha():
#                 pre_distance = cur_span['bbox'][0] - c['bbox'][2]
#                 break
#         for c in next_span['chars']:
#             if c['c'].isalpha():
#                 next_distance = c['bbox'][0] - cur_span['bbox'][2]
#                 break
#
#         if pre_distance<next_distance:
#             belong_to_span = pre_span
#         else:
#             belong_to_span = next_span
#
#         txt = "".join([c['c'] for c in belong_to_span['chars']])
#         pre_word = txt.split(' ')[-1]
#         result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
#         if result in ["PERSON", "GPE", "ORG"]:
#             return True
#
#         if re.findall(pattern, txt):
#             return True
#
#         return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
赵小蒙's avatar
赵小蒙 committed
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115


def check_3(spans, cur_span_i):
    """Judge a span to be a citation superscript from its own characters.

    The span text, stripped of surrounding whitespace, counts as a citation
    marker when either of the following holds:

    * it contains a digit together with one of ``[``, ``]``, ``*`` or ``,``
      (for example ``[2-3]``, ``[22]`` or ``2,3,4``);
    * it begins with a range or pair such as ``2-3``, ``a-b`` or ``a,b``.

    Args:
        spans: Sequence of span dicts. Each span is read through its
            ``'chars'`` list, whose items expose one character under ``'c'``.
        cur_span_i: Index of the candidate span inside ``spans``.

    Returns:
        True when the text matches one of the shapes above, otherwise False.
    """
    text = ''.join(ch['c'] for ch in spans[cur_span_i]['chars']).strip()

    contains_marker_char = any(mark in text for mark in ('[', ']', '*', ','))
    if contains_marker_char and any(ch.isdigit() for ch in text):
        return True

    # re.match only anchors at the start, so these shapes must open the text.
    leading_shapes = (r'\d+-\d+', r'[a-zA-Z]-[a-zA-Z]', r'[a-zA-Z],[a-zA-Z]')
    return any(re.match(shape, text) is not None for shape in leading_shapes)


def remove_citation_marker(with_char_text_blcoks):
    """Remove citation superscript spans from every line of the given blocks.

    For each line with at least two spans, a span is dropped when all of the
    following hold:

    1. its font size is at least 1 smaller than the largest font in the line;
    2. it is vertically offset from the tallest span of the line (its
       mid-line differs by more than 20% of its own height), or its mid-line
       value is smaller than the tallest span's and its font size is within
       10% of the smallest font in the line;
    3. ``check_1`` or ``check_3`` classifies it as a citation marker.

    When spans are dropped from a line, ``line['text']`` is rebuilt from the
    characters of the remaining spans.

    Args:
        with_char_text_blcoks: Iterable of block dicts. Each block exposes
            ``'lines'``; each line exposes ``'spans'``; each span exposes
            ``'bbox'``, ``'size'`` and ``'chars'``. Only ``bbox[1]`` and
            ``bbox[3]`` are read, as the vertical extent of the span
            (presumably bbox is (x0, y0, x1, y1) -- confirm with the caller).

    Returns:
        The same ``with_char_text_blcoks`` object; lines are modified in place.
    """
    for blk in with_char_text_blcoks:
        for line in blk['lines']:
            spans = line['spans']

            # A marker cannot fill a line by itself, so lines with fewer than
            # two spans are skipped.
            if len(spans) <= 1:
                continue

            # The tallest span is the vertical reference; also collect the
            # smallest and largest font sizes of the line.
            base_bbox = spans[0]['bbox']
            min_font_sz = 10000  # smallest font size seen in the line
            max_font_sz = 0      # largest font size seen in the line
            for s in spans:
                if base_bbox[3] - base_bbox[1] < s['bbox'][3] - s['bbox'][1]:
                    base_bbox = s['bbox']
                if min_font_sz > s['size']:
                    min_font_sz = s['size']
                if max_font_sz < s['size']:
                    max_font_sz = s['size']

            base_span_mid_y = (base_bbox[3] + base_bbox[1]) / 2

            # Positions are recorded instead of span objects: removing by
            # value would drop the first *equal* span, which may be a
            # different span than the one that was flagged.
            idx_to_del = set()
            for i, span in enumerate(spans):
                span_hi = span['bbox'][3] - span['bbox'][1]
                span_mid_y = (span['bbox'][3] + span['bbox'][1]) / 2
                span_font_sz = span['size']

                # Body text is filtered out by font size first.
                if max_font_sz - span_font_sz < 1:
                    continue

                # Guard the two divisions below against a zero divisor.
                if span_hi == 0 or min_font_sz == 0:
                    continue

                mid_offset = base_span_mid_y - span_mid_y
                shifted = mid_offset / span_hi > 0.2
                small_and_offset = (
                    mid_offset > 0
                    and abs(span_font_sz - min_font_sz) / min_font_sz < 0.1
                )
                if not (shifted or small_and_offset):
                    continue

                # Heuristics separating citation markers from formula
                # superscripts:
                #   check_1: the previous character is a period or a comma.
                #   check_3: the superscript itself holds digits with commas,
                #            asterisks or square brackets, or a range.
                # (A word-based check_2 exists only as commented-out code.)
                if check_1(spans, i) or check_3(spans, i):
                    idx_to_del.add(i)

            if idx_to_del:
                # Slice assignment keeps the original list object, so any
                # other reference to line['spans'] sees the update as well.
                spans[:] = [s for i, s in enumerate(spans) if i not in idx_to_del]
                line['text'] = ''.join(c['c'] for s in spans for c in s['chars'])

    return with_char_text_blcoks