Commit dbdbaf58 authored by liukaiwen

Merge branch 'master' of github.com:papayalove/Magic-PDF

parents afe92f07 3711a333
@@ -3,10 +3,10 @@
 https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d
 """
 import re
-from magic_pdf.libs.nlp_utils import NLPModels
+# from magic_pdf.libs.nlp_utils import NLPModels

-__NLP_MODEL = NLPModels()
+# __NLP_MODEL = NLPModels()


 def check_1(spans, cur_span_i):
     """Look at the previous char: if it is a period or a comma, the current span is a citation marker."""
@@ -20,68 +20,68 @@ def check_1(spans, cur_span_i):
     return False


-def check_2(spans, cur_span_i):
-    """Check the last word of the previous span: if it is longer than 5 characters, all letters and contains no uppercase, the current span is a citation marker."""
-    pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # abbreviated personal names such as "A. Bcde", "L. Bcde"
-
-    if cur_span_i==0 and len(spans)>1:
-        next_span = spans[cur_span_i+1]
-        next_txt = "".join([c['c'] for c in next_span['chars']])
-        result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
-        if result in ["PERSON", "GPE", "ORG"]:
-            return True
-
-        if re.findall(pattern, next_txt):
-            return True
-
-        return False # not a citation marker
-    elif cur_span_i==0 and len(spans)==1: # the marker occupies the whole line? delete with caution
-        return False
-
-    # if this span is the last span
-    if cur_span_i==len(spans)-1:
-        pre_span = spans[cur_span_i-1]
-        pre_txt = "".join([c['c'] for c in pre_span['chars']])
-        pre_word = pre_txt.split(' ')[-1]
-        result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
-        if result in ["PERSON", "GPE", "ORG"]:
-            return True
-
-        if re.findall(pattern, pre_txt):
-            return True
-
-        return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
-    else: # neither the first nor the last span: attribute the marker to whichever neighboring word is closer
-        pre_span = spans[cur_span_i-1]
-        next_span = spans[cur_span_i+1]
-        cur_span = spans[cur_span_i]
-        # find the closest word in the previous and the next span
-        pre_distance = 10000 # a very large number
-        next_distance = 10000 # a very large number
-        for c in pre_span['chars'][::-1]:
-            if c['c'].isalpha():
-                pre_distance = cur_span['bbox'][0] - c['bbox'][2]
-                break
-        for c in next_span['chars']:
-            if c['c'].isalpha():
-                next_distance = c['bbox'][0] - cur_span['bbox'][2]
-                break
-
-        if pre_distance<next_distance:
-            belong_to_span = pre_span
-        else:
-            belong_to_span = next_span
-
-        txt = "".join([c['c'] for c in belong_to_span['chars']])
-        pre_word = txt.split(' ')[-1]
-        result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
-        if result in ["PERSON", "GPE", "ORG"]:
-            return True
-
-        if re.findall(pattern, txt):
-            return True
-
-        return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
+# def check_2(spans, cur_span_i):
+#     """Check the last word of the previous span: if it is longer than 5 characters, all letters and contains no uppercase, the current span is a citation marker."""
+#     pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # abbreviated personal names such as "A. Bcde", "L. Bcde"
+#
+#     if cur_span_i==0 and len(spans)>1:
+#         next_span = spans[cur_span_i+1]
+#         next_txt = "".join([c['c'] for c in next_span['chars']])
+#         result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt)
+#         if result in ["PERSON", "GPE", "ORG"]:
+#             return True
+#
+#         if re.findall(pattern, next_txt):
+#             return True
+#
+#         return False # not a citation marker
+#     elif cur_span_i==0 and len(spans)==1: # the marker occupies the whole line? delete with caution
+#         return False
+#
+#     # if this span is the last span
+#     if cur_span_i==len(spans)-1:
+#         pre_span = spans[cur_span_i-1]
+#         pre_txt = "".join([c['c'] for c in pre_span['chars']])
+#         pre_word = pre_txt.split(' ')[-1]
+#         result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt)
+#         if result in ["PERSON", "GPE", "ORG"]:
+#             return True
+#
+#         if re.findall(pattern, pre_txt):
+#             return True
+#
+#         return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()
+#     else: # neither the first nor the last span: attribute the marker to whichever neighboring word is closer
+#         pre_span = spans[cur_span_i-1]
+#         next_span = spans[cur_span_i+1]
+#         cur_span = spans[cur_span_i]
+#         # find the closest word in the previous and the next span
+#         pre_distance = 10000 # a very large number
+#         next_distance = 10000 # a very large number
+#         for c in pre_span['chars'][::-1]:
+#             if c['c'].isalpha():
+#                 pre_distance = cur_span['bbox'][0] - c['bbox'][2]
+#                 break
+#         for c in next_span['chars']:
+#             if c['c'].isalpha():
+#                 next_distance = c['bbox'][0] - cur_span['bbox'][2]
+#                 break
+#
+#         if pre_distance<next_distance:
+#             belong_to_span = pre_span
+#         else:
+#             belong_to_span = next_span
+#
+#         txt = "".join([c['c'] for c in belong_to_span['chars']])
+#         pre_word = txt.split(' ')[-1]
+#         result = __NLP_MODEL.detect_entity_catgr_using_nlp(txt)
+#         if result in ["PERSON", "GPE", "ORG"]:
+#             return True
+#
+#         if re.findall(pattern, txt):
+#             return True
+#
+#         return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower()


 def check_3(spans, cur_span_i):
@@ -143,7 +143,10 @@ def remove_citation_marker(with_char_text_blcoks):
             3. If the superscript contains digits plus commas, digits plus asterisks, or square brackets, it is almost certainly a citation marker.
             4. Whether the marker belongs to the preceding or the following text is decided by distance: if it is very close to the preceding text it is the preceding text's marker; otherwise it belongs to the following text.
             """
-            if check_1(line['spans'], i) or check_2(line['spans'], i) or check_3(line['spans'], i):
+            if (check_1(line['spans'], i) or
+                    # check_2(line['spans'], i) or
+                    check_3(line['spans'], i)
+            ):
                 """Delete this citation marker: remove the span and also update the line's text."""
                 span_to_del.append(span)
     if len(span_to_del)>0:
......
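For reference, the disabled check_2 combined a spaCy NER lookup with two purely lexical tests: the name-abbreviation regex and a "long, all-lowercase last word" rule. Below is a minimal standalone sketch of just the lexical part, with the regex and thresholds taken from the removed code; the helper name and the sample strings are illustrative and not part of Magic-PDF.

```python
import re

# Abbreviated-name pattern from the removed check_2: an initial followed by a
# capitalized surname, e.g. "A. Bcde" or "L. Smith".
NAME_ABBREV = re.compile(r'\b[A-Z]\.\s[A-Z][a-z]*\b')

def is_probable_citation_marker(neighbor_text: str) -> bool:
    # Hypothetical standalone helper: returns True if the neighboring span's
    # text suggests the current superscript span is a citation marker.
    if NAME_ABBREV.search(neighbor_text):
        return True  # the superscript sits next to an abbreviated author name
    last_word = neighbor_text.split(' ')[-1]
    # Fallback rule: last word longer than 5 chars, alphabetic, all lowercase.
    return len(last_word) > 5 and last_word.isalpha() and last_word.islower()

print(is_probable_citation_marker("as reported by L. Smith"))     # True (name pattern)
print(is_probable_citation_marker("observed in the experiments"))  # True (long lowercase word)
print(is_probable_citation_marker("Table 2"))                      # False
```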
@@ -12,6 +12,8 @@
 The remaining parts, such as constructing the s3cli and obtaining the ak/sk, are implemented in code-clean. Do not introduce reverse dependencies!!!
 """
+import re
+
 from loguru import logger

 from magic_pdf.rw import AbsReaderWriter
@@ -78,9 +80,27 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
         return None

     pdf_info_dict = parse_pdf(parse_pdf_by_txt)
-    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
-        logger.warning(f"parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
+    text_all = ""
+    for page_dict in pdf_info_dict['pdf_info']:
+        for para_block in page_dict['para_blocks']:
+            if para_block['type'] in ['title', 'text']:
+                for line in para_block['lines']:
+                    for span in line['spans']:
+                        text_all += span['content']
+
+    def calculate_garbled_rate(text):
+        garbage_regex = re.compile(r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]')
+        # count the garbled characters
+        garbage_count = len(garbage_regex.findall(text))
+        total = len(text)
+        if total == 0:
+            return 0  # avoid division by zero
+        return garbage_count / total
+
+    garbled_rate = calculate_garbled_rate(text_all)
+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False) or garbled_rate > 0.8:
+        logger.warning(f"parse_pdf_by_txt drop or error or garbled_rate too large, switch to parse_pdf_by_ocr")
         pdf_info_dict = parse_pdf(parse_pdf_by_ocr)

     if pdf_info_dict is None:
         raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
......
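The fallback added above counts every character that is not a CJK ideograph, ASCII digit or letter, CJK punctuation (U+3000-U+303F), or fullwidth form (U+FF00-U+FFEF); if more than 80% of the text extracted in txt mode falls outside those ranges, parse_union_pdf switches to the OCR parser. A minimal sketch of the same calculation in isolation follows (the sample strings are illustrative only). Note that ASCII spaces and punctuation also count toward the garbled tally, which keeps ordinary English text above zero but well below the 0.8 threshold.

```python
import re

# Characters OUTSIDE these ranges are treated as garbled: CJK ideographs,
# ASCII digits/letters, CJK punctuation (U+3000-U+303F), fullwidth forms (U+FF00-U+FFEF).
GARBAGE_RE = re.compile(
    r'[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a\u3000-\u303f\uff00-\uffef]'
)

def calculate_garbled_rate(text: str) -> float:
    # Same arithmetic as the committed helper: garbled characters / total characters.
    if not text:
        return 0.0  # avoid division by zero
    return len(GARBAGE_RE.findall(text)) / len(text)

print(calculate_garbled_rate("这是一段正常的中文文本。"))   # 0.0 - the fullwidth period is allowed
print(round(calculate_garbled_rate("Hello, world!"), 2))    # 0.23 - space and punctuation count as garbled
print(calculate_garbled_rate("\x00\x01\ufffd\ufffd"))       # 1.0 - would trigger the OCR fallback
```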
@@ -9,11 +9,8 @@ numpy>=1.21.6
 pandas>=1.3.5
 pycld2>=0.41
 regex>=2023.12.25
-spacy>=3.7.4
 termcolor>=2.4.0
 wordninja>=2.0.0
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
-zh_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl
 scikit-learn>=1.0.2
 nltk==3.8.1
 s3pathlib>=2.1.1
......