"src/vscode:/vscode.git/clone" did not exist on "9683df262b00bb796719f03a8d6fcc66542dd6e4"
Unverified commit fe21eebd, authored by Xiaomeng Zhao and committed by GitHub
Browse files

Merge pull request #753 from myhloli/dev

refactor(ocr): Increase the dilation factor in OCR to address the issue of word concatenation.
parents 2a409845 011a1b97
import re import re
import wordninja
from loguru import logger from loguru import logger
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
...@@ -25,37 +24,6 @@ def __is_hyphen_at_line_end(line): ...@@ -25,37 +24,6 @@ def __is_hyphen_at_line_end(line):
return bool(re.search(r'[A-Za-z]+-\s*$', line)) return bool(re.search(r'[A-Za-z]+-\s*$', line))
def split_long_words(text: str, max_len: int = 10) -> str:
    r"""Break over-long word tokens in *text* into space-separated pieces.

    Every maximal run of word characters (regex ``\w``) that is longer than
    *max_len* is passed to ``wordninja.split`` and replaced by the returned
    pieces joined with single spaces.  Shorter runs, whitespace and
    punctuation are copied through unchanged, so the spacing of *text* is
    preserved.

    Args:
        text: The string to process.
        max_len: Longest run that is left untouched.  The default of 10
            reproduces the previously hard-coded threshold.

    Returns:
        *text* with every over-long run replaced by its split form.
    """

    def _split_if_long(match):
        word = match.group()
        if len(word) <= max_len:
            return word
        return ' '.join(wordninja.split(word))

    # A single pass over the maximal \w+ runs is equivalent to splitting on
    # spaces, tokenising each segment and re-joining: a space is never a
    # word character, so a run can never span two segments.
    return re.sub(r'\w+', _split_if_long, text)
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    """Render all pages in 'mm' mode and join the fragments into one string.

    Args:
        pdf_info_list: Per-page dicts; the 'para_blocks' entry of each page
            is handed to ``ocr_mk_markdown_with_para_core_v2``.
        img_buket_path: Forwarded unchanged to the page renderer.

    Returns:
        The fragments of every page, in page order, separated by one blank
        line.  An empty string when no page has paragraph blocks.
    """
    markdown = []
    for page_info in pdf_info_list:
        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            # A missing/None/empty 'para_blocks' has nothing to render;
            # skipping avoids handing None to the page renderer.
            continue
        markdown.extend(
            ocr_mk_markdown_with_para_core_v2(paras_of_layout, 'mm',
                                              img_buket_path))
    return '\n\n'.join(markdown)
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    """Render all pages in 'nlp' mode and join the fragments into one string.

    Args:
        pdf_info_dict: Per-page dicts (a list, despite the name); the
            'para_blocks' entry of each page is handed to
            ``ocr_mk_markdown_with_para_core_v2``.

    Returns:
        The fragments of every page, in page order, separated by one blank
        line.  An empty string when no page has paragraph blocks.
    """
    markdown = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            # A missing/None/empty 'para_blocks' has nothing to render;
            # skipping avoids handing None to the page renderer.
            continue
        markdown.extend(
            ocr_mk_markdown_with_para_core_v2(paras_of_layout, 'nlp'))
    return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
img_buket_path): img_buket_path):
markdown_with_para_and_pagination = [] markdown_with_para_and_pagination = []
...@@ -76,45 +44,6 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, ...@@ -76,45 +44,6 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
return markdown_with_para_and_pagination return markdown_with_para_and_pagination
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
    """Build the markdown fragments of one page, one fragment per paragraph.

    Args:
        paras_of_layout: Nested iterable (layout -> paragraph -> line); each
            line is a dict whose 'spans' entry lists span dicts.
        mode: 'mm' emits an image link for image/table spans; any other
            value (e.g. 'nlp') emits nothing for them.
        img_buket_path: Prefix joined with a span's 'image_path' in 'mm' mode.

    Returns:
        A list with one string per non-blank paragraph: the stripped
        paragraph text followed by a single trailing space.
    """
    fragments = []
    for layout_paras in paras_of_layout:
        for paragraph in layout_paras:
            pieces = []
            for text_line in paragraph:
                for span in text_line['spans']:
                    kind = span.get('type')
                    rendered = ''
                    lang = ''
                    if kind == ContentType.Text:
                        raw = span['content']
                        lang = detect_lang(raw)
                        # Only English text gets long-word splitting;
                        # splitting Chinese text would lose characters.
                        if lang == 'en':
                            raw = split_long_words(raw)
                        rendered = ocr_escape_special_markdown_char(raw)
                    elif kind == ContentType.InlineEquation:
                        rendered = f"${span['content']}$"
                    elif kind == ContentType.InterlineEquation:
                        rendered = f"\n$$\n{span['content']}\n$$\n"
                    elif kind in [ContentType.Image, ContentType.Table]:
                        if mode == 'mm':
                            rendered = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
                    if rendered == '':
                        continue
                    pieces.append(rendered)
                    # English spans are separated by a space; other
                    # languages (e.g. Chinese) are concatenated directly.
                    if lang == 'en':
                        pieces.append(' ')
            stripped = ''.join(pieces).strip()
            if stripped != '':
                fragments.append(stripped + ' ')
    return fragments
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
mode, mode,
img_buket_path='', img_buket_path='',
...@@ -207,21 +136,11 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None): ...@@ -207,21 +136,11 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
if line_text != '': if line_text != '':
line_lang = detect_lang(line_text) line_lang = detect_lang(line_text)
for span in line['spans']: for span in line['spans']:
span_type = span['type'] span_type = span['type']
content = '' content = ''
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = span['content'] content = ocr_escape_special_markdown_char(span['content'])
# language = detect_lang(content)
language = detect_language(content)
# 判断是否小语种
if lang is not None and lang != 'en':
content = ocr_escape_special_markdown_char(content)
else: # 非小语种逻辑
if language == 'en' and parse_type == 'ocr': # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(
split_long_words(content))
else:
content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation: elif span_type == ContentType.InlineEquation:
content = f" ${span['content']}$ " content = f" ${span['content']}$ "
elif span_type == ContentType.InterlineEquation: elif span_type == ContentType.InterlineEquation:
...@@ -242,41 +161,6 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None): ...@@ -242,41 +161,6 @@ def merge_para_with_text(para_block, parse_type="auto", lang=None):
return para_text return para_text
def para_to_standard_format(para, img_buket_path):
    """Convert one paragraph (a list of line dicts) to a content dict.

    Args:
        para: List of line dicts, each with a 'spans' list.
        img_buket_path: Forwarded to ``line_to_standard_format`` when the
            paragraph consists of exactly one line.

    Returns:
        ``{}`` for an empty paragraph; the result of
        ``line_to_standard_format`` for a single-line paragraph; otherwise a
        dict with keys 'type' ('text'), 'text' and 'inline_equation_num'.
    """
    line_count = len(para)
    if line_count == 1:
        return line_to_standard_format(para[0], img_buket_path)
    if line_count < 1:
        return {}

    pieces = []
    equation_count = 0
    for text_line in para:
        for span in text_line['spans']:
            kind = span.get('type')
            lang = ''
            rendered = ''
            if kind == ContentType.Text:
                raw = span['content']
                lang = detect_lang(raw)
                # Only English text gets long-word splitting; splitting
                # Chinese text would lose characters.
                if lang == 'en':
                    raw = split_long_words(raw)
                rendered = ocr_escape_special_markdown_char(raw)
            elif kind == ContentType.InlineEquation:
                rendered = f"${span['content']}$"
                equation_count += 1
            # English spans get a trailing space separator; other languages
            # (e.g. Chinese) are concatenated directly.
            pieces.append((rendered + ' ') if lang == 'en' else rendered)

    return {
        'type': 'text',
        'text': ''.join(pieces),
        'inline_equation_num': equation_count,
    }
def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None): def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type="auto", lang=None, drop_reason=None):
para_type = para_block['type'] para_type = para_block['type']
para_content = {} para_content = {}
...@@ -330,82 +214,6 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type= ...@@ -330,82 +214,6 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, parse_type=
return para_content return para_content
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    """Collect the standard-format content dict of every paragraph block.

    Args:
        pdf_info_dict: Per-page dicts (a list, despite the name); pages
            without a truthy 'para_blocks' entry are skipped.
        img_buket_path: Forwarded to ``para_to_standard_format_v2``.

    Returns:
        A flat list with one entry per paragraph block, in page order.
    """
    content_list = []
    for position, page_info in enumerate(pdf_info_dict):
        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            continue
        # para_to_standard_format_v2 requires a page index.  Prefer the
        # page's own 'page_idx' when present, otherwise use its position in
        # the list.  NOTE(review): assumes pages may carry a 'page_idx' key;
        # confirm against the producer of pdf_info_dict.
        page_idx = page_info.get('page_idx', position)
        for para_block in paras_of_layout:
            content_list.append(
                para_to_standard_format_v2(para_block, img_buket_path,
                                           page_idx))
    return content_list
def line_to_standard_format(line, img_buket_path):
    """Convert one line dict to a standard-format content dict.

    The first image, table or interline-equation span that is reached ends
    the scan and is returned on its own; otherwise the text and inline
    equation spans of the line are concatenated into one 'text' entry.

    Args:
        line: Dict whose 'spans' entry lists span dicts.
        img_buket_path: Prefix joined with a span's 'image_path'.

    Returns:
        A dict whose 'type' is 'image', 'table', 'equation' or 'text'.
    """
    text_parts = []
    equation_count = 0
    for span in line['spans']:
        if span.get('content'):
            kind = span['type']
            if kind == ContentType.InterlineEquation:
                interline_equation = span['content']
                return {
                    'type': 'equation',
                    'latex': f'$$\n{interline_equation}\n$$'
                }
            if kind == ContentType.InlineEquation:
                inline_equation = span['content']
                text_parts.append(f'${inline_equation}$')
                equation_count += 1
            elif kind == ContentType.Text:
                # Escape markdown special characters in plain text.
                text_parts.append(
                    ocr_escape_special_markdown_char(span['content']))
            continue

        # No textual content: only spans that carry an image path matter.
        if not span.get('image_path'):
            continue
        kind = span['type']
        if kind == ContentType.Image:
            return {
                'type': 'image',
                'img_path': join_path(img_buket_path, span['image_path']),
            }
        if kind == ContentType.Table:
            return {
                'type': 'table',
                'img_path': join_path(img_buket_path, span['image_path']),
            }

    return {
        'type': 'text',
        'text': ''.join(text_parts),
        'inline_equation_num': equation_count,
    }
def ocr_mk_mm_standard_format(pdf_info_dict: list, img_buket_path: str = ''):
    """Build the standard-format content list from each page's preproc blocks.

    Fields that can appear in the entries of the returned list:
        type      string  image/text/table/equation (interline equations are
                          emitted on their own, inline ones are merged into
                          the text)
        latex     string  LaTeX text field
        text      string  plain-text data
        md        string  markdown-formatted text data
        img_path  string  e.g. s3://full/path/to/img.jpg

    Args:
        pdf_info_dict: Per-page dicts (a list, despite the name); pages
            without a truthy 'preproc_blocks' entry are skipped.
        img_buket_path: Forwarded to ``line_to_standard_format``, which
            requires it.  Defaults to '' so existing one-argument callers
            keep working.

    Returns:
        A flat list with one content dict per line, in page order.
    """
    content_list = []
    for page_info in pdf_info_dict:
        blocks = page_info.get('preproc_blocks')
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                content_list.append(
                    line_to_standard_format(line, img_buket_path))
    return content_list
def union_make(pdf_info_dict: list, def union_make(pdf_info_dict: list,
make_mode: str, make_mode: str,
drop_mode: str, drop_mode: str,
......
...@@ -77,11 +77,11 @@ def layout_model_init(weight, config_file, device): ...@@ -77,11 +77,11 @@ def layout_model_init(weight, config_file, device):
return model return model
def ocr_model_init(show_log: bool = False, det_db_box_thresh=0.3, lang=None, use_dilation=True, det_db_unclip_ratio=2.4):
    """Create the ``ModifiedPaddleOCR`` model used for OCR.

    Args:
        show_log: Forwarded to ``ModifiedPaddleOCR``.
        det_db_box_thresh: Forwarded to ``ModifiedPaddleOCR``.
        lang: Forwarded only when it is not ``None``, so the wrapped class
            keeps its own default language otherwise.
        use_dilation: Forwarded to ``ModifiedPaddleOCR``.
        det_db_unclip_ratio: Forwarded to ``ModifiedPaddleOCR``.

    Returns:
        The constructed ``ModifiedPaddleOCR`` instance.
    """
    ocr_kwargs = {
        'show_log': show_log,
        'det_db_box_thresh': det_db_box_thresh,
        'use_dilation': use_dilation,
        'det_db_unclip_ratio': det_db_unclip_ratio,
    }
    if lang is not None:
        ocr_kwargs['lang'] = lang
    return ModifiedPaddleOCR(**ocr_kwargs)
......
...@@ -5,7 +5,6 @@ PyMuPDF>=1.24.9 ...@@ -5,7 +5,6 @@ PyMuPDF>=1.24.9
loguru>=0.6.0 loguru>=0.6.0
numpy>=1.21.6,<2.0.0 numpy>=1.21.6,<2.0.0
fast-langdetect==0.2.0 fast-langdetect==0.2.0
wordninja>=2.0.0
scikit-learn>=1.0.2 scikit-learn>=1.0.2
pdfminer.six==20231228 pdfminer.six==20231228
unimernet==0.2.1 unimernet==0.2.1
......
...@@ -8,7 +8,6 @@ pdfminer.six==20231228 ...@@ -8,7 +8,6 @@ pdfminer.six==20231228
pydantic>=2.7.2,<2.8.0 pydantic>=2.7.2,<2.8.0
PyMuPDF>=1.24.9 PyMuPDF>=1.24.9
scikit-learn>=1.0.2 scikit-learn>=1.0.2
wordninja>=2.0.0
torch>=2.2.2,<=2.3.1 torch>=2.2.2,<=2.3.1
transformers transformers
# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator. # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment