"git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "946b180af66971b3252bf0de1d117d72c68ee201"
Unverified commit 03469909, authored by icecraft, committed by GitHub
Browse files

Feat/support footnote in figure (#532)



* feat: support figure footnote

* feat: using the relative position to combine footnote, table, image

* feat: add the readme of projects

* fix: code spell in unittest

---------
Co-authored-by: icecraft <xurui1@pjlab.org.cn>
parent 4331b837
...@@ -3,6 +3,7 @@ repos: ...@@ -3,6 +3,7 @@ repos:
rev: 5.0.4 rev: 5.0.4
hooks: hooks:
- id: flake8 - id: flake8
args: ["--max-line-length=120", "--ignore=E131,E125,W503,W504,E203"]
- repo: https://github.com/PyCQA/isort - repo: https://github.com/PyCQA/isort
rev: 5.11.5 rev: 5.11.5
hooks: hooks:
...@@ -11,6 +12,7 @@ repos: ...@@ -11,6 +12,7 @@ repos:
rev: v0.32.0 rev: v0.32.0
hooks: hooks:
- id: yapf - id: yapf
args: ["--style={based_on_style: google, column_limit: 120, indent_width: 4}"]
- repo: https://github.com/codespell-project/codespell - repo: https://github.com/codespell-project/codespell
rev: v2.2.1 rev: v2.2.1
hooks: hooks:
...@@ -41,4 +43,4 @@ repos: ...@@ -41,4 +43,4 @@ repos:
rev: v1.3.1 rev: v1.3.1
hooks: hooks:
- id: docformatter - id: docformatter
args: ["--in-place", "--wrap-descriptions", "79"] args: ["--in-place", "--wrap-descriptions", "119"]
import re
import wordninja
from loguru import logger from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType, BlockType from magic_pdf.libs.ocr_content_type import BlockType, ContentType
import wordninja
import re
def __is_hyphen_at_line_end(line): def __is_hyphen_at_line_end(line):
...@@ -37,8 +38,9 @@ def split_long_words(text): ...@@ -37,8 +38,9 @@ def split_long_words(text):
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
    """Build one multimodal ('mm') markdown string covering every page.

    Args:
        pdf_info_list: Per-page dicts; each page's 'para_blocks' entry is
            handed to ocr_mk_markdown_with_para_core_v2.
        img_buket_path: Image location forwarded unchanged to the core
            renderer.

    Returns:
        The markdown fragments of all pages, joined by blank lines.
    """
    fragments = []
    for page in pdf_info_list:
        fragments += ocr_mk_markdown_with_para_core_v2(
            page.get('para_blocks'), 'mm', img_buket_path)
    return '\n\n'.join(fragments)
...@@ -46,29 +48,34 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path): ...@@ -46,29 +48,34 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
    """Build one text-only ('nlp') markdown string covering every page.

    Args:
        pdf_info_dict: Per-page dicts (a list, despite the name); each
            page's 'para_blocks' entry is handed to
            ocr_mk_markdown_with_para_core_v2 in 'nlp' mode.

    Returns:
        The markdown fragments of all pages, joined by blank lines.
    """
    return '\n\n'.join(
        fragment
        for page in pdf_info_dict
        for fragment in ocr_mk_markdown_with_para_core_v2(
            page.get('para_blocks'), 'nlp'))
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                img_buket_path):
    """Render multimodal ('mm') markdown page by page.

    Args:
        pdf_info_dict: Per-page dicts (a list, despite the name) that may
            carry a 'para_blocks' entry.
        img_buket_path: Image location forwarded unchanged to the core
            renderer.

    Returns:
        A list of {'page_no': int, 'md_content': str} dicts, one per page
        that has a non-empty 'para_blocks' entry.
    """
    paginated = []
    for page in pdf_info_dict:
        para_blocks = page.get('para_blocks')
        if para_blocks:
            fragments = ocr_mk_markdown_with_para_core_v2(
                para_blocks, 'mm', img_buket_path)
            # Pages without paragraph blocks are skipped and do not consume
            # a number, so the next page_no always equals the number of
            # entries collected so far.
            paginated.append({
                'page_no': len(paginated),
                'md_content': '\n\n'.join(fragments),
            })
    return paginated
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""): def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
page_markdown = [] page_markdown = []
for paras in paras_of_layout: for paras in paras_of_layout:
for para in paras: for para in paras:
...@@ -81,8 +88,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""): ...@@ -81,8 +88,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = span['content'] content = span['content']
language = detect_lang(content) language = detect_lang(content)
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(split_long_words(content)) content = ocr_escape_special_markdown_char(
split_long_words(content))
else: else:
content = ocr_escape_special_markdown_char(content) content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation: elif span_type == ContentType.InlineEquation:
...@@ -106,7 +114,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""): ...@@ -106,7 +114,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
return page_markdown return page_markdown
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
mode,
img_buket_path=''):
page_markdown = [] page_markdown = []
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_text = '' para_text = ''
...@@ -114,7 +124,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): ...@@ -114,7 +124,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
if para_type == BlockType.Text: if para_type == BlockType.Text:
para_text = merge_para_with_text(para_block) para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Title: elif para_type == BlockType.Title:
para_text = f"# {merge_para_with_text(para_block)}" para_text = f'# {merge_para_with_text(para_block)}'
elif para_type == BlockType.InterlineEquation: elif para_type == BlockType.InterlineEquation:
para_text = merge_para_with_text(para_block) para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Image: elif para_type == BlockType.Image:
...@@ -130,11 +140,13 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): ...@@ -130,11 +140,13 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
for block in para_block['blocks']: # 2nd.拼image_caption for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption: if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block) para_text += merge_para_with_text(block)
for block in para_block['blocks']:  # 3rd. append image_footnote
if block['type'] == BlockType.ImageFootnote:
para_text += merge_para_with_text(block)
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
if mode == 'nlp': if mode == 'nlp':
continue continue
elif mode == 'mm': elif mode == 'mm':
table_caption = ''
for block in para_block['blocks']: # 1st.拼table_caption for block in para_block['blocks']: # 1st.拼table_caption
if block['type'] == BlockType.TableCaption: if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block) para_text += merge_para_with_text(block)
...@@ -163,6 +175,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): ...@@ -163,6 +175,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
def merge_para_with_text(para_block): def merge_para_with_text(para_block):
def detect_language(text): def detect_language(text):
en_pattern = r'[a-zA-Z]+' en_pattern = r'[a-zA-Z]+'
en_matches = re.findall(en_pattern, text) en_matches = re.findall(en_pattern, text)
...@@ -171,19 +184,19 @@ def merge_para_with_text(para_block): ...@@ -171,19 +184,19 @@ def merge_para_with_text(para_block):
if en_length / len(text) >= 0.5: if en_length / len(text) >= 0.5:
return 'en' return 'en'
else: else:
return "unknown" return 'unknown'
else: else:
return "empty" return 'empty'
para_text = '' para_text = ''
for line in para_block['lines']: for line in para_block['lines']:
line_text = "" line_text = ''
line_lang = "" line_lang = ''
for span in line['spans']: for span in line['spans']:
span_type = span['type'] span_type = span['type']
if span_type == ContentType.Text: if span_type == ContentType.Text:
line_text += span['content'].strip() line_text += span['content'].strip()
if line_text != "": if line_text != '':
line_lang = detect_lang(line_text) line_lang = detect_lang(line_text)
for span in line['spans']: for span in line['spans']:
span_type = span['type'] span_type = span['type']
...@@ -193,7 +206,8 @@ def merge_para_with_text(para_block): ...@@ -193,7 +206,8 @@ def merge_para_with_text(para_block):
# language = detect_lang(content) # language = detect_lang(content)
language = detect_language(content) language = detect_language(content)
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(split_long_words(content)) content = ocr_escape_special_markdown_char(
split_long_words(content))
else: else:
content = ocr_escape_special_markdown_char(content) content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation: elif span_type == ContentType.InlineEquation:
...@@ -227,12 +241,13 @@ def para_to_standard_format(para, img_buket_path): ...@@ -227,12 +241,13 @@ def para_to_standard_format(para, img_buket_path):
for span in line['spans']: for span in line['spans']:
language = '' language = ''
span_type = span.get('type') span_type = span.get('type')
content = "" content = ''
if span_type == ContentType.Text: if span_type == ContentType.Text:
content = span['content'] content = span['content']
language = detect_lang(content) language = detect_lang(content)
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(split_long_words(content)) content = ocr_escape_special_markdown_char(
split_long_words(content))
else: else:
content = ocr_escape_special_markdown_char(content) content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation: elif span_type == ContentType.InlineEquation:
...@@ -245,7 +260,7 @@ def para_to_standard_format(para, img_buket_path): ...@@ -245,7 +260,7 @@ def para_to_standard_format(para, img_buket_path):
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': para_text, 'text': para_text,
'inline_equation_num': inline_equation_num 'inline_equation_num': inline_equation_num,
} }
return para_content return para_content
...@@ -256,37 +271,35 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx): ...@@ -256,37 +271,35 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': merge_para_with_text(para_block), 'text': merge_para_with_text(para_block),
'page_idx': page_idx 'page_idx': page_idx,
} }
elif para_type == BlockType.Title: elif para_type == BlockType.Title:
para_content = { para_content = {
'type': 'text', 'type': 'text',
'text': merge_para_with_text(para_block), 'text': merge_para_with_text(para_block),
'text_level': 1, 'text_level': 1,
'page_idx': page_idx 'page_idx': page_idx,
} }
elif para_type == BlockType.InterlineEquation: elif para_type == BlockType.InterlineEquation:
para_content = { para_content = {
'type': 'equation', 'type': 'equation',
'text': merge_para_with_text(para_block), 'text': merge_para_with_text(para_block),
'text_format': "latex", 'text_format': 'latex',
'page_idx': page_idx 'page_idx': page_idx,
} }
elif para_type == BlockType.Image: elif para_type == BlockType.Image:
para_content = { para_content = {'type': 'image', 'page_idx': page_idx}
'type': 'image',
'page_idx': page_idx
}
for block in para_block['blocks']: for block in para_block['blocks']:
if block['type'] == BlockType.ImageBody: if block['type'] == BlockType.ImageBody:
para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path']) para_content['img_path'] = join_path(
img_buket_path,
block['lines'][0]['spans'][0]['image_path'])
if block['type'] == BlockType.ImageCaption: if block['type'] == BlockType.ImageCaption:
para_content['img_caption'] = merge_para_with_text(block) para_content['img_caption'] = merge_para_with_text(block)
if block['type'] == BlockType.ImageFootnote:
para_content['img_footnote'] = merge_para_with_text(block)
elif para_type == BlockType.Table: elif para_type == BlockType.Table:
para_content = { para_content = {'type': 'table', 'page_idx': page_idx}
'type': 'table',
'page_idx': page_idx
}
for block in para_block['blocks']: for block in para_block['blocks']:
if block['type'] == BlockType.TableBody: if block['type'] == BlockType.TableBody:
if block["lines"][0]["spans"][0].get('latex', ''): if block["lines"][0]["spans"][0].get('latex', ''):
...@@ -305,17 +318,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx): ...@@ -305,17 +318,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
    """Convert every paragraph block of every page to the standard format.

    Args:
        pdf_info_dict: Per-page dicts (a list, despite the name) that may
            carry 'para_blocks' and 'page_idx' entries.
        img_buket_path: Image location forwarded unchanged to
            para_to_standard_format_v2.

    Returns:
        A flat list with one standard-format entry per paragraph block, in
        page order. Pages with no paragraph blocks contribute nothing.
    """
    content_list = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get('para_blocks')
        if not paras_of_layout:
            continue
        # para_to_standard_format_v2 takes (para_block, img_buket_path,
        # page_idx); the page index is read the same way union_make does.
        page_idx = page_info.get('page_idx')
        content_list.extend(
            para_to_standard_format_v2(para_block, img_buket_path, page_idx)
            for para_block in paras_of_layout)
    return content_list
def line_to_standard_format(line, img_buket_path): def line_to_standard_format(line, img_buket_path):
line_text = "" line_text = ''
inline_equation_num = 0 inline_equation_num = 0
for span in line['spans']: for span in line['spans']:
if not span.get('content'): if not span.get('content'):
...@@ -325,13 +339,15 @@ def line_to_standard_format(line, img_buket_path): ...@@ -325,13 +339,15 @@ def line_to_standard_format(line, img_buket_path):
if span['type'] == ContentType.Image: if span['type'] == ContentType.Image:
content = { content = {
'type': 'image', 'type': 'image',
'img_path': join_path(img_buket_path, span['image_path']) 'img_path': join_path(img_buket_path,
span['image_path']),
} }
return content return content
elif span['type'] == ContentType.Table: elif span['type'] == ContentType.Table:
content = { content = {
'type': 'table', 'type': 'table',
'img_path': join_path(img_buket_path, span['image_path']) 'img_path': join_path(img_buket_path,
span['image_path']),
} }
return content return content
else: else:
...@@ -339,36 +355,33 @@ def line_to_standard_format(line, img_buket_path): ...@@ -339,36 +355,33 @@ def line_to_standard_format(line, img_buket_path):
interline_equation = span['content'] interline_equation = span['content']
content = { content = {
'type': 'equation', 'type': 'equation',
'latex': f"$$\n{interline_equation}\n$$" 'latex': f'$$\n{interline_equation}\n$$'
} }
return content return content
elif span['type'] == ContentType.InlineEquation: elif span['type'] == ContentType.InlineEquation:
inline_equation = span['content'] inline_equation = span['content']
line_text += f"${inline_equation}$" line_text += f'${inline_equation}$'
inline_equation_num += 1 inline_equation_num += 1
elif span['type'] == ContentType.Text: elif span['type'] == ContentType.Text:
text_content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号 text_content = ocr_escape_special_markdown_char(
span['content']) # 转义特殊符号
line_text += text_content line_text += text_content
content = { content = {
'type': 'text', 'type': 'text',
'text': line_text, 'text': line_text,
'inline_equation_num': inline_equation_num 'inline_equation_num': inline_equation_num,
} }
return content return content
def ocr_mk_mm_standard_format(pdf_info_dict: list): def ocr_mk_mm_standard_format(pdf_info_dict: list):
""" """content_list type string
content_list image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
type string image/text/table/equation(行间的单独拿出来,行内的和text合并) latex文本字段。 text string 纯文本格式的文本数据。 md string
latex string latex文本字段。 markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
text string 纯文本格式的文本数据。
md string markdown格式的文本数据。
img_path string s3://full/path/to/img.jpg
"""
content_list = [] content_list = []
for page_info in pdf_info_dict: for page_info in pdf_info_dict:
blocks = page_info.get("preproc_blocks") blocks = page_info.get('preproc_blocks')
if not blocks: if not blocks:
continue continue
for block in blocks: for block in blocks:
...@@ -378,34 +391,42 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list): ...@@ -378,34 +391,42 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
return content_list return content_list
def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""): def union_make(pdf_info_dict: list,
make_mode: str,
drop_mode: str,
img_buket_path: str = ''):
output_content = [] output_content = []
for page_info in pdf_info_dict: for page_info in pdf_info_dict:
if page_info.get("need_drop", False): if page_info.get('need_drop', False):
drop_reason = page_info.get("drop_reason") drop_reason = page_info.get('drop_reason')
if drop_mode == DropMode.NONE: if drop_mode == DropMode.NONE:
pass pass
elif drop_mode == DropMode.WHOLE_PDF: elif drop_mode == DropMode.WHOLE_PDF:
raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}") raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
f'drop_reason is {drop_reason}'))
elif drop_mode == DropMode.SINGLE_PAGE: elif drop_mode == DropMode.SINGLE_PAGE:
logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}") logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
f'drop_reason is {drop_reason}'))
continue continue
else: else:
raise Exception(f"drop_mode can not be null") raise Exception('drop_mode can not be null')
paras_of_layout = page_info.get("para_blocks") paras_of_layout = page_info.get('para_blocks')
page_idx = page_info.get("page_idx") page_idx = page_info.get('page_idx')
if not paras_of_layout: if not paras_of_layout:
continue continue
if make_mode == MakeMode.MM_MD: if make_mode == MakeMode.MM_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path) page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.NLP_MD: elif make_mode == MakeMode.NLP_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp") page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'nlp')
output_content.extend(page_markdown) output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT: elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout: for para_block in paras_of_layout:
para_content = para_to_standard_format_v2(para_block, img_buket_path, page_idx) para_content = para_to_standard_format_v2(
para_block, img_buket_path, page_idx)
output_content.append(para_content) output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content) return '\n\n'.join(output_content)
......
This diff is collapsed.
This diff is collapsed.
...@@ -71,6 +71,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -71,6 +71,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
tables_list, tables_body_list = [], [] tables_list, tables_body_list = [], []
tables_caption_list, tables_footnote_list = [], [] tables_caption_list, tables_footnote_list = [], []
imgs_list, imgs_body_list, imgs_caption_list = [], [], [] imgs_list, imgs_body_list, imgs_caption_list = [], [], []
imgs_footnote_list = []
titles_list = [] titles_list = []
texts_list = [] texts_list = []
interequations_list = [] interequations_list = []
...@@ -78,7 +79,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -78,7 +79,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
page_layout_list = [] page_layout_list = []
page_dropped_list = [] page_dropped_list = []
tables, tables_body, tables_caption, tables_footnote = [], [], [], [] tables, tables_body, tables_caption, tables_footnote = [], [], [], []
imgs, imgs_body, imgs_caption = [], [], [] imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
titles = [] titles = []
texts = [] texts = []
interequations = [] interequations = []
...@@ -108,6 +109,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -108,6 +109,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
imgs_body.append(bbox) imgs_body.append(bbox)
elif nested_block['type'] == BlockType.ImageCaption: elif nested_block['type'] == BlockType.ImageCaption:
imgs_caption.append(bbox) imgs_caption.append(bbox)
elif nested_block['type'] == BlockType.ImageFootnote:
imgs_footnote.append(bbox)
elif block['type'] == BlockType.Title: elif block['type'] == BlockType.Title:
titles.append(bbox) titles.append(bbox)
elif block['type'] == BlockType.Text: elif block['type'] == BlockType.Text:
...@@ -121,6 +124,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -121,6 +124,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
imgs_list.append(imgs) imgs_list.append(imgs)
imgs_body_list.append(imgs_body) imgs_body_list.append(imgs_body)
imgs_caption_list.append(imgs_caption) imgs_caption_list.append(imgs_caption)
imgs_footnote_list.append(imgs_footnote)
titles_list.append(titles) titles_list.append(titles)
texts_list.append(texts) texts_list.append(texts)
interequations_list.append(interequations) interequations_list.append(interequations)
...@@ -142,6 +146,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -142,6 +146,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True) draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
True) True)
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
True),
draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True) draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True) draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
...@@ -241,7 +247,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename): ...@@ -241,7 +247,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
dropped_bbox_list = [] dropped_bbox_list = []
tables_body_list, tables_caption_list, tables_footnote_list = [], [], [] tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
imgs_body_list, imgs_caption_list = [], [] imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
titles_list = [] titles_list = []
texts_list = [] texts_list = []
interequations_list = [] interequations_list = []
...@@ -250,7 +256,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): ...@@ -250,7 +256,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
for i in range(len(model_list)): for i in range(len(model_list)):
page_dropped_list = [] page_dropped_list = []
tables_body, tables_caption, tables_footnote = [], [], [] tables_body, tables_caption, tables_footnote = [], [], []
imgs_body, imgs_caption = [], [] imgs_body, imgs_caption, imgs_footnote = [], [], []
titles = [] titles = []
texts = [] texts = []
interequations = [] interequations = []
...@@ -277,6 +283,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): ...@@ -277,6 +283,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
interequations.append(bbox) interequations.append(bbox)
elif layout_det['category_id'] == CategoryId.Abandon: elif layout_det['category_id'] == CategoryId.Abandon:
page_dropped_list.append(bbox) page_dropped_list.append(bbox)
elif layout_det['category_id'] == CategoryId.ImageFootnote:
imgs_footnote.append(bbox)
tables_body_list.append(tables_body) tables_body_list.append(tables_body)
tables_caption_list.append(tables_caption) tables_caption_list.append(tables_caption)
...@@ -287,6 +295,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): ...@@ -287,6 +295,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
texts_list.append(texts) texts_list.append(texts)
interequations_list.append(interequations) interequations_list.append(interequations)
dropped_bbox_list.append(page_dropped_list) dropped_bbox_list.append(page_dropped_list)
imgs_footnote_list.append(imgs_footnote)
for i, page in enumerate(pdf_docs): for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158], draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
...@@ -299,6 +308,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename): ...@@ -299,6 +308,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True) draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
True) True)
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
True)
draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True) draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True) draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True) draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
......
class ContentType:
    """String tags for the kind of content a span carries.

    Code elsewhere compares a span's 'type' field against these values.
    """

    Image = 'image'
    Table = 'table'
    Text = 'text'
    InlineEquation = 'inline_equation'
    InterlineEquation = 'interline_equation'
class BlockType:
    """String tags for the kind of a layout block.

    Code elsewhere compares a block's 'type' field against these values.
    """

    # Image block and the nested parts found in its 'blocks' list.
    Image = 'image'
    ImageBody = 'image_body'
    ImageCaption = 'image_caption'
    ImageFootnote = 'image_footnote'
    # Table block and the nested parts found in its 'blocks' list.
    Table = 'table'
    TableBody = 'table_body'
    TableCaption = 'table_caption'
    TableFootnote = 'table_footnote'
    # Stand-alone block kinds.
    Text = 'text'
    Title = 'title'
    InterlineEquation = 'interline_equation'
    Footnote = 'footnote'
    Discarded = 'discarded'
class CategoryId: class CategoryId:
...@@ -33,3 +35,4 @@ class CategoryId: ...@@ -33,3 +35,4 @@ class CategoryId:
InlineEquation = 13 InlineEquation = 13
InterlineEquation_YOLO = 14 InterlineEquation_YOLO = 14
OcrText = 15 OcrText = 15
ImageFootnote = 101
This diff is collapsed.
from loguru import logger from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
_is_in_or_part_overlap_with_area_ratio,
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \ calculate_overlap_area_in_bbox1_area_ratio)
calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio
from magic_pdf.libs.drop_tag import DropTag from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType, BlockType from magic_pdf.libs.ocr_content_type import BlockType, ContentType
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
# 将每一个line中的span从左到右排序 # 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines): def line_sort_spans_by_left_to_right(lines):
line_objects = [] line_objects = []
for line in lines: for line in lines:
# 按照x0坐标排序 # 按照x0坐标排序
line.sort(key=lambda span: span['bbox'][0]) line.sort(key=lambda span: span['bbox'][0])
line_bbox = [ line_bbox = [
min(span['bbox'][0] for span in line), # x0 min(span['bbox'][0] for span in line), # x0
...@@ -21,8 +18,8 @@ def line_sort_spans_by_left_to_right(lines): ...@@ -21,8 +18,8 @@ def line_sort_spans_by_left_to_right(lines):
max(span['bbox'][3] for span in line), # y1 max(span['bbox'][3] for span in line), # y1
] ]
line_objects.append({ line_objects.append({
"bbox": line_bbox, 'bbox': line_bbox,
"spans": line, 'spans': line,
}) })
return line_objects return line_objects
...@@ -39,16 +36,21 @@ def merge_spans_to_line(spans): ...@@ -39,16 +36,21 @@ def merge_spans_to_line(spans):
for span in spans[1:]: for span in spans[1:]:
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation" # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上 # image和table类型,同上
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any( if span['type'] in [
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in ContentType.InterlineEquation, ContentType.Image,
current_line): ContentType.Table
] or any(s['type'] in [
ContentType.InterlineEquation, ContentType.Image,
ContentType.Table
] for s in current_line):
# 则开始新行 # 则开始新行
lines.append(current_line) lines.append(current_line)
current_line = [span] current_line = [span]
continue continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']): if __is_overlaps_y_exceeds_threshold(span['bbox'],
current_line[-1]['bbox']):
current_line.append(span) current_line.append(span)
else: else:
# 否则,开始新行 # 否则,开始新行
...@@ -71,7 +73,8 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes): ...@@ -71,7 +73,8 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
# 遍历spans,将每个span放入对应的layout中 # 遍历spans,将每个span放入对应的layout中
layout_sapns = [] layout_sapns = []
for span in spans: for span in spans:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6: if calculate_overlap_area_in_bbox1_area_ratio(
span['bbox'], layout_bbox) > 0.6:
layout_sapns.append(span) layout_sapns.append(span)
# 如果layout_sapns不为空,则放入new_spans中 # 如果layout_sapns不为空,则放入new_spans中
if len(layout_sapns) > 0: if len(layout_sapns) > 0:
...@@ -99,12 +102,10 @@ def merge_lines_to_block(lines): ...@@ -99,12 +102,10 @@ def merge_lines_to_block(lines):
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks = [] blocks = []
for line in lines: for line in lines:
blocks.append( blocks.append({
{ 'bbox': line['bbox'],
"bbox": line["bbox"], 'lines': [line],
"lines": [line], })
}
)
return blocks return blocks
...@@ -121,7 +122,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes): ...@@ -121,7 +122,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
if block[7] == BlockType.Footnote: if block[7] == BlockType.Footnote:
continue continue
block_bbox = block[:4] block_bbox = block[:4]
if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8: if calculate_overlap_area_in_bbox1_area_ratio(
block_bbox, layout_bbox) > 0.8:
layout_blocks.append(block) layout_blocks.append(block)
# 如果layout_blocks不为空,则放入new_blocks中 # 如果layout_blocks不为空,则放入new_blocks中
...@@ -134,7 +136,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes): ...@@ -134,7 +136,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
# 如果new_blocks不为空,则对new_blocks中每个block进行排序 # 如果new_blocks不为空,则对new_blocks中每个block进行排序
if len(new_blocks) > 0: if len(new_blocks) > 0:
for bboxes_in_layout_block in new_blocks: for bboxes_in_layout_block in new_blocks:
bboxes_in_layout_block.sort(key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序 bboxes_in_layout_block.sort(
key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
sort_blocks.extend(bboxes_in_layout_block) sort_blocks.extend(bboxes_in_layout_block)
# sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序 # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
...@@ -142,9 +145,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes): ...@@ -142,9 +145,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
def fill_spans_in_blocks(blocks, spans, radio): def fill_spans_in_blocks(blocks, spans, radio):
''' """将allspans中的span按位置关系,放入blocks中."""
将allspans中的span按位置关系,放入blocks中
'''
block_with_spans = [] block_with_spans = []
for block in blocks: for block in blocks:
block_type = block[7] block_type = block[7]
...@@ -156,17 +157,15 @@ def fill_spans_in_blocks(blocks, spans, radio): ...@@ -156,17 +157,15 @@ def fill_spans_in_blocks(blocks, spans, radio):
block_spans = [] block_spans = []
for span in spans: for span in spans:
span_bbox = span['bbox'] span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio: if calculate_overlap_area_in_bbox1_area_ratio(
span_bbox, block_bbox) > radio:
block_spans.append(span) block_spans.append(span)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)''' '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
# displayed_list = [] # displayed_list = []
# text_inline_lines = [] # text_inline_lines = []
# modify_y_axis(block_spans, displayed_list, text_inline_lines) # modify_y_axis(block_spans, displayed_list, text_inline_lines)
'''模型识别错误的行间公式, type类型转换成行内公式''' '''模型识别错误的行间公式, type类型转换成行内公式'''
# block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines) # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错 '''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox_for_span(block_spans) # block_spans = remove_overlap_between_bbox_for_span(block_spans)
...@@ -182,12 +181,9 @@ def fill_spans_in_blocks(blocks, spans, radio): ...@@ -182,12 +181,9 @@ def fill_spans_in_blocks(blocks, spans, radio):
def fix_block_spans(block_with_spans, img_blocks, table_blocks): def fix_block_spans(block_with_spans, img_blocks, table_blocks):
''' """1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系 需要将caption和footnote的text_span放入相应img_block和table_block内的
需要将caption和footnote的text_span放入相应img_block和table_block内的 caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
caption_block和footnote_block中
2、同时需要删除block中的spans字段
'''
fix_blocks = [] fix_blocks = []
for block in block_with_spans: for block in block_with_spans:
block_type = block['type'] block_type = block['type']
...@@ -218,16 +214,13 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str): ...@@ -218,16 +214,13 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
block_spans = [] block_spans = []
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中 # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
for span in spans: for span in spans:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6: if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
block_bbox) > 0.6:
block_spans.append(span) block_spans.append(span)
block_lines = merge_spans_to_line(block_spans) block_lines = merge_spans_to_line(block_spans)
# 对line中的span进行排序 # 对line中的span进行排序
sort_block_lines = line_sort_spans_by_left_to_right(block_lines) sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block = { block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
'bbox': block_bbox,
'type': block_type,
'lines': sort_block_lines
}
return block, block_spans return block, block_spans
...@@ -237,11 +230,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str): ...@@ -237,11 +230,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
'bbox': block_bbox, 'bbox': block_bbox,
'spans': [span], 'spans': [span],
} }
body_block = { body_block = {'bbox': block_bbox, 'type': block_type, 'lines': [body_line]}
'bbox': block_bbox,
'type': block_type,
'lines': [body_line]
}
return body_block return body_block
...@@ -249,13 +238,16 @@ def fix_image_block(block, img_blocks): ...@@ -249,13 +238,16 @@ def fix_image_block(block, img_blocks):
block['blocks'] = [] block['blocks'] = []
# 遍历img_blocks,找到与当前block匹配的img_block # 遍历img_blocks,找到与当前block匹配的img_block
for img_block in img_blocks: for img_block in img_blocks:
if _is_in_or_part_overlap_with_area_ratio(block['bbox'], img_block['bbox'], 0.95): if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
img_block['bbox'], 0.95):
# 创建img_body_block # 创建img_body_block
for span in block['spans']: for span in block['spans']:
if span['type'] == ContentType.Image and img_block['img_body_bbox'] == span['bbox']: if span['type'] == ContentType.Image and img_block[
'img_body_bbox'] == span['bbox']:
# 创建img_body_block # 创建img_body_block
img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody) img_body_block = make_body_block(
span, img_block['img_body_bbox'], BlockType.ImageBody)
block['blocks'].append(img_body_block) block['blocks'].append(img_body_block)
# 从spans中移除img_body_block中已经放入的span # 从spans中移除img_body_block中已经放入的span
...@@ -265,10 +257,15 @@ def fix_image_block(block, img_blocks): ...@@ -265,10 +257,15 @@ def fix_image_block(block, img_blocks):
# 根据list长度,判断img_block中是否有img_caption # 根据list长度,判断img_block中是否有img_caption
if img_block['img_caption_bbox'] is not None: if img_block['img_caption_bbox'] is not None:
img_caption_block, img_caption_spans = merge_spans_to_block( img_caption_block, img_caption_spans = merge_spans_to_block(
block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption block['spans'], img_block['img_caption_bbox'],
) BlockType.ImageCaption)
block['blocks'].append(img_caption_block) block['blocks'].append(img_caption_block)
if img_block['img_footnote_bbox'] is not None:
img_footnote_block, img_footnote_spans = merge_spans_to_block(
block['spans'], img_block['img_footnote_bbox'],
BlockType.ImageFootnote)
block['blocks'].append(img_footnote_block)
break break
del block['spans'] del block['spans']
return block return block
...@@ -278,13 +275,17 @@ def fix_table_block(block, table_blocks): ...@@ -278,13 +275,17 @@ def fix_table_block(block, table_blocks):
block['blocks'] = [] block['blocks'] = []
# 遍历table_blocks,找到与当前block匹配的table_block # 遍历table_blocks,找到与当前block匹配的table_block
for table_block in table_blocks: for table_block in table_blocks:
if _is_in_or_part_overlap_with_area_ratio(block['bbox'], table_block['bbox'], 0.95): if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
table_block['bbox'], 0.95):
# 创建table_body_block # 创建table_body_block
for span in block['spans']: for span in block['spans']:
if span['type'] == ContentType.Table and table_block['table_body_bbox'] == span['bbox']: if span['type'] == ContentType.Table and table_block[
'table_body_bbox'] == span['bbox']:
# 创建table_body_block # 创建table_body_block
table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody) table_body_block = make_body_block(
span, table_block['table_body_bbox'],
BlockType.TableBody)
block['blocks'].append(table_body_block) block['blocks'].append(table_body_block)
# 从spans中移除img_body_block中已经放入的span # 从spans中移除img_body_block中已经放入的span
...@@ -294,8 +295,8 @@ def fix_table_block(block, table_blocks): ...@@ -294,8 +295,8 @@ def fix_table_block(block, table_blocks):
# 根据list长度,判断table_block中是否有caption # 根据list长度,判断table_block中是否有caption
if table_block['table_caption_bbox'] is not None: if table_block['table_caption_bbox'] is not None:
table_caption_block, table_caption_spans = merge_spans_to_block( table_caption_block, table_caption_spans = merge_spans_to_block(
block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption block['spans'], table_block['table_caption_bbox'],
) BlockType.TableCaption)
block['blocks'].append(table_caption_block) block['blocks'].append(table_caption_block)
# 如果table_caption_block_spans不为空 # 如果table_caption_block_spans不为空
...@@ -307,8 +308,8 @@ def fix_table_block(block, table_blocks): ...@@ -307,8 +308,8 @@ def fix_table_block(block, table_blocks):
# 根据list长度,判断table_block中是否有table_note # 根据list长度,判断table_block中是否有table_note
if table_block['table_footnote_bbox'] is not None: if table_block['table_footnote_bbox'] is not None:
table_footnote_block, table_footnote_spans = merge_spans_to_block( table_footnote_block, table_footnote_spans = merge_spans_to_block(
block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote block['spans'], table_block['table_footnote_bbox'],
) BlockType.TableFootnote)
block['blocks'].append(table_footnote_block) block['blocks'].append(table_footnote_block)
break break
......
# 欢迎来到 MinerU 项目列表
## 项目列表
- [llama_index_rag](./llama_index_rag/README.md): 基于 llama_index 构建轻量级 RAG 系统
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment