Unverified Commit 03469909 authored by icecraft's avatar icecraft Committed by GitHub
Browse files

Feat/support footnote in figure (#532)



* feat: support figure footnote

* feat: using the relative position to combine footnote, table, image

* feat: add the readme of projects

* fix: code spell in unittest

---------
Co-authored-by: default avataricecraft <xurui1@pjlab.org.cn>
parent 4331b837
......@@ -3,6 +3,7 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
args: ["--max-line-length=120", "--ignore=E131,E125,W503,W504,E203"]
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
......@@ -11,6 +12,7 @@ repos:
rev: v0.32.0
hooks:
- id: yapf
args: ["--style={based_on_style: google, column_limit: 120, indent_width: 4}"]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.1
hooks:
......@@ -41,4 +43,4 @@ repos:
rev: v1.3.1
hooks:
- id: docformatter
args: ["--in-place", "--wrap-descriptions", "79"]
args: ["--in-place", "--wrap-descriptions", "119"]
import re
import wordninja
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.language import detect_lang
from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
import wordninja
import re
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
def __is_hyphen_at_line_end(line):
......@@ -37,8 +38,9 @@ def split_long_words(text):
def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
markdown = []
for page_info in pdf_info_list:
paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
paras_of_layout = page_info.get('para_blocks')
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
......@@ -46,29 +48,34 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
markdown = []
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks")
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
paras_of_layout = page_info.get('para_blocks')
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'nlp')
markdown.extend(page_markdown)
return '\n\n'.join(markdown)
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
img_buket_path):
markdown_with_para_and_pagination = []
page_no = 0
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks")
paras_of_layout = page_info.get('para_blocks')
if not paras_of_layout:
continue
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
markdown_with_para_and_pagination.append({
'page_no': page_no,
'md_content': '\n\n'.join(page_markdown)
'page_no':
page_no,
'md_content':
'\n\n'.join(page_markdown)
})
page_no += 1
return markdown_with_para_and_pagination
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
page_markdown = []
for paras in paras_of_layout:
for para in paras:
......@@ -81,8 +88,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
if span_type == ContentType.Text:
content = span['content']
language = detect_lang(content)
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(split_long_words(content))
if (language == 'en'): # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(
split_long_words(content))
else:
content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation:
......@@ -106,7 +114,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
return page_markdown
def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
mode,
img_buket_path=''):
page_markdown = []
for para_block in paras_of_layout:
para_text = ''
......@@ -114,7 +124,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
if para_type == BlockType.Text:
para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Title:
para_text = f"# {merge_para_with_text(para_block)}"
para_text = f'# {merge_para_with_text(para_block)}'
elif para_type == BlockType.InterlineEquation:
para_text = merge_para_with_text(para_block)
elif para_type == BlockType.Image:
......@@ -130,11 +140,13 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageCaption:
para_text += merge_para_with_text(block)
for block in para_block['blocks']: # 2nd.拼image_caption
if block['type'] == BlockType.ImageFootnote:
para_text += merge_para_with_text(block)
elif para_type == BlockType.Table:
if mode == 'nlp':
continue
elif mode == 'mm':
table_caption = ''
for block in para_block['blocks']: # 1st.拼table_caption
if block['type'] == BlockType.TableCaption:
para_text += merge_para_with_text(block)
......@@ -163,6 +175,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
def merge_para_with_text(para_block):
def detect_language(text):
en_pattern = r'[a-zA-Z]+'
en_matches = re.findall(en_pattern, text)
......@@ -171,19 +184,19 @@ def merge_para_with_text(para_block):
if en_length / len(text) >= 0.5:
return 'en'
else:
return "unknown"
return 'unknown'
else:
return "empty"
return 'empty'
para_text = ''
for line in para_block['lines']:
line_text = ""
line_lang = ""
line_text = ''
line_lang = ''
for span in line['spans']:
span_type = span['type']
if span_type == ContentType.Text:
line_text += span['content'].strip()
if line_text != "":
if line_text != '':
line_lang = detect_lang(line_text)
for span in line['spans']:
span_type = span['type']
......@@ -193,7 +206,8 @@ def merge_para_with_text(para_block):
# language = detect_lang(content)
language = detect_language(content)
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(split_long_words(content))
content = ocr_escape_special_markdown_char(
split_long_words(content))
else:
content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation:
......@@ -227,12 +241,13 @@ def para_to_standard_format(para, img_buket_path):
for span in line['spans']:
language = ''
span_type = span.get('type')
content = ""
content = ''
if span_type == ContentType.Text:
content = span['content']
language = detect_lang(content)
if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本
content = ocr_escape_special_markdown_char(split_long_words(content))
content = ocr_escape_special_markdown_char(
split_long_words(content))
else:
content = ocr_escape_special_markdown_char(content)
elif span_type == ContentType.InlineEquation:
......@@ -245,7 +260,7 @@ def para_to_standard_format(para, img_buket_path):
para_content = {
'type': 'text',
'text': para_text,
'inline_equation_num': inline_equation_num
'inline_equation_num': inline_equation_num,
}
return para_content
......@@ -256,37 +271,35 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
para_content = {
'type': 'text',
'text': merge_para_with_text(para_block),
'page_idx': page_idx
'page_idx': page_idx,
}
elif para_type == BlockType.Title:
para_content = {
'type': 'text',
'text': merge_para_with_text(para_block),
'text_level': 1,
'page_idx': page_idx
'page_idx': page_idx,
}
elif para_type == BlockType.InterlineEquation:
para_content = {
'type': 'equation',
'text': merge_para_with_text(para_block),
'text_format': "latex",
'page_idx': page_idx
'text_format': 'latex',
'page_idx': page_idx,
}
elif para_type == BlockType.Image:
para_content = {
'type': 'image',
'page_idx': page_idx
}
para_content = {'type': 'image', 'page_idx': page_idx}
for block in para_block['blocks']:
if block['type'] == BlockType.ImageBody:
para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
para_content['img_path'] = join_path(
img_buket_path,
block['lines'][0]['spans'][0]['image_path'])
if block['type'] == BlockType.ImageCaption:
para_content['img_caption'] = merge_para_with_text(block)
if block['type'] == BlockType.ImageFootnote:
para_content['img_footnote'] = merge_para_with_text(block)
elif para_type == BlockType.Table:
para_content = {
'type': 'table',
'page_idx': page_idx
}
para_content = {'type': 'table', 'page_idx': page_idx}
for block in para_block['blocks']:
if block['type'] == BlockType.TableBody:
if block["lines"][0]["spans"][0].get('latex', ''):
......@@ -305,17 +318,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
content_list = []
for page_info in pdf_info_dict:
paras_of_layout = page_info.get("para_blocks")
paras_of_layout = page_info.get('para_blocks')
if not paras_of_layout:
continue
for para_block in paras_of_layout:
para_content = para_to_standard_format_v2(para_block, img_buket_path)
para_content = para_to_standard_format_v2(para_block,
img_buket_path)
content_list.append(para_content)
return content_list
def line_to_standard_format(line, img_buket_path):
line_text = ""
line_text = ''
inline_equation_num = 0
for span in line['spans']:
if not span.get('content'):
......@@ -325,13 +339,15 @@ def line_to_standard_format(line, img_buket_path):
if span['type'] == ContentType.Image:
content = {
'type': 'image',
'img_path': join_path(img_buket_path, span['image_path'])
'img_path': join_path(img_buket_path,
span['image_path']),
}
return content
elif span['type'] == ContentType.Table:
content = {
'type': 'table',
'img_path': join_path(img_buket_path, span['image_path'])
'img_path': join_path(img_buket_path,
span['image_path']),
}
return content
else:
......@@ -339,36 +355,33 @@ def line_to_standard_format(line, img_buket_path):
interline_equation = span['content']
content = {
'type': 'equation',
'latex': f"$$\n{interline_equation}\n$$"
'latex': f'$$\n{interline_equation}\n$$'
}
return content
elif span['type'] == ContentType.InlineEquation:
inline_equation = span['content']
line_text += f"${inline_equation}$"
line_text += f'${inline_equation}$'
inline_equation_num += 1
elif span['type'] == ContentType.Text:
text_content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
text_content = ocr_escape_special_markdown_char(
span['content']) # 转义特殊符号
line_text += text_content
content = {
'type': 'text',
'text': line_text,
'inline_equation_num': inline_equation_num
'inline_equation_num': inline_equation_num,
}
return content
def ocr_mk_mm_standard_format(pdf_info_dict: list):
"""
content_list
type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
latex string latex文本字段。
text string 纯文本格式的文本数据。
md string markdown格式的文本数据。
img_path string s3://full/path/to/img.jpg
"""
"""content_list type string
image/text/table/equation(行间的单独拿出来,行内的和text合并) latex string
latex文本字段。 text string 纯文本格式的文本数据。 md string
markdown格式的文本数据。 img_path string s3://full/path/to/img.jpg."""
content_list = []
for page_info in pdf_info_dict:
blocks = page_info.get("preproc_blocks")
blocks = page_info.get('preproc_blocks')
if not blocks:
continue
for block in blocks:
......@@ -378,34 +391,42 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
return content_list
def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""):
def union_make(pdf_info_dict: list,
make_mode: str,
drop_mode: str,
img_buket_path: str = ''):
output_content = []
for page_info in pdf_info_dict:
if page_info.get("need_drop", False):
drop_reason = page_info.get("drop_reason")
if page_info.get('need_drop', False):
drop_reason = page_info.get('drop_reason')
if drop_mode == DropMode.NONE:
pass
elif drop_mode == DropMode.WHOLE_PDF:
raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}")
raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
f'drop_reason is {drop_reason}'))
elif drop_mode == DropMode.SINGLE_PAGE:
logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}")
logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
f'drop_reason is {drop_reason}'))
continue
else:
raise Exception(f"drop_mode can not be null")
raise Exception('drop_mode can not be null')
paras_of_layout = page_info.get("para_blocks")
page_idx = page_info.get("page_idx")
paras_of_layout = page_info.get('para_blocks')
page_idx = page_info.get('page_idx')
if not paras_of_layout:
continue
if make_mode == MakeMode.MM_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'mm', img_buket_path)
output_content.extend(page_markdown)
elif make_mode == MakeMode.NLP_MD:
page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
page_markdown = ocr_mk_markdown_with_para_core_v2(
paras_of_layout, 'nlp')
output_content.extend(page_markdown)
elif make_mode == MakeMode.STANDARD_FORMAT:
for para_block in paras_of_layout:
para_content = para_to_standard_format_v2(para_block, img_buket_path, page_idx)
para_content = para_to_standard_format_v2(
para_block, img_buket_path, page_idx)
output_content.append(para_content)
if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
return '\n\n'.join(output_content)
......
This diff is collapsed.
This diff is collapsed.
......@@ -71,6 +71,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
tables_list, tables_body_list = [], []
tables_caption_list, tables_footnote_list = [], []
imgs_list, imgs_body_list, imgs_caption_list = [], [], []
imgs_footnote_list = []
titles_list = []
texts_list = []
interequations_list = []
......@@ -78,7 +79,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
page_layout_list = []
page_dropped_list = []
tables, tables_body, tables_caption, tables_footnote = [], [], [], []
imgs, imgs_body, imgs_caption = [], [], []
imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], []
titles = []
texts = []
interequations = []
......@@ -108,6 +109,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
imgs_body.append(bbox)
elif nested_block['type'] == BlockType.ImageCaption:
imgs_caption.append(bbox)
elif nested_block['type'] == BlockType.ImageFootnote:
imgs_footnote.append(bbox)
elif block['type'] == BlockType.Title:
titles.append(bbox)
elif block['type'] == BlockType.Text:
......@@ -121,6 +124,7 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
imgs_list.append(imgs)
imgs_body_list.append(imgs_body)
imgs_caption_list.append(imgs_caption)
imgs_footnote_list.append(imgs_footnote)
titles_list.append(titles)
texts_list.append(texts)
interequations_list.append(interequations)
......@@ -142,6 +146,8 @@ def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename):
draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255],
True)
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
True),
draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_without_number(i, interequations_list, page, [0, 255, 0],
......@@ -241,7 +247,7 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
dropped_bbox_list = []
tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
imgs_body_list, imgs_caption_list = [], []
imgs_body_list, imgs_caption_list, imgs_footnote_list = [], [], []
titles_list = []
texts_list = []
interequations_list = []
......@@ -250,7 +256,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
for i in range(len(model_list)):
page_dropped_list = []
tables_body, tables_caption, tables_footnote = [], [], []
imgs_body, imgs_caption = [], []
imgs_body, imgs_caption, imgs_footnote = [], [], []
titles = []
texts = []
interequations = []
......@@ -277,6 +283,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
interequations.append(bbox)
elif layout_det['category_id'] == CategoryId.Abandon:
page_dropped_list.append(bbox)
elif layout_det['category_id'] == CategoryId.ImageFootnote:
imgs_footnote.append(bbox)
tables_body_list.append(tables_body)
tables_caption_list.append(tables_caption)
......@@ -287,6 +295,7 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
texts_list.append(texts)
interequations_list.append(interequations)
dropped_bbox_list.append(page_dropped_list)
imgs_footnote_list.append(imgs_footnote)
for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158],
......@@ -299,6 +308,8 @@ def drow_model_bbox(model_list: list, pdf_bytes, out_path, filename):
draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255],
True)
draw_bbox_with_number(i, imgs_footnote_list, page, [255, 178, 102],
True)
draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
......
class ContentType:
Image = "image"
Table = "table"
Text = "text"
InlineEquation = "inline_equation"
InterlineEquation = "interline_equation"
Image = 'image'
Table = 'table'
Text = 'text'
InlineEquation = 'inline_equation'
InterlineEquation = 'interline_equation'
class BlockType:
Image = "image"
ImageBody = "image_body"
ImageCaption = "image_caption"
Table = "table"
TableBody = "table_body"
TableCaption = "table_caption"
TableFootnote = "table_footnote"
Text = "text"
Title = "title"
InterlineEquation = "interline_equation"
Footnote = "footnote"
Discarded = "discarded"
Image = 'image'
ImageBody = 'image_body'
ImageCaption = 'image_caption'
ImageFootnote = 'image_footnote'
Table = 'table'
TableBody = 'table_body'
TableCaption = 'table_caption'
TableFootnote = 'table_footnote'
Text = 'text'
Title = 'title'
InterlineEquation = 'interline_equation'
Footnote = 'footnote'
Discarded = 'discarded'
class CategoryId:
......@@ -33,3 +35,4 @@ class CategoryId:
InlineEquation = 13
InterlineEquation_YOLO = 14
OcrText = 15
ImageFootnote = 101
This diff is collapsed.
from loguru import logger
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio
from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
_is_in_or_part_overlap_with_area_ratio,
calculate_overlap_area_in_bbox1_area_ratio)
from magic_pdf.libs.drop_tag import DropTag
from magic_pdf.libs.ocr_content_type import ContentType, BlockType
from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span
from magic_pdf.libs.ocr_content_type import BlockType, ContentType
# 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines):
line_objects = []
for line in lines:
# 按照x0坐标排序
# 按照x0坐标排序
line.sort(key=lambda span: span['bbox'][0])
line_bbox = [
min(span['bbox'][0] for span in line), # x0
......@@ -21,8 +18,8 @@ def line_sort_spans_by_left_to_right(lines):
max(span['bbox'][3] for span in line), # y1
]
line_objects.append({
"bbox": line_bbox,
"spans": line,
'bbox': line_bbox,
'spans': line,
})
return line_objects
......@@ -39,16 +36,21 @@ def merge_spans_to_line(spans):
for span in spans[1:]:
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
# image和table类型,同上
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
current_line):
if span['type'] in [
ContentType.InterlineEquation, ContentType.Image,
ContentType.Table
] or any(s['type'] in [
ContentType.InterlineEquation, ContentType.Image,
ContentType.Table
] for s in current_line):
# 则开始新行
lines.append(current_line)
current_line = [span]
continue
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
if __is_overlaps_y_exceeds_threshold(span['bbox'],
current_line[-1]['bbox']):
current_line.append(span)
else:
# 否则,开始新行
......@@ -71,7 +73,8 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
# 遍历spans,将每个span放入对应的layout中
layout_sapns = []
for span in spans:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6:
if calculate_overlap_area_in_bbox1_area_ratio(
span['bbox'], layout_bbox) > 0.6:
layout_sapns.append(span)
# 如果layout_sapns不为空,则放入new_spans中
if len(layout_sapns) > 0:
......@@ -99,12 +102,10 @@ def merge_lines_to_block(lines):
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks = []
for line in lines:
blocks.append(
{
"bbox": line["bbox"],
"lines": [line],
}
)
blocks.append({
'bbox': line['bbox'],
'lines': [line],
})
return blocks
......@@ -121,7 +122,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
if block[7] == BlockType.Footnote:
continue
block_bbox = block[:4]
if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8:
if calculate_overlap_area_in_bbox1_area_ratio(
block_bbox, layout_bbox) > 0.8:
layout_blocks.append(block)
# 如果layout_blocks不为空,则放入new_blocks中
......@@ -134,7 +136,8 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
# 如果new_blocks不为空,则对new_blocks中每个block进行排序
if len(new_blocks) > 0:
for bboxes_in_layout_block in new_blocks:
bboxes_in_layout_block.sort(key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
bboxes_in_layout_block.sort(
key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序
sort_blocks.extend(bboxes_in_layout_block)
# sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序
......@@ -142,9 +145,7 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes):
def fill_spans_in_blocks(blocks, spans, radio):
'''
将allspans中的span按位置关系,放入blocks中
'''
"""将allspans中的span按位置关系,放入blocks中."""
block_with_spans = []
for block in blocks:
block_type = block[7]
......@@ -156,17 +157,15 @@ def fill_spans_in_blocks(blocks, spans, radio):
block_spans = []
for span in spans:
span_bbox = span['bbox']
if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio:
if calculate_overlap_area_in_bbox1_area_ratio(
span_bbox, block_bbox) > radio:
block_spans.append(span)
'''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)'''
# displayed_list = []
# text_inline_lines = []
# modify_y_axis(block_spans, displayed_list, text_inline_lines)
'''模型识别错误的行间公式, type类型转换成行内公式'''
# block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines)
'''bbox去除粘连''' # 去粘连会影响span的bbox,导致后续fill的时候出错
# block_spans = remove_overlap_between_bbox_for_span(block_spans)
......@@ -182,12 +181,9 @@ def fill_spans_in_blocks(blocks, spans, radio):
def fix_block_spans(block_with_spans, img_blocks, table_blocks):
'''
1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
需要将caption和footnote的text_span放入相应img_block和table_block内的
caption_block和footnote_block中
2、同时需要删除block中的spans字段
'''
"""1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系
需要将caption和footnote的text_span放入相应img_block和table_block内的
caption_block和footnote_block中 2、同时需要删除block中的spans字段."""
fix_blocks = []
for block in block_with_spans:
block_type = block['type']
......@@ -218,16 +214,13 @@ def merge_spans_to_block(spans: list, block_bbox: list, block_type: str):
block_spans = []
# 如果有img_caption,则将img_block中的text_spans放入img_caption_block中
for span in spans:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'],
block_bbox) > 0.6:
block_spans.append(span)
block_lines = merge_spans_to_line(block_spans)
# 对line中的span进行排序
sort_block_lines = line_sort_spans_by_left_to_right(block_lines)
block = {
'bbox': block_bbox,
'type': block_type,
'lines': sort_block_lines
}
block = {'bbox': block_bbox, 'type': block_type, 'lines': sort_block_lines}
return block, block_spans
......@@ -237,11 +230,7 @@ def make_body_block(span: dict, block_bbox: list, block_type: str):
'bbox': block_bbox,
'spans': [span],
}
body_block = {
'bbox': block_bbox,
'type': block_type,
'lines': [body_line]
}
body_block = {'bbox': block_bbox, 'type': block_type, 'lines': [body_line]}
return body_block
......@@ -249,13 +238,16 @@ def fix_image_block(block, img_blocks):
block['blocks'] = []
# 遍历img_blocks,找到与当前block匹配的img_block
for img_block in img_blocks:
if _is_in_or_part_overlap_with_area_ratio(block['bbox'], img_block['bbox'], 0.95):
if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
img_block['bbox'], 0.95):
# 创建img_body_block
for span in block['spans']:
if span['type'] == ContentType.Image and img_block['img_body_bbox'] == span['bbox']:
if span['type'] == ContentType.Image and img_block[
'img_body_bbox'] == span['bbox']:
# 创建img_body_block
img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody)
img_body_block = make_body_block(
span, img_block['img_body_bbox'], BlockType.ImageBody)
block['blocks'].append(img_body_block)
# 从spans中移除img_body_block中已经放入的span
......@@ -265,10 +257,15 @@ def fix_image_block(block, img_blocks):
# 根据list长度,判断img_block中是否有img_caption
if img_block['img_caption_bbox'] is not None:
img_caption_block, img_caption_spans = merge_spans_to_block(
block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption
)
block['spans'], img_block['img_caption_bbox'],
BlockType.ImageCaption)
block['blocks'].append(img_caption_block)
if img_block['img_footnote_bbox'] is not None:
img_footnote_block, img_footnote_spans = merge_spans_to_block(
block['spans'], img_block['img_footnote_bbox'],
BlockType.ImageFootnote)
block['blocks'].append(img_footnote_block)
break
del block['spans']
return block
......@@ -278,13 +275,17 @@ def fix_table_block(block, table_blocks):
block['blocks'] = []
# 遍历table_blocks,找到与当前block匹配的table_block
for table_block in table_blocks:
if _is_in_or_part_overlap_with_area_ratio(block['bbox'], table_block['bbox'], 0.95):
if _is_in_or_part_overlap_with_area_ratio(block['bbox'],
table_block['bbox'], 0.95):
# 创建table_body_block
for span in block['spans']:
if span['type'] == ContentType.Table and table_block['table_body_bbox'] == span['bbox']:
if span['type'] == ContentType.Table and table_block[
'table_body_bbox'] == span['bbox']:
# 创建table_body_block
table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody)
table_body_block = make_body_block(
span, table_block['table_body_bbox'],
BlockType.TableBody)
block['blocks'].append(table_body_block)
# 从spans中移除img_body_block中已经放入的span
......@@ -294,8 +295,8 @@ def fix_table_block(block, table_blocks):
# 根据list长度,判断table_block中是否有caption
if table_block['table_caption_bbox'] is not None:
table_caption_block, table_caption_spans = merge_spans_to_block(
block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption
)
block['spans'], table_block['table_caption_bbox'],
BlockType.TableCaption)
block['blocks'].append(table_caption_block)
# 如果table_caption_block_spans不为空
......@@ -307,8 +308,8 @@ def fix_table_block(block, table_blocks):
# 根据list长度,判断table_block中是否有table_note
if table_block['table_footnote_bbox'] is not None:
table_footnote_block, table_footnote_spans = merge_spans_to_block(
block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote
)
block['spans'], table_block['table_footnote_bbox'],
BlockType.TableFootnote)
block['blocks'].append(table_footnote_block)
break
......
# 欢迎来到 MinerU 项目列表
## 项目列表
- [llama_index_rag](./llama_index_rag/README.md): 基于 llama_index 构建轻量级 RAG 系统
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment