Commit 864e9535 authored by 赵小蒙
Browse files

span->line现基于模型的layout进行拼接

parent 0c279ffc
......@@ -57,4 +57,4 @@ if __name__ == '__main__':
# logger.info(markdown_content)
# save_markdown(markdown_text, ocr_json_file_path)
except Exception as e:
logger.error(e)
logger.exception(e)
......@@ -11,7 +11,7 @@ from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
from magic_pdf.pre_proc.detect_header import parse_headers
from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
from magic_pdf.pre_proc.ocr_dict_merge import remove_overlaps_min_spans, merge_spans_to_line_by_layout
from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
......@@ -151,10 +151,10 @@ def parse_pdf_by_ocr(
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes = layout_detect(ocr_page_info['subfield_dets'], page)
layout_bboxes = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)
# 将spans合并成line(在layout内,从上到下,从左到右)
lines = merge_spans_to_line(spans, layout_bboxes)
lines = merge_spans_to_line_by_layout(spans, layout_bboxes)
# logger.info(lines)
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
......
......@@ -66,7 +66,7 @@ def adjust_layouts(layout_bboxes):
return layout_bboxes
def layout_detect(layout_info, page: fitz.Page):
def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
"""
对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。
......@@ -77,7 +77,7 @@ def layout_detect(layout_info, page: fitz.Page):
list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。
"""
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(layout_info, page)
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)
# 初始化布局边界框列表
layout_bboxes = []
# 遍历每个子布局
......
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio
from loguru import logger
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
calculate_overlap_area_in_bbox1_area_ratio
# 删除重叠spans中较小的那些
......@@ -14,6 +17,24 @@ def remove_overlaps_min_spans(spans):
return spans
# 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines):
    """Sort the spans of every line by x0 and wrap each line with its bounding box.

    Args:
        lines: list of lines, where each line is a list of span dicts that
            carry a ``'bbox'`` entry indexable as ``[x0, y0, x1, y1]``.

    Returns:
        A list of dicts ``{"bbox": [x0, y0, x1, y1], "spans": line}``, one per
        non-empty input line, in input order. ``bbox`` is the union of the
        span boxes; ``spans`` is the same list object that was passed in,
        sorted in place by the left edge.
    """
    line_objects = []
    for line in lines:
        if not line:
            # A line without spans has no bounding box; min()/max() would
            # raise ValueError on it, so leave it out of the result.
            continue
        # Sort in place by the left edge (x0) so spans read left to right.
        line.sort(key=lambda span: span['bbox'][0])
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
            "bbox": line_bbox,
            "spans": line,
        })
    return line_objects
def merge_spans_to_line(spans):
# 按照y0坐标排序
spans.sort(key=lambda span: span['bbox'][1])
......@@ -23,7 +44,8 @@ def merge_spans_to_line(spans):
for span in spans[1:]:
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# image和table类型,同上
if span['type'] in ["displayed_equation", "image", "table"] or any(s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
if span['type'] in ["displayed_equation", "image", "table"] or any(
s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
# 则开始新行
lines.append(current_line)
current_line = [span]
......@@ -41,20 +63,25 @@ def merge_spans_to_line(spans):
if current_line:
lines.append(current_line)
# 计算每行的边界框,并对每行中的span按照x0进行排序
line_objects = []
for line in lines:
# 按照x0坐标排序
line.sort(key=lambda span: span['bbox'][0])
line_bbox = [
min(span['bbox'][0] for span in line), # x0
min(span['bbox'][1] for span in line), # y0
max(span['bbox'][2] for span in line), # x1
max(span['bbox'][3] for span in line), # y1
]
line_objects.append({
"bbox": line_bbox,
"spans": line,
})
return lines
return line_objects
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Group spans by layout region, merge each group into lines, then sort each line.

    Regions are handled in the order given by ``layout_bboxes``. A span is
    assigned to a region when
    ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox)``
    is greater than 0.8. Spans that match no region do not appear in the
    result.

    Args:
        spans: list of span dicts, each with a ``'bbox'`` entry.
        layout_bboxes: list of dicts, each with a ``'layout_bbox'`` entry.

    Returns:
        The result of ``line_sort_spans_by_left_to_right`` applied to the
        lines merged from all regions.
    """
    # First pass: collect the spans that belong to each layout region.
    spans_per_layout = []
    for item in layout_bboxes:
        layout_bbox = item['layout_bbox']
        layout_spans = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8
        ]
        spans_per_layout.append(layout_spans)

    # Second pass: merge each region's spans into lines.
    lines = []
    for layout_spans in spans_per_layout:
        if not layout_spans:
            # A region with no spans contributes no lines. Skipping it also
            # avoids handing merge_spans_to_line an empty list.
            # NOTE(review): merge_spans_to_line appears to assume at least
            # one span — confirm against its full body.
            continue
        lines.extend(merge_spans_to_line(layout_spans))

    # Sort the spans inside every line from left to right.
    return line_sort_spans_by_left_to_right(lines)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment