Commit f5dc261d authored by liukaiwen's avatar liukaiwen
Browse files

Merge branch 'master' into dev-in-line-bbox

parents 1f468bed 32fd7f95
from magic_pdf.libs.commons import fitz # PyMuPDF
def draw_bbox(i, bbox_list, page, rgb_config):
def draw_bbox_without_number(i, bbox_list, page, rgb_config):
new_rgb = []
for item in rgb_config:
item = float(item) / 255
......@@ -12,6 +12,19 @@ def draw_bbox(i, bbox_list, page, rgb_config):
page.draw_rect(rect_coords, color=new_rgb, fill=None, width=0.5, overlay=True) # Draw the rectangle
def draw_bbox_with_number(i, bbox_list, page, rgb_config):
    """Outline every bbox listed for page ``i`` and label each with its 1-based index.

    Args:
        i: Index into ``bbox_list`` selecting the boxes to draw.
        bbox_list: Sequence whose ``i``-th entry is an iterable of
            ``(x0, y0, x1, y1)`` boxes.
        page: Object exposing ``draw_rect`` and ``insert_text``
            (presumably a PyMuPDF page -- confirm against callers).
        rgb_config: Colour channels on a 0-255 scale; each one is divided
            by 255 before being handed to the drawing calls.
    """
    # Scale the channels from 0-255 down to 0-1 floats.
    color = [float(channel) / 255 for channel in rgb_config]
    for number, (x0, y0, x1, y1) in enumerate(bbox_list[i], start=1):
        outline = fitz.Rect(x0, y0, x1, y1)
        page.draw_rect(outline, color=color, fill=None, width=0.5, overlay=True)
        # The label is anchored at the rectangle's (x0, y0) corner.
        page.insert_text((x0, y0), str(number), fontsize=10, color=color)
def draw_layout_bbox(pdf_info_dict, input_path, out_path):
layout_bbox_list = []
for page in pdf_info_dict.values():
......@@ -22,13 +35,7 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
doc = fitz.open(input_path)
for i, page in enumerate(doc):
# 获取当前页面的数据
page_data = layout_bbox_list[i]
for j, bbox in enumerate(page_data):
x0, y0, x1, y1 = bbox
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=0.5, overlay=True) # Draw the rectangle
page.insert_text((x0, y0), str(j + 1), fontsize=10, color=(1, 0, 0)) # Insert the index at the top left corner of the rectangle
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
# Save the PDF
doc.save(f"{out_path}/layout.pdf")
......@@ -56,11 +63,9 @@ def draw_text_bbox(pdf_info_dict, input_path, out_path):
doc = fitz.open(input_path)
for i, page in enumerate(doc):
# 获取当前页面的数据
draw_bbox(i, text_list, page, [255, 0, 0])
draw_bbox(i, inline_equation_list, page, [0, 255, 0])
draw_bbox(i, displayed_equation_list, page, [0, 0, 255])
draw_bbox_without_number(i, text_list, page, [255, 0, 0])
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
draw_bbox_without_number(i, displayed_equation_list, page, [0, 0, 255])
# Save the PDF
doc.save(f"{out_path}/text.pdf")
......@@ -22,12 +22,12 @@ from magic_pdf.pre_proc.detect_page_number import parse_pageNos
from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import (
remove_overlaps_min_spans,
merge_spans_to_line_by_layout,
modify_y_axis,
modify_inline_equation
)
from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
adjust_bbox_for_standalone_block
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
......@@ -195,6 +195,8 @@ def parse_pdf_by_ocr(
spans = remove_overlap_between_bbox(spans)
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
spans = adjust_bbox_for_standalone_block(spans)
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)
......
......@@ -4,19 +4,6 @@ from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox
calculate_overlap_area_in_bbox1_area_ratio
def remove_overlaps_min_spans(spans):
    """Remove, in place, the spans selected by pairwise overlap checks.

    Every ordered pair of unequal spans is passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.8. When that helper
    returns a box, the first span still in ``spans`` whose ``'bbox'`` equals
    the returned box is removed. The helper presumably returns the smaller
    of the two boxes when they overlap enough -- confirm in
    ``magic_pdf.libs.boxbase``.

    Args:
        spans: List of dict-like spans, each with a ``'bbox'`` entry.

    Returns:
        The same ``spans`` list, after removals.
    """
    for first in spans.copy():
        # The inner snapshot is retaken on every outer step, so it reflects
        # the removals made so far; the outer snapshot is taken only once.
        for second in spans.copy():
            if first != second:
                min_box = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.8)
                victim = None
                if min_box is not None:
                    # Look the span up in the live list, not in a snapshot.
                    victim = next((s for s in spans if s['bbox'] == min_box), None)
                if victim is not None:
                    spans.remove(victim)
    return spans
# 将每一个line中的span从左到右排序
def line_sort_spans_by_left_to_right(lines):
line_objects = []
......
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove, in place, spans that overlap any of the given boxes by more than 0.5.

    The overlap figure comes from
    ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], box)``
    (presumably overlap area divided by the span's own area -- confirm in
    ``magic_pdf.libs.boxbase``).

    Args:
        spans: List of dict-like spans, each with a ``'bbox'`` entry.
        need_remove_spans_bboxes: Boxes to test each span against; iterated
            once per span.

    Returns:
        The same ``spans`` list, after removals.
    """
    # Decide first, delete afterwards, so the list is never mutated while
    # it is being scanned.
    doomed = [
        span
        for span in spans
        if any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], box) > 0.5
            for box in need_remove_spans_bboxes
        )
    ]
    for span in doomed:
        spans.remove(span)
    return spans
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio
def remove_overlaps_min_spans(spans):
    """Remove, in place, the spans selected by pairwise overlap checks.

    Every ordered pair of unequal spans is passed to
    ``get_minbox_if_overlap_by_ratio`` with a ratio of 0.65. When that helper
    returns a box, the first span still in ``spans`` whose ``'bbox'`` equals
    the returned box is removed. The helper presumably returns the smaller
    of the two boxes when they overlap enough -- confirm in
    ``magic_pdf.libs.boxbase``.

    Args:
        spans: List of dict-like spans, each with a ``'bbox'`` entry.

    Returns:
        The same ``spans`` list, after removals.
    """
    for first in spans.copy():
        # The inner snapshot is retaken on every outer step, so it reflects
        # the removals made so far; the outer snapshot is taken only once.
        for second in spans.copy():
            if first != second:
                min_box = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.65)
                victim = None
                if min_box is not None:
                    # Look the span up in the live list, not in a snapshot.
                    victim = next((s for s in spans if s['bbox'] == min_box), None)
                if victim is not None:
                    spans.remove(victim)
    return spans
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove, in place, spans that overlap any of the given boxes by more than 0.5.

    The overlap figure comes from
    ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], box)``
    (presumably overlap area divided by the span's own area -- confirm in
    ``magic_pdf.libs.boxbase``).

    Args:
        spans: List of dict-like spans, each with a ``'bbox'`` entry.
        need_remove_spans_bboxes: Boxes to test each span against; iterated
            once per span.

    Returns:
        The same ``spans`` list, after removals.
    """
    # Decide first, delete afterwards, so the list is never mutated while
    # it is being scanned.
    doomed = [
        span
        for span in spans
        if any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], box) > 0.5
            for box in need_remove_spans_bboxes
        )
    ]
    for span in doomed:
        spans.remove(span)
    return spans
def adjust_bbox_for_standalone_block(spans):
    """Lower the top edge of standalone blocks that have text to their left.

    For every span of type ``displayed_equation``, ``image`` or ``table``,
    each ``text`` / ``inline_equation`` span that lies strictly inside the
    block's vertical range and starts further left sets the block's ``y0``
    (``bbox[1]``) to that text span's ``y0``. Each match raises ``y0``, so
    later candidates are compared against the already adjusted top edge.

    Args:
        spans: List of dicts with ``'type'`` and a mutable ``'bbox'``
            indexed as ``[x0, y0, x1, y1]``.

    Returns:
        The same ``spans`` list; block bboxes are modified in place.
    """
    standalone_types = ("displayed_equation", "image", "table")
    text_types = ('text', 'inline_equation')

    for block in spans:
        if block['type'] not in standalone_types:
            continue
        for text in spans:
            if text['type'] not in text_types:
                continue
            block_box = block['bbox']
            text_box = text['bbox']
            # The text must sit strictly within the block's vertical extent...
            covered = block_box[1] < text_box[1] and block_box[3] > text_box[3]
            # ...and begin to the left of the block.
            if covered and text_box[0] < block_box[0]:
                block_box[1] = text_box[1]
    return spans
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment