Commit 1f468bed authored by liukaiwen
Browse files

Add modify_inline_equation and adjust inline-equation y-axis alignment

Convert falsely detected displayed equations to inline equations
parent 63969109
...@@ -24,7 +24,8 @@ from magic_pdf.pre_proc.ocr_detect_layout import layout_detect ...@@ -24,7 +24,8 @@ from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import ( from magic_pdf.pre_proc.ocr_dict_merge import (
remove_overlaps_min_spans, remove_overlaps_min_spans,
merge_spans_to_line_by_layout, merge_spans_to_line_by_layout,
modify_y_axis modify_y_axis,
modify_inline_equation
) )
from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes from magic_pdf.pre_proc.ocr_remove_spans import remove_spans_by_bboxes
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
...@@ -184,8 +185,11 @@ def parse_pdf_by_ocr( ...@@ -184,8 +185,11 @@ def parse_pdf_by_ocr(
spans = cut_image_and_table(spans, page, page_id, book_name, save_path) spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧) # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
displayed_list = []
text_inline_lines = []
modify_y_axis(spans, displayed_list, text_inline_lines)
# 模型识别错误的行间公式, type类型转换成行内公式 # 模型识别错误的行间公式, type类型转换成行内公式
spans = modify_y_axis(spans) spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
# bbox去除粘连 # bbox去除粘连
spans = remove_overlap_between_bbox(spans) spans = remove_overlap_between_bbox(spans)
......
...@@ -94,12 +94,8 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes): ...@@ -94,12 +94,8 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
def modify_y_axis(spans: list): def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
inline_list = [] # displayed_list = []
displayed_list = []
text_list = []
image_list = []
table_list = []
spans.sort(key=lambda span: span['bbox'][1]) spans.sort(key=lambda span: span['bbox'][1])
...@@ -111,7 +107,7 @@ def modify_y_axis(spans: list): ...@@ -111,7 +107,7 @@ def modify_y_axis(spans: list):
line_first_y0 = spans[0]["bbox"][1] line_first_y0 = spans[0]["bbox"][1]
line_first_y = spans[0]["bbox"][3] line_first_y = spans[0]["bbox"][3]
#用于给行间公式搜索 #用于给行间公式搜索
text_inline_lines = [] # text_inline_lines = []
for span in spans[1:]: for span in spans[1:]:
# if span.get("content","") == "78.": # if span.get("content","") == "78.":
# print("debug") # print("debug")
...@@ -133,9 +129,8 @@ def modify_y_axis(spans: list): ...@@ -133,9 +129,8 @@ def modify_y_axis(spans: list):
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']): if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
if span["bbox"][1] < line_first_y0: if span["type"] == "text":
line_first_y0 = span["bbox"][1] line_first_y0 = span["bbox"][1]
if span["bbox"][3] > line_first_y:
line_first_y = span["bbox"][3] line_first_y = span["bbox"][3]
current_line.append(span) current_line.append(span)
...@@ -164,6 +159,10 @@ def modify_y_axis(spans: list): ...@@ -164,6 +159,10 @@ def modify_y_axis(spans: list):
for span in current_line: for span in current_line:
span["bbox"][1] = line_first_y0 span["bbox"][1] = line_first_y0
span["bbox"][3] = line_first_y span["bbox"][3] = line_first_y
# return spans, displayed_list, text_inline_lines
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
#错误行间公式转行内公式 #错误行间公式转行内公式
j = 0 j = 0
for i in range(len(displayed_list)): for i in range(len(displayed_list)):
...@@ -180,7 +179,12 @@ def modify_y_axis(spans: list): ...@@ -180,7 +179,12 @@ def modify_y_axis(spans: list):
# span["bbox"][3] = y1 # span["bbox"][3] = y1
#调整公式类型 #调整公式类型
if span["type"] == "displayed_equation": if span["type"] == "displayed_equation":
span["type"] = "inline_equation" if j+1 >= len(text_inline_lines):
span["type"] = "inline_equation"
else:
y0_next, y1_next = text_inline_lines[j + 1][1]
if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)):
span["type"] = "inline_equation"
break break
elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)): elif span_y < y0 or span_y0 < y0 and span_y > y0 and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
break break
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment