Commit 63969109 authored by 赵小蒙
Browse files

移动modify_y_axis在pipeline中的位置

parent 61405b8a
...@@ -177,9 +177,6 @@ def parse_pdf_by_ocr( ...@@ -177,9 +177,6 @@ def parse_pdf_by_ocr(
# 删除重叠spans中较小的那些 # 删除重叠spans中较小的那些
spans = remove_overlaps_min_spans(spans) spans = remove_overlaps_min_spans(spans)
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
spans = modify_y_axis(spans)
# 删除remove_span_block_bboxes中的bbox # 删除remove_span_block_bboxes中的bbox
spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes) spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
...@@ -187,8 +184,8 @@ def parse_pdf_by_ocr( ...@@ -187,8 +184,8 @@ def parse_pdf_by_ocr(
spans = cut_image_and_table(spans, page, page_id, book_name, save_path) spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧) # 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
# 模型识别错误的行间公式, type类型转换成行内公式 # 模型识别错误的行间公式, type类型转换成行内公式
spans = modify_y_axis(spans)
# bbox去除粘连 # bbox去除粘连
spans = remove_overlap_between_bbox(spans) spans = remove_overlap_between_bbox(spans)
......
...@@ -64,7 +64,7 @@ def adjust_layouts(layout_bboxes, page_boundry, page_id): ...@@ -64,7 +64,7 @@ def adjust_layouts(layout_bboxes, page_boundry, page_id):
# 排序调整布局边界框列表 # 排序调整布局边界框列表
new_bboxes = [] new_bboxes = []
for layout_bbox in layout_bboxes: for layout_bbox in layout_bboxes:
new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None,None]) new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None, None])
layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id) layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id)
......
...@@ -9,7 +9,7 @@ def remove_overlaps_min_spans(spans): ...@@ -9,7 +9,7 @@ def remove_overlaps_min_spans(spans):
for span1 in spans.copy(): for span1 in spans.copy():
for span2 in spans.copy(): for span2 in spans.copy():
if span1 != span2: if span1 != span2:
overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.5) overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
if overlap_box is not None: if overlap_box is not None:
bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None) bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
if bbox_to_remove is not None: if bbox_to_remove is not None:
...@@ -113,8 +113,8 @@ def modify_y_axis(spans: list): ...@@ -113,8 +113,8 @@ def modify_y_axis(spans: list):
#用于给行间公式搜索 #用于给行间公式搜索
text_inline_lines = [] text_inline_lines = []
for span in spans[1:]: for span in spans[1:]:
if span.get("content","") == "78.": # if span.get("content","") == "78.":
print("debug") # print("debug")
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation" # 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# image和table类型,同上 # image和table类型,同上
if span['type'] in ["displayed_equation", "image", "table"] or any( if span['type'] in ["displayed_equation", "image", "table"] or any(
...@@ -167,8 +167,8 @@ def modify_y_axis(spans: list): ...@@ -167,8 +167,8 @@ def modify_y_axis(spans: list):
#错误行间公式转行内公式 #错误行间公式转行内公式
j = 0 j = 0
for i in range(len(displayed_list)): for i in range(len(displayed_list)):
if i == 8: # if i == 8:
print("debug") # print("debug")
span = displayed_list[i] span = displayed_list[i]
span_y0, span_y = span["bbox"][1], span["bbox"][3] span_y0, span_y = span["bbox"][1], span["bbox"][3]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment