Commit 63969109 authored by 赵小蒙
Browse files

移动modify_y_axis在pipeline中的位置

parent 61405b8a
......@@ -177,9 +177,6 @@ def parse_pdf_by_ocr(
# 删除重叠spans中较小的那些
spans = remove_overlaps_min_spans(spans)
# 对type=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整低于文字的y0
spans = modify_y_axis(spans)
# 删除remove_span_block_bboxes中的bbox
spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
......@@ -187,8 +184,8 @@ def parse_pdf_by_ocr(
spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
# 模型识别错误的行间公式, type类型转换成行内公式
spans = modify_y_axis(spans)
# bbox去除粘连
spans = remove_overlap_between_bbox(spans)
......
......@@ -64,7 +64,7 @@ def adjust_layouts(layout_bboxes, page_boundry, page_id):
# 排序调整布局边界框列表
new_bboxes = []
for layout_bbox in layout_bboxes:
new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None,None])
new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None, None])
layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id)
......
......@@ -9,7 +9,7 @@ def remove_overlaps_min_spans(spans):
for span1 in spans.copy():
for span2 in spans.copy():
if span1 != span2:
overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.5)
overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
if overlap_box is not None:
bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
if bbox_to_remove is not None:
......@@ -113,8 +113,8 @@ def modify_y_axis(spans: list):
#用于给行间公式搜索
text_inline_lines = []
for span in spans[1:]:
if span.get("content","") == "78.":
print("debug")
# if span.get("content","") == "78.":
# print("debug")
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
# image和table类型,同上
if span['type'] in ["displayed_equation", "image", "table"] or any(
......@@ -167,8 +167,8 @@ def modify_y_axis(spans: list):
#错误行间公式转行内公式
j = 0
for i in range(len(displayed_list)):
if i == 8:
print("debug")
# if i == 8:
# print("debug")
span = displayed_list[i]
span_y0, span_y = span["bbox"][1], span["bbox"][3]
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment