"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "5bbe41c4d33bf575b8ea455bf777414c329bea99"
Commit b6f051d8 authored by 赵小蒙
Browse files

在layout.pdf中绘制drop的bbox

parent 85587b25
def mk_nlp_markdown(pdf_info_dict: dict): def mk_nlp_markdown(pdf_info_dict: dict):
markdown = [] markdown = []
for _, page_info in pdf_info_dict.items(): for _, page_info in pdf_info_dict.items():
...@@ -22,6 +21,7 @@ def mk_nlp_markdown(pdf_info_dict: dict): ...@@ -22,6 +21,7 @@ def mk_nlp_markdown(pdf_info_dict: dict):
markdown.append(line_text.strip() + ' ') markdown.append(line_text.strip() + ' ')
return '\n'.join(markdown) return '\n'.join(markdown)
def mk_mm_markdown(pdf_info_dict: dict): def mk_mm_markdown(pdf_info_dict: dict):
markdown = [] markdown = []
......
...@@ -27,15 +27,22 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config): ...@@ -27,15 +27,22 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
def draw_layout_bbox(pdf_info_dict, input_path, out_path): def draw_layout_bbox(pdf_info_dict, input_path, out_path):
layout_bbox_list = [] layout_bbox_list = []
dropped_bbox_list = []
for page in pdf_info_dict.values(): for page in pdf_info_dict.values():
page_list = [] page_layout_list = []
page_dropped_list = []
for layout in page['layout_bboxes']: for layout in page['layout_bboxes']:
page_list.append(layout['layout_bbox']) page_layout_list.append(layout['layout_bbox'])
layout_bbox_list.append(page_list) layout_bbox_list.append(page_layout_list)
for drop_tag, dropped_bboxes in page['dropped_bboxes'].items():
for dropped_bbox in dropped_bboxes:
page_dropped_list.append(dropped_bbox)
dropped_bbox_list.append(page_dropped_list)
doc = fitz.open(input_path) doc = fitz.open(input_path)
for i, page in enumerate(doc): for i, page in enumerate(doc):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0]) draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0])
# Save the PDF # Save the PDF
doc.save(f"{out_path}/layout.pdf") doc.save(f"{out_path}/layout.pdf")
......
...@@ -32,7 +32,8 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox ...@@ -32,7 +32,8 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations, images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block): dropped_text_block, dropped_image_block, dropped_table_block,
need_remove_spans_bboxes_dict):
return_dict = { return_dict = {
'preproc_blocks': blocks, 'preproc_blocks': blocks,
'layout_bboxes': layout_bboxes, 'layout_bboxes': layout_bboxes,
...@@ -46,6 +47,7 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay ...@@ -46,6 +47,7 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
'dropped_text_block': dropped_text_block, 'dropped_text_block': dropped_text_block,
'dropped_image_block': dropped_image_block, 'dropped_image_block': dropped_image_block,
'dropped_table_block': dropped_table_block, 'dropped_table_block': dropped_table_block,
'dropped_bboxes': need_remove_spans_bboxes_dict,
} }
return return_dict return return_dict
...@@ -233,7 +235,8 @@ def parse_pdf_by_ocr( ...@@ -233,7 +235,8 @@ def parse_pdf_by_ocr(
# 构造pdf_info_dict # 构造pdf_info_dict
page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, inline_equations, images, tables, interline_equations, inline_equations,
dropped_text_block, dropped_image_block, dropped_table_block) dropped_text_block, dropped_image_block, dropped_table_block,
need_remove_spans_bboxes_dict)
pdf_info_dict[f"page_{page_id}"] = page_info pdf_info_dict[f"page_{page_id}"] = page_info
# 在测试时,保存调试信息 # 在测试时,保存调试信息
......
...@@ -60,7 +60,7 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes): ...@@ -60,7 +60,7 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
# 遍历spans,将每个span放入对应的layout中 # 遍历spans,将每个span放入对应的layout中
layout_sapns = [] layout_sapns = []
for span in spans: for span in spans:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8: if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.65:
layout_sapns.append(span) layout_sapns.append(span)
# 如果layout_sapns不为空,则放入new_spans中 # 如果layout_sapns不为空,则放入new_spans中
if len(layout_sapns) > 0: if len(layout_sapns) > 0:
......
...@@ -37,18 +37,18 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict): ...@@ -37,18 +37,18 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
dropped_text_block = [] dropped_text_block = []
dropped_image_block = [] dropped_image_block = []
dropped_table_block = [] dropped_table_block = []
for key, value in need_remove_spans_bboxes_dict.items(): for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
# logger.info(f"remove spans by bbox dict, key: {key}, value: {value}") # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
need_remove_spans = [] need_remove_spans = []
for span in spans: for span in spans:
for removed_bbox in value: for removed_bbox in removed_bboxes:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5: if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
need_remove_spans.append(span) need_remove_spans.append(span)
break break
for span in need_remove_spans: for span in need_remove_spans:
spans.remove(span) spans.remove(span)
span['tag'] = key span['tag'] = drop_tag
if span['type'] in ['text', 'inline_equation', 'displayed_equation']: if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
dropped_text_block.append(span) dropped_text_block.append(span)
elif span['type'] == 'image': elif span['type'] == 'image':
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment