在layout.pdf中绘制drop的bbox

b6f051d8 · 赵小蒙 · 85587b25
Commit b6f051d8 authored Mar 14, 2024 by 赵小蒙
5 changed files
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
 def mk_nlp_markdown(pdf_info_dict: dict):
    markdown = []
    for _, page_info in pdf_info_dict.items():
@@ -22,6 +21,7 @@ def mk_nlp_markdown(pdf_info_dict: dict):
                markdown.append(line_text.strip() + '  ')
    return '\n'.join(markdown)
 def mk_mm_markdown(pdf_info_dict: dict):
    markdown = []

--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -27,15 +27,22 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
 def draw_layout_bbox(pdf_info_dict, input_path, out_path):
    layout_bbox_list = []
+    dropped_bbox_list = []
    for page in pdf_info_dict.values():
-        page_list = []
+        page_layout_list = []
+        page_dropped_list = []
        for layout in page['layout_bboxes']:
-            page_list.append(layout['layout_bbox'])
+            page_layout_list.append(layout['layout_bbox'])
-        layout_bbox_list.append(page_list)
+        layout_bbox_list.append(page_layout_list)
+        for drop_tag, dropped_bboxes in page['dropped_bboxes'].items():
+            for dropped_bbox in dropped_bboxes:
+                page_dropped_list.append(dropped_bbox)
+        dropped_bbox_list.append(page_dropped_list)
    doc = fitz.open(input_path)
    for i, page in enumerate(doc):
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
+        draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0])
    # Save the PDF
    doc.save(f"{out_path}/layout.pdf")

--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -32,7 +32,8 @@ from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
 def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                             images, tables, interline_equations, inline_equations,
-                             dropped_text_block, dropped_image_block, dropped_table_block):
+                             dropped_text_block, dropped_image_block, dropped_table_block,
+                             need_remove_spans_bboxes_dict):
    return_dict = {
        'preproc_blocks': blocks,
        'layout_bboxes': layout_bboxes,
@@ -46,6 +47,7 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
        'dropped_text_block': dropped_text_block,
        'dropped_image_block': dropped_image_block,
        'dropped_table_block': dropped_table_block,
+        'dropped_bboxes': need_remove_spans_bboxes_dict,
    }
    return return_dict
@@ -233,7 +235,8 @@ def parse_pdf_by_ocr(
        # 构造pdf_info_dict
        page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                             images, tables, interline_equations, inline_equations,
-                                             dropped_text_block, dropped_image_block, dropped_table_block)
+                                             dropped_text_block, dropped_image_block, dropped_table_block,
+                                             need_remove_spans_bboxes_dict)
        pdf_info_dict[f"page_{page_id}"] = page_info
    # 在测试时,保存调试信息

--- a/magic_pdf/pre_proc/ocr_dict_merge.py
+++ b/magic_pdf/pre_proc/ocr_dict_merge.py
@@ -60,7 +60,7 @@ def merge_spans_to_line_by_layout(spans, layout_bboxes):
        # 遍历spans,将每个span放入对应的layout中
        layout_sapns = []
        for span in spans:
-            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.8:
+            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.65:
                layout_sapns.append(span)
        # 如果layout_sapns不为空，则放入new_spans中
        if len(layout_sapns) > 0:

--- a/magic_pdf/pre_proc/ocr_span_list_modify.py
+++ b/magic_pdf/pre_proc/ocr_span_list_modify.py
@@ -37,18 +37,18 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
    dropped_text_block = []
    dropped_image_block = []
    dropped_table_block = []
-    for key, value in need_remove_spans_bboxes_dict.items():
+    for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
-        # logger.info(f"remove spans by bbox dict, key: {key}, value: {value}")
+        # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}")
        need_remove_spans = []
        for span in spans:
-            for removed_bbox in value:
+            for removed_bbox in removed_bboxes:
                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
                    need_remove_spans.append(span)
                    break
        for span in need_remove_spans:
            spans.remove(span)
-            span['tag'] = key
+            span['tag'] = drop_tag
            if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
                dropped_text_block.append(span)
            elif span['type'] == 'image':