Merge pull request #834 from myhloli/dev

feat(pdf_parse): improve span filtering and add new block types

Merge pull request #834 from myhloli/dev
feat(pdf_parse): improve span filtering and add new block types
099f19f2 · Xiaomeng Zhao · GitHub · 73afb7d6 · 149132d6 · 099f19f2
Unverified commit 099f19f2, authored Nov 01, 2024 by Xiaomeng Zhao; committed by GitHub on Nov 01, 2024
5 changed files
--- a/docs/output_file_en_us.md
+++ b/docs/output_file_en_us.md
@@ -175,11 +175,14 @@ Detailed explanation of second-level block types
 | :----------------- | :--------------------- |
 | image_body         | Main body of the image |
 | image_caption      | Image description text |
+| image_footnote     | Image footnote         |
 | table_body         | Main body of the table |
 | table_caption      | Table description text |
 | table_footnote     | Table footnote         |
 | text               | Text block             |
 | title              | Title block            |
+| index              | Index block            |
+| list               | List block             |
 | interline_equation | Block formula          |
 <br>

--- a/docs/output_file_zh_cn.md
+++ b/docs/output_file_zh_cn.md
@@ -174,12 +174,15 @@ poly 坐标的格式 \[x0, y0, x1, y1, x2, y2, x3, y3\], 分别表示左上、
 | :----------------- | :------------- |
 | image_body         | 图像的本体     |
 | image_caption      | 图像的描述文本 |
-| table_body         | 表格本体       |
+| image_footnote     | 图像的脚注   |
+| table_body         | 表格本体    |
 | table_caption      | 表格的描述文本 |
-| table_footnote     | 表格的脚注     |
+| table_footnote     | 表格的脚注   |
-| text               | 文本块         |
+| text               | 文本块     |
-| title              | 标题块         |
+| title              | 标题块     |
-| interline_equation | 行间公式块     |
+| index              | 目录块     |
+| list               | 列表块     |
+| interline_equation | 行间公式块   |
 <br>

--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -249,7 +249,8 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename):
                        page_dropped_list.append(span['bbox'])
        dropped_list.append(page_dropped_list)
        # 构造其余useful_list
-        for block in page['para_blocks']:
+        # for block in page['para_blocks']:  # span直接用分段合并前的结果就可以
+        for block in page['preproc_blocks']:
            if block['type'] in [
                BlockType.Text,
                BlockType.Title,

--- a/magic_pdf/libs/version.py
+++ b/magic_pdf/libs/version.py
-__version__ = "0.8.0"
+__version__ = "0.9.0"
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -382,39 +382,44 @@ def revert_group_blocks(blocks):
    return new_blocks
-def remove_outside_spans(spans, all_bboxes):
+def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
-    image_bboxes = []
+    def get_block_bboxes(blocks, block_type_list):
-    table_bboxes = []
+        return [block[0:4] for block in blocks if block[7] in block_type_list]
-    other_block_bboxes = []
-    for block in all_bboxes:
+    image_bboxes = get_block_bboxes(all_bboxes, [BlockType.ImageBody])
-        block_type = block[7]
+    table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TableBody])
-        block_bbox = block[0:4]
+    other_block_type = []
+    for block_type in BlockType.__dict__.values():
-        if block_type == BlockType.ImageBody:
+        if not isinstance(block_type, str):
-            image_bboxes.append(block_bbox)
+            continue
-        elif block_type == BlockType.TableBody:
+        if block_type not in [BlockType.ImageBody, BlockType.TableBody]:
-            table_bboxes.append(block_bbox)
+            other_block_type.append(block_type)
-        else:
+    other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
-            other_block_bboxes.append(block_bbox)
+    discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.Discarded])
    new_spans = []
    for span in spans:
-        if span['type'] == ContentType.Image:
+        span_bbox = span['bbox']
-            for block_bbox in image_bboxes:
+        span_type = span['type']
-                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
-                    new_spans.append(span)
+        if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.4 for block_bbox in
-                    break
+               discarded_block_bboxes):
-        elif span['type'] == ContentType.Table:
+            new_spans.append(span)
-            for block_bbox in table_bboxes:
+            continue
-                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
-                    new_spans.append(span)
+        if span_type == ContentType.Image:
-                    break
+            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
+                   image_bboxes):
+                new_spans.append(span)
+        elif span_type == ContentType.Table:
+            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
+                   table_bboxes):
+                new_spans.append(span)
        else:
-            for block_bbox in other_block_bboxes:
+            if any(calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > 0.5 for block_bbox in
-                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
+                   other_block_bboxes):
-                    new_spans.append(span)
+                new_spans.append(span)
-                    break
    return new_spans
@@ -488,7 +493,8 @@ def parse_page_core(
        raise Exception('parse_mode must be txt or ocr')
    """在删除重复span之前，应该通过image_body和table_body的block过滤一下image和table的span"""
-    spans = remove_outside_spans(spans, all_bboxes)
+    """顺便删除大水印并保留abandon的span"""
+    spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
    """删除重叠spans中置信度较低的那些"""
    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)