fix(pdf_parse): improve span removal logic for all content types

- Update remove_outside_spans function to handle all content types
- Add processing for text and equation spans
- Improve overlap calculation for better accuracy

fix(pdf_parse): improve span removal logic for all content types
- Update remove_outside_spans function to handle all content types
- Add processing for text and equation spans
- Improve overlap calculation for better accuracy
509128d5 · myhloli · eeda90af · 509128d5
Commit 509128d5 authored Nov 01, 2024 by myhloli
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 4 deletions

magic_pdf/pdf_parse_union_core_v2.py magic_pdf/pdf_parse_union_core_v2.py +4 -4

No files found.
--- a/magic_pdf/pdf_parse_union_core_v2.py
+++ b/magic_pdf/pdf_parse_union_core_v2.py
@@ -385,17 +385,17 @@ def revert_group_blocks(blocks):
 def remove_outside_spans(spans, all_bboxes):
    image_bboxes = []
    table_bboxes = []
-    all_block_bboxes = []
+    other_block_bboxes = []
    for block in all_bboxes:
        block_type = block[7]
        block_bbox = block[0:4]
-        all_block_bboxes.append(block_bbox)
        if block_type == BlockType.ImageBody:
            image_bboxes.append(block_bbox)
        elif block_type == BlockType.TableBody:
            table_bboxes.append(block_bbox)
        else:
-            continue
+            other_block_bboxes.append(block_bbox)
    new_spans = []
@@ -411,7 +411,7 @@ def remove_outside_spans(spans, all_bboxes):
                    new_spans.append(span)
                    break
        elif span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
-            for block_bbox in all_block_bboxes:
+            for block_bbox in other_block_bboxes:
                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.5:
                    new_spans.append(span)
                    break