# ocr_remove_spans.py
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
2
3
4
5


def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove, in place, every span that falls inside one of the given bboxes.

    Each span is compared against the bboxes in ``need_remove_spans_bboxes``.
    The span is dropped as soon as one bbox makes
    ``calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox)``
    exceed 0.5; otherwise the span is kept.
    NOTE(review): presumably that ratio is overlap area divided by the span's
    own bbox area -- confirm against the helper in magic_pdf.libs.boxbase.

    Args:
        spans: list of span dicts, each carrying a ``'bbox'`` entry. The list
            is filtered in place.
        need_remove_spans_bboxes: collection of bboxes marking regions whose
            spans should be discarded. It is iterated once per span.

    Returns:
        The same ``spans`` list object, holding only the surviving spans in
        their original order.
    """
    # Slice assignment keeps the caller's list object (callers may rely on the
    # in-place mutation) while filtering in a single pass, instead of calling
    # list.remove() -- an equality-based rescan -- once per dropped span.
    spans[:] = [
        span
        for span in spans
        if not any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
            for removed_bbox in need_remove_spans_bboxes
        )
    ]
    return spans