# remove_bbox_overlap.py
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in


def _remove_overlap_between_bbox(spans):
    res = []
许瑞's avatar
许瑞 committed
6
7
8
9
10
11
12
13
14
15
16
17
18

    keeps = [True] * len(spans)
    for i in range(len(spans)):
        for j in range(len(spans)):
            if i == j:
                continue
            if _is_in(spans[i]["bbox"], spans[j]["bbox"]):
                keeps[i] = False

    for idx, v in enumerate(spans):
        if not keeps[idx]:
            continue

许瑞's avatar
许瑞 committed
19
        for i in range(len(res)):
许瑞's avatar
许瑞 committed
20
            if  _is_in(v["bbox"], res[i]["bbox"]):
许瑞's avatar
许瑞 committed
21
22
23
24
25
26
27
28
                continue
            if _is_in_or_part_overlap(res[i]["bbox"], v["bbox"]):
                ix0, iy0, ix1, iy1 = res[i]["bbox"]
                x0, y0, x1, y1 = v["bbox"]

                diff_x = min(x1, ix1) - max(x0, ix0)
                diff_y = min(y1, iy1) - max(y0, iy0)

许瑞's avatar
许瑞 committed
29
                if diff_y > diff_x:
许瑞's avatar
许瑞 committed
30
31
                    if x1 >= ix1:
                        mid = (x0 + ix1) // 2
32
33
                        ix1 = min(mid - 0.25, ix1)
                        x0 = max(mid + 0.25, x0)
许瑞's avatar
许瑞 committed
34
35
                    else:
                        mid = (ix0 + x1) // 2
36
37
                        ix0 = max(mid + 0.25, ix0)
                        x1 = min(mid -0.25, x1)
许瑞's avatar
许瑞 committed
38
39
40
                else:
                    if y1 >= iy1:
                        mid = (y0 + iy1) // 2
41
42
                        y0 = max(mid + 0.25, y0)
                        iy1 = min(iy1, mid-0.25)
许瑞's avatar
许瑞 committed
43
44
                    else:
                        mid = (iy0 + y1) // 2
45
46
                        y1 = min(y1, mid-0.25)
                        iy0 = max(mid + 0.25, iy0)
许瑞's avatar
许瑞 committed
47
48
49
50
51
52
53
54
55
                res[i]["bbox"] = [ix0, iy0, ix1, iy1]
                v["bbox"] = [x0, y0, x1, y1]

        res.append(v)
    return res


def remove_overlap_between_bbox(spans):
    """Public entry point: forward ``spans`` to ``_remove_overlap_between_bbox``.

    Returns whatever the private implementation returns, unchanged.
    """
    cleaned_spans = _remove_overlap_between_bbox(spans)
    return cleaned_spans