# draw.py (5.5 KB) -- last committed by 赵小蒙
# NOTE: repository-viewer residue (the "Newer/Older" navigation, the avatar caption and the
# 1-123 line-number gutter) was collapsed into this header so the file parses as Python.
import sys

from libs.commons import fitz

from para.commons import *


# Force UTF-8 on stdout so non-ASCII text prints regardless of the console's
# default encoding.  ``reconfigure`` only exists on ``io.TextIOWrapper`` from
# Python 3.7 onward, and stdout may have been replaced by a stream without it
# (``io.StringIO``, notebook streams) or be ``None``; probe for the method
# instead of trusting the major version so importing this module never fails.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if callable(_reconfigure_stdout):
    _reconfigure_stdout(encoding="utf-8")


class DrawAnnos:
    """
    This class draws rectangle annotations for paragraph boxes on a pdf file.

    Paragraph boxes are read from a per-page dictionary (see ``draw_annos``)
    and rendered as rectangle annotations whose stroke color encodes the kind
    of box:

        Green (0, 1, 0): a paragraph with a single bbox
        Cyan  (0, 1, 1): a paragraph made of several bboxes (combined paragraph)
        Blue  (0, 0, 1): a paragraph flagged as a title (drawn in addition to the above)

    ----------------------------------------
                Color Code
    ----------------------------------------
        Red: (1, 0, 0)
        Green: (0, 1, 0)
        Blue: (0, 0, 1)
        Yellow: (1, 1, 0) - mix of red and green
        Cyan: (0, 1, 1) - mix of green and blue
        Magenta: (1, 0, 1) - mix of red and blue
        White: (1, 1, 1) - red, green and blue full intensity
        Black: (0, 0, 0) - no color component whatsoever
        Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components
        Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component
    """

    def __init__(self) -> None:
        pass

    def __is_nested_list(self, lst):
        """
        This function returns True if the given list is a nested list of any degree.

        A list is nested as soon as one of its direct elements is itself a
        list, so a single non-recursive scan is sufficient.  Non-list inputs
        (tuples, numbers, None, ...) and empty lists return False.
        """
        return isinstance(lst, list) and any(isinstance(item, list) for item in lst)

    def __valid_rect(self, bbox):
        """
        This function returns True if bbox is a flat, non-empty rectangle.

        Parameters
        ----------
        bbox : sequence
            candidate rectangle, read as (x0, y0, x1, y1)

        Returns
        -------
        bool
            True only when x0 < x1 and y0 < y1.  Nested lists, sequences with
            fewer than four items, non-indexable values (e.g. None) and
            coordinates that cannot be compared are all reported as invalid
            instead of raising.
        """
        try:
            if isinstance(bbox[0], list):
                return False  # It's a nested list, hence it can't be a valid rect
            return bool(bbox[0] < bbox[2] and bbox[1] < bbox[3])
        except (TypeError, IndexError, KeyError):
            return False

    def __add_rect_annot(self, page, bbox, color, width):
        """
        This function adds one rectangle annotation to the page.

        Parameters
        ----------
        page : fitz.Page
            page
        bbox : list
            flat bbox, already checked with __valid_rect
        color : tuple
            stroke color as an (r, g, b) tuple with components in [0, 1]
        width : float
            border width
        """
        anno = page.add_rect_annot(fitz.Rect(bbox))
        anno.set_colors(stroke=color)
        anno.set_border(width=width)
        anno.update()  # the color/border changes only take effect after update()

    def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)):
        """
        This function draws the nested boxes

        Parameters
        ----------
        page : fitz.Page
            page
        nested_bbox : list
            nested bbox
        color : tuple
            color, by default (0, 1, 1)    # draw with cyan color for combined paragraph
        """
        if self.__is_nested_list(nested_bbox):  # If it's a nested list
            for bbox in nested_bbox:
                self.__draw_nested_boxes(page, bbox, color)  # Recursively call the function
        elif self.__valid_rect(nested_bbox):  # If valid rectangle
            self.__add_rect_annot(page, nested_bbox, color, width=1)

    def __draw_para_bbox(self, page, para_bbox, combined_color, single_color):
        """
        This function draws the bbox of one paragraph.

        Parameters
        ----------
        page : fitz.Page
            page
        para_bbox : list
            either a flat bbox or a nested list of bboxes
        combined_color : tuple
            stroke color used when para_bbox is a nested list with more than
            one entry (each inner box is drawn with border width 1)
        single_color : tuple
            stroke color used when para_bbox is a flat, valid rectangle
            (drawn with border width 0.5)
        """
        if self.__is_nested_list(para_bbox) and len(para_bbox) > 1:
            self.__draw_nested_boxes(page, para_bbox, combined_color)
        elif self.__valid_rect(para_bbox):
            # NOTE(review): a nested list holding a single box ends up here and
            # is rejected by __valid_rect, so nothing is drawn for it.  This
            # matches the previous behavior; confirm whether it is intended.
            self.__add_rect_annot(page, para_bbox, single_color, width=0.5)

    def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path):
        """
        This function draws the paragraph boxes on the pdf and saves the result.

        Parameters
        ----------
        input_pdf_path : str
            path of the pdf to annotate
        pdf_dic : dict or None
            page data keyed by "page_<index>" (0-based).  For each page the
            "para_blocks" entry is a list of blocks; a block's "paras" entry
            maps paragraph keys to dicts that provide "para_bbox" and
            "is_para_title".  None, or pages/keys that are missing, simply
            result in no annotation for that page/paragraph.
        output_pdf_path : str or None
            where to save the annotated pdf.  When None, "_anno" is inserted
            before the extension of input_pdf_path (".pdf" is appended if the
            input has no extension).
        """
        import os  # local import: keeps this method self-contained

        if pdf_dic is None:
            pdf_dic = {}

        if output_pdf_path is None:
            # Only the final extension is touched; a plain str.replace would
            # also rewrite ".pdf" inside directory names and would return the
            # input path unchanged for ".PDF" or extension-less files.
            root, ext = os.path.splitext(input_pdf_path)
            output_pdf_path = f"{root}_anno{ext or '.pdf'}"

        pdf_doc = open_pdf(input_pdf_path)
        try:
            for page_id, page in enumerate(pdf_doc):  # type: ignore
                page_data = pdf_dic.get(f"page_{page_id}") or {}
                for para_block in page_data.get("para_blocks") or []:
                    if "paras" not in para_block:
                        continue
                    for para_content in para_block["paras"].values():
                        para_bbox = para_content.get("para_bbox")
                        if para_bbox is None:
                            continue

                        # cyan for a combined paragraph, green for a normal paragraph
                        self.__draw_para_bbox(page, para_bbox, (0, 1, 1), (0, 1, 0))

                        if para_content.get("is_para_title"):
                            # Titles get a blue box on top of the paragraph box
                            # drawn above (both annotations are kept).
                            self.__draw_para_bbox(page, para_bbox, (0, 0, 1), (0, 0, 1))

            pdf_doc.save(output_pdf_path)
        finally:
            # Release the document even when drawing or saving fails.
            pdf_doc.close()