# doc_analyze_by_pp_structurev2.py
import random

import fitz
import cv2
from paddleocr import PPStructure
from PIL import Image
from loguru import logger
import numpy as np

blue's avatar
blue committed
10

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def region_to_bbox(region):
    """Build an ``[x0, y0, x1, y1]`` list from a point sequence.

    The first two coordinates come from ``region[0]`` and the last two from
    ``region[2]``.

    NOTE(review): presumably ``region`` is a 4-point quadrilateral whose
    first and third points are opposite corners -- confirm against caller.
    """
    return [region[point][axis] for point in (0, 2) for axis in (0, 1)]


def dict_compare(d1, d2):
    """Return True when both mappings hold the same key/value pairs."""
    left_items = d1.items()
    right_items = d2.items()
    return left_items == right_items


def remove_duplicates_dicts(lst):
    """Return a new list with repeated dicts dropped, keeping first occurrences.

    Two dicts count as duplicates when ``dict_compare`` reports them equal.
    The order of the surviving items follows their order in ``lst``.
    """
    kept = []
    for candidate in lst:
        for seen in kept:
            if dict_compare(candidate, seen):
                # An equal dict was already kept; skip this one.
                break
        else:
            kept.append(candidate)
    return kept

blue's avatar
blue committed
32
33

def load_imags_from_pdf(pdf_bytes: bytes, dpi=200):
    """Render every page of a PDF into an OpenCV-style BGR image.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution. Defaults to 200.

    Returns:
        A list with one dict per page, in page order. Each dict has the keys
        ``"img"`` (the page as a BGR numpy array), ``"width"`` and
        ``"height"`` (the rendered pixmap size in pixels).
    """
    imgs = []
    # PDF page units are 1/72 inch, so dpi / 72 is the zoom factor.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)
    with fitz.open("pdf", pdf_bytes) as doc:
        for index in range(doc.page_count):
            page = doc[index]
            # NOTE: large pages are rendered at the requested dpi as well;
            # no size cap is applied here.
            pm = page.get_pixmap(matrix=mat, alpha=False)

            # The pixmap holds RGB samples (alpha disabled); convert to the
            # BGR channel order via cv2.
            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
            imgs.append({"img": img, "width": pm.width, "height": pm.height})
    return imgs

blue's avatar
blue committed
51
52
53
54
55
56
57

class CustomPaddleModel:
    """Callable wrapper around ``PPStructure`` that tags results with category ids.

    Calling an instance with an image runs the layout model, annotates each
    detected region with a numeric ``category_id`` and appends the text spans
    found inside the regions as separate entries.
    """

    # Numeric category id assigned to each PPStructure layout type.
    # (Id 14, "inline formula text", is not produced by this mapping.)
    _TYPE_TO_CATEGORY_ID = {
        "title": 0,
        "text": 1,
        "reference": 1,  # handled as plain text
        "header": 2,  # abandon
        "footer": 2,  # abandon
        "figure": 3,
        "figure_caption": 4,
        "table": 5,
        "table_caption": 6,
        "equation": 8,  # display formula block
    }
    # Category id given to the text spans extracted from a region's "res".
    _TEXT_SPAN_CATEGORY_ID = 15

    def __init__(self, ocr: bool = False, show_log: bool = False):
        """Create the underlying ``PPStructure`` model (table recognition off).

        Args:
            ocr: Forwarded to ``PPStructure`` as ``ocr``.
            show_log: Forwarded to ``PPStructure`` as ``show_log``.
        """
        self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)

    def __call__(self, img):
        """Run the model on ``img`` and return the adapted detection list.

        Args:
            img: Image passed straight to the ``PPStructure`` model.

        Returns:
            A list of dicts: the model's region dicts (with ``"img"`` and
            ``"res"`` removed, and ``"category_id"`` / ``"score"`` added)
            followed by one dict per text span, with exact duplicates removed.
        """
        result = self.model(img)
        spans = []
        for line in result:
            # The cropped region image is not part of the returned data.
            line.pop("img", None)

            category_id = self._TYPE_TO_CATEGORY_ID.get(line["type"])
            if category_id is None:
                # Unknown types keep no category_id, as before; only warn.
                logger.warning(f"unknown type: {line['type']}")
            else:
                line["category_id"] = category_id

            # Compatibility with paddleocr versions that do not output a
            # score: substitute a random value in [0.5, 1.0).
            if line.get("score") is None:
                line["score"] = 0.5 + random.random() * 0.5

            # Lift the text spans nested in the region into flat entries.
            res = line.pop("res", None)
            for span in res or ():
                spans.append(
                    {
                        "category_id": self._TEXT_SPAN_CATEGORY_ID,
                        "bbox": region_to_bbox(span["text_region"]),
                        "score": span["confidence"],
                        "text": span["text"],
                    }
                )

        if spans:
            result.extend(spans)

        return remove_duplicates_dicts(result)


def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
    """Run layout analysis on every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        ocr: Forwarded to ``CustomPaddleModel``.
        show_log: Forwarded to ``CustomPaddleModel``.

    Returns:
        A list with one dict per page. Each dict has ``"layout_dets"`` (the
        model output for that page) and ``"page_info"`` (a dict with
        ``"page_no"``, ``"height"`` and ``"width"``).
    """
    imgs = load_imags_from_pdf(pdf_bytes)
    # Forward the caller's flags instead of silently using the defaults.
    custom_paddle = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(imgs):
        result = custom_paddle(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})

    return model_json