doc_analyze_by_custom_model.py 2.18 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
3
4
import fitz
import cv2
from PIL import Image
import numpy as np
5
from loguru import logger
赵小蒙's avatar
赵小蒙 committed
6
7

from magic_pdf.model.model_list import MODEL
8
import magic_pdf.model as model_config
赵小蒙's avatar
赵小蒙 committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44


def dict_compare(d1, d2):
    """Return True when both mappings hold exactly the same key/value pairs.

    The comparison is done on the ``items()`` views of the two arguments, so
    insertion order does not matter.
    """
    left_items = d1.items()
    right_items = d2.items()
    return left_items == right_items


def remove_duplicates_dicts(lst):
    """Return the dicts of *lst* with later duplicates dropped.

    Two dicts count as duplicates when ``dict_compare`` reports them equal.
    The first occurrence of each distinct dict is kept and the original
    order is preserved. The input list itself is not modified.
    """
    deduped = []
    for candidate in lst:
        for kept in deduped:
            if dict_compare(candidate, kept):
                # An equal dict was already collected; skip this one.
                break
        else:
            # No match found among the dicts kept so far.
            deduped.append(candidate)
    return deduped


def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
    """Render every page of an in-memory PDF to a BGR image array.

    Args:
        pdf_bytes: Raw content of the PDF document.
        dpi: Rendering resolution; each page is scaled by ``dpi / 72`` on
            both axes.

    Returns:
        One dict per page, in page order, with the keys ``"img"`` (the
        rendered page converted from RGB to BGR channel order), ``"width"``
        and ``"height"`` (the pixel size of the rendered pixmap).
    """
    rendered_pages = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for page_no in range(doc.page_count):
            zoom = fitz.Matrix(dpi / 72, dpi / 72)
            # alpha=False yields a 3-channel pixmap, matching the "RGB" mode below.
            pixmap = doc[page_no].get_pixmap(matrix=zoom, alpha=False)

            # NOTE: a disabled variant used to fall back to an unscaled render
            # (Matrix(1, 1)) when either side exceeded 2000 pixels.

            rgb_image = Image.frombytes(
                "RGB", [pixmap.width, pixmap.height], pixmap.samples
            )
            bgr_array = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
            rendered_pages.append(
                {"img": bgr_array, "width": pixmap.width, "height": pixmap.height}
            )
    return rendered_pages


def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
    """Run the selected layout model over every page of a PDF.

    Args:
        pdf_bytes: Raw content of the PDF document.
        ocr: Passed through to the model constructor as ``ocr``.
        show_log: Passed through to the model constructor as ``show_log``.
        model: Backend selector. Only ``MODEL.Paddle`` is handled here.

    Returns:
        A list with one dict per page, in page order. Each dict has
        ``"layout_dets"`` (whatever the model returns for the page image)
        and ``"page_info"`` (``"page_no"``, ``"height"``, ``"width"``, where
        the sizes are those of the rendered page image in pixels).

    Raises:
        SystemExit: With status 1 when ``model_config.__use_inside_model__``
            is falsy.
        ValueError: When ``model`` is not a supported backend.
    """
    if not model_config.__use_inside_model__:
        logger.error("use_inside_model is False, not allow to use inside model")
        # Same effect as the interactive ``exit(1)`` helper, without relying
        # on the ``site`` module having installed it.
        raise SystemExit(1)

    # Imported here rather than at module top, so it only runs once the
    # flag check above has passed.
    from magic_pdf.model.pp_structure_v2 import CustomPaddleModel

    # Reject unknown backends before rendering any page; otherwise the
    # failure would only surface later as "'NoneType' object is not callable".
    if model != MODEL.Paddle:
        raise ValueError(f"unsupported model: {model!r}")

    images = load_images_from_pdf(pdf_bytes)
    custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for page_no, img_dict in enumerate(images):
        page_info = {
            "page_no": page_no,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        layout_dets = custom_model(img_dict["img"])
        model_json.append({"layout_dets": layout_dets, "page_info": page_info})

    return model_json