doc_analyze_by_custom_model.py 2.31 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
import fitz
import numpy as np
3
from loguru import logger
赵小蒙's avatar
赵小蒙 committed
4
from magic_pdf.model.model_list import MODEL
5
import magic_pdf.model as model_config
赵小蒙's avatar
赵小蒙 committed
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


def dict_compare(d1, d2):
    """Report whether two mappings hold exactly the same key/value pairs.

    The check is performed on the ``items()`` views of both arguments, so
    for built-in dicts the insertion order of the keys does not matter.

    Args:
        d1: First mapping (any object exposing ``items()``).
        d2: Second mapping (any object exposing ``items()``).

    Returns:
        The result of comparing the two item views for equality.
    """
    first_items = d1.items()
    second_items = d2.items()
    return first_items == second_items


def remove_duplicates_dicts(lst):
    """Return the dicts of ``lst`` with repeated entries dropped.

    The first occurrence of each distinct dict is kept and the original
    relative order is preserved. Dicts are not hashable, so every candidate
    is compared against the already-kept entries with ``dict_compare``
    instead of going through a set.

    Args:
        lst: Iterable of dicts to de-duplicate.

    Returns:
        A new list holding the first occurrence of every distinct dict.
    """
    kept = []
    for candidate in lst:
        for seen in kept:
            if dict_compare(candidate, seen):
                # An equal dict was already kept; skip this one.
                break
        else:
            # The scan finished without a match, so this dict is new.
            kept.append(candidate)
    return kept


def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
    """Render every page of a PDF into an OpenCV-style BGR image array.

    Args:
        pdf_bytes: Raw content of the PDF document.
        dpi: Target rendering resolution. Each page is scaled by
            ``dpi / 72`` on both axes before rasterisation.

    Returns:
        One dict per page, in page order, with the keys ``"img"`` (the
        rendered page as a NumPy array converted from RGB to BGR channel
        order), ``"width"`` and ``"height"`` (pixmap size in pixels).

    Raises:
        ImportError: If opencv-python or Pillow cannot be imported.
    """
    try:
        import cv2
        from PIL import Image
    except ImportError:
        logger.error("opencv-python and Pillow are not installed, please install by pip.")
        # Fail here with the real cause; continuing would only surface later
        # as a NameError on ``Image`` / ``cv2``.
        raise

    # The zoom matrix depends only on ``dpi``, so build it once for all pages.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    images = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for page in doc:
            # NOTE: no upper bound is applied to the rendered size; large
            # pages are rasterised at the full requested dpi.
            pm = page.get_pixmap(matrix=mat, alpha=False)

            # alpha=False yields tightly packed RGB samples, which Pillow
            # wraps and OpenCV then reorders to BGR.
            rgb = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
            bgr = cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR)
            images.append({"img": bgr, "width": pm.width, "height": pm.height})
    return images


def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
    """Run the built-in layout model on every page of a PDF.

    Args:
        pdf_bytes: Raw content of the PDF document.
        ocr: Forwarded to ``CustomPaddleModel`` as its ``ocr`` argument.
        show_log: Forwarded to ``CustomPaddleModel`` as its ``show_log``
            argument.
        model: Which model backend to use. Only ``MODEL.Paddle`` is handled.

    Returns:
        A list with one dict per page, in page order. Each dict has
        ``"layout_dets"`` (whatever the model returned for that page image)
        and ``"page_info"`` (``"page_no"`` as a zero-based index plus the
        rendered ``"height"`` and ``"width"`` in pixels).

    Raises:
        SystemExit: If ``model_config.__use_inside_model__`` is falsy.
        ValueError: If ``model`` is not a supported backend.
    """
    if not model_config.__use_inside_model__:
        logger.error("use_inside_model is False, not allow to use inside model")
        # Same exception the ``exit(1)`` builtin raises, but without relying
        # on the ``site`` module having injected ``exit``.
        raise SystemExit(1)

    # Imported lazily so the model package is only needed when the inside
    # model is actually enabled.
    from magic_pdf.model.pp_structure_v2 import CustomPaddleModel

    # Reject unknown backends before doing any expensive rendering; without
    # this check the per-page call below would be made on ``None``.
    if model != MODEL.Paddle:
        logger.error(f"unsupported model: {model!r}")
        raise ValueError(f"unsupported model: {model!r}")

    images = load_images_from_pdf(pdf_bytes)
    custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(images):
        result = custom_model(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})

    return model_json