doc_analyze_by_custom_model.py 2.36 KB
Newer Older
赵小蒙's avatar
赵小蒙 committed
1
2
import fitz
import numpy as np
3
from loguru import logger
赵小蒙's avatar
赵小蒙 committed
4
from magic_pdf.model.model_list import MODEL
5
import magic_pdf.model as model_config
赵小蒙's avatar
赵小蒙 committed
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


def dict_compare(d1, d2):
    """Return whether the ``items()`` views of *d1* and *d2* compare equal.

    Insertion order does not matter for plain dicts: two dicts with the same
    key/value pairs compare equal regardless of the order they were built in.
    """
    left_items = d1.items()
    right_items = d2.items()
    return left_items == right_items


def remove_duplicates_dicts(lst):
    """Return the dicts of *lst* with later duplicates dropped.

    The first occurrence of each dict is kept and the original order is
    preserved. Two dicts count as duplicates when ``dict_compare`` says so.
    Values need not be hashable, which is why this is a pairwise scan rather
    than a set-based dedupe.
    """
    kept = []
    for candidate in lst:
        for seen in kept:
            if dict_compare(candidate, seen):
                # Already have an equal dict; skip this one.
                break
        else:
            kept.append(candidate)
    return kept


def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
    """Render every page of a PDF into an RGB image array.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Requested rendering resolution; each page is scaled by
            ``dpi / 72`` on both axes.

    Returns:
        A list with one dict per page, in page order. Each dict has the keys
        ``"img"`` (a numpy array converted from the RGB PIL image),
        ``"width"`` and ``"height"`` (size of the rendered pixmap).

    Raises:
        SystemExit: With exit code 1 if Pillow cannot be imported.
    """
    try:
        from PIL import Image
    except ImportError:
        logger.error("Pillow not installed, please install by pip.")
        exit(1)

    images = []
    scale = dpi / 72
    with fitz.open("pdf", pdf_bytes) as doc:
        for index in range(0, doc.page_count):
            page = doc[index]
            pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)

            # If the scaled render exceeds 3000 pixels on either side, don't
            # enlarge the page: re-render it at 1:1 instead. The same ``pix``
            # variable is used from here on so the fallback actually takes
            # effect in the returned image and dimensions.
            if pix.width > 3000 or pix.height > 3000:
                pix = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            img = np.array(img)
            img_dict = {"img": img, "width": pix.width, "height": pix.height}
            images.append(img_dict)
    return images


def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
    """Run the layout model on every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        ocr: Forwarded to ``CustomPaddleModel`` as ``ocr``.
        show_log: Forwarded to ``CustomPaddleModel`` as ``show_log``.
        model: Which model backend to use. Only ``MODEL.Paddle`` is handled
            here.

    Returns:
        A list with one dict per page, in page order. Each dict has
        ``"layout_dets"`` (whatever the model returns for that page image) and
        ``"page_info"`` (``page_no``, ``height`` and ``width`` of the rendered
        page image).

    Raises:
        SystemExit: With exit code 1 if the inside model is disabled via
            ``model_config.__use_inside_model__`` or if ``model`` is not a
            supported backend.
    """
    if model_config.__use_inside_model__:
        from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
    else:
        logger.error("use_inside_model is False, not allow to use inside model")
        exit(1)

    # Reject unknown backends up front. Previously the model was silently left
    # as None and the first page failed with "'NoneType' object is not
    # callable".
    if model != MODEL.Paddle:
        logger.error(f"Unsupported model: {model}, only MODEL.Paddle is supported")
        exit(1)

    images = load_images_from_pdf(pdf_bytes)
    custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(images):
        img = img_dict["img"]
        page_width = img_dict["width"]
        page_height = img_dict["height"]
        result = custom_model(img)
        page_info = {"page_no": index, "height": page_height, "width": page_width}
        page_dict = {"layout_dets": result, "page_info": page_info}

        model_json.append(page_dict)

    # TODO: move formula recognition to a post-processing step, and fill in
    # the formula data once the model results for the whole document are
    # available.

    return model_json