doc_analyze_by_pp_structurev2.py
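
"""
Layout analysis of a PDF via PaddleOCR's PPStructure: each page is rendered to an image
with PyMuPDF (fitz), layout detection (and optionally OCR) is run on it, and the
detections are returned as one dict per page containing "layout_dets" and "page_info".
"""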
import random

import fitz
import cv2
from paddleocr import PPStructure
from PIL import Image
from loguru import logger
import numpy as np

def region_to_bbox(region):
    """Convert a 4-point OCR text region (quad) to an axis-aligned [x0, y0, x1, y1] bbox."""
    x0 = region[0][0]
    y0 = region[0][1]
    x1 = region[2][0]
    y1 = region[2][1]
    return [x0, y0, x1, y1]


def dict_compare(d1, d2):
    """Return True when two dicts contain exactly the same key/value pairs."""
    return d1.items() == d2.items()


def remove_duplicates_dicts(lst):
    """Drop exact duplicate dicts from a list while preserving order."""
    unique_dicts = []
    for dict_item in lst:
        if not any(dict_compare(dict_item, existing_dict) for existing_dict in unique_dicts):
            unique_dicts.append(dict_item)
    return unique_dicts


def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
    # table recognition is disabled; `ocr` controls whether text inside each layout block is recognized
    ocr_engine = PPStructure(table=False, ocr=ocr, show_log=show_log)

    imgs = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for index in range(0, doc.page_count):
            page = doc[index]
            # render the page at 200 DPI (PDF user space is 72 DPI, hence the scaling matrix)
            dpi = 200
            mat = fitz.Matrix(dpi / 72, dpi / 72)
            pm = page.get_pixmap(matrix=mat, alpha=False)

            # if width or height > 2000 pixels, don't enlarge the image
            # if pm.width > 2000 or pm.height > 2000:
            #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

            # PyMuPDF returns RGB samples; convert to the BGR array layout expected by OpenCV/PaddleOCR
            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
            img_dict = {
                "img": img,
                "width": pm.width,
                "height": pm.height
            }
            imgs.append(img_dict)

    model_json = []
    for index, img_dict in enumerate(imgs):
        img = img_dict['img']
        page_width = img_dict['width']
        page_height = img_dict['height']
        result = ocr_engine(img)
        spans = []
        for line in result:
            line.pop('img')  # drop the per-block image crop that PPStructure attaches to each result
            '''
            Map paddle output types to category numbers:
            title: 0            # title
            text: 1             # body text
            header: 2           # abandon
            footer: 2           # abandon
            reference: 1        # body text or abandon
            equation: 8         # display equation (block)
            equation: 14        # display equation (text)
            figure: 3           # figure
            figure_caption: 4   # figure caption
            table: 5            # table
            table_caption: 6    # table caption
            '''
            if line['type'] == 'title':
                line['category_id'] = 0
            elif line['type'] in ['text', 'reference']:
                line['category_id'] = 1
            elif line['type'] == 'figure':
                line['category_id'] = 3
            elif line['type'] == 'figure_caption':
                line['category_id'] = 4
            elif line['type'] == 'table':
                line['category_id'] = 5
            elif line['type'] == 'table_caption':
                line['category_id'] = 6
            elif line['type'] == 'equation':
                line['category_id'] = 8
            elif line['type'] in ['header', 'footer']:
                line['category_id'] = 2
            else:
                logger.warning(f"unknown type: {line['type']}")
            # PPStructure's layout output carries no confidence score, so assign a placeholder in [0.5, 1.0)
            line['score'] = 0.5 + random.random() * 0.5

            # 'res' holds the OCR text lines detected inside this layout block;
            # re-emit each one as a text span with category_id 15
            res = line.pop('res', None)
            if res is not None and len(res) > 0:
                for span in res:
                    new_span = {'category_id': 15,
                                'bbox': region_to_bbox(span['text_region']),
                                'score': span['confidence'],
                                'text': span['text']
                                }
                    spans.append(new_span)

        if len(spans) > 0:
            result.extend(spans)

        result = remove_duplicates_dicts(result)

        page_info = {
            "page_no": index,
            "height": page_height,
            "width": page_width
        }
        page_dict = {
            "layout_dets": result,
            "page_info": page_info
        }

        model_json.append(page_dict)

    return model_json
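

# Minimal usage sketch (not part of the original module): run the analyzer on a local PDF
# and log a per-page summary. The input path "demo.pdf" is a placeholder, not a file that
# ships with this code.
if __name__ == "__main__":
    with open("demo.pdf", "rb") as f:
        pdf_bytes = f.read()

    layout = doc_analyze(pdf_bytes, ocr=True, show_log=False)
    for page in layout:
        info = page["page_info"]
        logger.info(
            f"page {info['page_no']}: {len(page['layout_dets'])} layout elements "
            f"({info['width']}x{info['height']} px)"
        )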