from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image


def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
    """Crop every image and table span out of a page and record the result path.

    In the Spark environment ``book_name`` is the ``pdf_bytes_md5``; in a
    local environment the regular book name is passed.

    Args:
        spans: Iterable of span dicts. Each span must have a ``'type'`` key;
            image and table spans must also have a ``'bbox'`` key.
        page: Page object, forwarded to ``cut_image`` unchanged.
        page_id: Page identifier, forwarded to ``cut_image`` unchanged.
        book_name: Prefix joined with the category directory (``'images'``
            or ``'tables'``) to form the path passed as ``s3_return_path``.
        save_path: Root that the ``book_name``/category path is joined onto
            to form the save path passed to ``cut_image``.
        img_s3_client: Client object, forwarded to ``cut_image`` unchanged.

    Returns:
        The same ``spans`` object. Image and table spans are mutated in
        place: ``span['image_path']`` is set to whatever ``cut_image``
        returns. Spans of any other type are left untouched.

    Raises:
        KeyError: If a span has no ``'type'`` key, or an image/table span
            has no ``'bbox'`` key.
    """

    def s3_return_path(category):
        # Path relative to the storage root: <book_name>/<category>.
        return join_path(book_name, category)

    def img_save_path(category):
        # Full save location: <save_path>/<book_name>/<category>.
        return join_path(save_path, s3_return_path(category))

    for span in spans:
        span_type = span['type']

        # Only the target sub-directory differs between images and tables,
        # so select it here and share a single cut_image call below.
        if span_type == ContentType.Image:
            category = 'images'
        elif span_type == ContentType.Table:
            category = 'tables'
        else:
            continue

        span['image_path'] = cut_image(
            span['bbox'],
            page_id,
            page,
            img_save_path(category),
            s3_return_path=s3_return_path(category),
            img_s3_client=img_s3_client,
        )

    return spans