# ocr_cut_image.py
from magic_pdf.libs.commons import join_path
from magic_pdf.libs.ocr_content_type import ContentType
from magic_pdf.libs.pdf_image_tools import cut_image


def cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
    """Attach an ``image_path`` to every image or table span.

    Each span whose ``'type'`` equals ``ContentType.Image`` or
    ``ContentType.Table`` has its ``'bbox'`` passed to ``cut_image`` and the
    value returned by ``cut_image`` stored under ``span['image_path']``.
    Spans of any other type are left untouched.

    Args:
        spans: Iterable of span dicts; each must have a ``'type'`` key, and
            image/table spans must also have a ``'bbox'`` key.
        page: Page object forwarded unchanged to ``cut_image``.
        page_id: Page identifier forwarded unchanged to ``cut_image``.
        pdf_bytes_md5: Joined (via ``join_path``) with ``'images'`` or
            ``'tables'`` to build the ``return_path`` given to ``cut_image``.
        imageWriter: Writer object forwarded unchanged to ``cut_image``.

    Returns:
        The same ``spans`` object that was passed in; matching spans are
        mutated in place.
    """
    for span in spans:
        span_type = span['type']
        if span_type == ContentType.Image:
            sub_dir = 'images'
        elif span_type == ContentType.Table:
            sub_dir = 'tables'
        else:
            # Only image and table spans get a cropped picture.
            continue

        # The path prefix is built lazily, per matching span, so nothing is
        # computed when there is no image or table to cut.
        span['image_path'] = cut_image(
            span['bbox'],
            page_id,
            page,
            return_path=join_path(pdf_bytes_md5, sub_dir),
            imageWriter=imageWriter,
        )

    return spans