Commit 389826c5 authored by 赵小蒙
Browse files

update custom model framework

parent c96aa88d
import fitz
import cv2
from PIL import Image
import numpy as np
from magic_pdf.model.model_list import MODEL
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
def dict_compare(d1, d2):
    """Tell whether two dictionaries hold exactly the same key/value pairs.

    Args:
        d1: First dictionary.
        d2: Second dictionary.

    Returns:
        The result of comparing the items views of both dictionaries.
    """
    first_items = d1.items()
    second_items = d2.items()
    return first_items == second_items
def remove_duplicates_dicts(lst):
    """Drop repeated dictionaries from a list, keeping first occurrences.

    Dictionaries are compared with ``dict_compare`` against everything kept
    so far, so the input order of the surviving entries is preserved.

    Args:
        lst: Iterable of dictionaries.

    Returns:
        A new list containing each distinct dictionary once.
    """
    kept = []
    for candidate in lst:
        for seen in kept:
            if dict_compare(candidate, seen):
                # An equal dictionary was already kept; skip this one.
                break
        else:
            kept.append(candidate)
    return kept
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
    """Render every page of an in-memory PDF into an image array.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution; each page is scaled by ``dpi / 72``.

    Returns:
        One dict per page, in page order, with the keys ``"img"`` (the
        rendered page after ``cv2.COLOR_RGB2BGR`` conversion), ``"width"``
        and ``"height"`` (the pixmap dimensions).
    """
    scale = dpi / 72
    rendered_pages = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for page_no in range(doc.page_count):
            pixmap = doc[page_no].get_pixmap(
                matrix=fitz.Matrix(scale, scale), alpha=False
            )
            # Pages are always rendered at the requested scale; large pages
            # are not rendered at a reduced size.
            rgb_image = Image.frombytes(
                "RGB", [pixmap.width, pixmap.height], pixmap.samples
            )
            bgr_array = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
            rendered_pages.append(
                {"img": bgr_array, "width": pixmap.width, "height": pixmap.height}
            )
    return rendered_pages
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
    """Run the selected layout model on every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        ocr: Forwarded to the model constructor.
        show_log: Forwarded to the model constructor.
        model: Identifier of the backend to use; only ``MODEL.Paddle`` is
            handled.

    Returns:
        A list with one dict per page. ``"layout_dets"`` holds the model
        output for the page image and ``"page_info"`` holds ``page_no``,
        ``height`` and ``width``.

    Raises:
        ValueError: If ``model`` is not a supported backend.
    """
    # Reject unknown backends up front instead of failing later with
    # "'NoneType' object is not callable" on the first page.
    if model != MODEL.Paddle:
        raise ValueError(f"Unsupported model: {model!r}")

    images = load_images_from_pdf(pdf_bytes)
    custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(images):
        result = custom_model(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})
    return model_json
class MODEL:
    # Namespace of model identifiers; ``doc_analyze`` compares its ``model``
    # argument against these values to pick the implementation.
    Paddle = "pp_structure_v2"
import random
import fitz
import cv2
from paddleocr import PPStructure
from PIL import Image
from loguru import logger
import numpy as np
from paddleocr import PPStructure
def region_to_bbox(region):
......@@ -16,41 +12,8 @@ def region_to_bbox(region):
return [x0, y0, x1, y1]
def dict_compare(d1, d2):
    """Report whether two dictionaries have identical key/value pairs.

    Args:
        d1: First dictionary.
        d2: Second dictionary.

    Returns:
        The result of comparing the items views of both dictionaries.
    """
    items_a, items_b = d1.items(), d2.items()
    return items_a == items_b
def remove_duplicates_dicts(lst):
    """Return the dictionaries of ``lst`` with later duplicates removed.

    Each element is checked with ``dict_compare`` against the dictionaries
    already accepted; the first occurrence of each distinct dictionary is
    kept and the original ordering is preserved.

    Args:
        lst: Iterable of dictionaries.

    Returns:
        A new list of the distinct dictionaries.
    """
    distinct = []
    for entry in lst:
        already_present = False
        for accepted in distinct:
            if dict_compare(entry, accepted):
                already_present = True
                break
        if not already_present:
            distinct.append(entry)
    return distinct
def load_imags_from_pdf(pdf_bytes: bytes, dpi=200):
    """Render every page of an in-memory PDF into an image array.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution; each page is scaled by ``dpi / 72``.
            The value passed by the caller is honoured (it is no longer
            overwritten with 200 inside the loop).

    Returns:
        One dict per page, in page order, with the keys ``"img"`` (the
        rendered page after ``cv2.COLOR_RGB2BGR`` conversion), ``"width"``
        and ``"height"`` (the pixmap dimensions).
    """
    imgs = []
    zoom = dpi / 72  # same scale for every page, so compute it once
    with fitz.open("pdf", pdf_bytes) as doc:
        for index in range(doc.page_count):
            page = doc[index]
            pm = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
            imgs.append({"img": img, "width": pm.width, "height": pm.height})
    # The caller iterates over the result, so the list must be returned.
    return imgs
class CustomPaddleModel:
def __init___(self, ocr: bool = False, show_log: bool = False):
def __init__(self, ocr: bool = False, show_log: bool = False):
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
def __call__(self, img):
......@@ -109,23 +72,4 @@ class CustomPaddleModel:
if len(spans) > 0:
result.extend(spans)
result = remove_duplicates_dicts(result)
return result
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
    """Run ``CustomPaddleModel`` on every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        ocr: Forwarded to ``CustomPaddleModel``.
        show_log: Forwarded to ``CustomPaddleModel``.

    Returns:
        A list with one dict per page. ``"layout_dets"`` holds the model
        output for the page image and ``"page_info"`` holds ``page_no``,
        ``height`` and ``width``.
    """
    imgs = load_imags_from_pdf(pdf_bytes)
    # Forward the caller's options; previously they were accepted but ignored.
    custom_paddle = CustomPaddleModel(ocr=ocr, show_log=show_log)
    model_json = []
    for index, img_dict in enumerate(imgs):
        result = custom_paddle(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})
    return model_json
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf
......
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe
......
......@@ -3,7 +3,7 @@ import json
from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path
......
......@@ -16,7 +16,7 @@ import re
from loguru import logger
from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
......@@ -104,11 +104,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return garbage_count / total
def calculate_not_printable_rate(text):
    """Return the fraction of characters in ``text`` that are not printable.

    A character counts as printable when ``str.isprintable`` is true for it.

    Args:
        text: The string to inspect.

    Returns:
        ``0`` for an empty string, otherwise
        ``(len(text) - printable_count) / len(text)``.
    """
    total = len(text)
    if total == 0:
        return 0  # avoid division by zero
    # Count printable characters directly instead of building a filtered
    # copy of the string by repeated concatenation.
    printable = sum(1 for c in text if c.isprintable())
    return (total - printable) / total
not_common_character_rate = calculate_not_common_character_rate(text_all)
not_printable_rate = calculate_not_printable_rate(text_all)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment