Commit 389826c5 authored by 赵小蒙
Browse files

update custom model framework

parent c96aa88d
import fitz
import cv2
from PIL import Image
import numpy as np
from magic_pdf.model.model_list import MODEL
from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
def dict_compare(d1, d2):
    """Tell whether two dicts contain exactly the same key/value pairs.

    Args:
        d1: First mapping to compare.
        d2: Second mapping to compare.

    Returns:
        True when the item views of both mappings compare equal,
        False otherwise.
    """
    left_items = d1.items()
    right_items = d2.items()
    return left_items == right_items
def remove_duplicates_dicts(lst):
    """Drop repeated dicts from ``lst`` while keeping first-seen order.

    Two dicts count as duplicates when their item views compare equal.
    Values may be unhashable (e.g. lists), which is why a pairwise
    comparison is used rather than a set.

    Args:
        lst: Iterable of dicts.

    Returns:
        A new list holding the first occurrence of every distinct dict.
    """
    kept = []
    for candidate in lst:
        for seen in kept:
            if candidate.items() == seen.items():
                # Already have an equal dict; skip this one.
                break
        else:
            kept.append(candidate)
    return kept
def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
    """Render every page of an in-memory PDF into a BGR image array.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Target resolution; each page is scaled by ``dpi / 72`` on
            both axes before rasterising.

    Returns:
        One dict per page, in page order, with the keys ``"img"`` (the
        array returned by ``cv2.cvtColor(..., cv2.COLOR_RGB2BGR)``),
        ``"width"`` and ``"height"`` (pixmap size in pixels).
    """
    zoom = dpi / 72
    rendered_pages = []
    with fitz.open("pdf", pdf_bytes) as doc:
        for page_no in range(doc.page_count):
            # NOTE: very large pages are not downscaled; an earlier
            # ">2000 px" guard only ever existed as commented-out code.
            pixmap = doc[page_no].get_pixmap(
                matrix=fitz.Matrix(zoom, zoom), alpha=False
            )
            width, height = pixmap.width, pixmap.height
            rgb_image = Image.frombytes("RGB", [width, height], pixmap.samples)
            bgr_array = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
            rendered_pages.append(
                {"img": bgr_array, "width": width, "height": height}
            )
    return rendered_pages
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
    """Run the selected model on every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        ocr: Forwarded to the model constructor.
        show_log: Forwarded to the model constructor.
        model: Which model to use; only ``MODEL.Paddle`` is supported.

    Returns:
        A list with one dict per page:
        ``{"layout_dets": <model output>, "page_info": {"page_no", "height", "width"}}``.

    Raises:
        ValueError: If ``model`` is not a supported model identifier.
    """
    # Fail fast with a clear message instead of later calling ``None``
    # ("'NoneType' object is not callable") after rendering every page.
    if model != MODEL.Paddle:
        raise ValueError(f"Unsupported model: {model!r}")

    images = load_images_from_pdf(pdf_bytes)
    custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(images):
        result = custom_model(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})
    return model_json
class MODEL:
    """Identifiers of the models that ``doc_analyze`` can be asked to use."""

    # Selects the CustomPaddleModel wrapper in doc_analyze.
    Paddle = "pp_structure_v2"
import random import random
import fitz
import cv2
from paddleocr import PPStructure
from PIL import Image
from loguru import logger from loguru import logger
import numpy as np from paddleocr import PPStructure
def region_to_bbox(region): def region_to_bbox(region):
...@@ -16,41 +12,8 @@ def region_to_bbox(region): ...@@ -16,41 +12,8 @@ def region_to_bbox(region):
return [x0, y0, x1, y1] return [x0, y0, x1, y1]
def dict_compare(d1, d2):
    """Return True if ``d1`` and ``d2`` hold identical key/value pairs.

    The comparison is done on the dicts' item views, so insertion order
    is irrelevant and values only need to support ``==``.
    """
    pairs_a, pairs_b = d1.items(), d2.items()
    return pairs_a == pairs_b
def remove_duplicates_dicts(lst):
    """Return the dicts of ``lst`` with later duplicates removed.

    Order of first appearance is preserved. Equality is decided by
    comparing item views pairwise, so dicts with unhashable values
    (such as lists) are handled.
    """
    unique = []
    for entry in lst:
        already_seen = False
        for previous in unique:
            if entry.items() == previous.items():
                already_seen = True
                break
        if not already_seen:
            unique.append(entry)
    return unique
def load_imags_from_pdf(pdf_bytes: bytes, dpi=200):
    """Render every page of an in-memory PDF into a BGR image array.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Target resolution; each page is scaled by ``dpi / 72`` on
            both axes. (Previously this argument was overwritten with a
            hard-coded 200 inside the loop and therefore had no effect.)

    Returns:
        One dict per page, in page order, with the keys ``"img"`` (the
        array returned by ``cv2.cvtColor(..., cv2.COLOR_RGB2BGR)``),
        ``"width"`` and ``"height"`` (pixmap size in pixels).
    """
    imgs = []
    # The scaling matrix does not depend on the page, so build it once.
    mat = fitz.Matrix(dpi / 72, dpi / 72)
    with fitz.open("pdf", pdf_bytes) as doc:
        for index in range(0, doc.page_count):
            page = doc[index]
            pm = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
            imgs.append({"img": img, "width": pm.width, "height": pm.height})
    # Callers iterate over the result, so the list must be returned.
    return imgs
class CustomPaddleModel: class CustomPaddleModel:
def __init___(self, ocr: bool = False, show_log: bool = False): def __init__(self, ocr: bool = False, show_log: bool = False):
self.model = PPStructure(table=False, ocr=ocr, show_log=show_log) self.model = PPStructure(table=False, ocr=ocr, show_log=show_log)
def __call__(self, img): def __call__(self, img):
...@@ -109,23 +72,4 @@ class CustomPaddleModel: ...@@ -109,23 +72,4 @@ class CustomPaddleModel:
if len(spans) > 0: if len(spans) > 0:
result.extend(spans) result.extend(spans)
result = remove_duplicates_dicts(result)
return result return result
def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
    """Run the Paddle structure model on every page of a PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        ocr: Forwarded to ``CustomPaddleModel``.
        show_log: Forwarded to ``CustomPaddleModel``.

    Returns:
        A list with one dict per page:
        ``{"layout_dets": <model output>, "page_info": {"page_no", "height", "width"}}``.
    """
    imgs = load_imags_from_pdf(pdf_bytes)
    # Pass the caller's options through; they were previously accepted
    # but silently ignored because the model was built with defaults.
    custom_paddle = CustomPaddleModel(ocr=ocr, show_log=show_log)

    model_json = []
    for index, img_dict in enumerate(imgs):
        result = custom_paddle(img_dict["img"])
        page_info = {
            "page_no": index,
            "height": img_dict["height"],
            "width": img_dict["width"],
        }
        model_json.append({"layout_dets": result, "page_info": page_info})
    return model_json
from magic_pdf.libs.MakeContentConfig import DropMode from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.pipe.AbsPipe import AbsPipe from magic_pdf.pipe.AbsPipe import AbsPipe
from magic_pdf.user_api import parse_ocr_pdf from magic_pdf.user_api import parse_ocr_pdf
......
from magic_pdf.libs.MakeContentConfig import DropMode from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.libs.json_compressor import JsonCompressor from magic_pdf.libs.json_compressor import JsonCompressor
from magic_pdf.pipe.AbsPipe import AbsPipe from magic_pdf.pipe.AbsPipe import AbsPipe
......
...@@ -3,7 +3,7 @@ import json ...@@ -3,7 +3,7 @@ import json
from loguru import logger from loguru import logger
from magic_pdf.libs.MakeContentConfig import DropMode from magic_pdf.libs.MakeContentConfig import DropMode
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
from magic_pdf.libs.commons import join_path from magic_pdf.libs.commons import join_path
......
...@@ -16,7 +16,7 @@ import re ...@@ -16,7 +16,7 @@ import re
from loguru import logger from loguru import logger
from magic_pdf.libs.version import __version__ from magic_pdf.libs.version import __version__
from magic_pdf.model.doc_analyze_by_pp_structurev2 import doc_analyze from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.rw import AbsReaderWriter from magic_pdf.rw import AbsReaderWriter
from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr from magic_pdf.pdf_parse_by_ocr_v2 import parse_pdf_by_ocr
from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt from magic_pdf.pdf_parse_by_txt_v2 import parse_pdf_by_txt
...@@ -104,11 +104,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr ...@@ -104,11 +104,15 @@ def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWr
return garbage_count / total return garbage_count / total
def calculate_not_printable_rate(text):
    """Return the fraction of characters in ``text`` that are not printable.

    A character is considered printable when ``str.isprintable()`` is
    true for it.

    Args:
        text: The string to inspect.

    Returns:
        ``0`` for an empty string, otherwise a float in ``[0, 1]`` equal
        to ``(non-printable count) / len(text)``.
    """
    total = len(text)
    if total == 0:
        return 0  # avoid division by zero
    # Count directly instead of building a throwaway string with ``+=``
    # just to take its length.
    printable_total = sum(1 for c in text if c.isprintable())
    return (total - printable_total) / total
not_common_character_rate = calculate_not_common_character_rate(text_all) not_common_character_rate = calculate_not_common_character_rate(text_all)
not_printable_rate = calculate_not_printable_rate(text_all) not_printable_rate = calculate_not_printable_rate(text_all)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment