"vscode:/vscode.git/clone" did not exist on "57e470abe22973a17b0610bd260105e7f6c8da0d"
Commit 7f2f2c0f authored by myhloli's avatar myhloli
Browse files

refactor(ocr): Fix the error of paddleocr failing to initialize in a multi-threaded environment

parent d4345b6e
import cv2
import numpy as np
from loguru import logger
from io import BytesIO
from PIL import Image
import base64
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line
from ppocr.utils.utility import check_and_read
def img_decode(content: bytes):
np_arr = np.frombuffer(content, dtype=np.uint8)
return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED)
def check_img(img):
if isinstance(img, bytes):
img = img_decode(img)
if isinstance(img, str):
image_file = img
img, flag_gif, flag_pdf = check_and_read(image_file)
if not flag_gif and not flag_pdf:
with open(image_file, 'rb') as f:
img_str = f.read()
img = img_decode(img_str)
if img is None:
try:
buf = BytesIO()
image = BytesIO(img_str)
im = Image.open(image)
rgb = im.convert('RGB')
rgb.save(buf, 'jpeg')
buf.seek(0)
image_bytes = buf.read()
data_base64 = str(base64.b64encode(image_bytes),
encoding="utf-8")
image_decode = base64.b64decode(data_base64)
img_array = np.frombuffer(image_decode, np.uint8)
img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
except:
logger.error("error in loading image:{}".format(image_file))
return None
if img is None:
logger.error("error in loading image:{}".format(image_file))
return None
if isinstance(img, np.ndarray) and len(img.shape) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
return img
def bbox_to_points(bbox):
""" 将bbox格式转换为四个顶点的数组 """
......
import copy
import time
import cv2
import numpy as np
from paddleocr import PaddleOCR
from paddleocr.paddleocr import check_img, logger
from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img
from paddleocr.tools.infer.predict_system import sorted_boxes
from paddleocr.tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
from ppocr.utils.logging import get_logger
from ppocr.utils.utility import alpha_to_color, binarize_img
from tools.infer.predict_system import sorted_boxes
from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img
from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes
logger = get_logger()
class ModifiedPaddleOCR(PaddleOCR):
......
......@@ -2,8 +2,8 @@ import os
import cv2
import numpy as np
from paddleocr.ppstructure.table.predict_table import TableSystem
from paddleocr.ppstructure.utility import init_args
from ppstructure.table.predict_table import TableSystem
from ppstructure.utility import init_args
from PIL import Image
from magic_pdf.config.constants import * # noqa: F403
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment