"packaging/vscode:/vscode.git/clone" did not exist on "16caba5e72d4995a1743c72824b923920060cb26"
Unverified commit f4ffdfe8, authored by Xiaomeng Zhao, committed by GitHub

Merge pull request #2062 from myhloli/dev

feat: support 3.10~3.12 & remove paddle
parents ec566d22 cb3a4314
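
# --- New YAML resource: maps each OCR language to its PyTorch det/rec weights and character dict ---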
lang:
ch:
det: ch_PP-OCRv4_det_infer.pth
rec: ch_PP-OCRv4_rec_infer.pth
dict: ppocr_keys_v1.txt
en:
det: en_PP-OCRv3_det_infer.pth
rec: en_PP-OCRv4_rec_infer.pth
dict: en_dict.txt
korean:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: korean_PP-OCRv3_rec_infer.pth
dict: korean_dict.txt
japan:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: japan_PP-OCRv3_rec_infer.pth
dict: japan_dict.txt
chinese_cht:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: chinese_cht_PP-OCRv3_rec_infer.pth
dict: chinese_cht_dict.txt
ta:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: ta_PP-OCRv3_rec_infer.pth
dict: ta_dict.txt
te:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: te_PP-OCRv3_rec_infer.pth
dict: te_dict.txt
ka:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: ka_PP-OCRv3_rec_infer.pth
dict: ka_dict.txt
latin:
det: en_PP-OCRv3_det_infer.pth
rec: latin_PP-OCRv3_rec_infer.pth
dict: latin_dict.txt
arabic:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: arabic_PP-OCRv3_rec_infer.pth
dict: arabic_dict.txt
cyrillic:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: cyrillic_PP-OCRv3_rec_infer.pth
dict: cyrillic_dict.txt
devanagari:
det: Multilingual_PP-OCRv3_det_infer.pth
rec: devanagari_PP-OCRv3_rec_infer.pth
dict: devanagari_dict.txt
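
For orientation, a minimal sketch of how a consumer could resolve model files from this mapping; the config path, models directory, and function name are illustrative assumptions, not part of this commit:

# Sketch only: resolve det/rec/dict file names for one language from the mapping above.
# cfg_path and ocr_models_dir are hypothetical placeholders.
from pathlib import Path
import yaml

def resolve_ocr_models(cfg_path, ocr_models_dir, lang='ch'):
    with open(cfg_path, encoding='utf-8') as f:
        cfg = yaml.safe_load(f)
    entry = cfg['lang'][lang]
    base = Path(ocr_models_dir)
    # detection weights (.pth), recognition weights (.pth), character dictionary (.txt)
    return base / entry['det'], base / entry['rec'], base / entry['dict']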
# Copyright (c) Opendatalab. All rights reserved.
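
# --- predict_cls.py: text-direction classifier (imported later as predict_cls) ---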
import cv2
import copy
import numpy as np
import math
import time
import torch
from ...pytorchocr.base_ocr_v20 import BaseOCRV20
from . import pytorchocr_utility as utility
from ...pytorchocr.postprocess import build_post_process
class TextClassifier(BaseOCRV20):
def __init__(self, args, **kwargs):
self.device = args.device
self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")]
self.cls_batch_num = args.cls_batch_num
self.cls_thresh = args.cls_thresh
postprocess_params = {
'name': 'ClsPostProcess',
"label_list": args.label_list,
}
self.postprocess_op = build_post_process(postprocess_params)
self.weights_path = args.cls_model_path
self.yaml_path = args.cls_yaml_path
network_config = utility.get_arch_config(self.weights_path)
super(TextClassifier, self).__init__(network_config, **kwargs)
self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")]
self.limited_max_width = args.limited_max_width
self.limited_min_width = args.limited_min_width
self.load_pytorch_weights(self.weights_path)
self.net.eval()
self.net.to(self.device)
def resize_norm_img(self, img):
imgC, imgH, imgW = self.cls_image_shape
h = img.shape[0]
w = img.shape[1]
ratio = w / float(h)
imgW = max(min(imgW, self.limited_max_width), self.limited_min_width)
ratio_imgH = math.ceil(imgH * ratio)
ratio_imgH = max(ratio_imgH, self.limited_min_width)
if ratio_imgH > imgW:
resized_w = imgW
else:
resized_w = int(math.ceil(imgH * ratio))
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
if self.cls_image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
def __call__(self, img_list):
img_list = copy.deepcopy(img_list)
img_num = len(img_list)
# Calculate the aspect ratio of all text bars
width_list = []
for img in img_list:
width_list.append(img.shape[1] / float(img.shape[0]))
# Sorting can speed up the cls process
indices = np.argsort(np.array(width_list))
cls_res = [['', 0.0]] * img_num
batch_num = self.cls_batch_num
elapse = 0
for beg_img_no in range(0, img_num, batch_num):
end_img_no = min(img_num, beg_img_no + batch_num)
norm_img_batch = []
max_wh_ratio = 0
for ino in range(beg_img_no, end_img_no):
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
norm_img = self.resize_norm_img(img_list[indices[ino]])
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_img_batch = np.concatenate(norm_img_batch)
norm_img_batch = norm_img_batch.copy()
starttime = time.time()
with torch.no_grad():
inp = torch.from_numpy(norm_img_batch)
inp = inp.to(self.device)
prob_out = self.net(inp)
prob_out = prob_out.cpu().numpy()
cls_result = self.postprocess_op(prob_out)
elapse += time.time() - starttime
for rno in range(len(cls_result)):
label, score = cls_result[rno]
cls_res[indices[beg_img_no + rno]] = [label, score]
            if '180' in label and score > self.cls_thresh:
                img_list[indices[beg_img_no + rno]] = cv2.rotate(
                    img_list[indices[beg_img_no + rno]], cv2.ROTATE_180)  # 1 == cv2.ROTATE_180
return img_list, cls_res, elapse
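
# --- predict_det.py: text detector (imported later as predict_det) ---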
import sys
import numpy as np
import time
import torch
from ...pytorchocr.base_ocr_v20 import BaseOCRV20
from . import pytorchocr_utility as utility
from ...pytorchocr.data import create_operators, transform
from ...pytorchocr.postprocess import build_post_process
class TextDetector(BaseOCRV20):
def __init__(self, args, **kwargs):
self.args = args
self.det_algorithm = args.det_algorithm
self.device = args.device
pre_process_list = [{
'DetResizeForTest': {
'limit_side_len': args.det_limit_side_len,
'limit_type': args.det_limit_type,
}
}, {
'NormalizeImage': {
'std': [0.229, 0.224, 0.225],
'mean': [0.485, 0.456, 0.406],
'scale': '1./255.',
'order': 'hwc'
}
}, {
'ToCHWImage': None
}, {
'KeepKeys': {
'keep_keys': ['image', 'shape']
}
}]
postprocess_params = {}
if self.det_algorithm == "DB":
postprocess_params['name'] = 'DBPostProcess'
postprocess_params["thresh"] = args.det_db_thresh
postprocess_params["box_thresh"] = args.det_db_box_thresh
postprocess_params["max_candidates"] = 1000
postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
postprocess_params["use_dilation"] = args.use_dilation
postprocess_params["score_mode"] = args.det_db_score_mode
elif self.det_algorithm == "DB++":
postprocess_params['name'] = 'DBPostProcess'
postprocess_params["thresh"] = args.det_db_thresh
postprocess_params["box_thresh"] = args.det_db_box_thresh
postprocess_params["max_candidates"] = 1000
postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
postprocess_params["use_dilation"] = args.use_dilation
postprocess_params["score_mode"] = args.det_db_score_mode
pre_process_list[1] = {
'NormalizeImage': {
'std': [1.0, 1.0, 1.0],
'mean':
[0.48109378172549, 0.45752457890196, 0.40787054090196],
'scale': '1./255.',
'order': 'hwc'
}
}
elif self.det_algorithm == "EAST":
postprocess_params['name'] = 'EASTPostProcess'
postprocess_params["score_thresh"] = args.det_east_score_thresh
postprocess_params["cover_thresh"] = args.det_east_cover_thresh
postprocess_params["nms_thresh"] = args.det_east_nms_thresh
elif self.det_algorithm == "SAST":
pre_process_list[0] = {
'DetResizeForTest': {
'resize_long': args.det_limit_side_len
}
}
postprocess_params['name'] = 'SASTPostProcess'
postprocess_params["score_thresh"] = args.det_sast_score_thresh
postprocess_params["nms_thresh"] = args.det_sast_nms_thresh
self.det_sast_polygon = args.det_sast_polygon
if self.det_sast_polygon:
postprocess_params["sample_pts_num"] = 6
postprocess_params["expand_scale"] = 1.2
postprocess_params["shrink_ratio_of_width"] = 0.2
else:
postprocess_params["sample_pts_num"] = 2
postprocess_params["expand_scale"] = 1.0
postprocess_params["shrink_ratio_of_width"] = 0.3
elif self.det_algorithm == "PSE":
postprocess_params['name'] = 'PSEPostProcess'
postprocess_params["thresh"] = args.det_pse_thresh
postprocess_params["box_thresh"] = args.det_pse_box_thresh
postprocess_params["min_area"] = args.det_pse_min_area
postprocess_params["box_type"] = args.det_pse_box_type
postprocess_params["scale"] = args.det_pse_scale
self.det_pse_box_type = args.det_pse_box_type
elif self.det_algorithm == "FCE":
pre_process_list[0] = {
'DetResizeForTest': {
'rescale_img': [1080, 736]
}
}
postprocess_params['name'] = 'FCEPostProcess'
postprocess_params["scales"] = args.scales
postprocess_params["alpha"] = args.alpha
postprocess_params["beta"] = args.beta
postprocess_params["fourier_degree"] = args.fourier_degree
postprocess_params["box_type"] = args.det_fce_box_type
else:
print("unknown det_algorithm:{}".format(self.det_algorithm))
sys.exit(0)
self.preprocess_op = create_operators(pre_process_list)
self.postprocess_op = build_post_process(postprocess_params)
self.weights_path = args.det_model_path
self.yaml_path = args.det_yaml_path
network_config = utility.get_arch_config(self.weights_path)
super(TextDetector, self).__init__(network_config, **kwargs)
self.load_pytorch_weights(self.weights_path)
self.net.eval()
self.net.to(self.device)
def order_points_clockwise(self, pts):
"""
reference from: https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py
# sort the points based on their x-coordinates
"""
xSorted = pts[np.argsort(pts[:, 0]), :]
        # grab the left-most and right-most points from the sorted
        # x-coordinate points
leftMost = xSorted[:2, :]
rightMost = xSorted[2:, :]
# now, sort the left-most coordinates according to their
# y-coordinates so we can grab the top-left and bottom-left
# points, respectively
leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
(tl, bl) = leftMost
rightMost = rightMost[np.argsort(rightMost[:, 1]), :]
(tr, br) = rightMost
rect = np.array([tl, tr, br, bl], dtype="float32")
return rect
def clip_det_res(self, points, img_height, img_width):
for pno in range(points.shape[0]):
points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
return points
def filter_tag_det_res(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
box = self.order_points_clockwise(box)
box = self.clip_det_res(box, img_height, img_width)
rect_width = int(np.linalg.norm(box[0] - box[1]))
rect_height = int(np.linalg.norm(box[0] - box[3]))
if rect_width <= 3 or rect_height <= 3:
continue
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
img_height, img_width = image_shape[0:2]
dt_boxes_new = []
for box in dt_boxes:
box = self.clip_det_res(box, img_height, img_width)
dt_boxes_new.append(box)
dt_boxes = np.array(dt_boxes_new)
return dt_boxes
def __call__(self, img):
ori_im = img.copy()
data = {'image': img}
data = transform(data, self.preprocess_op)
img, shape_list = data
if img is None:
return None, 0
img = np.expand_dims(img, axis=0)
shape_list = np.expand_dims(shape_list, axis=0)
img = img.copy()
starttime = time.time()
with torch.no_grad():
inp = torch.from_numpy(img)
inp = inp.to(self.device)
outputs = self.net(inp)
preds = {}
if self.det_algorithm == "EAST":
preds['f_geo'] = outputs['f_geo'].cpu().numpy()
preds['f_score'] = outputs['f_score'].cpu().numpy()
elif self.det_algorithm == 'SAST':
preds['f_border'] = outputs['f_border'].cpu().numpy()
preds['f_score'] = outputs['f_score'].cpu().numpy()
preds['f_tco'] = outputs['f_tco'].cpu().numpy()
preds['f_tvo'] = outputs['f_tvo'].cpu().numpy()
elif self.det_algorithm in ['DB', 'PSE', 'DB++']:
preds['maps'] = outputs['maps'].cpu().numpy()
elif self.det_algorithm == 'FCE':
for i, (k, output) in enumerate(outputs.items()):
preds['level_{}'.format(i)] = output
else:
raise NotImplementedError
post_result = self.postprocess_op(preds, shape_list)
dt_boxes = post_result[0]['points']
if (self.det_algorithm == "SAST" and
self.det_sast_polygon) or (self.det_algorithm in ["PSE", "FCE"] and
self.postprocess_op.box_type == 'poly'):
dt_boxes = self.filter_tag_det_res_only_clip(dt_boxes, ori_im.shape)
else:
dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
elapse = time.time() - starttime
return dt_boxes, elapse
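
# --- predict_rec.py: text recognizer (imported later as predict_rec) ---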
from PIL import Image
import cv2
import numpy as np
import math
import time
import torch
from ...pytorchocr.base_ocr_v20 import BaseOCRV20
from . import pytorchocr_utility as utility
from ...pytorchocr.postprocess import build_post_process
class TextRecognizer(BaseOCRV20):
def __init__(self, args, **kwargs):
self.device = args.device
self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
self.character_type = args.rec_char_type
self.rec_batch_num = args.rec_batch_num
self.rec_algorithm = args.rec_algorithm
self.max_text_length = args.max_text_length
postprocess_params = {
'name': 'CTCLabelDecode',
"character_type": args.rec_char_type,
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
if self.rec_algorithm == "SRN":
postprocess_params = {
'name': 'SRNLabelDecode',
"character_type": args.rec_char_type,
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == "RARE":
postprocess_params = {
'name': 'AttnLabelDecode',
"character_type": args.rec_char_type,
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == 'NRTR':
postprocess_params = {
'name': 'NRTRLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == "SAR":
postprocess_params = {
'name': 'SARLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == 'ViTSTR':
postprocess_params = {
'name': 'ViTSTRLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == "CAN":
self.inverse = args.rec_image_inverse
postprocess_params = {
'name': 'CANLabelDecode',
"character_dict_path": args.rec_char_dict_path,
"use_space_char": args.use_space_char
}
elif self.rec_algorithm == 'RFL':
postprocess_params = {
'name': 'RFLLabelDecode',
"character_dict_path": None,
"use_space_char": args.use_space_char
}
self.postprocess_op = build_post_process(postprocess_params)
self.limited_max_width = args.limited_max_width
self.limited_min_width = args.limited_min_width
self.weights_path = args.rec_model_path
self.yaml_path = args.rec_yaml_path
network_config = utility.get_arch_config(self.weights_path)
weights = self.read_pytorch_weights(self.weights_path)
self.out_channels = self.get_out_channels(weights)
if self.rec_algorithm == 'NRTR':
self.out_channels = list(weights.values())[-1].numpy().shape[0]
elif self.rec_algorithm == 'SAR':
self.out_channels = list(weights.values())[-3].numpy().shape[0]
kwargs['out_channels'] = self.out_channels
super(TextRecognizer, self).__init__(network_config, **kwargs)
self.load_state_dict(weights)
self.net.eval()
self.net.to(self.device)
def resize_norm_img(self, img, max_wh_ratio):
imgC, imgH, imgW = self.rec_image_shape
if self.rec_algorithm == 'NRTR' or self.rec_algorithm == 'ViTSTR':
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# return padding_im
image_pil = Image.fromarray(np.uint8(img))
if self.rec_algorithm == 'ViTSTR':
img = image_pil.resize([imgW, imgH], Image.BICUBIC)
else:
                img = image_pil.resize([imgW, imgH], Image.LANCZOS)  # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
img = np.array(img)
norm_img = np.expand_dims(img, -1)
norm_img = norm_img.transpose((2, 0, 1))
if self.rec_algorithm == 'ViTSTR':
norm_img = norm_img.astype(np.float32) / 255.
else:
norm_img = norm_img.astype(np.float32) / 128. - 1.
return norm_img
elif self.rec_algorithm == 'RFL':
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_CUBIC)
resized_image = resized_image.astype('float32')
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
resized_image -= 0.5
resized_image /= 0.5
return resized_image
assert imgC == img.shape[2]
max_wh_ratio = max(max_wh_ratio, imgW / imgH)
imgW = int((imgH * max_wh_ratio))
imgW = max(min(imgW, self.limited_max_width), self.limited_min_width)
h, w = img.shape[:2]
ratio = w / float(h)
ratio_imgH = math.ceil(imgH * ratio)
ratio_imgH = max(ratio_imgH, self.limited_min_width)
if ratio_imgH > imgW:
resized_w = imgW
else:
resized_w = int(ratio_imgH)
resized_image = cv2.resize(img, (resized_w, imgH))
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
padding_im[:, :, 0:resized_w] = resized_image
return padding_im
def resize_norm_img_svtr(self, img, image_shape):
imgC, imgH, imgW = image_shape
resized_image = cv2.resize(
img, (imgW, imgH), interpolation=cv2.INTER_LINEAR)
resized_image = resized_image.astype('float32')
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
return resized_image
def resize_norm_img_srn(self, img, image_shape):
imgC, imgH, imgW = image_shape
img_black = np.zeros((imgH, imgW))
im_hei = img.shape[0]
im_wid = img.shape[1]
if im_wid <= im_hei * 1:
img_new = cv2.resize(img, (imgH * 1, imgH))
elif im_wid <= im_hei * 2:
img_new = cv2.resize(img, (imgH * 2, imgH))
elif im_wid <= im_hei * 3:
img_new = cv2.resize(img, (imgH * 3, imgH))
else:
img_new = cv2.resize(img, (imgW, imgH))
img_np = np.asarray(img_new)
img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
img_black[:, 0:img_np.shape[1]] = img_np
img_black = img_black[:, :, np.newaxis]
row, col, c = img_black.shape
c = 1
return np.reshape(img_black, (c, row, col)).astype(np.float32)
def srn_other_inputs(self, image_shape, num_heads, max_text_length):
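        # Build SRN's auxiliary inputs: position ids for the 1/8-scale visual feature map and for
        # the GSRM word stream, plus upper/lower-triangular attention biases (-1e9) that hide
        # future and past tokens, respectively, from the two GSRM passes.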
imgC, imgH, imgW = image_shape
feature_dim = int((imgH / 8) * (imgW / 8))
encoder_word_pos = np.array(range(0, feature_dim)).reshape(
(feature_dim, 1)).astype('int64')
gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
(max_text_length, 1)).astype('int64')
gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias1 = np.tile(
gsrm_slf_attn_bias1,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
[-1, 1, max_text_length, max_text_length])
gsrm_slf_attn_bias2 = np.tile(
gsrm_slf_attn_bias2,
[1, num_heads, 1, 1]).astype('float32') * [-1e9]
encoder_word_pos = encoder_word_pos[np.newaxis, :]
gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
return [
encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2
]
def process_image_srn(self, img, image_shape, num_heads, max_text_length):
norm_img = self.resize_norm_img_srn(img, image_shape)
norm_img = norm_img[np.newaxis, :]
[encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
self.srn_other_inputs(image_shape, num_heads, max_text_length)
gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
encoder_word_pos = encoder_word_pos.astype(np.int64)
gsrm_word_pos = gsrm_word_pos.astype(np.int64)
return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
gsrm_slf_attn_bias2)
def resize_norm_img_sar(self, img, image_shape,
width_downsample_ratio=0.25):
imgC, imgH, imgW_min, imgW_max = image_shape
h = img.shape[0]
w = img.shape[1]
valid_ratio = 1.0
# make sure new_width is an integral multiple of width_divisor.
width_divisor = int(1 / width_downsample_ratio)
# resize
ratio = w / float(h)
resize_w = math.ceil(imgH * ratio)
if resize_w % width_divisor != 0:
resize_w = round(resize_w / width_divisor) * width_divisor
if imgW_min is not None:
resize_w = max(imgW_min, resize_w)
if imgW_max is not None:
valid_ratio = min(1.0, 1.0 * resize_w / imgW_max)
resize_w = min(imgW_max, resize_w)
resized_image = cv2.resize(img, (resize_w, imgH))
resized_image = resized_image.astype('float32')
# norm
if image_shape[0] == 1:
resized_image = resized_image / 255
resized_image = resized_image[np.newaxis, :]
else:
resized_image = resized_image.transpose((2, 0, 1)) / 255
resized_image -= 0.5
resized_image /= 0.5
resize_shape = resized_image.shape
padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32)
padding_im[:, :, 0:resize_w] = resized_image
pad_shape = padding_im.shape
return padding_im, resize_shape, pad_shape, valid_ratio
def norm_img_can(self, img, image_shape):
img = cv2.cvtColor(
img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image
if self.inverse:
img = 255 - img
if self.rec_image_shape[0] == 1:
h, w = img.shape
_, imgH, imgW = self.rec_image_shape
if h < imgH or w < imgW:
padding_h = max(imgH - h, 0)
padding_w = max(imgW - w, 0)
img_padded = np.pad(img, ((0, padding_h), (0, padding_w)),
'constant',
constant_values=(255))
img = img_padded
        img = np.expand_dims(img, 0) / 255.0  # h,w -> 1,h,w
img = img.astype('float32')
return img
def __call__(self, img_list):
img_num = len(img_list)
# Calculate the aspect ratio of all text bars
width_list = []
for img in img_list:
width_list.append(img.shape[1] / float(img.shape[0]))
# Sorting can speed up the recognition process
indices = np.argsort(np.array(width_list))
# rec_res = []
rec_res = [['', 0.0]] * img_num
batch_num = self.rec_batch_num
elapse = 0
for beg_img_no in range(0, img_num, batch_num):
end_img_no = min(img_num, beg_img_no + batch_num)
norm_img_batch = []
max_wh_ratio = 0
for ino in range(beg_img_no, end_img_no):
# h, w = img_list[ino].shape[0:2]
h, w = img_list[indices[ino]].shape[0:2]
wh_ratio = w * 1.0 / h
max_wh_ratio = max(max_wh_ratio, wh_ratio)
for ino in range(beg_img_no, end_img_no):
if self.rec_algorithm == "SAR":
norm_img, _, _, valid_ratio = self.resize_norm_img_sar(
img_list[indices[ino]], self.rec_image_shape)
norm_img = norm_img[np.newaxis, :]
valid_ratio = np.expand_dims(valid_ratio, axis=0)
valid_ratios = []
valid_ratios.append(valid_ratio)
norm_img_batch.append(norm_img)
elif self.rec_algorithm == "SVTR":
norm_img = self.resize_norm_img_svtr(img_list[indices[ino]],
self.rec_image_shape)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
elif self.rec_algorithm == "SRN":
norm_img = self.process_image_srn(img_list[indices[ino]],
self.rec_image_shape, 8,
self.max_text_length)
encoder_word_pos_list = []
gsrm_word_pos_list = []
gsrm_slf_attn_bias1_list = []
gsrm_slf_attn_bias2_list = []
encoder_word_pos_list.append(norm_img[1])
gsrm_word_pos_list.append(norm_img[2])
gsrm_slf_attn_bias1_list.append(norm_img[3])
gsrm_slf_attn_bias2_list.append(norm_img[4])
norm_img_batch.append(norm_img[0])
elif self.rec_algorithm == "CAN":
norm_img = self.norm_img_can(img_list[indices[ino]],
max_wh_ratio)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_image_mask = np.ones(norm_img.shape, dtype='float32')
word_label = np.ones([1, 36], dtype='int64')
norm_img_mask_batch = []
word_label_list = []
norm_img_mask_batch.append(norm_image_mask)
word_label_list.append(word_label)
else:
norm_img = self.resize_norm_img(img_list[indices[ino]],
max_wh_ratio)
norm_img = norm_img[np.newaxis, :]
norm_img_batch.append(norm_img)
norm_img_batch = np.concatenate(norm_img_batch)
norm_img_batch = norm_img_batch.copy()
if self.rec_algorithm == "SRN":
starttime = time.time()
encoder_word_pos_list = np.concatenate(encoder_word_pos_list)
gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list)
gsrm_slf_attn_bias1_list = np.concatenate(
gsrm_slf_attn_bias1_list)
gsrm_slf_attn_bias2_list = np.concatenate(
gsrm_slf_attn_bias2_list)
with torch.no_grad():
inp = torch.from_numpy(norm_img_batch)
encoder_word_pos_inp = torch.from_numpy(encoder_word_pos_list)
gsrm_word_pos_inp = torch.from_numpy(gsrm_word_pos_list)
gsrm_slf_attn_bias1_inp = torch.from_numpy(gsrm_slf_attn_bias1_list)
gsrm_slf_attn_bias2_inp = torch.from_numpy(gsrm_slf_attn_bias2_list)
# if self.use_gpu:
# inp = inp.cuda()
# encoder_word_pos_inp = encoder_word_pos_inp.cuda()
# gsrm_word_pos_inp = gsrm_word_pos_inp.cuda()
# gsrm_slf_attn_bias1_inp = gsrm_slf_attn_bias1_inp.cuda()
# gsrm_slf_attn_bias2_inp = gsrm_slf_attn_bias2_inp.cuda()
inp = inp.to(self.device)
encoder_word_pos_inp = encoder_word_pos_inp.to(self.device)
gsrm_word_pos_inp = gsrm_word_pos_inp.to(self.device)
gsrm_slf_attn_bias1_inp = gsrm_slf_attn_bias1_inp.to(self.device)
gsrm_slf_attn_bias2_inp = gsrm_slf_attn_bias2_inp.to(self.device)
backbone_out = self.net.backbone(inp) # backbone_feat
prob_out = self.net.head(backbone_out, [encoder_word_pos_inp, gsrm_word_pos_inp, gsrm_slf_attn_bias1_inp, gsrm_slf_attn_bias2_inp])
# preds = {"predict": prob_out[2]}
preds = {"predict": prob_out["predict"]}
elif self.rec_algorithm == "SAR":
starttime = time.time()
# valid_ratios = np.concatenate(valid_ratios)
# inputs = [
# norm_img_batch,
# valid_ratios,
# ]
with torch.no_grad():
inp = torch.from_numpy(norm_img_batch)
# if self.use_gpu:
# inp = inp.cuda()
inp = inp.to(self.device)
preds = self.net(inp)
elif self.rec_algorithm == "CAN":
starttime = time.time()
norm_img_mask_batch = np.concatenate(norm_img_mask_batch)
word_label_list = np.concatenate(word_label_list)
inputs = [norm_img_batch, norm_img_mask_batch, word_label_list]
inp = [torch.from_numpy(e_i) for e_i in inputs]
inp = [e_i.to(self.device) for e_i in inp]
with torch.no_grad():
outputs = self.net(inp)
                    outputs = [v.cpu().numpy() for v in outputs]
preds = outputs
else:
starttime = time.time()
with torch.no_grad():
inp = torch.from_numpy(norm_img_batch)
inp = inp.to(self.device)
prob_out = self.net(inp)
if isinstance(prob_out, list):
preds = [v.cpu().numpy() for v in prob_out]
else:
preds = prob_out.cpu().numpy()
rec_result = self.postprocess_op(preds)
for rno in range(len(rec_result)):
rec_res[indices[beg_img_no + rno]] = rec_result[rno]
elapse += time.time() - starttime
return rec_res, elapse
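
# --- predict_system.py (assumed name): end-to-end pipeline chaining detector, classifier, and recognizer ---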
import cv2
import copy
import numpy as np
from . import predict_rec
from . import predict_det
from . import predict_cls
class TextSystem(object):
def __init__(self, args, **kwargs):
self.text_detector = predict_det.TextDetector(args, **kwargs)
self.text_recognizer = predict_rec.TextRecognizer(args, **kwargs)
self.use_angle_cls = args.use_angle_cls
self.drop_score = args.drop_score
if self.use_angle_cls:
self.text_classifier = predict_cls.TextClassifier(args, **kwargs)
def get_rotate_crop_image(self, img, points):
'''
img_height, img_width = img.shape[0:2]
left = int(np.min(points[:, 0]))
right = int(np.max(points[:, 0]))
top = int(np.min(points[:, 1]))
bottom = int(np.max(points[:, 1]))
img_crop = img[top:bottom, left:right, :].copy()
points[:, 0] = points[:, 0] - left
points[:, 1] = points[:, 1] - top
'''
img_crop_width = int(
max(
np.linalg.norm(points[0] - points[1]),
np.linalg.norm(points[2] - points[3])))
img_crop_height = int(
max(
np.linalg.norm(points[0] - points[3]),
np.linalg.norm(points[1] - points[2])))
pts_std = np.float32([[0, 0], [img_crop_width, 0],
[img_crop_width, img_crop_height],
[0, img_crop_height]])
M = cv2.getPerspectiveTransform(points, pts_std)
dst_img = cv2.warpPerspective(
img,
M, (img_crop_width, img_crop_height),
borderMode=cv2.BORDER_REPLICATE,
flags=cv2.INTER_CUBIC)
dst_img_height, dst_img_width = dst_img.shape[0:2]
if dst_img_height * 1.0 / dst_img_width >= 1.5:
dst_img = np.rot90(dst_img)
return dst_img
def __call__(self, img):
ori_im = img.copy()
dt_boxes, elapse = self.text_detector(img)
print("dt_boxes num : {}, elapse : {}".format(
len(dt_boxes), elapse))
if dt_boxes is None:
return None, None
img_crop_list = []
dt_boxes = sorted_boxes(dt_boxes)
for bno in range(len(dt_boxes)):
tmp_box = copy.deepcopy(dt_boxes[bno])
img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
img_crop_list.append(img_crop)
if self.use_angle_cls:
img_crop_list, angle_list, elapse = self.text_classifier(
img_crop_list)
print("cls num : {}, elapse : {}".format(
len(img_crop_list), elapse))
rec_res, elapse = self.text_recognizer(img_crop_list)
print("rec_res num : {}, elapse : {}".format(
len(rec_res), elapse))
# self.print_draw_crop_rec_res(img_crop_list, rec_res)
filter_boxes, filter_rec_res = [], []
        for box, rec_result in zip(dt_boxes, rec_res):
            text, score = rec_result
            if score >= self.drop_score:
                filter_boxes.append(box)
                filter_rec_res.append(rec_result)
return filter_boxes, filter_rec_res
def sorted_boxes(dt_boxes):
"""
Sort text boxes in order from top to bottom, left to right
args:
dt_boxes(array):detected text boxes with shape [4, 2]
return:
sorted boxes(array) with shape [4, 2]
"""
num_boxes = dt_boxes.shape[0]
sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0]))
_boxes = list(sorted_boxes)
for i in range(num_boxes - 1):
if abs(_boxes[i + 1][0][1] - _boxes[i][0][1]) < 10 and \
(_boxes[i + 1][0][0] < _boxes[i][0][0]):
tmp = _boxes[i]
_boxes[i] = _boxes[i + 1]
_boxes[i + 1] = tmp
return _boxes
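
# --- pytorchocr_utility.py: argument parsing and shared helpers (imported above as pytorchocr_utility) ---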
import os
import math
from pathlib import Path
import numpy as np
import cv2
import argparse
root_dir = Path(__file__).resolve().parent.parent.parent
DEFAULT_CFG_PATH = root_dir / "pytorchocr" / "utils" / "resources" / "arch_config.yaml"
def init_args():
def str2bool(v):
return v.lower() in ("true", "t", "1")
parser = argparse.ArgumentParser()
# params for prediction engine
parser.add_argument("--use_gpu", type=str2bool, default=False)
parser.add_argument("--det", type=str2bool, default=True)
parser.add_argument("--rec", type=str2bool, default=True)
parser.add_argument("--device", type=str, default='cpu')
# parser.add_argument("--ir_optim", type=str2bool, default=True)
# parser.add_argument("--use_tensorrt", type=str2bool, default=False)
# parser.add_argument("--use_fp16", type=str2bool, default=False)
parser.add_argument("--gpu_mem", type=int, default=500)
parser.add_argument("--warmup", type=str2bool, default=False)
# params for text detector
parser.add_argument("--image_dir", type=str)
parser.add_argument("--det_algorithm", type=str, default='DB')
parser.add_argument("--det_model_path", type=str)
parser.add_argument("--det_limit_side_len", type=float, default=960)
parser.add_argument("--det_limit_type", type=str, default='max')
# DB parmas
parser.add_argument("--det_db_thresh", type=float, default=0.3)
parser.add_argument("--det_db_box_thresh", type=float, default=0.6)
parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5)
parser.add_argument("--max_batch_size", type=int, default=10)
parser.add_argument("--use_dilation", type=str2bool, default=False)
parser.add_argument("--det_db_score_mode", type=str, default="fast")
# EAST parmas
parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
parser.add_argument("--det_east_nms_thresh", type=float, default=0.2)
# SAST parmas
parser.add_argument("--det_sast_score_thresh", type=float, default=0.5)
parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2)
parser.add_argument("--det_sast_polygon", type=str2bool, default=False)
# PSE parmas
parser.add_argument("--det_pse_thresh", type=float, default=0)
parser.add_argument("--det_pse_box_thresh", type=float, default=0.85)
parser.add_argument("--det_pse_min_area", type=float, default=16)
parser.add_argument("--det_pse_box_type", type=str, default='box')
parser.add_argument("--det_pse_scale", type=int, default=1)
# FCE parmas
parser.add_argument("--scales", type=list, default=[8, 16, 32])
parser.add_argument("--alpha", type=float, default=1.0)
parser.add_argument("--beta", type=float, default=1.0)
parser.add_argument("--fourier_degree", type=int, default=5)
parser.add_argument("--det_fce_box_type", type=str, default='poly')
# params for text recognizer
parser.add_argument("--rec_algorithm", type=str, default='CRNN')
parser.add_argument("--rec_model_path", type=str)
parser.add_argument("--rec_image_inverse", type=str2bool, default=True)
parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
parser.add_argument("--rec_char_type", type=str, default='ch')
parser.add_argument("--rec_batch_num", type=int, default=6)
parser.add_argument("--max_text_length", type=int, default=25)
parser.add_argument("--use_space_char", type=str2bool, default=True)
parser.add_argument("--drop_score", type=float, default=0.5)
parser.add_argument("--limited_max_width", type=int, default=1280)
parser.add_argument("--limited_min_width", type=int, default=16)
parser.add_argument(
"--vis_font_path", type=str,
default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'doc/fonts/simfang.ttf'))
parser.add_argument(
"--rec_char_dict_path",
type=str,
default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
'pytorchocr/utils/ppocr_keys_v1.txt'))
# params for text classifier
parser.add_argument("--use_angle_cls", type=str2bool, default=False)
parser.add_argument("--cls_model_path", type=str)
parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
parser.add_argument("--label_list", type=list, default=['0', '180'])
parser.add_argument("--cls_batch_num", type=int, default=6)
parser.add_argument("--cls_thresh", type=float, default=0.9)
parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
parser.add_argument("--use_pdserving", type=str2bool, default=False)
# params for e2e
parser.add_argument("--e2e_algorithm", type=str, default='PGNet')
parser.add_argument("--e2e_model_path", type=str)
parser.add_argument("--e2e_limit_side_len", type=float, default=768)
parser.add_argument("--e2e_limit_type", type=str, default='max')
# PGNet parmas
parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5)
parser.add_argument(
"--e2e_char_dict_path", type=str,
default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
'pytorchocr/utils/ic15_dict.txt'))
parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext')
parser.add_argument("--e2e_pgnet_polygon", type=bool, default=True)
parser.add_argument("--e2e_pgnet_mode", type=str, default='fast')
# SR parmas
parser.add_argument("--sr_model_path", type=str)
parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128")
parser.add_argument("--sr_batch_num", type=int, default=1)
# params .yaml
parser.add_argument("--det_yaml_path", type=str, default=None)
parser.add_argument("--rec_yaml_path", type=str, default=None)
parser.add_argument("--cls_yaml_path", type=str, default=None)
parser.add_argument("--e2e_yaml_path", type=str, default=None)
parser.add_argument("--sr_yaml_path", type=str, default=None)
# multi-process
parser.add_argument("--use_mp", type=str2bool, default=False)
parser.add_argument("--total_process_num", type=int, default=1)
parser.add_argument("--process_id", type=int, default=0)
parser.add_argument("--benchmark", type=str2bool, default=False)
parser.add_argument("--save_log_path", type=str, default="./log_output/")
parser.add_argument("--show_log", type=str2bool, default=True)
return parser
def parse_args():
parser = init_args()
return parser.parse_args()
def get_default_config(args):
return vars(args)
def read_network_config_from_yaml(yaml_path, char_num=None):
if not os.path.exists(yaml_path):
        raise FileNotFoundError('{} does not exist.'.format(yaml_path))
import yaml
with open(yaml_path, encoding='utf-8') as f:
res = yaml.safe_load(f)
if res.get('Architecture') is None:
raise ValueError('{} has no Architecture'.format(yaml_path))
if res['Architecture']['Head']['name'] == 'MultiHead' and char_num is not None:
res['Architecture']['Head']['out_channels_list'] = {
'CTCLabelDecode': char_num,
'SARLabelDecode': char_num + 2,
'NRTRLabelDecode': char_num + 3
}
return res['Architecture']
def AnalysisConfig(weights_path, yaml_path=None, char_num=None):
if not os.path.exists(os.path.abspath(weights_path)):
        raise FileNotFoundError('{} was not found.'.format(weights_path))
if yaml_path is not None:
return read_network_config_from_yaml(yaml_path, char_num=char_num)
def resize_img(img, input_size=600):
"""
resize img and limit the longest side of the image to input_size
"""
img = np.array(img)
im_shape = img.shape
im_size_max = np.max(im_shape[0:2])
im_scale = float(input_size) / float(im_size_max)
img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale)
return img
def str_count(s):
"""
Count the number of Chinese characters,
a single English character and a single number
equal to half the length of Chinese characters.
args:
s(string): the input of string
return(int):
the number of Chinese characters
"""
import string
count_zh = count_pu = 0
s_len = len(s)
en_dg_count = 0
for c in s:
if c in string.ascii_letters or c.isdigit() or c.isspace():
en_dg_count += 1
elif c.isalpha():
count_zh += 1
else:
count_pu += 1
return s_len - math.ceil(en_dg_count / 2)
def base64_to_cv2(b64str):
import base64
data = base64.b64decode(b64str.encode('utf8'))
    data = np.frombuffer(data, np.uint8)  # np.fromstring is deprecated for binary input
data = cv2.imdecode(data, cv2.IMREAD_COLOR)
return data
def get_arch_config(model_path):
from omegaconf import OmegaConf
all_arch_config = OmegaConf.load(DEFAULT_CFG_PATH)
path = Path(model_path)
file_name = path.stem
if file_name not in all_arch_config:
raise ValueError(f"architecture {file_name} is not in arch_config.yaml")
arch_config = all_arch_config[file_name]
return arch_config
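
Taken together, the modules above form a self-contained PyTorch OCR pipeline. A minimal usage sketch follows; the import paths and model file locations are assumptions for illustration, and the weight file stems must match entries in arch_config.yaml, since get_arch_config keys on the file stem:

# Hypothetical end-to-end usage; adjust imports to the real package layout.
import cv2
from predict_system import TextSystem        # assumed module name, per the imports above
from pytorchocr_utility import parse_args

args = parse_args()
args.det_model_path = 'models/ch_PP-OCRv4_det_infer.pth'  # placeholder path
args.rec_model_path = 'models/ch_PP-OCRv4_rec_infer.pth'  # placeholder path
text_sys = TextSystem(args)
dt_boxes, rec_res = text_sys(cv2.imread('page.jpg'))
for text, score in rec_res:
    print(text, score)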
@@ -23,25 +23,17 @@ class RapidTableModel(object):
         self.table_model = RapidTable(input_args)
 
-        # if ocr_engine is None:
-        #     self.ocr_model_name = "RapidOCR"
-        #     if torch.cuda.is_available():
-        #         from rapidocr_paddle import RapidOCR
-        #         self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
-        #     else:
-        #         from rapidocr_onnxruntime import RapidOCR
-        #         self.ocr_engine = RapidOCR()
-        # else:
-        #     self.ocr_model_name = "PaddleOCR"
-        #     self.ocr_engine = ocr_engine
-        self.ocr_model_name = "PaddleOCR"
-        self.ocr_engine = ocr_engine
+        # self.ocr_model_name = "RapidOCR"
+        # if torch.cuda.is_available():
+        #     from rapidocr_paddle import RapidOCR
+        #     self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
+        # else:
+        #     from rapidocr_onnxruntime import RapidOCR
+        #     self.ocr_engine = RapidOCR()
+        self.ocr_model_name = "RapidOCR"
+        if torch.cuda.is_available():
+            from rapidocr_paddle import RapidOCR
+            self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
+        else:
+            from rapidocr_onnxruntime import RapidOCR
+            self.ocr_engine = RapidOCR()
 
     def predict(self, image):
import torch
from struct_eqtable import build_model
from magic_pdf.model.sub_modules.table.table_utils import minify_html
class StructTableModel:
def __init__(self, model_path, max_new_tokens=1024, max_time=60):
# init
assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
self.model = build_model(
model_ckpt=model_path,
max_new_tokens=max_new_tokens,
max_time=max_time,
lmdeploy=False,
flash_attn=False,
batch_size=1,
).cuda()
self.default_format = "html"
def predict(self, images, output_format=None, **kwargs):
if output_format is None:
output_format = self.default_format
else:
if output_format not in ['latex', 'markdown', 'html']:
raise ValueError(f"Output format {output_format} is not supported.")
results = self.model(
images, output_format=output_format
)
if output_format == "html":
results = [minify_html(html) for html in results]
return results
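
For reference, a hedged usage sketch of this wrapper; the checkpoint path and image file are placeholders, and a CUDA device is required by the assert in __init__:

from PIL import Image

# Hypothetical call: parse one table image and emit minified HTML.
model = StructTableModel('/models/TabRec/StructEqTable', max_new_tokens=1024, max_time=60)
html_results = model.predict([Image.open('table.png')], output_format='html')
print(html_results[0])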
import os
import cv2
import numpy as np
from paddleocr import PaddleOCR
from ppstructure.table.predict_table import TableSystem
from ppstructure.utility import init_args
from PIL import Image
from magic_pdf.config.constants import * # noqa: F403
class TableMasterPaddleModel(object):
"""This class is responsible for converting image of table into HTML format
using a pre-trained model.
Attributes:
- table_sys: An instance of TableSystem initialized with parsed arguments.
Methods:
- __init__(config): Initializes the model with configuration parameters.
- img2html(image): Converts a PIL Image or NumPy array to HTML string.
- parse_args(**kwargs): Parses configuration arguments.
"""
def __init__(self, config):
"""
Parameters:
- config (dict): Configuration dictionary containing model_dir and device.
"""
args = self.parse_args(**config)
self.table_sys = TableSystem(args)
def img2html(self, image):
"""
Parameters:
- image (PIL.Image or np.ndarray): The image of the table to be converted.
Return:
- HTML (str): A string representing the HTML structure with content of the table.
"""
if isinstance(image, Image.Image):
image = np.asarray(image)
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
pred_res, _ = self.table_sys(image)
pred_html = pred_res['html']
# res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
# "</table></body></html>","") + "</table></td>\n"
return pred_html
def parse_args(self, **kwargs):
parser = init_args()
model_dir = kwargs.get('model_dir')
table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR) # noqa: F405
table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT) # noqa: F405
det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR) # noqa: F405
rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR) # noqa: F405
rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT) # noqa: F405
device = kwargs.get('device', 'cpu')
        use_gpu = device.startswith('cuda')
config = {
'use_gpu': use_gpu,
'table_max_len': kwargs.get('table_max_len', TABLE_MAX_LEN), # noqa: F405
'table_algorithm': 'TableMaster',
'table_model_dir': table_model_dir,
'table_char_dict_path': table_char_dict_path,
'det_model_dir': det_model_dir,
'rec_model_dir': rec_model_dir,
'rec_char_dict_path': rec_char_dict_path,
}
parser.set_defaults(**config)
return parser.parse_args([])
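
A matching sketch for this wrapper; the model_dir value is a placeholder and must contain the TableMaster, detection, and recognition model subdirectories named by the path constants above:

from PIL import Image

# Hypothetical config; model_dir must hold the weights referenced by the constants above.
table_model = TableMasterPaddleModel({'model_dir': '/models', 'device': 'cpu'})
print(table_model.img2html(Image.open('table.png')))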
@@ -4,6 +4,7 @@ import os
 import re
 import statistics
 import time
+import warnings
 from typing import List
 
 import cv2
@@ -21,12 +22,9 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
-from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
 from magic_pdf.model.magic_model import MagicModel
 from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
-from concurrent.futures import ThreadPoolExecutor
 from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
 from magic_pdf.post_proc.para_split_v3 import para_split
 from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
@@ -195,7 +193,7 @@ def calculate_contrast(img, img_mode) -> float:
     std_dev = np.std(gray_img)
     # contrast is defined as std dev divided by the mean (plus a small constant to avoid division by zero)
     contrast = std_dev / (mean_value + 1e-6)
-    # logger.info(f"contrast: {contrast}")
+    # logger.debug(f"contrast: {contrast}")
     return round(contrast, 2)
 
 # @measure_time
@@ -288,33 +286,39 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     if len(need_ocr_spans) > 0:
         # initialize the OCR model
-        atom_model_manager = AtomModelSingleton()
-        ocr_model = atom_model_manager.get_atom_model(
-            atom_model_name='ocr',
-            ocr_show_log=False,
-            det_db_box_thresh=0.3,
-            lang=lang
-        )
+        # atom_model_manager = AtomModelSingleton()
+        # ocr_model = atom_model_manager.get_atom_model(
+        #     atom_model_name='ocr',
+        #     ocr_show_log=False,
+        #     det_db_box_thresh=0.3,
+        #     lang=lang
+        # )
 
         for span in need_ocr_spans:
             # crop the span's bbox and run OCR on it
             span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
 
             # compute the span's contrast; spans below 0.20 are not OCRed
-            if calculate_contrast(span_img, img_mode='bgr') <= 0.20:
+            if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
                 spans.remove(span)
                 continue
+                # pass
 
+            span['content'] = ''
+            span['score'] = 1
+            span['np_img'] = span_img
 
-            ocr_res = ocr_model.ocr(span_img, det=False)
-            if ocr_res and len(ocr_res) > 0:
-                if len(ocr_res[0]) > 0:
-                    ocr_text, ocr_score = ocr_res[0][0]
-                    # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
-                    if ocr_score > 0.5 and len(ocr_text) > 0:
-                        span['content'] = ocr_text
-                        span['score'] = ocr_score
-                    else:
-                        spans.remove(span)
+            # ocr_res = ocr_model.ocr(span_img, det=False)
+            # if ocr_res and len(ocr_res) > 0:
+            #     if len(ocr_res[0]) > 0:
+            #         ocr_text, ocr_score = ocr_res[0][0]
+            #         # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
+            #         if ocr_score > 0.5 and len(ocr_text) > 0:
+            #             span['content'] = ocr_text
+            #             span['score'] = float(round(ocr_score, 2))
+            #         else:
+            #             spans.remove(span)
 
     return spans
@@ -372,9 +376,12 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
     from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
         boxes2inputs, parse_logits, prepare_inputs)
 
-    inputs = boxes2inputs(boxes)
-    inputs = prepare_inputs(inputs, model)
-    logits = model(**inputs).logits.cpu().squeeze(0)
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
+
+        inputs = boxes2inputs(boxes)
+        inputs = prepare_inputs(inputs, model)
+        logits = model(**inputs).logits.cpu().squeeze(0)
     return parse_logits(logits, len(boxes))
@@ -951,7 +958,47 @@ def pdf_parse_union(
         )
         pdf_info_dict[f'page_{page_id}'] = page_info
 
-    # PerformanceStats.print_stats()
+    need_ocr_list = []
+    img_crop_list = []
+    text_block_list = []
+    for pange_id, page_info in pdf_info_dict.items():
+        for block in page_info['preproc_blocks']:
+            if block['type'] in ['table', 'image']:
+                for sub_block in block['blocks']:
+                    if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']:
+                        text_block_list.append(sub_block)
+            elif block['type'] in ['text', 'title']:
+                text_block_list.append(block)
+        for block in page_info['discarded_blocks']:
+            text_block_list.append(block)
+
+    for block in text_block_list:
+        for line in block['lines']:
+            for span in line['spans']:
+                if 'np_img' in span:
+                    need_ocr_list.append(span)
+                    img_crop_list.append(span['np_img'])
+                    span.pop('np_img')
+
+    if len(img_crop_list) > 0:
+        # Get OCR results for this language's images
+        atom_model_manager = AtomModelSingleton()
+        ocr_model = atom_model_manager.get_atom_model(
+            atom_model_name='ocr',
+            ocr_show_log=False,
+            det_db_box_thresh=0.3,
+            lang=lang
+        )
+
+        rec_start = time.time()
+        ocr_res_list = ocr_model.ocr(img_crop_list, det=False)[0]
+
+        # Verify we have matching counts
+        assert len(ocr_res_list) == len(need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
+
+        # Process OCR results for this language
+        for index, span in enumerate(need_ocr_list):
+            ocr_text, ocr_score = ocr_res_list[index]
+            span['content'] = ocr_text
+            span['score'] = float(round(ocr_score, 2))
+
+        rec_time = time.time() - rec_start
+        logger.info(f'ocr-dynamic-rec time: {round(rec_time, 2)}, total images processed: {len(img_crop_list)}')
 
     """Paragraph segmentation"""
     para_split(pdf_info_dict)
model:
arch: unimernet
model_type: unimernet
model_config:
model_name: ./models/unimernet_base
max_seq_len: 1536
load_pretrained: True
pretrained: './models/unimernet_base/pytorch_model.pth'
tokenizer_config:
path: ./models/unimernet_base
datasets:
formula_rec_eval:
vis_processor:
eval:
name: "formula_image_eval"
image_size:
- 192
- 672
run:
runner: runner_iter
task: unimernet_train
batch_size_train: 64
batch_size_eval: 64
num_workers: 1
iters_per_inner_epoch: 2000
max_iters: 60000
seed: 42
output_dir: "../output/demo"
evaluate: True
test_splits: [ "eval" ]
device: "cuda"
world_size: 1
dist_url: "env://"
distributed: True
distributed_type: ddp # or fsdp when train llm
generate_cfg:
temperature: 0.0
AUG:
DETR: true
CACHE_DIR: ~/cache/huggingface
CUDNN_BENCHMARK: false
DATALOADER:
ASPECT_RATIO_GROUPING: true
FILTER_EMPTY_ANNOTATIONS: false
NUM_WORKERS: 4
REPEAT_THRESHOLD: 0.0
SAMPLER_TRAIN: TrainingSampler
DATASETS:
PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
PROPOSAL_FILES_TEST: []
PROPOSAL_FILES_TRAIN: []
TEST:
- scihub_train
TRAIN:
- scihub_train
GLOBAL:
HACK: 1.0
ICDAR_DATA_DIR_TEST: ''
ICDAR_DATA_DIR_TRAIN: ''
INPUT:
CROP:
ENABLED: true
SIZE:
- 384
- 600
TYPE: absolute_range
FORMAT: RGB
MASK_FORMAT: polygon
MAX_SIZE_TEST: 1333
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 800
MIN_SIZE_TRAIN:
- 480
- 512
- 544
- 576
- 608
- 640
- 672
- 704
- 736
- 768
- 800
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal
MODEL:
ANCHOR_GENERATOR:
ANGLES:
- - -90
- 0
- 90
ASPECT_RATIOS:
- - 0.5
- 1.0
- 2.0
NAME: DefaultAnchorGenerator
OFFSET: 0.0
SIZES:
- - 32
- - 64
- - 128
- - 256
- - 512
BACKBONE:
FREEZE_AT: 2
NAME: build_vit_fpn_backbone
CONFIG_PATH: ''
DEVICE: cuda
FPN:
FUSE_TYPE: sum
IN_FEATURES:
- layer3
- layer5
- layer7
- layer11
NORM: ''
OUT_CHANNELS: 256
IMAGE_ONLY: true
KEYPOINT_ON: false
LOAD_PROPOSALS: false
MASK_ON: true
META_ARCHITECTURE: VLGeneralizedRCNN
PANOPTIC_FPN:
COMBINE:
ENABLED: true
INSTANCES_CONFIDENCE_THRESH: 0.5
OVERLAP_THRESH: 0.5
STUFF_AREA_LIMIT: 4096
INSTANCE_LOSS_WEIGHT: 1.0
PIXEL_MEAN:
- 127.5
- 127.5
- 127.5
PIXEL_STD:
- 127.5
- 127.5
- 127.5
PROPOSAL_GENERATOR:
MIN_SIZE: 0
NAME: RPN
RESNETS:
DEFORM_MODULATED: false
DEFORM_NUM_GROUPS: 1
DEFORM_ON_PER_STAGE:
- false
- false
- false
- false
DEPTH: 50
NORM: FrozenBN
NUM_GROUPS: 1
OUT_FEATURES:
- res4
RES2_OUT_CHANNELS: 256
RES5_DILATION: 1
STEM_OUT_CHANNELS: 64
STRIDE_IN_1X1: true
WIDTH_PER_GROUP: 64
RETINANET:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
FOCAL_LOSS_ALPHA: 0.25
FOCAL_LOSS_GAMMA: 2.0
IN_FEATURES:
- p3
- p4
- p5
- p6
- p7
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.4
- 0.5
NMS_THRESH_TEST: 0.5
NORM: ''
NUM_CLASSES: 10
NUM_CONVS: 4
PRIOR_PROB: 0.01
SCORE_THRESH_TEST: 0.05
SMOOTH_L1_LOSS_BETA: 0.1
TOPK_CANDIDATES_TEST: 1000
ROI_BOX_CASCADE_HEAD:
BBOX_REG_WEIGHTS:
- - 10.0
- 10.0
- 5.0
- 5.0
- - 20.0
- 20.0
- 10.0
- 10.0
- - 30.0
- 30.0
- 15.0
- 15.0
IOUS:
- 0.5
- 0.6
- 0.7
ROI_BOX_HEAD:
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 10.0
- 10.0
- 5.0
- 5.0
CLS_AGNOSTIC_BBOX_REG: true
CONV_DIM: 256
FC_DIM: 1024
NAME: FastRCNNConvFCHead
NORM: ''
NUM_CONV: 0
NUM_FC: 2
POOLER_RESOLUTION: 7
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
SMOOTH_L1_BETA: 0.0
TRAIN_ON_PRED_BOXES: false
ROI_HEADS:
BATCH_SIZE_PER_IMAGE: 512
IN_FEATURES:
- p2
- p3
- p4
- p5
IOU_LABELS:
- 0
- 1
IOU_THRESHOLDS:
- 0.5
NAME: CascadeROIHeads
NMS_THRESH_TEST: 0.5
NUM_CLASSES: 10
POSITIVE_FRACTION: 0.25
PROPOSAL_APPEND_GT: true
SCORE_THRESH_TEST: 0.05
ROI_KEYPOINT_HEAD:
CONV_DIMS:
- 512
- 512
- 512
- 512
- 512
- 512
- 512
- 512
LOSS_WEIGHT: 1.0
MIN_KEYPOINTS_PER_IMAGE: 1
NAME: KRCNNConvDeconvUpsampleHead
NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true
NUM_KEYPOINTS: 17
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
ROI_MASK_HEAD:
CLS_AGNOSTIC_MASK: false
CONV_DIM: 256
NAME: MaskRCNNConvUpsampleHead
NORM: ''
NUM_CONV: 4
POOLER_RESOLUTION: 14
POOLER_SAMPLING_RATIO: 0
POOLER_TYPE: ROIAlignV2
RPN:
BATCH_SIZE_PER_IMAGE: 256
BBOX_REG_LOSS_TYPE: smooth_l1
BBOX_REG_LOSS_WEIGHT: 1.0
BBOX_REG_WEIGHTS:
- 1.0
- 1.0
- 1.0
- 1.0
BOUNDARY_THRESH: -1
CONV_DIMS:
- -1
HEAD_NAME: StandardRPNHead
IN_FEATURES:
- p2
- p3
- p4
- p5
- p6
IOU_LABELS:
- 0
- -1
- 1
IOU_THRESHOLDS:
- 0.3
- 0.7
LOSS_WEIGHT: 1.0
NMS_THRESH: 0.7
POSITIVE_FRACTION: 0.5
POST_NMS_TOPK_TEST: 1000
POST_NMS_TOPK_TRAIN: 2000
PRE_NMS_TOPK_TEST: 1000
PRE_NMS_TOPK_TRAIN: 2000
SMOOTH_L1_BETA: 0.0
SEM_SEG_HEAD:
COMMON_STRIDE: 4
CONVS_DIM: 128
IGNORE_VALUE: 255
IN_FEATURES:
- p2
- p3
- p4
- p5
LOSS_WEIGHT: 1.0
NAME: SemSegFPNHead
NORM: GN
NUM_CLASSES: 10
VIT:
DROP_PATH: 0.1
IMG_SIZE:
- 224
- 224
NAME: layoutlmv3_base
OUT_FEATURES:
- layer3
- layer5
- layer7
- layer11
POS_TYPE: abs
WEIGHTS:
OUTPUT_DIR:
SCIHUB_DATA_DIR_TRAIN: ~/publaynet/layout_scihub/train
SEED: 42
SOLVER:
AMP:
ENABLED: true
BACKBONE_MULTIPLIER: 1.0
BASE_LR: 0.0002
BIAS_LR_FACTOR: 1.0
CHECKPOINT_PERIOD: 2000
CLIP_GRADIENTS:
CLIP_TYPE: full_model
CLIP_VALUE: 1.0
ENABLED: true
NORM_TYPE: 2.0
GAMMA: 0.1
GRADIENT_ACCUMULATION_STEPS: 1
IMS_PER_BATCH: 32
LR_SCHEDULER_NAME: WarmupCosineLR
MAX_ITER: 20000
MOMENTUM: 0.9
NESTEROV: false
OPTIMIZER: ADAMW
REFERENCE_WORLD_SIZE: 0
STEPS:
- 10000
WARMUP_FACTOR: 0.01
WARMUP_ITERS: 333
WARMUP_METHOD: linear
WEIGHT_DECAY: 0.05
WEIGHT_DECAY_BIAS: null
WEIGHT_DECAY_NORM: 0.0
TEST:
AUG:
ENABLED: false
FLIP: true
MAX_SIZE: 4000
MIN_SIZES:
- 400
- 500
- 600
- 700
- 800
- 900
- 1000
- 1100
- 1200
DETECTIONS_PER_IMAGE: 100
EVAL_PERIOD: 1000
EXPECTED_RESULTS: []
KEYPOINT_OKS_SIGMAS: []
PRECISE_BN:
ENABLED: false
NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0
@@ -3,12 +3,12 @@ Brotli>=1.1.0
 click>=8.1.7
 fast-langdetect>=0.2.3,<0.3.0
 loguru>=0.6.0
-numpy>=1.21.6,<2.0.0
-pydantic>=2.7.2
-PyMuPDF>=1.24.9,<=1.24.14
+numpy>=1.21.6
+pydantic>=2.7.2,<2.11
+PyMuPDF>=1.24.9,<1.25.0
 scikit-learn>=1.0.2
 torch>=2.2.2,!=2.5.0,!=2.5.1,<=2.6.0
 torchvision
-transformers>=4.49.0
+transformers>=4.49.0,<5.0.0
 pdfminer.six==20231228
 # The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
@@ -37,7 +37,7 @@ if __name__ == '__main__':
         "models/Layout/YOLO/*",
         "models/MFD/YOLO/*",
         "models/MFR/unimernet_hf_small_2503/*",
-        "models/OCR/paddleocr/*",
+        "models/OCR/paddleocr_torch/*",
         # "models/TabRec/TableMaster/*",
         # "models/TabRec/StructEqTable/*",
     ]
@@ -47,11 +47,11 @@ if __name__ == '__main__':
     print(f'model_dir is: {model_dir}')
     print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
 
-    paddleocr_model_dir = model_dir + '/OCR/paddleocr'
-    user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
-    if os.path.exists(user_paddleocr_dir):
-        shutil.rmtree(user_paddleocr_dir)
-    shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)
+    # paddleocr_model_dir = model_dir + '/OCR/paddleocr'
+    # user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
+    # if os.path.exists(user_paddleocr_dir):
+    #     shutil.rmtree(user_paddleocr_dir)
+    # shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)
 
     json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/magic-pdf.template.json'
     config_file_name = 'magic-pdf.json'
@@ -38,7 +38,7 @@ if __name__ == '__main__':
         "models/Layout/YOLO/*",
         "models/MFD/YOLO/*",
         "models/MFR/unimernet_hf_small_2503/*",
-        "models/OCR/paddleocr/*",
+        "models/OCR/paddleocr_torch/*",
         # "models/TabRec/TableMaster/*",
         # "models/TabRec/StructEqTable/*",
     ]
@@ -54,11 +54,11 @@ if __name__ == '__main__':
     print(f'model_dir is: {model_dir}')
     print(f'layoutreader_model_dir is: {layoutreader_model_dir}')
 
-    paddleocr_model_dir = model_dir + '/OCR/paddleocr'
-    user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
-    if os.path.exists(user_paddleocr_dir):
-        shutil.rmtree(user_paddleocr_dir)
-    shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)
+    # paddleocr_model_dir = model_dir + '/OCR/paddleocr'
+    # user_paddleocr_dir = os.path.expanduser('~/.paddleocr')
+    # if os.path.exists(user_paddleocr_dir):
+    #     shutil.rmtree(user_paddleocr_dir)
+    # shutil.copytree(paddleocr_model_dir, user_paddleocr_dir)
 
     json_url = 'https://github.com/opendatalab/MinerU/raw/master/magic-pdf.template.json'
     config_file_name = 'magic-pdf.json'
@@ -26,9 +26,10 @@ if __name__ == '__main__':
     setup(
         name="magic_pdf",  # project name
         version=__version__,  # version number taken automatically from the git tag
-        packages=find_packages() + ["magic_pdf.resources"],  # include all packages
+        packages=find_packages() + ["magic_pdf.resources"] + ["magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorchocr.utils.resources"],  # include all packages
         package_data={
             "magic_pdf.resources": ["**"],  # include all files under magic_pdf.resources
+            "magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.pytorchocr.utils.resources": ["**"],  # all files under the pytorchocr resources directory
         },
         install_requires=parse_requirements('requirements.txt'),  # third-party dependencies of the project
         extras_require={
@@ -38,30 +39,21 @@ if __name__ == '__main__':
         ],
         "full": [
             "matplotlib<=3.9.0;platform_system=='Windows'",  # 3.9.1+ ships no prebuilt Windows wheels; pinned so Windows machines without a build toolchain can still install
-            "matplotlib;platform_system=='Linux' or platform_system=='Darwin'",  # do not cap matplotlib on Linux/macOS, to avoid bugs caused by being unable to update
+            "matplotlib>=3.10;platform_system=='Linux' or platform_system=='Darwin'",  # do not cap matplotlib on Linux/macOS, to avoid bugs caused by being unable to update
             "ultralytics>=8.3.48",  # YOLOv8, formula detection
-            "paddleocr==2.7.3",  # 2.8.0 and 2.8.1 conflict with detectron2; pin 2.7.3
-            "paddlepaddle==3.0.0rc1;platform_system=='Linux' or platform_system=='Darwin'",  # fixes the segfault on Linux
-            "paddlepaddle==2.6.1;platform_system=='Windows'",  # 3.0.0 is slower on Windows; pin 2.6.1
             "doclayout_yolo==0.0.2b1",  # doclayout_yolo
-            "rapidocr-paddle>=1.4.5,<2.0.0",  # rapidocr-paddle
-            "rapidocr_onnxruntime>=1.4.4,<2.0.0",
+            "dill>=0.3.9,<1",  # doclayout_yolo
             "rapid_table>=1.0.3,<2.0.0",  # rapid_table
-            "PyYAML",  # yaml
-            "ftfy"
-            "openai",  # openai SDK
+            "PyYAML>=6.0.2,<7",  # yaml
+            "ftfy>=6.3.1,<7",  # unimernet_hf
+            "openai>=1.70.0,<2",  # openai SDK
+            "shapely>=2.0.7,<3",  # imgaug-paddleocr2pytorch
+            "pyclipper>=1.3.0,<2",  # paddleocr2pytorch
+            "omegaconf>=2.3.0,<3",  # paddleocr2pytorch
         ],
         "old_linux":[
             "albumentations<=1.4.20",  # simsimd, introduced in 1.4.21, does not support Linux systems from 2019 or earlier
         ],
-        "layoutlmv3":[
-            "detectron2"
-        ],
-        "struct_eqtable":[
-            "struct-eqtable==0.3.2",  # table parsing
-            "einops",  # struct-eqtable dependency
-            "accelerate",  # struct-eqtable dependency
-        ],
         },
         description="A practical tool for converting PDF to Markdown",  # short description
         long_description=long_description,  # long description