import argparse import os import sys import platform import cv2 import numpy as np import paddle from PIL import Image, ImageDraw, ImageFont import math from paddle import inference import time from ppocr.utils.logging import get_logger def str2bool(v): return v.lower() in ("true", "t", "1") def init_args(): parser = argparse.ArgumentParser() # params for prediction engine parser.add_argument("--use_gpu", type=str2bool, default=True) parser.add_argument("--use_xpu", type=str2bool, default=False) parser.add_argument("--ir_optim", type=str2bool, default=True) parser.add_argument("--use_tensorrt", type=str2bool, default=False) parser.add_argument("--min_subgraph_size", type=int, default=15) parser.add_argument("--precision", type=str, default="fp32") parser.add_argument("--gpu_mem", type=int, default=500) # params for text detector parser.add_argument("--image_dir", type=str) parser.add_argument("--det_algorithm", type=str, default='DB') parser.add_argument("--det_model_dir", type=str) parser.add_argument("--det_limit_side_len", type=float, default=960) parser.add_argument("--det_limit_type", type=str, default='max') # DB parmas parser.add_argument("--det_db_thresh", type=float, default=0.3) parser.add_argument("--det_db_box_thresh", type=float, default=0.6) parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) parser.add_argument("--max_batch_size", type=int, default=10) parser.add_argument("--use_dilation", type=str2bool, default=False) parser.add_argument("--det_db_score_mode", type=str, default="fast") parser.add_argument("--vis_seg_map", type=str2bool, default=False) # params for text recognizer parser.add_argument("--rec_algorithm", type=str, default='SVTR_LCNet') parser.add_argument("--rec_model_dir", type=str) parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") parser.add_argument("--rec_batch_num", type=int, default=6) parser.add_argument("--max_text_length", type=int, default=25) parser.add_argument( "--rec_char_dict_path", type=str, default="./ppocr/utils/ppocr_keys_v1.txt") parser.add_argument("--use_space_char", type=str2bool, default=True) parser.add_argument( "--vis_font_path", type=str, default="./doc/fonts/simfang.ttf") parser.add_argument("--drop_score", type=float, default=0.5) # params for text classifier parser.add_argument("--use_angle_cls", type=str2bool, default=False) parser.add_argument("--cls_model_dir", type=str) parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") parser.add_argument("--label_list", type=list, default=['0', '180']) parser.add_argument("--cls_batch_num", type=int, default=6) parser.add_argument("--cls_thresh", type=float, default=0.9) parser.add_argument("--enable_mkldnn", type=str2bool, default=False) parser.add_argument("--cpu_threads", type=int, default=10) parser.add_argument("--use_pdserving", type=str2bool, default=False) parser.add_argument("--warmup", type=str2bool, default=False) # parser.add_argument( "--draw_img_save_dir", type=str, default="./inference_results") parser.add_argument("--save_crop_res", type=str2bool, default=False) parser.add_argument("--crop_res_save_dir", type=str, default="./output") # multi-process parser.add_argument("--use_mp", type=str2bool, default=False) parser.add_argument("--total_process_num", type=int, default=1) parser.add_argument("--process_id", type=int, default=0) parser.add_argument("--benchmark", type=str2bool, default=False) parser.add_argument("--save_log_path", type=str, default="./log_output/") parser.add_argument("--show_log", type=str2bool, default=True) parser.add_argument("--use_onnx", type=str2bool, default=False) return parser def parse_args(): parser = init_args() return parser.parse_args() def create_predictor(args, mode, logger): if mode == "det": model_dir = args.det_model_dir elif mode == 'cls': model_dir = args.cls_model_dir else: model_dir = args.rec_model_dir if model_dir is None: logger.info("not find {} model file path {}".format(mode, model_dir)) sys.exit(0) if args.use_onnx: import onnxruntime as ort model_file_path = model_dir if not os.path.exists(model_file_path): raise ValueError("not find model file path {}".format( model_file_path)) sess = ort.InferenceSession(model_file_path, providers=[('ROCMExecutionProvider', {'device_id': '4'}),'CPUExecutionProvider']) return sess, sess.get_inputs()[0], None, None else: model_file_path = model_dir + "/inference.pdmodel" params_file_path = model_dir + "/inference.pdiparams" if not os.path.exists(model_file_path): raise ValueError("not find model file path {}".format( model_file_path)) if not os.path.exists(params_file_path): raise ValueError("not find params file path {}".format( params_file_path)) config = inference.Config(model_file_path, params_file_path) if hasattr(args, 'precision'): if args.precision == "fp16" and args.use_tensorrt: precision = inference.PrecisionType.Half print("fp16 set success!") elif args.precision == "int8": precision = inference.PrecisionType.Int8 else: precision = inference.PrecisionType.Float32 else: precision = inference.PrecisionType.Float32 if args.use_gpu: gpu_id = get_infer_gpuid() if gpu_id is None: logger.warning( "GPU is not found in current device by nvidia-smi. Please check your device or ignore it if run on jetson." ) config.enable_use_gpu(args.gpu_mem, 0) use_dynamic_shape = True if mode == "det": min_input_shape = { "x": [1, 3, 50, 50], "conv2d_92.tmp_0": [1, 120, 20, 20], "conv2d_91.tmp_0": [1, 24, 10, 10], "conv2d_59.tmp_0": [1, 96, 20, 20], "nearest_interp_v2_1.tmp_0": [1, 256, 10, 10], "nearest_interp_v2_2.tmp_0": [1, 256, 20, 20], "conv2d_124.tmp_0": [1, 256, 20, 20], "nearest_interp_v2_3.tmp_0": [1, 64, 20, 20], "nearest_interp_v2_4.tmp_0": [1, 64, 20, 20], "nearest_interp_v2_5.tmp_0": [1, 64, 20, 20], "elementwise_add_7": [1, 56, 2, 2], "nearest_interp_v2_0.tmp_0": [1, 256, 2, 2] } max_input_shape = { "x": [1, 3, 1536, 1536], "conv2d_92.tmp_0": [1, 120, 400, 400], "conv2d_91.tmp_0": [1, 24, 200, 200], "conv2d_59.tmp_0": [1, 96, 400, 400], "nearest_interp_v2_1.tmp_0": [1, 256, 200, 200], "conv2d_124.tmp_0": [1, 256, 400, 400], "nearest_interp_v2_2.tmp_0": [1, 256, 400, 400], "nearest_interp_v2_3.tmp_0": [1, 64, 400, 400], "nearest_interp_v2_4.tmp_0": [1, 64, 400, 400], "nearest_interp_v2_5.tmp_0": [1, 64, 400, 400], "elementwise_add_7": [1, 56, 400, 400], "nearest_interp_v2_0.tmp_0": [1, 256, 400, 400] } opt_input_shape = { "x": [1, 3, 640, 640], "conv2d_92.tmp_0": [1, 120, 160, 160], "conv2d_91.tmp_0": [1, 24, 80, 80], "conv2d_59.tmp_0": [1, 96, 160, 160], "nearest_interp_v2_1.tmp_0": [1, 256, 80, 80], "nearest_interp_v2_2.tmp_0": [1, 256, 160, 160], "conv2d_124.tmp_0": [1, 256, 160, 160], "nearest_interp_v2_3.tmp_0": [1, 64, 160, 160], "nearest_interp_v2_4.tmp_0": [1, 64, 160, 160], "nearest_interp_v2_5.tmp_0": [1, 64, 160, 160], "elementwise_add_7": [1, 56, 40, 40], "nearest_interp_v2_0.tmp_0": [1, 256, 40, 40] } min_pact_shape = { "nearest_interp_v2_26.tmp_0": [1, 256, 20, 20], "nearest_interp_v2_27.tmp_0": [1, 64, 20, 20], "nearest_interp_v2_28.tmp_0": [1, 64, 20, 20], "nearest_interp_v2_29.tmp_0": [1, 64, 20, 20] } max_pact_shape = { "nearest_interp_v2_26.tmp_0": [1, 256, 400, 400], "nearest_interp_v2_27.tmp_0": [1, 64, 400, 400], "nearest_interp_v2_28.tmp_0": [1, 64, 400, 400], "nearest_interp_v2_29.tmp_0": [1, 64, 400, 400] } opt_pact_shape = { "nearest_interp_v2_26.tmp_0": [1, 256, 160, 160], "nearest_interp_v2_27.tmp_0": [1, 64, 160, 160], "nearest_interp_v2_28.tmp_0": [1, 64, 160, 160], "nearest_interp_v2_29.tmp_0": [1, 64, 160, 160] } min_input_shape.update(min_pact_shape) max_input_shape.update(max_pact_shape) opt_input_shape.update(opt_pact_shape) elif mode == "rec": if args.rec_algorithm not in ["CRNN", "SVTR_LCNet"]: use_dynamic_shape = False imgH = int(args.rec_image_shape.split(',')[-2]) min_input_shape = {"x": [1, 3, imgH, 10]} max_input_shape = {"x": [args.rec_batch_num, 3, imgH, 2304]} opt_input_shape = {"x": [args.rec_batch_num, 3, imgH, 320]} config.exp_disable_tensorrt_ops(["transpose2"]) elif mode == "cls": min_input_shape = {"x": [1, 3, 48, 10]} max_input_shape = {"x": [args.rec_batch_num, 3, 48, 1024]} opt_input_shape = {"x": [args.rec_batch_num, 3, 48, 320]} else: use_dynamic_shape = False if use_dynamic_shape: config.set_trt_dynamic_shape_info( min_input_shape, max_input_shape, opt_input_shape) elif args.use_xpu: config.enable_xpu(10 * 1024 * 1024) else: config.disable_gpu() if hasattr(args, "cpu_threads"): config.set_cpu_math_library_num_threads(args.cpu_threads) else: # default cpu threads as 10 config.set_cpu_math_library_num_threads(10) if args.enable_mkldnn: # cache 10 different shapes for mkldnn to avoid memory leak config.set_mkldnn_cache_capacity(10) config.enable_mkldnn() if args.precision == "fp16": config.enable_mkldnn_bfloat16() # enable memory optim config.enable_memory_optim() config.disable_glog_info() config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass") config.delete_pass("matmul_transpose_reshape_fuse_pass") if mode == 'table': config.delete_pass("fc_fuse_pass") # not supported for table config.switch_use_feed_fetch_ops(False) config.switch_ir_optim(True) # create predictor predictor = inference.create_predictor(config) input_names = predictor.get_input_names() for name in input_names: input_tensor = predictor.get_input_handle(name) output_tensors = get_output_tensors(args, mode, predictor) return predictor, input_tensor, output_tensors, config def get_output_tensors(args, mode, predictor): output_names = predictor.get_output_names() output_tensors = [] if mode == "rec" and args.rec_algorithm in ["CRNN", "SVTR_LCNet"]: output_name = 'softmax_0.tmp_0' if output_name in output_names: return [predictor.get_output_handle(output_name)] else: for output_name in output_names: output_tensor = predictor.get_output_handle(output_name) output_tensors.append(output_tensor) else: for output_name in output_names: output_tensor = predictor.get_output_handle(output_name) output_tensors.append(output_tensor) return output_tensors def get_infer_gpuid(): sysstr = platform.system() if sysstr == "Windows": return 0 if not paddle.core.is_compiled_with_rocm(): cmd = "env | grep CUDA_VISIBLE_DEVICES" else: cmd = "env | grep HIP_VISIBLE_DEVICES" env_cuda = os.popen(cmd).readlines() if len(env_cuda) == 0: return 0 else: gpu_id = env_cuda[0].strip().split("=")[1] return int(gpu_id[0]) def draw_text_det_res(dt_boxes, img_path): src_im = cv2.imread(img_path) for box in dt_boxes: box = np.array(box).astype(np.int32).reshape(-1, 2) cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2) return src_im def draw_ocr_box_txt(image, boxes, txts, scores=None, drop_score=0.5, font_path="./doc/simfang.ttf"): h, w = image.height, image.width img_left = image.copy() img_right = Image.new('RGB', (w, h), (255, 255, 255)) import random random.seed(0) draw_left = ImageDraw.Draw(img_left) draw_right = ImageDraw.Draw(img_right) for idx, (box, txt) in enumerate(zip(boxes, txts)): if scores is not None and scores[idx] < drop_score: continue color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) draw_left.polygon(box, fill=color) draw_right.polygon( [ box[0][0], box[0][1], box[1][0], box[1][1], box[2][0], box[2][1], box[3][0], box[3][1] ], outline=color) box_height = math.sqrt((box[0][0] - box[3][0])**2 + (box[0][1] - box[3][ 1])**2) box_width = math.sqrt((box[0][0] - box[1][0])**2 + (box[0][1] - box[1][ 1])**2) if box_height > 2 * box_width: font_size = max(int(box_width * 0.9), 10) font = ImageFont.truetype(font_path, font_size, encoding="utf-8") cur_y = box[0][1] for c in txt: char_size = font.getsize(c) draw_right.text( (box[0][0] + 3, cur_y), c, fill=(0, 0, 0), font=font) cur_y += char_size[1] else: font_size = max(int(box_height * 0.8), 10) font = ImageFont.truetype(font_path, font_size, encoding="utf-8") draw_right.text( [box[0][0], box[0][1]], txt, fill=(0, 0, 0), font=font) img_left = Image.blend(image, img_left, 0.5) img_show = Image.new('RGB', (w * 2, h), (255, 255, 255)) img_show.paste(img_left, (0, 0, w, h)) img_show.paste(img_right, (w, 0, w * 2, h)) return np.array(img_show) def get_rotate_crop_image(img, points): ''' img_height, img_width = img.shape[0:2] left = int(np.min(points[:, 0])) right = int(np.max(points[:, 0])) top = int(np.min(points[:, 1])) bottom = int(np.max(points[:, 1])) img_crop = img[top:bottom, left:right, :].copy() points[:, 0] = points[:, 0] - left points[:, 1] = points[:, 1] - top ''' assert len(points) == 4, "shape of points must be 4*2" img_crop_width = int( max( np.linalg.norm(points[0] - points[1]), np.linalg.norm(points[2] - points[3]))) img_crop_height = int( max( np.linalg.norm(points[0] - points[3]), np.linalg.norm(points[1] - points[2]))) pts_std = np.float32([[0, 0], [img_crop_width, 0], [img_crop_width, img_crop_height], [0, img_crop_height]]) M = cv2.getPerspectiveTransform(points, pts_std) dst_img = cv2.warpPerspective( img, M, (img_crop_width, img_crop_height), borderMode=cv2.BORDER_REPLICATE, flags=cv2.INTER_CUBIC) dst_img_height, dst_img_width = dst_img.shape[0:2] if dst_img_height * 1.0 / dst_img_width >= 1.5: dst_img = np.rot90(dst_img) return dst_img if __name__ == '__main__': pass