# predict_rec.py

import os
import sys
from PIL import Image
# Absolute directory of this script; appended to sys.path so modules that live
# next to it can be imported.
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
# The directory two levels up is put at the FRONT of sys.path so that the
# `tools.*` and `ppocr.*` imports below resolve (presumably the repository
# root -- confirm against the project layout).
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, '../..')))

# Set before `paddle` is imported further down.
# NOTE(review): presumably selects Paddle's auto-growth memory allocator -- confirm.
os.environ["FLAGS_allocator_strategy"] = 'auto_growth'

import cv2
import numpy as np
import math
import time
import traceback
import paddle

import tools.infer.utility as utility
from ppocr.postprocess import build_post_process
from ppocr.utils.logging import get_logger
from ppocr.utils.utility import get_image_file_list, check_and_read_gif

# Module-level logger shared by TextRecognizer construction and main().
logger = get_logger()

class TextRecognizer(object):
    """Recognize text in cropped text-line images with a CTC recognition model.

    Calling an instance with a list of images sorts them by aspect ratio,
    groups them into batches, normalizes every image of a batch to a common
    (bucketed) width, runs the predictor and decodes the output with the
    ``CTCLabelDecode`` post-process.

    Batch sizes are restricted to ``max_batnum`` for full batches and to a
    multiple of ``min_batnum`` for the final batch, which is padded with
    all-zero images when needed (presumably so the predictor only ever sees a
    small set of input shapes -- confirm with the deployment target).

    Environment variables read at call time:

    * ``OCR_REC_PRECISION``  -- ``'0'``, ``'1'``, ``'2'`` select 1, 2 or 4
      width buckets; any other value (or unset) selects 6 buckets.
    * ``OCR_REC_MAX_BATNUM`` -- size of a full batch (default 24).
    * ``OCR_REC_MIN_BATNUM`` -- granularity of the final batch (default 8).
    """

    # Number of width buckets per OCR_REC_PRECISION value.
    _WIDTH_BUCKETS = {'0': 1, '1': 2, '2': 4}
    _DEFAULT_WIDTH_BUCKETS = 6

    def __init__(self, args):
        """Build the post-process op and the predictor from parsed arguments.

        Args:
            args: namespace providing ``rec_image_shape`` (comma-separated
                "C,H,W" string), ``rec_batch_num``, ``rec_char_dict_path``
                and ``use_space_char``, plus whatever
                ``utility.create_predictor`` reads.
        """
        self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
        self.rec_batch_num = args.rec_batch_num
        postprocess_params = {
            'name': 'CTCLabelDecode',
            "character_dict_path": args.rec_char_dict_path,
            "use_space_char": args.use_space_char
        }
        self.postprocess_op = build_post_process(postprocess_params)
        self.predictor, self.input_tensor, self.output_tensors, self.config = \
            utility.create_predictor(args, 'rec', logger)

    def resize_norm_img_section(self, img, max_wh_ratio):
        """Resize, normalize and right-pad one image to a bucketed width.

        The target width is derived from ``max_wh_ratio`` and rounded up to
        the next of N equally spaced widths in ``(0, imgH * 48]``, where N
        depends on ``OCR_REC_PRECISION`` (see the class docstring).

        Args:
            img: H x W x C image array; C must equal the configured channel
                count.
            max_wh_ratio: largest width/height ratio in the current batch.

        Returns:
            float32 array of shape ``(imgC, imgH, bucket_width)``. Pixel
            values are mapped as ``(x / 255 - 0.5) / 0.5``; columns right of
            the resized image are zero.
        """
        imgC, imgH, imgW = self.rec_image_shape

        assert imgC == img.shape[2]

        max_w = imgH * 48
        num_buckets = self._WIDTH_BUCKETS.get(
            os.environ.get("OCR_REC_PRECISION"), self._DEFAULT_WIDTH_BUCKETS)
        if num_buckets == 1:
            imgW = max_w
        else:
            # 48 is divisible by 2, 4 and 6, so every bucket edge is an
            # exact integer and integer arithmetic is lossless here.
            step = max_w // num_buckets
            wanted_w = int(imgH * max_wh_ratio)
            bucket = -(-wanted_w // step)  # ceil division
            imgW = min(max(bucket, 1), num_buckets) * step

        h, w = img.shape[:2]
        ratio = w / float(h)
        # Keep the image's own aspect ratio, but never exceed the bucket width.
        resized_w = min(imgW, int(math.ceil(imgH * ratio)))
        resized_image = cv2.resize(img, (resized_w, imgH))
        resized_image = resized_image.astype('float32')
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im

    def _recognize_batch(self, img_list, batch_indices, batch_size, rec_res):
        """Run one batch through the predictor and store results in rec_res.

        Args:
            img_list: all input images.
            batch_indices: positions in ``img_list`` forming this batch.
            batch_size: number of rows fed to the predictor; when larger than
                ``len(batch_indices)`` the batch is padded with zero images.
            rec_res: result list, updated in place at ``batch_indices``.
        """
        max_wh_ratio = 0
        for idx in batch_indices:
            h, w = img_list[idx].shape[0:2]
            max_wh_ratio = max(max_wh_ratio, w * 1.0 / h)

        normed = [
            self.resize_norm_img_section(img_list[idx], max_wh_ratio)
            for idx in batch_indices
        ]
        norm_img_batch = np.stack(normed, axis=0)
        pad_count = batch_size - norm_img_batch.shape[0]
        if pad_count > 0:
            filler = np.zeros(
                (pad_count,) + norm_img_batch.shape[1:], dtype=np.float32)
            norm_img_batch = np.concatenate([norm_img_batch, filler])

        self.input_tensor.copy_from_cpu(norm_img_batch)
        self.predictor.run()

        outputs = [tensor.copy_to_cpu() for tensor in self.output_tensors]
        preds = outputs[0] if len(outputs) == 1 else outputs
        rec_result = self.postprocess_op(preds)
        # zip stops at the real images, so outputs produced for padding rows
        # are dropped instead of leaking into the result list.
        for idx, result in zip(batch_indices, rec_result):
            rec_res[idx] = result

    def __call__(self, img_list):
        """Recognize every image in ``img_list``.

        Args:
            img_list: list of H x W x C image arrays.

        Returns:
            ``(rec_res, elapsed)``: ``rec_res[i]`` is the post-process result
            for ``img_list[i]`` (exactly one entry per input image), and
            ``elapsed`` is the wall-clock time in seconds spent on batching
            and inference (``0`` for an empty input).

        Raises:
            AssertionError: if the max batch size is not a multiple of the
                min batch size.
        """
        img_num = len(img_list)
        if img_num <= 0:
            return [], 0

        max_batnum = int(os.environ.get("OCR_REC_MAX_BATNUM", 24))
        min_batnum = int(os.environ.get("OCR_REC_MIN_BATNUM", 8))
        assert max_batnum % min_batnum == 0, "max_batnum must be multiple of min_batnum."

        # Sorting by aspect ratio puts similarly shaped images in the same
        # batch, which speeds up recognition.
        width_list = [img.shape[1] / float(img.shape[0]) for img in img_list]
        indices = np.argsort(np.array(width_list))

        # One independent placeholder per image (no shared inner list).
        rec_res = [['', 0.0] for _ in range(img_num)]
        st = time.time()

        # Full batches of exactly max_batnum images.
        full_end = (img_num // max_batnum) * max_batnum
        for beg_img_no in range(0, full_end, max_batnum):
            self._recognize_batch(
                img_list, indices[beg_img_no:beg_img_no + max_batnum],
                max_batnum, rec_res)

        # Remaining images, padded up to the next multiple of min_batnum.
        # Skipped entirely when nothing is left (previously this case
        # crashed on concatenating an empty list).
        tail_indices = indices[full_end:]
        tail_count = len(tail_indices)
        if tail_count > 0:
            padded_size = ((tail_count + min_batnum - 1) // min_batnum) * min_batnum
            self._recognize_batch(img_list, tail_indices, padded_size, rec_res)

        return rec_res, time.time() - st


def main(args):
    """Recognize every image found under ``args.image_dir`` and log the results.

    Args:
        args: parsed command-line namespace. This function reads
            ``image_dir``, ``warmup``, ``rec_batch_num`` and ``benchmark``;
            the rest is consumed by ``TextRecognizer``.

    Images that cannot be loaded are logged and skipped. If recognition
    raises, the traceback is logged and the process exits.
    """
    image_file_list = get_image_file_list(args.image_dir)
    text_recognizer = TextRecognizer(args)
    valid_image_file_list = []
    img_list = []

    logger.info(
        "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
        "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320"
    )
    # Warm up the predictor twice on a random image batch before timing
    # anything real.
    if args.warmup:
        img = np.random.uniform(0, 255, [48, 320, 3]).astype(np.uint8)
        for _ in range(2):
            text_recognizer([img] * int(args.rec_batch_num))

    for image_file in image_file_list:
        img, flag = check_and_read_gif(image_file)
        if not flag:
            # Not handled by the GIF reader: fall back to OpenCV.
            img = cv2.imread(image_file)
        if img is None:
            logger.info("error in loading image:{}".format(image_file))
            continue
        valid_image_file_list.append(image_file)
        img_list.append(img)

    try:
        rec_res, _ = text_recognizer(img_list)
    except Exception as E:
        logger.info(traceback.format_exc())
        logger.info(E)
        # sys.exit() instead of the site-provided exit() helper, which is
        # not guaranteed to exist (e.g. under ``python -S``).
        sys.exit()

    # zip pairs each loaded file with its result and ignores any surplus
    # entries in rec_res.
    for image_file, result in zip(valid_image_file_list, rec_res):
        logger.info("Predicts of {}:{}".format(image_file, result))

    if args.benchmark:
        # TextRecognizer in this file does not define ``autolog``; guard the
        # access so --benchmark does not end in an AttributeError.
        autolog = getattr(text_recognizer, "autolog", None)
        if autolog is not None:
            autolog.report()
        else:
            logger.info("benchmark requested but the recognizer has no autolog; skipping report")


if __name__ == "__main__":
    # Script entry point: parse the command-line arguments, then run recognition.
    cli_args = utility.parse_args()
    main(cli_args)