# inference.py

# -*- coding: utf-8 -*-
import os
import re

import cv2
import numpy as np
from shapely.geometry import Polygon
import pyclipper
import migraphx
from PIL import Image
def AllocateOutputMemory(model):
    """Allocate one GPU buffer for every output reported by ``model``.

    Args:
        model: object whose ``get_outputs()`` returns a mapping from output
            name to the shape descriptor accepted by ``migraphx.allocate_gpu``.

    Returns:
        dict mapping each output name to its newly allocated GPU buffer.
    """
    output_shapes = model.get_outputs()
    return {
        name: migraphx.allocate_gpu(s=output_shapes[name])
        for name in output_shapes.keys()
    }

class BaseRecLabelDecode(object):
    """Map sequences of character indices (feature space) to text.

    The character table is either the built-in ``0-9a-z`` set or is read from
    a dictionary file holding one character per line. Subclasses can extend
    the table through ``add_special_char`` and change which indices are
    dropped through ``get_ignored_tokens``.
    """

    # Characters that are grouped into runs by ``pred_reverse``; a run keeps
    # its internal order while the sequence of runs/characters is reversed.
    _KEEP_ORDER_RE = re.compile("[a-zA-Z0-9 :*./%+-]")
    # ASCII letters and digits: the "en&num" class of ``get_word_info``.
    _ALNUM_RE = re.compile("[a-zA-Z0-9]")
    _DIGIT_RE = re.compile("[0-9]")

    def __init__(self, character_dict_path=None, use_space_char=False):
        """Build the index <-> character tables.

        Args:
            character_dict_path: path of a UTF-8 dictionary file with one
                character per line, or None to use ``0-9a-z``.
            use_space_char: append a space to the table (only applied when a
                dictionary file is given).
        """
        self.beg_str = "sos"
        self.end_str = "eos"
        self.reverse = False
        self.character_str = []

        if character_dict_path is None:
            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
            dict_character = list(self.character_str)
        else:
            with open(character_dict_path, "rb") as fin:
                for line in fin:
                    # Only line terminators are stripped so that entries made
                    # of other whitespace characters survive.
                    self.character_str.append(line.decode("utf-8").strip("\r\n"))
            if use_space_char:
                self.character_str.append(" ")
            dict_character = list(self.character_str)
            # Dictionaries whose path contains "arabic" get reversed output.
            if "arabic" in character_dict_path:
                self.reverse = True

        dict_character = self.add_special_char(dict_character)
        self.dict = {char: i for i, char in enumerate(dict_character)}
        self.character = dict_character

    def pred_reverse(self, pred):
        """Reverse ``pred`` while keeping runs of ASCII/number characters intact.

        Consecutive characters matching ``[a-zA-Z0-9 :*./%+-]`` form one run;
        every other character is its own element. The element order is then
        reversed and joined.
        """
        pred_re = []
        c_current = ""
        for c in pred:
            if self._KEEP_ORDER_RE.search(c):
                c_current += c
                continue
            if c_current != "":
                pred_re.append(c_current)
            pred_re.append(c)
            c_current = ""
        if c_current != "":
            pred_re.append(c_current)

        return "".join(pred_re[::-1])

    def add_special_char(self, dict_character):
        """Hook for subclasses to add special tokens; the base class adds none."""
        return dict_character

    def get_word_info(self, text, selection):
        """Group the decoded characters into words.

        Args:
            text: decoded string.
            selection: boolean mask over the model's time steps; the i-th
                True entry is the column that produced ``text[i]``.

        Returns:
            ``(word_list, word_col_list, state_list)``: per word, the list of
            characters, the list of source columns, and the word class
            (``"cn"`` or ``"en&num"``). Splitter characters are not emitted.
        """
        state = None
        word_content = []
        word_col_content = []
        word_list = []
        word_col_list = []
        state_list = []
        valid_col = np.where(selection)[0]

        for c_i, char in enumerate(text):
            if "\u4e00" <= char <= "\u9fff":
                c_state = "cn"
            elif self._ALNUM_RE.search(char):
                c_state = "en&num"
            else:
                c_state = "splitter"

            # A dot between an alphanumeric word and a digit (e.g. "3.14")
            # stays inside the word.
            if (
                char == "."
                and state == "en&num"
                and c_i + 1 < len(text)
                and self._DIGIT_RE.search(text[c_i + 1])
            ):
                c_state = "en&num"
            # A hyphen directly following an alphanumeric word stays inside it.
            if char == "-" and state == "en&num":
                c_state = "en&num"

            if state is None:
                state = c_state

            if state != c_state:
                # The character class changed: flush the word collected so far.
                if len(word_content) != 0:
                    word_list.append(word_content)
                    word_col_list.append(word_col_content)
                    state_list.append(state)
                    word_content = []
                    word_col_content = []
                state = c_state

            if state != "splitter":
                word_content.append(char)
                word_col_content.append(valid_col[c_i])

        if len(word_content) != 0:
            word_list.append(word_content)
            word_col_list.append(word_col_content)
            state_list.append(state)

        return word_list, word_col_list, state_list

    def decode(
        self,
        text_index,
        text_prob=None,
        is_remove_duplicate=False,
        return_word_box=False,
    ):
        """Turn index sequences into ``(text, confidence)`` results.

        Args:
            text_index: batch of index sequences (arrays or plain lists).
            text_prob: optional batch of per-step probabilities with the same
                layout as ``text_index``.
            is_remove_duplicate: drop an index equal to its predecessor.
            return_word_box: additionally return word grouping information.

        Returns:
            One entry per sequence: ``(text, mean_confidence)``, or
            ``(text, mean_confidence, [num_steps, word_list, word_col_list,
            state_list])`` when ``return_word_box`` is set.
        """
        result_list = []
        ignored_tokens = self.get_ignored_tokens()

        for batch_idx in range(len(text_index)):
            # np.asarray lets plain Python lists be masked like arrays.
            indices = np.asarray(text_index[batch_idx])
            selection = np.ones(len(indices), dtype=bool)
            if is_remove_duplicate:
                selection[1:] = indices[1:] != indices[:-1]
            for ignored_token in ignored_tokens:
                selection &= indices != ignored_token

            char_list = [self.character[text_id] for text_id in indices[selection]]
            if text_prob is not None:
                conf_list = np.asarray(text_prob[batch_idx])[selection]
            else:
                conf_list = [1] * len(selection)
            if len(conf_list) == 0:
                conf_list = [0]

            text = "".join(char_list)

            if self.reverse:
                text = self.pred_reverse(text)

            if return_word_box:
                word_list, word_col_list, state_list = self.get_word_info(
                    text, selection
                )
                result_list.append(
                    (
                        text,
                        np.mean(conf_list).tolist(),
                        [
                            len(indices),
                            word_list,
                            word_col_list,
                            state_list,
                        ],
                    )
                )
            else:
                result_list.append((text, np.mean(conf_list).tolist()))
        return result_list

    def get_ignored_tokens(self):
        """Return the indices removed during decoding (index 0 only)."""
        return [0]

class CTCLabelDecode(BaseRecLabelDecode):
    """CTC decoder: picks the best class per time step and decodes it to text.

    Index 0 of the character table is the CTC blank (see ``add_special_char``).
    """

    def __init__(self, character_dict_path=None, use_space_char=False, **kwargs):
        super(CTCLabelDecode, self).__init__(character_dict_path, use_space_char)

    def __call__(self, preds, label=None, return_word_box=False, *args, **kwargs):
        """Decode recognition outputs batch by batch.

        1. Take the most probable class (and its probability) at each step
           (original note: PP-OCRv5 predicts over 18385 characters).
        2. Map the class indices from feature space to characters.
        3. Collect the resulting strings.

        Args:
            preds: sequence with one array per batch; each array is indexed
                as (image, time step, class).
            label: optional index sequences decoded alongside the predictions.
            return_word_box: also return word grouping information; requires
                ``wh_ratio_list`` in ``kwargs``.
            **kwargs: ``wh_ratio_list`` - per batch, the resize ratios of the
                images, either flat or grouped per inference chunk.

        Returns:
            ``(batch_text_list, batch_label_list)``; the second list stays
            empty when ``label`` is None.
        """
        batch_text_list = []
        batch_label_list = []
        for b in range(len(preds)):
            batch_pred = np.asarray(preds[b])
            if batch_pred.size == 0:
                # No crops in this batch: nothing to decode (argmax over
                # axis 2 would raise on the empty array).
                text = []
            else:
                # Index and value of the most probable class per time step.
                preds_idx = batch_pred.argmax(axis=2)
                preds_prob = batch_pred.max(axis=2)
                text = self.decode(
                    preds_idx,
                    preds_prob,
                    is_remove_duplicate=True,
                    return_word_box=return_word_box,
                )
                if return_word_box:
                    ratios = self._flatten_ratios(kwargs["wh_ratio_list"][b])
                    for rec_idx, rec in enumerate(text):
                        rec[2][0] = rec[2][0] / ratios[rec_idx]

            batch_text_list.append(text)
            if label is not None:
                # Decode into a fresh value each time; rebinding ``label``
                # would feed decoded tuples back into ``decode`` on the next
                # batch. NOTE(review): confirm whether ``label`` is meant to
                # be indexed per batch.
                batch_label_list.append(self.decode(label))

        return batch_text_list, batch_label_list

    @staticmethod
    def _flatten_ratios(batch_ratios):
        """Flatten per-chunk ratio lists into one ratio per image."""
        flat = []
        for entry in batch_ratios:
            if isinstance(entry, (list, tuple)):
                flat.extend(entry)
            else:
                flat.append(entry)
        return flat

    def add_special_char(self, dict_character):
        """Prepend the CTC ``blank`` token so that it gets index 0."""
        dict_character = ["blank"] + dict_character
        return dict_character
    
class TextRecgnizer(object):
    """Text recognition model wrapper running an ONNX model through MIGraphX."""

    def __init__(
        self,
        rec_model_path,
        rec_batch_num=1,
        rec_input_size=(48, 480),  # (h, w)
        rec_algorithm="SVTR_LCNet",
        precision_mode="fp16",
        **kwargs
    ):
        """Load, optionally quantize, compile and warm up the model.

        Args:
            rec_model_path: path of the ``.onnx`` recognition model. When the
                path does not exist or has another suffix no model is loaded
                and calling the instance raises ``RuntimeError``.
            rec_batch_num: batch size the model is compiled for.
            rec_input_size: model input size as ``(height, width)``.
            rec_algorithm: stored on the instance; not used elsewhere here.
            precision_mode: ``"fp16"`` enables fp16 quantization.
            **kwargs: ``offload_copy`` (default True) is forwarded to
                ``compile``; when falsy, GPU buffers are managed explicitly.
        """
        self.rec_algorithm = rec_algorithm
        self.rec_input_size = rec_input_size
        self.precision_mode = precision_mode
        self.rec_batch_num = rec_batch_num

        self.offload_copy = kwargs.get("offload_copy", True)
        self.rec_model = None
        if not (os.path.exists(rec_model_path) and rec_model_path.endswith(".onnx")):
            return

        self.rec_input_name = "x"
        maxInput = {
            self.rec_input_name: [
                rec_batch_num, 3, self.rec_input_size[0], self.rec_input_size[1]
            ]
        }
        self.rec_model = migraphx.parse_onnx(rec_model_path, map_input_dims=maxInput)
        if self.precision_mode == "fp16":
            migraphx.quantize_fp16(self.rec_model)

        self.rec_model.compile(
            t=migraphx.get_target("gpu"), offload_copy=self.offload_copy, device_id=0
        )
        inputs = self.rec_model.get_inputs()
        outputs = self.rec_model.get_outputs()

        # Warm up once before real inference.
        in_data = np.ones(
            (rec_batch_num, 3, self.rec_input_size[0], self.rec_input_size[1]),
            dtype=np.float32,
        )
        if not self.offload_copy:
            self.d_mem = AllocateOutputMemory(self.rec_model)
            self.d_mem[self.rec_input_name] = migraphx.to_gpu(migraphx.argument(in_data))
            self.rec_model.run(self.d_mem)
        else:
            self.rec_model.run({self.rec_input_name: in_data})

        print("Text recognizition model info:")
        print(f"                      inputs info:{inputs}")
        print(f"                      outputs info:{outputs}")

    def __call__(self, batch_img_list):
        """Recognize every cropped image.

        1. Preprocess each image.
        2. Assemble chunks of ``rec_batch_num`` images.
        3. Run inference.
        4. Return the per-image character feature maps.

        Args:
            batch_img_list: list (one entry per batch) of lists of cropped
                images (H x W x 3 arrays).

        Returns:
            ``(batch_outputs_pre, batch_max_wh_ratio_pre)``: per batch, an
            array with one model output per image, and the resize ratios
            grouped per inference chunk. Both are empty lists for empty input.

        Raises:
            RuntimeError: if no model was loaded in ``__init__``.
        """
        if len(batch_img_list) == 0:
            # Same 2-tuple shape as the normal path so callers can unpack it.
            return [], []
        if getattr(self, "rec_model", None) is None:
            raise RuntimeError("recognition model is not loaded")

        input_batch = self.rec_batch_num
        batch_outputs_pre = []
        batch_max_wh_ratio_pre = []

        for crops in batch_img_list:
            im_count = len(crops)
            batch_outputs = []
            batch_max_wh_ratio = []
            for beg_img_no in range(0, im_count, input_batch):
                end_img_no = min(im_count, beg_img_no + input_batch)

                max_wh_ratio = []
                batch_norm_imgs = []
                for ino in range(beg_img_no, end_img_no):
                    norm_img = self.preprocess(crops[ino], max_wh_ratio)
                    batch_norm_imgs.append(norm_img[np.newaxis, :].astype(np.float32))
                batch_max_wh_ratio.append(max_wh_ratio)

                norm_img_batch = np.concatenate(batch_norm_imgs)
                valid_count = norm_img_batch.shape[0]
                if valid_count < input_batch:
                    # The model is compiled for a fixed batch size: pad the
                    # last chunk with blank images and drop their outputs.
                    filler = np.zeros(
                        (input_batch - valid_count,) + norm_img_batch.shape[1:],
                        dtype=np.float32,
                    )
                    norm_img_batch = np.concatenate([norm_img_batch, filler])
                norm_img_batch = np.ascontiguousarray(norm_img_batch)

                if not self.offload_copy:
                    print("offload copy model")
                    self.d_mem[self.rec_input_name] = migraphx.to_gpu(
                        migraphx.argument(norm_img_batch)
                    )
                    results = self.rec_model.run(self.d_mem)
                    # Copy the result back to the host before reading it,
                    # as TextDetector.__call__ does.
                    output = migraphx.from_gpu(results[0])
                else:
                    results = self.rec_model.run({self.rec_input_name: norm_img_batch})
                    output = results[0]

                batch_outputs.extend(np.array(output)[:valid_count])

            batch_outputs_pre.append(np.array(batch_outputs))
            batch_max_wh_ratio_pre.append(batch_max_wh_ratio)

        return batch_outputs_pre, batch_max_wh_ratio_pre

    def preprocess(self, img, max_wh_ratio):
        """Resize, normalize and right-pad one crop to the model input size.

        The image is scaled so its height equals the model height; the width
        is scaled by the same ratio and capped at the model width. Pixels are
        mapped to [-1, 1] and the remainder of the canvas is zero.

        Args:
            img: H x W x 3 image array.
            max_wh_ratio: list that receives the scale ratio of this image
                (appended in place).

        Returns:
            float32 array of shape ``(3, model_h, model_w)``.

        Raises:
            TypeError: if ``max_wh_ratio`` is not a list.
        """
        if not isinstance(max_wh_ratio, list):
            raise TypeError("max_wh_ratio must be list")

        imgH, imgW = self.rec_input_size
        padding_im = np.zeros((3, imgH, imgW), dtype=np.float32)
        h, w = img.shape[:2]
        if h == 0 or w == 0:
            # Degenerate crop: return a blank canvas instead of dividing by
            # zero; the neutral ratio keeps the ratio list aligned.
            max_wh_ratio.append(1.0)
            return padding_im

        # Resize along the height axis; the width follows the same ratio.
        ratio = imgH / h
        new_w = min(max(int(w * ratio), 1), imgW)
        max_wh_ratio.append(ratio)

        resized_image = cv2.resize(img, (new_w, imgH)).astype("float32")
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im[:, :, 0:new_w] = resized_image
        return padding_im

class TextDetector(object):
    """Text detection model wrapper (named DB / dbnet in this file) on MIGraphX.

    The model output is thresholded into a bitmap, contours are turned into
    boxes, boxes are expanded ("unclipped"), mapped back to the source image,
    sorted in reading order and normalized to four corner points.
    """

    def __init__(
        self,
        det_model_path,
        db_input_size=(640, 640),
        thresh=0.3,
        box_thresh=0.7,
        max_candidates=1000,
        unclip_ratio=2.0,
        use_dilation=False,
        score_mode="fast",
        box_type="quad",
        precision_mode="float32",
        **kwargs,
    ):
        """Load, optionally quantize, compile and warm up the model.

        Args:
            det_model_path: path of the ``.onnx`` detection model. When the
                path does not exist or has another suffix no model is loaded
                and calling the instance raises ``RuntimeError``.
            db_input_size: model input size as ``(height, width)``.
            thresh: threshold applied to the prediction map.
            box_thresh: minimum mean score of a box.
            max_candidates: maximum number of contours examined.
            unclip_ratio: expansion factor used by ``unclip``.
            use_dilation: dilate the bitmap with a 2x2 kernel before
                extracting contours.
            score_mode: ``"fast"`` or ``"slow"`` box scoring.
            box_type: ``"quad"`` or ``"poly"``.
            precision_mode: ``"fp16"`` enables fp16 quantization.
            **kwargs: ``offload_copy`` (default True) is forwarded to
                ``compile``; when falsy, GPU buffers are managed explicitly.
        """
        self.thresh = thresh
        self.db_input_size = db_input_size
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio
        self.min_size = 3
        self.score_mode = score_mode
        self.box_type = box_type
        self.precision_mode = precision_mode
        assert score_mode in [
            "slow",
            "fast",
        ], "Score mode not support: {}".format(score_mode)

        self.dilation_kernel = None if not use_dilation else np.array([[1, 1], [1, 1]])

        self.offload_copy = kwargs.get("offload_copy", True)
        self.db_model = None
        if not (os.path.exists(det_model_path) and det_model_path.endswith(".onnx")):
            return

        self.det_input_name = "x"
        maxInput = {self.det_input_name: [1, 3, db_input_size[0], db_input_size[1]]}
        self.db_model = migraphx.parse_onnx(det_model_path, map_input_dims=maxInput)
        inputs = self.db_model.get_inputs()
        outputs = self.db_model.get_outputs()

        if self.precision_mode == "fp16":
            migraphx.quantize_fp16(self.db_model)

        self.db_model.compile(
            t=migraphx.get_target("gpu"), offload_copy=self.offload_copy, device_id=0
        )

        # Warm up once before real inference.
        in_data = np.ones((1, 3, db_input_size[0], db_input_size[1]), dtype=np.float32)
        if not self.offload_copy:
            self.d_mem = AllocateOutputMemory(self.db_model)
            self.d_mem[self.det_input_name] = migraphx.to_gpu(migraphx.argument(in_data))
            self.db_model.run(self.d_mem)
        else:
            self.db_model.run({self.det_input_name: in_data})

        print("Detection model info:")
        print(f"                      inputs info:{inputs}")
        print(f"                      outputs info:{outputs}")

    @staticmethod
    def _find_contours(bitmap):
        """Return the contours of a binary map.

        ``cv2.findContours`` returns two or three values depending on the
        OpenCV version; the contours are the second-to-last value in both.
        """
        outs = cv2.findContours(
            (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
        )
        return outs[-2]

    def polygons_from_bitmap(self, pred, _bitmap, ratio_w, ratio_h, dest_width, dest_height):
        """Extract polygon boxes from the binary map.

        Args:
            pred: prediction map used for scoring.
            _bitmap: binary map (2-D) the contours are taken from.
            ratio_w, ratio_h: resize ratios used to map coordinates back to
                the source image.
            dest_width, dest_height: upper clipping bounds of the mapped
                coordinates.

        Returns:
            ``(boxes, scores)``: polygons as nested lists and their scores.
        """
        boxes = []
        scores = []
        # Extract the text regions.
        contours = self._find_contours(_bitmap)

        for contour in contours[: self.max_candidates]:
            epsilon = 0.002 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)
            points = approx.reshape((-1, 2))
            if points.shape[0] < 4:
                continue

            score = self.box_score_fast(pred, points.reshape(-1, 2))
            if self.box_thresh > score:
                continue

            box = self.unclip(points, self.unclip_ratio)
            # Only a single expanded polygon is usable.
            if len(box) > 1:
                continue
            box = np.array(box).reshape(-1, 2)
            if len(box) == 0:
                continue

            _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
            if sside < self.min_size + 2:
                continue

            box = np.array(box)
            box[:, 0] = np.clip(np.round(box[:, 0] / ratio_w), 0, dest_width)
            box[:, 1] = np.clip(np.round(box[:, 1] / ratio_h), 0, dest_height)
            boxes.append(box.tolist())
            scores.append(score)
        return boxes, scores

    def boxes_from_bitmap(self, pred, _bitmap, ratio_w, ratio_h, dest_width, dest_height):
        """Extract four-point boxes from the binary map.

        Arguments are the same as for ``polygons_from_bitmap``.

        Returns:
            ``(boxes, scores)``: an int32 array of boxes (shape ``(N, 4, 2)``,
            or ``(0,)`` when nothing is found) and the list of scores.
        """
        contours = self._find_contours(_bitmap)
        num_contours = min(len(contours), self.max_candidates)

        boxes = []
        scores = []
        for index in range(num_contours):
            contour = contours[index]
            points, sside = self.get_mini_boxes(contour)
            if sside < self.min_size:
                continue
            points = np.array(points)
            if self.score_mode == "fast":
                score = self.box_score_fast(pred, points.reshape(-1, 2))
            else:
                score = self.box_score_slow(pred, contour)
            if self.box_thresh > score:
                continue

            box = self.unclip(points, self.unclip_ratio)
            # Exactly one expanded polygon is required; an empty result
            # would make minAreaRect fail below.
            if len(box) != 1:
                continue
            box = np.array(box).reshape(-1, 1, 2)
            box, sside = self.get_mini_boxes(box)
            if sside < self.min_size + 2:
                continue
            box = np.array(box)

            box[:, 0] = np.clip(np.round(box[:, 0] / ratio_w), 0, dest_width)
            box[:, 1] = np.clip(np.round(box[:, 1] / ratio_h), 0, dest_height)
            boxes.append(box.astype("int32"))
            scores.append(score)
        return np.array(boxes, dtype="int32"), scores

    def unclip(self, box, unclip_ratio):
        """Expand a polygon outwards by ``area * unclip_ratio / perimeter``.

        Returns:
            The list of expanded paths from pyclipper, or an empty list for
            a polygon with zero perimeter.
        """
        poly = Polygon(box)
        if poly.length == 0:
            return []
        distance = poly.area * unclip_ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        return offset.Execute(distance)

    def get_mini_boxes(self, contour):
        """Return the minimum-area rectangle of ``contour``.

        Returns:
            ``(box, short_side)`` where ``box`` lists the four corners as
            top-left, top-right, bottom-right, bottom-left and ``short_side``
            is the shorter rectangle side.
        """
        bounding_box = cv2.minAreaRect(contour)
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        # The two left-most points form the left edge, the others the right
        # edge; within each pair the smaller y is the top corner.
        if points[1][1] > points[0][1]:
            top_left, bottom_left = points[0], points[1]
        else:
            top_left, bottom_left = points[1], points[0]
        if points[3][1] > points[2][1]:
            top_right, bottom_right = points[2], points[3]
        else:
            top_right, bottom_right = points[3], points[2]

        box = [top_left, top_right, bottom_right, bottom_left]
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        """Mean of ``bitmap`` inside the polygon ``_box`` (bounding-box crop)."""
        h, w = bitmap.shape[:2]
        box = _box.copy()
        xmin = np.clip(np.floor(box[:, 0].min()).astype("int32"), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype("int32"), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype("int32"), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype("int32"), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        # Shift the polygon into the coordinates of the cropped region.
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype("int32"), 1)
        return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]

    def box_score_slow(self, bitmap, contour):
        """Mean of ``bitmap`` inside the original ``contour``."""
        h, w = bitmap.shape[:2]
        contour = contour.copy()
        contour = np.reshape(contour, (-1, 2))

        xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
        xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
        ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
        ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)
        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        contour[:, 0] = contour[:, 0] - xmin
        contour[:, 1] = contour[:, 1] - ymin

        cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype("int32"), 1)
        return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]

    def box_standardization(self, boxes_batch, shape_list):
        """Order the corners of every box and clamp them to the image.

        Args:
            boxes_batch: per batch, the detected boxes.
            shape_list: per batch, ``(src_h, src_w, ratio_h, ratio_w)``.

        Returns:
            Per batch, a list of float32 ``(4, 2)`` arrays ordered top-left,
            top-right, bottom-right, bottom-left. Boxes whose first two edges
            are 3 pixels or shorter are dropped.
        """
        dt_batch_boxs = []
        for b in range(len(boxes_batch)):
            src_h, src_w, _, _ = shape_list[b]
            det_boxs = []
            for box in boxes_batch[b]:
                if isinstance(box, list):
                    box = np.array(box)

                rect = np.zeros((4, 2), dtype="float32")
                # Smallest x+y is the top-left corner, largest the bottom-right.
                s = box.sum(axis=1)
                rect[0] = box[np.argmin(s)]
                rect[2] = box[np.argmax(s)]

                tmp = np.delete(box, (np.argmin(s), np.argmax(s)), axis=0)
                # diff = y - x: smallest is the top-right corner (y < x),
                # largest the bottom-left corner (y > x).
                diff = np.diff(np.array(tmp), axis=1)
                rect[1] = tmp[np.argmin(diff)]
                rect[3] = tmp[np.argmax(diff)]
                for i in range(rect.shape[0]):
                    rect[i, 0] = int(min(max(rect[i, 0], 0), src_w - 1))
                    rect[i, 1] = int(min(max(rect[i, 1], 0), src_h - 1))
                b_w = int(np.linalg.norm(box[0] - box[1]))
                b_h = int(np.linalg.norm(box[0] - box[3]))
                if b_w <= 3 or b_h <= 3:
                    continue

                det_boxs.append(rect)

            dt_batch_boxs.append(det_boxs)

        return dt_batch_boxs

    def __call__(self, src_img):
        """Detect text boxes in ``src_img``.

        1. Preprocess.
        2. Run inference.
        3. Post-process the output into text-region boxes.
        4. Sort the boxes top-to-bottom, left-to-right.
        5. Boxes are expressed in source-image coordinates.

        Returns:
            A list with one entry (batch) holding the float32 ``(4, 2)`` boxes.

        Raises:
            RuntimeError: if no model was loaded in ``__init__``.
            ValueError: if ``box_type`` is neither ``"quad"`` nor ``"poly"``.
        """
        if getattr(self, "db_model", None) is None:
            raise RuntimeError("detection model is not loaded")

        data = self.preprocess(src_img)
        if not self.offload_copy:
            self.d_mem[self.det_input_name] = migraphx.to_gpu(
                migraphx.argument(data["image"])
            )
            results = self.db_model.run(self.d_mem)
            # The result lives on the GPU; copy it back before reading it.
            result = migraphx.from_gpu(results[0])
            print("offload copy model")
            result = np.array(result)
        else:
            results = self.db_model.run({self.det_input_name: data["image"]})
            result = results[0]

        shape_list = np.expand_dims(data["shape"], axis=0)
        pred = np.array(result)
        pred = pred[:, 0, :, :]
        segmentation = pred > self.thresh
        boxes_batch = []
        for batch_index in range(pred.shape[0]):
            src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
            if self.dilation_kernel is not None:
                mask = cv2.dilate(
                    np.array(segmentation[batch_index]).astype(np.uint8),
                    self.dilation_kernel,
                )
            else:
                mask = segmentation[batch_index]
            if self.box_type == "poly":
                boxes, scores = self.polygons_from_bitmap(
                    pred[batch_index], mask, ratio_w, ratio_h, src_w, src_h
                )
            elif self.box_type == "quad":
                boxes, scores = self.boxes_from_bitmap(
                    pred[batch_index], mask, ratio_w, ratio_h, src_w, src_h
                )
            else:
                raise ValueError("box_type can only be one of ['quad', 'poly']")

            boxes_batch.append(boxes)

        det_box_batch = self.sorted_boxes(boxes_batch)
        dt_boxes = self.box_standardization(det_box_batch, shape_list)
        return dt_boxes

    def preprocess(self, src_img,
                   mean=(0.485, 0.456, 0.406),
                   std=(0.229, 0.224, 0.225),
                   scale: float = 1.0/255):
        """Resize, pad and normalize ``src_img`` for the detection model.

        Args:
            src_img: H x W x 3 image array.
            mean, std: per-channel normalization constants.
            scale: factor applied to the pixel values before normalization.

        Returns:
            dict with ``"image"`` (float32 array of shape ``(1, 3, H, W)``)
            and ``"shape"`` (``[src_h, src_w, ratio_h, ratio_w]``).
        """
        img = src_img.copy()
        src_h, src_w, _ = img.shape

        res_img, [ratio_h, ratio_w] = self.resize_image(img)
        norm_img = (res_img * scale - mean) / std
        image_data = norm_img.transpose(2, 0, 1)
        image_data = np.expand_dims(image_data, axis=0).astype(np.float32)
        image_data = np.ascontiguousarray(image_data)

        return {
            "image": image_data,
            "shape": np.array([src_h, src_w, ratio_h, ratio_w]),
        }

    def resize_image(self, img):
        """Scale ``img`` to fit the model input and pad it with zeros.

        The image is scaled so that it fits inside ``db_input_size``
        (``(height, width)``, matching the dims given to the model), each
        side is rounded to a multiple of 32 (at least 32, at most the canvas
        side) and the result is placed in the top-left corner of the canvas.

        Returns:
            ``(padded_image, [ratio_h, ratio_w])`` with the effective
            per-axis resize ratios.

        Raises:
            ValueError: if OpenCV fails to resize the image.
        """
        h, w, _ = img.shape
        in_h, in_w = self.db_input_size[0], self.db_input_size[1]
        ratio = min(float(in_h) / h, float(in_w) / w)
        resize_h = int(h * ratio)
        resize_w = int(w * ratio)

        # Clamp to the canvas: rounding up to a multiple of 32 must not
        # exceed the padded image.
        resize_h = min(max(int(round(resize_h / 32) * 32), 32), in_h)
        resize_w = min(max(int(round(resize_w / 32) * 32), 32), in_w)

        try:
            img = cv2.resize(img, (resize_w, resize_h))
        except cv2.error as err:
            print(img.shape, resize_w, resize_h)
            raise ValueError("resize error") from err
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)

        im_pad = np.zeros((in_h, in_w, 3), np.float32)
        im_pad[:resize_h, :resize_w, :] = img
        return im_pad, [ratio_h, ratio_w]

    def sorted_boxes(self, dt_boxes):
        """Sort the boxes of the first batch in reading order.

        Boxes are ordered by the y (then x) coordinate of their first point;
        consecutive boxes whose y differs by less than 0.6 times the median
        box height are treated as one row and ordered by x inside it.

        Args:
            dt_boxes: per batch, the detected four-point boxes. Only the
                first batch is processed.

        Returns:
            A one-element list with the sorted int32 ``(N, 4, 2)`` array
            (``N`` may be 0), or an empty list when ``dt_boxes`` is empty.
        """
        if len(dt_boxes) == 0:
            return []
        boxes_np = np.array(dt_boxes[0], dtype=np.int32)
        if boxes_np.size == 0:
            # Nothing detected: keep the batch structure with an empty array.
            return [np.zeros((0, 4, 2), dtype=np.int32)]

        # Reference point (first corner) and height of every box.
        top_left = boxes_np[:, 0, :]
        heights = boxes_np[:, 2, 1] - boxes_np[:, 0, 1]
        avg_height = np.median(heights)

        # Primary key y, secondary key x.
        sorted_indices = np.lexsort((top_left[:, 0], top_left[:, 1]))

        final_order = []
        first = sorted_indices[0]
        # Seed the row with the real x of the first box; (x, original index).
        current_row = [(top_left[first, 0], first)]

        for idx in sorted_indices[1:]:
            # Same row when the y distance to the previous box is below
            # 0.6 times the median height.
            if abs(top_left[idx, 1] - top_left[current_row[-1][1], 1]) < avg_height * 0.6:
                current_row.append((top_left[idx, 0], idx))
            else:
                # Row finished: order it by x.
                current_row.sort(key=lambda item: item[0])
                final_order.extend(item[1] for item in current_row)
                current_row = [(top_left[idx, 0], idx)]

        # Flush the last row.
        current_row.sort(key=lambda item: item[0])
        final_order.extend(item[1] for item in current_row)
        return [boxes_np[final_order]]

 
class PPOcrV5():
    """End-to-end OCR pipeline: text detection, recognition and CTC decoding."""

    # Detector options that may be overridden through ``**kwargs``.
    _DETECTOR_DEFAULTS = {
        "max_candidates": 1000,
        "unclip_ratio": 2.0,
        "use_dilation": False,
        "score_mode": "fast",
        "box_type": "quad",
    }

    def __init__(self,
        det_model_path:str,
        rec_model_path:str,
        char_dict_path:str = "../Resource/ppocr_keys_v5.txt",
        db_input_size :tuple =  (640,640),
        rec_input_size :tuple = (48,720),
        seg_thresh:float=0.3,
        box_thresh:float=0.7,
        precision_mode:str='fp16',
        offload_copy:bool=True,
        **kwargs
        ):
        """Create the detector, the recognizer and the decoder.

        Args:
            det_model_path: path of the text detection model.
            rec_model_path: path of the text recognition model.
            char_dict_path: path of the character dictionary.
            db_input_size: input size of the detection model.
            rec_input_size: input size of the recognition model.
            seg_thresh: pixel segmentation threshold of the detector.
            box_thresh: text box score threshold.
            precision_mode: forwarded to both models.
            offload_copy: forwarded to both models.
            **kwargs: optional detector overrides: ``max_candidates``,
                ``unclip_ratio``, ``use_dilation``, ``score_mode``,
                ``box_type``.
        """
        self.seg_thres = seg_thresh
        self.box_thresh = box_thresh
        self.db_input_size = db_input_size
        self.offload_copy = offload_copy

        # kwargs is a dict, so options must be looked up by key (hasattr on
        # a dict never finds them).
        options = self._detector_options(kwargs)
        self.max_candidates = options["max_candidates"]
        self.unclip_ratio = options["unclip_ratio"]
        self.use_dilation = options["use_dilation"]
        self.score_mode = options["score_mode"]
        self.box_type = options["box_type"]

        self.db_detector = TextDetector(
                det_model_path,
                db_input_size,
                thresh=self.seg_thres,
                box_thresh=self.box_thresh,
                max_candidates=self.max_candidates,
                unclip_ratio=self.unclip_ratio,
                box_type=self.box_type,
                use_dilation=self.use_dilation,
                score_mode=self.score_mode,
                precision_mode=precision_mode,
                offload_copy=offload_copy
        )

        self.text_extractor = TextRecgnizer(rec_model_path=rec_model_path,
                                            rec_input_size=rec_input_size,
                                            precision_mode=precision_mode,
                                            offload_copy=offload_copy)

        self.ctc_decoder = CTCLabelDecode(character_dict_path=char_dict_path,
                                      use_space_char=True)

    @staticmethod
    def _detector_options(kwargs):
        """Return the detector options, taking overrides from ``kwargs``."""
        return {
            name: kwargs.get(name, default)
            for name, default in PPOcrV5._DETECTOR_DEFAULTS.items()
        }

    def __call__(self, src_img):
        """Run the full pipeline on ``src_img`` and return a visualization.

        Results with a confidence below 0.5 are dropped, the remaining texts
        are printed, and the boxes are drawn onto ``src_img`` in place.

        Returns:
            Image made of ``src_img`` (with boxes) next to a white canvas
            holding the recognized texts.
        """
        import time
        start = time.time()
        dt_boxs = self.db_detector(src_img)

        batch_img_list = self.detection_roi_crop(src_img, dt_boxs)
        batch_outputs_pre, batch_max_wh_ratio_pre = self.text_extractor(batch_img_list)
        batch_text_list, batch_label_list = self.ctc_decoder(
            batch_outputs_pre,
            return_word_box=False,
            wh_ratio_list=batch_max_wh_ratio_pre,
        )
        end = time.time()
        batch_text_out = []
        batch_boxes_out = []

        for b in range(len(dt_boxs)):
            text_out = []
            boxex_out = []
            for box, rec_result in zip(dt_boxs[b], batch_text_list[b]):
                score = rec_result[1]
                if score >= 0.5:
                    text_out.append(rec_result)
                    boxex_out.append(box)

            batch_text_out.append(text_out)
            batch_boxes_out.append(boxex_out)

        for texts in batch_text_out:
            for text, score in texts:
                print("{}, {:.3f}".format(text, score))

        res_img = self.vis_boxes(batch_boxes_out, src_img)
        res_img = self.vis_oct_text(batch_text_out, batch_boxes_out, res_img)
        print(f"[Time info] elapsed:{(end-start)*1000:.4f} ms")
        return res_img

    def detection_roi_crop(self, src_img, boxes):
        """Crop the axis-aligned region of every box from ``src_img``.

        Only the top-left and bottom-right corners of a box are used.

        Returns:
            Per batch, the list of cropped image views.
        """
        batch_cut_imgs = []
        for batch_boxes in boxes:
            crop_imgs = []
            for tl, tr, br, bl in batch_boxes:
                x0, y0, x1, y1 = int(tl[0]), int(tl[1]), int(br[0]), int(br[1])
                crop_imgs.append(src_img[y0:y1, x0:x1, :])
            batch_cut_imgs.append(crop_imgs)
        return batch_cut_imgs

    def vis_oct_text(self,batch_text,batch_boxes,src_img,fornt_path="../Resource/fonts/simfang.ttf"):
        """Draw the recognized texts on a white canvas next to ``src_img``.

        Args:
            batch_text: per batch, ``(text, confidence)`` pairs.
            batch_boxes: per batch, the box of every text.
            src_img: image placed on the left side of the result.
            fornt_path: path of the TrueType font used for drawing.

        Returns:
            ``src_img`` and the text canvas concatenated horizontally.
        """
        from PIL import Image, ImageDraw, ImageFont
        img = np.zeros(src_img.shape, dtype=np.uint8)
        img.fill(255)

        pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        draw = ImageDraw.Draw(pil_img)

        fonts = {}  # one font object per size instead of one per text
        for b in range(len(batch_text)):
            for text_idx, (text, conf) in enumerate(batch_text[b]):
                box = batch_boxes[b][text_idx]
                f_start = (box[0][0], box[0][1])
                f_end = (box[2][0], box[2][1])
                w, h = np.array(f_end) - np.array(f_start)
                # The font size follows the box height and must be positive.
                font_size = max(int(h * 0.9), 1)
                font = fonts.get(font_size)
                if font is None:
                    font = ImageFont.truetype(fornt_path, font_size, encoding="utf-8")
                    fonts[font_size] = font
                draw.text(f_start, text, font=font, fill=(0, 255, 0))

        res_img = np.concatenate([src_img, np.array(pil_img)], axis=1)
        return res_img

    def vis_boxes(self,boxes, img, colors=(0,255,0), thickness=2):
        """Draw the axis-aligned rectangle of every box onto ``img`` in place.

        Returns:
            ``img`` itself.
        """
        for batch_boxes in boxes:
            for tl, tr, br, bl in batch_boxes:
                top_left = (int(tl[0]), int(tl[1]))
                bottom_right = (int(br[0]), int(br[1]))
                cv2.rectangle(img, top_left, bottom_right, colors, thickness)

        return img

if __name__ == '__main__':
    # Demo: run the OCR pipeline on one image and save the visualization.
    det_onnx_path = "../Resource/Models/ppocrv5_server_det_infer.onnx"
    rec_onnx_path = "../Resource/Models/ppocrv5_server_rec_infer.onnx"
    image_path = "../Resource/Images/demo.png"
    img = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file cannot be read.
    if img is None:
        raise FileNotFoundError(f"Failed to read image: {image_path}")
    ppocrv5 = PPOcrV5(det_onnx_path,rec_onnx_path,offload_copy=True,precision_mode="fp16")
    res_img = ppocrv5(img)
    cv2.imwrite("res.jpg",res_img)