Merge pull request #1597 from tink2123/dygraph_for_srn

【Do not merge】Add srn model

Merge pull request #1597 from tink2123/dygraph_for_srn
【Do not merge】Add srn model
acd479ea · xiaoting · GitHub · fad40158 · 6ebbbfe4 · acd479ea
Unverified Commit acd479ea authored Jan 29, 2021 by xiaoting Committed by GitHub Jan 29, 2021
6 changed files
--- a/ppocr/postprocess/__init__.py
+++ b/ppocr/postprocess/__init__.py
@@ -26,11 +26,12 @@ def build_post_process(config, global_config=None):
    from .db_postprocess import DBPostProcess
    from .east_postprocess import EASTPostProcess
    from .sast_postprocess import SASTPostProcess
-    from .rec_postprocess import CTCLabelDecode, AttnLabelDecode
+    from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode
    from .cls_postprocess import ClsPostProcess

    support_dict = [
-        'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess'
+        'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode',
+        'AttnLabelDecode', 'ClsPostProcess', 'SRNLabelDecode'
    ]

    config = copy.deepcopy(config)

--- a/ppocr/postprocess/rec_postprocess.py
+++ b/ppocr/postprocess/rec_postprocess.py
@@ -33,6 +33,9 @@ class BaseRecLabelDecode(object):
        assert character_type in support_character_type, "Only {} are supported now but get {}".format(
            support_character_type, character_type)

+        self.beg_str = "sos"
+        self.end_str = "eos"
+
        if character_type == "en":
            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
            dict_character = list(self.character_str)
@@ -109,7 +112,6 @@ class CTCLabelDecode(BaseRecLabelDecode):
    def __call__(self, preds, label=None, *args, **kwargs):
        if isinstance(preds, paddle.Tensor):
            preds = preds.numpy()
-
        preds_idx = preds.argmax(axis=2)
        preds_prob = preds.max(axis=2)
        text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True)
@@ -158,3 +160,84 @@ class AttnLabelDecode(BaseRecLabelDecode):
            assert False, "unsupport type %s in get_beg_end_flag_idx" \
                          % beg_or_end
        return idx
+
+
+class SRNLabelDecode(BaseRecLabelDecode):
+    """ Convert between text-label and text-index """
+
+    def __init__(self,
+                 character_dict_path=None,
+                 character_type='en',
+                 use_space_char=False,
+                 **kwargs):
+        super(SRNLabelDecode, self).__init__(character_dict_path,
+                                             character_type, use_space_char)
+
+    def __call__(self, preds, label=None, *args, **kwargs):
+        pred = preds['predict']
+        char_num = len(self.character_str) + 2
+        if isinstance(pred, paddle.Tensor):
+            pred = pred.numpy()
+        pred = np.reshape(pred, [-1, char_num])
+
+        preds_idx = np.argmax(pred, axis=1)
+        preds_prob = np.max(pred, axis=1)
+
+        preds_idx = np.reshape(preds_idx, [-1, 25])
+
+        preds_prob = np.reshape(preds_prob, [-1, 25])
+
+        text = self.decode(preds_idx, preds_prob)
+
+        if label is None:
+            text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
+            return text
+        label = self.decode(label)
+        return text, label
+
+    def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
+        """ convert text-index into text-label. """
+        result_list = []
+        ignored_tokens = self.get_ignored_tokens()
+        batch_size = len(text_index)
+
+        for batch_idx in range(batch_size):
+            char_list = []
+            conf_list = []
+            for idx in range(len(text_index[batch_idx])):
+                if text_index[batch_idx][idx] in ignored_tokens:
+                    continue
+                if is_remove_duplicate:
+                    # only for predict
+                    if idx > 0 and text_index[batch_idx][idx - 1] == text_index[
+                            batch_idx][idx]:
+                        continue
+                char_list.append(self.character[int(text_index[batch_idx][
+                    idx])])
+                if text_prob is not None:
+                    conf_list.append(text_prob[batch_idx][idx])
+                else:
+                    conf_list.append(1)
+
+            text = ''.join(char_list)
+            result_list.append((text, np.mean(conf_list)))
+        return result_list
+
+    def add_special_char(self, dict_character):
+        dict_character = dict_character + [self.beg_str, self.end_str]
+        return dict_character
+
+    def get_ignored_tokens(self):
+        beg_idx = self.get_beg_end_flag_idx("beg")
+        end_idx = self.get_beg_end_flag_idx("end")
+        return [beg_idx, end_idx]
+
+    def get_beg_end_flag_idx(self, beg_or_end):
+        if beg_or_end == "beg":
+            idx = np.array(self.dict[self.beg_str])
+        elif beg_or_end == "end":
+            idx = np.array(self.dict[self.end_str])
+        else:
+            assert False, "unsupport type %s in get_beg_end_flag_idx" \
+                          % beg_or_end
+        return idx
--- a/tools/export_model.py
+++ b/tools/export_model.py
@@ -31,6 +31,14 @@ from ppocr.utils.logging import get_logger
 from tools.program import load_config, merge_config, ArgsParser


+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-c", "--config", help="configuration file to use")
+    parser.add_argument(
+        "-o", "--output_path", type=str, default='./output/infer/')
+    return parser.parse_args()
+
+
 def main():
    FLAGS = ArgsParser().parse_args()
    config = load_config(FLAGS.config)
@@ -52,23 +60,39 @@ def main():

    save_path = '{}/inference'.format(config['Global']['save_inference_dir'])

-    infer_shape = [3, -1, -1]
-    if config['Architecture']['model_type'] == "rec":
-        infer_shape = [3, 32, -1]  # for rec model, H must be 32
-        if 'Transform' in config['Architecture'] and config['Architecture'][
-                'Transform'] is not None and config['Architecture'][
-                    'Transform']['name'] == 'TPS':
-            logger.info(
-                'When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training'
-            )
-            infer_shape[-1] = 100
-
-    model = to_static(
-        model,
-        input_spec=[
+    if config['Architecture']['algorithm'] == "SRN":
+        other_shape = [
            paddle.static.InputSpec(
-                shape=[None] + infer_shape, dtype='float32')
-        ])
+                shape=[None, 1, 64, 256], dtype='float32'), [
+                    paddle.static.InputSpec(
+                        shape=[None, 256, 1],
+                        dtype="int64"), paddle.static.InputSpec(
+                            shape=[None, 25, 1],
+                            dtype="int64"), paddle.static.InputSpec(
+                                shape=[None, 8, 25, 25], dtype="int64"),
+                    paddle.static.InputSpec(
+                        shape=[None, 8, 25, 25], dtype="int64")
+                ]
+        ]
+        model = to_static(model, input_spec=other_shape)
+    else:
+        infer_shape = [3, -1, -1]
+        if config['Architecture']['model_type'] == "rec":
+            infer_shape = [3, 32, -1]  # for rec model, H must be 32
+            if 'Transform' in config['Architecture'] and config['Architecture'][
+                    'Transform'] is not None and config['Architecture'][
+                        'Transform']['name'] == 'TPS':
+                logger.info(
+                    'When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training'
+                )
+                infer_shape[-1] = 100
+        model = to_static(
+            model,
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None] + infer_shape, dtype='float32')
+            ])
+
    paddle.jit.save(model, save_path)
    logger.info('inference model is saved to {}'.format(save_path))


--- a/tools/infer/predict_rec.py
+++ b/tools/infer/predict_rec.py
@@ -25,6 +25,7 @@ import numpy as np
 import math
 import time
 import traceback
+import paddle

 import tools.infer.utility as utility
 from ppocr.postprocess import build_post_process
@@ -46,6 +47,13 @@ class TextRecognizer(object):
            "character_dict_path": args.rec_char_dict_path,
            "use_space_char": args.use_space_char
        }
+        if self.rec_algorithm == "SRN":
+            postprocess_params = {
+                'name': 'SRNLabelDecode',
+                "character_type": args.rec_char_type,
+                "character_dict_path": args.rec_char_dict_path,
+                "use_space_char": args.use_space_char
+            }
        self.postprocess_op = build_post_process(postprocess_params)
        self.predictor, self.input_tensor, self.output_tensors = \
            utility.create_predictor(args, 'rec', logger)
@@ -70,6 +78,78 @@ class TextRecognizer(object):
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im

+    def resize_norm_img_srn(self, img, image_shape):
+        imgC, imgH, imgW = image_shape
+
+        img_black = np.zeros((imgH, imgW))
+        im_hei = img.shape[0]
+        im_wid = img.shape[1]
+
+        if im_wid <= im_hei * 1:
+            img_new = cv2.resize(img, (imgH * 1, imgH))
+        elif im_wid <= im_hei * 2:
+            img_new = cv2.resize(img, (imgH * 2, imgH))
+        elif im_wid <= im_hei * 3:
+            img_new = cv2.resize(img, (imgH * 3, imgH))
+        else:
+            img_new = cv2.resize(img, (imgW, imgH))
+
+        img_np = np.asarray(img_new)
+        img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
+        img_black[:, 0:img_np.shape[1]] = img_np
+        img_black = img_black[:, :, np.newaxis]
+
+        row, col, c = img_black.shape
+        c = 1
+
+        return np.reshape(img_black, (c, row, col)).astype(np.float32)
+
+    def srn_other_inputs(self, image_shape, num_heads, max_text_length):
+
+        imgC, imgH, imgW = image_shape
+        feature_dim = int((imgH / 8) * (imgW / 8))
+
+        encoder_word_pos = np.array(range(0, feature_dim)).reshape(
+            (feature_dim, 1)).astype('int64')
+        gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
+            (max_text_length, 1)).astype('int64')
+
+        gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
+        gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
+            [-1, 1, max_text_length, max_text_length])
+        gsrm_slf_attn_bias1 = np.tile(
+            gsrm_slf_attn_bias1,
+            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
+
+        gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
+            [-1, 1, max_text_length, max_text_length])
+        gsrm_slf_attn_bias2 = np.tile(
+            gsrm_slf_attn_bias2,
+            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
+
+        encoder_word_pos = encoder_word_pos[np.newaxis, :]
+        gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
+
+        return [
+            encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
+            gsrm_slf_attn_bias2
+        ]
+
+    def process_image_srn(self, img, image_shape, num_heads, max_text_length):
+        norm_img = self.resize_norm_img_srn(img, image_shape)
+        norm_img = norm_img[np.newaxis, :]
+
+        [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
+            self.srn_other_inputs(image_shape, num_heads, max_text_length)
+
+        gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
+        gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
+        encoder_word_pos = encoder_word_pos.astype(np.int64)
+        gsrm_word_pos = gsrm_word_pos.astype(np.int64)
+
+        return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
+                gsrm_slf_attn_bias2)
+
    def __call__(self, img_list):
        img_num = len(img_list)
        # Calculate the aspect ratio of all text bars
@@ -93,21 +173,64 @@ class TextRecognizer(object):
                wh_ratio = w * 1.0 / h
                max_wh_ratio = max(max_wh_ratio, wh_ratio)
            for ino in range(beg_img_no, end_img_no):
-                # norm_img = self.resize_norm_img(img_list[ino], max_wh_ratio)
-                norm_img = self.resize_norm_img(img_list[indices[ino]],
-                                                max_wh_ratio)
-                norm_img = norm_img[np.newaxis, :]
-                norm_img_batch.append(norm_img)
+                if self.rec_algorithm != "SRN":
+                    norm_img = self.resize_norm_img(img_list[indices[ino]],
+                                                    max_wh_ratio)
+                    norm_img = norm_img[np.newaxis, :]
+                    norm_img_batch.append(norm_img)
+                else:
+                    norm_img = self.process_image_srn(
+                        img_list[indices[ino]], self.rec_image_shape, 8, 25)
+                    encoder_word_pos_list = []
+                    gsrm_word_pos_list = []
+                    gsrm_slf_attn_bias1_list = []
+                    gsrm_slf_attn_bias2_list = []
+                    encoder_word_pos_list.append(norm_img[1])
+                    gsrm_word_pos_list.append(norm_img[2])
+                    gsrm_slf_attn_bias1_list.append(norm_img[3])
+                    gsrm_slf_attn_bias2_list.append(norm_img[4])
+                    norm_img_batch.append(norm_img[0])
            norm_img_batch = np.concatenate(norm_img_batch)
            norm_img_batch = norm_img_batch.copy()
-            starttime = time.time()
-            self.input_tensor.copy_from_cpu(norm_img_batch)
-            self.predictor.run()
-            outputs = []
-            for output_tensor in self.output_tensors:
-                output = output_tensor.copy_to_cpu()
-                outputs.append(output)
-            preds = outputs[0]
+
+            if self.rec_algorithm == "SRN":
+                starttime = time.time()
+                encoder_word_pos_list = np.concatenate(encoder_word_pos_list)
+                gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list)
+                gsrm_slf_attn_bias1_list = np.concatenate(
+                    gsrm_slf_attn_bias1_list)
+                gsrm_slf_attn_bias2_list = np.concatenate(
+                    gsrm_slf_attn_bias2_list)
+
+                inputs = [
+                    norm_img_batch,
+                    encoder_word_pos_list,
+                    gsrm_word_pos_list,
+                    gsrm_slf_attn_bias1_list,
+                    gsrm_slf_attn_bias2_list,
+                ]
+                input_names = self.predictor.get_input_names()
+                for i in range(len(input_names)):
+                    input_tensor = self.predictor.get_input_handle(input_names[
+                        i])
+                    input_tensor.copy_from_cpu(inputs[i])
+                self.predictor.run()
+                outputs = []
+                for output_tensor in self.output_tensors:
+                    output = output_tensor.copy_to_cpu()
+                    outputs.append(output)
+                preds = {"predict": outputs[2]}
+            else:
+                starttime = time.time()
+                self.input_tensor.copy_from_cpu(norm_img_batch)
+                self.predictor.run()
+
+                outputs = []
+                for output_tensor in self.output_tensors:
+                    output = output_tensor.copy_to_cpu()
+                    outputs.append(output)
+                preds = outputs[0]
+
            rec_result = self.postprocess_op(preds)
            for rno in range(len(rec_result)):
                rec_res[indices[beg_img_no + rno]] = rec_result[rno]

--- a/tools/infer_rec.py
+++ b/tools/infer_rec.py
@@ -62,7 +62,13 @@ def main():
        elif op_name in ['RecResizeImg']:
            op[op_name]['infer_mode'] = True
        elif op_name == 'KeepKeys':
-            op[op_name]['keep_keys'] = ['image']
+            if config['Architecture']['algorithm'] == "SRN":
+                op[op_name]['keep_keys'] = [
+                    'image', 'encoder_word_pos', 'gsrm_word_pos',
+                    'gsrm_slf_attn_bias1', 'gsrm_slf_attn_bias2'
+                ]
+            else:
+                op[op_name]['keep_keys'] = ['image']
        transforms.append(op)
    global_config['infer_mode'] = True
    ops = create_operators(transforms, global_config)
@@ -74,10 +80,25 @@ def main():
            img = f.read()
            data = {'image': img}
        batch = transform(data, ops)
+        if config['Architecture']['algorithm'] == "SRN":
+            encoder_word_pos_list = np.expand_dims(batch[1], axis=0)
+            gsrm_word_pos_list = np.expand_dims(batch[2], axis=0)
+            gsrm_slf_attn_bias1_list = np.expand_dims(batch[3], axis=0)
+            gsrm_slf_attn_bias2_list = np.expand_dims(batch[4], axis=0)
+
+            others = [
+                paddle.to_tensor(encoder_word_pos_list),
+                paddle.to_tensor(gsrm_word_pos_list),
+                paddle.to_tensor(gsrm_slf_attn_bias1_list),
+                paddle.to_tensor(gsrm_slf_attn_bias2_list)
+            ]

        images = np.expand_dims(batch[0], axis=0)
        images = paddle.to_tensor(images)
-        preds = model(images)
+        if config['Architecture']['algorithm'] == "SRN":
+            preds = model(images, others)
+        else:
+            preds = model(images)
        post_result = post_process_class(preds)
        for rec_reuslt in post_result:
            logger.info('\t result: {}'.format(rec_reuslt))

--- a/tools/program.py
+++ b/tools/program.py
@@ -174,6 +174,7 @@ def train(config,
    best_model_dict = {main_indicator: 0}
    best_model_dict.update(pre_best_model_dict)
    train_stats = TrainingStats(log_smooth_window, ['lr'])
+    model_average = False
    model.train()

    if 'start_epoch' in best_model_dict:
@@ -194,7 +195,12 @@ def train(config,
                break
            lr = optimizer.get_lr()
            images = batch[0]
-            preds = model(images)
+            if config['Architecture']['algorithm'] == "SRN":
+                others = batch[-4:]
+                preds = model(images, others)
+                model_average = True
+            else:
+                preds = model(images)
            loss = loss_class(preds, batch)
            avg_loss = loss['loss']
            avg_loss.backward()
@@ -238,7 +244,14 @@ def train(config,
            # eval
            if global_step > start_eval_step and \
                    (global_step - start_eval_step) % eval_batch_step == 0 and dist.get_rank() == 0:
-                cur_metric = eval(model, valid_dataloader, post_process_class,
+                if model_average:
+                    Model_Average = paddle.incubate.optimizer.ModelAverage(
+                        0.15,
+                        parameters=model.parameters(),
+                        min_average_window=10000,
+                        max_average_window=15625)
+                    Model_Average.apply()
+                cur_metirc = eval(model, valid_dataloader, post_process_class,
                                  eval_class)
                cur_metric_str = 'cur metric, {}'.format(', '.join(
                    ['{}: {}'.format(k, v) for k, v in cur_metric.items()]))
@@ -273,6 +286,7 @@ def train(config,
                                          best_model_dict[main_indicator],
                                          global_step)
            global_step += 1
+            optimizer.clear_grad()
            batch_start = time.time()
        if dist.get_rank() == 0:
            save_model(
@@ -312,8 +326,9 @@ def eval(model, valid_dataloader, post_process_class, eval_class):
            if idx >= len(valid_dataloader):
                break
            images = batch[0]
+            others = batch[-4:]
            start = time.time()
-            preds = model(images)
+            preds = model(images, others)

            batch = [item.numpy() for item in batch]
            # Obtain usable results from post-processing methods