Merge pull request #3106 from MissPenguin/dygraph

add train code for table

Merge pull request #3106 from MissPenguin/dygraph
add train code for table
01c4ee5d · MissPenguin · GitHub · 7bcea8d0 · 7bcabe0f · 01c4ee5d
Unverified Commit 01c4ee5d authored Jun 23, 2021 by MissPenguin Committed by GitHub Jun 23, 2021
8 changed files
--- a/ppocr/modeling/heads/table_att_head.py
+++ b/ppocr/modeling/heads/table_att_head.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+class TableAttentionHead(nn.Layer):
+    """Attention decoder head producing table-structure scores and 4-value boxes.
+
+    At every decoding step an ``AttentionGRUCell`` attends over the flattened
+    feature map and updates a GRU hidden state. The stacked step outputs are
+    mapped by ``structure_generator`` to ``elem_num`` scores per step and by
+    ``loc_generator`` to 4 sigmoid-activated values per step.
+
+    Args:
+        in_channels: sequence of channel counts; only the last entry is used
+            as the feature size fed to the attention cell.
+        hidden_size: size of the GRU hidden state.
+        loc_type: ``1`` predicts locations from the step outputs alone; any
+            other value additionally concatenates a linear re-projection of
+            the visual features (``loc_fea_trans``).
+        in_max_len: selects the input width of ``loc_fea_trans``
+            (640 -> 400, 800 -> 625, anything else -> 256). Presumably this is
+            the number of flattened feature positions for that input size --
+            TODO confirm against the backbone stride.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(self, in_channels, hidden_size, loc_type, in_max_len=488, **kwargs):
+        super(TableAttentionHead, self).__init__()
+        self.input_size = in_channels[-1]
+        self.hidden_size = hidden_size
+        # Width of the one-hot element encoding and of the structure scores.
+        self.elem_num = 30
+        # max_text_length and max_cell_num are stored but not read in this class.
+        self.max_text_length = 100
+        # Decoding always runs max_elem_length + 1 steps (see forward).
+        self.max_elem_length = 500
+        self.max_cell_num = 500
+        self.structure_attention_cell = AttentionGRUCell(
+            self.input_size, hidden_size, self.elem_num, use_gru=False)
+        self.structure_generator = nn.Linear(hidden_size, self.elem_num)
+        self.loc_type = loc_type
+        self.in_max_len = in_max_len
+        if self.loc_type == 1:
+            self.loc_generator = nn.Linear(hidden_size, 4)
+        else:
+            # loc_fea_trans maps the flattened position axis to
+            # max_elem_length + 1 so it can be concatenated step-wise with the
+            # decoder outputs in forward().
+            if self.in_max_len == 640:
+                self.loc_fea_trans = nn.Linear(400, self.max_elem_length+1)
+            elif self.in_max_len == 800:
+                self.loc_fea_trans = nn.Linear(625, self.max_elem_length+1)
+            else:
+                self.loc_fea_trans = nn.Linear(256, self.max_elem_length+1)
+            self.loc_generator = nn.Linear(self.input_size + hidden_size, 4)
+    def _char_to_onehot(self, input_char, onehot_dim):
+        """Return ``F.one_hot(input_char, onehot_dim)``."""
+        input_ont_hot = F.one_hot(input_char, onehot_dim)
+        return input_ont_hot
+    def forward(self, inputs, targets=None):
+        """Decode structure scores and locations from the last input feature.
+
+        Args:
+            inputs: sequence of features; only ``inputs[-1]`` is used. A 3-D
+                tensor is used as-is, otherwise every axis after the second is
+                flattened and moved in front of the channel axis.
+            targets: when the layer is in training mode and this is not None,
+                ``targets[0]`` supplies the ground-truth element index for
+                every step (teacher forcing).
+
+        Returns:
+            dict with ``'structure_probs'`` and ``'loc_preds'``. In the
+            teacher-forced branch ``structure_probs`` holds the raw linear
+            outputs; in the other branch softmax is applied. ``loc_preds`` is
+            sigmoid-activated in both branches.
+        """
+        # if and else branch are both needed when you want to assign a variable
+        # if you modify the var in just one branch, then the modification will not work.
+        fea = inputs[-1]
+        if len(fea.shape) == 3:
+            pass
+        else:
+            last_shape = int(np.prod(fea.shape[2:]))  # product of all axes after the channel axis
+            fea = paddle.reshape(fea, [fea.shape[0], fea.shape[1], last_shape])
+            fea = fea.transpose([0, 2, 1])  # (NTC)(batch, width, channels)
+        batch_size = fea.shape[0]
+        hidden = paddle.zeros((batch_size, self.hidden_size))
+        output_hiddens = []
+        if self.training and targets is not None:
+            # Teacher forcing: step i is conditioned on ground-truth element i.
+            structure = targets[0]
+            for i in range(self.max_elem_length+1):
+                elem_onehots = self._char_to_onehot(
+                    structure[:, i], onehot_dim=self.elem_num)
+                (outputs, hidden), alpha = self.structure_attention_cell(
+                    hidden, fea, elem_onehots)
+                output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
+            output = paddle.concat(output_hiddens, axis=1)
+            structure_probs = self.structure_generator(output)
+            if self.loc_type == 1:
+                loc_preds = self.loc_generator(output)
+                loc_preds = F.sigmoid(loc_preds)
+            else:
+                # Re-project the position axis to the number of decoding steps,
+                # then pair each step output with its projected visual feature.
+                loc_fea = fea.transpose([0, 2, 1])
+                loc_fea = self.loc_fea_trans(loc_fea)
+                loc_fea = loc_fea.transpose([0, 2, 1])
+                loc_concat = paddle.concat([output, loc_fea], axis=2)
+                loc_preds = self.loc_generator(loc_concat)
+                loc_preds = F.sigmoid(loc_preds)
+        else:
+            # Greedy decoding: each step is conditioned on the argmax of the
+            # previous step; the first step is fed element index 0.
+            temp_elem = paddle.zeros(shape=[batch_size], dtype="int32")
+            # Pre-assigned before the loop; presumably required so the loop
+            # variables exist for dygraph-to-static conversion -- TODO confirm.
+            structure_probs = None
+            loc_preds = None
+            elem_onehots = None
+            outputs = None
+            alpha = None
+            # NOTE(review): the bound is wrapped in a tensor, presumably so the
+            # while loop is captured as a graph loop when exported -- confirm.
+            max_elem_length = paddle.to_tensor(self.max_elem_length)
+            i = 0
+            while i < max_elem_length+1:
+                elem_onehots = self._char_to_onehot(
+                    temp_elem, onehot_dim=self.elem_num)
+                (outputs, hidden), alpha = self.structure_attention_cell(
+                    hidden, fea, elem_onehots)
+                output_hiddens.append(paddle.unsqueeze(outputs, axis=1))
+                structure_probs_step = self.structure_generator(outputs)
+                temp_elem = structure_probs_step.argmax(axis=1, dtype="int32")
+                i += 1
+            output = paddle.concat(output_hiddens, axis=1)
+            structure_probs = self.structure_generator(output)
+            structure_probs = F.softmax(structure_probs)
+            if self.loc_type == 1:
+                loc_preds = self.loc_generator(output)
+                loc_preds = F.sigmoid(loc_preds)
+            else:
+                # Same location path as in the teacher-forced branch.
+                loc_fea = fea.transpose([0, 2, 1])
+                loc_fea = self.loc_fea_trans(loc_fea)
+                loc_fea = loc_fea.transpose([0, 2, 1])
+                loc_concat = paddle.concat([output, loc_fea], axis=2)
+                loc_preds = self.loc_generator(loc_concat)
+                loc_preds = F.sigmoid(loc_preds)
+        return {'structure_probs':structure_probs, 'loc_preds':loc_preds}
+class AttentionGRUCell(nn.Layer):
+    """One additive-attention step followed by a GRU cell update.
+
+    Args:
+        input_size: size of the last axis of the attended features.
+        hidden_size: size of the recurrent hidden state.
+        num_embeddings: width of the one-hot vector appended to the context.
+        use_gru: accepted for signature compatibility; not read here.
+    """
+    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
+        super(AttentionGRUCell, self).__init__()
+        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
+        self.h2h = nn.Linear(hidden_size, hidden_size)
+        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
+        # The recurrent cell consumes the context vector joined with the one-hot.
+        rnn_input_size = input_size + num_embeddings
+        self.rnn = nn.GRUCell(
+            input_size=rnn_input_size, hidden_size=hidden_size)
+        self.hidden_size = hidden_size
+    def forward(self, prev_hidden, batch_H, char_onehots):
+        """Attend over ``batch_H`` and advance the GRU by one step.
+
+        Returns:
+            ``(rnn_result, alpha)`` where ``rnn_result`` is whatever the GRU
+            cell returns for this step and ``alpha`` holds the attention
+            weights, transposed with ``[0, 2, 1]``.
+        """
+        keys = self.i2h(batch_H)
+        # Insert an axis at position 1 so the query broadcasts over every position.
+        query = paddle.unsqueeze(self.h2h(prev_hidden), axis=1)
+        energy = self.score(paddle.tanh(paddle.add(keys, query)))
+        # Normalise over axis 1, then swap the last two axes for the matmul.
+        alpha = paddle.transpose(F.softmax(energy, axis=1), [0, 2, 1])
+        context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
+        rnn_input = paddle.concat([context, char_onehots], 1)
+        return self.rnn(rnn_input, prev_hidden), alpha
+class AttentionLSTM(nn.Layer):
+    """Attention decoder built on ``AttentionLSTMCell``.
+
+    Args:
+        in_channels: feature size handed to the attention cell.
+        out_channels: number of classes (one-hot width and generator output).
+        hidden_size: size of the LSTM hidden and cell states.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
+        super(AttentionLSTM, self).__init__()
+        self.input_size = in_channels
+        self.hidden_size = hidden_size
+        self.num_classes = out_channels
+        self.attention_cell = AttentionLSTMCell(
+            in_channels, hidden_size, out_channels, use_gru=False)
+        self.generator = nn.Linear(hidden_size, out_channels)
+    def _char_to_onehot(self, input_char, onehot_dim):
+        """Return ``F.one_hot(input_char, onehot_dim)``."""
+        return F.one_hot(input_char, onehot_dim)
+    def forward(self, inputs, targets=None, batch_max_length=25):
+        """Run ``batch_max_length`` decoding steps and return the generator outputs.
+
+        With ``targets`` the ground-truth index of each step conditions the
+        cell; without it each step is conditioned on the argmax of the
+        previous step, starting from index 0. Returns ``None`` in the latter
+        mode when ``batch_max_length`` is 0.
+        """
+        n_batch = inputs.shape[0]
+        state = (paddle.zeros((n_batch, self.hidden_size)), paddle.zeros(
+            (n_batch, self.hidden_size)))
+        if targets is not None:
+            step_hiddens = []
+            for step in range(batch_max_length):
+                # one-hot vectors for the step-th char
+                onehots = self._char_to_onehot(
+                    targets[:, step], onehot_dim=self.num_classes)
+                cell_result, _ = self.attention_cell(state, inputs, onehots)
+                # cell_result[1] is the (h, c) pair carried to the next step.
+                state = (cell_result[1][0], cell_result[1][1])
+                step_hiddens.append(paddle.unsqueeze(state[0], axis=1))
+            return self.generator(paddle.concat(step_hiddens, axis=1))
+        prev_chars = paddle.zeros(shape=[n_batch], dtype="int32")
+        probs = None
+        for _ in range(batch_max_length):
+            onehots = self._char_to_onehot(
+                prev_chars, onehot_dim=self.num_classes)
+            cell_result, _ = self.attention_cell(state, inputs, onehots)
+            step_scores = self.generator(cell_result[0])
+            state = (cell_result[1][0], cell_result[1][1])
+            step_scores_3d = paddle.unsqueeze(step_scores, axis=1)
+            if probs is None:
+                probs = step_scores_3d
+            else:
+                probs = paddle.concat([probs, step_scores_3d], axis=1)
+            # Greedy choice feeds the next step.
+            prev_chars = step_scores.argmax(axis=1)
+        return probs
+class AttentionLSTMCell(nn.Layer):
+    """One additive-attention step followed by an LSTM (or GRU) cell update.
+
+    Args:
+        input_size: size of the last axis of the attended features.
+        hidden_size: size of the recurrent state.
+        num_embeddings: width of the one-hot vector appended to the context.
+        use_gru: build ``nn.GRUCell`` instead of ``nn.LSTMCell`` when true.
+    """
+    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
+        super(AttentionLSTMCell, self).__init__()
+        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
+        self.h2h = nn.Linear(hidden_size, hidden_size)
+        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
+        cell_cls = nn.GRUCell if use_gru else nn.LSTMCell
+        self.rnn = cell_cls(
+            input_size=input_size + num_embeddings, hidden_size=hidden_size)
+        self.hidden_size = hidden_size
+    def forward(self, prev_hidden, batch_H, char_onehots):
+        """Attend over ``batch_H`` and advance the recurrent cell by one step.
+
+        ``prev_hidden[0]`` drives the attention query while the whole
+        ``prev_hidden`` is handed to the recurrent cell. Returns
+        ``(rnn_result, alpha)``.
+        """
+        keys = self.i2h(batch_H)
+        # Insert an axis at position 1 so the query broadcasts over every position.
+        query = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1)
+        energy = self.score(paddle.tanh(paddle.add(keys, query)))
+        alpha = paddle.transpose(F.softmax(energy, axis=1), [0, 2, 1])
+        context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
+        rnn_input = paddle.concat([context, char_onehots], 1)
+        return self.rnn(rnn_input, prev_hidden), alpha
--- a/ppocr/modeling/necks/__init__.py
+++ b/ppocr/modeling/necks/__init__.py
@@ -21,7 +21,8 @@ def build_neck(config):
    from .sast_fpn import SASTFPN
    from .rnn import SequenceEncoder
    from .pg_fpn import PGFPN
-    support_dict = ['DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN']
+    from .table_fpn import TableFPN
+    support_dict = ['DBFPN', 'EASTFPN', 'SASTFPN', 'SequenceEncoder', 'PGFPN', 'TableFPN']
    module_name = config.pop('name')
    assert module_name in support_dict, Exception('neck only support {}'.format(

--- a/ppocr/modeling/necks/table_fpn.py
+++ b/ppocr/modeling/necks/table_fpn.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle
+from paddle import nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+class TableFPN(nn.Layer):
+    """Feature-pyramid neck that fuses four backbone levels into one map.
+
+    Each level is projected to 512 channels by a 1x1 convolution, merged
+    top-down, resized to the spatial size of the deepest level, concatenated
+    and passed through a 3x3 fuse convolution. The fused map is scaled by
+    0.005 and added to the deepest backbone feature.
+
+    Args:
+        in_channels: channel counts of the four input levels (indices 0..3).
+        out_channels: accepted but not used; the width is fixed to 512.
+        **kwargs: accepted and ignored.
+    """
+    def __init__(self, in_channels, out_channels, **kwargs):
+        super(TableFPN, self).__init__()
+        self.out_channels = 512
+        kaiming = paddle.nn.initializer.KaimingUniform()
+        def lateral(level_channels):
+            # 1x1 bias-free projection of one backbone level to the common width.
+            return nn.Conv2D(
+                in_channels=level_channels,
+                out_channels=self.out_channels,
+                kernel_size=1,
+                stride=1,
+                weight_attr=ParamAttr(initializer=kaiming),
+                bias_attr=False)
+        def smooth():
+            # 3x3 bias-free convolution reducing the width to a quarter.
+            return nn.Conv2D(
+                in_channels=self.out_channels,
+                out_channels=self.out_channels // 4,
+                kernel_size=3,
+                padding=1,
+                weight_attr=ParamAttr(initializer=kaiming),
+                bias_attr=False)
+        self.in2_conv = lateral(in_channels[0])
+        self.in3_conv = lateral(in_channels[1])
+        self.in4_conv = lateral(in_channels[2])
+        self.in5_conv = lateral(in_channels[3])
+        # The p*_conv layers are registered but forward() does not call them.
+        self.p5_conv = smooth()
+        self.p4_conv = smooth()
+        self.p3_conv = smooth()
+        self.p2_conv = smooth()
+        self.fuse_conv = nn.Conv2D(
+            in_channels=self.out_channels * 4,
+            out_channels=512,
+            kernel_size=3,
+            padding=1,
+            weight_attr=ParamAttr(initializer=kaiming),
+            bias_attr=False)
+    def forward(self, x):
+        """Fuse the four levels in ``x`` and return a one-element list.
+
+        The result is ``c5 + 0.005 * fuse_conv(...)``; the fused map has 512
+        channels, so ``c5`` presumably has 512 channels as well.
+        """
+        c2, c3, c4, c5 = x
+        def resize_like(feat, ref):
+            # Nearest-neighbour resize of feat to the spatial size of ref.
+            return F.upsample(
+                feat, size=ref.shape[2:4], mode="nearest", align_mode=1)
+        in5 = self.in5_conv(c5)
+        in4 = self.in4_conv(c4)
+        in3 = self.in3_conv(c3)
+        in2 = self.in2_conv(c2)
+        # Top-down pathway (original notes: 1/16, 1/8, 1/4).
+        out4 = in4 + resize_like(in5, in4)
+        out3 = in3 + resize_like(out4, in3)
+        out2 = in2 + resize_like(out3, in2)
+        # Bring every merged level to the spatial size of the deepest one.
+        levels = [in5] + [
+            resize_like(merged, in5) for merged in (out4, out3, out2)
+        ]
+        fused = self.fuse_conv(paddle.concat(levels, axis=1)) * 0.005
+        return [c5 + fused]
--- a/tools/eval.py
+++ b/tools/eval.py
@@ -55,6 +55,7 @@ def main():
    model = build_model(config['Architecture'])
    use_srn = config['Architecture']['algorithm'] == "SRN"
+    model_type = config['Architecture']['model_type']
    best_model_dict = init_model(config, model)
    if len(best_model_dict):
@@ -67,7 +68,7 @@ def main():
    # start eval
    metric = program.eval(model, valid_dataloader, post_process_class,
-                          eval_class, use_srn)
+                          eval_class, model_type, use_srn)
    logger.info('metric eval ***************')
    for k, v in metric.items():
        logger.info('{}:{}'.format(k, v))

--- a/tools/export_model.py
+++ b/tools/export_model.py
@@ -60,7 +60,8 @@ def export_single_model(model, arch_config, save_path, logger):
                    "When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training"
                )
                infer_shape[-1] = 100
+        elif arch_config["model_type"] == "table":
+            infer_shape = [3, 488, 488]
        model = to_static(
            model,
            input_spec=[

--- a/tools/infer/utility.py
+++ b/tools/infer/utility.py
@@ -331,10 +331,11 @@ def create_predictor(args, mode, logger):
    config.disable_glog_info()
    config.delete_pass("conv_transpose_eltwiseadd_bn_fuse_pass")
+    if mode == 'structure':
+        config.delete_pass("fc_fuse_pass") # not supported for table    
    config.switch_use_feed_fetch_ops(False)
    config.switch_ir_optim(True)
-    if mode == 'structure':
-        config.switch_ir_optim(False)
    # create predictor
    predictor = inference.create_predictor(config)
    input_names = predictor.get_input_names()

--- a/tools/infer_table.py
+++ b/tools/infer_table.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import numpy as np
+import os
+import sys
+import json
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(__dir__)
+sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
+os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
+import paddle
+from paddle.jit import to_static
+from ppocr.data import create_operators, transform
+from ppocr.modeling.architectures import build_model
+from ppocr.postprocess import build_post_process
+from ppocr.utils.save_load import init_model
+from ppocr.utils.utility import get_image_file_list
+import tools.program as program
+import cv2
+def main(config, device, logger, vdl_writer):
+    """Run the table model on every image listed by ``Global.infer_img``.
+
+    Builds the post-processor and model from ``config``, reuses the
+    ``Eval.dataset.transforms`` pipeline without its label operators, and logs
+    the predicted HTML structure plus the cell boxes scaled to pixel
+    coordinates of the source image.
+
+    Args:
+        config: parsed configuration dict; ``Global``, ``PostProcess``,
+            ``Architecture`` and ``Eval`` entries are read, and
+            ``Global.infer_mode`` is set to True.
+        device: not used here.
+        logger: logger used for progress and results.
+        vdl_writer: not used here.
+    """
+    global_config = config['Global']
+    # build post process
+    post_process_class = build_post_process(config['PostProcess'],
+                                            global_config)
+    # build model
+    if hasattr(post_process_class, 'character'):
+        config['Architecture']["Head"]['out_channels'] = len(
+            post_process_class.character)
+    model = build_model(config['Architecture'])
+    init_model(config, model, logger)
+    # create data ops: drop label operators and keep only the image tensor
+    transforms = []
+    for op in config['Eval']['dataset']['transforms']:
+        op_name = list(op)[0]
+        if 'Label' in op_name:
+            continue
+        if op_name == 'KeepKeys':
+            op[op_name]['keep_keys'] = ['image']
+        transforms.append(op)
+    global_config['infer_mode'] = True
+    ops = create_operators(transforms, global_config)
+    model.eval()
+    for image_path in get_image_file_list(config['Global']['infer_img']):
+        logger.info("infer_img: {}".format(image_path))
+        with open(image_path, 'rb') as f:
+            raw_bytes = f.read()
+        batch = transform({'image': raw_bytes}, ops)
+        images = paddle.to_tensor(np.expand_dims(batch[0], axis=0))
+        # Inference only: skip autograd bookkeeping, as program.eval does.
+        with paddle.no_grad():
+            preds = model(images)
+        post_result = post_process_class(preds)
+        res_html_code = post_result['res_html_code']
+        res_loc = post_result['res_loc']
+        canvas = cv2.imread(image_path)
+        imgh, imgw = canvas.shape[0:2]
+        res_loc_final = []
+        # Boxes of the first (only) batch item are scaled by the image size
+        # and clamped to the image bounds.
+        for x0, y0, x1, y1 in res_loc[0]:
+            left = max(int(imgw * x0), 0)
+            top = max(int(imgh * y0), 0)
+            right = min(int(imgw * x1), imgw - 1)
+            bottom = min(int(imgh * y1), imgh - 1)
+            # NOTE(review): the annotated image is drawn but never written out
+            # in this function -- confirm whether saving it was intended.
+            cv2.rectangle(canvas, (left, top), (right, bottom), (0, 0, 255), 2)
+            res_loc_final.append([left, top, right, bottom])
+        logger.info("result: {}, {}".format(res_html_code, res_loc_final))
+    logger.info("success!")
+if __name__ == '__main__':
+    # Script entry point: program.preprocess() yields the four values that
+    # main() takes, in the same order.
+    config, device, logger, vdl_writer = program.preprocess()
+    main(config, device, logger, vdl_writer)
--- a/tools/program.py
+++ b/tools/program.py
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -186,6 +186,7 @@ def train(config,
    model.train()
    use_srn = config['Architecture']['algorithm'] == "SRN"
+    model_type = config['Architecture']['model_type']
    if 'start_epoch' in best_model_dict:
        start_epoch = best_model_dict['start_epoch']
@@ -208,9 +209,9 @@ def train(config,
            lr = optimizer.get_lr()
            images = batch[0]
            if use_srn:
-                others = batch[-4:]
-                preds = model(images, others)
                model_average = True
+            if use_srn or model_type == 'table':
+                preds = model(images, data=batch[1:])
            else:
                preds = model(images)
            loss = loss_class(preds, batch)
@@ -232,6 +233,9 @@ def train(config,
            if cal_metric_during_train:  # only rec and cls need
                batch = [item.numpy() for item in batch]
+                if model_type == 'table':
+                    eval_class(preds, batch)
+                else:
                    post_result = post_process_class(preds, batch[1])
                    eval_class(post_result, batch)
                metric = eval_class.get_metric()
@@ -269,6 +273,7 @@ def train(config,
                    valid_dataloader,
                    post_process_class,
                    eval_class,
+                    model_type,
                    use_srn=use_srn)
                cur_metric_str = 'cur metric, {}'.format(', '.join(
                    ['{}: {}'.format(k, v) for k, v in cur_metric.items()]))
@@ -336,7 +341,11 @@ def train(config,
    return
-def eval(model, valid_dataloader, post_process_class, eval_class,
+def eval(model,
+         valid_dataloader,
+         post_process_class,
+         eval_class,
+         model_type,
         use_srn=False):
    model.eval()
    with paddle.no_grad():
@@ -350,18 +359,18 @@ def eval(model, valid_dataloader, post_process_class, eval_class,
                break
            images = batch[0]
            start = time.time()
+            if use_srn or model_type == 'table':
-            if use_srn:
+                preds = model(images, data=batch[1:])
-                others = batch[-4:]
-                preds = model(images, others)
            else:
                preds = model(images)
            batch = [item.numpy() for item in batch]
            # Obtain usable results from post-processing methods
-            post_result = post_process_class(preds, batch[1])
            total_time += time.time() - start
            # Evaluate the results of the current batch
+            if model_type == 'table':
+                eval_class(preds, batch)
+            else:
+                post_result = post_process_class(preds, batch[1])
                eval_class(post_result, batch)
            pbar.update(1)
            total_frame += len(images)
@@ -386,7 +395,7 @@ def preprocess(is_train=False):
    alg = config['Architecture']['algorithm']
    assert alg in [
        'EAST', 'DB', 'SAST', 'Rosetta', 'CRNN', 'STARNet', 'RARE', 'SRN',
-        'CLS', 'PGNet', 'Distillation'
+        'CLS', 'PGNet', 'Distillation', 'TableAttn'
    ]
    device = 'gpu:{}'.format(dist.ParallelEnv().dev_id) if use_gpu else 'cpu'