"ppocr/vscode:/vscode.git/clone" did not exist on "26c1632488834621baec46921437bde90fc79392"
Unverified commit 8bae1e40, authored by MissPenguin, committed by GitHub.

Merge pull request #5174 from WenmuZhou/fix_vqa

VQA code is integrated into the ppocr training system.

Parents: 9fa209e3, 1cbe4bf2
# --- Config 1 of 3: relation extraction (RE) with LayoutXLM ---

Global:
  use_gpu: True
  epoch_num: &epoch_num 200
  log_smooth_window: 10
  print_batch_step: 10
  save_model_dir: ./output/re_layoutxlm/
  save_epoch_step: 2000
  # evaluation is run every 10 iterations after the 0th iteration
  eval_batch_step: [ 0, 19 ]
  cal_metric_during_train: False
  save_inference_dir:
  use_visualdl: False
  seed: 2022
  infer_img: doc/vqa/input/zh_val_21.jpg
  save_res_path: ./output/re/

Architecture:
  model_type: vqa
  algorithm: &algorithm "LayoutXLM"
  Transform:
  Backbone:
    name: LayoutXLMForRe
    pretrained: True
    checkpoints:

Loss:
  name: LossFromOutput
  key: loss
  reduction: mean

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  clip_norm: 10
  lr:
    learning_rate: 0.00005
  regularizer:
    name: L2
    factor: 0.00000

PostProcess:
  name: VQAReTokenLayoutLMPostProcess

Metric:
  name: VQAReTokenMetric
  main_indicator: hmean

Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_train/image
    label_file_list:
      - train_data/XFUND/zh_train/xfun_normalize_train.json
    ratio_list: [ 1.0 ]
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # Class handling label
          contains_re: True
          algorithm: *algorithm
          class_path: &class_path ppstructure/vqa/labels/labels_ser.txt
      - VQATokenPad:
          max_seq_len: &max_seq_len 512
          return_attention_mask: True
      - VQAReTokenRelation:
      - VQAReTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1
          mean: [ 123.675, 116.28, 103.53 ]
          std: [ 58.395, 57.12, 57.375 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'bbox', 'image', 'attention_mask', 'token_type_ids', 'entities', 'relations' ] # dataloader will return list in this order
  loader:
    shuffle: True
    drop_last: False
    batch_size_per_card: 8
    num_workers: 4
    collate_fn: ListCollator

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_val/image
    label_file_list:
      - train_data/XFUND/zh_val/xfun_normalize_val.json
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # Class handling label
          contains_re: True
          algorithm: *algorithm
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: *max_seq_len
          return_attention_mask: True
      - VQAReTokenRelation:
      - VQAReTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1
          mean: [ 123.675, 116.28, 103.53 ]
          std: [ 58.395, 57.12, 57.375 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'bbox', 'image', 'attention_mask', 'token_type_ids', 'entities', 'relations' ] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 8
    num_workers: 4
    collate_fn: ListCollator
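The config above leans on YAML anchors: &algorithm defines the value once and *algorithm reuses it, so the label-encode transforms always match the backbone. A minimal sketch of how the anchors resolve after loading (the file name is assumed for illustration):

# Hypothetical check of the &algorithm / *algorithm anchors above, assuming
# the config is saved as re_layoutxlm.yml and PyYAML is installed.
import yaml

with open("re_layoutxlm.yml") as f:
    cfg = yaml.safe_load(f)

assert cfg["Architecture"]["algorithm"] == "LayoutXLM"
# VQATokenLabelEncode received the same string via the *algorithm alias:
encode_args = cfg["Train"]["dataset"]["transforms"][1]["VQATokenLabelEncode"]
assert encode_args["algorithm"] == "LayoutXLM"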
# --- Config 2 of 3: semantic entity recognition (SER) with LayoutLM ---

Global:
  use_gpu: True
  epoch_num: &epoch_num 200
  log_smooth_window: 10
  print_batch_step: 10
  save_model_dir: ./output/ser_layoutlm/
  save_epoch_step: 2000
  # evaluation is run every 10 iterations after the 0th iteration
  eval_batch_step: [ 0, 19 ]
  cal_metric_during_train: False
  save_inference_dir:
  use_visualdl: False
  seed: 2022
  infer_img: doc/vqa/input/zh_val_0.jpg
  save_res_path: ./output/ser/

Architecture:
  model_type: vqa
  algorithm: &algorithm "LayoutLM"
  Transform:
  Backbone:
    name: LayoutLMForSer
    pretrained: True
    checkpoints:
    num_classes: &num_classes 7

Loss:
  name: VQASerTokenLayoutLMLoss
  num_classes: *num_classes

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Linear
    learning_rate: 0.00005
    epochs: *epoch_num
    warmup_epoch: 2
  regularizer:
    name: L2
    factor: 0.00000

PostProcess:
  name: VQASerTokenLayoutLMPostProcess
  class_path: &class_path ppstructure/vqa/labels/labels_ser.txt

Metric:
  name: VQASerTokenMetric
  main_indicator: hmean

Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_train/image
    label_file_list:
      - train_data/XFUND/zh_train/xfun_normalize_train.json
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # Class handling label
          contains_re: False
          algorithm: *algorithm
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: &max_seq_len 512
          return_attention_mask: True
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1
          mean: [ 123.675, 116.28, 103.53 ]
          std: [ 58.395, 57.12, 57.375 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'labels', 'bbox', 'image', 'attention_mask', 'token_type_ids' ] # dataloader will return list in this order
  loader:
    shuffle: True
    drop_last: False
    batch_size_per_card: 8
    num_workers: 4

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_val/image
    label_file_list:
      - train_data/XFUND/zh_val/xfun_normalize_val.json
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # Class handling label
          contains_re: False
          algorithm: *algorithm
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: *max_seq_len
          return_attention_mask: True
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1
          mean: [ 123.675, 116.28, 103.53 ]
          std: [ 58.395, 57.12, 57.375 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'labels', 'bbox', 'image', 'attention_mask', 'token_type_ids' ] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 8
    num_workers: 4
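num_classes: 7 follows from BIO tagging over the three XFUND entity types plus the background class. A sketch of the assumed label set (the exact contents of labels_ser.txt may differ):

# BIO expansion assumed to match ppstructure/vqa/labels/labels_ser.txt:
# three entity types, each split into B-/I- tags, plus 'O' for background.
entity_types = ["QUESTION", "ANSWER", "HEADER"]
labels = ["O"] + [f"{p}-{t}" for t in entity_types for p in ("B", "I")]
assert len(labels) == 7  # matches num_classes above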
# --- Config 3 of 3: semantic entity recognition (SER) with LayoutXLM ---

Global:
  use_gpu: True
  epoch_num: &epoch_num 200
  log_smooth_window: 10
  print_batch_step: 10
  save_model_dir: ./output/ser_layoutxlm/
  save_epoch_step: 2000
  # evaluation is run every 10 iterations after the 0th iteration
  eval_batch_step: [ 0, 19 ]
  cal_metric_during_train: False
  save_inference_dir:
  use_visualdl: False
  seed: 2022
  infer_img: doc/vqa/input/zh_val_42.jpg
  save_res_path: ./output/ser

Architecture:
  model_type: vqa
  algorithm: &algorithm "LayoutXLM"
  Transform:
  Backbone:
    name: LayoutXLMForSer
    pretrained: True
    checkpoints:
    num_classes: &num_classes 7

Loss:
  name: VQASerTokenLayoutLMLoss
  num_classes: *num_classes

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Linear
    learning_rate: 0.00005
    epochs: *epoch_num
    warmup_epoch: 2
  regularizer:
    name: L2
    factor: 0.00000

PostProcess:
  name: VQASerTokenLayoutLMPostProcess
  class_path: &class_path ppstructure/vqa/labels/labels_ser.txt

Metric:
  name: VQASerTokenMetric
  main_indicator: hmean

Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_train/image
    label_file_list:
      - train_data/XFUND/zh_train/xfun_normalize_train.json
    ratio_list: [ 1.0 ]
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # Class handling label
          contains_re: False
          algorithm: *algorithm
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: &max_seq_len 512
          return_attention_mask: True
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1
          mean: [ 123.675, 116.28, 103.53 ]
          std: [ 58.395, 57.12, 57.375 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'labels', 'bbox', 'image', 'attention_mask', 'token_type_ids' ] # dataloader will return list in this order
  loader:
    shuffle: True
    drop_last: False
    batch_size_per_card: 8
    num_workers: 4

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_val/image
    label_file_list:
      - train_data/XFUND/zh_val/xfun_normalize_val.json
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # Class handling label
          contains_re: False
          algorithm: *algorithm
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: *max_seq_len
          return_attention_mask: True
      - VQASerTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1
          mean: [ 123.675, 116.28, 103.53 ]
          std: [ 58.395, 57.12, 57.375 ]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'labels', 'bbox', 'image', 'attention_mask', 'token_type_ids' ] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 8
    num_workers: 4
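Together, the three configs plug straight into PaddleOCR's generic training entry points; the code changes below are what make that possible: merge_config now returns the merged config, build_dataloader honors an optional collate_fn from the loader config, and the new VQA transforms and collators are wired into ppocr.data.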
@@ -76,7 +76,7 @@ def main():
     }
     FLAGS = ArgsParser().parse_args()
     config = load_config(FLAGS.config)
-    merge_config(FLAGS.opt)
+    config = merge_config(config, FLAGS.opt)
     logger = get_logger()
     # build post process
...
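The one-line change makes the override flow explicit: merge_config now takes the loaded config plus the -o key=value options and returns the merged dict instead of relying on shared state. Roughly (an illustrative stand-in, not ppocr's actual implementation):

# Illustrative stand-in for the new call shape; the real merge_config lives in
# ppocr and handles more cases. Dotted keys such as 'Global.use_gpu' descend
# into nested sections before assigning the override.
def merge_config(config, opts):
    for key, value in (opts or {}).items():
        sub = config
        *parents, leaf = key.split(".")
        for p in parents:
            sub = sub.setdefault(p, {})
        sub[leaf] = value
    return config

cfg = merge_config({"Global": {"use_gpu": True}}, {"Global.use_gpu": False})
assert cfg["Global"]["use_gpu"] is False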
@@ -87,13 +87,19 @@ def build_dataloader(config, mode, device, logger, seed=None):
         shuffle=shuffle,
         drop_last=drop_last)
 
+    if 'collate_fn' in loader_config:
+        from . import collate_fn
+        collate_fn = getattr(collate_fn, loader_config['collate_fn'])()
+    else:
+        collate_fn = None
     data_loader = DataLoader(
         dataset=dataset,
         batch_sampler=batch_sampler,
         places=device,
         num_workers=num_workers,
         return_list=True,
-        use_shared_memory=use_shared_memory)
+        use_shared_memory=use_shared_memory,
+        collate_fn=collate_fn)
 
     # support exit using ctrl+c
     signal.signal(signal.SIGINT, term_mp)
...
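This is what wires the collate_fn: ListCollator line in the RE loader config to the new collators: the YAML string is resolved by name with getattr and instantiated, and loaders without the key keep Paddle's default batching. A self-contained toy version of the lookup:

# Toy version of the name-based lookup above: the string from the YAML is
# resolved to a class with getattr and instantiated. A stand-in module object
# keeps the snippet self-contained.
import types

class FakeListCollator:
    def __call__(self, batch):
        return batch  # stand-in for the real stacking logic

collate_fn_module = types.SimpleNamespace(ListCollator=FakeListCollator)
loader_config = {"collate_fn": "ListCollator"}  # as parsed from the loader YAML
collator = getattr(collate_fn_module, loader_config["collate_fn"])()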
@@ -15,20 +15,20 @@
 import paddle
 import numbers
 import numpy as np
+from collections import defaultdict
 
 
-class DataCollator:
+class DictCollator(object):
     """
     data batch
     """
 
     def __call__(self, batch):
-        data_dict = {}
+        # todo:support batch operators
+        data_dict = defaultdict(list)
         to_tensor_keys = []
         for sample in batch:
             for k, v in sample.items():
-                if k not in data_dict:
-                    data_dict[k] = []
                 if isinstance(v, (np.ndarray, paddle.Tensor, numbers.Number)):
                     if k not in to_tensor_keys:
                         to_tensor_keys.append(k)
...

@@ -36,3 +36,23 @@ class DataCollator:
         for k in to_tensor_keys:
             data_dict[k] = paddle.to_tensor(data_dict[k])
         return data_dict
+
+
+class ListCollator(object):
+    """
+    data batch
+    """
+
+    def __call__(self, batch):
+        # todo:support batch operators
+        data_dict = defaultdict(list)
+        to_tensor_idxs = []
+        for sample in batch:
+            for idx, v in enumerate(sample):
+                if isinstance(v, (np.ndarray, paddle.Tensor, numbers.Number)):
+                    if idx not in to_tensor_idxs:
+                        to_tensor_idxs.append(idx)
+                data_dict[idx].append(v)
+        for idx in to_tensor_idxs:
+            data_dict[idx] = paddle.to_tensor(data_dict[idx])
+        return list(data_dict.values())
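A usage sketch for ListCollator: each sample is the list emitted by the KeepKeys transform, so index i holds the same field across the whole batch. Numeric fields are stacked into tensors; ragged fields such as entities stay Python lists (toy values below, assumes paddle and the class above):

# Toy batch: two samples, each a [numeric_field, ragged_field] list.
import numpy as np

batch = [
    [np.array([1, 2, 3]), {"start": [0], "end": [3]}],
    [np.array([4, 5, 6]), {"start": [2], "end": [5]}],
]
out = ListCollator()(batch)
# out[0] -> paddle.Tensor of shape [2, 3] (numeric field, stacked)
# out[1] -> list of the two entity dicts (non-numeric field, left as a list)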
@@ -34,6 +34,8 @@ from .sast_process import *
 from .pg_process import *
 from .gen_table_mask import *
+from .vqa import *
+
 
 def transform(data, ops=None):
     """ transform """
...
@@ -17,6 +17,7 @@ from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
 
+import copy
 import numpy as np
 import string
 from shapely.geometry import LineString, Point, Polygon
@@ -782,3 +783,176 @@ class SARLabelEncode(BaseRecLabelEncode):

    def get_ignored_tokens(self):
        return [self.padding_idx]


class VQATokenLabelEncode(object):
    """
    Label encode for NLP VQA methods
    """

    def __init__(self,
                 class_path,
                 contains_re=False,
                 add_special_ids=False,
                 algorithm='LayoutXLM',
                 infer_mode=False,
                 ocr_engine=None,
                 **kwargs):
        super(VQATokenLabelEncode, self).__init__()
        from paddlenlp.transformers import LayoutXLMTokenizer, LayoutLMTokenizer
        from ppocr.utils.utility import load_vqa_bio_label_maps
        tokenizer_dict = {
            'LayoutXLM': {
                'class': LayoutXLMTokenizer,
                'pretrained_model': 'layoutxlm-base-uncased'
            },
            'LayoutLM': {
                'class': LayoutLMTokenizer,
                'pretrained_model': 'layoutlm-base-uncased'
            }
        }
        self.contains_re = contains_re
        tokenizer_config = tokenizer_dict[algorithm]
        self.tokenizer = tokenizer_config['class'].from_pretrained(
            tokenizer_config['pretrained_model'])
        self.label2id_map, id2label_map = load_vqa_bio_label_maps(class_path)
        self.add_special_ids = add_special_ids
        self.infer_mode = infer_mode
        self.ocr_engine = ocr_engine

    def __call__(self, data):
        # load bbox and label info
        ocr_info = self._load_ocr_info(data)

        height, width, _ = data['image'].shape

        words_list = []
        bbox_list = []
        input_ids_list = []
        token_type_ids_list = []
        segment_offset_id = []
        gt_label_list = []

        entities = []

        # for re
        train_re = self.contains_re and not self.infer_mode
        if train_re:
            relations = []
            id2label = {}
            entity_id_to_index_map = {}
            empty_entity = set()

        data['ocr_info'] = copy.deepcopy(ocr_info)

        for info in ocr_info:
            if train_re:
                # for re
                if len(info["text"]) == 0:
                    empty_entity.add(info["id"])
                    continue
                id2label[info["id"]] = info["label"]
                relations.extend([tuple(sorted(l)) for l in info["linking"]])

            # smooth_box
            bbox = self._smooth_box(info["bbox"], height, width)

            text = info["text"]
            encode_res = self.tokenizer.encode(
                text, pad_to_max_seq_len=False, return_attention_mask=True)

            if not self.add_special_ids:
                # TODO: use tok.all_special_ids to remove
                encode_res["input_ids"] = encode_res["input_ids"][1:-1]
                encode_res["token_type_ids"] = encode_res["token_type_ids"][1:-1]
                encode_res["attention_mask"] = encode_res["attention_mask"][1:-1]

            # parse label
            if not self.infer_mode:
                label = info['label']
                gt_label = self._parse_label(label, encode_res)

            # construct entities for re
            if train_re:
                if gt_label[0] != self.label2id_map["O"]:
                    entity_id_to_index_map[info["id"]] = len(entities)
                    label = label.upper()
                    entities.append({
                        "start": len(input_ids_list),
                        "end": len(input_ids_list) + len(encode_res["input_ids"]),
                        "label": label,
                    })
            else:
                entities.append({
                    "start": len(input_ids_list),
                    "end": len(input_ids_list) + len(encode_res["input_ids"]),
                    "label": 'O',
                })

            input_ids_list.extend(encode_res["input_ids"])
            token_type_ids_list.extend(encode_res["token_type_ids"])
            bbox_list.extend([bbox] * len(encode_res["input_ids"]))
            words_list.append(text)
            segment_offset_id.append(len(input_ids_list))
            if not self.infer_mode:
                gt_label_list.extend(gt_label)

        data['input_ids'] = input_ids_list
        data['token_type_ids'] = token_type_ids_list
        data['bbox'] = bbox_list
        data['attention_mask'] = [1] * len(input_ids_list)
        data['labels'] = gt_label_list
        data['segment_offset_id'] = segment_offset_id
        data['tokenizer_params'] = dict(
            padding_side=self.tokenizer.padding_side,
            pad_token_type_id=self.tokenizer.pad_token_type_id,
            pad_token_id=self.tokenizer.pad_token_id)
        data['entities'] = entities

        if train_re:
            data['relations'] = relations
            data['id2label'] = id2label
            data['empty_entity'] = empty_entity
            data['entity_id_to_index_map'] = entity_id_to_index_map
        return data

    def _load_ocr_info(self, data):
        def trans_poly_to_bbox(poly):
            x1 = np.min([p[0] for p in poly])
            x2 = np.max([p[0] for p in poly])
            y1 = np.min([p[1] for p in poly])
            y2 = np.max([p[1] for p in poly])
            return [x1, y1, x2, y2]

        if self.infer_mode:
            ocr_result = self.ocr_engine.ocr(data['image'], cls=False)
            ocr_info = []
            for res in ocr_result:
                ocr_info.append({
                    "text": res[1][0],
                    "bbox": trans_poly_to_bbox(res[0]),
                    "poly": res[0],
                })
            return ocr_info
        else:
            info = data['label']
            # read text info
            info_dict = json.loads(info)
            return info_dict["ocr_info"]

    def _smooth_box(self, bbox, height, width):
        # normalize the pixel box onto the 0-1000 grid used by LayoutLM-style models
        bbox[0] = int(bbox[0] * 1000.0 / width)
        bbox[2] = int(bbox[2] * 1000.0 / width)
        bbox[1] = int(bbox[1] * 1000.0 / height)
        bbox[3] = int(bbox[3] * 1000.0 / height)
        return bbox

    def _parse_label(self, label, encode_res):
        gt_label = []
        if label.lower() == "other":
            gt_label.extend([0] * len(encode_res["input_ids"]))
        else:
            gt_label.append(self.label2id_map[("b-" + label).upper()])
            gt_label.extend([self.label2id_map[("i-" + label).upper()]] *
                            (len(encode_res["input_ids"]) - 1))
        return gt_label
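Two details worth calling out: _smooth_box rescales pixel boxes onto the 0-1000 grid that LayoutLM-family position embeddings expect, and _parse_label expands one segment label into sub-token BIO tags, the first sub-token taking B- and the rest I-. A worked example with an assumed label map:

# Worked BIO expansion, assuming this (hypothetical) label-to-id mapping:
label2id_map = {"O": 0, "B-QUESTION": 1, "I-QUESTION": 2}
label = "question"                  # segment label from the annotation
input_ids = [608, 1203, 88]         # three sub-tokens for the segment
gt_label = [label2id_map[("b-" + label).upper()]]
gt_label += [label2id_map[("i-" + label).upper()]] * (len(input_ids) - 1)
assert gt_label == [1, 2, 2]

# And the 0-1000 box normalization for a 1000x750 (w x h) page:
bbox = [100, 150, 400, 300]         # x1, y1, x2, y2 in pixels
w, h = 1000, 750
assert [int(bbox[0] * 1000 / w), int(bbox[1] * 1000 / h),
        int(bbox[2] * 1000 / w), int(bbox[3] * 1000 / h)] == [100, 200, 400, 400]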
@@ -170,17 +170,19 @@ class Resize(object):
 
     def __call__(self, data):
         img = data['image']
-        text_polys = data['polys']
+        if 'polys' in data:
+            text_polys = data['polys']
 
         img_resize, [ratio_h, ratio_w] = self.resize_image(img)
-        new_boxes = []
-        for box in text_polys:
-            new_box = []
-            for cord in box:
-                new_box.append([cord[0] * ratio_w, cord[1] * ratio_h])
-            new_boxes.append(new_box)
-        data['image'] = img_resize
-        data['polys'] = np.array(new_boxes, dtype=np.float32)
+        if 'polys' in data:
+            new_boxes = []
+            for box in text_polys:
+                new_box = []
+                for cord in box:
+                    new_box.append([cord[0] * ratio_w, cord[1] * ratio_h])
+                new_boxes.append(new_box)
+            data['polys'] = np.array(new_boxes, dtype=np.float32)
+        data['image'] = img_resize
         return data
...
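The guard matters because the VQA pipelines above feed Resize samples that carry only an image, while detection pipelines also carry a 'polys' key whose boxes must be rescaled. The rescaling itself is plain proportional scaling:

# Rescaling check for an image resized from 448x448 to 224x224
# (ratio_h = ratio_w = 0.5), using toy box coordinates:
ratio_h = ratio_w = 224 / 448
box = [[100, 40], [200, 40], [200, 80], [100, 80]]
new_box = [[x * ratio_w, y * ratio_h] for x, y in box]
assert new_box == [[50.0, 20.0], [100.0, 20.0], [100.0, 40.0], [50.0, 40.0]]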
New file: the vqa package __init__ (imported above via from .vqa import *):

# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .token import VQATokenPad, VQASerTokenChunk, VQAReTokenChunk, VQAReTokenRelation

__all__ = [
    'VQATokenPad', 'VQASerTokenChunk', 'VQAReTokenChunk', 'VQAReTokenRelation'
]
New file: the token subpackage __init__:

# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .vqa_token_chunk import VQASerTokenChunk, VQAReTokenChunk
from .vqa_token_pad import VQATokenPad
from .vqa_token_relation import VQAReTokenRelation
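The re-export chain means downstream code can import every token op from the vqa package root, which is how the transform registry picks them up after the from .vqa import * added earlier:

# Assumed import path, following the package layout implied above:
from ppocr.data.imaug.vqa import (VQATokenPad, VQASerTokenChunk,
                                  VQAReTokenChunk, VQAReTokenRelation)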
New file: vqa_token_chunk (imported above):

# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class VQASerTokenChunk(object):
    def __init__(self, max_seq_len=512, infer_mode=False, **kwargs):
        self.max_seq_len = max_seq_len
        self.infer_mode = infer_mode

    def __call__(self, data):
        encoded_inputs_all = []
        seq_len = len(data['input_ids'])
        for index in range(0, seq_len, self.max_seq_len):
            chunk_beg = index
            chunk_end = min(index + self.max_seq_len, seq_len)
            encoded_inputs_example = {}
            for key in data:
                if key in [
                        'label', 'input_ids', 'labels', 'token_type_ids',
                        'bbox', 'attention_mask'
                ]:
                    # labels are kept whole at inference time
                    if self.infer_mode and key == 'labels':
                        encoded_inputs_example[key] = data[key]
                    else:
                        encoded_inputs_example[key] = data[key][chunk_beg:chunk_end]
                else:
                    encoded_inputs_example[key] = data[key]
            encoded_inputs_all.append(encoded_inputs_example)
        return encoded_inputs_all[0]
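# Window arithmetic of the loop above, with toy numbers (illustration, not
# part of the file): a 1200-token page and max_seq_len=512 yield three chunks,
# but only the first is returned via encoded_inputs_all[0]; later windows are
# currently dropped.
_windows = [(i, min(i + 512, 1200)) for i in range(0, 1200, 512)]
assert _windows == [(0, 512), (512, 1024), (1024, 1200)]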
class VQAReTokenChunk(object):
    def __init__(self,
                 max_seq_len=512,
                 entities_labels=None,
                 infer_mode=False,
                 **kwargs):
        self.max_seq_len = max_seq_len
        self.entities_labels = {
            'HEADER': 0,
            'QUESTION': 1,
            'ANSWER': 2
        } if entities_labels is None else entities_labels
        self.infer_mode = infer_mode

    def __call__(self, data):
        # prepare data
        entities = data.pop('entities')
        relations = data.pop('relations')
        encoded_inputs_all = []
        for index in range(0, len(data["input_ids"]), self.max_seq_len):
            item = {}
            for key in data:
                if key in [
                        'label', 'input_ids', 'labels', 'token_type_ids',
                        'bbox', 'attention_mask'
                ]:
                    if self.infer_mode and key == 'labels':
                        item[key] = data[key]
                    else:
                        item[key] = data[key][index:index + self.max_seq_len]
                else:
                    item[key] = data[key]

            # select entities in current chunk
            entities_in_this_span = []
            global_to_local_map = {}
            for entity_id, entity in enumerate(entities):
                if (index <= entity["start"] < index + self.max_seq_len and
                        index <= entity["end"] < index + self.max_seq_len):
                    entity["start"] = entity["start"] - index
                    entity["end"] = entity["end"] - index
                    global_to_local_map[entity_id] = len(entities_in_this_span)
                    entities_in_this_span.append(entity)

            # select relations in current chunk
            relations_in_this_span = []
            for relation in relations:
                if (index <= relation["start_index"] < index + self.max_seq_len
                        and index <= relation["end_index"] <
                        index + self.max_seq_len):
                    relations_in_this_span.append({
                        "head": global_to_local_map[relation["head"]],
                        "tail": global_to_local_map[relation["tail"]],
                        "start_index": relation["start_index"] - index,
                        "end_index": relation["end_index"] - index,
                    })

            item.update({
                "entities": self.reformat(entities_in_this_span),
                "relations": self.reformat(relations_in_this_span),
            })
            item['entities']['label'] = [
                self.entities_labels[x] for x in item['entities']['label']
            ]
            encoded_inputs_all.append(item)
        return encoded_inputs_all[0]

    def reformat(self, data):
        new_data = {}
        for item in data:
            for k, v in item.items():
                if k not in new_data:
                    new_data[k] = []
                new_data[k].append(v)
        return new_data
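reformat converts the per-chunk list of entity (or relation) dicts into one dict of parallel lists, the layout the downstream RE components consume. A quick check using the class above:

# Quick check of reformat (uses the VQAReTokenChunk class defined above):
entities = [{"start": 0, "end": 4, "label": "QUESTION"},
            {"start": 4, "end": 9, "label": "ANSWER"}]
out = VQAReTokenChunk().reformat(entities)
assert out == {"start": [0, 4], "end": [4, 9], "label": ["QUESTION", "ANSWER"]}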