Global:
  use_gpu: True
  epoch_num: &epoch_num 200
  log_smooth_window: 10
  print_batch_step: 10
  save_model_dir: ./output/re_layoutlmv2/
  save_epoch_step: 2000
  # evaluation is run every 19 iterations after the 0th iteration
  eval_batch_step: [ 0, 19 ]
  cal_metric_during_train: False
  save_inference_dir:
  use_visualdl: False
  seed: 2048
  infer_img: doc/vqa/input/zh_val_21.jpg
  save_res_path: ./output/re/
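
# Usage sketch (assumption: this file lives at configs/vqa/re/layoutlmv2.yml in a
# PaddleOCR checkout; adjust paths to your layout). Training and evaluation
# normally go through the standard config-driven entry points, e.g.:
#   python3 tools/train.py -c configs/vqa/re/layoutlmv2.yml
#   python3 tools/eval.py -c configs/vqa/re/layoutlmv2.yml \
#       -o Architecture.Backbone.checkpoints=./output/re_layoutlmv2/best_accuracy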

Architecture:
  model_type: vqa
  algorithm: &algorithm "LayoutLMv2"
  Transform:
  Backbone:
    name: LayoutLMv2ForRe
    pretrained: True
    checkpoints:
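
# Note: pretrained: True initializes the backbone from the released LayoutLMv2
# weights, while checkpoints is left empty on purpose. To resume or fine-tune
# from a previously saved model, point it at the saved prefix (hypothetical path):
#   checkpoints: ./output/re_layoutlmv2/best_accuracy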

Loss:
  name: LossFromOutput
  key: loss
  reduction: mean
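
# Note: LossFromOutput does not compute a loss itself; it reads the value stored
# under the given key ('loss') in the dict returned by the backbone
# (LayoutLMv2ForRe computes the relation-extraction loss internally) and applies
# the chosen reduction.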

Optimizer:
  name: AdamW
  beta1: 0.9
  beta2: 0.999
  clip_norm: 10
  lr:
    learning_rate: 0.00005
    warmup_epoch: 10
  regularizer:
    name: L2
    factor: 0.00000
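
# Note: this schedule pairs AdamW (gradients clipped at clip_norm: 10) with a
# 10-epoch warmup up to learning_rate: 0.00005; the L2 factor of 0.00000
# effectively disables weight decay via the regularizer. If fine-tuning is
# unstable, a smaller rate such as the following is a common fallback
# (an illustrative value, not one verified for this config):
#   learning_rate: 0.00002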
    
PostProcess:
  name: VQAReTokenLayoutLMPostProcess

Metric:
  name: VQAReTokenMetric
  main_indicator: hmean
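
# Note: VQAReTokenMetric scores the predicted entity-pair relations against the
# ground truth and reports precision, recall, and hmean (F1); main_indicator
# selects which of these decides the best checkpoint to keep.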

Train:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_train/image
    label_file_list: 
      - train_data/XFUND/zh_train/xfun_normalize_train.json
    ratio_list: [ 1.0 ]
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # tokenize text and encode class labels
          contains_re: True
          algorithm: *algorithm
          class_path: &class_path ppstructure/vqa/labels/labels_ser.txt
      - VQATokenPad:
          max_seq_len: &max_seq_len 512
          return_attention_mask: True
      - VQAReTokenRelation:
      - VQAReTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1./255.
          mean: [0.485, 0.456, 0.406]
          std: [0.229, 0.224, 0.225]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'bbox', 'image', 'attention_mask', 'token_type_ids', 'entities', 'relations' ] # dataloader will return list in this order
  loader:
    shuffle: True
    drop_last: False
    batch_size_per_card: 8
    num_workers: 8
    collate_fn: ListCollator
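
# Note: any value in this file can be overridden on the command line with -o,
# e.g. to shrink the batch for a smaller GPU (illustrative values):
#   python3 tools/train.py -c configs/vqa/re/layoutlmv2.yml \
#       -o Train.loader.batch_size_per_card=4 Train.loader.num_workers=4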

Eval:
  dataset:
    name: SimpleDataSet
    data_dir: train_data/XFUND/zh_val/image
    label_file_list:
      - train_data/XFUND/zh_val/xfun_normalize_val.json
    transforms:
      - DecodeImage: # load image
          img_mode: RGB
          channel_first: False
      - VQATokenLabelEncode: # tokenize text and encode class labels
          contains_re: True
          algorithm: *algorithm
          class_path: *class_path
      - VQATokenPad:
          max_seq_len: *max_seq_len
          return_attention_mask: True
      - VQAReTokenRelation:
      - VQAReTokenChunk:
          max_seq_len: *max_seq_len
      - Resize:
          size: [224,224]
      - NormalizeImage:
          scale: 1./255.
          mean: [0.485, 0.456, 0.406]
          std: [0.229, 0.224, 0.225]
          order: 'hwc'
      - ToCHWImage:
      - KeepKeys:
          keep_keys: [ 'input_ids', 'bbox', 'image', 'attention_mask', 'token_type_ids', 'entities', 'relations' ] # dataloader will return list in this order
  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: 8
    num_workers: 8
    collate_fn: ListCollator
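
# Note: single-image inference (see Global.infer_img) goes through a dedicated
# script in PaddleOCR; the exact entry point may differ across versions, e.g.:
#   python3 tools/infer_vqa_token_ser_re.py -c configs/vqa/re/layoutlmv2.yml \
#       -o Architecture.Backbone.checkpoints=./output/re_layoutlmv2/best_accuracy \
#          Global.infer_img=doc/vqa/input/zh_val_21.jpg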