---
# labse_base.yaml — LaBSE base dual-encoder training configuration
# (TensorFlow Model Garden). Placeholder string values below must be
# replaced with real paths before launching training.
# Task definition: dual-encoder (LaBSE) model plus its train/eval datasets.
task:
  hub_module_url: ''
  # Replace with the path to a BERT checkpoint pre-trained on the LaBSE vocab.
  init_checkpoint: 'the pre-trained BERT checkpoint using the labse vocab.'
  model:
    bidirectional: true
    logit_margin: 0.3
    logit_scale: 100
    max_sequence_length: 32
  # Training split: large global batch, shuffled, remainder dropped.
  train_data:
    cycle_length: 4
    drop_remainder: true
    global_batch_size: 4096
    input_path: 'the path to train partition'
    is_training: true
    left_text_fields:
      - 'src_raw'
    lower_case: false
    right_text_fields:
      - 'tgt_raw'
    seq_length: 32
    sharding: false
    shuffle_buffer_size: 1000
    tfds_as_supervised: false
    tfds_data_dir: ''
    tfds_name: ''
    tfds_skip_decoding_feature: ''
    tfds_split: ''
    vocab_file: 'the path to vocab.txt'
  # Validation split: deterministic pass over the full partition.
  validation_data:
    block_length: 1
    cache: false
    cycle_length: 4
    drop_remainder: false
    global_batch_size: 32000
    input_path: 'the path to validation partition'
    is_training: false
    left_text_fields:
      - 'src_raw'
    lower_case: false
    right_text_fields:
      - 'tgt_raw'
    seq_length: 32
    sharding: true
    shuffle_buffer_size: 1000
    tfds_as_supervised: false
    tfds_data_dir: ''
    tfds_name: ''
    tfds_skip_decoding_feature: ''
    tfds_split: ''
    vocab_file: 'the path to vocab.txt'
# Training loop: 500k steps of AdamW with linearly decayed LR and 5k warmup.
trainer:
  checkpoint_interval: 1000
  eval_tf_function: true
  max_to_keep: 5
  optimizer_config:
    learning_rate:
      polynomial:
        cycle: false
        decay_steps: 500000
        end_learning_rate: 0.0
        initial_learning_rate: 1.0e-04
        name: PolynomialDecay
        # power of 1.0 makes the polynomial decay linear.
        power: 1.0
      type: polynomial
    optimizer:
      adamw:
        amsgrad: false
        beta_1: 0.9
        beta_2: 0.999
        epsilon: 1.0e-05
        exclude_from_weight_decay: null
        gradient_clip_norm: 100
        include_in_weight_decay: null
        name: AdamWeightDecay
        # Weight decay disabled; AdamW degenerates to Adam with clipping.
        weight_decay_rate: 0.0
      type: adamw
    warmup:
      polynomial:
        name: polynomial
        # Was the int `1`; written as 1.0 for consistency with the
        # float-valued `power` in the learning_rate polynomial above.
        power: 1.0
        warmup_steps: 5000
      type: polynomial
  steps_per_loop: 1000
  summary_interval: 1000
  # Matches learning_rate decay_steps so LR reaches end_learning_rate
  # exactly at the end of training.
  train_steps: 500000
  train_tf_function: true
  train_tf_while_loop: true
  validation_interval: 1000
  validation_steps: 100