# huggingface.yaml
# Experiment name and top-level run switches.
name: HuggingFaceEncoder
do_training: true # set to false if only preprocessing data
do_testing: false # set to true to run evaluation on test data after training

# Model section: the scalar settings below are followed by nested dataset, optimizer,
# tokenizer, encoder, decoder and head sections.
model:
  # presumably beam-search / generation settings — confirm against the consuming model class
  beam_size: 4
  len_pen: 0.6
  max_generation_delta: -1
  label_smoothing: 0.1
  # false here; separate encoder_tokenizer and decoder_tokenizer sections are declared in this section
  shared_tokenizer: false
  preproc_out_dir: null
  # language codes are quoted so they always load as strings
  src_language: 'en'
  tgt_language: 'de'

  # Training dataset settings.
  train_ds:
    src_file_name: null
    tgt_file_name: null
    # if true, tar_files and metadata_file will be used (or created automatically)
    use_tarred_dataset: false
    # config for preprocessing training data and creating a tarred dataset automatically
    tar_file_prefix: parallel # prefix for tar file names
    tar_files: null # if data has already been preprocessed (rest of config ignored)
    metadata_file: null # metadata for tarred dataset
    lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding
    num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
    shard_strategy: scatter # tarred dataset shard distribution strategy
    # number of processes to use for data preprocessing (-2 means all but 2)
    # NOTE(review): if this value is handed to joblib's n_jobs, -2 means all CPUs but one — confirm
    n_preproc_jobs: -2
    tokens_in_batch: 512
    clean: true
    max_seq_length: 512
    shuffle: true
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  # Validation dataset settings. Source/target files are left null here;
  # cleaning and shuffling are both off, unlike train_ds.
  validation_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    # presumably -1 means "use all samples" — confirm against the dataset loader
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  # Test dataset settings; key-for-key identical to validation_ds in this file.
  # Presumably only consulted when the top-level do_testing switch is enabled — confirm.
  test_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    # presumably -1 means "use all samples" — confirm against the dataset loader
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  # Optimizer settings with a nested learning-rate schedule (sched).
  optim:
    name: adam
    lr: 0.001
    # two-element list of beta coefficients
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      # a ratio rather than a step count — presumably a fraction of total training steps; confirm
      warmup_ratio: 0.1

  # Source-side tokenizer. Only the library is fixed here; every other field is null,
  # presumably to be supplied by overrides or derived at runtime — confirm.
  encoder_tokenizer:
    library: huggingface
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null

  # Target-side tokenizer. Same keys and values as encoder_tokenizer in this file;
  # every field except library is null, presumably supplied by overrides — confirm.
  decoder_tokenizer:
    library: huggingface
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null

  # Encoder selected by library and model name.
  encoder:
    library: huggingface
    model_name: bert-base-uncased
    # NOTE(review): with pretrained false, presumably only the named model's architecture is
    # used and its weights are not loaded — confirm
    pretrained: false

  # Decoder from the nemo library; no named model, so its architecture is spelled out key by key.
  decoder:
    library: nemo
    model_name: null
    pretrained: false
    # same value (512) as max_seq_length in the dataset sections of this file
    max_sequence_length: 512
    num_token_types: 2
    embedding_dropout: 0.1
    learn_positional_encodings: false
    # NOTE(review): confirm hidden_size matches (or is projected from) the encoder's output width
    hidden_size: 512
    inner_size: 2048
    num_layers: 6
    # 512 / 8 = 64 — presumably the per-head dimension; confirm
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    pre_ln: false

  # Output head settings — presumably the projection applied on top of the decoder; confirm.
  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0
    use_transformer_init: true

# Trainer settings (device count, epochs, precision, logging cadence).
trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
  accelerator: gpu
  # NOTE(review): checkpointing and logging are switched off here — presumably exp_manager handles both; confirm
  enable_checkpointing: false
  logger: false
  log_every_n_steps: 50  # Interval of logging.
  check_val_every_n_epoch: 1
  benchmark: false

# Experiment manager settings; name repeats the top-level `name` value.
exp_manager:
  name: HuggingFaceEncoder
  # explicit empty list (a bare `files_to_copy:` would load as null instead)
  files_to_copy: []