# Experiment configuration "HuggingFaceEncoder": a `huggingface`-library
# encoder (bert-base-uncased) combined with a `nemo`-library decoder, with
# source language 'en' and target language 'de'.
#
# Top-level sections:
#   model        - decoding, data, optimizer, tokenizer and network settings
#   trainer      - device / epoch / precision settings for the training loop
#   exp_manager  - experiment naming and files copied alongside the run
name: HuggingFaceEncoder
do_training: true  # set to false if only preprocessing data
do_testing: false  # set to true to run evaluation on test data after training

model:
  # Generation / loss settings.
  beam_size: 4
  len_pen: 0.6
  max_generation_delta: -1
  label_smoothing: 0.1
  shared_tokenizer: false
  preproc_out_dir: null
  src_language: 'en'
  tgt_language: 'de'

  # Training data. File names are null here and must be supplied by the user.
  train_ds:
    src_file_name: null
    tgt_file_name: null
    use_tarred_dataset: false  # if true tar_files and metadata_file will be used (or created automatically)
    # config for preprocessing training data and creating a tarred dataset automatically
    tar_file_prefix: parallel  # prefix for tar file names
    tar_files: null  # if data has already been preprocessed (rest of config ignored)
    metadata_file: null  # metadata for tarred dataset
    lines_per_dataset_fragment: 1000000  # Number of lines to consider for bucketing and padding
    num_batches_per_tarfile: 100  # Number of batches (pickle files) within each tarfile
    tar_shuffle_n: 100  # How many samples to look ahead and load to be shuffled
    shard_strategy: scatter  # tarred dataset shard distribution strategy
    n_preproc_jobs: -2  # number of processes to use for data preprocessing (-2 means all but 2)
    tokens_in_batch: 512
    clean: true
    max_seq_length: 512
    shuffle: true
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  # Validation data: same loader settings as train_ds, but with cleaning and
  # shuffling disabled.
  validation_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  # Test data: identical settings to validation_ds.
  test_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  # Optimizer and learning-rate schedule.
  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1

  # Tokenizers. Both sides use the `huggingface` library; every other field is
  # left null in this file.
  encoder_tokenizer:
    library: huggingface
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null

  decoder_tokenizer:
    library: huggingface
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null

  # Encoder: selected by model name from the `huggingface` library.
  # NOTE(review): pretrained is false - presumably the architecture is built
  # without loading pretrained weights; confirm against the consumer.
  encoder:
    library: huggingface
    model_name: bert-base-uncased
    pretrained: false

  # Decoder: `nemo` library, fully specified by the hyperparameters below
  # (model_name is null).
  decoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 2
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    inner_size: 2048
    num_layers: 6
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    pre_ln: false

  # Output head.
  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0
    use_transformer_init: true

trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16  # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
  accelerator: gpu
  enable_checkpointing: false
  logger: false
  log_every_n_steps: 50  # Interval of logging.
  check_val_every_n_epoch: 1
  benchmark: false

exp_manager:
  name: HuggingFaceEncoder
  files_to_copy: []