name: MegatronEncoder

do_training: True # set to False if only preprocessing data
do_testing: False # set to True to run evaluation on test data after training

model:
  beam_size: 4
  len_pen: 0.6
  max_generation_delta: -1
  label_smoothing: 0.1
  shared_tokenizer: false
  preproc_out_dir: null
  src_language: 'en'
  tgt_language: 'de'

  train_ds:
    src_file_name: null
    tgt_file_name: null
    use_tarred_dataset: False # if true, tar_files and metadata_file will be used (or created automatically)
    # config for preprocessing training data and creating a tarred dataset automatically
    tar_file_prefix: parallel # prefix for tar file names
    tar_files: null # if data has already been preprocessed (rest of config ignored)
    metadata_file: null # metadata for tarred dataset
    lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding
    num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
    shard_strategy: scatter # tarred dataset shard distribution strategy
    n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2)
    tokens_in_batch: 512
    clean: true
    max_seq_length: 512
    shuffle: true
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  validation_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  test_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1

  encoder_tokenizer:
    library: megatron
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null
    model_name: null

  decoder_tokenizer:
    library: sentencepiece
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null

  encoder:
    library: megatron
    # If using a pretrained Megatron BERT model from NGC, use the corresponding model name,
    # for example 'megatron-bert-345m-uncased'.
    # If restoring from a local checkpoint, use either 'megatron-bert-uncased' or 'megatron-bert-cased'.
    model_name: megatron-bert-uncased # or megatron-bert-cased
    # If restoring from a model parallel checkpoint, checkpoint_file should be the path to
    # the directory containing the megatron-lm checkpoints.
    # The directory will have the structure:
    # /path/to/my/checkpoint/
    # ├── mp_rank_00
    # │   └── model_optim_rng.pt
    # └── mp_rank_01
    #     └── model_optim_rng.pt
    # If not using a model parallel checkpoint, use the full path to the checkpoint file:
    # /path/to/my/checkpoint/model_optim_rng.pt
    checkpoint_file: null
    vocab_file: null
    pretrained: true # only pretrained=true supported for now
    # model architecture configuration
    hidden_size: 1024
    num_attention_heads: 16
    num_layers: 24
    max_position_embeddings: 512
    num_tokentypes: 0

  decoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 2
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    inner_size: 2048
    num_layers: 6
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    pre_ln: false

  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0
    use_transformer_init: true

trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16 # Should be set to 16 for O1 and O2; default is 16 as PT ignores it when amp_level is O0
  accelerator: gpu
  strategy: ddp
  enable_checkpointing: False
  logger: False
  log_every_n_steps: 50 # Interval of logging.
  check_val_every_n_epoch: 1

exp_manager:
  name: ${name}
  files_to_copy: []
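# Example usage (a sketch only, kept as comments so this file stays valid YAML):
# the training script name, config name, and all file paths below are hypothetical;
# substitute the NMT training entry point and data paths from your own NeMo
# installation. The null fields above are typically filled in from the command line
# with Hydra-style dotted overrides, e.g.:
#
#   python enc_dec_nmt.py \
#     --config-path=conf --config-name=megatron_encoder \
#     model.train_ds.src_file_name=/data/train.en \
#     model.train_ds.tgt_file_name=/data/train.de \
#     model.validation_ds.src_file_name=/data/valid.en \
#     model.validation_ds.tgt_file_name=/data/valid.de \
#     model.encoder.checkpoint_file=/path/to/my/checkpoint/ \
#     model.encoder_tokenizer.vocab_file=/path/to/vocab.txt \
#     trainer.devices=4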