name: MegatronEncoder

do_training: True # set to False if only preprocessing data
do_testing: False # set to True to run evaluation on test data after training

model:
  beam_size: 4
  len_pen: 0.6
  max_generation_delta: -1
  label_smoothing: 0.1
  shared_tokenizer: false
  preproc_out_dir: null
  src_language: 'en'
  tgt_language: 'de'

  train_ds:
    src_file_name: null
    tgt_file_name: null
    use_tarred_dataset: False # if true, tar_files and metadata_file will be used (or created automatically)
    # config for preprocessing training data and creating a tarred dataset automatically
    tar_file_prefix: parallel # prefix for tar file names
    tar_files: null # if data has already been preprocessed (rest of config ignored)
    metadata_file: null # metadata for tarred dataset
    lines_per_dataset_fragment: 1000000 # Number of lines to consider for bucketing and padding
    num_batches_per_tarfile: 100 # Number of batches (pickle files) within each tarfile
    tar_shuffle_n: 100 # How many samples to look ahead and load to be shuffled
    shard_strategy: scatter # tarred dataset shard distribution strategy
    n_preproc_jobs: -2 # number of processes to use for data preprocessing (-2 means all but 2)
    tokens_in_batch: 512
    clean: true
    max_seq_length: 512
    shuffle: true
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  validation_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  test_ds:
    src_file_name: null
    tgt_file_name: null
    tokens_in_batch: 512
    clean: false
    max_seq_length: 512
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1

  encoder_tokenizer:
    library: megatron
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null
    model_name: null

  decoder_tokenizer:
    library: sentencepiece
    tokenizer_model: null
    vocab_file: null
    special_tokens: null
    vocab_size: null

  encoder:
    library: megatron
    # If using a pretrained Megatron BERT model from NGC, use the corresponding model name,
    # for example 'megatron-bert-345m-uncased'.
    # If restoring from a local checkpoint, use either 'megatron-bert-uncased' or 'megatron-bert-cased'.
    model_name: megatron-bert-uncased # or megatron-bert-cased
    # If restoring from a model parallel checkpoint, checkpoint_file should be the path to
    # the directory containing the megatron-lm checkpoints.
    # The directory will have the structure:
    # /path/to/my/checkpoint/
    # ├── mp_rank_00
    # │   └── model_optim_rng.pt
    # └── mp_rank_01
    #     └── model_optim_rng.pt
    # If not using a model parallel checkpoint, use the full path to the checkpoint file:
    # /path/to/my/checkpoint/model_optim_rng.pt
    checkpoint_file: null
    vocab_file: null
    pretrained: true # only pretrained=true supported for now
    # model architecture configuration
    hidden_size: 1024
    num_attention_heads: 16
    num_layers: 24
    max_position_embeddings: 512
    num_tokentypes: 0

  decoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 2
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    inner_size: 2048
    num_layers: 6
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    pre_ln: false

  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0
    use_transformer_init: true

trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16 # Should be set to 16 for O1 and O2; default is 16 as PT ignores it when amp_level is O0
  accelerator: gpu
  strategy: ddp
  enable_checkpointing: False
  logger: False
  log_every_n_steps: 50 # Interval of logging.
  check_val_every_n_epoch: 1

exp_manager:
  name: ${name}
  files_to_copy: []
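# Example usage (a sketch only, kept as comments so this file stays valid YAML):
# the training script name, config name, and all file paths below are hypothetical;
# substitute the NMT training entry point and data paths from your own NeMo
# installation. The null fields above are typically filled in from the command line
# with Hydra-style dotted overrides, e.g.:
#
#   python enc_dec_nmt.py \
#     --config-path=conf --config-name=megatron_encoder \
#     model.train_ds.src_file_name=/data/train.en \
#     model.train_ds.tgt_file_name=/data/train.de \
#     model.validation_ds.src_file_name=/data/valid.en \
#     model.validation_ds.tgt_file_name=/data/valid.de \
#     model.encoder.checkpoint_file=/path/to/my/checkpoint/ \
#     model.encoder_tokenizer.vocab_file=/path/to/vocab.txt \
#     trainer.devices=4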