# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

pretrained_model: null # pretrained model from list_available_models()
do_training: true # true for training mode, false for testing

trainer:
  devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use, e.g. [0, 1]
  num_nodes: 1
  max_epochs: 3
  max_steps: -1 # takes precedence over max_epochs
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 1.0
  precision: 16 # should be set to 16 for O1 and O2 to enable AMP
  accelerator: gpu
  log_every_n_steps: 5 # interval of logging
  val_check_interval: 1.0 # set to 0.25 to check 4 times per epoch, or an int for a number of iterations
  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; 0 disables it
  enable_checkpointing: False # provided by exp_manager
  logger: False # provided by exp_manager

model:
  # all models
  tensor_model_parallel_size: 1
  nemo_path: null # filename for saving the model and associated artifacts to a .nemo file
  library: huggingface # [huggingface, megatron]
  save_model: False # save validation model checkpoints

  language_model:
    pretrained_model_name: gpt2 # main config to select the model (bert, gpt2, or t5/bart based models); see docs/source/nlp/dialogue.rst for the full list of options
    lm_checkpoint: null
    config_file: null # json file, takes precedence over config
    config: null

  tokenizer:
    tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece
    vocab_file: null # path to vocab file
    tokenizer_model: null # only used if tokenizer is sentencepiece
    special_tokens: null

  # Dialogue GPT Classification/Generation and Dialogue S2S Generation Model args
  tokens_to_generate: 32 # for generation mode only

  # Intent Slot Classification model args
  class_balancing: ${model.dataset.class_balancing}
  intent_loss_weight: 0.6 # weight of the intent loss relative to the slot loss in the total loss (between 0 and 1)
  data_dir: ${model.dataset.data_dir}
  classifier_head:
    num_output_layers: 2
    fc_dropout: 0.1

  # Dialogue GPT Classification Megatron Prompt Learning model args
  prompt_learning: false # change to true to activate prompt learning
  language_model_path: ${model.language_model.lm_checkpoint}
  new_tasks: ['intent_and_slot']
  prompt_tuning:
    new_prompt_init_methods: ['text']
    new_prompt_init_text: ['intent_and_slot']
  p_tuning: # p-tuning specific params
    dropout: 0.0
    num_layers: 2
    encoder_type: mlp # lstm, tpmlp or embedding
  prompt_learning_nemo_path: prompt_learning.nemo
  data: {}
  virtual_prompt_style: 'p-tuning' # or 'prompt-tuning'
  encoder_seq_length: 2048
  pipeline_model_parallel_size: 1
  data_parallel_size: 1
  global_batch_size: 8
  micro_batch_size: 8
  task_templates:
    - taskname: "intent_and_slot"
      prompt_template: "<|VIRTUAL_PROMPT_0|> {utterance} \nintent: {intent} \nslot: {slot}"
      total_virtual_tokens: 10
      answer_only_loss: True
      virtual_token_splits: [10]
      truncate_field: null

  # SGDQA args
  encoder:
    dropout: 0.1

  # Zero Shot Intent Model args
  original_nemo_checkpoint: null ## cannot be loaded directly, as the .nemo file uses the pre-refactor model; its attributes are transferred over instead

  dataset:
    ## All tasks/models
    data_dir: ??? # location to load data from
    dialogues_example_dir: ??? # where to store prediction files
    task: sgd # [sgd, assistant, zero_shot, ms_marco, sgd_generation, design, mellon_qa]
    debug_mode: false # use a small number of examples for debugging
    max_seq_length: 128 # the maximum number of tokens per sample

    ## Dialogue S2S and GPT Generation Model params
    input_field: utterance+response # passage+utterance, utterance, response, utterance+response, system_actions
    output_field: fluent_response # response, fluent_response, system_utterance

    ## Dialogue GPT Classification Model params
    field: intent # [intent, slots, service]
    few_shot: 0 # int, 0 to 10; number of examples in the prompt
    eval_mode: ranking # ranking, generation or binary_score
    binary_score_subsample: false # subsample negative examples for binary score training
    binary_score_subsample_ratio: 2 # number of negative examples per positive example
    prompt_template: default # default, prompt_tuning, i_want_to
    # "This example is" for ZeroShotIntentModel
    # acts_slots_values, slots_values, values for DialogueS2SGenerationDataset
    target_template: default # default, with_description, with_slots

    ## SGD task specific params
    system_utterance: prev_turn # prev_turn or next_turn: prev_turn (default for sgdqa) takes the system utterance that precedes the user utterance; next_turn (for sgd_generation) takes the system utterance that follows the user utterance
    num_tasks: 1 # number of task heads: 1 for DialogueGPTClassification, 6 for SGDQA

    ## SGD and Zero Shot task specific params
    preprocess_intent_function: default # default, lowercase, description
    # remove_domain for zero_shot task

    ## SGDQA model specific params
    subsample: false # balance negative and positive training examples for improved performance
    task_name: sgd_single_domain # or one of [sgd_all, sgd_all_single, sgd_multi_domain, debug_sample]
    state_tracker: nemotracker # or baseline
    use_cache: false # use a cache to store the processed dataset; may speed things up for large datasets
    use_fuzzy_match: true # whether to use fuzzy string matching when comparing non-categorical slot values; should be set to False when conducting MultiWOZ style evaluation
    joint_acc_across_turn: false # whether to compute joint goal accuracy across turn instead of across service; should be set to True when conducting MultiWOZ style evaluation
    max_num_cat_slot: 6 # maximum number of different categorical slots per service in the dataset
    max_num_noncat_slot: 12 # maximum number of different non-categorical slots per service in the dataset
    max_value_per_cat_slot: 12 # maximum number of different categorical slot values per service in the dataset
    max_num_intent: 4 # maximum number of different intents per service in the dataset
    num_samples: -1 # restrict the number of samples to an int value; if -1, all samples are used
    pad_label: -1 # if -1, the "not a slot" token is used
    ignore_extra_tokens: false
    ignore_start_end: true # do not use the first and last token for slot training
    do_lowercase: false

    # Zero Shot Intent Model args
    class_balancing: null # or weighted_loss
    num_classes: 3

    # Mellon QA, MS Marco and Design task
    dev_proportion: 10 # these datasets do not have a dedicated dev set, so train is split into new train and dev sets.
    # Indicate an integer (5-90) for the proportion of the dev set.

  train_ds:
    ds_item: "train"
    prefix: train
    batch_size: 16
    shuffle: true
    num_workers: 3
    drop_last: false
    pin_memory: false

  validation_ds:
    prefix: test
    ds_item: ["dev"]
    batch_size: 8
    shuffle: false
    num_workers: 3
    drop_last: false
    pin_memory: false

  test_ds:
    prefix: test
    ds_item: ["test"]
    batch_size: 8
    shuffle: false
    num_workers: 3
    drop_last: false
    pin_memory: false

  optim:
    name: adamw
    lr: 1e-4
    # optimizer arguments
    betas: [0.9, 0.999]
    weight_decay: 0.01

    # scheduler setup
    sched:
      name: PolynomialDecayAnnealing
      # scheduler params
      warmup_steps: null
      warmup_ratio: 0.02
      last_epoch: -1
      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

exp_manager:
  exp_dir: null # exp_dir for your experiment; if None, defaults to "./nemo_experiments"
  name: "SGDGEN" # the name of your model
  create_wandb_logger: True
  wandb_logger_kwargs:
    name: ???
    project: SGDGEN
  create_tensorboard_logger: True # whether you want exp_manager to create a TensorBoard logger
  create_checkpoint_callback: True # whether you want exp_manager to create a model checkpoint callback
  resume_if_exists: false
  resume_ignore_no_checkpoint: false
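
# Example usage (a hedged sketch, not part of the config schema): this config is meant to be
# consumed by a Hydra-based NeMo dialogue training script, so values can be overridden from the
# command line. The script name and override values below are illustrative assumptions only.
#
#   python dialogue.py \
#     do_training=true \
#     model.dataset.data_dir=/path/to/dataset \
#     model.dataset.dialogues_example_dir=/path/to/prediction_outputs \
#     model.language_model.pretrained_model_name=gpt2 \
#     exp_manager.wandb_logger_kwargs.name=my_run_name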