# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

pretrained_model: null # pretrained model from list_available_models()
do_training: true # true for training mode, false for testing

trainer:
  devices: 1 # number of GPUs (0 for CPU), or list of the GPUs to use, e.g. [0, 1]
  num_nodes: 1
  max_epochs: 3
  max_steps: -1 # takes precedence over max_epochs
  accumulate_grad_batches: 1 # accumulates grads every k batches
  gradient_clip_val: 1.0
  precision: 16 # should be set to 16 for O1 and O2 to enable AMP
  accelerator: gpu
  log_every_n_steps: 5 # interval of logging
  val_check_interval: 1.0 # set to 0.25 to check 4 times per epoch, or an int for a number of iterations
  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; 0 disables it
  enable_checkpointing: False # provided by exp_manager
  logger: False # provided by exp_manager

model:
  # all models
  tensor_model_parallel_size: 1
  nemo_path: null # filename for saving the model and associated artifacts to a .nemo file
  library: huggingface # [huggingface, megatron]
  save_model: False # save validation model checkpoints

  language_model:
    pretrained_model_name: gpt2 # main config to select the model (bert, gpt2, or t5/bart based models); see docs/source/nlp/dialogue.rst for the full list of options
    lm_checkpoint: null
    config_file: null # json file, takes precedence over config
    config: null

  tokenizer:
    tokenizer_name: ${model.language_model.pretrained_model_name} # or sentencepiece
    vocab_file: null # path to vocab file
    tokenizer_model: null # only used if tokenizer is sentencepiece
    special_tokens: null

  # Dialogue GPT Classification/Generation and Dialogue S2S Generation Model args
  tokens_to_generate: 32 # for generation mode only

  # Intent Slot Classification model args
  class_balancing: ${model.dataset.class_balancing}
  intent_loss_weight: 0.6 # weight of the intent loss relative to the slot loss in the total loss (between 0 and 1)
  data_dir: ${model.dataset.data_dir}
  classifier_head:
    num_output_layers: 2
    fc_dropout: 0.1

  # Dialogue GPT Classification Megatron Prompt Learning model args
  prompt_learning: false # change to true to activate prompt learning
  language_model_path: ${model.language_model.lm_checkpoint}
  new_tasks: ['intent_and_slot']
  prompt_tuning:
    new_prompt_init_methods: ['text']
    new_prompt_init_text: ['intent_and_slot']
  p_tuning: # p-tuning specific params
    dropout: 0.0
    num_layers: 2
    encoder_type: mlp # lstm, tpmlp or embedding
  prompt_learning_nemo_path: prompt_learning.nemo
  data: {}
  virtual_prompt_style: 'p-tuning' # or 'prompt-tuning'
  encoder_seq_length: 2048
  pipeline_model_parallel_size: 1
  data_parallel_size: 1
  global_batch_size: 8
  micro_batch_size: 8
  task_templates:
    - taskname: "intent_and_slot"
      prompt_template: "<|VIRTUAL_PROMPT_0|> {utterance} \nintent: {intent} \nslot: {slot}"
      total_virtual_tokens: 10
      answer_only_loss: True
      virtual_token_splits: [10]
      truncate_field: null

  # SGDQA args
  encoder:
    dropout: 0.1

  # Zero Shot Intent Model args
  original_nemo_checkpoint: null ## cannot be loaded directly, as the .nemo file uses the pre-refactor model; its attributes are transferred over instead

  dataset:
    ## All tasks/models
    data_dir: ??? # location to load data from
    dialogues_example_dir: ??? # where to store prediction files
    task: sgd # [sgd, assistant, zero_shot, ms_marco, sgd_generation, design, mellon_qa]
    debug_mode: false # use a small number of examples for debugging
    max_seq_length: 128 # the maximum number of tokens per sample

    ## Dialogue S2S and GPT Generation Model params
    input_field: utterance+response # passage+utterance, utterance, response, utterance+response, system_actions
    output_field: fluent_response # response, fluent_response, system_utterance

    ## Dialogue GPT Classification Model params
    field: intent # [intent, slots, service]
    few_shot: 0 # int, 0 to 10; number of examples in the prompt
    eval_mode: ranking # ranking, generation or binary_score
    binary_score_subsample: false # subsample negative examples for binary score training
    binary_score_subsample_ratio: 2 # number of negative examples per positive example
    prompt_template: default # default, prompt_tuning, i_want_to
    # "This example is" for ZeroShotIntentModel
    # acts_slots_values, slots_values, values for DialogueS2SGenerationDataset
    target_template: default # default, with_description, with_slots

    ## SGD task specific params
    system_utterance: prev_turn # prev_turn or next_turn: prev_turn (default for sgdqa) takes the system utterance that precedes the user utterance; next_turn (for sgd_generation) takes the system utterance that follows the user utterance
    num_tasks: 1 # number of task heads: 1 for DialogueGPTClassification, 6 for SGDQA

    ## SGD and Zero Shot task specific params
    preprocess_intent_function: default # default, lowercase, description
    # remove_domain for zero_shot task

    ## SGDQA model specific params
    subsample: false # balance negative and positive training examples for improved performance
    task_name: sgd_single_domain # or one of [sgd_all, sgd_all_single, sgd_multi_domain, debug_sample]
    state_tracker: nemotracker # or baseline
    use_cache: false # use a cache to store the processed dataset; may speed things up for large datasets
    use_fuzzy_match: true # whether to use fuzzy string matching when comparing non-categorical slot values; should be set to False when conducting MultiWOZ style evaluation
    joint_acc_across_turn: false # whether to compute joint goal accuracy across turn instead of across service; should be set to True when conducting MultiWOZ style evaluation
    max_num_cat_slot: 6 # maximum number of different categorical slots per service in the dataset
    max_num_noncat_slot: 12 # maximum number of different non-categorical slots per service in the dataset
    max_value_per_cat_slot: 12 # maximum number of different categorical slot values per service in the dataset
    max_num_intent: 4 # maximum number of different intents per service in the dataset
    num_samples: -1 # restrict the number of samples to an int value; if -1, all samples are used
    pad_label: -1 # if -1, the "not a slot" token is used
    ignore_extra_tokens: false
    ignore_start_end: true # do not use the first and last token for slot training
    do_lowercase: false

    # Zero Shot Intent Model args
    class_balancing: null # or weighted_loss
    num_classes: 3

    # Mellon QA, MS Marco and Design task
    dev_proportion: 10 # these datasets do not have a dedicated dev set, so train is split into new train and dev sets.
    # Indicate an integer (5-90) for the proportion of the dev set.

  train_ds:
    ds_item: "train"
    prefix: train
    batch_size: 16
    shuffle: true
    num_workers: 3
    drop_last: false
    pin_memory: false

  validation_ds:
    prefix: test
    ds_item: ["dev"]
    batch_size: 8
    shuffle: false
    num_workers: 3
    drop_last: false
    pin_memory: false

  test_ds:
    prefix: test
    ds_item: ["test"]
    batch_size: 8
    shuffle: false
    num_workers: 3
    drop_last: false
    pin_memory: false

  optim:
    name: adamw
    lr: 1e-4
    # optimizer arguments
    betas: [0.9, 0.999]
    weight_decay: 0.01

    # scheduler setup
    sched:
      name: PolynomialDecayAnnealing
      # scheduler params
      warmup_steps: null
      warmup_ratio: 0.02
      last_epoch: -1
      # pytorch lightning args
      monitor: val_loss
      reduce_on_plateau: false

exp_manager:
  exp_dir: null # exp_dir for your experiment; if None, defaults to "./nemo_experiments"
  name: "SGDGEN" # the name of your model
  create_wandb_logger: True
  wandb_logger_kwargs:
    name: ???
    project: SGDGEN
  create_tensorboard_logger: True # whether you want exp_manager to create a TensorBoard logger
  create_checkpoint_callback: True # whether you want exp_manager to create a model checkpoint callback
  resume_if_exists: false
  resume_ignore_no_checkpoint: false
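
# Example usage (a hedged sketch, not part of the config schema): this config is meant to be
# consumed by a Hydra-based NeMo dialogue training script, so values can be overridden from the
# command line. The script name and override values below are illustrative assumptions only.
#
#   python dialogue.py \
#     do_training=true \
#     model.dataset.data_dir=/path/to/dataset \
#     model.dataset.dialogues_example_dir=/path/to/prediction_outputs \
#     model.language_model.pretrained_model_name=gpt2 \
#     exp_manager.wandb_logger_kwargs.name=my_run_name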