# WARNING: YAML configs are currently an experimental feature.
language_model:
  # model architecture
  num_layers: 24
  hidden_size: 1024
  num_attention_heads: 16
  num_query_groups: null
  ffn_hidden_size: null
  kv_channels: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  fp32_residual_connection: False
  apply_residual_connection_post_layernorm: False
  layernorm_epsilon: 1.e-5
  layernorm_zero_centered_gamma: True
  add_bias_linear: False
  bias_activation_fusion: False
  add_qkv_bias: False
  gated_linear_unit: False
  activation_func: swiglu
  num_moe_experts: null
  rotary_interleaved: False
  window_size: null

  # initialization
  init_method: null
  init_method_std: 0.02
  output_layer_init_method: null

  # mixed-precision
  apply_query_key_layer_scaling: False
  attention_softmax_in_fp32: False

  # fusion
  bias_swiglu_fusion: True
  masked_softmax_fusion: True
  persist_layer_norm: False
  memory_efficient_layer_norm: False
  bias_dropout_fusion: True
  apply_rope_fusion: True

  # activation recomputation
  recompute_granularity: null
  recompute_method: null
  recompute_num_layers: null
  distribute_saved_activations: null

  # fp8 related
  fp8: null
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1
  fp8_amax_compute_algo: "most_recent"
  fp8_wgrad: True

  # miscellaneous
  clone_scatter_output_in_embedding: True
  normalization: "LayerNorm" # alternative value supported by TE: "RMSNorm"

  # MoE related
  moe_router_load_balancing_type: "aux_loss"
  moe_router_topk: 2
  moe_router_group_topk: null
  moe_router_num_groups: null
  moe_grouped_gemm: False
  moe_aux_loss_coeff: 0 # 1e-2 is a good starting value for the load-balancing loss
  moe_z_loss_coeff: null # 1e-3 is a good starting value for the z-loss
  moe_input_jitter_eps: null
  moe_token_dropping: False

model_parallel:
  # Model parallelism
  tensor_model_parallel_size: 1
  context_parallel_size: 1
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  sequence_parallel: True
  expert_model_parallel_size: 1

  # Initialization
  perform_initialization: True
  use_cpu_initialization: null

  # Training
  fp16: False
  bf16: True
  params_dtype: null # set from the fp16/bf16 arguments above for core
  timers: null

  # Optimizations
  gradient_accumulation_fusion: True
  async_tensor_model_parallel_allreduce: True
  tp_comm_overlap: False

  # Debug Options
  tp_comm_split_ag: True
  tp_comm_atomic_ag: True
  tp_comm_split_rs: True
  tp_comm_atomic_rs: True
  tp_comm_bulk_wgrad: True
  tp_comm_bulk_dgrad: True

  # Parallelism
  finalize_model_grads_func: null

  # Pipeline Parallel
  pipeline_dtype: null
  grad_scale_func: null
  enable_autocast: False
  autocast_dtype: null
  variable_seq_lengths: False
  num_microbatches_with_partial_activation_checkpoints: null
  overlap_p2p_comm: False
  batch_p2p_comm: True
  batch_p2p_sync: True
  use_ring_exchange_p2p: False
  deallocate_pipeline_outputs: False
  no_sync_func: null
  grad_sync_func: null
  param_sync_func: null
  pipeline_model_parallel_split_rank: null

  # CPU Offloading
  cpu_offloading: False
  cpu_offloading_num_layers: 0
  _cpu_offloading_context: null
  cpu_offloading_weights: False
  cpu_offloading_activations: True

  # Timing
  barrier_with_L1_time: True

# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False

# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False

# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False

# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False

# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null

train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000

# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True

tp_comm_overlap_cfg: null

# data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null

# profile
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10

# logging
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
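
# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the upstream config): recent
# Megatron-LM checkouts expose an experimental `--yaml-cfg` flag on the
# training entrypoint for loading a file like this one. The exact flag,
# entrypoint, and filename below are illustrative and can differ between
# releases, so verify against your checkout before relying on them:
#
#   torchrun --nproc_per_node=8 pretrain_gpt.py --yaml-cfg gpt_config.yaml
#
# Consistency notes for the batch-size values above:
# - global_batch_size = micro_batch_size x data-parallel size x
#   gradient-accumulation steps; e.g. on 8 GPUs with TP=PP=1 (so DP=8),
#   128 = 2 x 8 x 8.
# - rampup_batch_size is [start, increment, ramp-up samples]: the global
#   batch starts at 32 and grows in steps of 32 until it reaches 128,
#   spread over 65,324,160 training samples.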