Commit 688448db authored by silencealiang's avatar silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
# WARNING: YAML configs are currently an experimental feature
language_model:
# model architecture
num_layers: 24
hidden_size: 1024
num_attention_heads: 16
num_query_groups: null
ffn_hidden_size: null
kv_channels: null
hidden_dropout: 0.0
attention_dropout: 0.0
fp32_residual_connection: False
apply_residual_connection_post_layernorm: False
layernorm_epsilon: 1.e-5
layernorm_zero_centered_gamma: True
add_bias_linear: False
bias_activation_fusion: False
add_qkv_bias: False
gated_linear_unit: False
activation_func: swiglu
num_moe_experts: null
rotary_interleaved: False
window_size: null
# initialization
init_method: null
init_method_std: 0.02
output_layer_init_method: null
# mixed-precision
apply_query_key_layer_scaling: False
attention_softmax_in_fp32: False
# fusion
bias_swiglu_fusion: True
masked_softmax_fusion: True
persist_layer_norm: False
memory_efficient_layer_norm: False
bias_dropout_fusion: True
apply_rope_fusion: True
# activation recomputation
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 related
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: "most_recent"
fp8_wgrad: True
# miscellaneous
clone_scatter_output_in_embedding: True
normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_router_topk_limited_devices: null
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
moe_input_jitter_eps: null
moe_token_dropping: False
model_parallel:
# Model parallelism
tensor_model_parallel_size: 1
context_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: True
expert_model_parallel_size: 1
# Initialization
perform_initialization: True
use_cpu_initialization: null
# Training
fp16: False
bf16: True
params_dtype: null # Set from above arguments for core
timers: null
# Optimizations
gradient_accumulation_fusion: True
async_tensor_model_parallel_allreduce: True
tp_comm_overlap: False
# Debug Options
tp_comm_split_ag: True
tp_comm_atomic_ag: True
tp_comm_split_rs: True
tp_comm_atomic_rs: True
tp_comm_bulk_wgrad: True
tp_comm_bulk_dgrad: True
# Parallelism
finalize_model_grads_func: null
# Pipeline Parallel
pipeline_dtype: null
grad_scale_func: null
enable_autocast: False
autocast_dtype: null
variable_seq_lengths: False
num_microbatches_with_partial_activation_checkpoints: null
overlap_p2p_comm: False
batch_p2p_comm: True
batch_p2p_sync: True
use_ring_exchange_p2p: False
deallocate_pipeline_outputs: False
no_sync_func: null
grad_sync_func: null
param_sync_func: null
pipeline_model_parallel_split_rank: null
# CPU Offloading
cpu_offloading: False
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_weights: False
cpu_offloading_activations: True
# Timing
barrier_with_L1_time: True
# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False
# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000
# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True
tp_comm_overlap_cfg: null
#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10
#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
# WARNING: YAML configs are currently an experimental feature
language_model:
# model architecture
num_layers: 24
hidden_size: 1024
num_attention_heads: 16
num_query_groups: null
ffn_hidden_size: null
kv_channels: null
hidden_dropout: 0.0
attention_dropout: 0.0
fp32_residual_connection: False
apply_residual_connection_post_layernorm: False
layernorm_epsilon: 1.e-5
layernorm_zero_centered_gamma: True
add_bias_linear: False
bias_activation_fusion: False
add_qkv_bias: False
gated_linear_unit: False
activation_func: swiglu
num_moe_experts: null
rotary_interleaved: False
window_size: null
# initialization
init_method: null
init_method_std: 0.02
output_layer_init_method: null
# mixed-precision
apply_query_key_layer_scaling: False
attention_softmax_in_fp32: False
# fusion
bias_swiglu_fusion: True
masked_softmax_fusion: True
persist_layer_norm: False
memory_efficient_layer_norm: False
bias_dropout_fusion: True
apply_rope_fusion: True
# activation recomputation
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 related
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: "most_recent"
fp8_wgrad: True
# miscellaneous
clone_scatter_output_in_embedding: True
normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_router_group_topk: null
moe_router_num_groups: null
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
moe_input_jitter_eps: null
moe_token_dropping: False
model_parallel:
# Model parallelism
tensor_model_parallel_size: 1
context_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: True
expert_model_parallel_size: 1
# Initialization
perform_initialization: True
use_cpu_initialization: null
# Training
fp16: False
bf16: True
params_dtype: null # Set from above arguments for core
timers: null
# Optimizations
gradient_accumulation_fusion: True
async_tensor_model_parallel_allreduce: True
tp_comm_overlap: False
# Debug Options
tp_comm_split_ag: True
tp_comm_atomic_ag: True
tp_comm_split_rs: True
tp_comm_atomic_rs: True
tp_comm_bulk_wgrad: True
tp_comm_bulk_dgrad: True
# Parallelism
finalize_model_grads_func: null
# Pipeline Parallel
pipeline_dtype: null
grad_scale_func: null
enable_autocast: False
autocast_dtype: null
variable_seq_lengths: False
num_microbatches_with_partial_activation_checkpoints: null
overlap_p2p_comm: False
batch_p2p_comm: True
batch_p2p_sync: True
use_ring_exchange_p2p: False
deallocate_pipeline_outputs: False
no_sync_func: null
grad_sync_func: null
param_sync_func: null
pipeline_model_parallel_split_rank: null
# CPU Offloading
cpu_offloading: False
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_weights: False
cpu_offloading_activations: True
# Timing
barrier_with_L1_time: True
# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False
# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000
# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True
tp_comm_overlap_cfg: null
#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10
#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
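Both YAML variants above share the same layout: a language_model block, a model_parallel block, and then flat training/data/logging keys (the commented-out "# training:", "#data", and "#logging:" headers mark the groups). A minimal sketch for loading and sanity-checking such a config outside of Megatron follows; the file name, the assumption that the training keys sit at the top level, and the specific checks are illustrative, not part of Megatron's own loader.

# Minimal sketch: load one of the experimental YAML configs above and check a
# few derived quantities. File name and checks are illustrative assumptions.
import yaml  # pip install pyyaml

with open("gpt_yaml_config.yaml") as f:  # hypothetical file name
    cfg = yaml.safe_load(f)

lm = cfg["language_model"]
hidden, heads = lm["hidden_size"], lm["num_attention_heads"]
assert hidden % heads == 0, "hidden_size must be divisible by num_attention_heads"

# kv_channels is typically derived as hidden_size // num_attention_heads when null.
kv_channels = lm["kv_channels"] or hidden // heads
print(f"per-head dim (kv_channels): {kv_channels}")  # 1024 // 16 = 64

# rampup_batch_size is [start, increment, ramp-up samples]: the global batch
# grows from 32 in steps of 32 until it reaches global_batch_size.
start, incr, ramp_samples = cfg["rampup_batch_size"]
print(f"batch ramp: {start} -> {cfg['global_batch_size']} in steps of {incr} "
      f"over {ramp_samples:,} samples")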
@@ -7,10 +7,10 @@ do
fi
done
mpirun -np 8 --allow-run-as-root \
train_GPT-MOE_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
mpirun -np 8 --allow-run-as-root \
train_gpt_567B_1nodes.sh localhost --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
rm -rf mixtral_dataset/my-mixtral_text_document
@@ -7,13 +7,13 @@ do
fi
done
mpirun -np 16 --hostfile mixtralnodes \
mpirun -np 512 --hostfile hostfile_gpt_567B \
--allow-run-as-root \
--bind-to none \
--mca plm_rsh_no_tree_spawn 1 \
train_mixtral_8x7B_2nodes.sh node021 --profiling=$profiling > output.log 2>&1
train_gpt_567B_multinodes.sh node002 --profiling=$profiling > output.log 2>&1
wait
rm -rf CKPT
#rm -rf mixtral_dataset/my-mixtral_text_document
\ No newline at end of file
#rm -rf mixtral_dataset/my-mixtral_text_document
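The multinode launcher above passes -np 512 with --hostfile hostfile_gpt_567B, which at 8 GPUs per node implies 64 nodes. A small sketch for generating such a hostfile is shown below; the node-name pattern and the OpenMPI slots= syntax are assumptions for illustration, so substitute the cluster's real hostnames.

# Sketch: write an OpenMPI-style hostfile for 512 ranks (64 nodes x 8 slots).
# Hostnames are hypothetical placeholders.
NUM_NODES = 64
SLOTS_PER_NODE = 8  # one MPI rank per GPU
assert NUM_NODES * SLOTS_PER_NODE == 512

with open("hostfile_gpt_567B", "w") as f:
    for i in range(1, NUM_NODES + 1):
        f.write(f"node{i:03d} slots={SLOTS_PER_NODE}\n")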
#!/bin/bash
# Runs the "175B" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 96
--hidden-size 12288
--num-attention-heads 96
--seq-length 2048
--max-position-embeddings 2048
--attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1536
--rampup-batch-size 16 16 5859375
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--fp16
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
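As a sanity check on the "175B" label, the dense parameter count implied by GPT_MODEL_ARGS can be estimated with the usual 12·L·h² approximation plus embeddings, and the --rampup-batch-size triple can be unpacked the same way. The sketch below is back-of-the-envelope arithmetic under a padded-vocab assumption, not the exact count Megatron reports.

# Rough parameter count for the config above (96 layers, hidden 12288) and the
# batch-size ramp-up schedule. The padded vocab size is an assumption.
layers, hidden, seq = 96, 12288, 2048
vocab = 50304  # GPT-2 BPE vocab (50257) padded to a multiple of 128 (assumption)

transformer = 12 * layers * hidden ** 2          # attention + MLP weights, ~12*h^2 per layer
embeddings = vocab * hidden + seq * hidden       # token + learned position embeddings
print(f"~{(transformer + embeddings) / 1e9:.0f}B parameters")  # ~175B

# --rampup-batch-size 16 16 5859375: global batch starts at 16 and is bumped by
# 16 until it reaches --global-batch-size 1536, over 5,859,375 ramp-up samples.
start, incr, ramp_samples, target = 16, 16, 5_859_375, 1536
bumps = (target - start) // incr                 # 95 batch-size increases
print(f"~{ramp_samples // bumps:,} samples between bumps (assuming an even spread)")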
@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
@@ -104,18 +109,10 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt_1nodes
--profile-dir torch_prof_gpt_1nodes_tp2-pp1-ep8-ep_tp1
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
@@ -157,10 +154,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
@@ -205,4 +198,4 @@ case ${LOCAL_RANK} in
${APP}
#numactl --cpunodebind=7 --membind=7 ${APP}
;;
esac
\ No newline at end of file
esac
@@ -4,18 +4,23 @@ for para in $*
do
if [[ $para == --profiling* ]];then
profiling=${para#*=}
export GPU_FLUSH_ON_EXECUTION=1
export HIP_DIRECT_DISPATCH=0
fi
done
# Runs GPT 567B model
source /opt/dtk/env.sh
# Runs Mixtral 8x7B model
# default env
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:$PYTHONPATH
export GLOG_minloglevel=3
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export GPU_MAX_HW_QUEUES=10
# nccl env
export NCCL_ALGO=Ring
export NCCL_MIN_NCHANNELS=32
export NCCL_MAX_NCHANNELS=32
@@ -23,9 +28,10 @@ export NCCL_NET_GDR_LEVEL=7
export NCCL_NET_GDR_READ=1
export RCCL_SDMA_COPY_ENABLE=0
export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1
#export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml"
# enable BatchLinear
export GROUPED_GEMM_BatchLinear=1
export GLOG_minloglevel=3
RANK=$OMPI_COMM_WORLD_RANK
LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK
@@ -49,7 +55,7 @@ MODEL_ARGS=(
--disable-bias-linear
--seq-length 8192
--max-position-embeddings 32768
--num-layers 64
--num-layers 32 #64
--hidden-size 8192
--ffn-hidden-size 32768
--num-attention-heads 64
@@ -72,7 +78,7 @@ MOE_ARGS=(
--moe-token-dispatcher-type alltoall
--moe-expert-capacity-factor 0.5
--moe-pad-expert-input-to-capacity
--moe-grouped-gemm
#--moe-grouped-gemm
)
DATA_ARGS=(
@@ -84,7 +90,7 @@ DATA_ARGS=(
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 4096
--global-batch-size 1024
--lr 1e-4
--train-iters 10
--lr-decay-iters 320000
@@ -96,7 +102,6 @@ TRAINING_ARGS=(
--bf16
--overlap-param-gather
--overlap-grad-reduce
#--tp-comm-overlap
)
TORCH_PROFIE_ARGS=(
@@ -104,23 +109,16 @@ TORCH_PROFIE_ARGS=(
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 3
--profile-step-end 4
--profile-dir torch_prof_gpt
--profile-dir torch_prof_gpt_64nodes_tp2-pp16-ep16-ep_tp1-cp2
--use-pytorch-profiler
)
HIP_PROFIE_ARGS=(
--profile
--profile-ranks 0 1 2 3 4 5 6 7
--profile-step-start 4
--profile-step-end 5
--use-hip-profiler
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 16
--expert-model-parallel-size 16
--expert-tensor-parallel-size 1
--context-parallel-size 2
--use-distributed-optimizer
--sequence-parallel
)
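With mpirun -np 512 and the parallel sizes above, the data-parallel size follows from world_size = TP × CP × PP × DP. The sketch below works through that bookkeeping; note that the expert-model-parallel and expert-tensor-parallel sizes configure a separate process-group layout for the MoE layers and are not folded into this product.

# Sketch: derive the data-parallel size implied by the multinode launch above.
world_size = 512                 # mpirun -np 512
tp, pp, cp = 2, 16, 2            # tensor / pipeline / context parallel sizes
assert world_size % (tp * pp * cp) == 0
dp = world_size // (tp * pp * cp)
print(f"data-parallel size: {dp}")                 # 8
print(f"nodes at 8 GPUs each: {world_size // 8}")  # 64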
@@ -157,10 +155,6 @@ APP="python3 -u pretrain_gpt.py \
if [[ $profiling == "torch" ]]; then
APP+=" ${TORCH_PROFIE_ARGS[@]}"
elif [[ $profiling == "hip" ]]; then
mkdir -p hip_prof_data
APP+=" ${HIP_PROFIE_ARGS[@]}"
APP="hipprof -d hip_prof_data --hip-trace --trace-off ${APP}"
fi
#for hygon cpu
......
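Returning to the MOE_ARGS hunk above: --moe-expert-capacity-factor 0.5 together with --moe-pad-expert-input-to-capacity caps and pads the number of tokens each expert processes. The sketch below illustrates the usual capacity arithmetic; the expert count and router top-k are hypothetical (they are not visible in this hunk), and the exact rounding Megatron applies may differ.

# Illustrative MoE capacity arithmetic (not Megatron's implementation).
import math

tokens = 8192            # --seq-length from the script above, per micro-batch
num_experts = 16         # hypothetical expert count
topk = 2                 # hypothetical router top-k
capacity_factor = 0.5    # --moe-expert-capacity-factor

# Each token is routed to top-k experts; capacity caps how many routed tokens
# an expert keeps, and padding fills under-full experts up to this size.
capacity = math.ceil(tokens * topk / num_experts * capacity_factor)
print(f"tokens per expert after capping/padding: {capacity}")  # 512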
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import InferenceWrapperConfig
from pretrain_gpt import model_provider
import torch
import sys
from argparse import Namespace
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController
from megatron.core.transformer.module import MegatronModule
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
os.path.pardir, os.path.pardir)))
from megatron.training import get_args
from megatron.training import get_tokenizer
from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
from typing import List
def add_text_generate_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='text generation')
group.add_argument("--temperature", type=float, default=1.0,
help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1,
help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0,
help='Top p sampling.')
group.add_argument("--return-log-probs", action='store_true', default=False,
help='Return the log probabilities of the final output tokens')
group.add_argument("--num-tokens-to-generate", type=int, default=30,
help='Number of tokens to generate for each prompt')
group.add_argument("--prompts", metavar='N', type=str, nargs='+',
help='Input prompts with each prompt within quotes and separated by space')
group.add_argument("--max-batch-size", type=int, default=1,
help='Max number of prompts to process at once')
return parser
def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
"""Utility to get the relevant backend for running inference
This function will automatically choose the TRTLLMBackend when possible, and otherwise fall back to the MCore backend if the user does not specify any backend. The TRT LLM backend is not implemented yet.
Args:
args (Namespace): The user arguments parsed from command line
model (MegatronModule): The Megatron model.
Returns:
AbstractBackend: The chosen backend
"""
tokenizer = get_tokenizer()
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=args.hidden_size,
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size
)
inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size)
def main():
"""Main program."""
# Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
# Micro batch size does not need to be set by the user. (It is calculated based on the inference-batch-times-seqlen-threshold argument)
initialize_megatron(extra_args_provider=add_text_generate_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True,
'micro_batch_size': 1,
'exit_on_missing_checkpoint': True})
# Set up model and load checkpoint
model = get_model(model_provider, wrap_with_ddp=False)
load_checkpoint(model, None, None)
model = model[0]
args = get_args()
inference_engine = get_inference_engine(args, model)
sampling_params = SamplingParams(
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate)
results: List[InferenceRequest] = inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params
)
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
result = {
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens' : result.generated_tokens
}
print(result)
if __name__ == "__main__":
main()
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from pretrain_gpt import model_provider
import torch
import sys
import time
import tqdm
import warnings
from argparse import Namespace
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
from megatron.core.transformer.module import MegatronModule
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
from megatron.training import get_args
from megatron.training import get_tokenizer
from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
import asyncio
from typing import AsyncIterator, List
def add_text_generate_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='text generation')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument(
"--return-log-probs",
action='store_true',
default=False,
help='Return the log probabilities of the final output tokens',
)
group.add_argument(
"--num-tokens-to-generate",
type=int,
default=30,
help='Number of tokens to generate for each prompt',
)
group.add_argument(
"--prompts",
metavar='N',
type=str,
nargs='+',
help='Input prompts with each prompt within quotes and separated by space',
)
group.add_argument(
"--max-batch-size", type=int, default=8, dest="inference_max_requests",
help='Max number of prompts to process at once'
)
group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
return parser
def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
"""Utility to get the relevant backend for running inference
This function will automatically choose the TRTLLMBackend when possible, and otherwise fall back to the MCore backend if the user does not specify any backend. The TRT LLM backend is not implemented yet.
Args:
args (Namespace): The user arguments parsed from command line
model (MegatronModule): The Megatron model.
Returns:
AbstractBackend: The chosen backend
"""
tokenizer = get_tokenizer()
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=args.hidden_size,
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size,
inference_max_requests=args.inference_max_requests,
inference_max_seq_length=args.inference_max_seq_length,
)
inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config)
text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer)
return MCoreEngine(text_generation_controller=text_generation_controller)
async def generate(
inference_engine: MCoreEngine,
sampling_params: SamplingParams,
prompts: List[str],
) -> List[InferenceRequest]:
async def collect_stream(prompt, request_id, stream_generator):
print(f"Request {request_id}: {prompt}", end="", flush=True)
prev_idx = 0
async for output in stream_generator:
print(output.generated_text[prev_idx:], end="", flush=True)
prev_idx = len(output.generated_text)
print()
request_ids: List[str] = [
inference_engine.add_request(
prompt=prompt, inference_parameters=sampling_params, streaming=True
)
for prompt in prompts
]
stream_generators = [inference_engine.get_stream_generator(request_id) for request_id in request_ids]
tasks = [
asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
]
await inference_engine.run_engine_async()
await asyncio.gather(*tasks)
results: List[InferenceRequest] = [
inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
]
return results
def main():
"""Main program."""
# Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
# Micro batch size does not need to be set by the user. (It is calculated based on the inference-batch-times-seqlen-threshold argument)
initialize_megatron(
extra_args_provider=add_text_generate_args,
args_defaults={
'no_load_rng': True,
'no_load_optim': True,
'micro_batch_size': 1,
'exit_on_missing_checkpoint': True,
},
)
# Set up model and load checkpoint
model = get_model(model_provider, wrap_with_ddp=False)
load_checkpoint(model, None, None)
model = model[0]
args = get_args()
inference_engine = get_inference_engine(args, model)
sampling_params = SamplingParams(
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate,
)
if args.enable_cuda_graph:
print(f"Running warmup for CUDA graphs...")
inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params
)
start_time = time.perf_counter()
if args.stream:
results: List[InferenceRequest] = asyncio.run(generate(inference_engine, sampling_params, args.prompts))
else:
results: List[InferenceRequest] = inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params,
)
end_time = time.perf_counter()
latency = end_time - start_time
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
result = {
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens': result.generated_tokens,
'latency': latency,
}
print(result)
torch.distributed.destroy_process_group()
if __name__ == "__main__":
main()
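The --stream path above fans each prompt out to an asyncio task that drains a per-request stream generator while the engine runs. A minimal, Megatron-free sketch of that consumption pattern is shown below; the dummy generator stands in for inference_engine.get_stream_generator and yields the cumulative generated_text, as the engine does.

# Stand-alone sketch of the streaming pattern used in generate() above.
import asyncio

async def dummy_stream(prompt):
    # Stands in for the engine's per-request stream generator.
    text = ""
    for tok in [" Hello", ",", " world", "!"]:
        await asyncio.sleep(0.1)  # simulate per-token decode latency
        text += tok
        yield text                # cumulative generated text so far

async def collect_stream(prompt, stream):
    print(f"{prompt}:", end="", flush=True)
    prev = 0
    async for text in stream:
        print(text[prev:], end="", flush=True)  # print only the new suffix
        prev = len(text)
    print()

async def main():
    prompts = ["prompt-0", "prompt-1"]
    tasks = [asyncio.create_task(collect_stream(p, dummy_stream(p))) for p in prompts]
    await asyncio.gather(*tasks)

asyncio.run(main())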
#!/bin/bash
# This example will start serving the Llama3.1-8B model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--use-checkpoint-args \
--disable-bias-linear \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 500000 \
--use-rope-scaling \
--use-rotary-position-embeddings \
--swiglu \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 131072 \
--bf16 \
--micro-batch-size 1 \
--seq-length 8192
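Once one of these servers is running, it is queried over HTTP; Megatron's tools/text_generation_cli.py does this with a PUT to the server's /api endpoint. The sketch below follows that pattern, but treat the port, route, and payload fields as assumptions and verify them against the server version actually launched.

# Sketch of a REST client for the text generation server started above.
# Port 5000, the /api route, and the payload fields mirror what
# tools/text_generation_cli.py sends; verify against your server version.
import json
import requests

url = "http://localhost:5000/api"  # assumed default Flask port
payload = {"prompts": ["The capital of France is"], "tokens_to_generate": 32}
resp = requests.put(url, data=json.dumps(payload),
                    headers={"Content-Type": "application/json"})
print(resp.json())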
#!/bin/bash
# This example will start serving the Llama3-8B model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--use-checkpoint-args \
--disable-bias-linear \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 500000 \
--use-rotary-position-embeddings \
--swiglu \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 8192 \
--bf16 \
--micro-batch-size 1 \
--seq-length 8192
#!/bin/bash
# This example will start serving the Mistral-7B-v0.3 model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--use-checkpoint-args \
--apply-layernorm-1p \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--ffn-hidden-size 14336 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 4096 \
--bf16 \
--micro-batch-size 1 \
--seq-length 4096 \
--seed 101
#!/bin/bash
# This example will start serving the 345M model.
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel
DISTRIBUTED_ARGS="--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
pip install flask-restful
python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
# To launch the client: python ../../tools/text_generation_cli.py <URL-provided-by-server>
CHECKPOINT_PATH=$1
TOKENIZER_PATH=$2
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4
export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--untie-embeddings-and-output-weights \
--num-layers 56 \
--hidden-size 4096 \
--load ${CHECKPOINT_PATH} \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--hybrid-attention-ratio 0.08 \
--hybrid-mlp-ratio 0.5 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--disable-bias-linear \
--normalization RMSNorm \
--seq-length 4096 \
--max-position-embeddings 4096 \
--position-embedding-type none \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--distributed-backend nccl \
--distributed-timeout-minutes 1440 \
--bf16 \
--micro-batch-size 1 \
--use-mcore-models \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--seed 42