diff --git a/CHANGELOG.md b/CHANGELOG.md old mode 100755 new mode 100644 diff --git a/CODEOWNERS b/CODEOWNERS old mode 100755 new mode 100644 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md old mode 100755 new mode 100644 diff --git a/Dockerfile.ci.dev b/Dockerfile.ci.dev old mode 100755 new mode 100644 diff --git a/Dockerfile.ci.lts b/Dockerfile.ci.lts old mode 100755 new mode 100644 diff --git a/Dockerfile.linting b/Dockerfile.linting old mode 100755 new mode 100644 diff --git a/GPT_pretraining.sh b/GPT_pretraining.sh old mode 100755 new mode 100644 diff --git a/LICENSE b/LICENSE old mode 100755 new mode 100644 diff --git a/Llama_pretraining.sh b/Llama_pretraining.sh index 265e2b5006374539df4f79de7f5e6d5015a64d4a..1ac8a7356fc0ffc23dc0c343f38a16177513ddb8 100755 --- a/Llama_pretraining.sh +++ b/Llama_pretraining.sh @@ -1,79 +1,64 @@ #!/bin/bash set -eux - #export FLASH_ATTENTION_PRINT_PARAM=1 # Runs the "7B" parameter model export HSA_FORCE_FINE_GRAIN_PCIE=1 export OMP_NUM_THREADS=1 export NCCL_P2P_LEVEL=PXB # SYS - #export HIP_ALLOC_INITIALIZE=0 -#export GPU_MAX_HW_QUEUES=20 # sglang bubbles - +# export GPU_MAX_HW_QUEUES=10 export NCCL_ALGO=Ring export NCCL_NCHANNELS_PER_PEER=16 -export NCCL_MIN_NCHANNELS=20 +export NCCL_MIN_NCHANNELS=32 # 20 +export NCCL_MAX_NCHANNELS=32 # 20 export NCCL_IB_TIMEOUT=22 export CUDA_DEVICE_MAX_CONNECTIONS=1 - -export NCCL_IB_HCA=mlx5_1,mlx5_2 -# export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,,mlx5_4,,mlx5_5,,mlx5_6,,mlx5_7 -export NCCL_NET_GDR_LEVEL=SYS -export NCCL_NET_GDR_READ=0 +export NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 +export NCCL_NET_GDR_LEVEL=7 +export NCCL_NET_GDR_READ=1 +export RCCL_SDMA_COPY_ENABLE=0 +export NCCL_TOPO_FILE="/public/home/xingjl/dependency/rccl-tests-0204/topo-input.xml" +# export NCCL_TOPO_FILE="/workspace/rccl-test/rccl-tests-0204/topo-input.xml" export GLOG_minloglevel=3 # print only error-level nccl logs - -# export TORCH_COMPILE_DEBUG=1 # inspect the compiled graph - source /opt/dtk/env.sh # load the hipblaslt library # export LD_LIBRARY_PATH=/data/hipblaslt-install-0904/lib:$LD_LIBRARY_PATH +# export LD_LIBRARY_PATH=/data/hipblaslt-install-dtk-25.04-0212/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/public/home/xingjl/dependency/hipblaslt-install-0227/lib:$LD_LIBRARY_PATH # update rocblas # export LD_LIBRARY_PATH=/data/rocblas-install_qwen1211/lib:$LD_LIBRARY_PATH # export LD_LIBRARY_PATH=/data/rocblas-install_qwen1228/lib:$LD_LIBRARY_PATH -# export LD_LIBRARY_PATH=/data/rocblas-install_0107_trans/lib:$LD_LIBRARY_PATH - +export LD_LIBRARY_PATH=/public/home/xingjl/dependency/rocblas-install-0224/lib:$LD_LIBRARY_PATH # export LD_LIBRARY_PATH=/data/rocblas-install-0118-bf16/lib:$LD_LIBRARY_PATH # torch: fold multi-stream work onto a single stream -# export ALLREDUCE_STREAM_WITH_COMPUTE=1 - +export ALLREDUCE_STREAM_WITH_COMPUTE=1 +export SENDRECV_STREAM_WITH_COMPUTE=1 # add sync for profiling capture to avoid stalls # export GPU_FLUSH_ON_EXECUTION=1 # export HIP_DIRECT_DISPATCH=0 - # capture rocblas sizes -export ROCBLAS_LAYER=3 +# export ROCBLAS_LAYER=3 # capture fa sizes # export FLASH_ATTENTION_PRINT_PARAM=1 - -CHECKPOINT_PATH=./tmp_7b #$1 # +# increase the compile cache +export cache_size_limit=64 +# CHECKPOINT_PATH=./Llama-2-7b-hf-to-meg-tp1-pp2 #CHECKPOINT_PATH=./tmp_7b # +SAVE_PATH=./tmp_7b TENSORBOARD_LOGS_PATH=./tmp_7b #$2 # -DATA_PATH="/data/datasets/nemo_pretrain/oscar-1GB/oscar-1GB-llama_text_document" - +DATA_PATH="/public/home/xingjl/megatron-lm/llama2_dataset/oscar-1GB_head-llama2_text_document" #_text_document +# DATA_PATH="/data/datasets/oscar-1GB-head/oscar-1GB_head-llama2_text_document" #_text_document GPT_MODEL_ARGS=( 
--num-layers 32 --hidden-size 4096 --ffn-hidden-size 11008 --num-attention-heads 32 --max-position-embeddings 4096 - --normalization RMSNorm - --position-embedding-type rope + --position-embedding-type rope # none # --untie-embeddings-and-output-weights # untie embedding and output weights for more flexibility ) - -# GPT_MODEL_ARGS=( -# --num-layers 40 -# --hidden-size 5120 -# --ffn-hidden-size 13824 -# --num-attention-heads 40 -# --max-position-embeddings 4096 - -# --normalization RMSNorm -# --position-embedding-type rope -# --untie-embeddings-and-output-weights # untie embedding and output weights for more flexibility -# ) - -# export NVTE_FLASH_ATTN=1 # use cutlass -export NVTE_FLASH_ATTN_TRITON=1 # use triton_fa +export NVTE_FLASH_ATTN=1 # use cutlass +# export NVTE_FLASH_ATTN_TRITON=1 # use triton_fa # --transformer-impl transformer_engine # use these two args for the core path # --use-mcore-models # --transformer-impl local # use these two args for the legacy path @@ -82,8 +67,8 @@ TRAINING_ARGS=( --transformer-impl local # use these two args for the legacy path --use-legacy-models --micro-batch-size 1 - --global-batch-size 64 #240 #60 #512 #64 - --train-iters 10 + --global-batch-size 64 #32 #240 #60 #512 #64 + --train-iters 50 --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 @@ -96,7 +81,7 @@ TRAINING_ARGS=( --disable-bias-linear --attention-dropout 0 --hidden-dropout 0 - --no-gradient-accumulation-fusion # accuracy is wrong when enabled; can be enabled after apex is updated + # --no-gradient-accumulation-fusion --swiglu --lr 3.0e-5 --lr-decay-style cosine @@ -109,37 +94,52 @@ TRAINING_ARGS=( # --recompute-method block --overlap-grad-reduce # overlap ddp grad reduce # --tp-comm-overlap # overlap tensor parallel comm and gemm; optimization not yet adapted - # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter and dgrad gemm; optimization not yet adapted + # --tp-comm-overlap-rs-dgrad # overlap reduce-scatter and dgrad gemm --use-flash-attn-cutlass ) +# environment variables for torch fa +# export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 +# export TORCHINDUCTOR_BENCHMARK_FUSION=1 +# export TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES=1 +# export TORCHINDUCTOR_MAX_AUTOTUNE=1 +# export TORCHINDUCTOR_CACHE_DIR=./cache # --use-flash-attn-cutlass # cutlass fa # --use-flash-attn-triton # triton fa - +# --use-flash-attn-torch # torch fa MODEL_PARALLEL_ARGS=( --sequence-parallel - --tensor-model-parallel-size 2 + --tensor-model-parallel-size 1 --pipeline-model-parallel-size 2 + # --context-parallel-size 2 + # --num-layers-per-virtual-pipeline-stage 4 + # --microbatch-group-size-per-virtual-pipeline-stage 1 +# --no-overlap-p2p-communication # when enabled ) - DATA_ARGS=( --data-path $DATA_PATH --seq-length 4096 #4096 --split 949,50,1 --tokenizer-type Llama2Tokenizer - --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model + --tokenizer-model /public/home/xingjl/megatron-lm/llama2_dataset/tokenizer.model + # --tokenizer-model /data/model_weights/llama2_7b_hf/tokenizer.model ) - EVAL_AND_LOGGING_ARGS=( --log-interval 1 --log-throughput --save-interval 1000 --eval-interval 1000 - --save $CHECKPOINT_PATH - --load $CHECKPOINT_PATH - --eval-iters 10 + #--save $SAVE_PATH + #--load $SAVE_PATH --eval-iters 3 --tensorboard-dir $TENSORBOARD_LOGS_PATH ) - +# FINETUNE_ARGS=( +# # --finetune +# # --pretrained-checkpoint $CHECKPOINT_PATH +# --load $CHECKPOINT_PATH +# --no-load-optim +# --no-load-rng +# ) PROFILE_ARGS=( --profile --profile-step-start 4 @@ -148,20 +148,17 @@ PROFILE_ARGS=( --profile-ranks 0 1 2 3 4 5 6 7 --profile-dir prof_data ) - RANK=$OMPI_COMM_WORLD_RANK LOCAL_RANK=$OMPI_COMM_WORLD_LOCAL_RANK WORLD_SIZE=$OMPI_COMM_WORLD_SIZE DIST_URL=${1} -DIST_PORT=34567 - +DIST_PORT=34577 DISTRIBUTED_ARGS=( --rank ${RANK} --world-size ${WORLD_SIZE} --local-rank ${LOCAL_RANK} --dist-url
tcp://${DIST_URL}:${DIST_PORT} ) - APP="python -u pretrain_gpt.py \ ${GPT_MODEL_ARGS[@]} \ ${TRAINING_ARGS[@]} \ @@ -169,53 +166,52 @@ APP="python -u pretrain_gpt.py \ ${DATA_ARGS[@]} \ ${EVAL_AND_LOGGING_ARGS[@]} \ ${DISTRIBUTED_ARGS[@]} \ - " # 开启profile # ${PROFILE_ARGS[@]} \ - +# export HIP_VISIBLE_DEVICES=0,7 # # 4,5,6,7 #, export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # # 4,5,6,7 #, # export CUDA_VISIBLE_DEVICES=4,5,6,7 # 0,1,2,3, -${APP} -# case ${LOCAL_RANK} in -# [0]) -# # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [1]) -# # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [2]) -# # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [3]) -# # export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [4]) -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [5]) -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [6]) -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# [7]) -# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -# ${APP} -# # numactl --cpunodebind=0 --membind=0 ${APP} -# ;; -# esac +# ${APP} +case ${LOCAL_RANK} in +[0]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[1]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + numactl --cpunodebind=1 --membind=1 ${APP} + ;; +[2]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + numactl --cpunodebind=2 --membind=2 ${APP} + ;; +[3]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=3 --membind=3 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[4]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=4 --membind=4 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[5]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=5 --membind=5 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[6]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=6 --membind=6 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +[7]) + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + numactl --cpunodebind=7 --membind=7 ${APP} + # hipprof --hip-trace --trace-off numactl --cpunodebind=0 --membind=0 ${APP} + ;; +esac \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in old mode 100755 new mode 100644 diff --git a/README.md.origin b/README.md.origin old mode 100755 new mode 100644 diff --git a/docs/llama_mistral.md b/docs/llama_mistral.md old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/context_parallel.rst b/docs/source/api-guide/context_parallel.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/datasets.rst b/docs/source/api-guide/datasets.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/dist_checkpointing.rst b/docs/source/api-guide/dist_checkpointing.rst old mode 
100755 new mode 100644 diff --git a/docs/source/api-guide/dist_checkpointing.strategies.rst b/docs/source/api-guide/dist_checkpointing.strategies.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/dist_optimizer.md b/docs/source/api-guide/dist_optimizer.md old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/distributed.rst b/docs/source/api-guide/distributed.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/encoder_decoder_parallelism.rst b/docs/source/api-guide/encoder_decoder_parallelism.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/fusions.rst b/docs/source/api-guide/fusions.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/index.rst b/docs/source/api-guide/index.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.bert.rst b/docs/source/api-guide/models.bert.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.gpt.rst b/docs/source/api-guide/models.gpt.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.rst b/docs/source/api-guide/models.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/models.t5.rst b/docs/source/api-guide/models.t5.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/moe.rst b/docs/source/api-guide/moe.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/num_microbatches_calculator.rst b/docs/source/api-guide/num_microbatches_calculator.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/optimizer_param_scheduler.rst b/docs/source/api-guide/optimizer_param_scheduler.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/pipeline_parallel.rst b/docs/source/api-guide/pipeline_parallel.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/tensor_parallel.rst b/docs/source/api-guide/tensor_parallel.rst old mode 100755 new mode 100644 diff --git a/docs/source/api-guide/transformer.rst b/docs/source/api-guide/transformer.rst old mode 100755 new mode 100644 diff --git a/docs/source/images/context_parallel/CP_overview.png b/docs/source/images/context_parallel/CP_overview.png old mode 100755 new mode 100644 diff --git a/docs/source/images/context_parallel/CP_results.png b/docs/source/images/context_parallel/CP_results.png old mode 100755 new mode 100644 diff --git a/docs/source/images/distrib_optimizer/data_flow.png b/docs/source/images/distrib_optimizer/data_flow.png old mode 100755 new mode 100644 diff --git a/docs/source/images/distrib_optimizer/sharding_scheme.png b/docs/source/images/distrib_optimizer/sharding_scheme.png old mode 100755 new mode 100644 diff --git a/docs/source/images/moe/token_drop.png b/docs/source/images/moe/token_drop.png old mode 100755 new mode 100644 diff --git a/docs/source/index.rst b/docs/source/index.rst old mode 100755 new mode 100644 diff --git a/docs/source/user-guide/index.rst b/docs/source/user-guide/index.rst old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/README.md b/examples/academic_paper_scripts/detxoify_lm/README.md old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py b/examples/academic_paper_scripts/detxoify_lm/annotations/filter-selfgeneration.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py 
b/examples/academic_paper_scripts/detxoify_lm/annotations/perspective_api_annotate.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh b/examples/academic_paper_scripts/detxoify_lm/annotations/preprocess.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/finetune_gpt_distributed-1.3b.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh b/examples/academic_paper_scripts/detxoify_lm/generate-1.3b.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/perspective_api.py b/examples/academic_paper_scripts/detxoify_lm/perspective_api.py old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh b/examples/academic_paper_scripts/detxoify_lm/self_generation/selfgenerate-1.3b-unconditional.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/README.md b/examples/academic_paper_scripts/msdp/README.md old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/data_processing.sh b/examples/academic_paper_scripts/msdp/data_processing.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh b/examples/academic_paper_scripts/msdp/eval_knwl_generation.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/eval_resp_generation.sh b/examples/academic_paper_scripts/msdp/eval_resp_generation.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/prep_resp_gen.sh b/examples/academic_paper_scripts/msdp/prep_resp_gen.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh b/examples/academic_paper_scripts/msdp/prompt_knwl_gen.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh b/examples/academic_paper_scripts/msdp/prompt_resp_gen.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/CONFIG.sh b/examples/academic_paper_scripts/sc21/CONFIG.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/README.md b/examples/academic_paper_scripts/sc21/README.md old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/SBATCH.sh b/examples/academic_paper_scripts/sc21/SBATCH.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/SRUN.sh b/examples/academic_paper_scripts/sc21/SRUN.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_11.sh b/examples/academic_paper_scripts/sc21/run_figure_11.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_12.sh b/examples/academic_paper_scripts/sc21/run_figure_12.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_13.sh b/examples/academic_paper_scripts/sc21/run_figure_13.sh old mode 100755 
new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_14.sh b/examples/academic_paper_scripts/sc21/run_figure_14.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_15.sh b/examples/academic_paper_scripts/sc21/run_figure_15.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_16.sh b/examples/academic_paper_scripts/sc21/run_figure_16.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_17.sh b/examples/academic_paper_scripts/sc21/run_figure_17.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_figure_18.sh b/examples/academic_paper_scripts/sc21/run_figure_18.sh old mode 100755 new mode 100644 diff --git a/examples/academic_paper_scripts/sc21/run_table_1.sh b/examples/academic_paper_scripts/sc21/run_table_1.sh old mode 100755 new mode 100644 diff --git a/examples/bert/README.md b/examples/bert/README.md old mode 100755 new mode 100644 diff --git a/examples/bert/train_bert_340m_distributed.sh b/examples/bert/train_bert_340m_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/export/README.md b/examples/export/README.md old mode 100755 new mode 100644 diff --git a/examples/export/knowledge_distillation/pretrain_gpt_modelopt.py b/examples/export/knowledge_distillation/pretrain_gpt_modelopt.py old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/README.md b/examples/export/ptq_and_trtllm_export/README.md old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama2_7b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh b/examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/text_generation_ptq.py b/examples/export/ptq_and_trtllm_export/text_generation_ptq.py old mode 100755 new mode 100644 diff --git a/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py b/examples/export/ptq_and_trtllm_export/trtllm_text_generation.py old mode 100755 new mode 100644 diff --git a/examples/export/trtllm_export/README.md b/examples/export/trtllm_export/README.md old mode 100755 new mode 100644 diff --git a/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py b/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py old mode 100755 new mode 100644 diff --git a/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py b/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py old mode 100755 new mode 100644 diff --git 
a/examples/gpt3/README.md b/examples/gpt3/README.md old mode 100755 new mode 100644 diff --git a/examples/gpt3/gpt_config.yaml b/examples/gpt3/gpt_config.yaml old mode 100755 new mode 100644 index 443e4b79b88daf8d3c3b0ed0bc5cae04529db940..06257827fdfbd32d262d0da032930ebbaaf578aa --- a/examples/gpt3/gpt_config.yaml +++ b/examples/gpt3/gpt_config.yaml @@ -63,6 +63,7 @@ language_model: # MoE related moe_router_load_balancing_type: "aux_loss" moe_router_topk: 2 + moe_router_topk_limited_devices: null moe_grouped_gemm: False moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss. moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss diff --git a/examples/gpt3/train_gpt3_175b_distributed.sh b/examples/gpt3/train_gpt3_175b_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/inference/README.md b/examples/inference/README.md old mode 100755 new mode 100644 index bd8e738e55b60f38c94323a7adf445e3f7474a7e..b4b07cbc6ab88a1b3453bcecbb9534d8026a6f64 --- a/examples/inference/README.md +++ b/examples/inference/README.md @@ -1,5 +1,5 @@ ### Megatron Core Inference Documentation -This guide will walk you through how you can use megatron core for inference on your models. +This guide provides an example for Megatron Core for running model inference. ### Contents - [Megatron Core Inference Documentation](#megatron-core-inference-documentation) @@ -18,21 +18,21 @@ This guide will walk you through how you can use megatron core for inference on
#### 1. Quick Start -This will walk you through the flow of running batch inference on a GPT model trained using megatron core. The file can be found at [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py) +This example runs batch inference on a GPT model trained using Megatron Core. The entrypoint is [gpt_batch_inference.py](./gpt/gpt_batch_inference.py)
-##### 1.1 Understanding The Code -***STEP 1 - We initialize model parallel and other default arguments*** -We can default micro batch size to be 1, since for TP models it is not used, and for PP models it is calculated during runtime. +##### 1.1 Code Walkthrough +***STEP 1 - Initialize model parallel and other default arguments*** +The micro batch size is set to 1, since it is not used for tensor-parallel-only models, and for pipeline-parallel models it is calculated at runtime. ```python initialize_megatron( args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1} ) ``` -***STEP 2 - We load the model using the model_provider_function*** -NOTE: The model provider function in the script supports MCore and Legacy models. +***STEP 2 - Load the model using the model_provider_function*** +NOTE: The model provider function supports both MCore and Legacy models. ```python model = get_model(model_provider, wrap_with_ddp=False) @@ -41,10 +41,10 @@ NOTE: The model provider function in the script supports MCore and Legacy models ``` ***STEP 3 - Choose an engine*** -One of the important elements of the generate function is an inference engine. In this example we will be choosing the [megatron core engine](../../megatron/core/inference/engine/mcore_engine.py) with a [simple text generation controller](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py), the default engine. Other engines that will be supported in the future are TRTLLMEngine. +Text generation requires an inference engine, which includes a scheduler. The default engine is the [Megatron Core engine](../../megatron/core/inference/engine/mcore_engine.py) with a simple [text generation controller](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py). TRTLLMEngine will be supported in the future. ```python inference_wrapped_model = GPTInferenceWrapper(model, args) - text_generation_controller = SimpleTextGenerationController( + text_generation_controller = TextGenerationController( inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer ) @@ -53,12 +53,12 @@ One of the important elements of the generate function is an inference engine. I ) ``` -***STEP 4 - Run the generate function and display results*** -We use default values for the [common inference params](../../megatron/core/inference/common_inference_params.py). Customize this if you want to change top_p, top_k, number of tokens to generate etc. -*Note that the result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)* +***STEP 4 - Run text generation*** +The [SamplingParams](../../megatron/core/inference/sampling_params.py) contains suggested defaults. Customize this to change top_p, top_k, number of tokens to generate etc. +*Note: The result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py)* ```python results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, common_inference_params=common_inference_params + prompts=args.prompts, sampling_params=sampling_params ) if torch.distributed.get_rank() == 0: @@ -76,12 +76,12 @@ We use default values for the [common inference params](../../megatron/core/infe
##### 1.2 Running The Code -An example run script is shown below. Change the tokenizer paths, inference params, and other settings for your model. +An example run script is shown below. Set the tokenizer paths, inference params, and other settings appropriately. -For a quick recap on inference params refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910) +For a quick recap on sampling parameters, refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910). ``` -#In a slurm cluster (You could also use docker) +# In a slurm cluster (You could also use docker) ACCOUNT= MLM_PATH=/path/to/megatron-lm GPT_CKPT=/path/to/gpt/ckpt @@ -133,8 +133,8 @@ NOTE: Other parameters which can be customized for inference are :- --top_p (top_p sampling) --num-tokens-to-generate (Number of tokens to generate for each prompt) --inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use pipelining, otherwise we will.') ---use-dist-ckpt (If you are using dist checkpoint format for the model) ---use-legacy-models (If you are using legacy gpt model instead of mcore gpt model) +--use-dist-ckpt (If using dist checkpoint format for the model) +--use-legacy-models (If using legacy gpt model instead of mcore gpt model) ``` @@ -142,16 +142,17 @@ NOTE: Other parameters which can be customized for inference are :-
-#### 2. Flow of Control In MCore Backend -The following is what happens in the [simple_gpt_batch_inference.py](./gpt/simple_gpt_batch_inference.py). -* We call [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function with all our input prompts. -* The scheduler in the engine will add these prompts to the [active requests] pool (../../megatron/core/inference/inference_request.py) until we hit the max batch size, and then it will put the rest in the waiting requests pool. -* The engine will then run until all requests (waiting + active) are completed +#### 2. Control Flow in the MCore Backend +An example of inference with static batching is provided in [gpt_batch_inference.py](./gpt/gpt_batch_inference.py). +* The [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function is called with the input prompts. +* The `Scheduler` in the engine will add these prompts to the [active requests pool](../../megatron/core/inference/inference_request.py) until max batch size is hit. Remaining requests will be added to the waiting requests pool. +* The engine will run until all requests (waiting + active) are completed. * The active requests are passed into **generate_all_output_tokens_static_batch()** of the text generation controller . - * This function uses the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) **prep_model_for_inference()** , and then runs an auto regressive loop - * In the auto regressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to get the required input, passes it into the **run_one_forward_step()** method, which calls the appropriate (PP, TP) model `.forward()` methods to get the output logits - * The output logits are synchronized across all pipeline parallel ranks - * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the common inference parameters. + * This function uses the **prep_model_for_inference()** method of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) and runs an autoregressive sampling loop + * In the autoregressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to slice out the input tokens and masks + * Input tokens and masks are passed into the **run_one_forward_step()** method, which calls the model `.forward()` method to get the output logits + * Output logits are synchronized across all pipeline parallel ranks + * The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the sampling parameters. * The sampled tokens are then appended to the input prompt tokens for the next iteration * The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition * After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed. @@ -160,16 +161,18 @@ The following is what happens in the [simpl
#### 3. Customizing The Inference Pipeline -The following guide will walk you through how you can customize different parts of the inference pipeline. There are three levels at which you can customize the pipeline. -* **Inference engine** - Highest level of customization. Currently we support the MCore Engine. Change this to add a new engine. -* **Text generation controller** - Extend this to customize tokenization, detokenization, or implement a new sampling strategy. + +The inference pipeline supports three levels of customization: + +* **Inference engine** - The MCore Engine is currently supported. Change this to add a new backend. +* **Text generation controller** - The main sampling loop. This can be customized to support alternative tokenization, detokenization, or to implement a new sampling strategy. * **Inference Wrapped Model** - Change this to support a new model. * **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, or other sampling parameters.
##### 3.1. Create Your Own Inference Backend -This is the highest level of customization. The [abstract_engine.py](./../../megatron/core/inference/engine/abstract_engine.py) file has a generate method that can be extended to support a new backend. +The [abstract_engine.py](./../../megatron/core/inference/engine/abstract_engine.py) file contains a `generate` method that can be extended to support a new backend. ```python class AbstractEngine(ABC): @@ -177,15 +180,17 @@ class AbstractEngine(ABC): def generate(self) -> dict: """The abstract backend's generate function. - To define your own backend, make sure you implement this and return the outputs as a dictionary . - + To define a new backend, implement this method and return the outputs as a dictionary. +```
-##### 3.2. Create Your Own Text Generation Controller -In case you want to use the megatron core backend, but would like to overwrite the tokenization, text generation or detokenization extend the [simple_text_generation_controller.py](../../megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py). The class has the following methods +##### 3.2. Implement a new Sampling Loop + +The [TextGenerationController](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py) contains the main sampling loop and can be modified to support new tokenization, detokenization, or sampling strategies. + ``` python -class SimpleTextGenerationController: +class TextGenerationController: def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]: """Utility to tokenize the input prompts""" @@ -193,12 +198,12 @@ class SimpleTextGenerationController: def sample_from_logits( self, last_token_logits: torch.Tensor, - common_inference_params: CommonInferenceParams, + sampling_params: SamplingParams, vocab_size: int, ) -> torch.Tensor: """Samples the logits to generate outputs - Given the logits of the last token, this function samples it according to the parameters defined in common_inference_params and returns the samples + Given the logits of the last token, this function samples according to the parameters defined in sampling_params and returns the sampled tokens. """ def update_generation_status( @@ -229,12 +234,12 @@ class SimpleTextGenerationController:
##### 3.3. Support Other Models -In order to support other models please extend the [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) file. The abstract wrapper already supports the following : -* Forward method which automatically calls the appropriate forward method (PP or TP etc) depending on model parallel settings -* Initalizes the model and puts it in eval mode -* Obtains the input parameters (batch size, max seq length) and has an instance of the input +Extend [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) to support other models. The abstract model wrapper implements: +* Forward method which calls the model `forward` method depending on model parallel settings +* Initializes the model and puts it in `.eval()` mode +* Setup for the input parameters (max batch size, max seq length) -The main methods to change for your model might be the following: +The following methods should be implemented: ```python class AbstractModelInferenceWrapper: def prep_model_for_inference(self, prompts_tokens: torch.Tensor): @@ -247,28 +252,28 @@ class AbstractModelInferenceWrapper: def get_batch_for_context_window(self) -> List: """Returns the input data for inference - This function gets called iteratively in the inference loop . It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. + This function gets called iteratively in the inference loop. It can be used to extract relevant input from the prompt tokens, attention mask etc. required for each step in inference. ``` -Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of extending this for GPTModel. +Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of implementing this for GPTModel.
##### 3.3. Modify Inference Parameters -We use [common inference params](../../megatron/core/inference/common_inference_params.py) for text generation. Customize this if you want to change top_p, top_k, number of tokens to generate etc. If you want to add other attributes that you would use in the inference loop, you can do that as shown below +We use [sampling params](../../megatron/core/inference/sampling_params.py) for text generation. Customize this if you want to change top_p, top_k, number of tokens to generate etc. If you want to add other attributes that you would use in the inference loop, you can do that as shown below ``` -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams -c = CommonInferenceParams(temperature=0.5) +c = SamplingParams(temperature=0.5) c.add_attributes({'min_length':4, 'eod_id':153}) ```
#### 4. Future work -The following are planned for the future releases . +The following features are planned for the future releases. * Dynamic batching * Paged Attention * TRTLLM Engine support -* Support for Multimodal model inference \ No newline at end of file +* Support for multimodal inference \ No newline at end of file diff --git a/examples/inference/gpt/simple_gpt_batch_inference.py b/examples/inference/gpt/gpt_batch_inference.py old mode 100755 new mode 100644 similarity index 91% rename from examples/inference/gpt/simple_gpt_batch_inference.py rename to examples/inference/gpt/gpt_batch_inference.py index 5c7ae5bd773cd41437650caa01e06664c7e506c2..050b230cef70d56203b7f9270a6166d7251f0769 --- a/examples/inference/gpt/simple_gpt_batch_inference.py +++ b/examples/inference/gpt/gpt_batch_inference.py @@ -6,10 +6,10 @@ import sys from argparse import Namespace from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.mcore_engine import MCoreEngine -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper from megatron.core.inference.inference_request import InferenceRequest -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import SimpleTextGenerationController +from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController from megatron.core.transformer.module import MegatronModule sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))) @@ -66,7 +66,7 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi ) inference_wrapped_model = GPTInferenceWrapper(model, inference_wrapper_config) - text_generation_controller = SimpleTextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer) + text_generation_controller = TextGenerationController(inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer) return MCoreEngine(text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size) def main(): @@ -89,7 +89,7 @@ def main(): inference_engine = get_inference_engine(args, model) - common_inference_params = CommonInferenceParams( + sampling_params = SamplingParams( temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, @@ -97,7 +97,7 @@ def main(): num_tokens_to_generate=args.num_tokens_to_generate) results: List[InferenceRequest] = inference_engine.generate( - prompts=args.prompts, common_inference_params=common_inference_params + prompts=args.prompts, sampling_params=sampling_params ) if torch.distributed.get_rank() == 0: diff --git a/examples/inference/llama_mistral/huggingface_reference.py b/examples/inference/llama_mistral/huggingface_reference.py old mode 100755 new mode 100644 diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.1.sh b/examples/inference/llama_mistral/run_text_generation_llama3.1.sh old mode 100755 new mode 100644 diff --git a/examples/inference/llama_mistral/run_text_generation_llama3.sh b/examples/inference/llama_mistral/run_text_generation_llama3.sh old mode 100755 new mode 100644 diff --git a/examples/inference/llama_mistral/run_text_generation_mistral.sh b/examples/inference/llama_mistral/run_text_generation_mistral.sh old mode 100755 new mode 
100644 diff --git a/examples/inference/run_text_generation_server_345M.sh b/examples/inference/run_text_generation_server_345M.sh old mode 100755 new mode 100644 diff --git a/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh old mode 100755 new mode 100644 diff --git a/examples/inference/t5/simple_t5_batch_inference.py b/examples/inference/t5/simple_t5_batch_inference.py old mode 100755 new mode 100644 index 3f4557d3c2dac2ae1394adfae6d79899d9b0aa11..b4226d7de0f8352fd74bedf047559f0a7819ea84 --- a/examples/inference/t5/simple_t5_batch_inference.py +++ b/examples/inference/t5/simple_t5_batch_inference.py @@ -5,7 +5,7 @@ from argparse import Namespace import torch import pretrain_t5 -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.engines.mcore_engine import MCoreEngine from megatron.core.inference.inference_request import InferenceRequest @@ -120,7 +120,7 @@ def main(): inference_engine = get_inference_engine(args, model) - common_inference_params = CommonInferenceParams( + sampling_params = SamplingParams( temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, @@ -138,7 +138,7 @@ def main(): prompts=args.prompts, add_BOS=True, encoder_prompts=args.encoder_prompts, - common_inference_params=common_inference_params, + sampling_params=sampling_params, ) if torch.distributed.get_rank() == 0: diff --git a/examples/mamba/.gitignore b/examples/mamba/.gitignore old mode 100755 new mode 100644 diff --git a/examples/mamba/Dockerfile b/examples/mamba/Dockerfile old mode 100755 new mode 100644 diff --git a/examples/mamba/README.md b/examples/mamba/README.md old mode 100755 new mode 100644 diff --git a/examples/mamba/run_text_gen_server_8b.sh b/examples/mamba/run_text_gen_server_8b.sh old mode 100755 new mode 100644 diff --git a/examples/mamba/run_text_gen_server_8b_gpt3.sh b/examples/mamba/run_text_gen_server_8b_gpt3.sh old mode 100755 new mode 100644 diff --git a/examples/mamba/train.sh b/examples/mamba/train.sh old mode 100755 new mode 100644 diff --git a/examples/mixtral/README.md b/examples/mixtral/README.md old mode 100755 new mode 100644 diff --git a/examples/mixtral/train_mixtral_8x7b_distributed.sh b/examples/mixtral/train_mixtral_8x7b_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/multimodal/Dockerfile b/examples/multimodal/Dockerfile old mode 100755 new mode 100644 diff --git a/examples/multimodal/README.md b/examples/multimodal/README.md old mode 100755 new mode 100644 index 62e47567b939865fa73346dc8e452f18f02685b4..a65839f8f15f0ada9a38bc5081e74e6251c298d6 --- a/examples/multimodal/README.md +++ b/examples/multimodal/README.md @@ -16,7 +16,7 @@ You can build a docker container using `examples/multimodal/Dockerfile` to run t ### Language model -Follow the instructions in [Mistral](../../docs/llama_mistral.md#mistral-7b) to download weights for Mistral-7B-Instruct-v0.3 (Base or Instruct) from HuggingFace and convert to mcore format with tensor parallel size 4. +Follow the instructions in [Mistral](../../docs/llama_mistral.md#mistral-7b) to download weights for Mistral-7B-Instruct-v0.3 from HuggingFace and convert to mcore format with tensor parallel size 4. Please use the tokenizer from HuggingFace. 
### Vision model @@ -113,7 +113,7 @@ Run the following script: ``` examples/multimodal/text_generation_mistral_clip.sh --input-image-path /path/to/input/images --output-path /some/output/directory \ - --model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer/ --gt-path /path/to/groundtruth/file --task generation-task-name + --model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name ``` where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning` or `MMMU`. diff --git a/examples/multimodal/assets/pretrain_curves.png b/examples/multimodal/assets/pretrain_curves.png old mode 100755 new mode 100644 diff --git a/examples/multimodal/combine_lm_vision_checkpoints.sh b/examples/multimodal/combine_lm_vision_checkpoints.sh old mode 100755 new mode 100644 diff --git a/examples/multimodal/combine_state_dicts.py b/examples/multimodal/combine_state_dicts.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/config.py b/examples/multimodal/config.py old mode 100755 new mode 100644 index 343fcd589688b3e5bf1254189450e4fb06b88b6f..ee404604b650d32f4535a53dfba24498d9ab4f77 --- a/examples/multimodal/config.py +++ b/examples/multimodal/config.py @@ -7,34 +7,20 @@ from megatron.training.activations import fast_gelu, quick_gelu, squared_relu def get_language_model_config(config): - if config.language_model_type == "2b": + if config.language_model_type == "llama3_8b": + config.activation_func = torch.nn.functional.silu config.add_bias_linear = False config.bias_activation_fusion = False config.gated_linear_unit = True - config.apply_query_key_layer_scaling = True - config.layernorm_zero_centered_gamma = True - config.bias_dropout_fusion = False - config.rotary_percent = 0.5 - config.apply_rope_fusion = False - config.attention_softmax_in_fp32 = True - elif config.language_model_type == "8b": - config.add_bias_linear = False - config.bias_activation_fusion = False - config.gated_linear_unit = False - config.apply_query_key_layer_scaling = True - config.layernorm_zero_centered_gamma = True + config.apply_query_key_layer_scaling = False + config.layernorm_zero_centered_gamma = ( + False # Zero centered gamma not supported for RMSNorm + ) config.bias_dropout_fusion = False - config.rotary_percent = 0.5 - config.attention_dropout = 0.0 config.apply_rope_fusion = False - config.activation_func = squared_relu - config.ffn_hidden_size = 16384 - config.masked_softmax_fusion = True config.attention_softmax_in_fp32 = True - config.num_query_groups = 32 - config.kv_channels = 128 - config.rotary_interleaved = False - elif config.language_model_type == "llama3_8b": + config.ffn_hidden_size = 14336 + elif config.language_model_type == "mistral_7b": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False config.bias_activation_fusion = False @@ -47,7 +33,7 @@ def get_language_model_config(config): config.apply_rope_fusion = False config.attention_softmax_in_fp32 = True config.ffn_hidden_size = 14336 - elif config.language_model_type == "mistral_7b": + elif config.language_model_type == "yi-34b": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False config.bias_activation_fusion = False @@ -59,10 +45,11 @@ def get_language_model_config(config): config.bias_dropout_fusion = False config.apply_rope_fusion = False config.attention_softmax_in_fp32 = True - config.ffn_hidden_size = 14336 - elif config.language_model_type == "yi-34b": + config.ffn_hidden_size = 20480 + elif 
config.language_model_type == "qwen2.5_7B": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False + config.add_qkv_bias = True config.bias_activation_fusion = False config.gated_linear_unit = True config.apply_query_key_layer_scaling = False @@ -72,7 +59,7 @@ def get_language_model_config(config): config.bias_dropout_fusion = False config.apply_rope_fusion = False config.attention_softmax_in_fp32 = True - config.ffn_hidden_size = 20480 + config.ffn_hidden_size = 18944 elif config.language_model_type == "qwen2.0_72B": config.activation_func = torch.nn.functional.silu config.add_bias_linear = False @@ -168,13 +155,7 @@ def get_vision_projection_config(config, hidden_size): config.bias_activation_fusion = False config.add_bias_linear = False config.hidden_size = hidden_size # Used as the vision projection output size, i.e., the input to the language model. - if config.language_model_type == "2b": - config.ffn_hidden_size = 5440 - config.activation_func = torch.nn.functional.gelu - if config.language_model_type == "8b": - config.ffn_hidden_size = 16384 - config.activation_func = squared_relu - elif config.language_model_type == "llama3_8b": + if config.language_model_type == "llama3_8b": config.ffn_hidden_size = 14336 config.activation_func = torch.nn.functional.gelu elif config.language_model_type == "mistral_7b": @@ -185,6 +166,9 @@ def get_vision_projection_config(config, hidden_size): config.ffn_hidden_size = 20480 config.normalization = "LayerNorm" config.activation_func = torch.nn.functional.gelu + elif config.language_model_type == "qwen2.5_7B": + config.ffn_hidden_size = 3584 + config.activation_func = torch.nn.functional.gelu elif config.language_model_type == "qwen2.0_72B": config.ffn_hidden_size = 29568 config.normalization = "LayerNorm" diff --git a/examples/multimodal/convert_llava_pretrain_to_wds.py b/examples/multimodal/convert_llava_pretrain_to_wds.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/dataloader_provider.py b/examples/multimodal/dataloader_provider.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/dataset_helpers.py b/examples/multimodal/dataset_helpers.py old mode 100755 new mode 100644 index de76f8e45e3a32e3e2a429128ee484d4185e39f9..ecbbc502c08bcda12d52c74eaabdbd3ffc3d774b --- a/examples/multimodal/dataset_helpers.py +++ b/examples/multimodal/dataset_helpers.py @@ -2,16 +2,19 @@ import bisect import dataclasses import json +import re import sys import traceback from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Union from image_processing import get_visual_transform +from PIL import Image +from torchvision.transforms import ToPILImage import numpy as np import torch -from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN +from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, IMAGE_TOKEN, VIDEO_TOKEN from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.energon import ( Batch, @@ -175,6 +178,10 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, self.img_h, self.img_w = self.args.img_h, self.args.img_w + # This map is used to reduce the number of tiles used per image if the number of tokens is + # larger than the decoder_seq_length. 
+ self.num_tiles_degradation_map = {12:8, 8:6, 6:4, 4:2, 2:1, 1:1} + def _get_total_seq_length(self, input_ids, num_tiles): """Calculate expected sequence length given text tokens length and number of tiles.""" total_num_images = len(num_tiles) @@ -237,7 +244,7 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] - cur_prompt = "\n" + cur_prompt + "\n" + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt + "\n" caption = sample.caption.strip() @@ -282,7 +289,7 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, # LLAVA training: override text-prompt with just the image. conv = [ # Note: no system message. - {"role": "user", "content": "\n"}, + {"role": "user", "content": IMAGE_TOKEN + "\n"}, {"role": "assistant", "content": sample.answers}, ] @@ -307,66 +314,130 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, """Encode SFT sample.""" augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False - has_image = sample.__subflavors__['has_image'] if 'has_image' in sample.__subflavors__ else False - has_image = has_image or (hasattr(sample, "images") and len(sample.images) > 0) - if has_video: - # Grab the selected frames of the video as a tensor with shape - # fhwc: (num_frames, height, width, num_channels). - video_fhwc = sample.images[0].permute(0, 2, 3, 1) - selected_frames = torch.linspace( - 0, video_fhwc.shape[0] - 1, self.args.num_frames).long() - video_frame_fhwc = video_fhwc[selected_frames] - imgs = [] - for video_frame_hwc in video_frame_fhwc: - imgs += get_visual_transform( - video_frame_hwc, self.img_h, self.img_w, - self.args.use_tiling, self.args.max_num_tiles, - self.args.use_thumbnail, augment, self.args.vision_model_type) - num_tiles = [len(imgs)] - elif has_image: - imgs = get_visual_transform( - sample.images[0], self.img_h, self.img_w, self.args.use_tiling, self.args.max_num_tiles, self.args.use_thumbnail, augment, - self.args.vision_model_type, - ) - num_tiles = [len(imgs)] - else: - imgs = num_tiles = [] - sample.__key__ = "{}-{}".format("no-image", sample.__key__) + has_image = False + if hasattr(sample, "images"): + # If this is a text-only sample and we are freezing the LM, + # then use a dummy input image. + if len(sample.images) == 0 and self.args.freeze_LM: + empty_img = Image.new('RGB', (self.args.img_w, self.args.img_h), (255, 255, 255)) + sample.images.append(empty_img) + if len(sample.images) > 0 and not has_video: + has_image = True - conversation = [] # Note: Some tokenizers may ignore the system prompt. - conversation.append({"role": "system", "content": "Answer the questions."}) - - has_image_token = False - + conversation = [{"role": "system", "content": "Answer the questions."}] + # Format the conversation as a list of "user" / "assistant" turns. for text in sample.texts: - if IMAGE_TOKEN in text["value"]: - has_image_token = True - - if text["from"] == "human": - role = "user" - elif text["from"] == "gpt": - role = "assistant" - else: - raise RuntimeError(f"unexpected role {text['from']} in {sample.texts}") - - turn = {"role": role, "content": text["value"]} - conversation.append(turn) - - # If the sample contains an image but none of the user messages has an image token, - # then add it to the first user message. 
- if len(imgs) > 0 and not has_image_token: + error_msg = f"unexpected role {text['from']} in {sample.texts}" + assert text["from"] in ["human", "gpt"], error_msg + conversation.append({ + "role": "user" if text["from"] == "human" else "assistant", + "content": text["value"]}) + + # Replace the image tags with IMAGE_TOKEN and count the number of image tags + number_image_tags = 0 + image_tag_ids_list = [] + for turn in conversation: + if turn["role"] == "user": + image_tag_ids = [int(x) - 1 for x in re.findall(r"", turn["content"])] + image_tag_ids_list.extend(image_tag_ids) + turn["content"] = re.sub(r"", IMAGE_TOKEN, turn["content"]) + number_image_tags += turn["content"].count(IMAGE_TOKEN) + # For videos, we replace the image tag with the video tag + if has_video: + turn["content"] = turn["content"].replace(IMAGE_TOKEN, VIDEO_TOKEN) + + # We re-order the images in sample.images according to how they appear in the conversation. + if len(image_tag_ids_list) > 0: + sample.images = [sample.images[idx] for idx in image_tag_ids_list] + + # If there is only one image, but several image tags, we assume all the tags refer to the + # same image and duplicate the image: + if len(sample.images) == 1 and number_image_tags > 1: + sample.images = sample.images * number_image_tags + + number_of_images = len(sample.images) + # Fail if there are more image or video tags than image or videos: + error_msg = ( + f"Found {number_image_tags} image tags for {number_of_images} images. {sample.texts}") + assert number_image_tags <= number_of_images, error_msg + + # If there are less image of video tags than image or videos, prepend the tags to the first + # user message: + if number_image_tags < number_of_images: for turn in conversation: if turn["role"] == "user": - turn["content"] = f"{IMAGE_TOKEN}\n" + turn["content"] + tag_to_add = VIDEO_TOKEN if has_video else IMAGE_TOKEN + turn["content"] = tag_to_add*(number_of_images-number_image_tags) + "\n" + turn["content"] break input_ids, target = self.tokenizer.tokenize_conversation(conversation, True, False) + if has_image: + imgs = [] + num_tiles = [] + max_num_tiles = self.args.max_num_tiles + # We keep a buffer of 4 tokens for the question, + # the rest can be used for image tokens. + max_image_token_allowed = self.args.decoder_seq_length - len(input_ids) - 4 + # We start by extracting as many tiles per image as possible, and decrease the max + # number of tiles if there are too many image tokens. + while True: + imgs = [] + num_tiles = [] + for img in sample.images: + img_tiles = get_visual_transform( + img, self.img_h, self.img_w, self.args.use_tiling, max_num_tiles, + self.args.use_thumbnail, augment, self.args.vision_model_type) + imgs += img_tiles + num_tiles += [len(img_tiles)] + if max_num_tiles == 1: + break + if sum(num_tiles) * self.token_per_img_tile > max_image_token_allowed: + if max_num_tiles in self.num_tiles_degradation_map: + max_num_tiles = self.num_tiles_degradation_map[max_num_tiles] + else: + raise RuntimeError(( + f"Tried to decrease the number of tiles {max_num_tiles} but it's not ", + f"defined in the degradation map {self.num_tiles_degradation_map}")) + else: + break + elif has_video: + # We don't use tiling for videos to limit the number of tokens. + use_tiling=False + # Grab the selected frames of the video as a tensor with shape + # fhwc: (num_frames, num_channels, height, width). 
+ video_fchw = sample.images[0].permute(0, 1, 2, 3) + selected_frames = torch.linspace( + 0, video_fchw.shape[0] - 1, self.args.num_frames).long() + video_fchw = video_fchw[selected_frames] + imgs = [] + for video_chw in video_fchw: + to_pil = ToPILImage() + video_chw = to_pil(video_chw) + imgs += get_visual_transform( + video_chw, self.img_h, self.img_w, use_tiling, self.args.max_num_tiles, + self.args.use_thumbnail, augment, self.args.vision_model_type) + num_tiles = [len(imgs)] + else: + imgs = num_tiles = [] + if self.is_packing_enabled: input_ids, target = self._truncate_for_packing(input_ids, target, num_tiles) + # Some final checks with respect to the number of image tokens and images on the tokenized + # conversation. There can still be errors, for instance if a non-video sample happens to + # have our pre-defined video token, or if the packing truncation removed a necessary image + # tag. + number_image_token = np.sum(input_ids == self.img_token_id) + error_msg = ( + f"Found {number_image_token} image tokens for len({num_tiles}) = {len(num_tiles)} image tiles in {conversation}.") + assert number_image_token == len(num_tiles), error_msg + error_msg = ( + f"Found sum({num_tiles}) = {np.sum(num_tiles)} tiles for {len(imgs)} images in {conversation}.") + assert np.sum(num_tiles) == len(imgs), error_msg + return ImageTaskSample( __key__=sample.__key__, __restore_key__=sample.__restore_key__, @@ -407,8 +478,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, if isinstance(sample, MultiChoiceVQASample): cur_prompt = format_multichoice_question(sample.context, sample.choices) - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt cur_answer = format_multichoice_answer(sample.correct_choice_idx) elif isinstance(sample, VQASample): if 'docvqa' in sample.__key__: @@ -423,8 +494,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, cur_prompt = cur_prompt.format(sample.context) - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt if isinstance(sample.answers, list): answer_list = sample.answers @@ -505,11 +576,11 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_list = self.manual_prompts["DocPretraining"]["raw"] prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt - # Make sure there is no extra tag. - sample.text = sample.text.replace("", "") + # Make sure there is no extra IMAGE_TOKEN tag. 
+ sample.text = sample.text.replace(IMAGE_TOKEN, "") caption = sample.text.strip() @@ -526,8 +597,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, ref = sample.text region = sample.words_boxes - # Make sure there is no extra tag - ref = ref.replace("", "") + # Make sure there is no extra IMAGE_TOKEN tag + ref = ref.replace(IMAGE_TOKEN, "") if len(region) == 4: region = f"({region[0]},{region[1]}),({region[2]},{region[3]})" @@ -550,8 +621,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] cur_prompt = cur_prompt.format(prompt_content) - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt return sample, cur_prompt, answer @@ -559,8 +630,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, """Format bbox coordinates as text.""" assert len(bbox) == 4 or len(bbox) == 8 - # Make sure there is no extra tag - text = text.replace("", "") + # Make sure there is no extra IMAGE_TOKEN tag + text = text.replace(IMAGE_TOKEN, "") if len(bbox) == 4: label_str = f"{text}({bbox[0]},{bbox[1]}),({bbox[2]},{bbox[3]})" @@ -582,8 +653,8 @@ class TaskEncoder(DefaultTaskEncoder[OCRSample, OCRSample, ImageTaskBatchPacked, prompt_idx = np.random.randint(len(prompt_list)) cur_prompt = prompt_list[prompt_idx] - if "" not in cur_prompt: - cur_prompt = "\n" + cur_prompt + if IMAGE_TOKEN not in cur_prompt: + cur_prompt = IMAGE_TOKEN + "\n" + cur_prompt cur_answer = answer return sample, cur_prompt, cur_answer diff --git a/examples/multimodal/evaluate_ai2d.py b/examples/multimodal/evaluation/evaluate_ai2d.py old mode 100755 new mode 100644 similarity index 72% rename from examples/multimodal/evaluate_ai2d.py rename to examples/multimodal/evaluation/evaluate_ai2d.py index 2d5db67b67d076e6d43a815997175325d5bb25ea..39b866ae4a030c2911a197fef6a1be0e19b0cfc4 --- a/examples/multimodal/evaluate_ai2d.py +++ b/examples/multimodal/evaluation/evaluate_ai2d.py @@ -9,19 +9,25 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="AI2D") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append( - { - "question_id": res["sample_id"], - "answer": res["answer"], - "gt_answer": res["gt_answer"], - } - ) + sample_id = res["sample_id"] + + # Ignore possible duplicates. 
+ if sample_id in results: + continue + + results[sample_id] = { + "question_id": sample_id, + "answer": res["answer"], + "gt_answer": res["gt_answer"], + } + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_chartqa.py b/examples/multimodal/evaluation/evaluate_chartqa.py old mode 100755 new mode 100644 similarity index 77% rename from examples/multimodal/evaluate_chartqa.py rename to examples/multimodal/evaluation/evaluate_chartqa.py index e9238069d463a038c0e1b52e571e930c01b24b6a..53d4944f46e364b4cb68f8ef22dabccbf66ef3ca --- a/examples/multimodal/evaluate_chartqa.py +++ b/examples/multimodal/evaluation/evaluate_chartqa.py @@ -9,15 +9,22 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="ChartQA") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - res["question_id"] = res["sample_id"] + sample_id = res["sample_id"] - results.append(res) + # Ignore possible duplicates. + if sample_id in results: + continue + + res["question_id"] = sample_id + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_coco.py b/examples/multimodal/evaluation/evaluate_coco.py old mode 100755 new mode 100644 similarity index 77% rename from examples/multimodal/evaluate_coco.py rename to examples/multimodal/evaluation/evaluate_coco.py index a717090c9274781f7aabd0f5cfbc3b8b032fc689..8eeb367e8f3bb0c38bd3b0f44b8f54f0c7d32636 --- a/examples/multimodal/evaluate_coco.py +++ b/examples/multimodal/evaluation/evaluate_coco.py @@ -11,20 +11,28 @@ def convert_to_coco_format(input_path): """Convert input files to COCO compatible format.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="captioning") - captions = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) + sample_id = res["sample_id"] - question_id = res['sample_id'] - caption = res['caption'].rstrip('.').lower() + # Ignore possible duplicates. 
+ if sample_id in results: + continue - captions.append({"image_id": question_id, "caption": caption}) + caption = res["caption"].rstrip(".").lower() + results[sample_id] = { + "image_id": sample_id, + "caption": caption, + } + + results = list(results.values()) with open(output_file_path, "w") as output_file: - json.dump(captions, output_file, indent=4) + json.dump(results, output_file, indent=4) return output_file_path diff --git a/examples/multimodal/evaluate_mathvista.py b/examples/multimodal/evaluation/evaluate_mathvista.py old mode 100755 new mode 100644 similarity index 92% rename from examples/multimodal/evaluate_mathvista.py rename to examples/multimodal/evaluation/evaluate_mathvista.py index 3474c5f25e9e750ba4f77238b82ef8aaa4d7193b..a55f312f21986fb46644eb4e36979c342a2b7411 --- a/examples/multimodal/evaluate_mathvista.py +++ b/examples/multimodal/evaluation/evaluate_mathvista.py @@ -11,13 +11,21 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="MathVista") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append(res) + sample_id = res["sample_id"] + + # Remove possible duplicates. + if sample_id in results: + continue + + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_mmmu.py b/examples/multimodal/evaluation/evaluate_mmmu.py old mode 100755 new mode 100644 similarity index 91% rename from examples/multimodal/evaluate_mmmu.py rename to examples/multimodal/evaluation/evaluate_mmmu.py index 66118fa905d69df3a1d2a07e9baa6236dd11d823..798c42bfa76009653927aa4f1339411807fb905f --- a/examples/multimodal/evaluate_mmmu.py +++ b/examples/multimodal/evaluation/evaluate_mmmu.py @@ -2,9 +2,15 @@ import argparse import glob import json import os +import sys import re import subprocess +# Get the absolute path of the parent directory +parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +# Add the parent directory to sys.path +sys.path.insert(0, parent_dir) + from run_text_generation import get_output_path from config import EvaluationConfig @@ -48,6 +54,10 @@ def convert_to_mmmu_format(input_path): ) # MMMU eval script expects just a sample_id to prediction mapping. + # Skip possible duplicates. 
+ if sample_id in output: + continue + output[sample_id] = prediction with open(output_file_path, "w") as output_file: diff --git a/examples/multimodal/evaluate_ocrbench.py b/examples/multimodal/evaluation/evaluate_ocrbench.py old mode 100755 new mode 100644 similarity index 95% rename from examples/multimodal/evaluate_ocrbench.py rename to examples/multimodal/evaluation/evaluate_ocrbench.py index bc2b901065f53255a0cf4cabaa4893122d579566..b37473a67dbaeef121e734340a6161358ac0203b --- a/examples/multimodal/evaluate_ocrbench.py +++ b/examples/multimodal/evaluation/evaluate_ocrbench.py @@ -8,13 +8,21 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="OCRBench") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append(res) + sample_id = res["sample_id"] + + # Remove possible duplicates. + if sample_id in results: + continue + + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_textvqa.py b/examples/multimodal/evaluation/evaluate_textvqa.py old mode 100755 new mode 100644 similarity index 72% rename from examples/multimodal/evaluate_textvqa.py rename to examples/multimodal/evaluation/evaluate_textvqa.py index c9bba7134ba9f7e3a925dbcc529ec97da60fac92..af782bdf0318b664e37d9a106e36e66e5f5ad63c --- a/examples/multimodal/evaluate_textvqa.py +++ b/examples/multimodal/evaluation/evaluate_textvqa.py @@ -9,22 +9,25 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="TextVQA") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - results.append( - { - "question_id": res["sample_id"], - "answer": res["answer"], - "gt_answer": res["gt_answer"], - } - ) - - # Make order deterministic. - # results = sorted(results, key=lambda d: d["question_id"]) + sample_id = res["sample_id"] + + # Remove possible duplicates. 
+ if sample_id in results: + continue + + results[sample_id] = { + "question_id": sample_id, + "answer": res["answer"], + "gt_answer": res["gt_answer"], + } + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) diff --git a/examples/multimodal/evaluate_vqav2.py b/examples/multimodal/evaluation/evaluate_vqav2.py old mode 100755 new mode 100644 similarity index 88% rename from examples/multimodal/evaluate_vqav2.py rename to examples/multimodal/evaluation/evaluate_vqav2.py index 0b1b9209bef3bfb5bd644ed28d5464c951965654..7807d80723f5aa67c7fcadd695e78643fd52cb6d --- a/examples/multimodal/evaluate_vqav2.py +++ b/examples/multimodal/evaluation/evaluate_vqav2.py @@ -9,15 +9,22 @@ def merge_input_files(input_path): """Merge input files to a format compatible with the evaluator.""" input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2") - results = [] + results = dict() for input_file_path in input_file_paths: with open(input_file_path, "r") as input_file: for line in input_file: res = json.loads(line) - res["question_id"] = res["sample_id"] + sample_id = res["sample_id"] - results.append(res) + # Skip possible duplicates. + if sample_id in results: + continue + + res["question_id"] = sample_id + results[sample_id] = res + + results = list(results.values()) with open(output_file_path, "w") as output_file: json.dump(results, output_file) @@ -57,6 +64,9 @@ def compute_vqa_accuracy(result_file, task): assert len(gt) == 1, "expected exactly one groundtruth answer." gt = gt[0] + pred = pred.rstrip("%") + gt = gt.rstrip("%") + if is_number(pred) and is_number(gt): pred = float(pred) gt = float(gt) diff --git a/examples/multimodal/evaluation_datasets.py b/examples/multimodal/evaluation/evaluation_datasets.py old mode 100755 new mode 100644 similarity index 88% rename from examples/multimodal/evaluation_datasets.py rename to examples/multimodal/evaluation/evaluation_datasets.py index 97f9ba926f1435960444626c3af41496d1bea837..50a50d56871bddd9de59c3b1444186c749892db8 --- a/examples/multimodal/evaluation_datasets.py +++ b/examples/multimodal/evaluation/evaluation_datasets.py @@ -188,7 +188,7 @@ class MMMUDataset(torch.utils.data.Dataset): use_tiling, max_num_tiles, use_thumbnail, - single_image, + prompt_style, vision_model_type, ): import datasets @@ -246,7 +246,7 @@ class MMMUDataset(torch.utils.data.Dataset): self._use_tiling = use_tiling self._max_num_tiles = max_num_tiles self._use_thumbnail = use_thumbnail - self._single_image = single_image + self._prompt_style = prompt_style self._vision_model_type = vision_model_type def __len__(self): @@ -258,7 +258,7 @@ class MMMUDataset(torch.utils.data.Dataset): sample = self._dataset[idx] # Use the single image approach from the MMMU repo. - if self._single_image: + if self._prompt_style == "single_image": sample = process_single_sample(sample) sample = construct_prompt(sample, self._config) @@ -274,7 +274,69 @@ class MMMUDataset(torch.utils.data.Dataset): vision_model_type=self._vision_model_type, ) sample_num_tiles = [len(sample_imgs)] - else: + + prompt = sample["final_input_prompt"] + for i in range(8): + prompt = prompt.replace(f"<image {i+1}>", "") + sample["final_input_prompt"] = f"<image>\n{prompt}" + elif self._prompt_style == "vlmevalkit": + sample = construct_prompt(sample, self._config) + + if sample["question_type"] == "multiple-choice": + question = sample["question"] + + options = "" + for k, v in sample["index2ans"].items(): + options += f"{k}. {v}\n" + + final_prompt = f"{question}\n" + if "hint" in sample: + final_prompt += f"Hint: {sample['hint']}\n" + + if "task_instructions" in sample: + final_prompt += f"Task instructions: {sample['task_instructions']}\n" + + final_prompt += options + final_prompt += "Answer with the option's letter from the given choices directly." + + sample["final_input_prompt"] = final_prompt.rstrip() + else: + question = sample["question"] + final_prompt = f"{question}\n" + final_prompt += "Answer the question directly." + sample["final_input_prompt"] = final_prompt.rstrip() + + sample_imgs = [] + sample_num_tiles = [] + + img_indices = sorted(list(set(re.findall(r"<image (\d+)>", sample["final_input_prompt"])))) + # Use fewer tiles per image when the prompt references several images. + adjusted_max_num_tiles = max(1, self._max_num_tiles // max(len(img_indices), 1)) + + for img_idx in img_indices: + img_key = f"image_{img_idx}" + img_str = f"<image {img_idx}>" + + img = sample[img_key] + assert img is not None, f"{img_str} is in prompt but not in sample images" + + imgs = get_visual_transform( + img, + self._img_h, + self._img_w, + self._use_tiling, + adjusted_max_num_tiles, + self._use_thumbnail, + augment=False, + vision_model_type=self._vision_model_type, + ) # List of tiles. + + sample_imgs.extend(imgs) + sample_num_tiles.append(len(imgs)) + + sample["final_input_prompt"] = " ".join([f'<image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"] + elif self._prompt_style == "multi_image": sample = construct_prompt(sample, self._config) sample_imgs = [] @@ -315,6 +377,8 @@ class MMMUDataset(torch.utils.data.Dataset): assert ( f"<image {i}>" not in sample["final_input_prompt"] ), "prompt contains unhandled image tags" + else: + raise ValueError(f"unknown prompt style {self._prompt_style}") # MMMU specific metadata. metadata = {"question_type": sample["question_type"]} @@ -323,10 +387,6 @@ class MMMUDataset(torch.utils.data.Dataset): metadata["all_choices"] = sample["all_choices"] prompt = sample['final_input_prompt'] - if self._single_image: - for i in range(8): - prompt = prompt.replace(f"<image {i+1}>", "") - prompt = f"<image>\n{prompt}" tile_count = torch.tensor(sample_num_tiles, dtype=torch.int) @@ -780,8 +840,10 @@ def get_evaluation_dataset( vision_model_type, ) elif task == 'MMMU': - # Note: single_image=True uses only one image like in the MMMU repo example. - # single_image=False uses all images in the sample. + # Note: + # - prompt_style="single_image" uses only one image like in the MMMU repo example. + # - prompt_style="multi_image" uses multiple input images.
+ # - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499 dataset = MMMUDataset( input_image_path, num_samples_per_partition, @@ -792,7 +854,7 @@ def get_evaluation_dataset( use_tiling, max_num_tiles, use_thumbnail, - single_image=True, + prompt_style="single_image", vision_model_type=vision_model_type, ) elif task == "VideoMME": diff --git a/examples/multimodal/image_processing.py b/examples/multimodal/image_processing.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/layer_specs.py b/examples/multimodal/layer_specs.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/manual_prompts.json b/examples/multimodal/manual_prompts.json old mode 100755 new mode 100644 diff --git a/examples/multimodal/model.py b/examples/multimodal/model.py old mode 100755 new mode 100644 index 6db834e97a1d643955cf12905eb3ed84f0541a08..a28a428325b8db9c7c1268080979889935dcc396 --- a/examples/multimodal/model.py +++ b/examples/multimodal/model.py @@ -136,6 +136,20 @@ def model_provider( else: vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules + # Toggle --recompute* for the vision and language model separately. + if args.recompute_vision: + if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None: + vision_config.recompute_num_layers = vision_config.num_layers + else: + vision_config.recompute_granularity = None + vision_config.recompute_method = None + vision_config.recompute_num_layers = None + + vision_projection_config.recompute_granularity = None + vision_projection_config.recompute_method = None + vision_projection_config.recompute_num_layers = None + + tokenizer = get_tokenizer() image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN) diff --git a/examples/multimodal/model_converter/clip_converter.py b/examples/multimodal/model_converter/clip_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/model_converter/internvit_converter.py b/examples/multimodal/model_converter/internvit_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/model_converter/siglip_converter.py b/examples/multimodal/model_converter/siglip_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/model_converter/vision_model_tester.py b/examples/multimodal/model_converter/vision_model_tester.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/multimodal_args.py b/examples/multimodal/multimodal_args.py old mode 100755 new mode 100644 index 4b2be450afb33ce985ea052c55ad01abc5a3c548..eb56118e71613ea7fae6f81ff44f2969f26b4533 --- a/examples/multimodal/multimodal_args.py +++ b/examples/multimodal/multimodal_args.py @@ -49,7 +49,7 @@ def add_multimodal_extra_args(parser): group.add_argument( "--tokenizer-prompt-format", type=str, - choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0"], + choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"], required=True, help="Prompt format to use with the tokenizer.", ) @@ -71,5 +71,9 @@ def add_multimodal_extra_args(parser): group.add_argument( "--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing." 
) + group.add_argument( + "--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model" + ) + return parser diff --git a/examples/multimodal/nvlm/README.md b/examples/multimodal/nvlm/README.md old mode 100755 new mode 100644 index 7eddbb7efa9162edb02e118ce7bb5d95151ca944..bb576bb40355a02fbe2701fdaf85d6ee9a8058e3 --- a/examples/multimodal/nvlm/README.md +++ b/examples/multimodal/nvlm/README.md @@ -5,6 +5,13 @@ Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details. *NOTE: VLMs in Megatron are under active development and are expected to change.* +# Checkpoints + +NVLM 1.0 model weights are publicly available in HuggingFace and Megatron format. + +- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B) +- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore) + # Setup ## Docker image @@ -32,7 +39,7 @@ NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface Please download it and run the following command to convert it to Megatron format. ``` python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \ - --load-dir --save-dir --tokenizer-model \ + --load-dir --save-dir --tokenizer-model \ --saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1 ``` @@ -42,7 +49,7 @@ NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Q Please download it and run the following command to convert it to Megatron format. ``` python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \ - --load-dir --save-dir --tokenizer-model \ + --load-dir --save-dir --tokenizer-model \ --saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf ``` diff --git a/examples/multimodal/nvlm/internvit.py b/examples/multimodal/nvlm/internvit.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/nvlm_prompts.json b/examples/multimodal/nvlm/nvlm_prompts.json old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/pp_checkpoint_converter.py b/examples/multimodal/nvlm/pp_checkpoint_converter.py old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/pretrain_blend.yaml b/examples/multimodal/nvlm/pretrain_blend.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh b/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh old mode 100755 new mode 100644 index 320c7ad3f517a10db6556ca28b363059d3a04f6b..008a17ac43d936c79c5cd655e57509fe2abf8ec9 --- a/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh +++ b/examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh @@ -62,7 +62,7 @@ OPTIONS=" \ --exit-duration-in-mins 230 \ --disable-bias-linear \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model Qwen/Qwen2-72B-Instruct \ --tokenizer-prompt-format qwen2p0 \ --transformer-impl transformer_engine \ --normalization RMSNorm \ diff --git a/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh b/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh old mode 100755 new mode 100644 index c36cb05990cf36ddb2c952630eaf9d55afc76f28..00f94352774518b1c8dc478c98808a16a3398b75 --- a/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh +++ 
b/examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh @@ -75,7 +75,7 @@ OPTIONS=" \ --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \ --tokenizer-prompt-format nvlm-yi-34b \ --vocab-size 64000 \ --make-vocab-size-divisible-by 1 \ diff --git a/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh b/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh old mode 100755 new mode 100644 index 35cd90409a98948a9e7287a92431cac9614f4e95..e3b001c7aaee4544fde590ee41a8ae0d01497d36 --- a/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh +++ b/examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh @@ -97,7 +97,7 @@ do --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model \ + --tokenizer-model Qwen/Qwen2-72B-Instruct \ --tokenizer-prompt-format qwen2p0 \ --position-embedding-type rope \ --rotary-percent 1.0 \ diff --git a/examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh b/examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh new file mode 100644 index 0000000000000000000000000000000000000000..3b6221996c8294790b946f3c453d01eb71b692e7 --- /dev/null +++ b/examples/multimodal/nvlm/run_text_generation_qwen25_7b_siglip.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +export NCCL_IB_SL=1 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_APPLY_QK_LAYER_SCALING=0 +export TOKENIZERS_PARALLELISM="false" + +INPUT_IMAGE_PATH="placeholder" +GROUNDTRUTH_PATH="placeholder" + +while [[ $# -gt 0 ]]; do + case $1 in + -i|--input-image-path) + INPUT_IMAGE_PATH="$2" + shift + shift + ;; + -o|--output-path) + OUTPUT_PATH="$2" + shift + shift + ;; + -m|--model-path) + MODEL_PATH="$2" + shift + shift + ;; + -t|--task) + TASK="$2" + shift + shift + ;; + -g|--gt-path) + GROUNDTRUTH_PATH="$2" + shift + shift + ;; + -*|--*) + echo "Invalid option $1" + exit 1 + ;; + esac +done + +# Please modify these as needed. 
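+# NUM_PARTITIONS, START and END control optional sharding of the generation workload: the loop
+# below runs once per PARTITION_ID in {START..END} and forwards --num-partitions/--partition-id
+# to run_text_generation.py (with the 0/0/0 defaults the loop runs a single time).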
+NUM_PARTITIONS=0 +START=0 +END=0 + + +SEQ_LEN=256 +DECODER_SEQ_LEN=8192 +EXTRA_ARGS=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail" + +for PARTITION_ID in $( eval echo {$START..$END} ) +do + torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \ + --attention-softmax-in-fp32 \ + --transformer-impl transformer_engine \ + --use-te \ + --use-checkpoint-args \ + --normalization RMSNorm \ + --norm-epsilon 1e-06 \ + --language-model-type=qwen2.5_7B \ + --untie-embeddings-and-output-weights \ + --disable-bias-linear \ + --position-embedding-type rope \ + --rotary-percent 1.0 \ + --rotary-base 1000000 \ + --swiglu \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tensor-model-parallel-size 4 \ + --pipeline-model-parallel-size 1 \ + --group-query-attention \ + --num-query-groups 4 \ + --num-layers 28 \ + --hidden-size 3584 \ + --ffn-hidden-size 18944 \ + --add-qkv-bias \ + --num-attention-heads 28 \ + --max-position-embeddings 32768 \ + --no-masked-softmax-fusion \ + --load ${MODEL_PATH} \ + --tokenizer-type MultimodalTokenizer \ + --tokenizer-model Qwen/Qwen2.5-7B-Instruct \ + --tokenizer-prompt-format qwen2p5 \ + --bf16 \ + --micro-batch-size 1 \ + --seq-length ${SEQ_LEN} \ + --decoder-seq-length ${DECODER_SEQ_LEN} \ + --out-seq-length 128 \ + --temperature 1.0 \ + --img-h 448 \ + --img-w 448 \ + --patch-dim 14 \ + --seed 153 \ + --top_k 1 \ + --no-load-rng \ + --no-load-optim \ + --input-image-path ${INPUT_IMAGE_PATH} \ + --num-partitions ${NUM_PARTITIONS} \ + --partition-id ${PARTITION_ID} \ + --output-path ${OUTPUT_PATH} \ + --gt-path ${GROUNDTRUTH_PATH} \ + --task ${TASK} \ + ${EXTRA_ARGS} \ + --special-tokens "" "" "" \ + --vision-model-type siglip \ + --ckpt-format torch +done diff --git a/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh b/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh old mode 100755 new mode 100644 index 0437e4c16d68378a39b24aa9e7d08cc05b815e5b..341f4e4b0a79b3f212996672c0aee0e1c85f4ef3 --- a/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh +++ b/examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh @@ -95,7 +95,7 @@ do --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model \ + --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \ --tokenizer-prompt-format nvlm-yi-34b \ --vocab-size 64000 \ --make-vocab-size-divisible-by 1 \ @@ -135,6 +135,6 @@ do --gt-path ${GROUNDTRUTH_PATH} \ ${EXTRA_ARGS} \ --task ${TASK} \ - --image-tag-type nlvm \ + --image-tag-type nvlm \ --ckpt-format torch done diff --git a/examples/multimodal/nvlm/sft_34b_internvit.sh b/examples/multimodal/nvlm/sft_34b_internvit.sh old mode 100755 new mode 100644 index 3d585d8d37233a2322ba169f9b6bd86006d35c73..0dff9461dae1f38255093afc893ad1110bc5ad6b --- a/examples/multimodal/nvlm/sft_34b_internvit.sh +++ b/examples/multimodal/nvlm/sft_34b_internvit.sh @@ -80,7 +80,7 @@ OPTIONS=" \ --decoder-seq-length ${DECODER_SEQ_LEN} \ --max-position-embeddings ${MAX_POS_EMBED} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \ --tokenizer-prompt-format nvlm-yi-34b \ --vocab-size 64000 \ --make-vocab-size-divisible-by 1 \ diff --git a/examples/multimodal/nvlm/sft_blend.yaml b/examples/multimodal/nvlm/sft_blend.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh 
b/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh old mode 100755 new mode 100644 index adb1d1b14c34e7e2774ad8a60cdd6ca5e47f103f..3b472259b94cb8ebe1e29a4695f594247af113a9 --- a/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh +++ b/examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh @@ -67,7 +67,7 @@ OPTIONS=" \ --exit-duration-in-mins 230 \ --disable-bias-linear \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/ \ + --tokenizer-model Qwen/Qwen2-72B-Instruct \ --tokenizer-prompt-format qwen2p0 \ --transformer-impl transformer_engine \ --normalization RMSNorm \ diff --git a/examples/multimodal/pretrain_dataset.yaml b/examples/multimodal/pretrain_dataset.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/pretrain_mistral_clip.sh b/examples/multimodal/pretrain_mistral_clip.sh old mode 100755 new mode 100644 index ea1f741aed91493f192e82f78279497c8cf4d535..90b0053d19fd3d556d336093afc3414425eb8664 --- a/examples/multimodal/pretrain_mistral_clip.sh +++ b/examples/multimodal/pretrain_mistral_clip.sh @@ -24,11 +24,6 @@ if [[ -z $LOAD_NAME ]]; then exit 1 fi -if [[ -z $TOKENIZER_MODEL ]]; then - echo "Please set TOKENIZER_MODEL for tokenizer model name." - exit 1 -fi - CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml" @@ -93,7 +88,7 @@ OPTIONS=" \ --eval-iters 10 \ --eval-interval 1000 \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \ --tokenizer-prompt-format mistral \ --data-path ${DATA_TRAIN} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py old mode 100755 new mode 100644 index f4bb5025ff780d0599a9be6fc6a82221f700cbfd..cbde6680cc26bb9b3b7d4592d466906452064e6d --- a/examples/multimodal/run_text_generation.py +++ b/examples/multimodal/run_text_generation.py @@ -14,11 +14,13 @@ sys.path.append( import torch import yaml from config import EvaluationConfig -from evaluation_datasets import get_evaluation_dataset +from evaluation.evaluation_datasets import get_evaluation_dataset from model import model_provider from multimodal_args import add_multimodal_extra_args from megatron.core import parallel_state +from megatron.core.enums import ModelType +from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings from megatron.inference.text_generation.api import generate_and_post_process from megatron.inference.text_generation.forward_step import ForwardStep @@ -36,7 +38,7 @@ def add_text_generation_args(parser): group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.') group.add_argument("--top_k", type=int, default=0, help='Top k sampling.') group.add_argument( - "--out-seq-length", type=int, default=1024, help='Length of the output generated text.' + "--out-seq-length", type=int, default=128, help='Length of the output generated text.' 
) group.add_argument("--output-path", type=str, help='Output file path') group.add_argument('--input-image-path', type=str, help="Input image directory") @@ -206,8 +208,8 @@ def generate_samples(model, config: EvaluationConfig, print_output): if config.task == "VideoMME": output["questions"][0][output_name] = generated else: - output[output_name] = generated output["prompt"] = prompt + output[output_name] = generated if config.task == "captioning": output["ground_truth"] = answers @@ -354,7 +356,7 @@ class VLMForwardStep(ForwardStep): ) def __call__(self, tokens, position_ids, attention_mask): - num_image_tokens = (tokens == self.model.image_token_index).sum().item() + num_image_tokens = (tokens == self.model.module.image_token_index).sum().item() num_tokens = tokens.size(1) recv_buffer_seq_length = None if num_image_tokens > 0: @@ -406,7 +408,7 @@ def get_conversation(task, question): {"role": "system", "content": "Answer the questions."}, { "role": "user", - "content": "\nProvide a one-sentence caption for provided image.", + "content": f"{IMAGE_TOKEN}\nProvide a one-sentence caption for provided image.", }, ] elif task in ("TextVQA", "VQAv2", "ChartQA"): @@ -414,13 +416,13 @@ def get_conversation(task, question): {"role": "system", "content": "Answer the questions."}, { "role": "user", - "content": f"\n{question}\nAnswer the question using a single word or phrase.", + "content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase.", }, ] elif task in ("OCRBench", "MathVista", "AI2D"): conversation = [ {"role": "system", "content": "Answer the questions."}, - {"role": "user", "content": f"\n{question}"}, + {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"}, ] elif task == "MMMU": conversation = [ @@ -441,7 +443,7 @@ def get_conversation(task, question): conversation = [ {"role": "system", "content": "Answer the questions."}, - {"role": "user", "content": f"\n{question}"}, + {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"}, ] return conversation @@ -464,11 +466,13 @@ def get_prompt_and_generated(prompt_and_generation, prompt_format): prompt = splitted[0] generated = splitted[1] generated = generated.split("<|im_end|>")[0] - elif prompt_format in ("nvlm-yi-34b", "qwen2p0"): + elif prompt_format in ("nvlm-yi-34b", "qwen2p0", "qwen2p5"): splitted = prompt_and_generation.split("<|im_start|>assistant\n") prompt = splitted[0] generated = splitted[1] generated = generated.split("<|im_end|>")[0] + else: + raise ValueError(f"Prompt format {prompt_format} is not supported.") # Remove possible garbage. generated = generated.strip() @@ -489,11 +493,11 @@ def main(): args = get_args() - def wrapped_model_provider(pre_process, post_process): - return model_provider(pre_process, post_process, parallel_output=False) + def wrapped_model_provider(pre_process, post_process, add_encoder, add_decoder): + return model_provider(pre_process, post_process, add_encoder, add_decoder, parallel_output=False) # Set up model and load checkpoint. 
- model = get_model(wrapped_model_provider, wrap_with_ddp=False) + model = get_model(wrapped_model_provider, model_type=ModelType.encoder_and_decoder, wrap_with_ddp=False) if args.load is not None: _ = load_checkpoint(model, None, None) diff --git a/examples/multimodal/sft_dataset.yaml b/examples/multimodal/sft_dataset.yaml old mode 100755 new mode 100644 diff --git a/examples/multimodal/sft_mistral_clip.sh b/examples/multimodal/sft_mistral_clip.sh old mode 100755 new mode 100644 index 8a083cc1f2e81e3c8340957d1d9fd1636bf117f3..94ff208eb4df632b597daa49bc3a1fbff62fe8d1 --- a/examples/multimodal/sft_mistral_clip.sh +++ b/examples/multimodal/sft_mistral_clip.sh @@ -29,11 +29,6 @@ if [[ -z $LOAD_ITER ]]; then exit 1 fi -if [[ -z $TOKENIZER_MODEL ]]; then - echo "Please set TOKENIZER_MODEL for tokenizer model name." - exit 1 -fi - CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints" DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml" @@ -98,7 +93,7 @@ OPTIONS=" \ --eval-iters 10 \ --eval-interval 500 \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \ + --tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \ --tokenizer-prompt-format mistral \ --data-path ${DATA_TRAIN} \ --prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \ diff --git a/examples/multimodal/text_generation_mistral_clip.sh b/examples/multimodal/text_generation_mistral_clip.sh old mode 100755 new mode 100644 index ca98ff277a3729646a63f6de0958f323ed8e2276..c1ef7bcee897812fef976531e2a5bba961141b42 --- a/examples/multimodal/text_generation_mistral_clip.sh +++ b/examples/multimodal/text_generation_mistral_clip.sh @@ -4,12 +4,13 @@ export NCCL_IB_SL=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export NVTE_APPLY_QK_LAYER_SCALING=0 +INPUT_IMAGE_PATH="placeholder" GROUNDTRUTH_PATH="placeholder" NUM_FRAMES=1 while [[ $# -gt 0 ]]; do case $1 in - --input-image-path) + -i|--input-image-path) INPUT_IMAGE_PATH="$2" shift shift @@ -19,11 +20,6 @@ while [[ $# -gt 0 ]]; do shift shift ;; - -g|--groundtruth-path) - GROUNDTRUTH_PATH="$2" - shift - shift - ;; -o|--output-path) OUTPUT_PATH="$2" shift @@ -34,12 +30,7 @@ while [[ $# -gt 0 ]]; do shift shift ;; - -t|--tokenizer-path) - TOKENIZER_PATH="$2" - shift - shift - ;; - --task) + -t|--task) TASK="$2" shift shift @@ -92,7 +83,7 @@ do --no-masked-softmax-fusion \ --load ${MODEL_PATH} \ --tokenizer-type MultimodalTokenizer \ - --tokenizer-model ${TOKENIZER_PATH} \ + --tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \ --tokenizer-prompt-format mistral \ --bf16 \ --micro-batch-size 1 \ diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py old mode 100755 new mode 100644 index 5ff2121b3d04c1a0f4f0733aac6526a65956c66d..1dc68d1173bfee00dd77d971c2b150b024acf421 --- a/examples/multimodal/train.py +++ b/examples/multimodal/train.py @@ -48,7 +48,7 @@ def get_batch(data_iterator): pp_size = get_pipeline_model_parallel_world_size() if not is_first_or_last_stage(pp_size, args.encoder_pipeline_model_parallel_size): # Note these are all set to None above. - return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles + return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles, packed_seq_params # Broadcast data. 
torch.cuda.nvtx.range_push("get_data") @@ -66,7 +66,7 @@ def get_batch(data_iterator): cu_lengths = tensor_parallel.broadcast_data(["cu_lengths"], data, torch.int32)["cu_lengths"] max_lengths = tensor_parallel.broadcast_data(["max_lengths"], data, torch.int32)["max_lengths"] - # Dummy image, no image. + # No image input (text-only sample) if the dataloader produced a dummy image. if imgs.shape == torch.Size([1, 1]): # FIXME: text-only data can cause a hang if the vision model is own its own pipeline rank and --freeze-ViT is enabled. imgs = torch.tensor([], dtype=torch.float32, device=data_text.device) diff --git a/examples/retro/README.md b/examples/retro/README.md old mode 100755 new mode 100644 diff --git a/examples/retro/preprocess_data.sh b/examples/retro/preprocess_data.sh old mode 100755 new mode 100644 diff --git a/examples/retro/train_retro_2b_distributed.sh b/examples/retro/train_retro_2b_distributed.sh old mode 100755 new mode 100644 diff --git a/examples/run_simple_mcore_train_loop.py b/examples/run_simple_mcore_train_loop.py old mode 100755 new mode 100644 diff --git a/examples/t5/README.md b/examples/t5/README.md old mode 100755 new mode 100644 diff --git a/examples/t5/t5_mcore_train_curve.png b/examples/t5/t5_mcore_train_curve.png old mode 100755 new mode 100644 diff --git a/examples/t5/train_t5_220m_distributed.sh b/examples/t5/train_t5_220m_distributed.sh old mode 100755 new mode 100644 diff --git a/hostfile_gptmoe b/hostfile_gptmoe new file mode 100644 index 0000000000000000000000000000000000000000..7dd871e4a7f81126a9ceef3fab50472a07b82f91 --- /dev/null +++ b/hostfile_gptmoe @@ -0,0 +1,16 @@ +node007 slots=8 +node008 slots=8 +node009 slots=8 +node010 slots=8 +node011 slots=8 +node012 slots=8 +node013 slots=8 +node014 slots=8 +node015 slots=8 +node016 slots=8 +node017 slots=8 +node018 slots=8 +node019 slots=8 +node020 slots=8 +node021 slots=8 +node022 slots=8 diff --git a/hostfile_mixtral8x7B b/hostfile_mixtral8x7B new file mode 100644 index 0000000000000000000000000000000000000000..582f84799c0e467438b171420cf9f68a5d726720 --- /dev/null +++ b/hostfile_mixtral8x7B @@ -0,0 +1,2 @@ +node007 slots=8 +node008 slots=8 diff --git a/images/model_table.png b/images/model_table.png old mode 100755 new mode 100644 diff --git a/images/strong_scaling.png b/images/strong_scaling.png old mode 100755 new mode 100644 diff --git a/images/weak_scaling.png b/images/weak_scaling.png old mode 100755 new mode 100644 diff --git a/kill.sh b/kill.sh new file mode 100644 index 0000000000000000000000000000000000000000..688f93a6ec728bbc159d533e050cc5e65d0d00bf --- /dev/null +++ b/kill.sh @@ -0,0 +1,3 @@ +ps -ef | grep python3 | grep -v grep | awk '{print "kill "$2}' >& .tmp +bash .tmp + diff --git a/megatron/core/QuickStart.md b/megatron/core/QuickStart.md old mode 100755 new mode 100644 diff --git a/megatron/core/README.md b/megatron/core/README.md old mode 100755 new mode 100644 diff --git a/megatron/core/README_STRAGGLER.md b/megatron/core/README_STRAGGLER.md old mode 100755 new mode 100644 diff --git a/megatron/core/__init__.py b/megatron/core/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/config_logger.py b/megatron/core/config_logger.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/Makefile b/megatron/core/datasets/Makefile old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/__init__.py b/megatron/core/datasets/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/bert_dataset.py 
b/megatron/core/datasets/bert_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/blended_dataset.py b/megatron/core/datasets/blended_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/blended_megatron_dataset_config.py b/megatron/core/datasets/blended_megatron_dataset_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/helpers.cpp b/megatron/core/datasets/helpers.cpp old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/helpers.py b/megatron/core/datasets/helpers.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/helpers_cpp.cpython-310-x86_64-linux-gnu.so_bak b/megatron/core/datasets/helpers_cpp.cpython-310-x86_64-linux-gnu.so_bak new file mode 100755 index 0000000000000000000000000000000000000000..c398ffc62ef7822e975f481e70ff69ece755f775 Binary files /dev/null and b/megatron/core/datasets/helpers_cpp.cpython-310-x86_64-linux-gnu.so_bak differ diff --git a/megatron/core/datasets/indexed_dataset.py b/megatron/core/datasets/indexed_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/masked_dataset.py b/megatron/core/datasets/masked_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/megatron_dataset.py b/megatron/core/datasets/megatron_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/megatron_tokenizer.py b/megatron/core/datasets/megatron_tokenizer.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/multimodal_dataset.py b/megatron/core/datasets/multimodal_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/readme.md b/megatron/core/datasets/readme.md old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/__init__.py b/megatron/core/datasets/retro/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/__init__.py b/megatron/core/datasets/retro/config/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/bert_embedders.py b/megatron/core/datasets/retro/config/bert_embedders.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/config.py b/megatron/core/datasets/retro/config/config.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/gpt_chunk_datasets.py b/megatron/core/datasets/retro/config/gpt_chunk_datasets.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/config/tokenizers.py b/megatron/core/datasets/retro/config/tokenizers.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/__init__.py b/megatron/core/datasets/retro/db/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/build.py b/megatron/core/datasets/retro/db/build.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/dataset.py b/megatron/core/datasets/retro/db/dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/db/utils.py b/megatron/core/datasets/retro/db/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/external_libs.py b/megatron/core/datasets/retro/external_libs.py old 
mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/__init__.py b/megatron/core/datasets/retro/index/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/build.py b/megatron/core/datasets/retro/index/build.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/factory.py b/megatron/core/datasets/retro/index/factory.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/index.py b/megatron/core/datasets/retro/index/index.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/indexes/__init__.py b/megatron/core/datasets/retro/index/indexes/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/indexes/faiss_base.py b/megatron/core/datasets/retro/index/indexes/faiss_base.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/indexes/faiss_par_add.py b/megatron/core/datasets/retro/index/indexes/faiss_par_add.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/utils.py b/megatron/core/datasets/retro/index/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/index/validate.py b/megatron/core/datasets/retro/index/validate.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/__init__.py b/megatron/core/datasets/retro/query/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/gpt_chunk_dataset.py b/megatron/core/datasets/retro/query/gpt_chunk_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py b/megatron/core/datasets/retro/query/multi_split_gpt_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/query.py b/megatron/core/datasets/retro/query/query.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/retro_dataset.py b/megatron/core/datasets/retro/query/retro_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/query/utils.py b/megatron/core/datasets/retro/query/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/retro/utils.py b/megatron/core/datasets/retro/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/t5_dataset.py b/megatron/core/datasets/t5_dataset.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/utils.py b/megatron/core/datasets/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/datasets/utils_s3.py b/megatron/core/datasets/utils_s3.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/__init__.py b/megatron/core/dist_checkpointing/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/core.py b/megatron/core/dist_checkpointing/core.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/dict_utils.py b/megatron/core/dist_checkpointing/dict_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/exchange_utils.py b/megatron/core/dist_checkpointing/exchange_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/mapping.py b/megatron/core/dist_checkpointing/mapping.py old mode 100755 new mode 100644 index 2ddfcf3b315a912ef523dc118933644ccbdc97f6..d376c6374baf7053b549f0686713e0e1c672d7b2 --- a/megatron/core/dist_checkpointing/mapping.py +++ 
b/megatron/core/dist_checkpointing/mapping.py @@ -119,7 +119,8 @@ class ShardedTensor(ShardedBase): self.init_data(device='meta') if self.data.shape != real_data.shape: raise CheckpointingException( - f'Data shape doesnt match expected {self.data.shape} for {self}' + f'Data shape {real_data.shape} doesnt match' + f' expected {self.data.shape} for {self}' ) finally: self.data = real_data diff --git a/megatron/core/dist_checkpointing/optimizer.py b/megatron/core/dist_checkpointing/optimizer.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/serialization.py b/megatron/core/dist_checkpointing/serialization.py old mode 100755 new mode 100644 index 3be5777e747b66742a67a9cc6279961a4b516de5..600dd87e5438620a9214d97f2c688f056e5c4aef --- a/megatron/core/dist_checkpointing/serialization.py +++ b/megatron/core/dist_checkpointing/serialization.py @@ -104,8 +104,6 @@ def load( checkpoint_dir = Path(checkpoint_dir) common_state_dict = common_strategy.load_common(checkpoint_dir) - if not sharded_state_dict: - return common_state_dict sharded_state_dict, nonpersistent_state_dict, sh_ten_factories = load_preprocess( sharded_state_dict diff --git a/megatron/core/dist_checkpointing/state_dict_transformation.py b/megatron/core/dist_checkpointing/state_dict_transformation.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/__init__.py b/megatron/core/dist_checkpointing/strategies/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/async_utils.py b/megatron/core/dist_checkpointing/strategies/async_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/base.py b/megatron/core/dist_checkpointing/strategies/base.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/common.py b/megatron/core/dist_checkpointing/strategies/common.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/filesystem_async.py b/megatron/core/dist_checkpointing/strategies/filesystem_async.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/fully_parallel.py b/megatron/core/dist_checkpointing/strategies/fully_parallel.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/resharding.py b/megatron/core/dist_checkpointing/strategies/resharding.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/state_dict_saver.py b/megatron/core/dist_checkpointing/strategies/state_dict_saver.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/tensorstore.py b/megatron/core/dist_checkpointing/strategies/tensorstore.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/core/dist_checkpointing/strategies/torch.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/two_stage.py b/megatron/core/dist_checkpointing/strategies/two_stage.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/strategies/zarr.py b/megatron/core/dist_checkpointing/strategies/zarr.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/utils.py b/megatron/core/dist_checkpointing/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/dist_checkpointing/validation.py b/megatron/core/dist_checkpointing/validation.py old mode 100755 new mode 100644 
index 48e023dc3945ff05cd9d009fbe296ce328d0937f..546ec3547f9144bcfaa6c3dcc88e3e7011c10c05 --- a/megatron/core/dist_checkpointing/validation.py +++ b/megatron/core/dist_checkpointing/validation.py @@ -412,7 +412,7 @@ def validate_sharding_integrity( CheckpointingException for invalid access pattern """ - if common_state_dict: + if common_state_dict is not None: _validate_common_state_dict(common_state_dict) if torch.distributed.get_rank() != 0: @@ -461,10 +461,15 @@ def _validate_sharding_for_key(rank_sharding: List[Tuple[int, ShardedTensor]]): lambda x: x[1], _validate_sharding_for_key_flattened, ) - else: - if not torch.all(shard_access_cnt == 1): - logger.error(f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}') - raise CheckpointingException(f'Invalid access pattern for {rank_sharding[0][1]}') + # For each shard with at least 1 flattened tensor in it, the above + # `_validate_sharding_for_key_flattened` ensure a correct consistent pattern + # The only thing that can go wrong at this point is that some shard don't have + # *any* representatives which will be checked later by comparing `shard_access_cnt == 1` + shard_access_cnt = torch.minimum(shard_access_cnt, torch.tensor([1])) + if not torch.all(shard_access_cnt == 1): + raise CheckpointingException( + f'Invalid access pattern for {rank_sharding[0][1]}: {shard_access_cnt}' + ) def _compute_shards_access(rank_sharding): @@ -489,16 +494,10 @@ def _validate_sharding_for_key_flattened(tensors_by_shard): all_slices.append((sharding.flattened_range.start, sharding.flattened_range.stop)) starts, stops = map(np.asarray, zip(*sorted(all_slices))) - if ( - starts[0] != 0 - or stops[-1] != np.product(local_shape) - or not np.all(starts[1:] == stops[:-1]) - ): - logger.error( - f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. Ranges: {(starts, stops)}' - ) + expected_size = np.product(local_shape) + if starts[0] != 0 or stops[-1] != expected_size or not np.all(starts[1:] == stops[:-1]): raise CheckpointingException( - f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]}. Ranges: {(starts, stops)}' + f'Flattened ranges dont cover the whole shard {tensors_by_shard[0]} of size {expected_size}. Ranges: {(starts, stops)}' ) diff --git a/megatron/core/distributed/README.md b/megatron/core/distributed/README.md old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/__init__.py b/megatron/core/distributed/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/data_parallel_base.py b/megatron/core/distributed/data_parallel_base.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/distributed_data_parallel.py b/megatron/core/distributed/distributed_data_parallel.py old mode 100755 new mode 100644 index 3a23426eca03f97c8ac88b131e3d1f50cde86e62..ea08db6c127e6413a365ea8b51139ac698dd8c27 --- a/megatron/core/distributed/distributed_data_parallel.py +++ b/megatron/core/distributed/distributed_data_parallel.py @@ -7,6 +7,7 @@ import torch from .. 
import parallel_state from ..config_logger import has_config_logger_enabled, log_config_to_disk +from ..transformer.cuda_graphs import is_graph_capturing from ..transformer.transformer_config import TransformerConfig from ..utils import is_float8tensor, log_single_rank from .data_parallel_base import _BaseDataParallel @@ -151,12 +152,20 @@ class DistributedDataParallel(_BaseDataParallel): with_context_parallel=True ) if self.ddp_config.average_in_collective: - # Collective is averaging gradients in collective with data_parallel_group. - assert ( - gradient_scaling_factor - / parallel_state.get_data_parallel_world_size(with_context_parallel=True) - == target_gradient_scaling_factor - ) + if self.ddp_config.num_distributed_optimizer_instances == 1: + # Collective is averaging gradients in collective with data_parallel_group. + assert ( + gradient_scaling_factor + / torch.distributed.get_world_size(group=data_parallel_group) + == target_gradient_scaling_factor + ) + else: + # For non-expert parameters, gradient_scaling_factor is 1. + # For expert parameters, gradient_scaling_factor is 1/ep_size. + assert (gradient_scaling_factor == 1) or ( + gradient_scaling_factor + == (1.0 / parallel_state.get_expert_model_parallel_world_size()) + ) else: assert gradient_scaling_factor == target_gradient_scaling_factor @@ -297,9 +306,10 @@ class DistributedDataParallel(_BaseDataParallel): self._make_forward_pre_hook() ) - def disable_forward_pre_hook(self): + def disable_forward_pre_hook(self, param_sync: bool = True): """ Disable forward pre-hooks needed for param all-gather overlap with forward compute. + Skip synchronous param all-gather if `param_sync` is False. """ assert self.use_forward_hook # De-register forward pre-hook for all sub-modules. @@ -310,7 +320,8 @@ class DistributedDataParallel(_BaseDataParallel): assert len(self.remove_forward_pre_hook_handles) == 0 # Force synchronize parameters. - self.start_param_sync(force_sync=True) + if param_sync: + self.start_param_sync(force_sync=True) def _make_forward_pre_hook(self): """ @@ -323,6 +334,9 @@ class DistributedDataParallel(_BaseDataParallel): self.use_forward_hook ), "Should use pre-hook only when overlap_param_gather is True" + if is_graph_capturing(): + return + # Make sure all parameters in this module have been all-gathered as necessary. 
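The hunk above adds is_graph_capturing() guards so that the param-gather forward pre-hook and the grad-ready hook become no-ops while a CUDA graph is being captured, presumably to keep the hooks' bookkeeping and communication launches out of the captured graph. Below is a minimal, self-contained sketch of that guard pattern in plain PyTorch; toy_is_graph_capturing and the sync_params callback are illustrative stand-ins, not Megatron APIs.

import torch

_CAPTURING = False  # stand-in for megatron.core.transformer.cuda_graphs.is_graph_capturing()

def toy_is_graph_capturing() -> bool:
    return _CAPTURING

def make_forward_pre_hook(sync_params):
    # Mirrors the pattern above: skip the (hypothetical) param sync while capturing.
    def hook(module, args):
        if toy_is_graph_capturing():
            return
        sync_params(module)
    return hook

if __name__ == "__main__":
    layer = torch.nn.Linear(8, 8)
    sync_calls = []
    layer.register_forward_pre_hook(make_forward_pre_hook(lambda m: sync_calls.append(m)))
    layer(torch.randn(2, 8))   # hook fires and "syncs" params
    _CAPTURING = True          # pretend a CUDA graph capture is in progress
    layer(torch.randn(2, 8))   # hook returns early, no sync
    print(len(sync_calls))     # -> 1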
for param in module.parameters(recurse=False): # Skip parameters without an associated buffer (such parameters have a @@ -353,6 +367,9 @@ class DistributedDataParallel(_BaseDataParallel): """ def hook(*unused): + if is_graph_capturing(): + return + if param in self.param_to_bucket_group: assert param.requires_grad if self.ddp_config.overlap_grad_reduce: diff --git a/megatron/core/distributed/distributed_data_parallel_config.py b/megatron/core/distributed/distributed_data_parallel_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py old mode 100755 new mode 100644 diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py old mode 100755 new mode 100644 index 00c8fdd69db447a71518792d10c926c45a17c795..5095a7c7f3b44f8a7040d3263873aebd2a76b681 --- a/megatron/core/distributed/param_and_grad_buffer.py +++ b/megatron/core/distributed/param_and_grad_buffer.py @@ -270,13 +270,12 @@ class _ParamAndGradBucketGroup: if self.ddp_config.average_in_collective: reduce_op = torch.distributed.ReduceOp.AVG - # Stream synchronization logic of the CUDA streams that is - # implemented below for the gradient reduction within and across - # distributed optimizer instances. + # We use the following stream synchronization for the gradient reduction + # within and across DistOpt instances. - # Compute Stream - -------------Gradient Compute------------------- - # Comm. Stream - ------(wait for nccl)-----(wait for nccl)------- - # NCCL Stream - -------RS------ -------AR------ + # Compute Stream: -------------Gradient compute------------------- + # Comm. Stream: ------(wait for NCCL)-----(wait for NCCL)------- + # NCCL Stream: -------RS------ -------AR------ # Use async communications only when overlap_grad_reduce is True. async_op = ( @@ -287,13 +286,13 @@ class _ParamAndGradBucketGroup: self.ddp_config.num_distributed_optimizer_instances > 1 and self.ddp_config.overlap_grad_reduce ): - # Assign a communication stream if we use partial DP DistOpt and we - # need to overlap communication + # Assign a communication stream if we have multiple DistOpt instances and we + # need to overlap communication. stream_context = torch.cuda.stream(self.communication_stream) # The RS/AR communication stream needs to wait for the default stream # to complete its gradient computation before launching the next - # gradient reduction collective + # gradient reduction collective. self.communication_stream.wait_stream(torch.cuda.default_stream()) else: stream_context = nullcontext() @@ -314,24 +313,21 @@ class _ParamAndGradBucketGroup: local_data_view, bucket.grad_data, op=reduce_op, - group=self.intra_distributed_optimizer_instance_group, + group=communication_group, async_op=async_op, ) else: torch.distributed.all_reduce( - bucket.grad_data, - op=reduce_op, - group=self.data_parallel_group, - async_op=async_op, + bucket.grad_data, op=reduce_op, group=communication_group, async_op=async_op ) - # When enabling partial DP domain DistOpt, we need to All-Reduce across all partial domains + # With multiple DistOpt instances, we need to all-reduce across instances. if ( self.ddp_config.use_distributed_optimizer and self.ddp_config.num_distributed_optimizer_instances > 1 ): - # Create a new coalescing facility for the inter partial DP-AllReduce here + # Create a new coalescing manager for the inter-instance all-reduce. 
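As the comments above describe, with multiple distributed-optimizer instances the intra-instance reduce-scatter is launched on a dedicated communication stream that first waits on the default (compute) stream, and the consumer later waits on the communication stream before touching the reduced gradients (as finish_grad_sync does later in this hunk). The sketch below shows only that wait_stream handshake on a single GPU; the collective is replaced by an in-place op, and the names are illustrative rather than Megatron's.

import torch

def reduce_on_comm_stream(grad_buffer: torch.Tensor, comm_stream: torch.cuda.Stream) -> None:
    # 1) The communication stream must not start before the default stream has
    #    finished producing the gradients.
    comm_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(comm_stream):
        # 2) Stand-in for the reduce-scatter / all-reduce launched above.
        grad_buffer.mul_(0.5)
    # 3) Whoever consumes grad_buffer afterwards must wait on comm_stream.

if __name__ == "__main__" and torch.cuda.is_available():
    buf = torch.ones(1 << 20, device="cuda")
    comm = torch.cuda.Stream()
    buf.add_(1.0)                                   # "gradient compute" on the default stream
    reduce_on_comm_stream(buf, comm)
    torch.cuda.current_stream().wait_stream(comm)   # consumer-side sync
    torch.cuda.synchronize()
    print(buf[0].item())                            # 1.0, i.e. (1 + 1) * 0.5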
with stream_context, _coalescing_manager( self.inter_distributed_optimizer_instance_group, async_ops=async_op ) as cm: @@ -366,13 +362,13 @@ class _ParamAndGradBucketGroup: communication call to complete. When ddp_config.overlap_grad_reduce is set to False, makes synchronous call. """ - # If overlap_grad_reduce is False, start (and finish) synchronous communication call here. self.param_gather_dispatched = False + # If overlap_grad_reduce is False, start (and finish) synchronous communication call here. if not self.ddp_config.overlap_grad_reduce: self.start_grad_sync() return - # When using partial DP DistOpt, we don't need to sync as we launch comms on a separate - # communication stream + # When using multiple DistOpt instances, we don't need to sync here as we launch + # communications on a separate communication stream. if self.ddp_config.num_distributed_optimizer_instances > 1: torch.cuda.default_stream().wait_stream(self.communication_stream) return diff --git a/megatron/core/distributed/torch_fully_sharded_data_parallel.py b/megatron/core/distributed/torch_fully_sharded_data_parallel.py old mode 100755 new mode 100644 diff --git a/megatron/core/enums.py b/megatron/core/enums.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/__init__.py b/megatron/core/export/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/data_type.py b/megatron/core/export/data_type.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/export_config.py b/megatron/core/export/export_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/model_type.py b/megatron/core/export/model_type.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/__init__.py b/megatron/core/export/trtllm/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/engine_builder/__init__.py b/megatron/core/export/trtllm/engine_builder/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py b/megatron/core/export/trtllm/engine_builder/trtllm_engine_builder.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py b/megatron/core/export/trtllm/model_to_trllm_mapping/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py b/megatron/core/export/trtllm/model_to_trllm_mapping/default_conversion_dict.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trt_model_config.py b/megatron/core/export/trtllm/trt_model_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trt_model_type.py b/megatron/core/export/trtllm/trt_model_type.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_helper.py b/megatron/core/export/trtllm/trtllm_helper.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_layers.py b/megatron/core/export/trtllm/trtllm_layers.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py b/megatron/core/export/trtllm/trtllm_weights_converter/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py old mode 100755 new mode 100644 diff --git 
a/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py b/megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py old mode 100755 new mode 100644 diff --git a/megatron/core/extensions/__init__.py b/megatron/core/extensions/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py old mode 100755 new mode 100644 index 62336cdb034919241112baa10c6407cb00506892..a89e272e51ef9a12b1421c3054f40f95f3c53d15 --- a/megatron/core/extensions/transformer_engine.py +++ b/megatron/core/extensions/transformer_engine.py @@ -13,8 +13,8 @@ from packaging.version import Version as PkgVersion from torch import Tensor from torch.nn.parameter import Parameter -from megatron.core import ModelParallelConfig from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding +from megatron.core.model_parallel_config import ModelParallelConfig from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.parallel_state import ( get_context_parallel_global_ranks, @@ -654,6 +654,23 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): else: kv_channels = self.config.kv_channels + self.kept_packed_seq_params = set( + field.name for field in dataclasses.fields(PackedSeqParams) + ) + if get_te_version() < PkgVersion("1.3.0"): + # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H + # copies (#555) + # These two arguments did not exist prior to 1.3.0 + self.kept_packed_seq_params.discard("max_seqlen_q") + self.kept_packed_seq_params.discard("max_seqlen_kv") + + if get_te_version() < PkgVersion("1.10.0"): + # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted + # in each individual sequence in THD format dataset + # These two arguments did not exist prior to 1.8.0. Full support added in 1.10.0 (#1012) + self.kept_packed_seq_params.discard("cu_seqlens_q_padded") + self.kept_packed_seq_params.discard("cu_seqlens_kv_padded") + super().__init__( num_attention_heads=self.config.num_attention_heads, kv_channels=kv_channels, @@ -683,7 +700,9 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): ): """Forward.""" packed_seq_kwargs = ( - dataclasses.asdict(packed_seq_params) if packed_seq_params is not None else {} + {key: getattr(packed_seq_params, key) for key in self.kept_packed_seq_params} + if packed_seq_params is not None + else {} ) # overwrite self.qkv_format depending on self.config.apply_rope_fusion, which can be set # after init @@ -692,24 +711,10 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): qkv_format = packed_seq_kwargs.get('qkv_format', self.qkv_format) - if get_te_version() < PkgVersion("1.3.0"): - # TE 1.3.0 introduces precomputing max_seqlen to remove unnecessary kernels and D2H - # copies (#555) - # These two arguments did not exist prior to 1.3.0 - packed_seq_kwargs.pop("max_seqlen_q", None) - packed_seq_kwargs.pop("max_seqlen_kv", None) - - if get_te_version() < PkgVersion("1.10.0"): - # TE 1.8.0 introduces cu_seqlens_padded which is the cu_seqlens with paddings counted - # in each individual sequence in THD format dataset - # These two arguments did not exist prior to 1.8.0.Full support added in 1.10.0 (#1012) - packed_seq_kwargs.pop("cu_seqlens_q_padded", None) - packed_seq_kwargs.pop("cu_seqlens_kv_padded", None) - # WAR for peak memory usage. 
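The TEDotProductAttention change above moves the Transformer Engine version check out of forward(): the set of PackedSeqParams fields the installed TE accepts is computed once in __init__ (kept_packed_seq_params) and then used to build the kwargs dict on every call. The following standalone sketch shows the same "filter dataclass fields by library version" idea; ToyPackedSeqParams and TOY_TE_VERSION are made-up stand-ins so it runs without Transformer Engine installed.

import dataclasses
from packaging.version import Version as PkgVersion

@dataclasses.dataclass
class ToyPackedSeqParams:              # invented stand-in for Megatron's PackedSeqParams
    qkv_format: str = "thd"
    cu_seqlens_q: object = None
    cu_seqlens_kv: object = None
    max_seqlen_q: object = None
    max_seqlen_kv: object = None
    cu_seqlens_q_padded: object = None
    cu_seqlens_kv_padded: object = None

TOY_TE_VERSION = PkgVersion("1.7.0")   # pretend this came from get_te_version()

# Decide once which fields the installed library can accept ...
kept = {f.name for f in dataclasses.fields(ToyPackedSeqParams)}
if TOY_TE_VERSION < PkgVersion("1.3.0"):
    kept.discard("max_seqlen_q")
    kept.discard("max_seqlen_kv")
if TOY_TE_VERSION < PkgVersion("1.10.0"):
    kept.discard("cu_seqlens_q_padded")
    kept.discard("cu_seqlens_kv_padded")

# ... and on every call just project the dataclass onto that set of fields.
params = ToyPackedSeqParams()
packed_seq_kwargs = {name: getattr(params, name) for name in kept}
print(sorted(packed_seq_kwargs))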
# See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2388 if self.config.apply_rope_fusion and qkv_format == 'bshd': - query, key, value = [x.contiguous().transpose(0, 1) for x in (query, key, value)] + query, key, value = [x.transpose(0, 1).contiguous() for x in (query, key, value)] # In PyTorch, the following two tensors are in fact the same: # Tensor with shape (1, S, H, D) and stride (S*H*D, H*D, D, 1) # Tensor with shape (1, S, H, D) and stride (H*D, H*D, D, 1) @@ -760,7 +765,7 @@ class TEDotProductAttention(te.pytorch.DotProductAttention): if is_te_min_version("1.9.0.dev0"): - class TEGroupedLinear(te.pytorch.GroupedLinear): + class TEGroupedLinear(te.pytorch.BatchLinear if int(os.getenv("GROUPED_GEMM_BatchLinear", '0')) else te.pytorch.GroupedLinear): """ Wrapper for the Transformer-Engine's `GroupedLinear` layer. @@ -1229,8 +1234,14 @@ try: from transformer_engine.pytorch.attention import FusedRoPEFunc - def fused_apply_rotary_pos_emb(t: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor: + def fused_apply_rotary_pos_emb( + t: torch.Tensor, freqs: torch.Tensor, transpose_output_memory: bool = False + ) -> torch.Tensor: """Apply rotary positional embedding to input tensor T in `sbhd` format.""" + if transpose_output_memory: + warnings.warn( + "transpose_output_memory is not supported by TE's fused RoPE and will be ignored." + ) return FusedRoPEFunc.apply(t, freqs, "sbhd") def fused_apply_rotary_pos_emb_thd( diff --git a/megatron/core/fusions/__init__.py b/megatron/core/fusions/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_dropout.py b/megatron/core/fusions/fused_bias_dropout.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_geglu.py b/megatron/core/fusions/fused_bias_geglu.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_gelu.py b/megatron/core/fusions/fused_bias_gelu.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_bias_swiglu.py b/megatron/core/fusions/fused_bias_swiglu.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_cross_entropy.py b/megatron/core/fusions/fused_cross_entropy.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_layer_norm.py b/megatron/core/fusions/fused_layer_norm.py old mode 100755 new mode 100644 diff --git a/megatron/core/fusions/fused_softmax.py b/megatron/core/fusions/fused_softmax.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/__init__.py b/megatron/core/inference/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/ammo_support/__init__.py b/megatron/core/inference/ammo_support/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/ammo_support/gpt/model_specs.py b/megatron/core/inference/ammo_support/gpt/model_specs.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py b/megatron/core/inference/ammo_support/gpt/state_dict_hooks.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/common_inference_params.py b/megatron/core/inference/common_inference_params.py old mode 100755 new mode 100644 index 22353088f8adbda6f331cce26a7a1297aa7faccd..7955bb6fc143bf5daf754d20817b23e1d0ab5d72 --- a/megatron/core/inference/common_inference_params.py +++ b/megatron/core/inference/common_inference_params.py @@ -1,29 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-from dataclasses import dataclass - - -@dataclass -class CommonInferenceParams: - """Inference parameters sent along with the prompts - - For an explanation of these parameters refer to this blog https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910 - """ - - temperature: float = 1.0 - top_k: int = 0 - top_p: float = 0.0 - return_log_probs: bool = False - num_tokens_to_generate: int = 30 - - def add_attributes(self, attribute_value_pair: dict): - """Utility to add more attributes to inference params - - Use this method to pass in a custom dictonary to add more inference parameter attributes to the instance you created. Use as follows - c = CommonInferenceParams - c.add_attributes({'min_length':4, 'eod_id':153}) - - Args: - attribute_value_pair (dict): A dictionary containing attributes as the key names and their values as the values. - """ - for key, value in attribute_value_pair.items(): - setattr(self, key, value) +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from megatron.core.inference.sampling_params import ( # noqa: F401 # pylint: disable=unused-import + SamplingParams as CommonInferenceParams, +) diff --git a/megatron/core/inference/communication_utils.py b/megatron/core/inference/communication_utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/engines/__init__.py b/megatron/core/inference/engines/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/engines/abstract_engine.py b/megatron/core/inference/engines/abstract_engine.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/engines/mcore_engine.py b/megatron/core/inference/engines/mcore_engine.py old mode 100755 new mode 100644 index fe8160228bdb6dd7e1bc966273c677ca50e7ae4f..28ef46bf9292999ee68028d98c41ff8e8aa74f43 --- a/megatron/core/inference/engines/mcore_engine.py +++ b/megatron/core/inference/engines/mcore_engine.py @@ -3,12 +3,12 @@ from typing import Dict, List import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.engines.abstract_engine import AbstractEngine from megatron.core.inference.inference_request import InferenceRequest +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.scheduler import Scheduler -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, ) @@ -19,7 +19,7 @@ class MCoreEngine(AbstractEngine): Supports any model that is callable (Accepts the inputs and outputs the tensor) Args: - text_generation_controller (SimpleTextGenerationController): A text generation + text_generation_controller (TextGenerationController): A text generation controller that will be used to define how to preprocess prompts, generate outputs and detokenizer the output tokens. 
max_batch_size : The maxinum number of requests to process at once @@ -29,7 +29,7 @@ class MCoreEngine(AbstractEngine): def __init__( self, - text_generation_controller: SimpleTextGenerationController, + text_generation_controller: TextGenerationController, max_batch_size, random_seed: int = None, ): @@ -42,7 +42,8 @@ class MCoreEngine(AbstractEngine): prompts: List[str], add_BOS: bool = False, encoder_prompts: List[str] = None, - common_inference_params: CommonInferenceParams = None, + common_inference_params: SamplingParams = None, + sampling_params: SamplingParams = None, ) -> dict: """The megatron core inference backend generate function @@ -54,13 +55,19 @@ class MCoreEngine(AbstractEngine): prompts (List[str]): All the prompts as a list of strings add_BOS (bool): Whether to add BOS token to beginning of prompts encoder_prompts (List[dict]): All the encoder prompts as a list of strings - common_inference_params (CommonInferenceParams): The inference parameters + common_inference_params: Deprecated. Only used for backward compatibility with + MCore <= 0.9.0. Use `sampling_params` going forward. + sampling_params (SamplingParams): The request-level sampling parameters Returns: List[InferenceRequest]: The output is list of inference requests containing the generated tokens, texts and log probs if required """ # TODO :M core- get rng state tracker + + if common_inference_params: + sampling_params = common_inference_params + if self.random_seed: torch.random.manual_seed(self.random_seed) @@ -73,7 +80,7 @@ class MCoreEngine(AbstractEngine): prompt=prompt, prompt_tokens=prompt_tokens, encoder_prompt=encoder_prompt, - inference_parameters=common_inference_params, + inference_parameters=sampling_params, ) self.run_engine() diff --git a/megatron/core/inference/inference_request.py b/megatron/core/inference/inference_request.py old mode 100755 new mode 100644 index 4825dfd3661d8b26e0cec5f003fdf1486886a2d5..ea0d67bfea26112db6219f53a3fcbec244e58ca3 --- a/megatron/core/inference/inference_request.py +++ b/megatron/core/inference/inference_request.py @@ -5,7 +5,7 @@ from typing import List import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams +from megatron.core.inference.sampling_params import SamplingParams # class syntax @@ -28,7 +28,7 @@ class InferenceRequest: request_id: str prompt: str - inference_parameters: CommonInferenceParams + inference_parameters: SamplingParams prompt_tokens: List[int] arrival_time: float status: Status diff --git a/megatron/core/inference/model_inference_wrappers/__init__.py b/megatron/core/inference/model_inference_wrappers/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/gpt/__init__.py b/megatron/core/inference/model_inference_wrappers/gpt/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py b/megatron/core/inference/model_inference_wrappers/inference_wrapper_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/t5/__init__.py 
b/megatron/core/inference/model_inference_wrappers/t5/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py b/megatron/core/inference/model_inference_wrappers/t5/t5_inference_wrapper.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/__init__.py b/megatron/core/inference/modelopt_support/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/gpt/__init__.py b/megatron/core/inference/modelopt_support/gpt/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/gpt/model_specs.py b/megatron/core/inference/modelopt_support/gpt/model_specs.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py b/megatron/core/inference/modelopt_support/gpt/state_dict_hooks.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/sampling_params.py b/megatron/core/inference/sampling_params.py new file mode 100644 index 0000000000000000000000000000000000000000..8ffcb6321dc13cb04574accaa0636987454222fa --- /dev/null +++ b/megatron/core/inference/sampling_params.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from dataclasses import dataclass + + +@dataclass +class SamplingParams: + """Inference parameters sent along with the prompts. + This class contains request-level attributes that control the sampling techniques used when + generating text. This is distinct from megatron.core.InferenceParams, which sets model-level + inference attributes such as the maximum sequence length, and contains the KV cache. + + For an explanation of these parameters refer to this blog + https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and- + temperature-parameters-ed6a31313910 + """ + + temperature: float = 1.0 + top_k: int = 0 + top_p: float = 0.0 + return_log_probs: bool = False + num_tokens_to_generate: int = 30 + + def add_attributes(self, attribute_value_pair: dict): + """Utility to add more attributes to sampling params + + Use this method to pass in a custom dictionary to add more sampling parameter attributes. + c = SamplingParams() + c.add_attributes({'min_length':4, 'eod_id':153}) + + Args: + attribute_value_pair (dict): A dictionary containing attributes as the key names and + their values as the values.
+ """ + for key, value in attribute_value_pair.items(): + setattr(self, key, value) diff --git a/megatron/core/inference/scheduler.py b/megatron/core/inference/scheduler.py old mode 100755 new mode 100644 index 00ab81b4abdb055e38e61b967c58509f9d773536..ef177232b42419fdae7284155701e403368561a6 --- a/megatron/core/inference/scheduler.py +++ b/megatron/core/inference/scheduler.py @@ -6,8 +6,8 @@ from typing import Dict import torch -from megatron.core.inference.common_inference_params import CommonInferenceParams from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.sampling_params import SamplingParams from megatron.core.inference.utils import Counter @@ -33,7 +33,7 @@ class Scheduler: prompt: str, prompt_tokens: torch.Tensor, encoder_prompt: str = None, - inference_parameters: CommonInferenceParams = None, + inference_parameters: SamplingParams = None, arrival_time: float = None, ): """Add an incoming request @@ -45,7 +45,7 @@ class Scheduler: prompt (str): Input prompt string prompt_tokens (torch.Tensor): A torch tensor having the input prompts tokenized encoder_prompt (str): Encoder input string - inference_parameters (CommonInferenceParams): The inference parameters + inference_parameters (SamplingParams): The inference parameters arrival_time (float, optional): The incoming request time. Defaults to None. """ request_id = str(next(self.request_counter)) diff --git a/megatron/core/inference/text_generation_controllers/__init__.py b/megatron/core/inference/text_generation_controllers/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py old mode 100755 new mode 100644 index 61beff0211ce25bfa5c207d7cf95c170ae0956e4..0c2a41be44a10228dce66541e0f93559e691f288 --- a/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/encoder_decoder_text_generation_controller.py @@ -4,15 +4,15 @@ from typing import OrderedDict import torch from megatron.core.inference.inference_request import InferenceRequest -from megatron.core.inference.text_generation_controllers.simple_text_generation_controller import ( - SimpleTextGenerationController, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( + TextGenerationController, ) -class EncoderDecoderTextGenerationController(SimpleTextGenerationController): +class EncoderDecoderTextGenerationController(TextGenerationController): """The text generation controller for encoder-decoder architecture - This class ingherits from SimpleTextGenerationController, adding features + This class inherits from TextGenerationController, adding features relating to encoder input encoder_prompt """ diff --git a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py old mode 100755 new mode 100644 index 1103089935ab01ee885f2264790a58c5a93bac64..f97df132493416b0f53c267b0c9088ef7f668a0d --- a/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py +++ b/megatron/core/inference/text_generation_controllers/simple_text_generation_controller.py @@ -1,400 +1,5 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-from typing import List, OrderedDict, Tuple +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -import torch -import torch.nn.functional as F - -from megatron.core import parallel_state -from megatron.core.inference.common_inference_params import CommonInferenceParams -from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage -from megatron.core.inference.inference_request import InferenceRequest, Status -from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( - AbstractModelInferenceWrapper, +from megatron.core.inference.text_generation_controllers.text_generation_controller import ( # noqa: F401 # pylint: disable=unused-import + TextGenerationController as SimpleTextGenerationController, ) - - -class SimpleTextGenerationController: - """The basic text generation controller - - This class is responsible for tokenizing the input , running the inference, sampling - and also detokenizing the output - - Args: - inference_wrapped_model (AbstractModelInferenceWrapper): A model that - is wrapped using the specs given in the abstract_model_inference_wrapper.py - tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts - """ - - def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): - self.inference_wrapped_model = inference_wrapped_model - self.tokenizer = tokenizer - - # For models without pipeline parallelism, is_first_stage and is_last_stage returns True - self.model_is_pipeline_parallel = not ( - parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() - ) - - def tokenize_prompt( - self, prompt: str, add_BOS: bool = False - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Utility to tokenize the input prompts - - Args: - prompt (str): The input prompt - - Returns: - torch.Tensor: Returns the tokenized prompt - """ - prompt_tokens = self.tokenizer.tokenize(prompt) - - if add_BOS: - prompt_tokens = [self.tokenizer.bos] + prompt_tokens - - return prompt_tokens - - def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: - """Detokenize the output generations - - Args: - prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt - tokens plus the generated tokens - - Returns: - str: The detokenized output - """ - tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist() - return self.tokenizer.detokenize(tokens) - - def sample_from_logits( - self, - last_token_logits: torch.Tensor, - common_inference_params: CommonInferenceParams, - vocab_size: int = None, - ) -> torch.Tensor: - """Samples the logits to generate outputs - - Given the logits of the last token, this function samples it - according to the parameters defined in common_inference_params - and returns the samples - - Args: - last_token_logits (torch.Tensor): The last token logits. A tensor of - size [batch_size, vocab_size] - common_inference_params (CommonInferenceParams): The paramters to use - for inference - vocab_size (int): Obtained from the tokenizer. 
Defaults to None - - Returns: - torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements - """ - - top_p = common_inference_params.top_p - top_k = common_inference_params.top_k - temperature = common_inference_params.temperature - - assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero' - assert top_p <= 1.0, 'top-p should be in (0,1]' - - def modify_logits_for_top_k_filtering(logits, top_k): - """Set the logits for none top-k values to -inf.""" - filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] - logits.masked_fill_(filter_, float('-Inf')) - - def modify_logits_for_top_p_filtering(logits, top_p): - """Set the logits for none top-p values to -inf.""" - # First sort and calculate cumulative sum of probabilities. - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) - - # Filteration based on the cumulative sum. - filter_ = cumulative_probs > top_p - # This shift by 1 is weird and I cannot justify it. This existed - # in the original implementation: - # https://github.com/ari-holtzman/degen/blob/master/gen.py - # and I guess it is needed so keeping it for now. - filter_[:, 1:] = filter_[:, :-1].clone() - # Make sure we at least have one token to select from. - filter_[..., 0] = 0 - - # Fill in the filtered part - filter_ = filter_.scatter(1, sorted_indices, filter_) - logits.masked_fill_(filter_, float('-Inf')) - - # Greedy sampling - if top_k == 1: - sampled_logits = torch.argmax(last_token_logits, dim=-1) - else: - last_token_logits = last_token_logits.clone() - if temperature != 1.0: - last_token_logits.div_(temperature) - - if top_k > 1: - assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.' - if vocab_size: - assert top_k < vocab_size, 'top-k is larger than vocab size.' - modify_logits_for_top_k_filtering(last_token_logits, top_k) - - elif top_p > 0.0: - modify_logits_for_top_p_filtering(last_token_logits, top_p) - - # After filtering, we need to recalculate the distribution. - probabilities = last_token_logits.softmax(dim=-1) - sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1) - - # If vocab size is provided, make sure the samples are in in the range [0, vocab-size). - if vocab_size: - sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1)) - return sampled_logits - - def update_generation_status( - self, - updated_prompts_tokens: torch.Tensor, - generation_started: torch.Tensor, - current_context_end_position: int, - is_generation_done_tensor: torch.Tensor, - generated_sequence_lengths: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """Checks which prompts have reached an end condition - - We check which prompts have reached an end condition and set the corresponding - flags of the is_generation_done_tensor to True. The generated sequence lengths - increase as we keep generating, until that prompts hits an end condition. The - generation_started tensor determines which prompts have started generating. - - Args: - updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest - generated tokens. A tensor of shape [batch_size, max_seq_len] - (i.e max_seq_len = max_prompt_len + tokens_to_generate) - generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True - indicates the prompt at that index has started generating tokens. 
- current_context_end_position (int): An integer indicating which position to - extract from the prompts tokens to get the latest generated tokens. - is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. - True indicates the prompt at that index has reached end condition. - generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. - Each value represents the generated sequence lengths for that prompt. - - Returns: - Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean - is_generation_done_tensor and the generated_sequence_lengths after updating it - """ - latest_samples = updated_prompts_tokens[:, current_context_end_position] - # Make sure we are checking eod criterion only for prompts that have started generating - # (i.e) We only look at the generated tokenns and not the input tokens. - reached_eod = (latest_samples == self.tokenizer.eod) & generation_started - is_generation_done_tensor = is_generation_done_tensor | reached_eod - # We increment generated sequence lengths when that prompt has not hit the - # EOD and generation has started - generated_sequence_lengths += ~is_generation_done_tensor & generation_started - - return is_generation_done_tensor, generated_sequence_lengths - - def pad_input_prompt_tokens( - self, - batch_prompt_tokens_list: List[List[int]], - max_prompt_length_in_batch: int, - num_tokens_to_generate: int, - ) -> torch.Tensor: - """Method to pad input prompts - - Given a list of prompts, pad them all to uniform length - - Args: - batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens - max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens - num_tokens_togenerate (int): The number of tokens to generate for each prompt - - Returns: - torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) - max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, - with extra indices for each tensor padded with mask id. - """ - max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate - - for prompt_tokens in batch_prompt_tokens_list: - padding_size = max_seq_len - len(prompt_tokens) - prompt_tokens.extend([self.tokenizer.eod] * padding_size) - - return torch.tensor(batch_prompt_tokens_list).cuda() - - def generate_output_tokens_dynamic_batch( - self, active_requests: OrderedDict[int, InferenceRequest] - ) -> OrderedDict[int, InferenceRequest]: - """Utility to generate the output tokens and probabilities for the prompts - - This utility generates the output tokens for a dynamic batch. It will run one forward step - at a time, and pass control back to the engine, which will update the request pool and call - this method again. - - Args: - active_requests (OrderedDict[int, InferenceRequest]): The input active requests. - - Returns: - OrderedDict[int, InferenceRequest]: The result for each of the incoming requests - after running one forward step. - """ - raise Exception("Not implemented yet") - - def generate_all_output_tokens_static_batch( - self, active_requests: OrderedDict[int, InferenceRequest] - ) -> OrderedDict[int, InferenceRequest]: - """Utility to generate the all the output tokens and probabilities for the prompts . - - This utility generates the output tokens for a static batch. 
It runs the forward steps till - all prompts complete generation, updates the status of these requests to completed, adds - the generated result and returns these requests - - Args: - active_requests (OrderedDict[int, InferenceRequest]): The input active requests. - - Returns: - OrderedDict[int, InferenceRequest]: The result for each of the incoming requests - """ - batch_prompt_tokens_list = list( - map(lambda request: request.prompt_tokens, active_requests.values()) - ) - prompt_lengths_in_batch = torch.tensor( - [len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list] - ).cuda() - max_prompt_length_in_batch = max(prompt_lengths_in_batch) - min_prompt_length_in_batch = min(prompt_lengths_in_batch) - - # For batch inference the inference params are the same for all request - common_inference_params: CommonInferenceParams = list(active_requests.values())[ - 0 - ].inference_parameters - - # max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate - batch_prompt_tokens = self.pad_input_prompt_tokens( - batch_prompt_tokens_list, - max_prompt_length_in_batch=max_prompt_length_in_batch, - num_tokens_to_generate=common_inference_params.num_tokens_to_generate, - ) - batch_size, max_sequence_length = batch_prompt_tokens.shape - - # Pre allocate log probs tensor - output_log_probs = None - if common_inference_params.return_log_probs: - output_log_probs = torch.empty( - (batch_size, max_sequence_length - 1), dtype=torch.float32 - ).cuda() - - # An array to check which of the prompts have reached end of generation condition - is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda() - - # An array to act as a counter to keep track of generated sequence lengths - generated_sequence_lengths = torch.zeros(batch_size).cuda() - - with torch.no_grad(): - - self.prep_model_for_inference( - prompts_tokens=batch_prompt_tokens, active_requests=active_requests - ) - - context_start_position = 0 - # Pick the context window that we need to pass through the network. - for context_end_position in range(min_prompt_length_in_batch, max_sequence_length): - - inference_input = self.inference_wrapped_model.get_batch_for_context_window( - context_start_position, context_end_position - ) - - # Returns the final logits of shape [batch_size, context_length, vocab_size] - # Note: This is returned in all TP ranks or last PP stage in PP models - logits = self.inference_wrapped_model.run_one_forward_step(inference_input) - if self.model_is_pipeline_parallel: - context_length = context_end_position - context_start_position - logits = broadcast_from_last_pipeline_stage( - [batch_size, context_length, self.tokenizer.vocab_size], - dtype=self.inference_wrapped_model.inference_wrapper_config.params_dtype, - tensor=logits, - ) - - # Indicates which of the input prompts have started generating tokens. 
- # A 1D boolean tensor with [batch_size] elements (i.e) The shortest - # prompts will start generating first and so on - generation_started = prompt_lengths_in_batch <= context_end_position - last_token_logits = logits[:, -1, :] - sampled_logits = self.sample_from_logits( - last_token_logits, common_inference_params, self.tokenizer.vocab_size - ) - - # Substitute the sampled logits only for only the prompts that - # have started generating tokens - batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[ - generation_started - ] - - if common_inference_params.return_log_probs: - log_probs = F.log_softmax(logits, dim=2) - indices = torch.unsqueeze( - batch_prompt_tokens[ - :, (context_start_position + 1) : (context_end_position + 1) - ], - 2, - ) - # Get the log probabilities for only the prompt tokens - output_log_probs[:, context_start_position:context_end_position] = torch.gather( - log_probs, 2, indices - ).squeeze(2) - - context_start_position = context_end_position - - # Check end of generation status for each tensor - # and update generated sequence lengths - (is_generation_done_tensor, generated_sequence_lengths) = ( - self.update_generation_status( - updated_prompts_tokens=batch_prompt_tokens, - generation_started=generation_started, - current_context_end_position=context_end_position, - is_generation_done_tensor=is_generation_done_tensor, - generated_sequence_lengths=generated_sequence_lengths, - ) - ) - # Boolean flag indicating if all prompts are finished - all_prompts_done = torch.all(is_generation_done_tensor) - if all_prompts_done: - break - - # Include all the generated tokens - batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)] - if common_inference_params.return_log_probs: - output_log_probs = output_log_probs[:, :context_end_position] - - generated_sequence_lengths[ - generated_sequence_lengths > common_inference_params.num_tokens_to_generate - ] = common_inference_params.num_tokens_to_generate - - for idx, request in enumerate(active_requests.values()): - input_prompt_length = int(prompt_lengths_in_batch[idx]) - # Shorter prompts might have generated more than required tokens. 
So we trim them down - required_sequence_length = int( - min(generated_sequence_lengths[idx], common_inference_params.num_tokens_to_generate) - ) - # Extract only the generated tokens - required_result_tokens = batch_prompt_tokens_with_generations[ - idx, input_prompt_length : (input_prompt_length + required_sequence_length) - ] - - request.generated_length = required_sequence_length - request.generated_tokens = required_result_tokens - request.generated_log_probs = ( - None - if output_log_probs is None - else output_log_probs[idx, input_prompt_length:required_sequence_length] - ) - request.status = Status.COMPLETED - request.generated_text = self.detokenize_generations(required_result_tokens) - - return active_requests - - def prep_model_for_inference( - self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] - ): - """Preparing batch for inference, using respective wrapper's prep_model_for_inference method - - Args: - prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] - active_requests (OrderedDict[int, InferenceRequest]): The input active requests - """ - self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/core/inference/text_generation_controllers/text_generation_controller.py b/megatron/core/inference/text_generation_controllers/text_generation_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..f15c819c43a5f824224b958e78d2359260a18640 --- /dev/null +++ b/megatron/core/inference/text_generation_controllers/text_generation_controller.py @@ -0,0 +1,400 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +from typing import List, OrderedDict, Tuple + +import torch +import torch.nn.functional as F + +from megatron.core import parallel_state +from megatron.core.inference.communication_utils import broadcast_from_last_pipeline_stage +from megatron.core.inference.inference_request import InferenceRequest, Status +from megatron.core.inference.model_inference_wrappers.abstract_model_inference_wrapper import ( + AbstractModelInferenceWrapper, +) +from megatron.core.inference.sampling_params import SamplingParams + + +class TextGenerationController: + """The text generation controller (the main sampling loop) + + This class tokenizes the input, runs inference, samples from logits, and detokenizes the output. 
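The sample_from_logits method defined a little further below applies temperature scaling and then either greedy decoding (top_k == 1), top-k masking, or nucleus (top-p) masking before drawing from the resulting distribution. Here is a compact, standalone rendering of that filtering over a toy logits tensor, using plain PyTorch and none of the controller's state; treat it as an illustration of the logic, not the controller's API.

import torch

def sample(last_token_logits: torch.Tensor, temperature=1.0, top_k=0, top_p=0.0) -> torch.Tensor:
    assert not (top_k > 0 and top_p > 0), "top-k and top-p are mutually exclusive"
    if top_k == 1:
        return torch.argmax(last_token_logits, dim=-1)            # greedy
    logits = last_token_logits.clone() / temperature
    if top_k > 1:
        kth = torch.topk(logits, top_k)[0][..., -1, None]
        logits.masked_fill_(logits < kth, float("-inf"))          # keep only the k best
    elif top_p > 0.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        cumprobs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
        drop = cumprobs > top_p
        drop[:, 1:] = drop[:, :-1].clone()                        # keep the first token past the threshold
        drop[..., 0] = False                                      # always keep at least one token
        logits.masked_fill_(drop.scatter(1, sorted_idx, drop), float("-inf"))
    return torch.multinomial(logits.softmax(dim=-1), num_samples=1).view(-1)

print(sample(torch.randn(2, 10), top_k=1))                        # greedy token ids, shape [2]
print(sample(torch.randn(2, 10), temperature=0.7, top_p=0.9))     # nucleus sampling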
+ + Args: + inference_wrapped_model (AbstractModelInferenceWrapper): A model that + is wrapped using the specs given in the abstract_model_inference_wrapper.py + tokenizer (_type_): Tokenizer used for tokenizing and detokenizing the prompts + """ + + def __init__(self, inference_wrapped_model: AbstractModelInferenceWrapper, tokenizer): + self.inference_wrapped_model = inference_wrapped_model + self.tokenizer = tokenizer + + # For models without pipeline parallelism, is_first_stage and is_last_stage returns True + self.model_is_pipeline_parallel = not ( + parallel_state.is_pipeline_first_stage() and parallel_state.is_pipeline_last_stage() + ) + + def tokenize_prompt( + self, prompt: str, add_BOS: bool = False + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility to tokenize the input prompts + + Args: + prompt (str): The input prompt + + Returns: + torch.Tensor: Returns the tokenized prompt + """ + prompt_tokens = self.tokenizer.tokenize(prompt) + + if add_BOS: + prompt_tokens = [self.tokenizer.bos] + prompt_tokens + + return prompt_tokens + + def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str: + """Detokenize the output generations + + Args: + prompt_tokens_with_generated_tokens (torch.Tensor): The input prompt + tokens plus the generated tokens + + Returns: + str: The detokenized output + """ + tokens = prompt_tokens_with_generated_tokens.cpu().numpy().tolist() + return self.tokenizer.detokenize(tokens) + + def sample_from_logits( + self, + last_token_logits: torch.Tensor, + sampling_params: SamplingParams = None, + vocab_size: int = None, + **kwargs + ) -> torch.Tensor: + """Samples the logits to generate outputs + + Given the logits of the last token, this function samples it + according to the parameters defined in sampling_params + and returns the samples + + Args: + last_token_logits (torch.Tensor): The last token logits. A tensor of + size [batch_size, vocab_size] + sampling_params (SamplingParams): The parameters to use for inference. + vocab_size (int): Obtained from the tokenizer. Defaults to None + + Returns: + torch.Tensor: 1D tensor of the sampled logits with [batch_size] elements + """ + + if kwargs.get('common_inference_params'): + sampling_params = kwargs['common_inference_params'] + + top_p = sampling_params.top_p + top_k = sampling_params.top_k + temperature = sampling_params.temperature + + assert not (top_k > 0 and top_p > 0), 'Cannot have top-p and top-k both greater than zero' + assert top_p <= 1.0, 'top-p should be in (0,1]' + + def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf.""" + filter_ = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits.masked_fill_(filter_, float('-Inf')) + + def modify_logits_for_top_p_filtering(logits, top_p): + """Set the logits for none top-p values to -inf.""" + # First sort and calculate cumulative sum of probabilities. + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Filteration based on the cumulative sum. + filter_ = cumulative_probs > top_p + # This shift by 1 is weird and I cannot justify it. This existed + # in the original implementation: + # https://github.com/ari-holtzman/degen/blob/master/gen.py + # and I guess it is needed so keeping it for now. + filter_[:, 1:] = filter_[:, :-1].clone() + # Make sure we at least have one token to select from. 
+ filter_[..., 0] = 0 + + # Fill in the filtered part + filter_ = filter_.scatter(1, sorted_indices, filter_) + logits.masked_fill_(filter_, float('-Inf')) + + # Greedy sampling + if top_k == 1: + sampled_logits = torch.argmax(last_token_logits, dim=-1) + else: + last_token_logits = last_token_logits.clone() + if temperature != 1.0: + last_token_logits.div_(temperature) + + if top_k > 1: + assert top_k <= last_token_logits.size(1), 'top-k is larger than logit size.' + if vocab_size: + assert top_k < vocab_size, 'top-k is larger than vocab size.' + modify_logits_for_top_k_filtering(last_token_logits, top_k) + + elif top_p > 0.0: + modify_logits_for_top_p_filtering(last_token_logits, top_p) + + # After filtering, we need to recalculate the distribution. + probabilities = last_token_logits.softmax(dim=-1) + sampled_logits = torch.multinomial(probabilities, num_samples=1).view(-1) + + # If vocab size is provided, make sure the samples are in in the range [0, vocab-size). + if vocab_size: + sampled_logits = torch.clamp(sampled_logits, min=0, max=(vocab_size - 1)) + return sampled_logits + + def update_generation_status( + self, + updated_prompts_tokens: torch.Tensor, + generation_started: torch.Tensor, + current_context_end_position: int, + is_generation_done_tensor: torch.Tensor, + generated_sequence_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Checks which prompts have reached an end condition + + We check which prompts have reached an end condition and set the corresponding + flags of the is_generation_done_tensor to True. The generated sequence lengths + increase as we keep generating, until that prompts hits an end condition. The + generation_started tensor determines which prompts have started generating. + + Args: + updated_prompts_tokens (torch.Tensor): The prompts tokens updated with the latest + generated tokens. A tensor of shape [batch_size, max_seq_len] + (i.e max_seq_len = max_prompt_len + tokens_to_generate) + generation_started (torch.Tensor): A boolean tensor of shape [batch_size]. True + indicates the prompt at that index has started generating tokens. + current_context_end_position (int): An integer indicating which position to + extract from the prompts tokens to get the latest generated tokens. + is_generation_done_tensor (torch.Tensor): A boolean tensor of shape [batch_size]. + True indicates the prompt at that index has reached end condition. + generated_sequence_lengths (torch.Tensor): A int tensor of shape [batch_size]. + Each value represents the generated sequence lengths for that prompt. + + Returns: + Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: Returns the boolean + is_generation_done_tensor and the generated_sequence_lengths after updating it + """ + latest_samples = updated_prompts_tokens[:, current_context_end_position] + # Make sure we are checking eod criterion only for prompts that have started generating + # (i.e) We only look at the generated tokenns and not the input tokens. 
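update_generation_status, continued just below, marks a prompt as done once it has started generating and its newest token equals the EOD id, and it stops growing that prompt's generated length from then on. The toy walk-through below reproduces that bookkeeping on two hand-made sequences; eod_id and the tensors are invented for illustration.

import torch

eod_id = 2
tokens = torch.tensor([[5, 7, 2, 0],     # hits EOD at position 2
                       [5, 7, 9, 4]])    # still generating
generation_started = torch.tensor([True, True])
done = torch.zeros(2, dtype=torch.bool)
lengths = torch.zeros(2)

for pos in range(1, tokens.shape[1]):
    latest = tokens[:, pos]
    done = done | ((latest == eod_id) & generation_started)
    lengths += ~done & generation_started   # only count while not finished

print(done)     # tensor([ True, False])
print(lengths)  # tensor([1., 3.])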
+ reached_eod = (latest_samples == self.tokenizer.eod) & generation_started + is_generation_done_tensor = is_generation_done_tensor | reached_eod + # We increment generated sequence lengths when that prompt has not hit the + # EOD and generation has started + generated_sequence_lengths += ~is_generation_done_tensor & generation_started + + return is_generation_done_tensor, generated_sequence_lengths + + def pad_input_prompt_tokens( + self, + batch_prompt_tokens_list: List[List[int]], + max_prompt_length_in_batch: int, + num_tokens_to_generate: int, + ) -> torch.Tensor: + """Method to pad input prompts + + Given a list of prompts, pad them all to uniform length + + Args: + batch_prompt_tokens_list (List[List[int]]): A list containing the prompt tokens + max_prompt_length_in_batch (int): Maximum of the length of the input prompt tokens + num_tokens_togenerate (int): The number of tokens to generate for each prompt + + Returns: + torch.Tensor: A torch tensor of shape [bs, max_seq_len] (i.e) + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate, + with extra indices for each tensor padded with mask id. + """ + max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + + for prompt_tokens in batch_prompt_tokens_list: + padding_size = max_seq_len - len(prompt_tokens) + prompt_tokens.extend([self.tokenizer.eod] * padding_size) + + return torch.tensor(batch_prompt_tokens_list).cuda() + + def generate_output_tokens_dynamic_batch( + self, active_requests: OrderedDict[int, InferenceRequest] + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the output tokens and probabilities for the prompts + + This utility generates the output tokens for a dynamic batch. It will run one forward step + at a time, and pass control back to the engine, which will update the request pool and call + this method again. + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. + + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + after running one forward step. + """ + raise Exception("Not implemented yet") + + def generate_all_output_tokens_static_batch( + self, active_requests: OrderedDict[int, InferenceRequest] + ) -> OrderedDict[int, InferenceRequest]: + """Utility to generate the all the output tokens and probabilities for the prompts . + + This utility generates the output tokens for a static batch. It runs the forward steps till + all prompts complete generation, updates the status of these requests to completed, adds + the generated result and returns these requests + + Args: + active_requests (OrderedDict[int, InferenceRequest]): The input active requests. 
+ + Returns: + OrderedDict[int, InferenceRequest]: The result for each of the incoming requests + """ + batch_prompt_tokens_list = list( + map(lambda request: request.prompt_tokens, active_requests.values()) + ) + prompt_lengths_in_batch = torch.tensor( + [len(prompt_tokens) for prompt_tokens in batch_prompt_tokens_list] + ).cuda() + max_prompt_length_in_batch = max(prompt_lengths_in_batch) + min_prompt_length_in_batch = min(prompt_lengths_in_batch) + + # For batch inference the inference params are the same for all request + sampling_params: SamplingParams = list(active_requests.values())[0].inference_parameters + + # max_seq_len = max_prompt_length_in_batch + num_tokens_to_generate + batch_prompt_tokens = self.pad_input_prompt_tokens( + batch_prompt_tokens_list, + max_prompt_length_in_batch=max_prompt_length_in_batch, + num_tokens_to_generate=sampling_params.num_tokens_to_generate, + ) + batch_size, max_sequence_length = batch_prompt_tokens.shape + + # Pre allocate log probs tensor + output_log_probs = None + if sampling_params.return_log_probs: + output_log_probs = torch.empty( + (batch_size, max_sequence_length - 1), dtype=torch.float32 + ).cuda() + + # An array to check which of the prompts have reached end of generation condition + is_generation_done_tensor = torch.zeros(batch_size, dtype=torch.bool).cuda() + + # An array to act as a counter to keep track of generated sequence lengths + generated_sequence_lengths = torch.zeros(batch_size).cuda() + + with torch.no_grad(): + + self.prep_model_for_inference( + prompts_tokens=batch_prompt_tokens, active_requests=active_requests + ) + + context_start_position = 0 + # Pick the context window that we need to pass through the network. + for context_end_position in range(min_prompt_length_in_batch, max_sequence_length): + + inference_input = self.inference_wrapped_model.get_batch_for_context_window( + context_start_position, context_end_position + ) + + # Returns the final logits of shape [batch_size, context_length, vocab_size] + # Note: This is returned in all TP ranks or last PP stage in PP models + logits = self.inference_wrapped_model.run_one_forward_step(inference_input) + if self.model_is_pipeline_parallel: + context_length = context_end_position - context_start_position + logits = broadcast_from_last_pipeline_stage( + [batch_size, context_length, self.tokenizer.vocab_size], + dtype=self.inference_wrapped_model.inference_wrapper_config.params_dtype, + tensor=logits, + ) + + # Indicates which of the input prompts have started generating tokens. 
+                # A 1D boolean tensor with [batch_size] elements, i.e. the shortest
+                # prompts will start generating first, and so on
+                generation_started = prompt_lengths_in_batch <= context_end_position
+                last_token_logits = logits[:, -1, :]
+                sampled_logits = self.sample_from_logits(
+                    last_token_logits, sampling_params, self.tokenizer.vocab_size
+                )
+
+                # Substitute the sampled logits only for the prompts that
+                # have started generating tokens
+                batch_prompt_tokens[generation_started, context_end_position] = sampled_logits[
+                    generation_started
+                ]
+
+                if sampling_params.return_log_probs:
+                    log_probs = F.log_softmax(logits, dim=2)
+                    indices = torch.unsqueeze(
+                        batch_prompt_tokens[
+                            :, (context_start_position + 1) : (context_end_position + 1)
+                        ],
+                        2,
+                    )
+                    # Get the log probabilities for only the prompt tokens
+                    output_log_probs[:, context_start_position:context_end_position] = torch.gather(
+                        log_probs, 2, indices
+                    ).squeeze(2)
+
+                context_start_position = context_end_position
+
+                # Check the end-of-generation status for each tensor
+                # and update generated sequence lengths
+                (is_generation_done_tensor, generated_sequence_lengths) = (
+                    self.update_generation_status(
+                        updated_prompts_tokens=batch_prompt_tokens,
+                        generation_started=generation_started,
+                        current_context_end_position=context_end_position,
+                        is_generation_done_tensor=is_generation_done_tensor,
+                        generated_sequence_lengths=generated_sequence_lengths,
+                    )
+                )
+                # Boolean flag indicating if all prompts are finished
+                all_prompts_done = torch.all(is_generation_done_tensor)
+                if all_prompts_done:
+                    break
+
+        # Include all the generated tokens
+        batch_prompt_tokens_with_generations = batch_prompt_tokens[:, : (context_end_position + 1)]
+        if sampling_params.return_log_probs:
+            output_log_probs = output_log_probs[:, :context_end_position]
+
+        generated_sequence_lengths[
+            generated_sequence_lengths > sampling_params.num_tokens_to_generate
+        ] = sampling_params.num_tokens_to_generate
+
+        for idx, request in enumerate(active_requests.values()):
+            input_prompt_length = int(prompt_lengths_in_batch[idx])
+            # Shorter prompts might have generated more than required tokens.
So we trim them down + required_sequence_length = int( + min(generated_sequence_lengths[idx], sampling_params.num_tokens_to_generate) + ) + # Extract only the generated tokens + required_result_tokens = batch_prompt_tokens_with_generations[ + idx, input_prompt_length : (input_prompt_length + required_sequence_length) + ] + + request.generated_length = required_sequence_length + request.generated_tokens = required_result_tokens + request.generated_log_probs = ( + None + if output_log_probs is None + else output_log_probs[idx, input_prompt_length:required_sequence_length] + ) + request.status = Status.COMPLETED + request.generated_text = self.detokenize_generations(required_result_tokens) + + return active_requests + + def prep_model_for_inference( + self, prompts_tokens: torch.Tensor, active_requests: OrderedDict[int, InferenceRequest] + ): + """Preparing batch for inference, using respective wrapper's prep_model_for_inference method + + Args: + prompts_tokens (torch.Tensor): A tensor of shape [batch_size, max_sequence_length] + active_requests (OrderedDict[int, InferenceRequest]): The input active requests + """ + self.inference_wrapped_model.prep_model_for_inference(prompts_tokens=prompts_tokens) diff --git a/megatron/core/inference/utils.py b/megatron/core/inference/utils.py old mode 100755 new mode 100644 diff --git a/megatron/core/inference_params.py b/megatron/core/inference_params.py old mode 100755 new mode 100644 diff --git a/megatron/core/jit.py b/megatron/core/jit.py old mode 100755 new mode 100644 index 5b1dfff3e7786af920e99bff9b3491793e5a0c91..c35c41b9fa226b928e7dc35d5dcec95f2b6a6c2c --- a/megatron/core/jit.py +++ b/megatron/core/jit.py @@ -7,4 +7,18 @@ from megatron.core.utils import is_torch_min_version jit_fuser = torch.jit.script # nvFuser is deprecated in PyTorch JIT starting from 2.2 if is_torch_min_version("2.2.0a0"): - jit_fuser = torch.compile + jit_fuser = torch.compile(mode='max-autotune-no-cudagraphs') + +# Decorator to disable Torch Dynamo +# See: https://github.com/NVIDIA/TransformerEngine/issues/308 +no_torch_dynamo = lambda recursive=True: lambda func: func +if torch.__version__ >= "2": + import torch._dynamo + + if torch.__version__ >= "2.1": + no_torch_dynamo = lambda recursive=True: lambda f: torch._dynamo.disable( + f, recursive=recursive + ) + else: + # no "recursive" option in pyTorch 2.0 - it acts as if recursive was True + no_torch_dynamo = lambda recursive=True: torch._dynamo.disable diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/T5/__init__.py b/megatron/core/models/T5/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/T5/t5_model.py b/megatron/core/models/T5/t5_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/T5/t5_spec.py b/megatron/core/models/T5/t5_spec.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/__init__.py b/megatron/core/models/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/__init__.py b/megatron/core/models/bert/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/bert_layer_specs.py b/megatron/core/models/bert/bert_layer_specs.py old mode 100755 new mode 100644 index 80893d54aca05467442705bf9a5b7c650985bec0..4edc2ed6285f3ad7b0f4ecc5c66121f804335cc8 --- a/megatron/core/models/bert/bert_layer_specs.py +++ b/megatron/core/models/bert/bert_layer_specs.py @@ -1,4 +1,6 @@ # Copyright (c) 2024, 
NVIDIA CORPORATION. All rights reserved. +import warnings + from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules @@ -28,38 +30,60 @@ try: HAVE_APEX = True LNImpl = FusedLayerNorm except ImportError: - import warnings from megatron.core.transformer.torch_norm import WrappedTorchNorm - warnings.warn(f'Apex is not installed. Falling back to Torch Norm') + warnings.warn('Apex is not installed. Falling back to Torch Norm') LNImpl = WrappedTorchNorm -# Use this spec to use lower level Transformer Engine modules (required for fp8 training) -bert_layer_with_transformer_engine_spec = ModuleSpec( - module=TransformerLayer, - submodules=TransformerLayerSubmodules( - self_attention=ModuleSpec( - module=SelfAttention, - params={"attn_mask_type": AttnMaskType.padding}, - submodules=SelfAttentionSubmodules( - linear_qkv=TELayerNormColumnParallelLinear, - core_attention=TEDotProductAttention, - linear_proj=TERowParallelLinear, - q_layernorm=IdentityOp, - k_layernorm=IdentityOp, + +def get_bert_layer_with_transformer_engine_spec(): + """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). + + Returns: + ModuleSpec: Module specification with TE modules + """ + if not HAVE_TE: + raise ImportError( + "Transformer Engine is not installed. Please use local Bert layer spec instead." + ) + + return ModuleSpec( + module=TransformerLayer, + submodules=TransformerLayerSubmodules( + self_attention=ModuleSpec( + module=SelfAttention, + params={"attn_mask_type": AttnMaskType.padding}, + submodules=SelfAttentionSubmodules( + linear_qkv=TELayerNormColumnParallelLinear, + core_attention=TEDotProductAttention, + linear_proj=TERowParallelLinear, + q_layernorm=IdentityOp, + k_layernorm=IdentityOp, + ), ), - ), - self_attn_bda=get_bias_dropout_add, - mlp=ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + self_attn_bda=get_bias_dropout_add, + mlp=ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear + ), ), + mlp_bda=get_bias_dropout_add, ), - mlp_bda=get_bias_dropout_add, - ), -) + ) + + +def __getattr__(name): + if name == 'bert_layer_with_transformer_engine_spec': + warnings.warn( + """Attribute bert_layer_specs.bert_layer_with_transformer_engine_spec is on a + deprecation track and will be removed in future releases. 
Please migrate to + bert_layer_specs.get_bert_layer_with_transformer_engine_spec().""" + ) + + return get_bert_layer_with_transformer_engine_spec() + # Use this spec for an implementation using only modules in megatron core bert_layer_local_spec = ModuleSpec( diff --git a/megatron/core/models/bert/bert_lm_head.py b/megatron/core/models/bert/bert_lm_head.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/bert_model.py b/megatron/core/models/bert/bert_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/bert/pooler.py b/megatron/core/models/bert/pooler.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/__init__.py b/megatron/core/models/common/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/__init__.py b/megatron/core/models/common/embeddings/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/language_model_embedding.py b/megatron/core/models/common/embeddings/language_model_embedding.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py old mode 100755 new mode 100644 index f1d7ad48d2b9620fce14e15de512c309cbb02da4..3dd5193ca246d3b648917fa8d33f673a2039ee33 --- a/megatron/core/models/common/embeddings/rope_utils.py +++ b/megatron/core/models/common/embeddings/rope_utils.py @@ -17,23 +17,24 @@ from megatron.core.utils import is_te_min_version logger = logging.getLogger(__name__) +# Prefer fused RoPE from Apex as we need the `transpose_output_memory` argument for the bshd trick. +# See https://gitlab-master.nvidia.com/ADLR/megatron-lm/-/merge_requests/2469. try: - from megatron.core.extensions.transformer_engine import ( - fused_apply_rotary_pos_emb, - fused_apply_rotary_pos_emb_thd, - ) - - HAVE_APPLY_ROPE_FUSION = True + from apex.transformer.functional import fused_apply_rotary_pos_emb except ImportError: try: - from apex.transformer.functional import ( - fused_apply_rotary_pos_emb, - fused_apply_rotary_pos_emb_thd, - ) + from megatron.core.extensions.transformer_engine import fused_apply_rotary_pos_emb + except: + fused_apply_rotary_pos_emb = None + - HAVE_APPLY_ROPE_FUSION = True +try: + from megatron.core.extensions.transformer_engine import fused_apply_rotary_pos_emb_thd +except ImportError: + try: + from apex.transformer.functional import fused_apply_rotary_pos_emb_thd except ImportError: - HAVE_APPLY_ROPE_FUSION = False + fused_apply_rotary_pos_emb_thd = None try: @@ -188,8 +189,10 @@ def apply_rotary_pos_emb( if config.apply_rope_fusion: if cu_seqlens is None: - return fused_apply_rotary_pos_emb(t, freqs) + assert fused_apply_rotary_pos_emb is not None, "apply_rope_fusion is not available." + return fused_apply_rotary_pos_emb(t, freqs, transpose_output_memory=True) else: + assert fused_apply_rotary_pos_emb_thd is not None, "apply_rope_fusion is not available." 
cp_size = parallel_state.get_context_parallel_world_size() if cp_size > 1: if not is_te_min_version("1.11.0", check_equality=False): diff --git a/megatron/core/models/common/embeddings/rotary_pos_embedding.py b/megatron/core/models/common/embeddings/rotary_pos_embedding.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/language_module/__init__.py b/megatron/core/models/common/language_module/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/vision_module/__init__.py b/megatron/core/models/common/vision_module/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/common/vision_module/vision_module.py b/megatron/core/models/common/vision_module/vision_module.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/gpt/__init__.py b/megatron/core/models/gpt/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py old mode 100755 new mode 100644 index 749be324ed62fe9c96efcf126902d448d8313881..d0e48c190cacc27a944e9a4bc3a748e3c4570eb7 --- a/megatron/core/models/gpt/gpt_layer_specs.py +++ b/megatron/core/models/gpt/gpt_layer_specs.py @@ -1,16 +1,16 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +import warnings from typing import Optional from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add +from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.dot_product_attention import DotProductAttention from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp from megatron.core.transformer.mlp import MLP, MLPSubmodules -from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules -from megatron.core.transformer.moe.shared_experts import SharedExpertMLP from megatron.core.transformer.multi_latent_attention import ( MLASelfAttention, MLASelfAttentionSubmodules, @@ -26,12 +26,10 @@ from megatron.core.utils import is_te_min_version try: from megatron.core.extensions.transformer_engine import ( - TEColumnParallelGroupedLinear, TEColumnParallelLinear, TEDotProductAttention, TELayerNormColumnParallelLinear, TENorm, - TERowParallelGroupedLinear, TERowParallelLinear, ) @@ -47,8 +45,6 @@ try: HAVE_APEX = True LNImpl = FusedLayerNorm except ImportError: - import warnings - from megatron.core.transformer.torch_norm import WrappedTorchNorm warnings.warn('Apex is not installed. Falling back to Torch Norm') @@ -60,7 +56,8 @@ def get_gpt_layer_with_transformer_engine_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, - fp8: Optional[str] = None, + fp8: Optional[str] = None, # pylint: disable=unused-arguments + moe_use_legacy_grouped_gemm: Optional[bool] = False, ) -> ModuleSpec: """Use this spec to use lower-level Transformer Engine modules (required for fp8 training). 
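Illustrative aside, not part of the patch: with the signature above, callers that previously selected the MoE linear-layer spec via `fp8` now pass `moe_use_legacy_grouped_gemm`; the deprecated `fp8` argument is still accepted but only emits a warning. A minimal usage sketch, assuming Transformer Engine is installed and using a purely illustrative expert count of 8:

    from megatron.core.models.gpt.gpt_layer_specs import (
        get_gpt_layer_with_transformer_engine_spec,
    )

    # Dense GPT layer spec (num_experts defaults to None).
    dense_spec = get_gpt_layer_with_transformer_engine_spec(qk_layernorm=True)

    # MoE layer spec; grouped GEMM uses TEGroupedMLP unless the legacy path is forced.
    moe_spec = get_gpt_layer_with_transformer_engine_spec(
        num_experts=8,
        moe_grouped_gemm=True,
        moe_use_legacy_grouped_gemm=False,  # set True to force the legacy GroupedMLP
    )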
@@ -69,13 +66,24 @@ def get_gpt_layer_with_transformer_engine_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. - fp8 (str, optional): Flag to decide the linear layer spec for MoE. Defaults to None. + fp8 (str, optional): Deprecated. For temporary Nemo compatibility. + moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. + Defaults to False. Returns: ModuleSpec: Module specification with TE modules """ + if fp8 is not None: + warnings.warn( + 'The fp8 argument in "get_gpt_layer_with_transformer_engine_spec" has been deprecated' + ' and will be removed soon. Please update your code accordingly.' + ) + mlp = _get_mlp_module_spec( - use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 + use_te=True, + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, ) if multi_latent_attention: @@ -138,6 +146,8 @@ def get_gpt_layer_local_spec( moe_grouped_gemm: Optional[bool] = False, qk_layernorm: Optional[bool] = False, multi_latent_attention: Optional[bool] = False, + fp8: Optional[str] = None, # pylint: disable=unused-arguments + moe_use_legacy_grouped_gemm: Optional[bool] = False, ) -> ModuleSpec: """Use this spec for an implementation using only modules in Megatron-Core. @@ -146,13 +156,24 @@ def get_gpt_layer_local_spec( num_experts (int, optional): Number of experts. Defaults to None. moe_grouped_gemm (bool, optional): To use Grouped GEMM. Defaults to False. qk_layernorm (bool, optional): To use layernorm for queries/keys. Defaults to False. + fp8 (str, optional): Deprecated. For temporary Nemo compatibility. + moe_use_legacy_grouped_gemm (bool, optional): Force use the legacy GroupedMLP. + Defaults to False. Returns: ModuleSpec: Module specification with Megatron-Core modules """ + if fp8 is not None: + warnings.warn( + 'The fp8 argument in "get_gpt_layer_local_spec" has been deprecated' + ' and will be removed soon. Please update your code accordingly.' + ) mlp = _get_mlp_module_spec( - use_te=False, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm + use_te=False, + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, ) if multi_latent_attention: @@ -213,63 +234,33 @@ def _get_mlp_module_spec( use_te: Optional[bool] = True, num_experts: Optional[int] = None, moe_grouped_gemm: Optional[bool] = False, - fp8: Optional[str] = None, + fp8: Optional[str] = None, # pylint: disable=unused-arguments + moe_use_legacy_grouped_gemm: Optional[bool] = False, ) -> ModuleSpec: - """Helper function to get module spec for MLP""" - if num_experts is not None: - moe_spec = _get_moe_module_spec( - use_te=True, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, fp8=fp8 + """Helper function to get module spec for MLP/MoE""" + if fp8 is not None: + warnings.warn( + 'The fp8 argument in "_get_mlp_module_spec" has been deprecated' + ' and will be removed soon. Please update your code accordingly.' 
) - return moe_spec - - return ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, - linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, - ), - ) - -def _get_moe_module_spec( - use_te: Optional[bool] = True, - num_experts: Optional[int] = None, - moe_grouped_gemm: Optional[bool] = False, - fp8: Optional[str] = None, -) -> ModuleSpec: - """Helper function to get module spec for MoE""" if num_experts is None: - return None - if use_te and moe_grouped_gemm: - linear_fc1 = TEColumnParallelGroupedLinear - linear_fc2 = TERowParallelGroupedLinear - elif use_te and fp8: - linear_fc1 = TEColumnParallelLinear - linear_fc2 = TERowParallelLinear - else: - linear_fc1 = ColumnParallelLinear - linear_fc2 = RowParallelLinear - - use_te_grouped_gemm = use_te and TEColumnParallelGroupedLinear is not None - - return ModuleSpec( - module=MoELayer, - submodules=MoESubmodules( - experts=( - MLPSubmodules(linear_fc1=linear_fc1, linear_fc2=linear_fc2) - if not moe_grouped_gemm or use_te_grouped_gemm - else None - ), - shared_experts=ModuleSpec( - module=SharedExpertMLP, - params={"gate": False}, - submodules=MLPSubmodules( - linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, - linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, - ), + # Dense MLP w/ or w/o TE modules. + return ModuleSpec( + module=MLP, + submodules=MLPSubmodules( + linear_fc1=TELayerNormColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, ), - ), - ) + ) + else: + # Mixture of experts with modules in megatron core. + return get_moe_module_spec( + use_te=use_te, + num_experts=num_experts, + moe_grouped_gemm=moe_grouped_gemm, + moe_use_legacy_grouped_gemm=moe_use_legacy_grouped_gemm, + ) def get_gpt_decoder_block_spec( @@ -288,7 +279,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=False, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, - fp8=config.fp8, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) if use_transformer_engine else get_gpt_layer_local_spec( @@ -296,6 +287,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=False, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) ) moe_layer_spec = ( @@ -304,7 +296,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, - fp8=config.fp8, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) if use_transformer_engine else get_gpt_layer_local_spec( @@ -312,6 +304,7 @@ def get_gpt_decoder_block_spec( moe_grouped_gemm=config.moe_grouped_gemm, qk_layernorm=config.qk_layernorm, multi_latent_attention=config.multi_latent_attention, + moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm, ) ) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/gpt/moe_module_specs.py b/megatron/core/models/gpt/moe_module_specs.py new file mode 100644 index 0000000000000000000000000000000000000000..513eeddc7e3a12824d97fd12b3b66a644c3ecee7 --- /dev/null +++ b/megatron/core/models/gpt/moe_module_specs.py @@ -0,0 +1,81 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
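Illustrative aside, not part of the patch: `_get_mlp_module_spec` above now returns a plain MLP spec when `num_experts` is None and otherwise delegates to `get_moe_module_spec`, defined in the new `moe_module_specs.py` whose contents follow. A minimal sketch of calling the new helper directly, assuming Transformer Engine is installed and an illustrative 8-expert configuration:

    from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec

    # MoE spec with grouped-GEMM experts plus a shared-expert MLP.
    moe_spec = get_moe_module_spec(
        use_te=True,
        num_experts=8,
        moe_grouped_gemm=True,
        moe_use_legacy_grouped_gemm=False,
    )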
+ +import warnings +from typing import Optional + +from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear +from megatron.core.transformer.mlp import MLPSubmodules +from megatron.core.transformer.moe.experts import GroupedMLP, SequentialMLP, TEGroupedMLP +from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules +from megatron.core.transformer.moe.shared_experts import SharedExpertMLP +from megatron.core.transformer.spec_utils import ModuleSpec +from megatron.core.utils import get_te_version, is_te_min_version + +try: + from megatron.core.extensions.transformer_engine import ( + TEColumnParallelGroupedLinear, + TEColumnParallelLinear, + TERowParallelGroupedLinear, + TERowParallelLinear, + ) + + HAVE_TE = True +except ImportError: + HAVE_TE = False + + +def get_moe_module_spec( + use_te: Optional[bool] = True, + num_experts: Optional[int] = None, + moe_grouped_gemm: Optional[bool] = False, + moe_use_legacy_grouped_gemm: Optional[bool] = False, +) -> ModuleSpec: + """Helper function to get module spec for MoE""" + assert num_experts is not None + + mlp = MLPSubmodules( + linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear, + linear_fc2=TERowParallelLinear if use_te else RowParallelLinear, + ) + + # experts spec + if moe_grouped_gemm: + ## use GroupedMLP + if use_te and TEColumnParallelGroupedLinear is not None and not moe_use_legacy_grouped_gemm: + ## use TEGroupedLinear + expert_module = TEGroupedMLP + expert_submodule = MLPSubmodules( + linear_fc1=TEColumnParallelGroupedLinear, linear_fc2=TERowParallelGroupedLinear + ) + else: + ## use legacy GroupedMLP + expert_module = GroupedMLP + expert_submodule = None + warnings.warn( + 'The legacy GroupedMLP will be deprecated in Megatron-Core v0.12.0. ' + 'Please update the TransformerEngine to version>=1.7.0 and use TEGroupedMLP.' + ) + else: + ## use SequentialMLP + expert_module = SequentialMLP + if use_te and not is_te_min_version("1.7.0.dev0"): + warnings.warn( + "Only transformer-engine>=1.7.0 supports MoE experts, " + f"but your version is {get_te_version()}. Use local linear implementation instead." 
+ ) + expert_submodule = MLPSubmodules( + linear_fc1=ColumnParallelLinear, linear_fc2=RowParallelLinear + ) + else: + expert_submodule = mlp + + experts = ModuleSpec(module=expert_module, submodules=expert_submodule) + + # shared experts spec + shared_experts = ModuleSpec(module=SharedExpertMLP, params={"gate": False}, submodules=mlp) + + # MoE module spec + moe_module_spec = ModuleSpec( + module=MoELayer, submodules=MoESubmodules(experts=experts, shared_experts=shared_experts) + ) + return moe_module_spec diff --git a/megatron/core/models/mamba/__init__.py b/megatron/core/models/mamba/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/mamba/mamba_layer_specs.py b/megatron/core/models/mamba/mamba_layer_specs.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/mamba/mamba_model.py b/megatron/core/models/mamba/mamba_model.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/multimodal/__init__.py b/megatron/core/models/multimodal/__init__.py old mode 100755 new mode 100644 diff --git a/megatron/core/models/multimodal/llava_model.py b/megatron/core/models/multimodal/llava_model.py old mode 100755 new mode 100644 index dafe377456d81eea1973328de8dfe60da6761d5c..3de68b5091719cfd3638cf38ea37ab1963be2fda --- a/megatron/core/models/multimodal/llava_model.py +++ b/megatron/core/models/multimodal/llava_model.py @@ -36,6 +36,7 @@ IGNORE_INDEX = -100 # ID for labels that should be ignored. # Image token index can be tokenizer dependent so the default value does not work in all cases. DEFAULT_IMAGE_TOKEN_INDEX = -200 IMAGE_TOKEN = "" +VIDEO_TOKEN = "