Initial commit

2d2fca6c · jerrrrry · 2d2fca6c · 2d2fca6c · 2d2fca6c · 2d2fca6c
Commit 2d2fca6c authored Feb 12, 2026 by jerrrrry
17 changed files
--- a/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_12.sh
+++ b/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_12.sh
+#!/bin/bash
+
+# Configures and submits one run of the "figure 12" experiment.
+#
+# Edit the two options below, then run this script from the directory that
+# contains CONFIG.sh and SBATCH.sh; both are sourced relative to the current
+# working directory.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Interleaved schedule options = [YES, NO].
+INTERLEAVED=YES
+
+# Batch size (global batch size) options = [12, 24, 36, ..., 60].
+GBS=12
+
+
+# Set interleaved schedule options.
+# The operand is quoted inside [[ ]] so an empty or multi-word value falls
+# through to the "Invalid configuration" branch instead of breaking the test.
+if [[ "${INTERLEAVED}" == "YES" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
+elif [[ "${INTERLEAVED}" == "NO" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+else
+    echo "Invalid configuration" >&2
+    exit 1
+fi
+
+
+# Other params.
+# NOTE(review): none of these are referenced again in this script; presumably
+# the sourced CONFIG.sh / SBATCH.sh read them -- confirm before renaming.
+TP=8
+PP=12
+MBS=1
+NLS=96
+HS=12288
+NAH=96
+DDP=local
+NNODES=12
+
+
+# Name of the job.
+export JOB_NAME="results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}"
+
+
+# Import the configs.
+# The path is quoted so a working directory containing whitespace still works.
+. "$(pwd)/CONFIG.sh"
+
+
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+
+
+exit 0
+
+
+
--- a/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_13.sh
+++ b/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_13.sh
+#!/bin/bash
+
+# Configures and submits one run of the "figure 13" experiment.
+#
+# Edit the two options below, then run this script from the directory that
+# contains CONFIG.sh and SBATCH.sh; both are sourced relative to the current
+# working directory.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Pipeline-parallel size options = [2, 4, 8, 16, 32].
+PP=2
+
+# Batch size (global batch size) options = [32, 128].
+GBS=32
+
+
+# Set pipeline-parallel and tensor-parallel size options.
+# TP is derived from PP so that PP * TP is 64 for every listed PP option.
+TP=$((64 / PP))
+
+
+# Other params.
+# NOTE(review): not referenced again in this script; presumably read by the
+# sourced CONFIG.sh / SBATCH.sh -- confirm before renaming.
+MBS=1
+NLS=32
+HS=20480
+NAH=128
+DDP=local
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+NNODES=8
+
+
+# Name of the job.
+export JOB_NAME="results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}"
+
+
+# Import the configs.
+# The path is quoted so a working directory containing whitespace still works.
+. "$(pwd)/CONFIG.sh"
+
+
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+
+
+exit 0
+
+
+
--- a/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_14.sh
+++ b/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_14.sh
+#!/bin/bash
+
+# Configures and submits one run of the "figure 14" experiment.
+#
+# Edit the two options below, then run this script from the directory that
+# contains CONFIG.sh and SBATCH.sh; both are sourced relative to the current
+# working directory.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Pipeline-parallel size options = [2, 4, 8, 16, 32].
+PP=2
+
+# Batch size (global batch size) options = [32, 512].
+GBS=32
+
+
+# Set pipeline-parallel and data-parallel size options.
+# DP is derived from PP so that PP * DP is 64 for every listed PP option.
+# Within this script DP is only used to build JOB_NAME.
+DP=$((64 / PP))
+
+
+# Other params.
+# NOTE(review): not referenced again in this script; presumably read by the
+# sourced CONFIG.sh / SBATCH.sh -- confirm before renaming.
+TP=1
+MBS=1
+NLS=32
+HS=3840
+NAH=32
+DDP=local
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+NNODES=8
+
+
+# Name of the job.
+export JOB_NAME="results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}"
+
+
+# Import the configs.
+# The path is quoted so a working directory containing whitespace still works.
+. "$(pwd)/CONFIG.sh"
+
+
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+
+
+exit 0
+
+
+
--- a/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_15.sh
+++ b/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_15.sh
+#!/bin/bash
+
+# Configures and submits one run of the "figure 15" experiment.
+#
+# Edit the two options below, then run this script from the directory that
+# contains CONFIG.sh and SBATCH.sh; both are sourced relative to the current
+# working directory.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Tensor-parallel size options = [2, 4, 8, 16, 32].
+TP=2
+
+# Batch size (global batch size) options = [32, 128, 512].
+GBS=32
+
+
+# Set tensor-parallel and data-parallel size options.
+# DP is derived from TP so that TP * DP is 64 for every listed TP option.
+# Within this script DP is only used to build JOB_NAME.
+DP=$((64 / TP))
+
+
+# Other params.
+# NOTE(review): not referenced again in this script; presumably read by the
+# sourced CONFIG.sh / SBATCH.sh -- confirm before renaming.
+PP=1
+MBS=1
+NLS=32
+HS=3840
+NAH=32
+DDP=local
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+NNODES=8
+
+
+# Name of the job.
+export JOB_NAME="results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}"
+
+
+# Import the configs.
+# The path is quoted so a working directory containing whitespace still works.
+. "$(pwd)/CONFIG.sh"
+
+
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+
+
+exit 0
+
+
+
--- a/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_16.sh
+++ b/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_16.sh
+#!/bin/bash
+
+# Configures and submits one run of the "figure 16" experiment.
+#
+# Edit the two options below, then run this script from the directory that
+# contains CONFIG.sh and SBATCH.sh; both are sourced relative to the current
+# working directory.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Microbatch size options = [1, 2, 4, 8].
+MBS=1
+
+# Batch size (global batch size) options = [128, 512].
+GBS=128
+
+
+# Other params.
+# NOTE(review): not referenced again in this script; presumably read by the
+# sourced CONFIG.sh / SBATCH.sh -- confirm before renaming.
+TP=8
+PP=8
+NLS=32
+HS=15360
+NAH=128
+DDP=local
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+NNODES=8
+
+
+# Name of the job.
+export JOB_NAME="results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}"
+
+
+# Import the configs.
+# The path is quoted so a working directory containing whitespace still works.
+. "$(pwd)/CONFIG.sh"
+
+
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+
+
+exit 0
+
+
+
--- a/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_17.sh
+++ b/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_17.sh
+#!/bin/bash
+
+# Configures and submits one run of the "figure 17" experiment.
+#
+# Edit the two options below, then run this script from the directory that
+# contains CONFIG.sh and SBATCH.sh; both are sourced relative to the current
+# working directory.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Activation recomputation options = [YES, NO].
+ACTIVATION_RECOMPUTATION=YES
+
+# Batch size (global batch size) options = [1, 2, 4, ..., 256].
+GBS=1
+
+
+# Set activation recomputation.
+# The operand is quoted inside [[ ]] so an empty or multi-word value falls
+# through to the "Invalid configuration" branch instead of breaking the test.
+if [[ "${ACTIVATION_RECOMPUTATION}" == "YES" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [[ "${ACTIVATION_RECOMPUTATION}" == "NO" ]]; then
+    MEGATRON_EXTRA_PARAMS=""
+else
+    echo "Invalid configuration" >&2
+    exit 1
+fi
+
+
+# Other params.
+# NOTE(review): not referenced again in this script; presumably read by the
+# sourced CONFIG.sh / SBATCH.sh -- confirm before renaming.
+TP=8
+PP=16
+MBS=1
+NLS=80
+HS=12288
+NAH=96
+DDP=local
+NNODES=16
+
+
+# Name of the job.
+export JOB_NAME="results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}"
+
+
+# Import the configs.
+# The path is quoted so a working directory containing whitespace still works.
+. "$(pwd)/CONFIG.sh"
+
+
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+
+
+exit 0
+
+
+
--- a/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_18.sh
+++ b/Megatron-LM/examples/academic_paper_scripts/sc21/run_figure_18.sh
+#!/bin/bash
+
+# Configures and submits one run of the "figure 18" experiment.
+#
+# Edit the two options below, then run this script from the directory that
+# contains CONFIG.sh and SBATCH.sh; both are sourced relative to the current
+# working directory.
+
+# ================================
+# Choose the case to run.
+# ================================
+
+# Scatter-gather communication optimization options = [YES, NO].
+SCATTER_GATHER=YES
+
+# Batch size (global batch size) options = [12, 24, 36, ..., 60].
+GBS=12
+
+
+# Set scatter-gather communication optimization options.
+# Both branches share the same base flags; "NO" additionally passes
+# --no-scatter-gather-tensors-in-pipeline.
+if [[ "${SCATTER_GATHER}" == "YES" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
+elif [[ "${SCATTER_GATHER}" == "NO" ]]; then
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
+else
+    echo "Invalid configuration" >&2
+    exit 1
+fi
+
+
+# Other params.
+# NOTE(review): not referenced again in this script; presumably read by the
+# sourced CONFIG.sh / SBATCH.sh -- confirm before renaming.
+TP=8
+PP=12
+MBS=1
+NLS=96
+HS=12288
+NAH=96
+DDP=local
+NNODES=12
+
+
+# Name of the job.
+export JOB_NAME="results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}"
+
+
+# Import the configs.
+# The path is quoted so a working directory containing whitespace still works.
+. "$(pwd)/CONFIG.sh"
+
+
+# Submit the job.
+. "$(pwd)/SBATCH.sh"
+
+
+exit 0
+
+
+
--- a/Megatron-LM/examples/academic_paper_scripts/sc21/run_table_1.sh
+++ b/Megatron-LM/examples/academic_paper_scripts/sc21/run_table_1.sh
+#!/bin/bash
+
+# ================================
+# Choose the case to run.
+# ================================
+# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
+MODEL_SIZE=1.7B
+
+
+
+
+
+
+if [ ${MODEL_SIZE} == "1.7B" ]; then
+    TP=1
+    PP=1
+    MBS=16
+    GBS=512
+    NLS=24
+    HS=2304
+    NAH=24
+    DDP=torch
+    NNODES=4
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "3.6B" ]; then
+    TP=2
+    PP=1
+    MBS=16
+    GBS=512
+    NLS=30
+    HS=3072
+    NAH=32
+    DDP=torch
+    NNODES=8
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "7.5B" ]; then
+    TP=4
+    PP=1
+    MBS=16
+    GBS=512
+    NLS=36
+    HS=4096
+    NAH=32
+    DDP=torch
+    NNODES=16
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "18B" ]; then
+    TP=8
+    PP=1
+    MBS=8
+    GBS=1024
+    NLS=40
+    HS=6144
+    NAH=48
+    DDP=torch
+    NNODES=32
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "39B" ]; then
+    TP=8
+    PP=2
+    MBS=4
+    GBS=1536
+    NLS=48
+    HS=8192
+    NAH=64
+    DDP=local
+    NNODES=64
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+elif [ ${MODEL_SIZE} == "76B" ]; then
+    TP=8
+    PP=4
+    MBS=2
+    GBS=1792
+    NLS=60
+    HS=10240
+    NAH=80
+    DDP=local
+    NNODES=128
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
+elif [ ${MODEL_SIZE} == "145B" ]; then
+    TP=8
+    PP=8
+    MBS=2
+    GBS=2304
+    NLS=80
+    HS=12288
+    NAH=96
+    DDP=local
+    NNODES=192
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
+elif [ ${MODEL_SIZE} == "310B" ]; then
+    TP=8
+    PP=16
+    MBS=1
+    GBS=2160
+    NLS=96
+    HS=16384
+    NAH=128
+    DDP=local
+    NNODES=240
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
+elif [ ${MODEL_SIZE} == "530B" ]; then
+    TP=8
+    PP=35
+    MBS=1
+    GBS=2520
+    NLS=105
+    HS=20480
+    NAH=128
+    DDP=local
+    NNODES=315
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
+elif [ ${MODEL_SIZE} == "1T" ]; then
+    TP=8
+    PP=64
+    MBS=1
+    GBS=3072
+    NLS=128
+    HS=25600
+    NAH=160
+    DDP=local
+    NNODES=384
+    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
+else
+    echo "Invalid configuration"
+    exit 1
+fi
+
+
+# Name of the job
+export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
+
+
+# Import the configs.
+. `pwd`/CONFIG.sh
+
+
+# Submit the job.
+. `pwd`/SBATCH.sh
+
+
+exit 0
+
+
+
--- a/Megatron-LM/examples/bert/README.md
+++ b/Megatron-LM/examples/bert/README.md
+# BERT MODEL
+
+## Table of contents
+- [1. Training Setup](#1-training-setup)
+- [2. Configurations](#2-configurations)
+
+## 1. Training setup
+<a id="markdown-training-setup" name="training-setup"></a>
+
+To run the model in a Docker container, use the following commands:
+```
+PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+CHECKPOINT_PATH="" #<Specify path>
+TENSORBOARD_LOGS_PATH="" #<Specify path>
+VOCAB_FILE="" #<Specify path to file>/bert-vocab.txt
+DATA_PATH="" #<Specify path and file prefix>_text_document
+
+docker run \
+  --gpus=all \
+  --ipc=host \
+  --workdir /workspace/megatron-lm \
+  -v /path/to/data:/path/to/data \
+  -v /path/to/megatron-lm:/workspace/megatron-lm \
+  $PYTORCH_IMAGE \
+  bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
+
+```
+NOTE: Depending on the environment you are running in, the above command might look slightly different.
+
+
+## 2. Configurations
+<a id="markdown-configurations" name="configurations"></a>
+The example in this folder shows you how to run the 340M (BERT-Large) model. There are other configs you could run as well:
+
+### 4B
+```
+       --num-layers 48 \
+       --hidden-size 2560 \
+       --num-attention-heads 32 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
+
+### 20B
+```
+       --num-layers 48 \
+       --hidden-size 6144 \
+       --num-attention-heads 96 \
+       --tensor-model-parallel-size 4 \
+       --pipeline-model-parallel-size 4 \
+
+```
\ No newline at end of file
--- a/Megatron-LM/examples/bert/train_bert_340m_distributed.sh
+++ b/Megatron-LM/examples/bert/train_bert_340m_distributed.sh
+#!/bin/bash
+
+# Runs the "340M" parameter model (Bert - Large)
+#
+# Usage:
+#   train_bert_340m_distributed.sh CHECKPOINT_PATH TENSORBOARD_LOGS_PATH VOCAB_FILE [DATA_PATH]
+#
+# pretrain_bert.py is referenced by a relative path, so run this script from
+# the directory that contains it.
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NUM_NODES=1
+NODE_RANK=0
+# Not referenced below; kept as in the original for anything that reads it.
+WORLD_SIZE=$((GPUS_PER_NODE * NUM_NODES))
+
+# Fail fast with a usage message when a required positional argument is
+# missing, instead of handing an option without a value to the trainer.
+USAGE="usage: $0 CHECKPOINT_PATH TENSORBOARD_LOGS_PATH VOCAB_FILE [DATA_PATH]"
+CHECKPOINT_PATH=${1:?${USAGE}} #<Specify path>
+TENSORBOARD_LOGS_PATH=${2:?${USAGE}} #<Specify path>
+VOCAB_FILE=${3:?${USAGE}} #<Specify path to file>/bert-vocab.txt
+DATA_PATH=$4 #<Specify path and file prefix>_text_document
+
+DISTRIBUTED_ARGS=(
+    --nproc_per_node "$GPUS_PER_NODE"
+    --nnodes "$NUM_NODES"
+    # Forward the node rank so the multinode settings above take effect.
+    --node_rank "$NODE_RANK"
+    --master_addr "$MASTER_ADDR"
+    --master_port "$MASTER_PORT"
+)
+
+BERT_MODEL_ARGS=(
+    --num-layers 24
+    --hidden-size 1024
+    --num-attention-heads 16
+    --seq-length 512
+    --max-position-embeddings 512
+    --attention-backend auto # Can use (flash/fused/unfused/local)
+)
+
+# --weight-decay and --clip-grad were listed twice with identical values;
+# each is now passed once.
+TRAINING_ARGS=(
+    --micro-batch-size 4
+    --global-batch-size 32
+    --train-iters 1000000
+    --weight-decay 1e-2
+    --clip-grad 1.0
+    --fp16
+    --lr 0.0001
+    --lr-decay-iters 990000
+    --lr-decay-style linear
+    --min-lr 1.0e-5
+    --lr-warmup-fraction .01
+)
+
+# NOTE(review): 8 x 16 = 128 model-parallel ranks, while GPUS_PER_NODE *
+# NUM_NODES above is 8, and 24 layers do not divide evenly into 16 pipeline
+# stages -- confirm these sizes against the intended cluster before running.
+MODEL_PARALLEL_ARGS=(
+    --tensor-model-parallel-size 8
+    --pipeline-model-parallel-size 16
+)
+
+# $DATA_PATH is left unquoted, as in the original, so a multi-word value is
+# still split into several --data-path values.
+DATA_ARGS=(
+    --data-path $DATA_PATH
+    --vocab-file "$VOCAB_FILE"
+    --split 949,50,1
+)
+
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 100
+    --save-interval 10000
+    --eval-interval 1000
+    --save "$CHECKPOINT_PATH"
+    --load "$CHECKPOINT_PATH"
+    --eval-iters 10
+    --tensorboard-dir "$TENSORBOARD_LOGS_PATH"
+)
+
+# Quoted array expansions keep every element as a single argument.
+torchrun "${DISTRIBUTED_ARGS[@]}" pretrain_bert.py \
+    "${BERT_MODEL_ARGS[@]}" \
+    "${TRAINING_ARGS[@]}" \
+    "${MODEL_PARALLEL_ARGS[@]}" \
+    "${DATA_ARGS[@]}" \
+    "${EVAL_AND_LOGGING_ARGS[@]}"
+    
\ No newline at end of file
--- a/Megatron-LM/examples/export/README.md
+++ b/Megatron-LM/examples/export/README.md
+# Megatron Core Export
+
+This module is used to export Megatron Core models to different inference frameworks.
+Currently we support TRTLLM export. In the future we will be adding support for vLLM etc.
+
+## PTQ AND EXPORT
+Follow the examples of [TensorRT Model Optimizer](../post_training/modelopt) to perform post training quantization, followed by an export to a HF-like checkpoint for TensorRT-LLM, vLLM, and SGLang deployment. 
+
+# TRTLLM EXPORT
+Follow the instructions in [trtllm_export](./trtllm_export/) to do export to TRTLLM checkpoint format alone.
--- a/Megatron-LM/examples/export/trtllm_export/README.md
+++ b/Megatron-LM/examples/export/trtllm_export/README.md
+# Megatron Core To TRTLLM Export Documentation
+This guide will walk you through how you can use the megatron core export for exporting models to trtllm format
+
+### Contents
+- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation)
+- [Contents](#contents)
+  - [1. Quick Start](#1-quick-start)
+    - [1.1 Understanding The Code](#11-understanding-the-code)
+    - [1.2 Running The Code](#12-running-the-code)
+  - [2. GPU Export](#2-gpu-export)
+  - [3. Future work](#3-future-work)
+
+#### 1. Quick Start
+This will walk you through the flow of converting an mcore gpt model to trtllm format using single device mode. The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py)
+
+NOTE: For faster performance, if your entire model will fit into gpu memory, pre transfer the model state dict to gpu and then call the get_trtllm_pretrained_config_and_model_weights function.
+
+<br>
+
+##### 1.1 Understanding The Code
+***STEP 1 - We initialize model parallel and other default arguments***
+We initialize tp and pp to 1 so that we can get the full model state dict on cpu
+```python
+    initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+```
+
+***STEP 2 - We load the model using the model_provider_function***
+NOTE: We create a simple gpt model
+
+```python
+    transformer_config = TransformerConfig(
+        num_layers=2, 
+        hidden_size=64, # Needs to be at least 32 times num_attn_heads
+        num_attention_heads=2, 
+        use_cpu_initialization=True, 
+        pipeline_dtype=torch.float32,
+    )
+
+    gpt_model = GPTModel(
+        config=transformer_config, 
+        transformer_layer_spec=get_gpt_layer_local_spec(), 
+        vocab_size=100, 
+        max_sequence_length=_SEQUENCE_LENGTH,
+    )
+
+    # Optionally you can also load a model using this code 
+    # sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
+    # checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    # gpt_model.load_state_dict(checkpoint)
+
+```
+
+***STEP 3 - Instantiate the TRTLLM Helper***
+We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py). For the GPT model we instantiate trtllm_helper as shown below.
+```python
+    seq_len_interpolation_factor = None
+    if hasattr(gpt_model, "rotary_pos_emb"):
+        seq_len_interpolation_factor =  gpt_model.rotary_pos_emb.seq_len_interpolation_factor
+
+    trtllm_helper = TRTLLMHelper(
+                        transformer_config=gpt_model.config, 
+                        model_type=ModelType.gpt,
+                        position_embedding_type = gpt_model.position_embedding_type, 
+                        max_position_embeddings = gpt_model.max_position_embeddings, 
+                        rotary_percentage = gpt_model.rotary_percent,
+                        rotary_base = gpt_model.rotary_base,
+                        moe_tp_mode = 2,
+                        multi_query_mode = False,
+                        activation = "gelu", 
+                        seq_len_interpolation_factor = seq_len_interpolation_factor,
+                        share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
+                    )   
+```
+
+***STEP 4 - Get the TRTLLM Weights and configs***
+To convert model weights to trtllm weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass as inputs the model state dict, and export config. In this example we use inference tp size as 2 for the export. 
+
+```python
+    model_state_dict={}
+    for key , val in gpt_model.state_dict().items():
+        # val is None for _extra_state layers. We filter it out
+        if val is not None:
+            model_state_dict[key] = val
+
+    export_config = ExportConfig(inference_tp_size = 2)
+    weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+        model_state_dict= model_state_dict,
+        dtype = DataType.bfloat16,
+        export_config=export_config
+    )
+```
+
+***STEP 5 - Build the TRTLLM Engine***
+Following code is used to build the TRTLLM Engine. 
+
+```python
+    for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
+        trtllm_helper.build_and_save_engine(
+            max_input_len=256,
+            max_output_len=256,
+            max_batch_size=8,
+            engine_dir='/opt/megatron-lm/engine',
+            trtllm_model_weights=trtllm_model_weights,
+            trtllm_model_config=trtllm_model_config,
+            lora_ckpt_list=None,
+            use_lora_plugin=None,
+            max_lora_rank=64,
+            lora_target_modules=None,
+            max_prompt_embedding_table_size=0,
+            paged_kv_cache=True,
+            remove_input_padding=True,
+            paged_context_fmha=False,
+            use_refit=False,
+            max_num_tokens=None,
+            max_seq_len=512,
+            opt_num_tokens=None,
+            max_beam_width=1,
+            tokens_per_block=128,
+            multiple_profiles=False,
+            gpt_attention_plugin="auto",
+            gemm_plugin="auto",
+        )
+```
+<br>
+
+##### 1.2 Running The Code
+An example run script is shown below. 
+
+```
+# In a workstation 
+MLM_PATH=/path/to/megatron-lm
+CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86
+
+docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash
+
+# Inside the container run the following. 
+
+cd /opt/megatron-lm/
+
+CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1  examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
+```
+
+<br>
+
+#### 2. GPU Export
+You can use the [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) to run a more optimized, on-device distributed version of trtllm export. Internally this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device.
+In the single device version you collect all the model weights on CPU/GPU, convert it to trtllm format, and then store the engine back on disk. In the GPU version you load each individual state dict on the gpus, convert it on the device itself and store the engine on disk. 
+
+To run the gpu version 
+
+```
+CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2  examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
+```
+
+<br>
+
+#### 3. Future work
+The following are planned for future releases.
+* Pipeline parallelism for export (Work in progress)
+* GPU Export for more models (Work in progress for some models)
+* Refit functionality
+* VLLM Support
\ No newline at end of file
--- a/Megatron-LM/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
+++ b/Megatron-LM/examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
+import os
+import torch
+from megatron.core import parallel_state
+from megatron.core import dist_checkpointing
+from megatron.core.export.model_type import ModelType
+from megatron.core.export.data_type import DataType
+from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+
+
+_SEQUENCE_LENGTH = 64  # passed to GPTModel as max_sequence_length in model_provider()
+_VOCAB_SIZE = 256  # passed to GPTModel and to the TRTLLM weight conversion call as vocab_size
+
+def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
+    parallel_state.destroy_model_parallel()
+
+    # Torch setup for distributed training
+    rank = int(os.environ['LOCAL_RANK'])
+    world_size = torch.cuda.device_count()
+    torch.cuda.set_device(rank)
+    torch.distributed.init_process_group(world_size=world_size, rank=rank)
+
+    # Megatron core distributed training initialization
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size = tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size)
+
+def model_provider():
+    """Build the model."""
+
+    transformer_config = TransformerConfig(
+        num_layers=2, 
+        hidden_size=64, 
+        num_attention_heads=2, 
+        use_cpu_initialization=True, 
+        pipeline_dtype=torch.float32
+    )
+
+    gpt_model = GPTModel(
+        config=transformer_config, 
+        transformer_layer_spec=get_gpt_layer_local_spec(), 
+        vocab_size=_VOCAB_SIZE, 
+        max_sequence_length=_SEQUENCE_LENGTH,
+    )
+
+    return gpt_model
+
+def load_distributed_checkpoint(checkpoint_path, gpt_model):
+    sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
+    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    gpt_model.load_state_dict(checkpoint)
+    return gpt_model
+
+if __name__ == "__main__":
+    initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
+    model_parallel_cuda_manual_seed(123)
+
+    gpt_model = model_provider()
+    device = torch.device("cuda")
+    gpt_model.to(device) 
+    
+    # Optionally you can also load a gpt model from ckpt_path using this code below
+    # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
+
+    seq_len_interpolation_factor = None
+    if hasattr(gpt_model, "rotary_pos_emb"):
+        seq_len_interpolation_factor =  gpt_model.rotary_pos_emb.seq_len_interpolation_factor
+
+    trtllm_helper = TRTLLMHelper(
+                        transformer_config=gpt_model.config, 
+                        model_type=ModelType.gpt,
+                        position_embedding_type = gpt_model.position_embedding_type, 
+                        max_position_embeddings = gpt_model.max_position_embeddings, 
+                        rotary_percentage = gpt_model.rotary_percent,
+                        rotary_base = gpt_model.rotary_base,
+                        moe_tp_mode = 2,
+                        multi_query_mode = False,
+                        activation = "gelu", 
+                        seq_len_interpolation_factor = seq_len_interpolation_factor,
+                        share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
+                    )
+    
+
+    trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+        model_state_dict= gpt_model.state_dict(),
+        dtype = DataType.bfloat16,
+        on_device_distributed_conversion=True, 
+        vocab_size=_VOCAB_SIZE, 
+        gpus_per_node=2,
+    )
+
+    trtllm_helper.build_and_save_engine(
+        max_input_len=256,
+        max_output_len=256,
+        max_batch_size=8,
+        engine_dir='/opt/megatron-lm/engine',
+        trtllm_model_weights=trtllm_model_weights[0],
+        trtllm_model_config=trtllm_model_config[0],
+        lora_ckpt_list=None,
+        use_lora_plugin=None,
+        max_lora_rank=64,
+        lora_target_modules=None,
+        max_prompt_embedding_table_size=0,
+        paged_kv_cache=True,
+        remove_input_padding=True,
+        paged_context_fmha=False,
+        use_refit=False,
+        max_num_tokens=None,
+        max_seq_len=512,
+        opt_num_tokens=None,
+        max_beam_width=1,
+        tokens_per_block=128,
+        multiple_profiles=False,
+        gpt_attention_plugin="auto",
+        gemm_plugin="auto",
+    )
--- a/Megatron-LM/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
+++ b/Megatron-LM/examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
+import os
+import torch
+from megatron.core import parallel_state
+from megatron.core import dist_checkpointing
+from megatron.core.export.model_type import ModelType
+from megatron.core.export.data_type import DataType
+from megatron.core.export.export_config import ExportConfig
+from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+
+
+_SEQUENCE_LENGTH = 64  # passed to GPTModel as max_sequence_length in model_provider()
+
+
+def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
+    parallel_state.destroy_model_parallel()
+
+    # Torch setup for distributed training
+    rank = int(os.environ['LOCAL_RANK'])
+    world_size = torch.cuda.device_count()
+    torch.cuda.set_device(rank)
+    torch.distributed.init_process_group(world_size=world_size, rank=rank)
+
+    # Megatron core distributed training initialization
+    parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)
+
+def model_provider():
+    """Build the model."""
+
+    transformer_config = TransformerConfig(
+        num_layers=2, 
+        hidden_size=64, # Needs to be atleast 32 times num_attn_heads
+        num_attention_heads=2, 
+        use_cpu_initialization=True, 
+        pipeline_dtype=torch.float32,
+    )
+
+    gpt_model = GPTModel(
+        config=transformer_config, 
+        transformer_layer_spec=get_gpt_layer_local_spec(), 
+        vocab_size=100, 
+        max_sequence_length=_SEQUENCE_LENGTH,
+    )
+
+    return gpt_model
+
+def load_distributed_checkpoint(checkpoint_path, gpt_model):
+    sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
+    checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
+    gpt_model.load_state_dict(checkpoint)
+    return gpt_model
+
+if __name__ == "__main__":
+    # Need to use TP1 PP1 for export on single device
+    initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+    model_parallel_cuda_manual_seed(123)
+
+    gpt_model = model_provider()
+
+    # Optionally you can also load a gpt model from ckpt_path using this code below
+    # gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
+
+    seq_len_interpolation_factor = None
+    if hasattr(gpt_model, "rotary_pos_emb"):
+        seq_len_interpolation_factor =  gpt_model.rotary_pos_emb.seq_len_interpolation_factor
+
+    trtllm_helper = TRTLLMHelper(
+                        transformer_config=gpt_model.config, 
+                        model_type=ModelType.gpt,
+                        position_embedding_type = gpt_model.position_embedding_type, 
+                        max_position_embeddings = gpt_model.max_position_embeddings, 
+                        rotary_percentage = gpt_model.rotary_percent,
+                        rotary_base = gpt_model.rotary_base,
+                        moe_tp_mode = 2,
+                        multi_query_mode = False,
+                        activation = "gelu", 
+                        seq_len_interpolation_factor = seq_len_interpolation_factor,
+                        share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
+                    )
+    
+
+    export_config = ExportConfig(inference_tp_size = 2)
+    # NOTE : For faster performance, if your entire model will fit in gpu memory, transfer model state dict to GPU and then call this api
+    weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+        model_state_dict= gpt_model.state_dict(),
+        dtype = DataType.bfloat16,
+        export_config=export_config
+    )
+
+    for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
+        trtllm_helper.build_and_save_engine(
+            max_input_len=256,
+            max_output_len=256,
+            max_batch_size=8,
+            engine_dir='/opt/megatron-lm/engine',
+            trtllm_model_weights=trtllm_model_weights,
+            trtllm_model_config=trtllm_model_config,
+            lora_ckpt_list=None,
+            use_lora_plugin=None,
+            max_lora_rank=64,
+            lora_target_modules=None,
+            max_prompt_embedding_table_size=0,
+            paged_kv_cache=True,
+            remove_input_padding=True,
+            paged_context_fmha=False,
+            use_refit=False,
+            max_num_tokens=None,
+            max_seq_len=512,
+            opt_num_tokens=None,
+            max_beam_width=1,
+            tokens_per_block=128,
+            multiple_profiles=False,
+            gpt_attention_plugin="auto",
+            gemm_plugin="auto",
+        )
\ No newline at end of file
--- a/Megatron-LM/examples/gpt3/README.md
+++ b/Megatron-LM/examples/gpt3/README.md
+# GPT3 MODEL
+
+## Table of contents
+- [1. Training Setup](#1-training-setup)
+- [2. Configurations](#2-configurations)
+- [3. Training Results](#3-training-results)
+
+## 1. Training setup
+<a id="markdown-training-setup" name="training-setup"></a>
+
+To run the model in a Docker container, use the following commands:
+```
+PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
+CHECKPOINT_PATH="" #<Specify path>
+TENSORBOARD_LOGS_PATH="" #<Specify path>
+VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
+MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
+DATA_PATH="" #<Specify path and file prefix>_text_document
+
+docker run \
+  --gpus=all \
+  --ipc=host \
+  --workdir /workspace/megatron-lm \
+  -v /path/to/data:/path/to/data \
+  -v /path/to/megatron-lm:/workspace/megatron-lm \
+  $PYTORCH_IMAGE \
+  bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
+
+```
+NOTE: Depending on the environment you are running in, the above command might look slightly different.
+
+
+## 2. Configurations
+<a id="markdown-configurations" name="configurations"></a>
+The example in this folder shows you how to run the 175B model. There are other configurations you could run as well:
+
+### 345M
+```
+       --num-layers 12 \
+       --hidden-size 512 \
+       --num-attention-heads 8 \
+       --seq-length 1024 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
+
+### 857M
+```
+       --num-layers 24 \
+       --hidden-size 1024 \
+       --num-attention-heads 16 \
+       --seq-length 2048 \
+       --tensor-model-parallel-size 1 \
+       --pipeline-model-parallel-size 1 \
+
+```
--- a/Megatron-LM/examples/gpt3/gpt_config.yaml
+++ b/Megatron-LM/examples/gpt3/gpt_config.yaml
+# WARNING: YAML configs are currently an experimental feature
+language_model:
+  # model architecture
+  num_layers: 24
+  hidden_size: 1024
+  num_attention_heads: 16
+  num_query_groups: null
+
+  ffn_hidden_size: null
+  kv_channels: null
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  fp32_residual_connection: False
+
+  apply_residual_connection_post_layernorm: False
+  layernorm_epsilon: 1.e-5
+  layernorm_zero_centered_gamma: True
+  add_bias_linear: False
+  bias_activation_fusion: False
+  add_qkv_bias: False
+  gated_linear_unit: False
+  activation_func: swiglu
+  num_moe_experts: null
+  rotary_interleaved: False
+  window_size: null
+
+  # initialization
+  init_method: null
+  init_method_std: 0.02
+  output_layer_init_method: null
+
+  # mixed-precision
+  apply_query_key_layer_scaling: False
+  attention_softmax_in_fp32: False
+
+  # fusion
+  bias_swiglu_fusion: True
+  masked_softmax_fusion: True
+  persist_layer_norm: False
+  memory_efficient_layer_norm: False
+  bias_dropout_fusion: True
+  apply_rope_fusion: True
+
+  # activation recomputation
+  recompute_granularity: null
+  recompute_method: null
+  recompute_num_layers: null
+  distribute_saved_activations: null
+
+  # fp8 related
+  fp8: null
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1
+  fp8_amax_compute_algo: "most_recent"
+  fp8_wgrad: True
+
+  # miscellaneous
+  clone_scatter_output_in_embedding: True
+
+  normalization: "LayerNorm"  # alt value supported by TE: "RMSNorm"
+
+  # MoE related
+  moe_router_load_balancing_type: "aux_loss"
+  moe_router_topk: 2
+  moe_router_group_topk: null
+  moe_router_num_groups: null
+  moe_grouped_gemm: False
+  moe_aux_loss_coeff: 0  # 1e-2 would be a good start value for load balance loss.
+  moe_z_loss_coeff: null  # 1e-3 would be a good start value for z-loss
+  moe_input_jitter_eps: null
+  moe_token_dropping: False
+
+model_parallel:
+  # Model parallelism
+  tensor_model_parallel_size: 1
+  context_parallel_size: 1
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  sequence_parallel: True
+  expert_model_parallel_size: 1
+
+  # Initialization
+  perform_initialization: True
+  use_cpu_initialization: null
+
+  # Training
+  fp16: False
+  bf16: True
+  params_dtype: null # Set from above arguments for core
+  timers: null
+
+  # Optimizations
+  gradient_accumulation_fusion: True
+  async_tensor_model_parallel_allreduce: True
+  tp_comm_overlap: False
+
+  # Debug Options
+  tp_comm_split_ag: True
+  tp_comm_atomic_ag: True
+  tp_comm_split_rs: True
+  tp_comm_atomic_rs: True
+  tp_comm_bulk_wgrad: True
+  tp_comm_bulk_dgrad: True
+
+  # Parallelism
+  finalize_model_grads_func: null
+
+  # Pipeline Parallel
+  pipeline_dtype: null
+  grad_scale_func: null
+  enable_autocast: False
+  autocast_dtype: null
+  variable_seq_lengths: False
+  num_microbatches_with_partial_activation_checkpoints: null
+  overlap_p2p_comm: False
+  batch_p2p_comm: True
+  batch_p2p_sync: True
+  use_ring_exchange_p2p: False
+  deallocate_pipeline_outputs: False
+  no_sync_func: null
+  grad_sync_func: null
+  param_sync_func: null
+
+  # CPU Offloading
+  cpu_offloading: False
+  cpu_offloading_num_layers: 0
+  _cpu_offloading_context: null
+  cpu_offloading_weights: False
+  cpu_offloading_activations: True
+
+  # Timing
+  barrier_with_L1_time: True
+
+# training:
+use_legacy_models: False
+spec: null
+micro_batch_size: 2
+global_batch_size: 128
+rampup_batch_size: [32, 32, 65324160] 
+check_for_nan_in_loss_and_grad: True
+num_layers_per_virtual_pipeline_stage: null
+
+encoder_num_layers: null
+decoder_num_layers: null
+rotary_seq_len_interpolation_factor: null
+add_position_embedding: False
+make_vocab_size_divisible_by: 128
+group_query_attention: False
+
+
+exit_signal_handler: False
+exit_duration_in_mins: null
+exit_interval: null
+
+untie_embeddings_and_output_weights: True
+position_embedding_type: rope
+rotary_percent: 0.5
+openai_gelu: False
+squared_relu: False
+swiglu: True
+onnx_safe: null
+bert_binary_head: True
+max_position_embeddings: 4096
+
+transformer_impl: local
+use_flash_attn: False
+seed: 1234
+data_parallel_random_init: False
+
+# Optimizer
+optimizer: adam
+lr: 2.5e-4
+lr_decay_style: cosine
+lr_decay_iters: null
+lr_decay_samples: 255126953
+lr_warmup_fraction: null
+lr_warmup_iters: 0
+lr_warmup_samples: 81381
+lr_warmup_init: 0.0
+min_lr: 2.5e-5
+weight_decay: 0.1
+start_weight_decay: null
+end_weight_decay: null
+weight_decay_incr_style: constant
+clip_grad: 1.0
+adam_beta1: 0.9
+adam_beta2: 0.95
+adam_eps: 1.e-08
+sgd_momentum: 0.9
+override_opt_param_scheduler: False
+use_checkpoint_opt_param_scheduler: False
+
+# checkpointing arguments
+save: null
+save_interval: 20000
+no_save_optim: null
+no_save_rng: null
+load: null
+no_load_optim: null
+no_load_rng: null
+finetune: False
+use_checkpoint_args: False
+exit_on_missing_checkpoint: False
+
+# loss arguments
+loss_scale: null
+initial_loss_scale: 4294967296
+min_loss_scale: 1.0
+loss_scale_window: 1000 
+hysteresis: 2
+accumulate_allreduce_grads_in_fp32: False
+fp16_lm_cross_entropy: False
+
+# distributed arguments
+distributed_backend: nccl
+distributed_timeout_minutes: 10
+overlap_grad_reduce: False
+align_grad_reduce: True
+overlap_param_gather: False
+align_param_gather: False
+scatter_gather_tensors_in_pipeline: True
+local_rank: null
+lazy_mpu_init: null
+empty_unused_memory_level: 0
+standalone_embedding_stage: False
+use_distributed_optimizer: False
+nccl_communicator_config_path: null
+
+train_iters: null
+eval_iters: 32
+eval_interval: 2000
+skip_train: False
+
+adlr_autoresume: False
+adlr_autoresume_interval: 1000
+
+# garbage collection
+manual_gc: False
+manual_gc_interval: 0
+manual_gc_eval: True
+
+tp_comm_overlap_cfg: null
+
+#data
+data_path: null
+split: '99,1,0'
+train_data_path: null
+valid_data_path: null
+test_data_path: null
+data_cache_path: null
+mock_data: False
+vocab_size: null
+vocab_file: null
+merge_file: null
+vocab_extra_ids: 0
+seq_length: 4096
+encoder_seq_length: null
+decoder_seq_length: null
+retriever_seq_length: 256
+sample_rate: 1.0
+mask_prob: 0.15
+short_seq_prob: 0.1
+num_workers: 2
+tokenizer_type: GPTSentencePieceTokenizer
+tokenizer_model: null
+reset_position_ids: False
+reset_attention_mask: False
+eod_mask_loss: False
+train_samples: 268554688
+dataloader_type: null
+
+#profile:
+profile: False
+profile_ranks: [0]
+profile_step_end: 12
+profile_step_start: 10
+
+#logging:
+log_params_norm: True
+log_num_zeros_in_grad: True
+log_throughput: False
+log_progress: False
+timing_log_level: 0
+timing_log_option: minmax
+tensorboard_log_interval: 1
+tensorboard_queue_size: 1000
+log_timers_to_tensorboard: False
+log_validation_ppl_to_tensorboard: False
+log_memory_to_tensorboard: False
+log_world_size_to_tensorboard: False
+log_loss_scale_to_tensorboard: True
+wandb_project: ''
+wandb_exp_name: ''
+wandb_save_dir: ''
+enable_one_logger: True
+one_logger_project: megatron-lm
+one_logger_run_name: null
+log_interval: 100
+tensorboard_dir: null
--- a/Megatron-LM/examples/gpt3/train_gpt3_175b_distributed.sh
+++ b/Megatron-LM/examples/gpt3/train_gpt3_175b_distributed.sh
+#!/bin/bash
+
+# Runs the "175B" parameter model
+
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NUM_NODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
+
+CHECKPOINT_PATH=$1 #<Specify path>
+TENSORBOARD_LOGS_PATH=$2 #<Specify path>
+VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
+MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
+DATA_PATH=$5 #<Specify path and file prefix>_text_document
+
+DISTRIBUTED_ARGS=(
+    --nproc_per_node $GPUS_PER_NODE 
+    --nnodes $NUM_NODES 
+    --master_addr $MASTER_ADDR 
+    --master_port $MASTER_PORT
+)
+
+GPT_MODEL_ARGS=(
+    --num-layers 96 
+    --hidden-size 12288 
+    --num-attention-heads 96 
+    --seq-length 2048 
+    --max-position-embeddings 2048 
+    --attention-backend auto # Can use (flash/fused/unfused/local)
+)
+
+TRAINING_ARGS=(
+    --micro-batch-size 1 
+    --global-batch-size 1536 
+    --rampup-batch-size 16 16 5859375 
+    --train-iters 500000 
+    --weight-decay 0.1 
+    --adam-beta1 0.9 
+    --adam-beta2 0.95 
+    --init-method-std 0.006 
+    --clip-grad 1.0 
+    --fp16
+    --lr 6.0e-5 
+    --lr-decay-style cosine 
+    --min-lr 6.0e-6
+    --lr-warmup-fraction .001 
+    --lr-decay-iters 430000 
+)
+
+MODEL_PARALLEL_ARGS=(
+	--tensor-model-parallel-size 8 
+	--pipeline-model-parallel-size 16 
+)
+
+DATA_ARGS=(
+    --data-path $DATA_PATH 
+    --vocab-file $VOCAB_FILE 
+    --merge-file $MERGE_FILE 
+    --split 949,50,1
+)
+
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 100
+    --save-interval 10000 
+    --eval-interval 1000 
+    --save $CHECKPOINT_PATH 
+    --load $CHECKPOINT_PATH 
+    --eval-iters 10
+    --tensorboard-dir $TENSORBOARD_LOGS_PATH 
+)
+
+torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
+    ${GPT_MODEL_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${EVAL_AND_LOGGING_ARGS[@]}