#!/bin/bash
# SLURM options.
export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
# Source code.
export MEGATRON_CODE_DIR=<megatron source code directory>
# This variable is used to mount the relevant part of the filesystem
# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
# launch directory already get mounted; this variable should be used to
# mount the directories that contain the data and tokenizer files.
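# For example (a hypothetical, site-specific value):
#   export DOCKER_MOUNT_DIR="/lustre/datasets/megatron"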
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
# Data and tokenizer files.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>
# Megatron input parameters.
# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
# that are not listed here.
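# For example (hypothetical):
#   export MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "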
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--micro-batch-size ${MBS} \
--global-batch-size ${GBS} \
--num-layers ${NLS} \
--hidden-size ${HS} \
--num-attention-heads ${NAH} \
--DDP-impl ${DDP} \
--data-path ${MEGATRON_DATA} \
--vocab-file ${BPE_VOCAB_FILE} \
--merge-file ${BPE_MERGE_FILE} \
--log-interval 5 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-iters 500 \
--lr-decay-iters 320 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style cosine \
--lr-warmup-fraction 0.01 \
--split 969,30,1 \
--eval-iters 100 \
--eval-interval 1000 \
--clip-grad 1.0 \
--fp16 \
--loss-scale 8192 "
# Reproducing Figures in SC21 Paper
This directory contains some of the scripts that were used to produce the
results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
to appear at [Supercomputing 2021](https://sc21.supercomputing.org/). These
scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
schedulers as well.
## Git commit
To replicate these results use Megatron-LM commit: 6985e58938d40ad91ac07b0fddcfad8132e1447e
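For example, assuming an existing clone of Megatron-LM, the commit can be checked out with:
```
git checkout 6985e58938d40ad91ac07b0fddcfad8132e1447e
```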
## Setup
All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
update the unspecified values (in angle brackets `<...>`) before launching any
scripts.
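For example, a filled-in `CONFIG.sh` might start as follows (all values here are illustrative placeholders, not recommendations):
```
export SLURM_PARTITION=batch
export SLURM_ACCOUNT=my_account
export MEGATRON_CODE_DIR=/path/to/megatron-lm
export DOCKER_MOUNT_DIR=/path/to/data_and_vocab
```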
## Scripts
Below is a list of scripts that can be used to reproduce various figures in our
[paper](https://arxiv.org/pdf/2104.04473.pdf):
* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
for GPT models ranging from 1 billion to 1 trillion parameters.
* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
performance of pipeline parallelism.
* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
the interleaved schedule on a 175B GPT model.
* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
different degrees of pipeline and tensor model parallelism on a model with
162.2 billion parameters.
* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
different degrees of data and pipeline model parallelism on a model with
5.9 billion parameters.
* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
different degrees of data and tensor model parallelism on a model with
5.9 billion parameters.
* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
microbatch size.
* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
activation recomputation.
* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
the scatter-gather communication optimization.
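Each script selects one configuration via the variables at its top (e.g. `PP`, `GBS`), sources `CONFIG.sh`, and submits a single Slurm job through `SBATCH.sh`. For example, to submit the first Figure 11 case:
```
bash ./run_figure_11.sh
```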
#!/bin/bash
sbatch -p ${SLURM_PARTITION} \
-A ${SLURM_ACCOUNT} \
--job-name=${JOB_NAME} \
--nodes=${NNODES} \
--export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
exit 0
#!/bin/bash
#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
THIS_DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${THIS_DIR}/logs
CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
srun -l \
--container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
--container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
--output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [1, 2, 4, 8].
PP=1
# Batch size (global batch size) options = [8, 128].
GBS=8
# Set pipeline-parallel size options.
NLS=$((3*PP))
NNODES=${PP}
# Other params.
TP=8
MBS=1
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# Name of the job.
export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Interleaved schedule options = [YES, NO].
INTERLEAVED=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set interleaved schedule options.
if [ ${INTERLEAVED} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${INTERLEAVED} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 128].
GBS=32
# Set pipeline-parallel and tensor-parallel size options.
TP=$((64/PP))
# Other params.
MBS=1
NLS=32
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 512].
GBS=32
# Set pipeline-parallel and data-parallel size options.
DP=$((64/PP))
# Other params.
TP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Tensor-parallel size options = [2, 4, 8, 16, 32].
TP=2
# Batch size (global batch size) options = [32, 128, 512].
GBS=32
# Set tensor-parallel and data-parallel size options.
DP=$((64/TP))
# Other params.
PP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Microbatch size options = [1, 2, 4, 8].
MBS=1
# Batch size (global batch size) options = [128, 512].
GBS=128
# Other params.
TP=8
PP=8
NLS=32
HS=15360
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Activation recomputation options = [YES, NO].
ACTIVATION_RECOMPUTATION=YES
# Batch size (global batch size) options = [1, 2, 4, ..., 256].
GBS=1
# Set activation recomputation.
if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then
MEGATRON_EXTRA_PARAMS=""
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=16
MBS=1
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=16
# Name of the job.
export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Scatter-gather communication optimization options = [YES, NO].
SCATTER_GATHER=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set scatter-gather communication optimization options.
if [ ${SCATTER_GATHER} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${SCATTER_GATHER} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
MODEL_SIZE=1.7B
if [ ${MODEL_SIZE} == "1.7B" ]; then
TP=1
PP=1
MBS=16
GBS=512
NLS=24
HS=2304
NAH=24
DDP=torch
NNODES=4
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "3.6B" ]; then
TP=2
PP=1
MBS=16
GBS=512
NLS=30
HS=3072
NAH=32
DDP=torch
NNODES=8
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "7.5B" ]; then
TP=4
PP=1
MBS=16
GBS=512
NLS=36
HS=4096
NAH=32
DDP=torch
NNODES=16
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "18B" ]; then
TP=8
PP=1
MBS=8
GBS=1024
NLS=40
HS=6144
NAH=48
DDP=torch
NNODES=32
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "39B" ]; then
TP=8
PP=2
MBS=4
GBS=1536
NLS=48
HS=8192
NAH=64
DDP=local
NNODES=64
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "76B" ]; then
TP=8
PP=4
MBS=2
GBS=1792
NLS=60
HS=10240
NAH=80
DDP=local
NNODES=128
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
elif [ ${MODEL_SIZE} == "145B" ]; then
TP=8
PP=8
MBS=2
GBS=2304
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=192
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ ${MODEL_SIZE} == "310B" ]; then
TP=8
PP=16
MBS=1
GBS=2160
NLS=96
HS=16384
NAH=128
DDP=local
NNODES=240
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
elif [ ${MODEL_SIZE} == "530B" ]; then
TP=8
PP=35
MBS=1
GBS=2520
NLS=105
HS=20480
NAH=128
DDP=local
NNODES=315
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
elif [ ${MODEL_SIZE} == "1T" ]; then
TP=8
PP=64
MBS=1
GBS=3072
NLS=128
HS=25600
NAH=160
DDP=local
NNODES=384
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Name of the job
export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
# BERT MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH=""#<Specify path>
VOCAB_FILE="" #<Specify path to file>//bert-vocab.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
--name megatron-lm \
$PYTORCH_IMAGE \
bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running in, the above command may look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run a 340M-parameter model. There are other configurations you can run as well, listed below.
### 4B
```
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 32 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 20B
```
--num-layers 48 \
--hidden-size 6144 \
--num-attention-heads 96 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 4 \
```
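These flags override the corresponding entries in the `BERT_MODEL_ARGS` and `MODEL_PARALLEL_ARGS` arrays of [train_bert_340m_distributed.sh](./train_bert_340m_distributed.sh). As a minimal sketch for the 20B case (only the values shown above change; the other entries keep their defaults):
```
BERT_MODEL_ARGS=(
    --num-layers 48
    --hidden-size 6144
    --num-attention-heads 96
    --seq-length 512
    --max-position-embeddings 512
    --attention-backend auto
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 4
)
```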
#!/bin/bash
# Runs the "340M" parameter model (BERT-Large)
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
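# Under Slurm, for example, these could be derived as follows (a hypothetical sketch):
#   MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
#   NUM_NODES=$SLURM_JOB_NUM_NODES
#   NODE_RANK=$SLURM_NODEID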
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/bert-vocab.json
DATA_PATH=$4 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
BERT_MODEL_ARGS=(
--num-layers 24
--hidden-size 1024
--num-attention-heads 16
--seq-length 512
--max-position-embeddings 512
--attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
    --micro-batch-size 4
    --global-batch-size 32
    --train-iters 1000000
    --weight-decay 1e-2
    --clip-grad 1.0
    --fp16
    --lr 0.0001
    --lr-decay-iters 990000
    --lr-decay-style linear
    --min-lr 1.0e-5
    --lr-warmup-fraction .01
)
MODEL_PARALLEL_ARGS=(
    # TP * PP must divide WORLD_SIZE (8 GPUs here); 340M BERT-Large needs no model parallelism.
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 1
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun "${DISTRIBUTED_ARGS[@]}" pretrain_bert.py \
    "${BERT_MODEL_ARGS[@]}" \
    "${TRAINING_ARGS[@]}" \
    "${MODEL_PARALLEL_ARGS[@]}" \
    "${DATA_ARGS[@]}" \
    "${EVAL_AND_LOGGING_ARGS[@]}"
# Megatron Core Export
This module is used to export Megatron Core models to different inference frameworks.
Currently we support TRTLLM export. In the future we will add support for vLLM and other frameworks.
## PTQ AND EXPORT
Follow the examples of [TensorRT Model Optimizer](../post_training/modelopt) to perform post training quantization, followed by an export to a HF-like checkpoint for TensorRT-LLM, vLLM, and SGLang deployment.
## TRTLLM EXPORT
Follow the instructions in [trtllm_export](./trtllm_export/) to export to the TRTLLM checkpoint format alone.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Pretrain GPT."""
import os
import sys
from functools import partial
# This file is not located in the project root; add the repo root to sys.path so the megatron imports resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
from megatron.core import mpu
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
from megatron.core.datasets.utils import get_blend_from_list
from megatron.core.enums import ModelType
from megatron.core.models.gpt import GPTModel
from megatron.core.utils import StragglerDetector
from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.loss_func import loss_func
from megatron.post_training.model_provider import model_provider
from megatron.training import get_args, get_timers, get_tokenizer, pretrain
from megatron.training.utils import (
get_batch_on_this_cp_rank,
get_batch_on_this_tp_rank,
print_rank_0,
)
stimer = StragglerDetector()
def get_batch(data_iterator):
"""Generate a batch."""
# TODO: this is pretty hacky, find a better way
if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()):
return None, None, None, None, None
# get batches based on the TP rank you are on
batch = get_batch_on_this_tp_rank(data_iterator)
# slice batch along sequence dimension for context parallelism
batch = get_batch_on_this_cp_rank(batch)
return batch.values()
def forward_step(data_iterator, model: GPTModel):
"""Forward training step.
Args:
data_iterator : Input data iterator
model (GPTModel): The GPT Model
"""
timers = get_timers()
# Get the batch.
timers('batch-generator', log_level=2).start()
global stimer
with stimer(bdata=True):
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator)
timers('batch-generator').stop()
with stimer:
output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
# [ModelOpt]: model is needed to access ModelOpt distillation losses
return output_tensor, partial(loss_func, loss_mask, model)
def is_dataset_built_on_rank():
return (
mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()
) and mpu.get_tensor_model_parallel_rank() == 0
def core_gpt_dataset_config_from_args(args):
tokenizer = get_tokenizer()
return GPTDatasetConfig(
random_seed=args.seed,
sequence_length=args.seq_length,
blend=get_blend_from_list(args.data_path),
blend_per_split=[
get_blend_from_list(args.train_data_path),
get_blend_from_list(args.valid_data_path),
get_blend_from_list(args.test_data_path),
],
split=args.split,
num_dataset_builder_threads=args.num_dataset_builder_threads,
path_to_cache=args.data_cache_path,
mmap_bin_files=args.mmap_bin_files,
tokenizer=tokenizer,
reset_position_ids=args.reset_position_ids,
reset_attention_mask=args.reset_attention_mask,
eod_mask_loss=args.eod_mask_loss,
create_attention_mask=args.create_attention_mask_in_dataloader,
)
def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build the train test and validation datasets.
Args:
train_val_test_num_samples : A list containing the number of samples in train test and validation.
"""
args = get_args()
config = core_gpt_dataset_config_from_args(args)
if args.mock_data:
dataset_type = MockGPTDataset
else:
dataset_type = GPTDataset
print_rank_0("> building train, validation, and test datasets for GPT ...")
train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
dataset_type, train_val_test_num_samples, is_dataset_built_on_rank, config
).build()
print_rank_0("> finished creating GPT datasets ...")
return train_ds, valid_ds, test_ds
if __name__ == "__main__":
# Temporary for transition to core datasets
train_valid_test_datasets_provider.is_distributed = True
pretrain(
train_valid_test_datasets_provider,
model_provider,
ModelType.encoder_or_decoder,
forward_step,
args_defaults={"tokenizer_type": "GPT2BPETokenizer"},
extra_args_provider=add_modelopt_args,
)
# Megatron Core To TRTLLM Export Documentation
This guide will walk you through how you can use the Megatron Core export to export models to the TRTLLM format.
### Contents
- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation)
- [Contents](#contents)
- [1. Quick Start](#1-quick-start)
- [1.1 Understanding The Code](#11-understanding-the-code)
- [1.2 Running The Code](#12-running-the-code)
- [2. GPU Export](#2-gpu-export)
- [3. Future work](#3-future-work)
#### 1. Quick Start
This will walk you through the flow of converting an MCore GPT model to TRTLLM format using single-device mode. The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py)
NOTE: For faster performance, if your entire model fits in GPU memory, transfer the model state dict to the GPU before calling the get_trtllm_pretrained_config_and_model_weights function (a minimal sketch follows).
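A minimal sketch of that optional pre-transfer (assuming `gpt_model` has been built as in the steps below):
```python
import torch

# Optional speedup: move parameters to GPU before converting weights (assumes the model fits).
gpt_model.to(torch.device("cuda"))
```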
<br>
##### 1.1 Understanding The Code
***STEP 1 - We initialize model parallel and other default arguments***
We initialize TP and PP to 1 so that we can get the full model state dict on the CPU
```python
initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
```
***STEP 2 - We load the model using the model_provider_function***
NOTE: We create a simple gpt model
```python
transformer_config = TransformerConfig(
num_layers=2,
    hidden_size=64, # Needs to be at least 32 times num_attn_heads
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32,
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=100,
max_sequence_length=_SEQUENCE_LENGTH,
)
# Optionally you can also load a model using this code
# sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
# checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
# gpt_model.load_state_dict(checkpoint)
```
***STEP 3 - Instantiate the TRTLLM Helper***
We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py). For the GPT model, we instantiate trtllm_helper as shown below.
```python
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
    transformer_config=gpt_model.config,
    model_type=ModelType.gpt,
    position_embedding_type=gpt_model.position_embedding_type,
    max_position_embeddings=gpt_model.max_position_embeddings,
    rotary_percentage=gpt_model.rotary_percent,
    rotary_base=gpt_model.rotary_base,
    moe_tp_mode=2,
    multi_query_mode=False,
    activation="gelu",
    seq_len_interpolation_factor=seq_len_interpolation_factor,
    share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights,
)
```
***STEP 4 - Get the TRTLLM Weights and configs***
To convert model weights to TRTLLM weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass the model state dict and the export config as inputs. In this example we use an inference TP size of 2 for the export.
```python
model_state_dict = {}
for key, val in gpt_model.state_dict().items():
    # val is None for _extra_state layers; we filter those out.
    if val is not None:
        model_state_dict[key] = val

export_config = ExportConfig(inference_tp_size=2)
weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
    model_state_dict=model_state_dict,
    dtype=DataType.bfloat16,
    export_config=export_config,
)
```
***STEP 5 - Build the TRTLLM Engine***
The following code builds the TRTLLM engine.
```python
for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights,
trtllm_model_config=trtllm_model_config,
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
```
<br>
##### 1.2 Running The Code
An example run script is shown below.
```
# In a workstation
MLM_PATH=/path/to/megatron-lm
CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86
docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash
# Inside the container run the following.
cd /opt/megatron-lm/
CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
```
<br>
#### 2. GPU Export
You can use [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) to run a more optimized, on-device distributed version of the TRTLLM export. Internally, this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device.
In the single-device version you collect all the model weights on CPU/GPU, convert them to TRTLLM format, and then store the engine on disk. In the GPU version you load each individual state dict on the GPUs, convert it on the device itself, and store the engine on disk.
To run the GPU version:
```
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
```
<br>
#### 3. Future work
The following are planned for future releases.
* Pipeline parallelism for export (work in progress)
* GPU export for more models (work in progress for some models)
* Refit functionality
* vLLM support
import os
import torch
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.export.model_type import ModelType
from megatron.core.export.data_type import DataType
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
_SEQUENCE_LENGTH = 64
_VOCAB_SIZE = 256
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
parallel_state.destroy_model_parallel()
# Torch setup for distributed training
rank = int(os.environ['LOCAL_RANK'])
world_size = torch.cuda.device_count()
torch.cuda.set_device(rank)
torch.distributed.init_process_group(world_size=world_size, rank=rank)
# Megatron core distributed training initialization
    parallel_state.initialize_model_parallel(tensor_model_parallel_size=tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size)
def model_provider():
"""Build the model."""
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64,
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=_VOCAB_SIZE,
max_sequence_length=_SEQUENCE_LENGTH,
)
return gpt_model
def load_distributed_checkpoint(checkpoint_path, gpt_model):
sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
gpt_model.load_state_dict(checkpoint)
return gpt_model
if __name__ == "__main__":
initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
model_parallel_cuda_manual_seed(123)
gpt_model = model_provider()
device = torch.device("cuda")
gpt_model.to(device)
# Optionally you can also load a gpt model from ckpt_path using this code below
# gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
seq_len_interpolation_factor = None
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
    trtllm_helper = TRTLLMHelper(
        transformer_config=gpt_model.config,
        model_type=ModelType.gpt,
        position_embedding_type=gpt_model.position_embedding_type,
        max_position_embeddings=gpt_model.max_position_embeddings,
        rotary_percentage=gpt_model.rotary_percent,
        rotary_base=gpt_model.rotary_base,
        moe_tp_mode=2,
        multi_query_mode=False,
        activation="gelu",
        seq_len_interpolation_factor=seq_len_interpolation_factor,
        share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights,
    )
    trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
        model_state_dict=gpt_model.state_dict(),
        dtype=DataType.bfloat16,
        on_device_distributed_conversion=True,
        vocab_size=_VOCAB_SIZE,
        gpus_per_node=2,
    )
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights[0],
trtllm_model_config=trtllm_model_config[0],
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
import os
import torch
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.export.model_type import ModelType
from megatron.core.export.data_type import DataType
from megatron.core.export.export_config import ExportConfig
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
_SEQUENCE_LENGTH = 64
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
parallel_state.destroy_model_parallel()
# Torch setup for distributed training
rank = int(os.environ['LOCAL_RANK'])
world_size = torch.cuda.device_count()
torch.cuda.set_device(rank)
torch.distributed.init_process_group(world_size=world_size, rank=rank)
# Megatron core distributed training initialization
parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)
def model_provider():
"""Build the model."""
transformer_config = TransformerConfig(
num_layers=2,
        hidden_size=64, # Needs to be at least 32 times num_attn_heads
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32,
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=100,
max_sequence_length=_SEQUENCE_LENGTH,
)
return gpt_model
def load_distributed_checkpoint(checkpoint_path, gpt_model):
sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
gpt_model.load_state_dict(checkpoint)
return gpt_model
if __name__ == "__main__":
# Need to use TP1 PP1 for export on single device
initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
model_parallel_cuda_manual_seed(123)
gpt_model = model_provider()
# Optionally you can also load a gpt model from ckpt_path using this code below
# gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
seq_len_interpolation_factor = None
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
    trtllm_helper = TRTLLMHelper(
        transformer_config=gpt_model.config,
        model_type=ModelType.gpt,
        position_embedding_type=gpt_model.position_embedding_type,
        max_position_embeddings=gpt_model.max_position_embeddings,
        rotary_percentage=gpt_model.rotary_percent,
        rotary_base=gpt_model.rotary_base,
        moe_tp_mode=2,
        multi_query_mode=False,
        activation="gelu",
        seq_len_interpolation_factor=seq_len_interpolation_factor,
        share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights,
    )
    export_config = ExportConfig(inference_tp_size=2)
    # NOTE: For faster performance, if your entire model fits in GPU memory, transfer the model state dict to GPU and then call this API.
    weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
        model_state_dict=gpt_model.state_dict(),
        dtype=DataType.bfloat16,
        export_config=export_config,
    )
for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights,
trtllm_model_config=trtllm_model_config,
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)