# How to run lm-eval on Megatron-DeepSpeed checkpoint using the original setup
A large portion of this eval harness feature is inherited from https://github.com/bigscience-workshop/Megatron-DeepSpeed/pull/212, with code/doc changes (e.g., to support the case without pipeline parallelism and to support MoE models).
This particular setup uses the normal DeepSpeed checkpoint and requires no conversion to a Megatron-LM checkpoint.
## Prerequisites
1. Install software
On a login console with external network access, get the lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `best-download==0.0.7`, which is needed to download some tasks.
The package version numbers below are the ones we tested and found to work.
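A minimal install sketch, assuming a pip-based setup; the command form (and any pins other than `best-download==0.0.7`) is an assumption rather than the original tested environment:
```
# Assumed install commands; pin the versions you have verified to work.
git clone https://github.com/EleutherAI/lm-evaluation-harness.git
cd lm-evaluation-harness
pip install -e .                  # install the eval harness from source
pip install best-download==0.0.7  # needed to download some task datasets
```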
Previously we set `export HF_DATASETS_OFFLINE=1` to force offline dataset access after the manual download above. However, this can now trigger errors during online verification for some datasets, so it is recommended to enable offline mode only when necessary.
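For example (`HF_DATASETS_OFFLINE` is the standard Hugging Face datasets switch; whether you need it depends on your cluster's network access):
```
# Enable offline mode only once all needed datasets are cached locally.
export HF_DATASETS_OFFLINE=1
# If a task fails because it needs to verify or download something online, turn it back off.
unset HF_DATASETS_OFFLINE
```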
<!-- If there are things like custom tokenizers, pre-download those too. -->
<!-- Prepare the run script, replacing `variant` with a unique identifier for the current eval so that multiple evals can run in parallel and not all log into the same `results.json` file, e.g., `tr9c-1B3-swiglu`.
Note that the eval code knows to pull the original training args from the checkpoint, so we don't need to pass any of those; we only need to set up the evaluation args. -->
`ds_evalharness.sh` is the example script.
1. Edit:
```
PP_SIZE=1
TP_SIZE=1
NO_PP="true"
EP_PARALLEL_SIZE=1
NUM_NODE=1
NUM_GPU_PER_NODE=1
```
to match the eval topology.
Edit:
```
CHECKPOINT_PATH=
CONFIG_PATH=
RESULT_PATH=
```
to the checkpoint and DeepSpeed config you want to use, and to where the results should be saved (a filled-in sketch appears after step 3 below).
<!-- If the model fits into 1 gpu, then there is nothing to change.
The eval script will automatically reshape the model if it was of a different topology. -->
2. Adjust the following to fit the chosen GPU. As of the last check, for a 1.3B model the setting is one of:
```
EVAL_MICRO_BATCH_SIZE=6 # 16GB GPU 1.3B model
EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
```
If you get OOM, lower it further.
3. If you are not using the DeepSpeed path, disable it by removing:
```
--deepspeed \
--deepspeed_config ds_config.json \
```
If you did not disable it and the program crashes during checkpoint loading because it cannot find some key, disable DeepSpeed as explained above.
Note that MoE models and models without pipeline parallelism currently might not work without DeepSpeed.
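Putting the three steps above together, a hypothetical filled-in configuration for `ds_evalharness.sh` might look like the following; the paths and sizes are placeholders, not a tested setup:
```
# Placeholder values for a single-GPU eval of a 1.3B model on a 16GB card.
PP_SIZE=1
TP_SIZE=1
NO_PP="true"
EP_PARALLEL_SIZE=1
NUM_NODE=1
NUM_GPU_PER_NODE=1

# Example paths only; point these at your own checkpoint, DeepSpeed config, and output file.
CHECKPOINT_PATH=/path/to/checkpoints/gpt-1.3b
CONFIG_PATH=/path/to/ds_config.json
RESULT_PATH=/path/to/results/gpt-1.3b_eval_results.json

EVAL_MICRO_BATCH_SIZE=6   # 16GB GPU, 1.3B model; lower this if you hit OOM
```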
<!-- ## Eval
Currently it takes 2-3 hours to run a 1.3B model on a 32GB GPU and 6-7 hours on a 16GB GPU, so a 20h slurm job should be enough.
When ready, launch:
```
sbatch ./run_evalharness-variant.slurm
```
To monitor progress:
```
tail -f $VARIANT-eval-harness.log
```
where the variant is what you set `$VARIANT` to in the slurm script.
The template is set up for a 16GB GPU since those are easier to come by. If you change to 32GB, adjust:
```
#SBATCH --constraint=v100-32g
...
EVAL_MICRO_BATCH_SIZE=12 # 32GB GPU 1.3B model
```
Note that the ETA reported at the start of the run can be up to 10x longer than the actual runtime. For example, it may suggest 18 hours but complete in 2 hours.
## Short eval
If you just want to quickly test that everything can run to the end, edit `tasks/eval_harness/evaluate.py`, e.g. to run only 10 batches.
To reformat the JSON results into a CSV reshaped to match the spreadsheet format (since some records might be missing or extraneous), here is the best way to do it:
1. copy the data from the first 2 columns to some place under the main spreadsheet
2. put the pointer in the 3rd column, next to where the first 2 columns were copied
3. import `results.csv` using File -> Import -> File, with Import location: Replace data at selected cell
4. now it should be easy to align the new records with the old ones: delete irrelevant records and use Insert -> Cells where data is missing until the first 2 columns match
5. now create 2 columns in the main table on top, and it should be safe to copy-and-paste the 2-column data range (without the task/metrics columns) into the newly created space. -->
# DeepSpeed example scripts in this folder
This folder includes various example scripts with DeepSpeed technologies integrated. Below we describe each sub-folder, sorted by last update date.
## Sync with NVIDIA/Megatron-LM (last updated: Jul 2023)
The ```rebase``` folder includes details about the recent sync with the NVIDIA/Megatron-LM repo (from which this repo is forked). It includes example scripts we used for testing after the sync, together with README documentation about what was tested.
## Data Efficiency (last updated: Feb 2023)
The ```data_efficiency``` folder includes GPT-3 and BERT pretraining examples for the DeepSpeed Data Efficiency Library, together with examples of zero-shot evaluation for GPT models and GLUE finetuning for BERT models. Please refer to the detailed tutorials in data_efficiency/README.MD. Currently this folder includes the newest example scripts for GPT/BERT pretraining/eval/finetuning, both with and without the DeepSpeed Data Efficiency Library techniques.
## BERT example (last updated: Dec 2022)
The ```bert_with_pile``` folder includes examples of BERT-style model pre-training (using the public Pile data or your own data) with DeepSpeed integration. Please refer to the readme in the folder for a tutorial.
## Azure (last updated: Nov 2022)
We strongly recommend starting with the AzureML recipe in the ```azureml``` folder.
If you have custom infrastructure (e.g., HPC clusters) or Azure VM- and VMSS-based environments, please refer to the bash scripts in the ```azure``` folder.
## Model Compression (last updated: Aug 2022)
The ```compression``` folder includes examples of layer reduction for task-agnostic compression. Please refer to [this tutorial](https://www.deepspeed.ai/tutorials/model-compression/#11-layer-reduction) about the DeepSpeed Model Compression Library. These recipes are for GPT-style NLG models.
## MoE (last updated: Jun 2022)
Please see the ```MoE``` folder for different training recipes and scripts for Mixture-of-Experts (MoE) based models and dense models. These recipes are for GPT-style NLG models, and currently this is the only folder with MoE training examples.
## Curriculum Learning (last updated: Oct 2021)
Curriculum learning recipes are in the ```curriculum_learning``` folder. Please refer to the detailed tutorials linked inside. These recipes are for GPT-style NLG models.
Note that the DeepSpeed Data Efficiency Library above includes more general curriculum learning support. This legacy curriculum learning feature is still supported, but we recommend using the DeepSpeed Data Efficiency Library instead. However, the newer library is currently not compatible with pipeline parallelism, so if you have to use pipeline parallelism you will need this legacy curriculum learning version.
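For orientation, the legacy feature is configured through a `curriculum_learning` section in the DeepSpeed JSON config. The sketch below follows the legacy schema with placeholder values (not a tuned recipe); see the tutorials in the folder for real settings.
```
# Sketch only: write a minimal DeepSpeed config with the legacy sequence-length curriculum.
# All values below are placeholders.
cat <<EOT > ds_config_cl_sketch.json
{
  "train_micro_batch_size_per_gpu": 8,
  "fp16": { "enabled": true },
  "curriculum_learning": {
    "enabled": true,
    "curriculum_type": "seqlen",
    "min_difficulty": 8,
    "max_difficulty": 1024,
    "schedule_type": "fixed_linear",
    "schedule_config": {
      "total_curriculum_step": 15000,
      "difficulty_step": 8
    }
  }
}
EOT
```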
To run these training scripts, you will need to either set up your own dataset and modify the scripts, or use our helper scripts to download the publicly available Books dataset and GPT vocab files. Please use the following from the ```dataset``` folder:
```bash dataset/download_books.sh```
```bash dataset/download_vocab.sh```
### Run 175B and 1T models
We have included two recipes, one for the 175B model and one for the 1T model. We assume that users will modify and tune the hyperparameters and configurations themselves when training these models. To facilitate initial runs, we have made the recipes runnable with the Books dataset as follows:
```bash examples_deepspeed/azure/run-175b.sh```
```bash examples_deepspeed/azure/run-1t.sh```
### Note about ZeRO stage 3 and CPU offload
By default, we have enabled ZeRO Stage 3 for both recipes above. For the 1T model, we have also enabled the CPU-offload feature to save memory and enable a larger batch size for better performance.
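As a rough sketch of what these settings look like in a DeepSpeed config (the fragment below uses standard DeepSpeed options for ZeRO stage 3 with CPU offload of optimizer states and parameters; the values are placeholders rather than the recipes' tuned settings):
```
# Sketch only: write a minimal DeepSpeed config enabling ZeRO stage 3 with CPU offload.
cat <<EOT > ds_config_sketch.json
{
  "train_micro_batch_size_per_gpu": 1,
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": { "device": "cpu" },
    "offload_param": { "device": "cpu" }
  },
  "fp16": { "enabled": true }
}
EOT
```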
Example script for running Megatron-DeepSpeed using Azure Machine Learning.
------
# Workspace Setup
Set up an AML workspace. Refer to the [set-up doc](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk#set-up).
# Dataset Preparation
Create an AML Dataset. To run a remote AML job, you need to provide an AML FileDataset.
Refer to the [prepare_dataset script](prepare_dataset.py) for how to upload the .bin and .idx files to the blob store and how to create a FileDataset.
> Note: The folder `bookcorpus_data` used by the [prepare_dataset script](prepare_dataset.py) should not be under the `azureml` directories, because Azure ML does not allow large files (limit: 100 files or 1048576 bytes) in the Docker build context.
# Training
Run Megatron-DeepSpeed on Azure ML. Refer to the [aml_submit script](aml_submit.py).
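A minimal sketch of the overall workflow, assuming the two scripts in this folder are run as plain Python entry points (check each script for its actual arguments and settings):
```
# Assumed invocations; edit each script or check its arguments before running.
python prepare_dataset.py   # upload the .bin/.idx files and register the AML FileDataset
python aml_submit.py        # submit the Megatron-DeepSpeed training job to the AML workspace
```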