"src/routes/vscode:/vscode.git/clone" did not exist on "b986c2aefd34df8ee1000615fdb151d87d7b7432"
Commit e1354f9d authored by liangjing's avatar liangjing
Browse files

update

parents
Pipeline #1025 failed with stages
in 0 seconds
#!/bin/bash
# This example will start serving the 345M model.
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--top_p 0.9 \
--seed 42
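# Once the server above is running, it can be queried over HTTP. The request
# below is a minimal sketch that assumes the default flask-restful setup of
# tools/run_text_generation_server.py (a "/api" endpoint on port 5000); the
# exact host, port, and payload fields may differ in your Megatron version.
# curl -s "http://localhost:5000/api" \
#   -X PUT \
#   -H "Content-Type: application/json" \
#   -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'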
#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel
DISTRIBUTED_ARGS="--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
pip install flask-restful
export CUDA_DEVICE_MAX_CONNECTIONS=1
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--top_p 0.9 \
--seed 42
#!/bin/bash
# SLURM options.
export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
# Source code.
export MEGATRON_CODE_DIR=<megatron source code directory>
# This variable is used to mount the relevant part of the filesystem
# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
# launch directory already get mounted; this variable should be used to
# mount the directories that contain the data and tokenizer files.
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
# Data and tokenizer files.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>
# Megatron input parameters.
# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
# that are not listed here.
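# The run_*.sh scripts in this directory set TP, PP, MBS, GBS, NLS, HS, NAH,
# DDP, NNODES, and (optionally) MEGATRON_EXTRA_PARAMS before sourcing this file.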
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--micro-batch-size ${MBS} \
--global-batch-size ${GBS} \
--num-layers ${NLS} \
--hidden-size ${HS} \
--num-attention-heads ${NAH} \
--DDP-impl ${DDP} \
--data-path ${MEGATRON_DATA} \
--vocab-file ${BPE_VOCAB_FILE} \
--merge-file ${BPE_MERGE_FILE} \
--log-interval 5 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-iters 500 \
--lr-decay-iters 320 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style cosine \
--lr-warmup-fraction 0.01 \
--split 969,30,1 \
--eval-iters 100 \
--eval-interval 1000 \
--clip-grad 1.0 \
--fp16 \
--loss-scale 8192 "
# Reproducing Figures in SC21 Paper
This directory contains some of the scripts that were used to produce the
results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
schedulers as well.
## Setup
All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
update the unspecified values (in angle brackets `<...>`) before launching any
scripts.
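For example, a filled-in `CONFIG.sh` might begin as follows (the partition name,
account, and paths below are placeholders; substitute the values for your own
cluster):

```bash
export SLURM_PARTITION=batch                  # placeholder partition
export SLURM_ACCOUNT=my_account               # placeholder account
export MEGATRON_CODE_DIR=/path/to/megatron-lm # placeholder source path
export DOCKER_MOUNT_DIR=/path/to/data         # placeholder data/tokenizer path
```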
## Scripts
Below is a list of scripts that can be used to reproduce various figures in our
[paper](https://arxiv.org/pdf/2104.04473.pdf):
* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
for GPT models ranging from 1 billion to 1 trillion parameters.
* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
performance of pipeline parallelism.
* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
the interleaved schedule on a 175B GPT model.
* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
different degrees of pipeline and tensor model parallelism on a model with
162.2 billion parameters.
* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
different degrees of data and pipeline model parallelism on a model with
5.9 billion parameters.
* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
different degrees of data and tensor model parallelism on a model with
5.9 billion parameters.
* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
microbatch size.
* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
activation recomputation.
* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
the scatter-gather communication optimization.
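For example, to collect a single Figure 11 data point, set the `PP` and `GBS`
variables at the top of `run_figure_11.sh` to the desired configuration and
launch the script (this assumes `CONFIG.sh` has already been filled in as
described above):

```bash
bash ./run_figure_11.sh
```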
#!/bin/bash
sbatch -p ${SLURM_PARTITION} \
-A ${SLURM_ACCOUNT} \
--job-name=${JOB_NAME} \
--nodes=${NNODES} \
--export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
exit 0
#!/bin/bash
#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
THIS_DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${THIS_DIR}/logs
CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
srun -l \
--container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
--container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
--output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [1, 2, 4, 8].
PP=1
# Batch size (global batch size) options = [8, 128].
GBS=8
# Set pipeline-parallel size options.
NLS=$((3*PP))
NNODES=${PP}
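# Weak scaling: e.g. PP=8 gives NLS=24 layers on 8 nodes (64 GPUs with TP=8).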
# Other params.
TP=8
MBS=1
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# Name of the job.
export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Interleaved schedule options = [YES, NO].
INTERLEAVED=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set interleaved schedule options.
if [ ${INTERLEAVED} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${INTERLEAVED} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 128].
GBS=32
# Set pipeline-parallel and tensor-parallel size options.
TP=$((64/PP))
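# NNODES=8 (64 GPUs) is fixed, so TP*PP=64 for every configuration; e.g. PP=2 gives TP=32.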
# Other params.
MBS=1
NLS=32
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 512].
GBS=32
# Set pipeline-parallel and data-parallel size options.
DP=$((64/PP))
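# With TP=1 and 64 GPUs total, DP*PP=64; e.g. PP=2 gives DP=32.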
# Other params.
TP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Tensor-parallel size options = [2, 4, 8, 16, 32].
TP=2
# Batch size (global batch size) options = [32, 128, 512].
GBS=32
# Set tensor-parallel and data-parallel size options.
DP=$((64/TP))
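# With PP=1 and 64 GPUs total, DP*TP=64; e.g. TP=2 gives DP=32.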
# Other params.
PP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Microbatch size options = [1, 2, 4, 8].
MBS=1
# Batch size (global batch size) options = [128, 512].
GBS=128
# Other params.
TP=8
PP=8
NLS=32
HS=15360
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Activation recomputation options = [YES, NO].
ACTIVATION_RECOMPUTATION=YES
# Batch size (global batch size) options = [1, 2, 4, ..., 256].
GBS=1
# Set activation recomputation.
if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then
MEGATRON_EXTRA_PARAMS=""
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=16
MBS=1
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=16
# Name of the job.
export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Scatter-gather communication optimization options = [YES, NO].
SCATTER_GATHER=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set scatter-gather communication optimization options.
if [ ${SCATTER_GATHER} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${SCATTER_GATHER} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
MODEL_SIZE=1.7B
if [ ${MODEL_SIZE} == "1.7B" ]; then
TP=1
PP=1
MBS=16
GBS=512
NLS=24
HS=2304
NAH=24
DDP=torch
NNODES=4
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "3.6B" ]; then
TP=2
PP=1
MBS=16
GBS=512
NLS=30
HS=3072
NAH=32
DDP=torch
NNODES=8
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "7.5B" ]; then
TP=4
PP=1
MBS=16
GBS=512
NLS=36
HS=4096
NAH=32
DDP=torch
NNODES=16
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "18B" ]; then
TP=8
PP=1
MBS=8
GBS=1024
NLS=40
HS=6144
NAH=48
DDP=torch
NNODES=32
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "39B" ]; then
TP=8
PP=2
MBS=4
GBS=1536
NLS=48
HS=8192
NAH=64
DDP=local
NNODES=64
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "76B" ]; then
TP=8
PP=4
MBS=2
GBS=1792
NLS=60
HS=10240
NAH=80
DDP=local
NNODES=128
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
elif [ ${MODEL_SIZE} == "145B" ]; then
TP=8
PP=8
MBS=2
GBS=2304
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=192
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ ${MODEL_SIZE} == "310B" ]; then
TP=8
PP=16
MBS=1
GBS=2160
NLS=96
HS=16384
NAH=128
DDP=local
NNODES=240
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
elif [ ${MODEL_SIZE} == "530B" ]; then
TP=8
PP=35
MBS=1
GBS=2520
NLS=105
HS=20480
NAH=128
DDP=local
NNODES=315
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
elif [ ${MODEL_SIZE} == "1T" ]; then
TP=8
PP=64
MBS=1
GBS=3072
NLS=128
HS=25600
NAH=160
DDP=local
NNODES=384
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Name of the job
export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
{
"train_batch_size" : CONFIG_BATCH_SIZE,
"train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": CONFIG_FP16_ENABLED,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"bf16": {
"enabled": CONFIG_BF16_ENABLED
},
"curriculum_learning": {
"enabled": CONFIG_CL_ENABLED,
"curriculum_type": "seqlen",
"min_difficulty": CONFIG_CL_MIN,
"max_difficulty": CONFIG_CL_MAX,
"schedule_type": "fixed_linear",
"schedule_config": {
"total_curriculum_step": CONFIG_CL_DURATION,
"difficulty_step": 8
}
},
"wall_clock_breakdown" : false
}
{
"train_batch_size" : CONFIG_BATCH_SIZE,
"train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": 2
},
"gradient_clipping": 1.0,
"prescale_gradients": false,
"fp16": {
"enabled": CONFIG_FP16_ENABLED,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"bf16": {
"enabled": CONFIG_BF16_ENABLED
},
"curriculum_learning": {
"enabled": CONFIG_CL_ENABLED,
"curriculum_type": "seqlen",
"min_difficulty": CONFIG_CL_MIN,
"max_difficulty": CONFIG_CL_MAX,
"schedule_type": "fixed_linear",
"schedule_config": {
"total_curriculum_step": CONFIG_CL_DURATION,
"difficulty_step": 8
}
},
"wall_clock_breakdown" : false
}
# This is an example zero-shot eval script. Please first read readme_evalharness.md in the same directory.
CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B/global_step81566/
CONFIG_PATH=ds_config_gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B.json
RESULT_PATH=gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B_global_step81566.log
PP_SIZE=1
TP_SIZE=1
NO_PP="true"
EP_PARALLEL_SIZE=1
# Currently the eval harness does not support data parallelism.
# However, for MoE models it is possible to enable a "fake data parallelism"
# in order to load experts on multiple GPUs. It is not real data parallelism
# because the same data is loaded on all GPUs.
# On the other hand, it is better to use fewer GPUs than in training,
# to reduce communication overhead.
NUM_NODE=1
NUM_GPU_PER_NODE=1
TASKS="lambada"
# WikiText-2, not used in GPT-3 paper but used in GPT-2 paper
# TASKS="wikitext"
# Tasks that appeared in the GPT-3 paper (sorted by their order in the paper), plus WikiText-2.
# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext"
# All tasks confirmed to work; there are more tasks on https://github.com/EleutherAI/lm-evaluation-harness that we did not test.
# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli"
VOCAB_FILE=/data/Megatron-LM/data/gpt2-vocab.json
MERGE_FILE=/data/Megatron-LM/data/gpt2-merges.txt
# export HF_DATASETS_OFFLINE=1
# Dummy arguments to make Megatron happy. There is no need to configure them;
# these (and many other) arguments do not need to be configured because the
# eval framework reads the actual values from the checkpoint file.
MEGATRON_REQUIRED_ARGS="\
--num-layers -1\
--hidden-size -1\
--num-attention-heads -1\
--seq-length -1 \
--max-position-embeddings -1
"
CMD="../../tasks/eval_harness/evaluate.py \
--load $CHECKPOINT_PATH\
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE\
--moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
--vocab-file $VOCAB_FILE\
--merge-file $MERGE_FILE\
--micro-batch-size 12\
--no-load-optim \
--no-load-rng \
--inference \
--disable-moe-token-dropping \
--tokenizer-type GPT2BPETokenizer \
--adaptive_seq_len\
--eval_fp32\
--task_list $TASKS\
--results_path $RESULT_PATH \
--deepspeed \
--deepspeed_config $CONFIG_PATH \
$MEGATRON_REQUIRED_ARGS\
"
if [[ "${NO_PP}" = "true" ]]; then
CMD="${CMD} \
--no-pipeline-parallel"
fi
LAUNCHER="deepspeed --num_nodes $NUM_NODE --num_gpus $NUM_GPU_PER_NODE"
$LAUNCHER $CMD
#!/bin/bash
DIR=`pwd`
###############################################################################
### Main configs
## GPT-3 models use 2K sequence length/context window
SEQ_LEN=2048
### The "GPT-3 XXX" below are configs from GPT-3 paper
### https://arxiv.org/abs/2005.14165, choose based on
### your desired model size or build your own configs
## GPT-3 Small 125M
# MODEL_SIZE=0.125
# NUM_LAYERS=12
# HIDDEN_SIZE=768
# NUM_ATTN_HEADS=12
# GLOBAL_BATCH_SIZE=256
# LR=6.0e-4
# MIN_LR=6.0e-5
## GPT-3 Medium 350M
# MODEL_SIZE=0.35
# NUM_LAYERS=24
# HIDDEN_SIZE=1024
# NUM_ATTN_HEADS=16
# GLOBAL_BATCH_SIZE=256
# LR=3.0e-4
# MIN_LR=3.0e-5
## GPT-3 Large 760M
# MODEL_SIZE=0.76
# NUM_LAYERS=24
# HIDDEN_SIZE=1536
# NUM_ATTN_HEADS=16
# GLOBAL_BATCH_SIZE=256
# LR=2.5e-4
# MIN_LR=2.5e-5
## GPT-3 XL 1.3B
MODEL_SIZE=1.3
NUM_LAYERS=24
HIDDEN_SIZE=2048
NUM_ATTN_HEADS=16
GLOBAL_BATCH_SIZE=512
# LR=2.0e-4
# MIN_LR=2.0e-5
## GPT-3 2.7B
# MODEL_SIZE=2.7
# NUM_LAYERS=32
# HIDDEN_SIZE=2560
# NUM_ATTN_HEADS=32
# GLOBAL_BATCH_SIZE=512
# LR=1.6e-4
# MIN_LR=1.6e-5
## GPT-3 6.7B
# MODEL_SIZE=6.7
# NUM_LAYERS=32
# HIDDEN_SIZE=4096
# NUM_ATTN_HEADS=32
# GLOBAL_BATCH_SIZE=1024
# LR=1.2e-4
# MIN_LR=1.2e-5
## GPT-3 13B
# MODEL_SIZE=13
# NUM_LAYERS=40
# HIDDEN_SIZE=5120
# NUM_ATTN_HEADS=40
# GLOBAL_BATCH_SIZE=1024
# LR=1.0e-4
# MIN_LR=1.0e-5
## GPT-3 175B
# MODEL_SIZE=175
# NUM_LAYERS=96
# HIDDEN_SIZE=12288
# NUM_ATTN_HEADS=96
# GLOBAL_BATCH_SIZE=1536
# LR=0.6e-4
# MIN_LR=0.6e-5
###############################################################################
### Training duration configs
## The main termination condition; the original GPT-3 paper trains for 300B tokens.
## For MoE models, we found that training a bit longer, to 330B tokens, sometimes helps.
TRAIN_TOKENS=300000000000
# TRAIN_TOKENS=330000000000
## TRAIN_ITERS is another termination condition and also affects the number of
## data samples to be indexed. Since we want to reach the TRAIN_TOKENS above,
## and techniques like curriculum learning consume fewer tokens in some steps,
## we just set this config large enough to make sure we have enough processed
## data and don't terminate by TRAIN_ITERS.
TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
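## With the GPT-3 XL defaults above (300B tokens, GLOBAL_BATCH_SIZE=512, SEQ_LEN=2048),
## this works out to roughly 300e9*3/(512*2048) ~= 858k iterations, i.e. about 3x
## more than needed to reach TRAIN_TOKENS.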
## Another termination condition in minutes. Set it large enough to avoid
## undesired early termination.
EXIT_DURATION=30000000
###############################################################################
### LR configs
## LR warmup and decay duration. This token-based config is preferable since it
## does not need to be readjusted when the batch size/seqlen is changed.
## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
## For MoE model, we found that setting the decay token to 300B helps.
WARMUP_TOKENS=375000000
# LR_DECAY_TOKENS=260000000000
LR_DECAY_TOKENS=300000000000
###############################################################################
### Parallelism configs
## Micro batch size per GPU
## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
BATCH_SIZE=8
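## With the defaults above: 512*1*1/64 = 8, so BATCH_SIZE=8 is exactly at this limit.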
## Model parallelism, 1 is no MP
MP_SIZE=1
## Pipeline parallelism
## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
## to 1 and use the "--no-pipeline-parallel" arg.
PP_SIZE=1
NUM_GPUS=64
###############################################################################
### MoE configs
## Number of experts. EP_SIZE 1 means dense model without MoE
# EP_SIZE=1
EP_SIZE=128
if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
EP_PARALLEL_SIZE=$NUM_GPUS
else
EP_PARALLEL_SIZE=$EP_SIZE
fi
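## With EP_SIZE=128 experts and NUM_GPUS=64, EP_PARALLEL_SIZE becomes 64,
## i.e. the experts are spread across all 64 GPUs (2 experts per expert-parallel rank).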
## The original GPT-3 models always set the min LR at 10% of the max LR. For MoE
## models, we found that a lower LR and min LR (than the base dense model) help.
## For the 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
## For the 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are
## not heavily tuned.
LR=1.2e-4
MIN_LR=1.0e-6
## Coefficient for MoE loss. We find that 0.01 is a good value at least for
## 1.3B MoE-128 model
MLC=0.01
## Below configs adjust the MoE expert token capacity limit during training and
## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
## Larger capacity factor or disabling capacity limit could improve training
## convergence, but will also reduce training throughput.
MOE_TRAIN_CAP_FACTOR=1.0
MOE_EVAL_CAP_FACTOR=1.0
MOE_MIN_CAP=4
MOE_DROP_TOKEN="true"
# MOE_DROP_TOKEN="false"
###############################################################################
### Curriculum learning (CL) configs
## Enable/disable CL
CL_ENABLED="false"
## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
## for tuning the following configs
CL_START_SEQLEN=80
CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
CL_TOKENS=60
CL_TOKENS=$((${CL_TOKENS} * 1000000000))
CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
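## With the defaults above: CL_AVG_SEQLEN = (80+2048)/2 = 1064 and CL_TOKENS = 60B,
## so CL_STEP = 60e9 / (512*1064) ~= 110k steps for the curriculum schedule.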
###############################################################################
### Misc configs
LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=10000
## Standard deviation for weight initialization
## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
## dense model. Usually larger model needs lower std.
INIT_STD=0.014
# INIT_STD=0.01
## Activation checkpointing saves GPU memory, but reduces training speed
ACTIVATION_CHECKPOINT="true"
# ACTIVATION_CHECKPOINT="false"
###############################################################################
### Output and data configs
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
host="${HOSTNAME}"
NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
if [[ $EP_SIZE -gt 1 ]]; then
NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
fi
if [ "${CL_ENABLED}" = "true" ]; then
NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
fi
OUTPUT_BASEPATH=$DIR/output
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
## Note that for a MoE model with a billion-scale base model, the checkpoint can
## be TB-scale, which normal NFS cannot handle efficiently.
CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
# USE_INTERNAL_DATA="true"
USE_INTERNAL_DATA="false"
if [ "${USE_INTERNAL_DATA}" = "true" ]; then
## The internal data is only accessible within Microsoft
## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
# BASE_DATA_PATH=/vc_data/Megatron-LM/data
# DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
## For cluster Lab-RR1-V100
BASE_DATA_PATH=/data/Megatron-LM/data
DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
## For cluster Azure-CentralUS-A100
# BASE_DATA_PATH=/data/Megatron-LM/data
# DATA_HOME=/vc_data_1/users/amawa/blended
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
0.01359 ${ARX} 0.01588 ${GIT}"
else
VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# The public Pile dataset; it can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
fi
###############################################################################
data_options=" \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--data-path ${DATA_BLEND} \
--data-impl mmap"
megatron_options=" \
--override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size ${MP_SIZE} \
--moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
--num-experts ${EP_SIZE} \
--moe-loss-coeff ${MLC} \
--moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
--moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
--moe-min-capacity ${MOE_MIN_CAP} \
--init-method-std ${INIT_STD} \
--lr-decay-tokens ${LR_DECAY_TOKENS} \
--lr-warmup-tokens ${WARMUP_TOKENS} \
--micro-batch-size ${BATCH_SIZE} \
--exit-duration-in-mins ${EXIT_DURATION} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTN_HEADS} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--train-tokens ${TRAIN_TOKENS} \
--train-iters ${TRAIN_ITERS} \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
--save-interval ${SAVE_INTERVAL} \
--weight-decay 0.1 \
--clip-grad 1.0 \
--hysteresis 2 \
--num-workers 0 \
--fp16 \
--load ${CHECKPOINT_PATH} \
--save ${CHECKPOINT_PATH} \
--tensorboard-queue-size 1 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR}"
if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
fi
if [[ $EP_SIZE -gt 1 ]]; then
megatron_options="${megatron_options} \
--create-moe-param-group"
fi
if [ "${MOE_DROP_TOKEN}" = "false" ]; then
megatron_options="${megatron_options} \
--disable-moe-token-dropping"
fi
template_json="ds_config_gpt_TEMPLATE.json"
config_json="ds_config_gpt_${NAME}.json"
sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
| sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
| sed "s/ZERO_STAGE/0/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
| sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
| sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
| sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
| sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
> ${config_json}
deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--pipeline-model-parallel-size ${PP_SIZE}"
# Currently MoE is not compatible with pipeline parallel
if [[ $EP_SIZE -gt 1 ]]; then
deepspeed_options="${deepspeed_options} \
--no-pipeline-parallel"
fi
if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--deepspeed-activation-checkpointing"
fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
DIR=`pwd`
###############################################################################
### Main configs
## GPT-3 models use 2K sequence length/context window
SEQ_LEN=2048
### The "GPT-3 XXX" below are configs from GPT-3 paper
### https://arxiv.org/abs/2005.14165, choose based on
### your desired model size or build your own configs
## GPT-3 Small 125M
# MODEL_SIZE=0.125
# NUM_LAYERS=12
# HIDDEN_SIZE=768
# NUM_ATTN_HEADS=12
# GLOBAL_BATCH_SIZE=256
# LR=6.0e-4
# MIN_LR=6.0e-5
## GPT-3 Medium 350M
# MODEL_SIZE=0.35
# NUM_LAYERS=24
# HIDDEN_SIZE=1024
# NUM_ATTN_HEADS=16
# GLOBAL_BATCH_SIZE=256
# LR=3.0e-4
# MIN_LR=3.0e-5
## GPT-3 Large 760M
# MODEL_SIZE=0.76
# NUM_LAYERS=24
# HIDDEN_SIZE=1536
# NUM_ATTN_HEADS=16
# GLOBAL_BATCH_SIZE=256
# LR=2.5e-4
# MIN_LR=2.5e-5
## GPT-3 XL 1.3B
MODEL_SIZE=1.3
NUM_LAYERS=24
HIDDEN_SIZE=2048
NUM_ATTN_HEADS=16
GLOBAL_BATCH_SIZE=512
# LR=2.0e-4
# MIN_LR=2.0e-5
## GPT-3 2.7B
# MODEL_SIZE=2.7
# NUM_LAYERS=32
# HIDDEN_SIZE=2560
# NUM_ATTN_HEADS=32
# GLOBAL_BATCH_SIZE=512
# LR=1.6e-4
# MIN_LR=1.6e-5
## GPT-3 6.7B
# MODEL_SIZE=6.7
# NUM_LAYERS=32
# HIDDEN_SIZE=4096
# NUM_ATTN_HEADS=32
# GLOBAL_BATCH_SIZE=1024
# LR=1.2e-4
# MIN_LR=1.2e-5
## GPT-3 13B
# MODEL_SIZE=13
# NUM_LAYERS=40
# HIDDEN_SIZE=5120
# NUM_ATTN_HEADS=40
# GLOBAL_BATCH_SIZE=1024
# LR=1.0e-4
# MIN_LR=1.0e-5
## GPT-3 175B
# MODEL_SIZE=175
# NUM_LAYERS=96
# HIDDEN_SIZE=12288
# NUM_ATTN_HEADS=96
# GLOBAL_BATCH_SIZE=1536
# LR=0.6e-4
# MIN_LR=0.6e-5
###############################################################################
### Training duration configs
## The main termination condition; the original GPT-3 paper trains for 300B tokens.
## For MoE models, we found that training a bit longer, to 330B tokens, sometimes helps.
TRAIN_TOKENS=300000000000
# TRAIN_TOKENS=330000000000
## TRAIN_ITERS is another termination condition and also affects the number of
## data samples to be indexed. Since we want to reach the TRAIN_TOKENS above,
## and techniques like curriculum learning consume fewer tokens in some steps,
## we just set this config large enough to make sure we have enough processed
## data and don't terminate by TRAIN_ITERS.
TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
## Another termination condition in minutes. Set it large enough to avoid
## undesired early termination.
EXIT_DURATION=30000000
###############################################################################
### LR configs
## LR warmup and decay duration. This token-based config is preferable since it
## does not need to be readjusted when the batch size/seqlen is changed.
## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
## For MoE model, we found that setting the decay token to 300B helps.
WARMUP_TOKENS=375000000
# LR_DECAY_TOKENS=260000000000
LR_DECAY_TOKENS=300000000000
###############################################################################
### Parallelism configs
## Micro batch size per GPU
## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
BATCH_SIZE=8
## Model parallelism, 1 is no MP
MP_SIZE=1
## Pipeline parallelism
## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
## to 1 and use the "--no-pipeline-parallel" arg.
PP_SIZE=1
NUM_GPUS=64
###############################################################################
### MoE configs
## Number of experts. A single value (e.g. EP_SIZE=128) gives a standard MoE with
## the same number of experts in every MoE layer; the list below sets a per-MoE-layer
## expert count for the pyramid PR-MoE model.
# EP_SIZE=128
EP_SIZE="64 64 64 64 64 64 64 64 64 64 128 128"
EP_PARALLEL_SIZE=$NUM_GPUS
## The original GPT-3 models always set the min LR at 10% of the max LR. For MoE
## models, we found that a lower LR and min LR (than the base dense model) help.
## For the 1.3B PR-MoE-64/128 model we used LR=1.2e-4 and MIN_LR=1.0e-6, but they
## are not heavily tuned.
LR=1.2e-4
MIN_LR=1.0e-6
## Coefficient for MoE loss. We find that 0.01 is a good value at least for
## 1.3B MoE-128 model
MLC=0.01
## Below configs adjust the MoE expert token capacity limit during training and
## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
## Larger capacity factor or disabling capacity limit could improve training
## convergence, but will also reduce training throughput.
MOE_TRAIN_CAP_FACTOR=1.0
MOE_EVAL_CAP_FACTOR=1.0
MOE_MIN_CAP=4
MOE_DROP_TOKEN="true"
# MOE_DROP_TOKEN="false"
###############################################################################
### Curriculum learning (CL) configs
## Enable/disable CL
CL_ENABLED="false"
## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
## for tuning the following configs
CL_START_SEQLEN=80
CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
CL_TOKENS=60
CL_TOKENS=$((${CL_TOKENS} * 1000000000))
CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
###############################################################################
### Misc configs
LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=10000
## Standard deviation for weight initialization
## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
## dense model. Usually larger model needs lower std.
INIT_STD=0.014
# INIT_STD=0.01
## Activation checkpointing saves GPU memory, but reduces training speed
ACTIVATION_CHECKPOINT="true"
# ACTIVATION_CHECKPOINT="false"
###############################################################################
### Output and data configs
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
host="${HOSTNAME}"
NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
NAME="${NAME}-ep-pyramid-64+128-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
if [ "${CL_ENABLED}" = "true" ]; then
NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
fi
OUTPUT_BASEPATH=$DIR/output
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
## Note that for a MoE model with a billion-scale base model, the checkpoint can
## be TB-scale, which normal NFS cannot handle efficiently.
CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
# USE_INTERNAL_DATA="true"
USE_INTERNAL_DATA="false"
if [ "${USE_INTERNAL_DATA}" = "true" ]; then
## The internal data is only accessible within Microsoft
## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
BASE_DATA_PATH=/vc_data/Megatron-LM/data
DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
## For cluster Lab-RR1-V100
# BASE_DATA_PATH=/data/Megatron-LM/data
# DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
## For cluster Azure-CentralUS-A100
# BASE_DATA_PATH=/data/Megatron-LM/data
# DATA_HOME=/vc_data_1/users/amawa/blended
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
0.01359 ${ARX} 0.01588 ${GIT}"
else
VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# The public Pile dataset; it can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
fi
###############################################################################
data_options=" \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--data-path ${DATA_BLEND} \
--data-impl mmap"
megatron_options=" \
--override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size ${MP_SIZE} \
--moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
--num-experts ${EP_SIZE} \
--moe-loss-coeff ${MLC} \
--mlp-type residual \
--moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
--moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
--moe-min-capacity ${MOE_MIN_CAP} \
--init-method-std ${INIT_STD} \
--lr-decay-tokens ${LR_DECAY_TOKENS} \
--lr-warmup-tokens ${WARMUP_TOKENS} \
--micro-batch-size ${BATCH_SIZE} \
--exit-duration-in-mins ${EXIT_DURATION} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTN_HEADS} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--train-tokens ${TRAIN_TOKENS} \
--train-iters ${TRAIN_ITERS} \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
--save-interval ${SAVE_INTERVAL} \
--weight-decay 0.1 \
--clip-grad 1.0 \
--hysteresis 2 \
--num-workers 0 \
--fp16 \
--load ${CHECKPOINT_PATH} \
--save ${CHECKPOINT_PATH} \
--tensorboard-queue-size 1 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR}"
if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
fi
megatron_options="${megatron_options} \
--create-moe-param-group"
if [ "${MOE_DROP_TOKEN}" = "false" ]; then
megatron_options="${megatron_options} \
--disable-moe-token-dropping"
fi
template_json="ds_config_gpt_Zero2_TEMPLATE.json"
config_json="ds_config_gpt_${NAME}.json"
sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
| sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
| sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
| sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
| sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
| sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
> ${config_json}
deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--pipeline-model-parallel-size ${PP_SIZE}"
# Currently MoE is not compatible with pipeline parallel
deepspeed_options="${deepspeed_options} \
--no-pipeline-parallel"
if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--deepspeed-activation-checkpointing"
fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x