"src/routes/vscode:/vscode.git/clone" did not exist on "b986c2aefd34df8ee1000615fdb151d87d7b7432"
Commit e1354f9d authored by liangjing's avatar liangjing
Browse files

update

parents
Pipeline #1025 failed with stages
in 0 seconds
#!/bin/bash
# This example will start serving the 345M model.
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--top_p 0.9 \
--seed 42
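# Once the server above is running, it can be queried over HTTP. The request
# below is a minimal sketch that assumes the default flask-restful setup of
# tools/run_text_generation_server.py (a "/api" endpoint on port 5000); the
# exact host, port, and payload fields may differ in your Megatron version.
# curl -s "http://localhost:5000/api" \
#   -X PUT \
#   -H "Content-Type: application/json" \
#   -d '{"prompts": ["Hello, my name is"], "tokens_to_generate": 32}'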
#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel
DISTRIBUTED_ARGS="--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
pip install flask-restful
export CUDA_DEVICE_MAX_CONNECTIONS=1
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--top_p 0.9 \
--seed 42
#!/bin/bash
# SLURM options.
export SLURM_PARTITION=<slurm partition, used to feed -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed -A option in slurm>
# Source code.
export MEGATRON_CODE_DIR=<megatron source code directory>
# This variable is used to mount the relevant part of the filesystem
# inside the docker container. Note that the `MEGATRON_CODE_DIR` and the
# launch directory already get mounted; this variable should be used to
# mount the directories that contain the data and tokenizer files.
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
# Data and tokenizer files.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>
# Megatron input parameters.
# `MEGATRON_EXTRA_PARAMS` can be used to provide any extra parameters
# that are not listed here.
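# The run_*.sh scripts in this directory set TP, PP, MBS, GBS, NLS, HS, NAH,
# DDP, NNODES, and (optionally) MEGATRON_EXTRA_PARAMS before sourcing this file.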
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--micro-batch-size ${MBS} \
--global-batch-size ${GBS} \
--num-layers ${NLS} \
--hidden-size ${HS} \
--num-attention-heads ${NAH} \
--DDP-impl ${DDP} \
--data-path ${MEGATRON_DATA} \
--vocab-file ${BPE_VOCAB_FILE} \
--merge-file ${BPE_MERGE_FILE} \
--log-interval 5 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--train-iters 500 \
--lr-decay-iters 320 \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style cosine \
--lr-warmup-fraction 0.01 \
--split 969,30,1 \
--eval-iters 100 \
--eval-interval 1000 \
--clip-grad 1.0 \
--fp16 \
--loss-scale 8192 "
# Reproducing Figures in SC21 Paper
This directory contains some of the scripts that were used to produce the
results in the [Megatron paper](https://arxiv.org/pdf/2104.04473.pdf) that is
to appear at [SuperComputing 2021](https://sc21.supercomputing.org/). These
scripts use [Slurm](https://slurm.schedmd.com/documentation.html) with the
[pyxis plugin](https://github.com/NVIDIA/pyxis), but can be modified for other
schedulers as well.
## Setup
All the cluster-dependent variables are in [`CONFIG.sh`](./CONFIG.sh). Please
update the unspecified values (in angle brackets `<...>`) before launching any
scripts.
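For example, a filled-in `CONFIG.sh` might begin as follows (the partition name,
account, and paths below are placeholders; substitute the values for your own
cluster):

```bash
export SLURM_PARTITION=batch                  # placeholder partition
export SLURM_ACCOUNT=my_account               # placeholder account
export MEGATRON_CODE_DIR=/path/to/megatron-lm # placeholder source path
export DOCKER_MOUNT_DIR=/path/to/data         # placeholder data/tokenizer path
```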
## Scripts
Below is a list of scripts that can be used to reproduce various figures in our
[paper](https://arxiv.org/pdf/2104.04473.pdf):
* [run_table_1.sh](./run_table_1.sh): Table 1 showing weak-scaling throughput
for GPT models ranging from 1 billion to 1 trillion parameters.
* [run_figure_11.sh](./run_figure_11.sh): Figure 11 showing the weak-scaling
performance of pipeline parallelism.
* [run_figure_12.sh](./run_figure_12.sh): Figure 12 showing the effect of
the interleaved schedule on a 175B GPT model.
* [run_figure_13.sh](./run_figure_13.sh): Figure 13 showing the effect of
different degrees of pipeline and tensor model parallelism on a model with
162.2 billion parameters.
* [run_figure_14.sh](./run_figure_14.sh): Figure 14 showing the effect of
different degrees of data and pipeline model parallelism on a model with
5.9 billion parameters.
* [run_figure_15.sh](./run_figure_15.sh): Figure 15 showing the effect of
different degrees of data and tensor model parallelism on a model with
5.9 billion parameters.
* [run_figure_16.sh](./run_figure_16.sh): Figure 16 showing the effect of
microbatch size.
* [run_figure_17.sh](./run_figure_17.sh): Figure 17 showing the effect of
activation recomputation.
* [run_figure_18.sh](./run_figure_18.sh): Figure 18 showing the effect of
the scatter-gather communication optimization.
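For example, to collect a single Figure 11 data point, set the `PP` and `GBS`
variables at the top of `run_figure_11.sh` to the desired configuration and
launch the script (this assumes `CONFIG.sh` has already been filled in as
described above):

```bash
bash ./run_figure_11.sh
```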
#!/bin/bash
sbatch -p ${SLURM_PARTITION} \
-A ${SLURM_ACCOUNT} \
--job-name=${JOB_NAME} \
--nodes=${NNODES} \
--export=MEGATRON_CODE_DIR,MEGATRON_PARAMS,DOCKER_MOUNT_DIR SRUN.sh
exit 0
#!/bin/bash
#SBATCH -t 0:30:00 --exclusive --mem=0 --overcommit --ntasks-per-node=8
THIS_DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p ${THIS_DIR}/logs
CMD="python -u ${MEGATRON_CODE_DIR}/pretrain_gpt.py ${MEGATRON_PARAMS}"
srun -l \
--container-image "nvcr.io#nvidia/pytorch:20.12-py3" \
--container-mounts "${THIS_DIR}:${THIS_DIR},${MEGATRON_CODE_DIR}:${MEGATRON_CODE_DIR},${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" \
--output=${THIS_DIR}/logs/%x_%j_$DATETIME.log sh -c "${CMD}"
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [1, 2, 4, 8].
PP=1
# Batch size (global batch size) options = [8, 128].
GBS=8
# Set pipeline-parallel size options.
NLS=$((3*PP))
NNODES=${PP}
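# Weak scaling: e.g. PP=8 gives NLS=24 layers on 8 nodes (64 GPUs with TP=8).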
# Other params.
TP=8
MBS=1
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
# Name of the job.
export JOB_NAME=results_figure_11_pipeline_parallel_size_${PP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Interleaved schedule options = [YES, NO].
INTERLEAVED=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set interleaved schedule options.
if [ ${INTERLEAVED} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${INTERLEAVED} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_12_interleaved_${INTERLEAVED}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 128].
GBS=32
# Set pipeline-parallel and tensor-parallel size options.
TP=$((64/PP))
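# NNODES=8 (64 GPUs) is fixed, so TP*PP=64 for every configuration; e.g. PP=2 gives TP=32.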
# Other params.
MBS=1
NLS=32
HS=20480
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_13_pipeline_parallel_size_${PP}_tensor_parallel_size_${TP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Pipeline-parallel size options = [2, 4, 8, 16, 32].
PP=2
# Batch size (global batch size) options = [32, 512].
GBS=32
# Set pipeline-parallel and data-parallel size options.
DP=$((64/PP))
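# With TP=1 and 64 GPUs total, DP*PP=64; e.g. PP=2 gives DP=32.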
# Other params.
TP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_14_pipeline_parallel_size_${PP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Tensor-parallel size options = [2, 4, 8, 16, 32].
TP=2
# Batch size (global batch size) options = [32, 128, 512].
GBS=32
# Set tensor-parallel and data-parallel size options.
DP=$((64/TP))
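# With PP=1 and 64 GPUs total, DP*TP=64; e.g. TP=2 gives DP=32.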
# Other params.
PP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Microbatch size options = [1, 2, 4, 8].
MBS=1
# Batch size (global batch size) options = [128, 512].
GBS=128
# Other params.
TP=8
PP=8
NLS=32
HS=15360
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Activation recomputation options = [YES, NO].
ACTIVATION_RECOMPUTATION=YES
# Batch size (global batch size) options = [1, 2, 4, ..., 256].
GBS=1
# Set activation recomputation.
if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then
MEGATRON_EXTRA_PARAMS=""
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=16
MBS=1
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=16
# Name of the job.
export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Scatter-gather communication optimization options = [YES, NO].
SCATTER_GATHER=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set scatter-gather communication optimization options.
if [ ${SCATTER_GATHER} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${SCATTER_GATHER} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
MODEL_SIZE=1.7B
if [ ${MODEL_SIZE} == "1.7B" ]; then
TP=1
PP=1
MBS=16
GBS=512
NLS=24
HS=2304
NAH=24
DDP=torch
NNODES=4
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "3.6B" ]; then
TP=2
PP=1
MBS=16
GBS=512
NLS=30
HS=3072
NAH=32
DDP=torch
NNODES=8
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "7.5B" ]; then
TP=4
PP=1
MBS=16
GBS=512
NLS=36
HS=4096
NAH=32
DDP=torch
NNODES=16
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "18B" ]; then
TP=8
PP=1
MBS=8
GBS=1024
NLS=40
HS=6144
NAH=48
DDP=torch
NNODES=32
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "39B" ]; then
TP=8
PP=2
MBS=4
GBS=1536
NLS=48
HS=8192
NAH=64
DDP=local
NNODES=64
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "76B" ]; then
TP=8
PP=4
MBS=2
GBS=1792
NLS=60
HS=10240
NAH=80
DDP=local
NNODES=128
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
elif [ ${MODEL_SIZE} == "145B" ]; then
TP=8
PP=8
MBS=2
GBS=2304
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=192
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ ${MODEL_SIZE} == "310B" ]; then
TP=8
PP=16
MBS=1
GBS=2160
NLS=96
HS=16384
NAH=128
DDP=local
NNODES=240
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
elif [ ${MODEL_SIZE} == "530B" ]; then
TP=8
PP=35
MBS=1
GBS=2520
NLS=105
HS=20480
NAH=128
DDP=local
NNODES=315
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
elif [ ${MODEL_SIZE} == "1T" ]; then
TP=8
PP=64
MBS=1
GBS=3072
NLS=128
HS=25600
NAH=160
DDP=local
NNODES=384
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Name of the job
export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
{
"train_batch_size" : CONFIG_BATCH_SIZE,
"train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": ZERO_STAGE
},
"gradient_clipping": 1.0,
"prescale_gradients": PRESCALE_GRAD,
"fp16": {
"enabled": CONFIG_FP16_ENABLED,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"bf16": {
"enabled": CONFIG_BF16_ENABLED
},
"curriculum_learning": {
"enabled": CONFIG_CL_ENABLED,
"curriculum_type": "seqlen",
"min_difficulty": CONFIG_CL_MIN,
"max_difficulty": CONFIG_CL_MAX,
"schedule_type": "fixed_linear",
"schedule_config": {
"total_curriculum_step": CONFIG_CL_DURATION,
"difficulty_step": 8
}
},
"wall_clock_breakdown" : false
}
{
"train_batch_size" : CONFIG_BATCH_SIZE,
"train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
"steps_per_print": LOG_INTERVAL,
"zero_optimization": {
"stage": 2
},
"gradient_clipping": 1.0,
"prescale_gradients": false,
"fp16": {
"enabled": CONFIG_FP16_ENABLED,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 11
},
"bf16": {
"enabled": CONFIG_BF16_ENABLED
},
"curriculum_learning": {
"enabled": CONFIG_CL_ENABLED,
"curriculum_type": "seqlen",
"min_difficulty": CONFIG_CL_MIN,
"max_difficulty": CONFIG_CL_MAX,
"schedule_type": "fixed_linear",
"schedule_config": {
"total_curriculum_step": CONFIG_CL_DURATION,
"difficulty_step": 8
}
},
"wall_clock_breakdown" : false
}
# This is an example zero-shot eval script. Please first read readme_evalharness.md in the same directory.
CHECKPOINT_PATH=/blob/users/conglli/project/gpt3_with_pile/checkpoint/gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B/global_step81566/
CONFIG_PATH=ds_config_gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B.json
RESULT_PATH=gpt3-with-pile-0.125B-lr-2.4e-3-minlr-6.0e-5-bs-2048-gpus-128-zero-0-mp-1-pp-1-no_pp-cl-startseqlen-72-step-20728-token-45B_global_step81566.log
PP_SIZE=1
TP_SIZE=1
NO_PP="true"
EP_PARALLEL_SIZE=1
# Currently the eval harness does not support data parallelism.
# However, for MoE models it is possible to enable a "fake data parallelism"
# in order to load experts on multiple GPUs. It is not real data parallelism
# because the same data is loaded on all GPUs.
# On the other hand, it is better to use fewer GPUs than in training,
# to reduce communication overhead.
NUM_NODE=1
NUM_GPU_PER_NODE=1
TASKS="lambada"
# WikiText-2, not used in GPT-3 paper but used in GPT-2 paper
# TASKS="wikitext"
# Tasks that appeared in the GPT-3 paper (sorted by their order in the paper), plus WikiText-2.
# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext"
# All tasks confirmed to work; there are more tasks on https://github.com/EleutherAI/lm-evaluation-harness that we did not test.
# TASKS="hellaswag,lambada,triviaqa,webqs,winogrande,piqa,arc_challenge,arc_easy,openbookqa,race,boolq,cb,copa,rte,wic,wsc,multirc,record,anli_r1,anli_r2,anli_r3,wikitext,logiqa,mathqa,mc_taco,mrpc,prost,pubmedqa,qnli,qqp,sciq,sst,wnli"
VOCAB_FILE=/data/Megatron-LM/data/gpt2-vocab.json
MERGE_FILE=/data/Megatron-LM/data/gpt2-merges.txt
# export HF_DATASETS_OFFLINE=1
# Dummy arguments to make Megatron happy. There is no need to configure them;
# these (and many other) arguments do not need to be configured because the
# eval framework reads the actual values from the checkpoint file.
MEGATRON_REQUIRED_ARGS="\
--num-layers -1\
--hidden-size -1\
--num-attention-heads -1\
--seq-length -1 \
--max-position-embeddings -1
"
CMD="../../tasks/eval_harness/evaluate.py \
--load $CHECKPOINT_PATH\
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE\
--moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
--vocab-file $VOCAB_FILE\
--merge-file $MERGE_FILE\
--micro-batch-size 12\
--no-load-optim \
--no-load-rng \
--inference \
--disable-moe-token-dropping \
--tokenizer-type GPT2BPETokenizer \
--adaptive_seq_len\
--eval_fp32\
--task_list $TASKS\
--results_path $RESULT_PATH \
--deepspeed \
--deepspeed_config $CONFIG_PATH \
$MEGATRON_REQUIRED_ARGS\
"
if [[ "${NO_PP}" = "true" ]]; then
CMD="${CMD} \
--no-pipeline-parallel"
fi
LAUNCHER="deepspeed --num_nodes $NUM_NODE --num_gpus $NUM_GPU_PER_NODE"
$LAUNCHER $CMD
#!/bin/bash
DIR=`pwd`
###############################################################################
### Main configs
## GPT-3 models use 2K sequence length/context window
SEQ_LEN=2048
### The "GPT-3 XXX" below are configs from GPT-3 paper
### https://arxiv.org/abs/2005.14165, choose based on
### your desired model size or build your own configs
## GPT-3 Small 125M
# MODEL_SIZE=0.125
# NUM_LAYERS=12
# HIDDEN_SIZE=768
# NUM_ATTN_HEADS=12
# GLOBAL_BATCH_SIZE=256
# LR=6.0e-4
# MIN_LR=6.0e-5
## GPT-3 Medium 350M
# MODEL_SIZE=0.35
# NUM_LAYERS=24
# HIDDEN_SIZE=1024
# NUM_ATTN_HEADS=16
# GLOBAL_BATCH_SIZE=256
# LR=3.0e-4
# MIN_LR=3.0e-5
## GPT-3 Large 760M
# MODEL_SIZE=0.76
# NUM_LAYERS=24
# HIDDEN_SIZE=1536
# NUM_ATTN_HEADS=16
# GLOBAL_BATCH_SIZE=256
# LR=2.5e-4
# MIN_LR=2.5e-5
## GPT-3 XL 1.3B
MODEL_SIZE=1.3
NUM_LAYERS=24
HIDDEN_SIZE=2048
NUM_ATTN_HEADS=16
GLOBAL_BATCH_SIZE=512
# LR=2.0e-4
# MIN_LR=2.0e-5
## GPT-3 2.7B
# MODEL_SIZE=2.7
# NUM_LAYERS=32
# HIDDEN_SIZE=2560
# NUM_ATTN_HEADS=32
# GLOBAL_BATCH_SIZE=512
# LR=1.6e-4
# MIN_LR=1.6e-5
## GPT-3 6.7B
# MODEL_SIZE=6.7
# NUM_LAYERS=32
# HIDDEN_SIZE=4096
# NUM_ATTN_HEADS=32
# GLOBAL_BATCH_SIZE=1024
# LR=1.2e-4
# MIN_LR=1.2e-5
## GPT-3 13B
# MODEL_SIZE=13
# NUM_LAYERS=40
# HIDDEN_SIZE=5120
# NUM_ATTN_HEADS=40
# GLOBAL_BATCH_SIZE=1024
# LR=1.0e-4
# MIN_LR=1.0e-5
## GPT-3 175B
# MODEL_SIZE=175
# NUM_LAYERS=96
# HIDDEN_SIZE=12288
# NUM_ATTN_HEADS=96
# GLOBAL_BATCH_SIZE=1536
# LR=0.6e-4
# MIN_LR=0.6e-5
###############################################################################
### Training duration configs
## The main termination condition; the original GPT-3 paper trains for 300B tokens.
## For MoE models, we found that training a bit longer, to 330B tokens, sometimes helps.
TRAIN_TOKENS=300000000000
# TRAIN_TOKENS=330000000000
## TRAIN_ITERS is another termination condition and also affects the number of
## data samples to be indexed. Since we want to reach the TRAIN_TOKENS above,
## and techniques like curriculum learning consume fewer tokens in some steps,
## we just set this config large enough to make sure we have enough processed
## data and don't terminate by TRAIN_ITERS.
TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
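## With the GPT-3 XL defaults above (300B tokens, GLOBAL_BATCH_SIZE=512, SEQ_LEN=2048),
## this works out to roughly 300e9*3/(512*2048) ~= 858k iterations, i.e. about 3x
## more than needed to reach TRAIN_TOKENS.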
## Another termination condition in minutes. Set it large enough to avoid
## undesired early termination.
EXIT_DURATION=30000000
###############################################################################
### LR configs
## LR warmup and decay duration. This token-based config is preferable since it
## does not need to be readjusted when the batch size/seqlen is changed.
## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
## For MoE model, we found that setting the decay token to 300B helps.
WARMUP_TOKENS=375000000
# LR_DECAY_TOKENS=260000000000
LR_DECAY_TOKENS=300000000000
###############################################################################
### Parallelism configs
## Micro batch size per GPU
## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
BATCH_SIZE=8
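## With the defaults above: 512*1*1/64 = 8, so BATCH_SIZE=8 is exactly at this limit.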
## Model parallelism, 1 is no MP
MP_SIZE=1
## Pipeline parallelism
## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
## to 1 and use the "--no-pipeline-parallel" arg.
PP_SIZE=1
NUM_GPUS=64
###############################################################################
### MoE configs
## Number of experts. EP_SIZE 1 means dense model without MoE
# EP_SIZE=1
EP_SIZE=128
if [[ $EP_SIZE -gt $NUM_GPUS ]]; then
EP_PARALLEL_SIZE=$NUM_GPUS
else
EP_PARALLEL_SIZE=$EP_SIZE
fi
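## With EP_SIZE=128 experts and NUM_GPUS=64, EP_PARALLEL_SIZE becomes 64,
## i.e. the experts are spread across all 64 GPUs (2 experts per expert-parallel rank).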
## The original GPT-3 models always set the min LR at 10% of the max LR. For MoE
## models, we found that a lower LR and min LR (than the base dense model) help.
## For the 1.3B MoE-128 model we used LR=1.2e-4 and MIN_LR=1.0e-6.
## For the 350M MoE-128 model we used LR=2.0e-4 and MIN_LR=2.0e-6, but they are
## not heavily tuned.
LR=1.2e-4
MIN_LR=1.0e-6
## Coefficient for MoE loss. We find that 0.01 is a good value at least for
## 1.3B MoE-128 model
MLC=0.01
## Below configs adjust the MoE expert token capacity limit during training and
## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
## Larger capacity factor or disabling capacity limit could improve training
## convergence, but will also reduce training throughput.
MOE_TRAIN_CAP_FACTOR=1.0
MOE_EVAL_CAP_FACTOR=1.0
MOE_MIN_CAP=4
MOE_DROP_TOKEN="true"
# MOE_DROP_TOKEN="false"
###############################################################################
### Curriculum learning (CL) configs
## Enable/disable CL
CL_ENABLED="false"
## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
## for tuning the following configs
CL_START_SEQLEN=80
CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
CL_TOKENS=60
CL_TOKENS=$((${CL_TOKENS} * 1000000000))
CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
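## With the defaults above: CL_AVG_SEQLEN = (80+2048)/2 = 1064 and CL_TOKENS = 60B,
## so CL_STEP = 60e9 / (512*1064) ~= 110k steps for the curriculum schedule.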
###############################################################################
### Misc configs
LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=10000
## Standard deviation for weight initialization
## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
## dense model. Usually larger model needs lower std.
INIT_STD=0.014
# INIT_STD=0.01
## Activation checkpointing saves GPU memory, but reduces training speed
ACTIVATION_CHECKPOINT="true"
# ACTIVATION_CHECKPOINT="false"
###############################################################################
### Output and data configs
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
host="${HOSTNAME}"
NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
if [[ $EP_SIZE -gt 1 ]]; then
NAME="${NAME}-ep-${EP_SIZE}-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
fi
if [ "${CL_ENABLED}" = "true" ]; then
NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
fi
OUTPUT_BASEPATH=$DIR/output
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
## Note that for a MoE model with a billion-scale base model, the checkpoint can
## be TB-scale, which normal NFS cannot handle efficiently.
CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
# USE_INTERNAL_DATA="true"
USE_INTERNAL_DATA="false"
if [ "${USE_INTERNAL_DATA}" = "true" ]; then
## The internal data is only accessible within Microsoft
## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
# BASE_DATA_PATH=/vc_data/Megatron-LM/data
# DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
## For cluster Lab-RR1-V100
BASE_DATA_PATH=/data/Megatron-LM/data
DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
## For cluster Azure-CentralUS-A100
# BASE_DATA_PATH=/data/Megatron-LM/data
# DATA_HOME=/vc_data_1/users/amawa/blended
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
0.01359 ${ARX} 0.01588 ${GIT}"
else
VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# The public Pile dataset; it can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
fi
###############################################################################
data_options=" \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--data-path ${DATA_BLEND} \
--data-impl mmap"
megatron_options=" \
--override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size ${MP_SIZE} \
--moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
--num-experts ${EP_SIZE} \
--moe-loss-coeff ${MLC} \
--moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
--moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
--moe-min-capacity ${MOE_MIN_CAP} \
--init-method-std ${INIT_STD} \
--lr-decay-tokens ${LR_DECAY_TOKENS} \
--lr-warmup-tokens ${WARMUP_TOKENS} \
--micro-batch-size ${BATCH_SIZE} \
--exit-duration-in-mins ${EXIT_DURATION} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTN_HEADS} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--train-tokens ${TRAIN_TOKENS} \
--train-iters ${TRAIN_ITERS} \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
--save-interval ${SAVE_INTERVAL} \
--weight-decay 0.1 \
--clip-grad 1.0 \
--hysteresis 2 \
--num-workers 0 \
--fp16 \
--load ${CHECKPOINT_PATH} \
--save ${CHECKPOINT_PATH} \
--tensorboard-queue-size 1 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR}"
if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
fi
if [[ $EP_SIZE -gt 1 ]]; then
megatron_options="${megatron_options} \
--create-moe-param-group"
fi
if [ "${MOE_DROP_TOKEN}" = "false" ]; then
megatron_options="${megatron_options} \
--disable-moe-token-dropping"
fi
template_json="ds_config_gpt_TEMPLATE.json"
config_json="ds_config_gpt_${NAME}.json"
sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
| sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
| sed "s/ZERO_STAGE/0/" \
| sed "s/PRESCALE_GRAD/true/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
| sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
| sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
| sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
| sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
> ${config_json}
deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--pipeline-model-parallel-size ${PP_SIZE}"
# Currently MoE is not compatible with pipeline parallel
if [[ $EP_SIZE -gt 1 ]]; then
deepspeed_options="${deepspeed_options} \
--no-pipeline-parallel"
fi
if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--deepspeed-activation-checkpointing"
fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x
#!/bin/bash
DIR=`pwd`
###############################################################################
### Main configs
## GPT-3 models use 2K sequence length/context window
SEQ_LEN=2048
### The "GPT-3 XXX" below are configs from GPT-3 paper
### https://arxiv.org/abs/2005.14165, choose based on
### your desired model size or build your own configs
## GPT-3 Small 125M
# MODEL_SIZE=0.125
# NUM_LAYERS=12
# HIDDEN_SIZE=768
# NUM_ATTN_HEADS=12
# GLOBAL_BATCH_SIZE=256
# LR=6.0e-4
# MIN_LR=6.0e-5
## GPT-3 Medium 350M
# MODEL_SIZE=0.35
# NUM_LAYERS=24
# HIDDEN_SIZE=1024
# NUM_ATTN_HEADS=16
# GLOBAL_BATCH_SIZE=256
# LR=3.0e-4
# MIN_LR=3.0e-5
## GPT-3 Large 760M
# MODEL_SIZE=0.76
# NUM_LAYERS=24
# HIDDEN_SIZE=1536
# NUM_ATTN_HEADS=16
# GLOBAL_BATCH_SIZE=256
# LR=2.5e-4
# MIN_LR=2.5e-5
## GPT-3 XL 1.3B
MODEL_SIZE=1.3
NUM_LAYERS=24
HIDDEN_SIZE=2048
NUM_ATTN_HEADS=16
GLOBAL_BATCH_SIZE=512
# LR=2.0e-4
# MIN_LR=2.0e-5
## GPT-3 2.7B
# MODEL_SIZE=2.7
# NUM_LAYERS=32
# HIDDEN_SIZE=2560
# NUM_ATTN_HEADS=32
# GLOBAL_BATCH_SIZE=512
# LR=1.6e-4
# MIN_LR=1.6e-5
## GPT-3 6.7B
# MODEL_SIZE=6.7
# NUM_LAYERS=32
# HIDDEN_SIZE=4096
# NUM_ATTN_HEADS=32
# GLOBAL_BATCH_SIZE=1024
# LR=1.2e-4
# MIN_LR=1.2e-5
## GPT-3 13B
# MODEL_SIZE=13
# NUM_LAYERS=40
# HIDDEN_SIZE=5120
# NUM_ATTN_HEADS=40
# GLOBAL_BATCH_SIZE=1024
# LR=1.0e-4
# MIN_LR=1.0e-5
## GPT-3 175B
# MODEL_SIZE=175
# NUM_LAYERS=96
# HIDDEN_SIZE=12288
# NUM_ATTN_HEADS=96
# GLOBAL_BATCH_SIZE=1536
# LR=0.6e-4
# MIN_LR=0.6e-5
###############################################################################
### Training duration configs
## The main termination condition; the original GPT-3 paper trains for 300B tokens.
## For MoE models, we found that training a bit longer, to 330B tokens, sometimes helps.
TRAIN_TOKENS=300000000000
# TRAIN_TOKENS=330000000000
## TRAIN_ITERS is another termination condition and also affects the number of
## data samples to be indexed. Since we want to reach the TRAIN_TOKENS above,
## and techniques like curriculum learning consume fewer tokens in some steps,
## we just set this config large enough to make sure we have enough processed
## data and don't terminate by TRAIN_ITERS.
TRAIN_ITERS=$(( ${TRAIN_TOKENS} * 3 / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
## Another termination condition in minutes. Set it large enough to avoid
## undesired early termination.
EXIT_DURATION=30000000
###############################################################################
### LR configs
## LR warmup and decay duration. This token-based config is preferable since it
## does not need to be readjusted when the batch size/seqlen is changed.
## Original GPT-3 paper uses 375M warmup tokens and 260B decay tokens.
## For MoE model, we found that setting the decay token to 300B helps.
WARMUP_TOKENS=375000000
# LR_DECAY_TOKENS=260000000000
LR_DECAY_TOKENS=300000000000
###############################################################################
### Parallelism configs
## Micro batch size per GPU
## Make sure that BATCH_SIZE <= GLOBAL_BATCH_SIZE*PP_SIZE*MP_SIZE/NUM_GPUS
BATCH_SIZE=8
## Model parallelism, 1 is no MP
MP_SIZE=1
## Pipeline parallelism
## Currently we don't support PP for MoE. To disable PP, set PP_SIZE
## to 1 and use the "--no-pipeline-parallel" arg.
PP_SIZE=1
NUM_GPUS=64
###############################################################################
### MoE configs
## Number of experts. A single value (e.g. EP_SIZE=128) gives a standard MoE with
## the same number of experts in every MoE layer; the list below sets a per-MoE-layer
## expert count for the pyramid PR-MoE model.
# EP_SIZE=128
EP_SIZE="64 64 64 64 64 64 64 64 64 64 128 128"
EP_PARALLEL_SIZE=$NUM_GPUS
## The original GPT-3 models always set the min LR at 10% of the max LR. For MoE
## models, we found that a lower LR and min LR (than the base dense model) help.
## For the 1.3B PR-MoE-64/128 model we used LR=1.2e-4 and MIN_LR=1.0e-6, but they
## are not heavily tuned.
LR=1.2e-4
MIN_LR=1.0e-6
## Coefficient for MoE loss. We find that 0.01 is a good value at least for
## 1.3B MoE-128 model
MLC=0.01
## Below configs adjust the MoE expert token capacity limit during training and
## eval. To completely disable capacity limit, set MOE_DROP_TOKEN to false.
## Larger capacity factor or disabling capacity limit could improve training
## convergence, but will also reduce training throughput.
MOE_TRAIN_CAP_FACTOR=1.0
MOE_EVAL_CAP_FACTOR=1.0
MOE_MIN_CAP=4
MOE_DROP_TOKEN="true"
# MOE_DROP_TOKEN="false"
###############################################################################
### Curriculum learning (CL) configs
## Enable/disable CL
CL_ENABLED="false"
## Consult the tutorial https://www.deepspeed.ai/tutorials/curriculum-learning/
## for tuning the following configs
CL_START_SEQLEN=80
CL_AVG_SEQLEN=$(( (${CL_START_SEQLEN} + ${SEQ_LEN}) / 2 ))
CL_TOKENS=60
CL_TOKENS=$((${CL_TOKENS} * 1000000000))
CL_STEP=$(( ${CL_TOKENS} / (${GLOBAL_BATCH_SIZE} * ${CL_AVG_SEQLEN}) ))
###############################################################################
### Misc configs
LOG_INTERVAL=10
EVAL_ITERS=10
EVAL_INTERVAL=100
SAVE_INTERVAL=10000
## Standard deviation for weight initialization
## We used 0.014 for 350M/1.3B dense/MoE models, and used 0.01 for 6.7B
## dense model. Usually larger model needs lower std.
INIT_STD=0.014
# INIT_STD=0.01
## Activation checkpointing saves GPU memory, but reduces training speed
ACTIVATION_CHECKPOINT="true"
# ACTIVATION_CHECKPOINT="false"
###############################################################################
### Output and data configs
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
host="${HOSTNAME}"
NAME="gpt-${MODEL_SIZE}B-lr-${LR}-minlr-${MIN_LR}-bs-${GLOBAL_BATCH_SIZE}-gpus-${NUM_GPUS}-mp-${MP_SIZE}-pp-${PP_SIZE}"
NAME="${NAME}-ep-pyramid-64+128-mlc-${MLC}-cap-${MOE_TRAIN_CAP_FACTOR}-drop-${MOE_DROP_TOKEN}"
if [ "${CL_ENABLED}" = "true" ]; then
NAME="${NAME}-cl-${CL_START_SEQLEN}-${CL_STEP}"
fi
OUTPUT_BASEPATH=$DIR/output
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${host}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
## Note that for a MoE model with a billion-scale base model, the checkpoint can
## be TB-scale, which normal NFS cannot handle efficiently.
CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
# USE_INTERNAL_DATA="true"
USE_INTERNAL_DATA="false"
if [ "${USE_INTERNAL_DATA}" = "true" ]; then
## The internal data is only accessible within Microsoft
## For cluster Azure-EastUS-V100-32GB-4, Azure-WestUS3-A100
BASE_DATA_PATH=/vc_data/Megatron-LM/data
DATA_HOME="/vc_data/pile-cc1-cc2-shuf"
## For cluster Lab-RR1-V100
# BASE_DATA_PATH=/data/Megatron-LM/data
# DATA_HOME="/turing-ssd/users/conglli/data/pile-cc1-cc2-shuf"
## For cluster Azure-CentralUS-A100
# BASE_DATA_PATH=/data/Megatron-LM/data
# DATA_HOME=/vc_data_1/users/amawa/blended
VOCAB_PATH=${BASE_DATA_PATH}/gpt2-vocab.json
MERGE_PATH=${BASE_DATA_PATH}/gpt2-merges.txt
ARX="${DATA_HOME}/ArXiv_ftfy_cleaned_id_shuf_text_document"
BC2="${DATA_HOME}/BookCorpus2_ftfy_cleaned_id_shuf_text_document"
B3="${DATA_HOME}/Books3_ftfy_cleaned_id_shuf_text_document"
CC2020="${DATA_HOME}/CC-2020-50_id_cleaned_shuf_text_document"
CC2021="${DATA_HOME}/CC-2021-04_id_cleaned_shuf_text_document"
GIT="${DATA_HOME}/Github_ftfy_id_shuf_text_document"
GUT="${DATA_HOME}/Gutenberg_PG-19_ftfy_cleaned_id_cleaned_shuf_text_document"
NIH="${DATA_HOME}/NIH_ExPorter_ftfy_id_shuf_text_document"
OWT2="${DATA_HOME}/OpenWebText2_ftfy_cleaned_id_shuf_text_document"
PCC="${DATA_HOME}/Pile-CC_id_cleaned_shuf_text_document"
PM="${DATA_HOME}/PubMed_Abstracts_ftfy_id_shuf_text_document"
RN="${DATA_HOME}/rn_dedup_shuf_cleaned_0.7_cleaned_shuf_text_document"
SE="${DATA_HOME}/StackExchange_ftfy_id_shuf_text_document"
ST="${DATA_HOME}/stories_dedup0.7_shuf_cleaned_shuf_text_document"
WIK="${DATA_HOME}/Wikipedia_en_ftfy_id_shuf_text_document"
DATA_BLEND="0.14336 ${B3} 0.08962 ${RN} 0.19336 ${OWT2} 0.05689 ${SE} \
0.00859 ${ST} 0.02897 ${PM} 0.04771 ${WIK} 0.00873 ${GUT} 0.01007 ${BC2} \
0.00208 ${NIH} 0.13017 ${CC2020} 0.09446 ${PCC} 0.15652 ${CC2021} \
0.01359 ${ARX} 0.01588 ${GIT}"
else
VOCAB_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-vocab.json
MERGE_PATH=/data/the_pile_public_merged_nopreprocessing/gpt2-merges.txt
# The public Pile dataset; it can be downloaded at https://mystic.the-eye.eu/public/AI/pile_neox/
DATA_BLEND=/data/the_pile_public_merged_nopreprocessing/pile_text_document
fi
###############################################################################
data_options=" \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--data-path ${DATA_BLEND} \
--data-impl mmap"
megatron_options=" \
--override-opt_param-scheduler \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensor-model-parallel-size ${MP_SIZE} \
--moe-expert-parallel-size ${EP_PARALLEL_SIZE} \
--num-experts ${EP_SIZE} \
--moe-loss-coeff ${MLC} \
--mlp-type residual \
--moe-train-capacity-factor ${MOE_TRAIN_CAP_FACTOR} \
--moe-eval-capacity-factor ${MOE_EVAL_CAP_FACTOR} \
--moe-min-capacity ${MOE_MIN_CAP} \
--init-method-std ${INIT_STD} \
--lr-decay-tokens ${LR_DECAY_TOKENS} \
--lr-warmup-tokens ${WARMUP_TOKENS} \
--micro-batch-size ${BATCH_SIZE} \
--exit-duration-in-mins ${EXIT_DURATION} \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTN_HEADS} \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--train-tokens ${TRAIN_TOKENS} \
--train-iters ${TRAIN_ITERS} \
--lr ${LR} \
--min-lr ${MIN_LR} \
--lr-decay-style cosine \
--split 98,2,0 \
--log-interval ${LOG_INTERVAL} \
--eval-interval ${EVAL_INTERVAL} \
--eval-iters ${EVAL_ITERS} \
--save-interval ${SAVE_INTERVAL} \
--weight-decay 0.1 \
--clip-grad 1.0 \
--hysteresis 2 \
--num-workers 0 \
--fp16 \
--load ${CHECKPOINT_PATH} \
--save ${CHECKPOINT_PATH} \
--tensorboard-queue-size 1 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR}"
if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
megatron_options="${megatron_options} \
--checkpoint-activations"
fi
megatron_options="${megatron_options} \
--create-moe-param-group"
if [ "${MOE_DROP_TOKEN}" = "false" ]; then
megatron_options="${megatron_options} \
--disable-moe-token-dropping"
fi
template_json="ds_config_gpt_Zero2_TEMPLATE.json"
config_json="ds_config_gpt_${NAME}.json"
sed "s/CONFIG_BATCH_SIZE/${GLOBAL_BATCH_SIZE}/" ${template_json} \
| sed "s/CONFIG_MBSIZE/${BATCH_SIZE}/" \
| sed "s/LOG_INTERVAL/${LOG_INTERVAL}/" \
| sed "s/CONFIG_FP16_ENABLED/true/" \
| sed "s/CONFIG_BF16_ENABLED/false/" \
| sed "s/CONFIG_CL_ENABLED/${CL_ENABLED}/" \
| sed "s/CONFIG_CL_MIN/${CL_START_SEQLEN}/" \
| sed "s/CONFIG_CL_MAX/${SEQ_LEN}/" \
| sed "s/CONFIG_CL_DURATION/${CL_STEP}/" \
> ${config_json}
deepspeed_options=" \
--deepspeed \
--deepspeed_config ${config_json} \
--pipeline-model-parallel-size ${PP_SIZE}"
# Currently MoE is not compatible with pipeline parallel
deepspeed_options="${deepspeed_options} \
--no-pipeline-parallel"
if [ "${ACTIVATION_CHECKPOINT}" = "true" ]; then
deepspeed_options="${deepspeed_options} \
--deepspeed-activation-checkpointing"
fi
run_cmd="deepspeed ${DIR}/../../pretrain_gpt.py ${megatron_options} ${data_options} ${deepspeed_options} &> ${OUTPUT_BASEPATH}/log/${NAME}_${host}_${current_time}.log"
echo ${run_cmd}
eval ${run_cmd}
set +x