#!/usr/bin/env bash
# run_gemini.sh
#
# Launches ./train_gpt_demo.py with `torchrun --standalone` and mirrors the
# combined stdout/stderr of the run into a log file under ./gemini_logs.
#
# Every setting below may be overridden from the caller's environment, e.g.
#   GPUNUM=4 BATCH_SIZE=32 bash run_gemini.sh

# Trace every command so the resolved configuration shows up in the output.
set -x
# Make the final pipeline report torchrun's exit status rather than tee's, so
# a failed run is visible to whoever invoked this script.
set -o pipefail

# distplan in ["CAI_ZeRO1", "CAI_ZeRO2", "CAI_Gemini", "Pytorch_DDP", "Pytorch_ZeRO"]
export DISTPLAN="${DISTPLAN:-CAI_Gemini}"

# Run configuration: the caller's value wins, otherwise the default applies.
export GPUNUM="${GPUNUM:-1}"
export BATCH_SIZE="${BATCH_SIZE:-16}"
export MODEL_TYPE="${MODEL_TYPE:-gpt2_medium}"
export TRAIN_STEP="${TRAIN_STEP:-10}"
# export PYTHONPATH=$PWD:$PYTHONPATH

# Map USE_SHARD_INIT=True to the "--shardinit" flag; any other value, including
# unset, maps to the empty string. The expansion is quoted and defaulted so an
# unset variable no longer makes `[` fail with "unary operator expected".
# NOTE(review): the resulting value is not passed to torchrun below - confirm
# whether train_gpt_demo.py still accepts it before wiring it in or deleting it.
if [ "${USE_SHARD_INIT:-}" = "True" ]; then
  USE_SHARD_INIT="--shardinit"
else
  USE_SHARD_INIT=""
fi

mkdir -p gemini_logs

# NOTE(review): TPDEGREE and PLACEMENT get no default in this script and are
# not forwarded to the training script; they only label the log file and
# expand to empty strings when unset. The name shape is kept unchanged.
log_file="./gemini_logs/${MODEL_TYPE}_${DISTPLAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE:-}_${PLACEMENT:-}.log"

# stderr is merged into stdout so the log captures both streams.
torchrun --standalone --nproc_per_node="${GPUNUM}" ./train_gpt_demo.py \
  --model_type="${MODEL_TYPE}" \
  --batch_size="${BATCH_SIZE}" \
  --distplan="${DISTPLAN}" \
  --train_step="${TRAIN_STEP}" \
  2>&1 | tee "${log_file}"