run_gemini.sh 763 Bytes
Newer Older
1
2
3
# Print every command to stderr as it runs, so the launch log shows the exact
# invocation that was used.
set -o xtrace

# Tunables for this run. A non-empty value already present in the caller's
# environment wins; otherwise the default below is used.
: "${BS:=16}"     # forwarded to the training script as --batch_size
: "${MEMCAP:=0}"  # forwarded to the training script as --mem_cap
export BS MEMCAP
Alex_996's avatar
Alex_996 committed
4
# Model size suffix; it is appended to `facebook/opt-` to form the model path.
# Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7b`, `13b`, `30b`, `66b`.
# NOTE(review): the original comment continued with a remark about `175b`
# that is cut off in this copy of the file; restore it from upstream.
: "${MODEL:=125m}"

# Forwarded to torchrun as --nproc_per_node.
: "${GPUNUM:=1}"

# Set to "true" to pass --shardinit to the training script; any other value
# leaves the flag off.
: "${USE_SHARD_INIT:=false}"

export MODEL GPUNUM USE_SHARD_INIT
8
9
10
11

# The run's output is tee'd into ./logs, so make sure that directory exists
# before anything is launched.
[ -d ./logs ] || mkdir -p ./logs

12
13
14
15
16
17
# Translate the boolean-style USE_SHARD_INIT setting into the command-line
# flag handed to the training script: "--shardinit" when the value is exactly
# "true", otherwise an empty string so that no flag is passed.
#
# The expansion is quoted so that an empty value, or one containing spaces or
# test operators, is compared as a single string instead of being re-parsed
# by `[` (which would print an error or evaluate an unintended expression).
if [ "${USE_SHARD_INIT}" = "true" ]; then
  USE_SHARD_INIT="--shardinit"
else
  USE_SHARD_INIT=""
fi

18
19
20
21
22
23
24
25
26
# Model identifier handed to the training script via --model_name_or_path.
# NOTE: "MODLE_PATH" is a misspelling of "MODEL_PATH". The name is kept
# because the variable is exported and may be read by child processes.
export MODLE_PATH="facebook/opt-${MODEL}"

# Without pipefail the pipeline below would report tee's exit status, so a
# crashed training run would still look successful to the caller. The probe
# in a subshell keeps the script usable in shells that lack the option.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1
# NOTE(review): presumably these can be prefixed to the command to avoid
# network access; confirm before relying on it.
#
# Launch the training script under torchrun, sending stdout and stderr both to
# the terminal and to a log file named after the run's settings.
# ${USE_SHARD_INIT:+...} expands to nothing when the flag is empty, so no
# empty argument is passed to the script.
torchrun \
  --nproc_per_node "${GPUNUM}" \
  --master_port 19198 \
  train_gemini_opt.py \
  --mem_cap "${MEMCAP}" \
  --model_name_or_path "${MODLE_PATH}" \
  ${USE_SHARD_INIT:+"${USE_SHARD_INIT}"} \
  --batch_size "${BS}" 2>&1 | tee "./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log"