# run_gpt_567B.sh
# Launches the GPT 567B training script on multiple nodes through mpirun.
# Usage: bash run_gpt_567B.sh [--profiling=<value>]
# Scan the command-line arguments for a --profiling option and store the text
# that follows its first '=' in $profiling. When several arguments match, the
# last one wins. An argument without '=' is stored unchanged, because
# ${para#*=} only strips a prefix that ends in '='.
#
# "$@" replaces the unquoted $* so each argument is examined exactly as it was
# passed, without word splitting or pathname expansion.
profiling=${profiling:-}    # keep an inherited value, otherwise define as empty
for para in "$@"; do
    if [[ $para == --profiling* ]]; then
        profiling=${para#*=}
    fi
done

# Settings that must be filled in before launching. All are empty placeholders.
# GPUS is passed to mpirun as -np, and GPUS / 8 selects both the
# train_gpt_567B_<N>nodes.sh script and the log file name.
# HOST and PORT are handed to that train script as its first two arguments.
GPUS=""                 # how many gpus to use
MPI_PORT=""             # mpi port to use (given to the rsh launcher as "-p <port>")
HOST=""                 # hostname
PORT=""                 # port id
DATA_PATH=""            # path to redpajama_text_document
TOKENIZER_MODEL_PATH="" # path to tokenizer.model
CHECKPOINT_PATH=""      # path to ckpt

# Runs GPT 567B model
#
# Starts ${GPUS} MPI ranks using the hosts in hostfile_gpt_567B. Each rank runs
# train_gpt_567B_<N>nodes.sh through `bash -c`, where N = GPUS / 8. Combined
# stdout and stderr of the launch go to log-<N>nodes-<date>-<HHMM>.log in the
# current directory.

# GPUS feeds both -np and the integer division below. Reject anything that is
# not a plain positive decimal number: an empty value would otherwise produce
# an arithmetic syntax error, and a leading zero would be read as octal.
if [[ ! ${GPUS} =~ ^[1-9][0-9]*$ ]]; then
    echo "GPUS must be set to a positive integer (got '${GPUS}')" >&2
    exit 1
fi

nodes=$((GPUS / 8))     # integer division; picks the train script and log name
log_file="log-${nodes}nodes-$(date +%F-%H%M).log"

# NOTE: the values below are expanded by this shell into the command string and
# then parsed again by the inner bash, so values containing whitespace or shell
# metacharacters will not reach the train script as single arguments.
mpirun -np "${GPUS}" --hostfile hostfile_gpt_567B \
                    --allow-run-as-root \
                    --bind-to none \
                    --mca plm_rsh_no_tree_spawn 1 \
                    --mca plm_rsh_args "-p ${MPI_PORT}" \
                    bash -c "
                    ./train_gpt_567B_${nodes}nodes.sh \
                    ${HOST} \
                    ${PORT} \
                    --data_path=$DATA_PATH \
                    --tokenizer_path=$TOKENIZER_MODEL_PATH \
                    --checkpoint_path=$CHECKPOINT_PATH \
                    --profiling=$profiling" > "${log_file}" 2>&1

# Kept from the original. Nothing in this block is started in the background,
# so there is nothing here for it to wait on.
wait