update

3f451e07 · liangjing · eeac1cc4 · 3f451e07 · 3f451e07 · 3f451e07
Commit 3f451e07 authored Jul 25, 2023 by liangjing
Hide whitespace changes
Inline Side-by-side

Showing with 190 additions and 0 deletions

lightop-0.1-cp37-cp37m-linux_x86_64.whl lightop-0.1-cp37-cp37m-linux_x86_64.whl +0 -0

run-16B-fp16.sh run-16B-fp16.sh +31 -0

single-16B-fp16.sh single-16B-fp16.sh +159 -0

No files found.
--- a/lightop-0.1-cp37-cp37m-linux_x86_64.whl
+++ b/lightop-0.1-cp37-cp37m-linux_x86_64.whl
--- a/run-16B-fp16.sh
+++ b/run-16B-fp16.sh
+#!/bin/bash
+#SBATCH -p tydexclu01
+#SBATCH -N 16
+#SBATCH --cpus-per-task=1
+#SBATCH --ntasks-per-node=32
+#SBATCH --mem 0
+#SBATCH --gres=dcu:4
+#SBATCH -J gpt2
+#SBATCH -o logs/gpt2-16B-%j.out
+#SBATCH -e logs/gpt2-16B-%j.out
+ulimit -u 200000
+export NCCL_IB_HCA=mlx5
+export NCCL_SOCKET_IFNAME=ib0
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export OMP_NUM_THREADS=1
+echo "START TIME: $(date)"
+rm -f ./hostfile/*
+rm -f core.*
+hostfile=./hostfile/$SLURM_JOB_ID
+scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
+for i in `cat $hostfile`
+do
+    echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
+done
+np=$(cat $hostfile|sort|uniq |wc -l)
+np=$(($np*4))
+nodename=$(cat $hostfile |sed -n "1p")
+dist_url=`echo $nodename | awk '{print $1}'`
+mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single-16B-fp16.sh $dist_url
--- a/single-16B-fp16.sh
+++ b/single-16B-fp16.sh
+#!/bin/bash
+# Per-rank worker script: started by mpirun once per rank, with the
+# rendezvous host name as the first positional argument.
+
+# Communication / runtime environment.
+export NCCL_SOCKET_IFNAME=ib0 NCCL_IB_HCA=mlx5
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export MIOPEN_FIND_MODE=3 ROCBLAS_COMPUTETYPE_FP16R=0
+
+# Rank information taken from the Open MPI environment.
+lrank="${OMPI_COMM_WORLD_LOCAL_RANK}"
+RANK="${OMPI_COMM_WORLD_RANK}"
+WORLD_SIZE="${OMPI_COMM_WORLD_SIZE}"
+
+# Model name; reused in the checkpoint, tensorboard and codecarbon paths.
+MODEL_NAME="gpt2-oscar_16B-4tp"
+
+# Input and output locations.
+DATA_OUTPUT_PATH="./"
+LOGS_PATH="${DATA_OUTPUT_PATH}/logs"
+CHECKPOINT_PATH="./output-module/${MODEL_NAME}"
+DATA_PATH="my-gpt2_text_document"
+TENSORBOARD_PATH="output_dir/tensorboard/${MODEL_NAME}"
+CODECARBON_PATH="output_dir/codecarbon/${MODEL_NAME}"
+
+# Parallel layout.
+TP_SIZE=4   # tensor parallelism: always fixed to the size of a single node
+PP_SIZE=4   # pipeline parallelism: NLAYERS must be a multiple of PP_SIZE here
+
+# Batch sizes.
+MICRO_BATCH_SIZE=1
+GLOBAL_BATCH_SIZE=128
+
+# Model shape.
+NLAYERS=40
+NHIDDEN=5760
+NHEADS=24
+SEQ_LEN=2048
+
+# Checkpoint save interval, in iterations.
+SAVE_INTERVAL=1000
+OPTIMIZER_ARGS=" \
+    --optimizer adam \
+    --adam-beta1 0.9 \
+    --adam-beta2 0.95 \
+    --adam-eps 1e-8 \
+    --lr 6.0e-5 \
+    --min-lr 6.0e-6 \
+    --lr-decay-style cosine \
+    --clip-grad 1.0 \
+    --weight-decay 1e-1 \
+    "
+GPT_ARGS=" \
+    --num-layers $NLAYERS \
+    --hidden-size $NHIDDEN \
+    --num-attention-heads $NHEADS \
+    --seq-length $SEQ_LEN \
+    --max-position-embeddings $SEQ_LEN \
+    --micro-batch-size $MICRO_BATCH_SIZE \
+    --global-batch-size $GLOBAL_BATCH_SIZE \
+    --train_iters 8000 \
+    --loss-scale 12 \
+    --vocab-file gpt2-vocab.json \
+    --merge-file gpt2-merges.txt \
+    --clip-grad 1.0 \
+    --checkpoint-activations \
+    --seed 42
+    $OPTIMIZER_ARGS \
+    "
+OUTPUT_ARGS=" \
+    --log-interval 1 \
+    --save-interval $SAVE_INTERVAL \
+    --eval-interval 1000 \
+    --eval-iters 40 \
+    --tensorboard-dir $TENSORBOARD_PATH \
+    --tensorboard-queue-size 5 \
+    --log-timers-to-tensorboard \
+    --log-batch-size-to-tensorboard \
+    --log-validation-ppl-to-tensorboard \
+    "
+DATA_ARGS=" \
+    --save $CHECKPOINT_PATH \
+    --load $CHECKPOINT_PATH \
+    --data-path $DATA_PATH \
+    "
+# ZeRO stage; written into the JSON config and also passed on the command line.
+ZERO_STAGE=1
+ds_config_path="./${MODEL_NAME}_ds_config.json"
+# Generate the DeepSpeed config. The here-document delimiter is unquoted so
+# the batch sizes and the ZeRO stage are substituted into the JSON.
+cat > "${ds_config_path}" <<DS_CONFIG
+{
+  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
+  "train_batch_size": $GLOBAL_BATCH_SIZE,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": $ZERO_STAGE
+  },
+  "fp16": {
+    "enabled": true,
+    "loss_scale": 0,
+    "loss_scale_window": 500,
+    "hysteresis": 2,
+    "min_loss_scale": 1,
+    "initial_scale_power": 12
+  },
+  "steps_per_print": 2000,
+  "wall_clock_breakdown": false
+}
+DS_CONFIG
+# Flags that enable DeepSpeed and point it at the file generated above.
+DEEPSPEED_ARGS=" \
+    --deepspeed \
+    --deepspeed_config ${ds_config_path} \
+    --zero-stage ${ZERO_STAGE} \
+    --deepspeed-activation-checkpointing \
+    "
+export CMD=" \
+    --tensor-model-parallel-size $TP_SIZE \
+    --pipeline-model-parallel-size $PP_SIZE \
+    $GPT_ARGS \
+    $DATA_ARGS \
+    $OUTPUT_ARGS \
+    --data-impl mmap \
+    --split 949,50,1 \
+    --distributed-backend nccl \
+     $DEEPSPEED_ARGS \
+    "
+APP="python3 -u `pwd`/pretrain_gpt.py \
+    --rank ${RANK} \
+    --world_size ${WORLD_SIZE} \
+    --dist_url tcp://${1}:34566 \
+    --num-workers 2 \
+    ${CMD} \
+    "
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_0:1
+  export UCX_IB_PCI_BW=mlx5_0:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_1:1
+  export UCX_IB_PCI_BW=mlx5_1:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_2:1
+  export UCX_IB_PCI_BW=mlx5_2:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_3:1
+  export UCX_IB_PCI_BW=mlx5_3:50Gbs
+  NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+esac