Commit c338d32c authored by hepj987

Update run instructions

parent bf95e032
GPT2: the second-generation Generative Pre-Training model (Generative Pre-Training 2).
### Model Structure
```
GPT2 uses the Transformer Decoder structure, with some modifications to the original Transformer Decoder. Through Megatron and DeepSpeed it can be trained in a distributed fashion with 3D parallelism (DP, TP, and PP).
```
### Dataset
```
wget https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
xz -d oscar-1GB.jsonl.xz
# Preprocessing parameters
--input           path to the input dataset, i.e. the file extracted from oscar-1GB.jsonl.xz
--output-prefix   output path prefix; the suffix _text_document is appended automatically
--vocab           path to the downloaded gpt2-vocab.json vocabulary file
--dataset-impl    dataset implementation type
--tokenizer-type  tokenizer type
--merge-file      path to the downloaded gpt2-merges.txt file
--append-eod      append an end-of-document token
--workers         number of worker processes
# Preprocess the dataset
python tools/preprocess_data.py \
--input oscar-1GB.jsonl \
--output-prefix my-gpt2 \
--vocab gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--append-eod \
--workers 8
# Alternatively, run the bundled script (it appears to wrap the command above):
sh creat-data.sh
```
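With the mmap implementation, preprocessing writes an indexed dataset next to the --output-prefix as a .bin/.idx pair carrying the _text_document suffix mentioned above. A quick, illustrative way to confirm the output:

```
ls -lh my-gpt2_text_document.bin my-gpt2_text_document.idx
```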
## GPT2 Pre-training
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk
```
Enter the docker container, then install the Python dependencies:
```
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### GPT2 Single-Node Training
```
rm megatron/arguments.py
cp megatron/arguments.py-one_node megatron/arguments.py
# np is the number of MPI processes and must equal the number of GPUs in use; TP*PP must divide np (the remaining factor becomes the data-parallel size). With 4 cards you can use TP=2/PP=2, TP=1/PP=4, or TP=4/PP=1; within a single node, TP generally performs better.
mpirun -np 4 run-one-node.sh    # single node, four DCUs
```
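Before launching, it is worth checking that the parallel sizes are consistent: the number of MPI ranks must equal TP x PP x DP. A minimal sketch of that check (illustrative only; the TP/PP values here are examples, not taken from the launch script):

```
NP=4; TP=2; PP=2                     # MPI ranks and example parallel sizes
if (( NP % (TP * PP) != 0 )); then
  echo "TP*PP must divide np" >&2
  exit 1
fi
DP=$(( NP / (TP * PP) ))             # the remaining factor becomes data parallelism
echo "data-parallel size: ${DP}"
```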
Key training parameters:
```
SAVE_INTERVAL     checkpoint save interval
--eval-iters      number of evaluation iterations
```
### GPT2 16B Multi-Node Training
A DCU cluster with a working Slurm environment is required.
The cluster must also have the corresponding Python virtual environment set up, with the Python dependencies installed.
The following dependencies must use builds compiled against DTK; download them from the [光合开发者社区](https://cancon.hpccube.com:65024/4/main/):
```
pytorch
deepspeed
apex
torchaudio
colossalai
faiss
mmcv-full
torchvision
tensorflow
```
Taking DTK 23.04, Python 3.7, and torch 1.10 as an example: go to the [光合开发者社区](https://cancon.hpccube.com:65024/4/main/), navigate to pytorch -> dtk23.04, and download torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl. The environment can then be set up as follows:
```
# Create the virtual environment
export PYTHON3_LIB_PATH=/python_lib_path
virtualenv -p /python_bin_path/python3 --system-site-packages venv_gpt2
# Activate the venv_gpt2 virtual environment
source venv_gpt2/bin/activate
# Load DTK and the other environment settings
source env.sh
# Install the DTK-based dependencies
pip install torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl
pip install deepspeed-0.9.2+git25d5540.abi0.dtk2304.torch1.10.0-cp37-cp37m-manylinux2014_x86_64.whl
# Install the remaining dependencies
pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
```
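Before training, it can help to verify that the DTK builds import cleanly and that the DCUs are visible; the ROCm/DTK build of PyTorch exposes the devices through the torch.cuda API. An illustrative check:

```
python -c "import torch; print(torch.__version__, torch.cuda.is_available(), torch.cuda.device_count())"
python -c "import deepspeed; print(deepspeed.__version__)"
```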
```
rm megatron/arguments.py
cp megatron/arguments.py-nodes megatron/arguments.py
# Multi-node run
sbatch run-16B.sh    # the main parameters are in single-16B.sh
```
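Once submitted, the job can be monitored with the usual Slurm tools; the log file name follows the pattern set in run-16B.sh (replace <jobid> with the actual job ID):

```
squeue -u $USER
tail -f logs/gpt2-16B-<jobid>.out
```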
......@@ -137,61 +160,45 @@ SAVE_INTERVAL 保存频率
| :-------: | :-----------: | :----------: |
| 32 x 4DCU | 4.299443E+00 | 7.365877E+01 |
## GPT2 Text Generation
To generate text with a trained GPT2 model, the saved checkpoint must first be converted.
### Conversion for Multi-GPU Inference
```
# Checkpoints are saved in DeepSpeed format during training. To use them for inference they must be converted to Megatron format; for a deepspeed -> megatron conversion, the TP size must stay the same before and after.
# Conversion script
sh conver-model_to_megatron.sh
```
```
# Important parameters
The project path must be added to PYTHONPATH, e.g.:
export PYTHONPATH=/home/megatron-deepspeed_dtk23.04:$PYTHONPATH
CHECKPOINT_PATH   path of the checkpoint to convert (down to the saved global_step directory)
output_folder     path for the converted checkpoint
target_tp         TP size after conversion (same as in training, or set to 1)
target_pp         PP size after conversion (same as in training, or set to 1)
```
### Conversion for Single-GPU Inference
```
# The original checkpoint is saved in DeepSpeed format, and a deepspeed -> megatron conversion requires the same TP size before and after. Therefore first convert deepspeed -> deepspeed (changing TP to 1), then deepspeed -> megatron to obtain an inference-ready checkpoint.
# Conversion script
sh conver-model-1tp.sh
```
# Important parameters
The project path must be added to PYTHONPATH, e.g.:
export PYTHONPATH=/home/megatron-deepspeed_dtk22.10:$PYTHONPATH
CHECKPOINT_PATH   path of the checkpoint to convert (down to the saved global_step directory)
output_folder     path for the converted checkpoint
target_tp         TP size after conversion (must match the TP size used in training)
target_pp         PP size after conversion (set to 1)
```
### Unconditional Text Generation
```
# Multi-GPU inference
mpirun -np 4 run-inf-gpus.sh
# Single-GPU inference
mpirun -np 1 run-inf.sh
```
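The generated samples are written to the file passed via --genfile (gpt2-genfile.json in these scripts), so after the run finishes the output can simply be inspected:

```
cat gpt2-genfile.json
```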
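# --- Conversion script for single-GPU inference (appears to be conver-model-1tp.sh, referenced above): deepspeed -> deepspeed with TP=1, then deepspeed -> megatron ---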
export PYTHONPATH=/home/megatron-deepspeed_dtk23.04:$PYTHONPATH
CHECKPOINT_PATH=/home/megatron-deepspeed-dtk23.04/checkopints/gpt2-4tp/global_step1000
OUTPUT_PATH=./checkopints/megatron-1tp
python tools/convert_checkpoint/deepspeed_to_deepspeed.py \
--input_folder $CHECKPOINT_PATH \
--output_folder ./conver-model-deepspeed-1tp \
--target_tp 1 \
--target_pp 1
python tools/convert_checkpoint/deepspeed_to_megatron.py \
--input_folder ./conver-model-deepspeed-1tp/global_step1000 \
--output_folder $OUTPUT_PATH \
--target_tp 1 \
--target_pp 1
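# --- Conversion script for multi-GPU inference (appears to be conver-model_to_megatron.sh, referenced above): direct deepspeed -> megatron conversion keeping TP=4 ---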
export PYTHONPATH=/home/megatron-deepspeed_dtk23.04:$PYTHONPATH
CHECKPOINT_PATH=/home/megatron-deepspeed-dtk23.04/checkopints/gpt2-oscar_16B-4tp/global_step1000
OUTPUT_PATH=./conver-4tp-model
python tools/convert_checkpoint/deepspeed_to_megatron.py \
--input_folder $CHECKPOINT_PATH \
--output_folder $OUTPUT_PATH \
--target_tp 4 \
--target_pp 1
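# --- Dataset preprocessing script (appears to be creat-data.sh, referenced above) ---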
python tools/preprocess_data.py \
--input oscar-1GB.jsonl \
--output-prefix my-gpt2 \
--vocab gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--append-eod \
--workers 8
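# --- Preprocessing variant that writes the indexed dataset under ./data/, matching the DATA_PATH used by the single-node launcher below ---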
python tools/preprocess_data.py \
--input oscar-1GB.jsonl \
--output-prefix ./data/my-gpt2 \
--vocab gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file gpt2-merges.txt \
--append-eod \
--workers 8
#!/bin/bash
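# --- Single-node 4-DCU pre-training launcher (appears to be run-one-node.sh, referenced above): one MPI rank per DCU, launched via mpirun -np 4 ---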
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
MODEL_NAME=gpt2-4tp
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=checkpoint/$MODEL_NAME
DATA_PATH=./data/my-gpt2_text_document
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME
N_GPUS=4
TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=1 #128 #96 # NLAYERS must be a multiple of PP_SIZE here
MICRO_BATCH_SIZE=2
GLOBAL_BATCH_SIZE=32 #256 #1536
NLAYERS=24
NHIDDEN=1024 #12480
NHEADS=16
SEQ_LEN=1024
SAVE_INTERVAL=1000
#rampup-batch-size 16 16 5859375
OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-iters 50 \
--loss-scale 12 \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--clip-grad 1.0 \
--fp16 \
--checkpoint-activations \
--seed 42
$OPTIMIZER_ARGS \
"
OUTPUT_ARGS=" \
--log-interval 1 \
--save-interval $SAVE_INTERVAL \
--eval-interval 10 \
--eval-iters 40 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
"
ZERO_STAGE=1
config_json="./${MODEL_NAME}_ds_config.json"
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
APP="python pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url env://127.0.0.1::34566
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
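# --- Slurm submission script (appears to be run-16B.sh, referenced above): allocates the nodes, builds a hostfile, and launches single-16B.sh on every rank via mpirun ---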
#SBATCH -p tydexclu01
#SBATCH -N 16
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=32
#SBATCH --mem 0
#SBATCH --gres=dcu:4
#SBATCH -J gpt2
#SBATCH -o logs/gpt2-16B-%j.out
#SBATCH -e logs/gpt2-16B-%j.out
ulimit -u 200000
export NCCL_IB_HCA=mlx5
export NCCL_SOCKET_IFNAME=ib0
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
echo "START TIME: $(date)"
rm -f ./hostfile/*
hostfile=./hostfile/$SLURM_JOB_ID
scontrol show hostnames $SLURM_JOB_NODELIST > ${hostfile}
for i in `cat $hostfile`
do
echo ${i} slots=4 >> `pwd`/hostfile/hostfile-dl-$SLURM_JOB_ID
done
np=$(cat $hostfile|sort|uniq |wc -l)
np=$(($np*4))
nodename=$(cat $hostfile |sed -n "1p")
dist_url=`echo $nodename | awk '{print $1}'`
mpirun -np $np --allow-run-as-root --hostfile hostfile/hostfile-dl-$SLURM_JOB_ID --bind-to none `pwd`/single-16B.sh $dist_url
#!/bin/bash
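# --- Multi-GPU text-generation launcher (appears to be run-inf-gpus.sh, referenced above): runs generate_samples_gpt.py with tensor-parallel size 4 against the converted 4-TP checkpoint ---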
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CHECKPOINT_PATH=./conver-4tp-model
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
APP="python tools/generate_samples_gpt.py \
--tensor-model-parallel-size 4 \
--num-layers 40 \
--hidden-size 5760 \
--load $CHECKPOINT_PATH \
--num-attention-heads 24 \
--max-position-embeddings 2048 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 2 \
--seq-length 2048 \
--out-seq-length 128 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--genfile gpt2-genfile.json \
--num-samples 4 \
--top_p 0.9 \
--recompute \
--rank ${RANK} \
--world_size ${WORLD_SIZE}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
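# --- Single-GPU text-generation launcher (appears to be run-inf.sh, referenced above): runs generate_samples_gpt.py with tensor-parallel size 1 against the 1-TP Megatron checkpoint ---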
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
CHECKPOINT_PATH=./checkopints/megatron-1tp
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
APP="python tools/generate_samples_gpt.py \
--tensor-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load $CHECKPOINT_PATH \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 2 \
--seq-length 1024 \
--out-seq-length 128 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--genfile gpt2-genfile.json \
--num-samples 4 \
--top_p 0.9 \
--recompute \
--rank ${RANK} \
--world_size ${WORLD_SIZE}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
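# --- A second single-node 4-DCU pre-training launcher, nearly identical to the one above; it differs mainly in DATA_PATH and --train-iters ---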
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
MODEL_NAME=gpt2-4tp
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=checkpoint/$MODEL_NAME
DATA_PATH=my-gpt2_text_document
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME
TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=1 # NLAYERS must be a multiple of PP_SIZE here
MICRO_BATCH_SIZE=2
GLOBAL_BATCH_SIZE=32
NLAYERS=24
NHIDDEN=1024
NHEADS=16
SEQ_LEN=1024
SAVE_INTERVAL=1000
OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-iters 1000 \
--loss-scale 12 \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--clip-grad 1.0 \
--fp16 \
--checkpoint-activations \
--seed 42
$OPTIMIZER_ARGS \
"
OUTPUT_ARGS=" \
--log-interval 1 \
--save-interval $SAVE_INTERVAL \
--eval-interval 10 \
--eval-iters 40 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
"
ZERO_STAGE=1
config_json="./${MODEL_NAME}_ds_config.json"
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
APP="python pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url env://127.0.0.1::34566
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
#!/bin/bash
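# --- Per-rank training script for the 16B model (appears to be single-16B.sh, invoked by the Slurm script above; $1 is the first node's hostname, used for --dist_url) ---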
export NCCL_SOCKET_IFNAME=ib0
export NCCL_IB_HCA=mlx5
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
MODEL_NAME=gpt2-oscar_16B-4tp
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
CHECKPOINT_PATH=checkopints/$MODEL_NAME
DATA_PATH=my-gpt2_text_document
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME
TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=8 # NLAYERS must be a multiple of PP_SIZE here
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=128
NLAYERS=40
NHIDDEN=5760
NHEADS=24
SEQ_LEN=2048
SAVE_INTERVAL=1000
OPTIMIZER_ARGS=" \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 1e-1 \
"
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-iters 7000 \
--loss-scale 12 \
--vocab-file gpt2-vocab.json \
--merge-file gpt2-merges.txt \
--clip-grad 1.0 \
--checkpoint-activations \
--seed 42
$OPTIMIZER_ARGS \
"
OUTPUT_ARGS=" \
--log-interval 1 \
--save-interval $SAVE_INTERVAL \
--eval-interval 1000 \
--eval-iters 40 \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
"
ZERO_STAGE=1
config_json="./${MODEL_NAME}_ds_config.json"
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE
},
"fp16": {
"enabled": false,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
DEEPSPEED_ARGS=" \
--deepspeed \
--deepspeed_config ${config_json} \
--zero-stage ${ZERO_STAGE} \
--deepspeed-activation-checkpointing \
"
export CMD=" \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
$DEEPSPEED_ARGS \
"
APP="python3 -u `pwd`/pretrain_gpt.py \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
--num-workers 2 \
${CMD} \
"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac