Commit 0eed4566 authored by hepj987's avatar hepj987
Browse files

调整为标准格式

parent ba71120e
Pipeline #558 canceled with stage
......@@ -18,6 +18,39 @@ GPT2使用Transformer的Decoder结构,并对 Transformer Decoder 进行了一
![image-gpt](image-gpt.png)
## 环境配置
### Docker(方式一)
推荐使用docker方式运行,提供[光源](https://www.sourcefind.cn/)拉取的docker镜像:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-23.04-py37-latest
docker run -dit --network=host --name=gpt2_pytorch --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-23.04-py37-latest
docker exec -it gpt2_pytorch /bin/bash
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
### Anaconda(方式二)
这里以DTK23.04、python3.7、torch1.10为例:进入[光合开发者社区](https://cancon.hpccube.com:65024/4/main/),依次进入 pytorch -> dtk23.04,下载 torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl(下文用到的 deepspeed whl 包同样需要在该社区下载基于DTK编译的版本)。然后可以仿照下边配置环境:
```
#创建虚拟环境
conda create -n venv_gpt2 python=3.7
#进入venv_gpt2虚拟环境
source activate venv_gpt2
#加载DTK以及其他环境设置
source env.sh
#安装DTK版本依赖
pip install torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl
pip install deepspeed-0.9.2+git25d5540.abi0.dtk2304.torch1.10.0-cp37-cp37m-manylinux2014_x86_64.whl
#安装其他依赖
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
```
## 数据集
`oscar-1GB`
......@@ -53,19 +86,7 @@ sh creat-data.sh
└── oscar-1GB.jsonl
```
## 环境配置
推荐使用docker方式运行,提供[光源](https://www.sourcefind.cn/)拉取的docker镜像:
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-23.04-py37-latest
```
进入docker
```
pip install -r requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
```
## GPT2预训练
......@@ -103,74 +124,11 @@ SAVE_INTERVAL 保存频率
### GPT2模型16B多节点训练
要求DCU集群配置好相应的虚拟环境,已安装python依赖项。
在安装以下依赖时需要使用基于DTK编译的版本,下载地址在[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)
```
pytorch
deepspeed
apex
torchaudio
colossalai
faiss
mmcv-full
torchvision
tensorflow
```
这里以DTK23.04、python3.7,torch1.10为例,进入[光合开发者社区](https://cancon.hpccube.com:65024/4/main/)进入到pytorch->dtk23.04->下载 torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl。然后可以仿照下边配置环境:
```
#创建虚拟环境
export PYTHON3_LIB_PATH=/python_lib_path
virtualenv -p /python_bin_path/python3 --system-site-packages venv_gpt2
#进入venv_gpt2虚拟环境
source venv_gpt2/bin/activate
#加载DTK以及其他环境设置
source env.sh
#安装DTK版本依赖
pip install torch-1.10.0+gite378c3c.abi0.dtk2304-cp37-cp37m-manylinux2014_x86_64.whl
pip install deepspeed-0.9.2+git25d5540.abi0.dtk2304.torch1.10.0-cp37-cp37m-manylinux2014_x86_64.whl
#安装其他依赖
pip install -r requirements.txt -i http://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
```
```
#多节点运行
sbatch run-16B.sh(主要参数在single-16B.sh)
sh mpi-run-16B.sh(主要参数在single-16B.sh,参数类型与单节点相同, 默认以fp32精度训练,如需采用fp16精度可执行sh mpi-16B-fp16.sh)
```
```
#重要参数
MODEL_NAME 模型名(自定义)
CHECKPOINT_PATH 模型保存&加载路径
DATA_PATH 数据集路径(转换后的)
TENSORBOARD_PATH tensorboard路径
CODECARBON_PATH codecarbon路径
TP_SIZE TP数量
PP_SIZE PP数量
MICRO_BATCH_SIZE MICRO_BATCH_SIZE大小
GLOBAL_BATCH_SIZE GLOBAL_BATCH_SIZE大小
NLAYERS 层数
NHIDDEN 隐藏层维度
NHEADS 注意力机制头数
SEQ_LEN 最大长度
SAVE_INTERVAL 保存频率
--train_iters 训练步数
--eval-interval 验证频率
--eval-iters 验证iter
```
### 16B模型训练loss
| 卡数 | lm loss |
| :-------: | :----------: |
| 32 x 4DCU | 1.965622E+00 |
### 16B模型验证
| 卡数 | lm loss value | lm loss PPL |
......@@ -238,9 +196,19 @@ mpirun -np 1 run-inf.sh
## result
16B模型使用oscar数据集收敛情况如下:
16B模型训练loss:
| 卡数 | 配置 | lm loss |
| :-------: | :---------------: | :----------: |
| 32 x 4DCU | tp4,pp8,单卡16G | 1.965622E+00 |
16B模型验证:
| 卡数 | 配置 | lm loss value | lm loss PPL |
| :-------: | :---------------: | :-----------: | :----------: |
| 32 x 4DCU | tp4,pp8,单卡16G | 4.299443E+00 | 7.365877E+01 |
16B模型收敛曲线如下:
![image-20230524143710566](image-gpt-loss.png)
......
# 模型唯一标识
modelCode=107
# 模型名称
modelName=gpt2_pytorch
# 模型描述
modelDescription=基于Pytorch训练框架的gpt2模型
# 应用场景
appScenario=训练,推理,train,inference,nlp,智能聊天助手
# 框架类型
frameType=Pytorch,Deepspeed
# Multi-node launcher for the fp16 16B GPT-2 run.
#
# Reads ./hostfile, starts 8 ranks per distinct hostfile line with mpirun and
# runs single-16B-fp16.sh on every rank. The first field of the first hostfile
# line is used as the master host and is handed to the worker script as $1,
# which the worker uses to build its tcp://<host>:34566 rendezvous URL.
source env.sh

hostfile=./hostfile
if [ ! -r "$hostfile" ]; then
  echo "hostfile not found or unreadable: $hostfile" >&2
  exit 1
fi

# Number of distinct hostfile lines, times 8 ranks per node.
node_count=$(sort -u "$hostfile" | wc -l)
np=$((node_count * 8))

# Master host: first whitespace-separated field of the first hostfile line.
dist_url=$(awk 'NR == 1 { print $1; exit }' "$hostfile")
if [ -z "$dist_url" ]; then
  echo "could not read a host name from the first line of $hostfile" >&2
  exit 1
fi

# Print the mpirun that will be used, for the job log.
if ! command -v mpirun; then
  echo "mpirun not found in PATH" >&2
  exit 1
fi

# NOTE(review): btl_tcp_if_include normally takes interface names or CIDR
# subnets; the original passes the master host here, which is kept as-is.
# Confirm this is the intended value.
#
# The trailing "$dist_url" is the fix: without it the worker script received
# no positional argument and built "tcp://:34566".
mpirun -np "$np" --allow-run-as-root --hostfile "$hostfile" --bind-to none \
  --mca btl_tcp_if_include "$dist_url" single-16B-fp16.sh "$dist_url"
status=$?

echo "END TIME: $(date)"
# Report mpirun's result instead of the always-successful echo above.
exit "$status"
# Multi-node launcher for the 16B GPT-2 run.
#
# Reads ./hostfile, starts 8 ranks per distinct hostfile line with mpirun and
# runs single-16B.sh on every rank. The first field of the first hostfile line
# is used as the master host and is handed to the worker script as $1.
source env.sh

hostfile=./hostfile
if [ ! -r "$hostfile" ]; then
  echo "hostfile not found or unreadable: $hostfile" >&2
  exit 1
fi

# Number of distinct hostfile lines, times 8 ranks per node.
node_count=$(sort -u "$hostfile" | wc -l)
np=$((node_count * 8))

# Master host: first whitespace-separated field of the first hostfile line.
dist_url=$(awk 'NR == 1 { print $1; exit }' "$hostfile")
if [ -z "$dist_url" ]; then
  echo "could not read a host name from the first line of $hostfile" >&2
  exit 1
fi

# Print the mpirun that will be used, for the job log.
if ! command -v mpirun; then
  echo "mpirun not found in PATH" >&2
  exit 1
fi

# NOTE(review): btl_tcp_if_include normally takes interface names or CIDR
# subnets; the original passes the master host here, which is kept as-is.
# Confirm this is the intended value.
#
# The trailing "$dist_url" hands the master host to the worker as $1.
# NOTE(review): single-16B.sh presumably builds its rendezvous URL from $1 the
# same way single-16B-fp16.sh does -- confirm; an unused argument is harmless.
mpirun -np "$np" --allow-run-as-root --hostfile "$hostfile" --bind-to none \
  --mca btl_tcp_if_include "$dist_url" single-16B.sh "$dist_url"
status=$?

echo "END TIME: $(date)"
# Report mpirun's result instead of the always-successful echo above.
exit "$status"
#!/bin/bash
# Per-rank worker for the 16B GPT-2 pre-training run with DeepSpeed fp16.
#
# Intended to be started once per rank by mpirun: the global rank, local rank
# and world size are taken from the OMPI_COMM_WORLD_* environment variables.
#
# Arguments:
#   $1 - master host; used to build the rendezvous URL tcp://<host>:34566
# Side effects:
#   writes ./${MODEL_NAME}_ds_config.json, then runs pretrain_gpt.py (from the
#   current directory) under numactl.
# Exit status:
#   1 if the local rank is not in 0-7, otherwise the status of the launched
#   command.

export NCCL_SOCKET_IFNAME=ib0
export NCCL_IB_HCA=mlx5
export HSA_FORCE_FINE_GRAIN_PCIE=1
export MIOPEN_FIND_MODE=3
export ROCBLAS_COMPUTETYPE_FP16R=0

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE

# The launch is still attempted without a master host (as before), but say so:
# the rendezvous URL would be "tcp://:34566".
master_addr=${1:-}
if [[ -z "${master_addr}" ]]; then
  echo "warning: no master host given as \$1; --dist_url will be tcp://:34566" >&2
fi

MODEL_NAME=gpt2-oscar_16B-4tp
DATA_OUTPUT_PATH=./
LOGS_PATH=$DATA_OUTPUT_PATH/logs
# NOTE(review): "checkopints" looks like a typo for "checkpoints". Left
# unchanged because renaming would stop existing checkpoints from being loaded.
CHECKPOINT_PATH=checkopints/$MODEL_NAME
DATA_PATH=my-gpt2_text_document
TENSORBOARD_PATH=output_dir/tensorboard/$MODEL_NAME
CODECARBON_PATH=output_dir/codecarbon/$MODEL_NAME

TP_SIZE=4 # always fixed to the size of a single node
PP_SIZE=8 # NLAYERS must be a multiple of PP_SIZE here
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=128
NLAYERS=40
NHIDDEN=5760
NHEADS=24
SEQ_LEN=2048
SAVE_INTERVAL=1000

OPTIMIZER_ARGS=" \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
  --lr 6.0e-5 \
  --min-lr 6.0e-6 \
  --lr-decay-style cosine \
  --clip-grad 1.0 \
  --weight-decay 1e-1 \
  "

# --clip-grad appears both here and in OPTIMIZER_ARGS with the same value.
# NOTE(review): fp16 is enabled in the DeepSpeed config below, but no --fp16
# flag is passed here -- confirm pretrain_gpt.py does not require one.
GPT_ARGS=" \
  --num-layers $NLAYERS \
  --hidden-size $NHIDDEN \
  --num-attention-heads $NHEADS \
  --seq-length $SEQ_LEN \
  --max-position-embeddings $SEQ_LEN \
  --micro-batch-size $MICRO_BATCH_SIZE \
  --global-batch-size $GLOBAL_BATCH_SIZE \
  --train_iters 7000 \
  --loss-scale 12 \
  --vocab-file gpt2-vocab.json \
  --merge-file gpt2-merges.txt \
  --clip-grad 1.0 \
  --checkpoint-activations \
  --seed 42 \
  $OPTIMIZER_ARGS \
  "

OUTPUT_ARGS=" \
  --log-interval 1 \
  --save-interval $SAVE_INTERVAL \
  --eval-interval 1000 \
  --eval-iters 40 \
  --tensorboard-dir $TENSORBOARD_PATH \
  --tensorboard-queue-size 5 \
  --log-timers-to-tensorboard \
  --log-batch-size-to-tensorboard \
  --log-validation-ppl-to-tensorboard \
  "

DATA_ARGS=" \
  --save $CHECKPOINT_PATH \
  --load $CHECKPOINT_PATH \
  --data-path $DATA_PATH \
  "

ZERO_STAGE=1

# Every rank (re)writes the same DeepSpeed config before launching.
config_json="./${MODEL_NAME}_ds_config.json"
cat > "${config_json}" <<EOT
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT

DEEPSPEED_ARGS=" \
  --deepspeed \
  --deepspeed_config ${config_json} \
  --zero-stage ${ZERO_STAGE} \
  --deepspeed-activation-checkpointing \
  "

export CMD=" \
  --tensor-model-parallel-size $TP_SIZE \
  --pipeline-model-parallel-size $PP_SIZE \
  $GPT_ARGS \
  $DATA_ARGS \
  $OUTPUT_ARGS \
  --data-impl mmap \
  --split 949,50,1 \
  --distributed-backend nccl \
  $DEEPSPEED_ARGS \
  "

APP="python3 -u $(pwd)/pretrain_gpt.py \
  --rank ${RANK} \
  --world_size ${WORLD_SIZE} \
  --dist_url tcp://${master_addr}:34566 \
  --num-workers 2 \
  ${CMD} \
  "

# Local ranks 0-3 are bound to NUMA node 0 and local ranks 4-7 to NUMA node 3.
# Any other value (including an unset OMPI_COMM_WORLD_LOCAL_RANK) used to fall
# through without launching anything; report it instead.
case "${lrank}" in
  [0-3]) numa_node=0 ;;
  [4-7]) numa_node=3 ;;
  *)
    echo "unsupported OMPI_COMM_WORLD_LOCAL_RANK '${lrank}' (expected 0-7)" >&2
    exit 1
    ;;
esac

# All 8 devices stay visible to every rank; each local rank N uses mlx5_N.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES="mlx5_${lrank}:1"
export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"

# APP is a flat command string, so word splitting is intentional here.
# shellcheck disable=SC2086
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind="${numa_node}" --membind="${numa_node}" ${APP}
......@@ -133,27 +133,51 @@ APP="python3 -u `pwd`/pretrain_gpt.py \
# NOTE(review): this span comes from a rendered diff hunk, so removed and added
# lines appear together without +/- markers. Where an arm sets
# HIP_VISIBLE_DEVICES twice or invokes numactl twice, the first line is
# presumably the removed version and the second the added one -- confirm
# against the real single-16B.sh before running anything from here.
#
# Per-local-rank launch: each arm exports the visible devices and the mlx5_N
# UCX device for local rank N, then starts ${APP} under numactl.
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0,1,2,3
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_0:1
export UCX_IB_PCI_BW=mlx5_0:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[1])
export HIP_VISIBLE_DEVICES=0,1,2,3
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_1:1
export UCX_IB_PCI_BW=mlx5_1:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=1 --membind=1 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[2])
export HIP_VISIBLE_DEVICES=0,1,2,3
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_2:1
export UCX_IB_PCI_BW=mlx5_2:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=2 --membind=2 ${APP}
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[3])
export HIP_VISIBLE_DEVICES=0,1,2,3
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_3:1
export UCX_IB_PCI_BW=mlx5_3:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=0 --membind=0 ${APP}
;;
[4])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_4:1
export UCX_IB_PCI_BW=mlx5_4:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
[5])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_5:1
export UCX_IB_PCI_BW=mlx5_5:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
[6])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_6:1
export UCX_IB_PCI_BW=mlx5_6:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
[7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export UCX_NET_DEVICES=mlx5_7:1
export UCX_IB_PCI_BW=mlx5_7:50Gbs
NCCL_SOCKET_IFNAME=ib0 numactl --cpunodebind=3 --membind=3 ${APP}
;;
esac
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment