"test/vscode:/vscode.git/clone" did not exist on "9ccee9c051cfabcdf2919fa2c1f69c11a72bf23d"
Commit 9fbb3cc0 authored by qianyj's avatar qianyj
Browse files

Update code to dtk23.04.1-tf2.11

parent 32e4ca51
......@@ -14,7 +14,7 @@ ResNet50使用了多个具有残差连接的残差块来解决梯度消失或梯
## 环境配置
### Docker(方法一)
```
docker pull image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.7.0-centos7.6-dtk-22.10.1-py38-latest
docker pull image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.11.0-centos7.6-dtk23.04.1-py38
# <Your Image ID>用上面拉取docker镜像的ID替换
docker run --shm-size 16g --network=host --name=resnet50_tensorflow --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/resnet50_tensorflow:/home/resnet50_tensorflow -it <Your Image ID> bash
pip install -r requirements.txt --no-deps
......@@ -30,12 +30,12 @@ docker run --rm --shm-size 16g --network=host --name=resnet50_tensorflow --privi
1、关于本项目DCU显卡所需的特殊深度学习库可以从开发者社区下载安装:
https://developer.hpccube.com/tool/
```
DTK版本:dtk22.10.1
DTK版本:dtk23.04.1
python: 3.8
tensorflow: 2.7
tf-models-official: 2.7
keras: 2.7
tensorboard: 2.7
tensorflow: 2.11
tf-models-official: 2.11
keras: 2.11
tensorboard: 2.11
```
`Tips:以上dtk、python、tensorflow等DCU相关工具版本需要严格一一对应`
......@@ -77,33 +77,29 @@ tfrecord-imagenet
不打开xla:
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --use_synthetic_data=false --train_epochs=90 --dtype=fp32
python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --use_synthetic_data=false --train_epochs=90 --dtype=fp32
打开xla:
sh /opt/dtk/.hip/replace_origin.sh
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --use_synthetic_data=false --train_epochs=90 --dtype=fp32
TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --use_synthetic_data=false --train_epochs=90 --dtype=fp32
#### 单机四卡训练指令:
不打开xla:
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --use_synthetic_data=false --train_epochs=90 --dtype=fp32
python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --use_synthetic_data=false --train_epochs=90 --dtype=fp32
打开xla:
sh /opt/dtk/.hip/replace_origin.sh
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --train_epochs=90 --use_synthetic_data=false --dtype=fp32
TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --train_epochs=90 --use_synthetic_data=false --dtype=fp32
#### 多机多卡训练指令(以单机四卡模拟四卡四进程为例):
sed指令只需要执行一次,添加支持多卡运行的代码
sed -i '100 r configfile' official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py
sed -i '100 r configfile' official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py
不打开xla:
......@@ -112,9 +108,7 @@ sed指令只需要执行一次,添加支持多卡运行的代码
打开xla:
sh /opt/dtk/.hip/replace_origin.sh
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
mpirun -np 4 --hostfile hostfile -mca btl self,tcp --allow-run-as-root --bind-to none scripts-run/single_process_xla.sh
### fp16训练
......@@ -123,28 +117,24 @@ sed指令只需要执行一次,添加支持多卡运行的代码
不打开xla:
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --use_synthetic_data=false --train_epochs=90 --dtype=fp16
python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --use_synthetic_data=false --train_epochs=90 --dtype=fp16
打开xla:
sh /opt/dtk/.hip/replace_origin.sh
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --train_epochs=90 --use_synthetic_data=false --dtype=fp16
TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --train_epochs=90 --use_synthetic_data=false --dtype=fp16
#### 单机四卡训练指令
不打开xla:
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --train_epochs=90 --use_synthetic_data=false --dtype=fp16
python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --train_epochs=90 --use_synthetic_data=false --dtype=fp16
打开xla:
sh /opt/dtk/.hip/replace_origin.sh
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --train_epochs=90 --use_synthetic_data=false --dtype=fp16
TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --train_epochs=90 --use_synthetic_data=false --dtype=fp16
#### 多机多卡训练指令(以单机四卡模拟四卡四进程为例)
......@@ -161,9 +151,7 @@ sed指令只需要执行一次,添加支持多卡运行的代码
打开xla:
sh /opt/dtk/.hip/replace_origin.sh
export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
mpirun -np 4 --hostfile hostfile -mca btl self,tcp --allow-run-as-root --bind-to none scripts-run/single_process_xla.sh
### result
......
FROM image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.7.0-centos7.6-dtk-22.10.1-py38-latest
FROM image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.11.0-centos7.6-dtk23.04.1-py38
ENV DEBIAN_FRONTEND=noninteractive
# RUN yum update && yum install -y git cmake wget build-essential
RUN source /opt/dtk-22.10.1/env.sh
RUN source /opt/dtk-23.04.1/env.sh
# 安装pip相关依赖
COPY requirements.txt requirements.txt
RUN pip3 install -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com -r requirements.txt --no-deps
numpy==1.21.5
numpy==1.22.4
tensorflow_model_optimization
tf-models-official==2.7
tf-models-official==2.11
psutil
tensorboard==2.7
keras==2.7
tensorboard==2.11
keras==2.11
......@@ -3,7 +3,7 @@
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
drank=$OMPI_COMM_WORLD_RANK
APP="python3 ./official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --num_gpus=1 --skip_eval=true --batch_size=512 --train_epochs=90 --use_synthetic_data=false --distribution_strategy=multi_worker_mirrored --all_reduce_alg=nccl --dtype=fp32 --data_dir=${data_dir} --task_index=${drank} "
APP="python3 ./official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --num_gpus=1 --skip_eval=true --batch_size=512 --train_epochs=90 --use_synthetic_data=false --distribution_strategy=multi_worker_mirrored --all_reduce_alg=nccl --dtype=fp32 --data_dir=${data_dir} --task_index=${drank} "
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
......
......@@ -3,7 +3,7 @@
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
drank=$OMPI_COMM_WORLD_RANK
APP="python3 ./official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --num_gpus=1 --skip_eval=true --batch_size=512 --train_epochs=90 --use_synthetic_data=false --distribution_strategy=multi_worker_mirrored --all_reduce_alg=nccl --dtype=fp32 --data_dir=${data_dir} --task_index=${drank}"
APP="python3 ./official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --num_gpus=1 --skip_eval=true --batch_size=512 --train_epochs=90 --use_synthetic_data=false --distribution_strategy=multi_worker_mirrored --all_reduce_alg=nccl --dtype=fp32 --data_dir=${data_dir} --task_index=${drank}"
case ${lrank} in
[0])
export HIP_VISIBLE_DEVICES=0
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment