Update code to dtk23.04.1-tf2.11

9fbb3cc0 · qianyj · 32e4ca51 · 9fbb3cc0 · 9fbb3cc0 · 9fbb3cc0
Commit 9fbb3cc0 authored Nov 28, 2023 by qianyj
6 changed files
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ ResNet50使用了多个具有残差连接的残差块来解决梯度消失或梯
 ## 环境配置
 ### Docker(方法一)
 ```
-docker pull image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.7.0-centos7.6-dtk-22.10.1-py38-latest
+docker pull image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.11.0-centos7.6-dtk23.04.1-py38
 # <Your Image ID>用上面拉取docker镜像的ID替换
 docker run --shm-size 16g --network=host --name=resnet50_tensorFlow --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $PWD/resnet50_tensorflow:/home/resnet50_tensorflow -it <Your Image ID> bash
 pip install -r requirements.txt --no-deps
@@ -30,12 +30,12 @@ docker run --rm --shm-size 16g --network=host --name=resnet50_tensorflow --privi
 1、关于本项目DCU显卡所需的特殊深度学习库可以从开发者社区下载安装：
 https://developer.hpccube.com/tool/
 ```
-DTK版本：dtk22.10.1
+DTK版本：dtk23.04.1
 python:  3.8
-tensorflow: 2.7
-tf-models-official: 2.7
-keras: 2.7
-tensorboard: 2.7
+tensorflow: 2.11
+tf-models-official: 2.11
+keras: 2.11
+tensorboard: 2.11
 ```
 `Tips:以上dtk、python、tensorflow等DCU相关工具版本需要严格一一对应`

@@ -77,33 +77,29 @@ tfrecord-imagenet
 不打开xla:

    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH  
-    python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1  --use_synthetic_data=false  --train_epochs=90  --dtype=fp32
+    python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1  --use_synthetic_data=false  --train_epochs=90  --dtype=fp32

 打开xla:
    
-    sh /opt/dtk/.hip/replace_origin.sh
    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
-    在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
-    TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1  --use_synthetic_data=false  --train_epochs=90  --dtype=fp32
+    TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1  --use_synthetic_data=false  --train_epochs=90  --dtype=fp32

 #### 单机四卡训练指令：
 不打开xla:

    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
-    python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4  --use_synthetic_data=false  --train_epochs=90  --dtype=fp32
+    python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4  --use_synthetic_data=false  --train_epochs=90  --dtype=fp32

 打开xla:

-    sh /opt/dtk/.hip/replace_origin.sh
    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
-    在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
-    TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4  --train_epochs=90  --use_synthetic_data=false --dtype=fp32
+    TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4  --train_epochs=90  --use_synthetic_data=false --dtype=fp32

 #### 多机多卡训练指令(以单机四卡模拟四卡四进程为例)：

 sed指令只需要执行一次，添加支持多卡运行的代码

-    sed -i '100 r configfile' official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py
+    sed -i '100 r configfile' official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py

 不打开xla：

@@ -112,9 +108,7 @@ sed指令只需要执行一次，添加支持多卡运行的代码

 打开xla：
    
-    sh /opt/dtk/.hip/replace_origin.sh
    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
-    在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
    mpirun -np 4 --hostfile hostfile  -mca btl self,tcp  --allow-run-as-root  --bind-to none scripts-run/single_process_xla.sh
    
 ### fp16训练
@@ -123,28 +117,24 @@ sed指令只需要执行一次，添加支持多卡运行的代码
 不打开xla：
   
    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
-    python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1  --use_synthetic_data=false --train_epochs=90  --dtype=fp16
+    python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1  --use_synthetic_data=false --train_epochs=90  --dtype=fp16

 打开xla：
  
-    sh /opt/dtk/.hip/replace_origin.sh
    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
-    在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
-    TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1  --train_epochs=90  --use_synthetic_data=false --dtype=fp16
+    TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1  --train_epochs=90  --use_synthetic_data=false --dtype=fp16

 #### 单机四卡训练指令

 不打开xla:
  
    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
-    python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4  --train_epochs=90  --use_synthetic_data=false --dtype=fp16
+    python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4  --train_epochs=90  --use_synthetic_data=false --dtype=fp16

 打开xla：
    
-    sh /opt/dtk/.hip/replace_origin.sh
    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
-    在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
-    TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4  --train_epochs=90  --use_synthetic_data=false --dtype=fp16
+    TF_XLA_FLAGS="--tf_xla_auto_jit=1" python3 official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4  --train_epochs=90  --use_synthetic_data=false --dtype=fp16

 #### 多机多卡训练指令(以单机四卡模拟四卡四进程为例)

@@ -161,9 +151,7 @@ sed指令只需要执行一次，添加支持多卡运行的代码

 打开xla：
 
-    sh /opt/dtk/.hip/replace_origin.sh
    export PYTHONPATH=/home/resnet50_tensorflow:$PYTHONPATH
-    在resnet_ctl_imagenet_main.py中添加环境变量os.environ["XLA_FLAGS"]="--xla_gpu_cuda_data_dir=/opt/dtk/amdgcn/bitcode"
    mpirun -np 4 --hostfile hostfile  -mca btl self,tcp  --allow-run-as-root  --bind-to none scripts-run/single_process_xla.sh

 ### result

--- a/docker/dockerfile
+++ b/docker/dockerfile
-FROM image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.7.0-centos7.6-dtk-22.10.1-py38-latest
+FROM image.sourcefind.cn:5000/dcu/admin/base/tensorflow:2.11.0-centos7.6-dtk23.04.1-py38
 ENV DEBIAN_FRONTEND=noninteractive
 # RUN yum update && yum install -y git cmake wget build-essential
-RUN source /opt/dtk-22.10.1/env.sh
+RUN source /opt/dtk-23.04.1/env.sh
 # 安装pip相关依赖
 COPY requirements.txt requirements.txt
 RUN pip3 install -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com -r requirements.txt  --no-deps
--- a/docker/requirements.txt
+++ b/docker/requirements.txt
-numpy==1.21.5
+numpy==1.22.4
 tensorflow_model_optimization
-tf-models-official==2.7
+tf-models-official==2.11
 psutil
-tensorboard==2.7
-keras==2.7
+tensorboard==2.11
+keras==2.11
--- a/requirements.txt
+++ b/requirements.txt
-numpy==1.21.5
+numpy==1.22.4
 tensorflow_model_optimization
-tf-models-official==2.7
+tf-models-official==2.11
 psutil
-tensorboard==2.7
-keras==2.7
+tensorboard==2.11
+keras==2.11
--- a/scripts-run/single_process.sh
+++ b/scripts-run/single_process.sh
@@ -3,7 +3,7 @@
 lrank=$OMPI_COMM_WORLD_LOCAL_RANK
 drank=$OMPI_COMM_WORLD_RANK

-APP="python3 ./official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py   --num_gpus=1  --skip_eval=true   --batch_size=512 --train_epochs=90  --use_synthetic_data=false  --distribution_strategy=multi_worker_mirrored  --all_reduce_alg=nccl --dtype=fp32  --data_dir=${data_dir}   --task_index=${drank} "
+APP="python3 ./official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py   --num_gpus=1  --skip_eval=true   --batch_size=512 --train_epochs=90  --use_synthetic_data=false  --distribution_strategy=multi_worker_mirrored  --all_reduce_alg=nccl --dtype=fp32  --data_dir=${data_dir}   --task_index=${drank} "
 case ${lrank} in
 [0])
  export HIP_VISIBLE_DEVICES=0

--- a/scripts-run/single_process_xla.sh
+++ b/scripts-run/single_process_xla.sh
@@ -3,7 +3,7 @@
 lrank=$OMPI_COMM_WORLD_LOCAL_RANK
 drank=$OMPI_COMM_WORLD_RANK

-APP="python3 ./official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py   --num_gpus=1  --skip_eval=true   --batch_size=512 --train_epochs=90  --use_synthetic_data=false  --distribution_strategy=multi_worker_mirrored  --all_reduce_alg=nccl --dtype=fp32  --data_dir=${data_dir}   --task_index=${drank}"
+APP="python3 ./official/legacy/image_classification/resnet/resnet_ctl_imagenet_main.py  --num_gpus=1  --skip_eval=true   --batch_size=512 --train_epochs=90  --use_synthetic_data=false  --distribution_strategy=multi_worker_mirrored  --all_reduce_alg=nccl --dtype=fp32  --data_dir=${data_dir}   --task_index=${drank}"
 case ${lrank} in
 [0])
  export HIP_VISIBLE_DEVICES=0