Commit 0a3a611b authored by liangjj's avatar liangjj
Browse files

update

parent 557ae9c4
...@@ -5,29 +5,38 @@ ...@@ -5,29 +5,38 @@
# 运行 # 运行
## 单卡 ## 单卡
export PYTHONPATH=/path/to/tensorflow/model:$PYTHONPATH export PYTHONPATH=/path/to/tensorflow/model:$PYTHONPATH
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
#without xla
python3 models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --use_synthetic_data=false python3 models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --use_synthetic_data=false
#with xla
TF_XLA_FLAGS="--tf_xla_auto_jit=2" python3 models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=128 --num_gpus=1 --use_synthetic_data=false
## 单机多卡 ## 单机多卡
export PYTHONPATH=/path/to/tensorflow/model:$PYTHONPATH export PYTHONPATH=/path/to/tensorflow/model:$PYTHONPATH
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=0,1,2,3
#without xla
python3 models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --use_synthetic_data=false python3 models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --use_synthetic_data=false
#with xla
TF_XLA_FLAGS="--tf_xla_auto_jit=2" python3 models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --data_dir=/path/to/{ImageNet-tensorflow_data_dir} --model_dir=/path/to/{model_save_dir} --batch_size=512 --num_gpus=4 --use_synthetic_data=false
## 分布式多卡 ## 分布式多卡
# sed指令只需要执行一次,添加支持多卡运行的代码 # sed指令只需要执行一次,添加支持多卡运行的代码
sed -i '101 r configfile' models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py sed -i '101 r configfile' models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py
export PYTHONPATH=/path/to/tensorflow/model:$PYTHONPATH export PYTHONPATH=/path/to/tensorflow/model:$PYTHONPATH
#without xla
mpirun -np ${num_gpu} --hostfile hostfile -mca btl self,tcp --bind-to none scripts-run/single_process.sh mpirun -np ${num_gpu} --hostfile hostfile -mca btl self,tcp --bind-to none scripts-run/single_process.sh
#with xla
mpirun -np ${num_gpu} --hostfile hostfile -mca btl self,tcp --bind-to none scripts-run/single_process_xla.sh
### 测试说明 ### 测试说明
多卡测试时需要修改部分代码,具体可参考https://tensorflow.google.cn/guide/migrate/multi_worker_cpu_gpu_training?hl=en 多卡测试时需要修改部分代码,具体可参考https://tensorflow.google.cn/guide/migrate/multi_worker_cpu_gpu_training?hl=en
hostfile格式参考: hostfile格式参考:
node1 slots=4 node1 slots=4
......
#!/bin/bash #!/bin/bash
lrank=$OMPI_COMM_WORLD_LOCAL_RANK lrank=$OMPI_COMM_WORLD_LOCAL_RANK
d_rank=$OMPI_COMM_WORLD_RANK drank=$OMPI_COMM_WORLD_RANK
APP="python3 ../models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --num_gpus=1 --skip_eval=true --batch_size=512 --train_epochs=90 --use_synthetic_data=false --distribution_strategy=multi_worker_mirrored --all_reduce_alg=nccl --dtype=fp32 --data_dir=${data_dir} --task_index=${drank} " APP="python3 ./models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --num_gpus=1 --skip_eval=true --batch_size=512 --train_epochs=90 --use_synthetic_data=false --distribution_strategy=multi_worker_mirrored --all_reduce_alg=nccl --dtype=fp32 --data_dir=${data_dir} --task_index=${drank} "
case ${lrank} in case ${lrank} in
[0]) [0])
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
......
#!/bin/bash
# Per-process launcher for distributed multi-node ResNet training with XLA
# auto-clustering enabled. Started with one process per rank, e.g.:
#   mpirun -np ${num_gpu} --hostfile hostfile -mca btl self,tcp \
#          --bind-to none scripts-run/single_process_xla.sh
# Each local rank is pinned to one GPU and its matching NUMA node.

lrank=$OMPI_COMM_WORLD_LOCAL_RANK   # rank within this node -> GPU / NUMA index
drank=$OMPI_COMM_WORLD_RANK         # global rank -> TensorFlow task_index

# NOTE(review): ${data_dir} must be exported by the caller before mpirun,
# otherwise --data_dir expands to empty — confirm the launch environment.
APP="python3 ./models-master/official/vision/image_classification/resnet/resnet_ctl_imagenet_main.py --num_gpus=1 --skip_eval=true --batch_size=512 --train_epochs=90 --use_synthetic_data=false --distribution_strategy=multi_worker_mirrored --all_reduce_alg=nccl --dtype=fp32 --data_dir=${data_dir} --task_index=${drank}"

# The original four case arms were identical except for the hard-coded rank
# number; collapse them into a single [0-3] arm driven by ${lrank}.
# Local ranks outside 0-3 launch nothing, exactly as before.
case ${lrank} in
  [0-3])
    export HIP_VISIBLE_DEVICES=${lrank}
    export UCX_NET_DEVICES=mlx5_0:1
    export UCX_IB_PCI_BW=mlx5_0:50Gbs
    # ${APP} is intentionally unquoted: it is a full command line that must
    # word-split into the program and its arguments.
    TF_XLA_FLAGS="--tf_xla_auto_jit=2" numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    ;;
esac
...@@ -104,18 +104,32 @@ https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoi ...@@ -104,18 +104,32 @@ https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoi
## 单卡训练 ## 单卡训练
``` ```
#without_xla
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
python3 scripts/benchmark_training.py --gpus 1 --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights python3 scripts/benchmark_training.py --gpus 1 --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
#with xla
export HIP_VISIBLE_DEVICES=0
python3 scripts/benchmark_training_xla.py --gpus 1 --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
``` ```
## 多卡训练 ## 多卡训练
``` ```
#without xla
export HIP_VISIBLE_DEVICES=0,1 export HIP_VISIBLE_DEVICES=0,1
python3 scripts/benchmark_training.py --gpus 2 --batch_size 4 --model_dir save_model_2dcu --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights python3 scripts/benchmark_training.py --gpus 2 --batch_size 4 --model_dir save_model_2dcu --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
#with xla
export HIP_VISIBLE_DEVICES=0,1
python3 scripts/benchmark_training_xla.py --gpus 2 --batch_size 4 --model_dir save_model_2dcu --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
``` ```
## 推理 ## 推理
``` ```
#without xla
python3 scripts/benchmark_inference.py --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights python3 scripts/benchmark_inference.py --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
#with xla
python3 scripts/benchmark_inference_xla.py --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
``` ```
# 参考资料 # 参考资料
......
...@@ -201,11 +201,11 @@ class InputReader(object): ...@@ -201,11 +201,11 @@ class InputReader(object):
data_options.experimental_optimization.map_fusion = True data_options.experimental_optimization.map_fusion = True
data_options.experimental_optimization.map_parallelization = True data_options.experimental_optimization.map_parallelization = True
map_vectorization_options = tf.data.experimental.MapVectorizationOptions() #map_vectorization_options = tf.data.experimental.MapVectorizationOptions()
map_vectorization_options.enabled = True #map_vectorization_options.enabled = True
map_vectorization_options.use_choose_fastest = True #map_vectorization_options.use_choose_fastest = True
data_options.experimental_optimization.map_vectorization = map_vectorization_options #data_options.experimental_optimization.map_vectorization = map_vectorization_options
data_options.experimental_optimization.noop_elimination = True data_options.experimental_optimization.noop_elimination = True
data_options.experimental_optimization.parallel_batch = True data_options.experimental_optimization.parallel_batch = True
......
...@@ -43,7 +43,6 @@ def main(): ...@@ -43,7 +43,6 @@ def main():
f' --eval_samples 1200' f' --eval_samples 1200'
f' --use_batched_nms' f' --use_batched_nms'
f' --nouse_custom_box_proposals_op' f' --nouse_custom_box_proposals_op'
f' --xla'
f' --eval_batch_size {flags.batch_size}' f' --eval_batch_size {flags.batch_size}'
f' {"--amp" if flags.amp else ""}' f' {"--amp" if flags.amp else ""}'
) )
......
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Scripts that simplifies running evaluation benchmark """
import argparse
import os
import shutil
import subprocess
def main():
    """Build and launch the Mask R-CNN evaluation benchmark command.

    Parses the CLI flags, assembles the ``mask_rcnn_main.py`` eval command
    line, prints it framed by separator lines, runs it through the shell,
    and returns the child process's exit status.

    Returns:
        int: exit code of the benchmark subprocess (0 on success).
    """
    # CLI flags
    parser = argparse.ArgumentParser(description="MaskRCNN evaluation benchmark")
    parser.add_argument('--batch_size', type=int, required=True)
    parser.add_argument('--amp', action='store_true')
    parser.add_argument('--data_dir', type=str, default='/data')
    parser.add_argument('--model_dir', type=str, default='/tmp/model')
    parser.add_argument('--weights_dir', type=str, default='/model')
    flags = parser.parse_args()

    # mask_rcnn_main.py is expected one directory above this script.
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../mask_rcnn_main.py'))

    # build command
    cmd = (
        f'python {main_path}'
        f' --mode eval'
        f' --model_dir "{flags.model_dir}"'
        f' --checkpoint "{os.path.join(flags.weights_dir, "resnet/resnet-nhwc-2018-02-07/model.ckpt-112603")}"'
        f' --validation_file_pattern "{os.path.join(flags.data_dir, "val*.tfrecord")}"'
        f' --val_json_file "{os.path.join(flags.data_dir, "annotations/instances_val2017.json")}"'
        f' --num_steps_per_eval 200'
        f' --eval_samples 1200'
        f' --use_batched_nms'
        f' --nouse_custom_box_proposals_op'
        f' --xla'
        f' --eval_batch_size {flags.batch_size}'
        f' {"--amp" if flags.amp else ""}'
    )

    # print command
    line = '-' * shutil.get_terminal_size()[0]
    print(line, cmd, line, sep='\n')

    # run model; propagate the child's status instead of silently discarding
    # it (previously the script always exited 0 even if the benchmark failed)
    return subprocess.call(cmd, shell=True)


if __name__ == '__main__':
    # Exit with the subprocess's status so CI/scripts can detect failures.
    raise SystemExit(main())
...@@ -45,7 +45,6 @@ def main(): ...@@ -45,7 +45,6 @@ def main():
f' --use_batched_nms' f' --use_batched_nms'
f' --noeval_after_training' f' --noeval_after_training'
f' --nouse_custom_box_proposals_op' f' --nouse_custom_box_proposals_op'
f' --xla'
f' --train_batch_size {flags.batch_size}' f' --train_batch_size {flags.batch_size}'
f' {"--amp" if flags.amp else ""}' f' {"--amp" if flags.amp else ""}'
) )
......
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Scripts that simplifies running training benchmark """
import argparse
import os
import shutil
import subprocess
def main():
    """CLI entry point: assemble and run the Mask R-CNN training benchmark.

    Parses the benchmark flags, builds the ``horovodrun`` training command
    for ``mask_rcnn_main.py``, echoes it between separator lines, and
    executes it through the shell.
    """
    # Parse benchmark flags.
    arg_parser = argparse.ArgumentParser(description="MaskRCNN train benchmark")
    arg_parser.add_argument('--gpus', type=int, required=True)
    arg_parser.add_argument('--batch_size', type=int, required=True)
    arg_parser.add_argument('--amp', action='store_true')
    arg_parser.add_argument('--data_dir', type=str, default='/data')
    arg_parser.add_argument('--model_dir', type=str, default='/tmp/model')
    arg_parser.add_argument('--weights_dir', type=str, default='/model')
    opts = arg_parser.parse_args()

    # mask_rcnn_main.py lives one directory above this script.
    script = os.path.abspath(os.path.join(os.path.dirname(__file__), '../mask_rcnn_main.py'))
    checkpoint = os.path.join(opts.weights_dir, "resnet/resnet-nhwc-2018-02-07/model.ckpt-112603")
    train_glob = os.path.join(opts.data_dir, "train*.tfrecord")

    # Assemble the command from its pieces; the joined result is identical
    # to the original concatenated f-string.
    pieces = [
        f'horovodrun -np {opts.gpus} ',
        f'python {script}',
        ' --mode train',
        f' --model_dir "{opts.model_dir}"',
        f' --checkpoint "{checkpoint}"',
        f' --training_file_pattern "{train_glob}"',
        ' --init_learning_rate 0.04',
        ' --total_steps 200',
        ' --use_batched_nms',
        ' --noeval_after_training',
        ' --nouse_custom_box_proposals_op',
        ' --xla',
        f' --train_batch_size {opts.batch_size}',
        f' {"--amp" if opts.amp else ""}',
    ]
    cmd = ''.join(pieces)

    # Echo the command, framed by a terminal-wide separator.
    separator = '-' * shutil.get_terminal_size()[0]
    print(separator, cmd, separator, sep='\n')

    # Launch the training run.
    subprocess.call(cmd, shell=True)


if __name__ == '__main__':
    main()
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -49,8 +49,11 @@ export LD_LIBRARY_PATH=/public/software/compiler/rocm/dtk-21.10.1/roctracer/lib ...@@ -49,8 +49,11 @@ export LD_LIBRARY_PATH=/public/software/compiler/rocm/dtk-21.10.1/roctracer/lib
export HSA_FORCE_FINE_GRAIN_PCIE=1   export HSA_FORCE_FINE_GRAIN_PCIE=1  
export MIOPEN_FIND_MODE=3   export MIOPEN_FIND_MODE=3  
export HIP_VISIBLE_DEVICES=0   export HIP_VISIBLE_DEVICES=0  
cd /public/home/libodi/work1/ssd-tf2-master   cd /public/home/libodi/work1/ssd-tf2-master
#without xla
python3 train.py --dtype=fp32 python3 train.py --dtype=fp32
#with xla
TF_XLA_FLAGS="--tf_xla_auto_jit=2" python3 train.py --dtype=fp32
``` ```
使用ssd-tf2-master/voc_annotation.py自动生成训练集和验证集,其中训练集5717 张、验证集5823张。**具体为,修改voc_annotation.py里面的annotation_mode=2,运行voc_annotation.py生成根目录下的2007_train.txt和2007_val.txt。** 使用ssd-tf2-master/voc_annotation.py自动生成训练集和验证集,其中训练集5717 张、验证集5823张。**具体为,修改voc_annotation.py里面的annotation_mode=2,运行voc_annotation.py生成根目录下的2007_train.txt和2007_val.txt。**
......
...@@ -65,7 +65,7 @@ if __name__ == "__main__": ...@@ -65,7 +65,7 @@ if __name__ == "__main__":
# 网络一般不从0开始训练,至少会使用主干部分的权值,有些论文提到可以不用预训练,主要原因是他们 数据集较大 且 调参能力优秀。 # 网络一般不从0开始训练,至少会使用主干部分的权值,有些论文提到可以不用预训练,主要原因是他们 数据集较大 且 调参能力优秀。
# 如果一定要训练网络的主干部分,可以了解imagenet数据集,首先训练分类模型,分类模型的 主干部分 和该模型通用,基于此进行训练。 # 如果一定要训练网络的主干部分,可以了解imagenet数据集,首先训练分类模型,分类模型的 主干部分 和该模型通用,基于此进行训练。
#----------------------------------------------------------------------------------------------------------------------------# #----------------------------------------------------------------------------------------------------------------------------#
model_path = '' model_path = 'model_data/ssd_weights.h5'
#------------------------------------------------------# #------------------------------------------------------#
# 输入的shape大小 # 输入的shape大小
#------------------------------------------------------# #------------------------------------------------------#
...@@ -116,8 +116,8 @@ if __name__ == "__main__": ...@@ -116,8 +116,8 @@ if __name__ == "__main__":
#----------------------------------------------------# #----------------------------------------------------#
# 获得图片路径和标签 # 获得图片路径和标签
#----------------------------------------------------# #----------------------------------------------------#
train_annotation_path = '2012_train.txt' train_annotation_path = '2007_train.txt'
val_annotation_path = '2012_val.txt' val_annotation_path = '2007_val.txt'
#----------------------------------------------------# #----------------------------------------------------#
# 获取classes和anchor # 获取classes和anchor
......
...@@ -32,7 +32,7 @@ train_percent = 0.9 ...@@ -32,7 +32,7 @@ train_percent = 0.9
#-------------------------------------------------------# #-------------------------------------------------------#
VOCdevkit_path = 'VOCdevkit' VOCdevkit_path = 'VOCdevkit'
VOCdevkit_sets = [('2012', 'train'), ('2012', 'val')] VOCdevkit_sets = [('2007', 'train'), ('2007', 'val')]
classes, _ = get_classes(classes_path) classes, _ = get_classes(classes_path)
def convert_annotation(year, image_id, list_file): def convert_annotation(year, image_id, list_file):
...@@ -56,8 +56,8 @@ if __name__ == "__main__": ...@@ -56,8 +56,8 @@ if __name__ == "__main__":
random.seed(0) random.seed(0)
if annotation_mode == 0 or annotation_mode == 1: if annotation_mode == 0 or annotation_mode == 1:
print("Generate txt in ImageSets.") print("Generate txt in ImageSets.")
xmlfilepath = os.path.join(VOCdevkit_path, 'VOC2012/Annotations') xmlfilepath = os.path.join(VOCdevkit_path, 'VOC2007/Annotations')
saveBasePath = os.path.join(VOCdevkit_path, 'VOC2012/ImageSets/Main') saveBasePath = os.path.join(VOCdevkit_path, 'VOC2007/ImageSets/Main')
temp_xml = os.listdir(xmlfilepath) temp_xml = os.listdir(xmlfilepath)
total_xml = [] total_xml = []
for xml in temp_xml: for xml in temp_xml:
...@@ -106,4 +106,4 @@ if __name__ == "__main__": ...@@ -106,4 +106,4 @@ if __name__ == "__main__":
convert_annotation(year, image_id, list_file) convert_annotation(year, image_id, list_file)
list_file.write('\n') list_file.write('\n')
list_file.close() list_file.close()
print("Generate 2012_train.txt and 2012_val.txt for train done.") print("Generate 2007_train.txt and 2007_val.txt for train done.")
...@@ -94,7 +94,7 @@ python tools/visualize_dataset.py --classes=./data/voc2012.names ...@@ -94,7 +94,7 @@ python tools/visualize_dataset.py --classes=./data/voc2012.names
``` ```
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
export PYTHONPATH=/public/home/zhenyi/miniconda3/envs/tf2.7.0-dtk21.10-build/bin/ export PYTHONPATH=/public/home/zhenyi/miniconda3/envs/tf2.7.0-dtk21.10-build/bin/
#without xla
python train.py \ python train.py \
--dataset ./data/voc2012_train.tfrecord \ --dataset ./data/voc2012_train.tfrecord \
--val_dataset ./data/voc2012_val.tfrecord \ --val_dataset ./data/voc2012_val.tfrecord \
...@@ -105,5 +105,16 @@ python train.py \ ...@@ -105,5 +105,16 @@ python train.py \
--epochs 10 \ --epochs 10 \
--weights ./checkpoints/yolov3.tf \ --weights ./checkpoints/yolov3.tf \
--weights_num_classes 80 --weights_num_classes 80
#with xla
TF_XLA_FLAGS="--tf_xla_auto_jit=2" python train.py \
--dataset ./data/voc2012_train.tfrecord \
--val_dataset ./data/voc2012_val.tfrecord \
--classes ./data/voc2012.names \
--num_classes 20 \
--mode fit --transfer darknet \
--batch_size 16 \
--epochs 10 \
--weights ./checkpoints/yolov3.tf \
--weights_num_classes 80
``` ```
...@@ -17,8 +17,16 @@ ...@@ -17,8 +17,16 @@
- `export MIOPEN_FIND_MODE=3` - `export MIOPEN_FIND_MODE=3`
## 测试运行 ## 测试运行
#without xla
` python3 main.py ` ` python3 main.py `
#with xla
TF_XLA_FLAGS="--tf_xla_auto_jit=2" python3 main.py
## 参数说明 ## 参数说明
main.py文件内: main.py文件内:
- trainGenerator的第一个参数为batch_size - trainGenerator的第一个参数为batch_size
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment