Merge branch 'pan_dev' into 'main'

Pan dev See merge request dcutoolkit/deeplearing/dlexamples_new!30

Merge branch 'pan_dev' into 'main'
Pan dev See merge request dcutoolkit/deeplearing/dlexamples_new!30
67a9c0b7 · sunxx1 · d3cea8c9 · 78e30386 · 67a9c0b7 · 67a9c0b7
Commit 67a9c0b7 authored Jul 30, 2022 by sunxx1
3 changed files
--- a/PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/README.md
+++ b/PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/README.md
-# 介绍
-该测试用例用于ResNet50精度验证，单卡运行指令如下
+# 简介

-# 运行示例
- 
-## fp32  
-    python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
+该测试用例可用于 ResNet50/VGG16 等网络的性能测试及精度验证。

-## fp16  
-    python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --amp --opt-level O1 --loss-scale=dynamic --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
+# 单卡测试（单精度）

-# 参考
+## 运行
+
+```
+export HIP_VISIBLE_DEVICES=0
+python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
+```
+
+# 单卡测试（混合精度）
+
+## 运行
+
+```
+export HIP_VISIBLE_DEVICES=0
+python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --amp --opt-level O1 --loss-scale=dynamic --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
+```
+
+
+
+# 多卡测试（单精度）

-[https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet)
+## 运行
+
+```
+# single_process.sh 与 main_acc.py 位于同级目录下；脚本通过 `pwd` 定位 main_acc.py，因此需在该目录下执行以下命令
+mpirun --allow-run-as-root --bind-to none -np 4 single_process.sh localhost resnet50 64
+```
+
+# 参数说明
+
+```
+--arch 设置要测试的网络，可以是 resnet50/vgg16/inception_v3/mobilenet_v2等
+```
+
+# inception_v3 测试说明
+
+inception_v3 测试时需要修改部分代码，具体可参考 https://developer.hpccube.com/tool/ → AI生态包 → 技术文档 → 基于 PyTorch 的 DCU 深度学习测试示例文档。
+
+# 参考

+https://github.com/pytorch/examples/tree/master/imagenet
\ No newline at end of file
--- a/PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/main_acc.py
+++ b/PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/main_acc.py
@@ -209,22 +209,6 @@ def main_worker(gpu, ngpus_per_node, args):
            model = torch.nn.DataParallel(model).cuda()

    # optionally resume from a checkpoint
-#    if args.resume:
-#        if os.path.isfile(args.resume):
-#            print("=> loading checkpoint '{}'".format(args.resume))
-#            checkpoint = torch.load(args.resume)
-#            args.start_epoch = checkpoint['epoch']
-#            best_acc1 = checkpoint['best_acc1']
-#            if args.gpu is not None:
-#                # best_acc1 may be from a checkpoint from a different GPU
-#                best_acc1 = best_acc1.to(args.gpu)
-#            model.load_state_dict(checkpoint['state_dict'])
-#            optimizer.load_state_dict(checkpoint['optimizer'])
-#            print("=> loaded checkpoint '{}' (epoch {})"
-#                  .format(args.resume, checkpoint['epoch']))
-#        else:
-#            print("=> no checkpoint found at '{}'".format(args.resume))
-
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
@@ -421,15 +405,6 @@ def validate(val_loader, model, criterion, args):

    return top1.avg

-
-#def save_checkpoint(state, epoch, is_best, rank, filename='/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume/checkpoint.pth.tar'):
-#    filename='/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume'+str(rank)+'/checkpoint_'+str(epoch)+'.pth.tar'
-#    torch.save(state, filename)
-#    if is_best:
-#        #shutil.copyfile(filename, '/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume/model_best.pth.tar')
-#        best_dir='/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume'+str(rank)+'/model_best.pth.tar'
-#        shutil.copyfile(filename, best_dir)
-
 def save_checkpoint(state, epoch, is_best, rank, filename):
    rank_path = filename + '/' + str(rank)
    if not os.path.isdir(rank_path):
@@ -437,7 +412,6 @@ def save_checkpoint(state, epoch, is_best, rank, filename):
    filename = rank_path+'/checkpoint_'+str(epoch)+'.pth.tar'
    torch.save(state, filename)
    if is_best:
-        #shutil.copyfile(filename, '/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume/model_best.pth.tar')
        best_dir=rank_path+'/model_best.pth.tar'
        shutil.copyfile(filename, best_dir)


--- a/PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/single_process.sh
+++ b/PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/single_process.sh
+#!/bin/bash
+export NCCL_SOCKET_IFNAME=eno1
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+comm_rank=$OMPI_COMM_WORLD_RANK
+comm_size=$OMPI_COMM_WORLD_SIZE
+
+APP="python3 `pwd`/main_acc.py --batch-size=${3} --arch=${2} -j 6 --epochs=90 --dist-url tcp://${1}:34567 --dist-backend nccl --world-size=${comm_size} --rank=${comm_rank} --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/"
+
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_0:1
+  export UCX_IB_PCI_BW=mlx5_0:50Gbs
+  echo numactl --cpunodebind=0 --membind=0 ${APP}
+  numactl --cpunodebind=0 --membind=0 ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_1:1
+  export UCX_IB_PCI_BW=mlx5_1:50Gbs
+  echo numactl --cpunodebind=1 --membind=1 ${APP}
+  numactl --cpunodebind=1 --membind=1 ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_2:1
+  export UCX_IB_PCI_BW=mlx5_2:50Gbs
+  echo numactl --cpunodebind=2 --membind=2 ${APP} 
+  numactl --cpunodebind=2 --membind=2 ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export UCX_NET_DEVICES=mlx5_3:1
+  export UCX_IB_PCI_BW=mlx5_3:50Gbs
+  echo numactl --cpunodebind=3 --membind=3 ${APP}
+  numactl --cpunodebind=3 --membind=3 ${APP}
+  ;;
+esac