Commit 67a9c0b7 authored by sunxx1's avatar sunxx1
Browse files

Merge branch 'pan_dev' into 'main'

Pan dev

See merge request dcutoolkit/deeplearing/dlexamples_new!30
parents d3cea8c9 78e30386
# 介绍
该测试用例用于ResNet50精度验证,单卡运行指令如下
# 简介
# 运行示例
## fp32
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
该测试用例可用于ResNet50/VGG16等网络的性能测试及精度验证。
## fp16
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --amp --opt-level O1 --loss-scale=dynamic --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
# 单卡测试(单精度)
# 参考
## 运行
```
export HIP_VISIBLE_DEVICES=0
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
```
# 单卡测试(混合精度)
## 运行
```
export HIP_VISIBLE_DEVICES=0
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --amp --opt-level O1 --loss-scale=dynamic --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
```
# 多卡测试(单精度)
[https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet)
## 运行
```
#single_process.sh与main_acc.py在同级目录下
mpirun --allow-run-as-root --bind-to none -np 4 single_process.sh localhost resnet50 64
```
# 参数说明
```
--arch 设置要测试的网络,可以是 resnet50/vgg16/inception_v3/mobilenet_v2等
```
# inception_v3 测试说明
inception_v3 测试时需要修改部分代码,具体可参考 https://developer.hpccube.com/tool/ → AI生态包 → 技术文档 → 基于pytorch的DCU深度学习测试示例文档。
# 参考
https://github.com/pytorch/examples/tree/master/imagenet
\ No newline at end of file
......@@ -209,22 +209,6 @@ def main_worker(gpu, ngpus_per_node, args):
model = torch.nn.DataParallel(model).cuda()
# optionally resume from a checkpoint
# if args.resume:
# if os.path.isfile(args.resume):
# print("=> loading checkpoint '{}'".format(args.resume))
# checkpoint = torch.load(args.resume)
# args.start_epoch = checkpoint['epoch']
# best_acc1 = checkpoint['best_acc1']
# if args.gpu is not None:
# # best_acc1 may be from a checkpoint from a different GPU
# best_acc1 = best_acc1.to(args.gpu)
# model.load_state_dict(checkpoint['state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer'])
# print("=> loaded checkpoint '{}' (epoch {})"
# .format(args.resume, checkpoint['epoch']))
# else:
# print("=> no checkpoint found at '{}'".format(args.resume))
if args.resume:
if os.path.isfile(args.resume):
print("=> loading checkpoint '{}'".format(args.resume))
......@@ -421,15 +405,6 @@ def validate(val_loader, model, criterion, args):
return top1.avg
#def save_checkpoint(state, epoch, is_best, rank, filename='/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume/checkpoint.pth.tar'):
# filename='/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume'+str(rank)+'/checkpoint_'+str(epoch)+'.pth.tar'
# torch.save(state, filename)
# if is_best:
# #shutil.copyfile(filename, '/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume/model_best.pth.tar')
# best_dir='/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume'+str(rank)+'/model_best.pth.tar'
# shutil.copyfile(filename, best_dir)
def save_checkpoint(state, epoch, is_best, rank, filename):
rank_path = filename + '/' + str(rank)
if not os.path.isdir(rank_path):
......@@ -437,7 +412,6 @@ def save_checkpoint(state, epoch, is_best, rank, filename):
filename = rank_path+'/checkpoint_'+str(epoch)+'.pth.tar'
torch.save(state, filename)
if is_best:
#shutil.copyfile(filename, '/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume/model_best.pth.tar')
best_dir=rank_path+'/model_best.pth.tar'
shutil.copyfile(filename, best_dir)
......
#!/bin/bash
# Per-rank launcher for distributed main_acc.py runs. Meant to be started by
# mpirun, one copy per rank, e.g.:
#   mpirun --allow-run-as-root --bind-to none -np 4 single_process.sh localhost resnet50 64
#
# Usage: single_process.sh <master_host> <arch> <batch_size>
#   $1  host placed in the --dist-url tcp://<host>:34567 address
#   $2  value passed to --arch
#   $3  value passed to --batch-size
#
# Environment read: OMPI_COMM_WORLD_LOCAL_RANK, OMPI_COMM_WORLD_RANK,
# OMPI_COMM_WORLD_SIZE.
#
# Exit status: that of the numactl/python3 command on success paths,
# 2 on a usage error, 1 when the local rank is not one of 0-3.

# Fail early with a usage message instead of handing empty option values
# to main_acc.py.
if [ "$#" -lt 3 ]; then
    echo "usage: $0 <master_host> <arch> <batch_size>" >&2
    exit 2
fi

export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

# Build the command as an array so every argument survives intact even when
# the working directory (or an argument) contains whitespace.
# NOTE: the /path/to/{...} entries are placeholders to be edited before use.
APP=(
    python3 "$(pwd)/main_acc.py"
    "--batch-size=${3}" "--arch=${2}" -j 6 --epochs=90
    --dist-url "tcp://${1}:34567" --dist-backend nccl
    "--world-size=${comm_size}" "--rank=${comm_rank}"
    "--save-path=/path/to/{save_model_dir}"
    "/path/to/{ImageNet_pytorch_data_dir}/"
)

case "${lrank}" in
    0|1|2|3)
        # Local rank N uses the mlx5_N device and is bound to NUMA node N;
        # all four devices stay visible to every rank.
        export HIP_VISIBLE_DEVICES=0,1,2,3
        export UCX_NET_DEVICES="mlx5_${lrank}:1"
        export UCX_IB_PCI_BW="mlx5_${lrank}:50Gbs"
        # Print the exact command line before running it.
        echo numactl "--cpunodebind=${lrank}" "--membind=${lrank}" "${APP[@]}"
        numactl "--cpunodebind=${lrank}" "--membind=${lrank}" "${APP[@]}"
        ;;
    *)
        # Previously an unset or out-of-range local rank matched no branch and
        # the script exited 0 without launching anything; report it instead.
        echo "$0: unsupported local rank '${lrank}' (expected 0-3; was this started via mpirun?)" >&2
        exit 1
        ;;
esac
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment