Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
dcuai
dlexamples
Commits
67a9c0b7
Commit
67a9c0b7
authored
Jul 30, 2022
by
sunxx1
Browse files
Merge branch 'pan_dev' into 'main'
Pan dev See merge request dcutoolkit/deeplearing/dlexamples_new!30
parents
d3cea8c9
78e30386
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
81 additions
and
36 deletions
+81
-36
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/README.md
...rch/Compute-Vision/Accuracy_Validation/ResNet50/README.md
+41
-10
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/main_acc.py
...h/Compute-Vision/Accuracy_Validation/ResNet50/main_acc.py
+0
-26
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/single_process.sh
...ute-Vision/Accuracy_Validation/ResNet50/single_process.sh
+40
-0
No files found.
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/README.md
View file @
67a9c0b7
# 介绍
# 简介
该测试用例用于ResNet50精度验证,单卡运行指令如下
# 运行示例
该测试用例可用于ResNet50/Vgg16等网络的性能测试及精度验证。
## fp32
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
## fp16
# 单卡测试(单精度)
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --amp --opt-level O1 --loss-scale=dynamic --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
# 参考
## 运行
```
export HIP_VISIBLE_DEVICES=0
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
```
# 单卡测试(混合精度)
## 运行
```
export HIP_VISIBLE_DEVICES=0
python3 main_acc.py --batch-size=64 --arch=resnet50 -j 6 --epochs=90 --amp --opt-level O1 --loss-scale=dynamic --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/
```
# 多卡测试(单精度)
[https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet)
## 运行
```
#single_process.sh与main_acc.py在同级目录下
mpirun --allow-run-as-root --bind-to none -np 4 single_process.sh localhost resnet50 64
```
# 参数说明
```
--arch 设置要测试的网络,可以是 resnet50/vgg16/inception_v3/mobilenet_v2等
```
# inception_v3 测试说明
inception_v3 测试时需要修改部分代码,具体可参考 https://developer.hpccube.com/tool/ → AI生态包 → 技术文档 → 基于pytorch的DCU深度学习测试示例文档。
# 参考
https://github.com/pytorch/examples/tree/master/imagenet
\ No newline at end of file
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/main_acc.py
View file @
67a9c0b7
...
@@ -209,22 +209,6 @@ def main_worker(gpu, ngpus_per_node, args):
...
@@ -209,22 +209,6 @@ def main_worker(gpu, ngpus_per_node, args):
model
=
torch
.
nn
.
DataParallel
(
model
).
cuda
()
model
=
torch
.
nn
.
DataParallel
(
model
).
cuda
()
# optionally resume from a checkpoint
# optionally resume from a checkpoint
# if args.resume:
# if os.path.isfile(args.resume):
# print("=> loading checkpoint '{}'".format(args.resume))
# checkpoint = torch.load(args.resume)
# args.start_epoch = checkpoint['epoch']
# best_acc1 = checkpoint['best_acc1']
# if args.gpu is not None:
# # best_acc1 may be from a checkpoint from a different GPU
# best_acc1 = best_acc1.to(args.gpu)
# model.load_state_dict(checkpoint['state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer'])
# print("=> loaded checkpoint '{}' (epoch {})"
# .format(args.resume, checkpoint['epoch']))
# else:
# print("=> no checkpoint found at '{}'".format(args.resume))
if
args
.
resume
:
if
args
.
resume
:
if
os
.
path
.
isfile
(
args
.
resume
):
if
os
.
path
.
isfile
(
args
.
resume
):
print
(
"=> loading checkpoint '{}'"
.
format
(
args
.
resume
))
print
(
"=> loading checkpoint '{}'"
.
format
(
args
.
resume
))
...
@@ -421,15 +405,6 @@ def validate(val_loader, model, criterion, args):
...
@@ -421,15 +405,6 @@ def validate(val_loader, model, criterion, args):
return
top1
.
avg
return
top1
.
avg
#def save_checkpoint(state, epoch, is_best, rank, filename='/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume/checkpoint.pth.tar'):
# filename='/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume'+str(rank)+'/checkpoint_'+str(epoch)+'.pth.tar'
# torch.save(state, filename)
# if is_best:
# #shutil.copyfile(filename, '/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume/model_best.pth.tar')
# best_dir='/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume'+str(rank)+'/model_best.pth.tar'
# shutil.copyfile(filename, best_dir)
def
save_checkpoint
(
state
,
epoch
,
is_best
,
rank
,
filename
):
def
save_checkpoint
(
state
,
epoch
,
is_best
,
rank
,
filename
):
rank_path
=
filename
+
'/'
+
str
(
rank
)
rank_path
=
filename
+
'/'
+
str
(
rank
)
if
not
os
.
path
.
isdir
(
rank_path
):
if
not
os
.
path
.
isdir
(
rank_path
):
...
@@ -437,7 +412,6 @@ def save_checkpoint(state, epoch, is_best, rank, filename):
...
@@ -437,7 +412,6 @@ def save_checkpoint(state, epoch, is_best, rank, filename):
filename
=
rank_path
+
'/checkpoint_'
+
str
(
epoch
)
+
'.pth.tar'
filename
=
rank_path
+
'/checkpoint_'
+
str
(
epoch
)
+
'.pth.tar'
torch
.
save
(
state
,
filename
)
torch
.
save
(
state
,
filename
)
if
is_best
:
if
is_best
:
#shutil.copyfile(filename, '/public/home/aiss/419_rocm2.x_DL/examples_dl/aiss_test/base_acc/HK/HK3/test_resume/model_best.pth.tar')
best_dir
=
rank_path
+
'/model_best.pth.tar'
best_dir
=
rank_path
+
'/model_best.pth.tar'
shutil
.
copyfile
(
filename
,
best_dir
)
shutil
.
copyfile
(
filename
,
best_dir
)
...
...
PyTorch/Compute-Vision/Accuracy_Validation/ResNet50/single_process.sh
0 → 100644
View file @
67a9c0b7
#!/bin/bash
# Per-rank launcher for main_acc.py. It reads the OMPI_COMM_WORLD_* variables,
# so it is meant to be started through mpirun, one copy per process.
#
# Usage: mpirun ... single_process.sh <master_host> <arch> <batch_size>
#   $1  host placed in --dist-url (tcp://<host>:34567)
#   $2  value passed to --arch
#   $3  value passed to --batch-size
#
# NOTE: --save-path and the dataset path in APP are placeholders; edit them
# before running. The interface/device names below (eno1, mlx5_N) look
# site-specific -- confirm they match the target machine.

export NCCL_SOCKET_IFNAME=eno1
export HSA_FORCE_FINE_GRAIN_PCIE=1

lrank=$OMPI_COMM_WORLD_LOCAL_RANK
comm_rank=$OMPI_COMM_WORLD_RANK
comm_size=$OMPI_COMM_WORLD_SIZE

APP="python3 `pwd`/main_acc.py --batch-size=${3} --arch=${2} -j 6 --epochs=90 --dist-url tcp://${1}:34567 --dist-backend nccl --world-size=${comm_size} --rank=${comm_rank} --save-path=/path/to/{save_model_dir} /path/to/{ImageNet_pytorch_data_dir}/"

case ${lrank} in
[0-3])
    # Local rank N uses NUMA node N and device mlx5_N; every rank sees all
    # four devices through HIP_VISIBLE_DEVICES.
    export HIP_VISIBLE_DEVICES=0,1,2,3
    export UCX_NET_DEVICES=mlx5_${lrank}:1
    export UCX_IB_PCI_BW=mlx5_${lrank}:50Gbs
    # Print the command first so it shows up in the job log.
    # ${APP} is intentionally unquoted: it must word-split into arguments.
    echo numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    numactl --cpunodebind=${lrank} --membind=${lrank} ${APP}
    ;;
*)
    # Without this branch an unexpected or unset local rank exits 0 silently
    # while the other ranks wait forever for it to join.
    echo "single_process.sh: unsupported local rank '${lrank}' (expected 0-3; launch via mpirun)" >&2
    exit 1
    ;;
esac
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment