Commit 2836906d authored by Sugon_ldc's avatar Sugon_ldc
Browse files

add new branch based on dtk23.04 and torch1.13.1

parent 0c279359
...@@ -23,7 +23,7 @@ YOLOv5 是一种基于单阶段目标检测算法,通过将图像划分为不 ...@@ -23,7 +23,7 @@ YOLOv5 是一种基于单阶段目标检测算法,通过将图像划分为不
### Docker (方法一) ### Docker (方法一)
``` ```
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-22.10.1-py38-latest docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04.1-py38-latest
docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=32G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=32G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
...@@ -44,10 +44,10 @@ docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=32G --p ...@@ -44,10 +44,10 @@ docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=32G --p
1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装: https://developer.hpccube.com/tool/ 1、关于本项目DCU显卡所需的特殊深度学习库可从光合开发者社区下载安装: https://developer.hpccube.com/tool/
``` ```
DTK软件栈:dtk22.10.1 DTK软件栈:dtk23.04.1
python:python3.8 python:python3.8
torch:1.10 torch:1.13.1
torchvision:0.10.0 torchvision:0.14.1
``` ```
Tips:以上dtk软件栈、python、torch等DCU相关工具版本需要严格一一对应 Tips:以上dtk软件栈、python、torch等DCU相关工具版本需要严格一一对应
...@@ -104,6 +104,8 @@ COCO2017(在网络良好的情况下,如果没有下载数据集,程序会 ...@@ -104,6 +104,8 @@ COCO2017(在网络良好的情况下,如果没有下载数据集,程序会
``` ```
export HIP_VISIBLE_DEVICES=0 export HIP_VISIBLE_DEVICES=0
export USE_MIOPEN_BATCHNORM=1
python3 train.py --batch 32 --data coco.yaml --cfg 'yolov5m.yaml' --weights '' --project 'run/train' --hyp 'data/hyps/hyp.scratch-high.yaml' --epochs 1000 2>&1 | tee yolov5m.log python3 train.py --batch 32 --data coco.yaml --cfg 'yolov5m.yaml' --weights '' --project 'run/train' --hyp 'data/hyps/hyp.scratch-high.yaml' --epochs 1000 2>&1 | tee yolov5m.log
``` ```
...@@ -114,6 +116,7 @@ python3 train.py --batch 32 --data coco.yaml --cfg 'yolov5m.yaml' --weights '' - ...@@ -114,6 +116,7 @@ python3 train.py --batch 32 --data coco.yaml --cfg 'yolov5m.yaml' --weights '' -
#以单机四卡为例子 #以单机四卡为例子
export HIP_VISIBLE_DEVICES=0,1,2,3 export HIP_VISIBLE_DEVICES=0,1,2,3
export HSA_FORCE_FINE_GRAIN_PCIE=1 export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
python3 -m torch.distributed.run --nproc_per_node 4 train.py --batch 128 --data coco.yaml --cfg 'yolov5m.yaml' --weights '' --project 'run/train' --hyp 'data/hyps/hyp.scratch-high.yaml' --device 0,1,2,3 --epochs 1000 2>&1 | tee yolov5m_4.log python3 -m torch.distributed.run --nproc_per_node 4 train.py --batch 128 --data coco.yaml --cfg 'yolov5m.yaml' --weights '' --project 'run/train' --hyp 'data/hyps/hyp.scratch-high.yaml' --device 0,1,2,3 --epochs 1000 2>&1 | tee yolov5m_4.log
``` ```
...@@ -123,9 +126,17 @@ python3 -m torch.distributed.run --nproc_per_node 4 train.py --batch 128 --data ...@@ -123,9 +126,17 @@ python3 -m torch.distributed.run --nproc_per_node 4 train.py --batch 128 --data
``` ```
#下面的例子中使用两个节点,每个节点包含4张加速卡 #下面的例子中使用两个节点,每个节点包含4张加速卡
#node 1 #node 1
export HIP_VISIBLE_DEVICES=0,1,2,3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
python3 -m torch.distributed.launch --nproc_per_node 4 --nnodes 2 --node_rank 0 --master_addr "node1" --master_port 34567 train.py --batch 256 --data coco.yaml --weights '' --project 'multi/train' --hyp 'data/hyps/hyp.scratch-high.yaml' --cfg 'yolov5m.yaml' --epochs 1000 2>&1 | tee yolov5m_8.log python3 -m torch.distributed.launch --nproc_per_node 4 --nnodes 2 --node_rank 0 --master_addr "node1" --master_port 34567 train.py --batch 256 --data coco.yaml --weights '' --project 'multi/train' --hyp 'data/hyps/hyp.scratch-high.yaml' --cfg 'yolov5m.yaml' --epochs 1000 2>&1 | tee yolov5m_8.log
#node 2 #node 2
export HIP_VISIBLE_DEVICES=0,1,2,3
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
python3 -m torch.distributed.launch --nproc_per_node 4 --nnodes 2 --node_rank 1 --master_addr "node1" --master_port 34567 train.py --batch 256 --data coco.yaml --weights '' --project 'multi/train' --hyp 'data/hyps/hyp.scratch-high.yaml' --cfg 'yolov5m.yaml' --epochs 1000 2>&1 | tee yolov5m_8.log python3 -m torch.distributed.launch --nproc_per_node 4 --nnodes 2 --node_rank 1 --master_addr "node1" --master_port 34567 train.py --batch 256 --data coco.yaml --weights '' --project 'multi/train' --hyp 'data/hyps/hyp.scratch-high.yaml' --cfg 'yolov5m.yaml' --epochs 1000 2>&1 | tee yolov5m_8.log
``` ```
......
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.10.0-centos7.6-dtk-22.10.1-py38-latest FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04.1-py38-latest
RUN source /opt/dtk/env.sh RUN source /opt/dtk/env.sh
COPY requirements.txt requirements.txt COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt RUN pip3 install -r requirements.txt
......
...@@ -181,9 +181,9 @@ class ComputeLoss: ...@@ -181,9 +181,9 @@ class ComputeLoss:
], device=targets.device).float() * g # offsets ], device=targets.device).float() * g # offsets
for i in range(self.nl): for i in range(self.nl):
anchors = self.anchors[i]
gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]] # xyxy gain anchors, shape = self.anchors[i], p[i].shape
gain[2:6] = torch.tensor(shape)[[3, 2, 3, 2]] # xyxy gain
# Match targets to anchors # Match targets to anchors
t = targets * gain t = targets * gain
if nt: if nt:
...@@ -214,7 +214,7 @@ class ComputeLoss: ...@@ -214,7 +214,7 @@ class ComputeLoss:
# Append # Append
a = t[:, 6].long() # anchor indices a = t[:, 6].long() # anchor indices
indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1))) # image, anchor, grid indices indices.append((b, a, gj.clamp_(0, shape[2] - 1), gi.clamp_(0, shape[3] - 1))) # image, anchor, grid
tbox.append(torch.cat((gxy - gij, gwh), 1)) # box tbox.append(torch.cat((gxy - gij, gwh), 1)) # box
anch.append(anchors[a]) # anchors anch.append(anchors[a]) # anchors
tcls.append(c) # class tcls.append(c) # class
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment