Commit bb715355 authored by chenxi226's avatar chenxi226
Browse files

能正常训练

parent 97a58f31
......@@ -5,6 +5,8 @@ __pycache__/
*.vscode
*.simg
*.sif
data_set
models
# C extensions
*.so
......
......@@ -53,7 +53,7 @@ def nms_cpu(boxes, scores, thresh):
return torch.tensor(keep).to(boxes).long()
@autocast(enabled=False)
torch.amp.autocast("cuda",enabled=False)
def nms(boxes: Tensor, scores: Tensor, iou_threshold: float):
"""
Performs non-maximum suppression
......
......@@ -71,7 +71,7 @@ def box_area(boxes: Union[Tensor, ndarray]) -> Union[Tensor, ndarray]:
return box_area_3d(boxes)
@autocast(enabled=False)
torch.amp.autocast("cuda",enabled=False)
def box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 0) -> Tensor:
"""
Return intersection-over-union (Jaccard index) of boxes.
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
// Modifications licensed under:
// SPDX-FileCopyrightText: 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
// SPDX-License-Identifier: Apache-2.0
//
// Parts of this code are from torchvision licensed under
// SPDX-FileCopyrightText: 2016 Soumith Chintala
// SPDX-License-Identifier: BSD-3-Clause
/* adopted from
https://github.com/pytorch/vision/blob/master/torchvision/csrc/nms.h on Nov 15 2019
no cpu support, but could be added with this interface.
*/
//#include "cpu/vision_cpu.h"
#include <torch/types.h>
at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_threshold);
// Dispatch entry point for NMS: routes CUDA tensors to nms_cuda and
// rejects CPU tensors (no CPU kernel is compiled into this extension).
//
// dets:          box tensor (see nms_cuda: (n_dets, 4) in 2D, (n_dets, 6) in 3D)
// scores:        per-box scores used for suppression ordering
// iou_threshold: overlap threshold above which boxes are suppressed
// returns:       long tensor of kept box indices
at::Tensor nms(
    const at::Tensor& dets,
    const at::Tensor& scores,
    const double iou_threshold) {
  if (!dets.device().is_cuda()) {
    AT_ERROR("Not compiled with CPU support");
    //at::Tensor result = nms_cpu(dets, scores, iou_threshold);
    //return result;
  }
  // Empty input: nothing to suppress; preserve the kLong index dtype contract.
  if (dets.numel() == 0) {
    //at::hip::HIPGuardMasqueradingAsCUDA device_guard(dets.device());
    return at::empty({0}, dets.options().dtype(at::kLong));
  }
  return nms_cuda(dets, scores, iou_threshold);
}
......@@ -170,7 +170,7 @@ at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_
if (is_3d) {
//std::cout << "performing NMS on 3D boxes in CUDA" << std::endl;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.type(), "nms_kernel_cuda", [&] {
dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
nms_kernel_3d<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num,
iou_threshold,
......@@ -180,7 +180,7 @@ at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_
}
else {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.type(), "nms_kernel_cuda", [&] {
dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
nms_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num,
iou_threshold,
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
// Parts of this code are from torchvision licensed under
// SPDX-FileCopyrightText: 2016 Soumith Chintala
// SPDX-License-Identifier: BSD-3-Clause
#pragma once
// Grid-stride loop: thread `i` starts at its global index and advances by
// the total number of launched threads until all `n` elements are covered,
// so any grid size processes the full range.
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = (blockIdx.x * blockDim.x) + threadIdx.x; i < (n); \
i += (blockDim.x * gridDim.x))
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
// Modifications licensed under:
// SPDX-FileCopyrightText: 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
// SPDX-License-Identifier: Apache-2.0
//
// Parts of this code are from torchvision licensed under
// SPDX-FileCopyrightText: 2016 Soumith Chintala
// SPDX-License-Identifier: BSD-3-Clause
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <ATen/hip/HIPApplyUtils.cuh>
#include "../hip/hip_helpers.h"
#include <iostream>
#include <vector>
int const threadsPerBlock = sizeof(unsigned long long) * 8;
// Intersection-over-union of two 2D boxes a and b, each stored as
// (y1, x1, y2, x2) with y1 < y2 and x1 < x2.
template <typename T>
__device__ inline float devIoU(T const* const a, T const* const b) {
  // Intersection rectangle; extents clamp to zero when the boxes are disjoint.
  T y_lo = max(a[0], b[0]);
  T y_hi = min(a[2], b[2]);
  T x_lo = max(a[1], b[1]);
  T x_hi = min(a[3], b[3]);
  T h = max(y_hi - y_lo, (T)0);
  T w = max(x_hi - x_lo, (T)0);
  T inter = w * h;
  // Individual box areas.
  T area_a = (a[2] - a[0]) * (a[3] - a[1]);
  T area_b = (b[2] - b[0]) * (b[3] - b[1]);
  return inter / (area_a + area_b - inter);
}
// Intersection-over-union of two 3D boxes a and b, each stored as
// (y1, x1, y2, x2, z1, z2) with y1 < y2 etc.
template <typename T>
__device__ inline float devIoU_3d(T const* const a, T const* const b) {
  // Intersection cuboid; extents clamp to zero when the boxes are disjoint.
  T y_lo = max(a[0], b[0]);
  T y_hi = min(a[2], b[2]);
  T x_lo = max(a[1], b[1]);
  T x_hi = min(a[3], b[3]);
  T z_lo = max(a[4], b[4]);
  T z_hi = min(a[5], b[5]);
  T h = max(y_hi - y_lo, (T)0);
  T w = max(x_hi - x_lo, (T)0);
  T d = max(z_hi - z_lo, (T)0);
  T inter = w * h * d;
  // Individual box volumes.
  T vol_a = (a[2] - a[0]) * (a[3] - a[1]) * (a[5] - a[4]);
  T vol_b = (b[2] - b[0]) * (b[3] - b[1]) * (b[5] - b[4]);
  return inter / (vol_a + vol_b - inter);
}
// 2D NMS overlap kernel. The grid is (col_blocks, col_blocks); each block
// compares one chunk of up to threadsPerBlock (64) "row" boxes against one
// chunk of "column" boxes. Each thread owns one row box and writes a 64-bit
// mask marking which column boxes overlap it above iou_threshold.
template <typename T>
__global__ void nms_kernel(const int n_boxes, const float iou_threshold, const T* dev_boxes,
unsigned long long* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
// Valid boxes in this row/column chunk (the final chunk may be short).
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
// Stage the column chunk's boxes (4 coords each) in shared memory so every
// thread in the block can compare against them without rereading global memory.
__shared__ T block_boxes[threadsPerBlock * 4];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 4 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0];
block_boxes[threadIdx.x * 4 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1];
block_boxes[threadIdx.x * 4 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2];
block_boxes[threadIdx.x * 4 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const T* cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
unsigned long long t = 0;
int start = 0;
// On the diagonal chunk, only compare against boxes after this one so a
// box never suppresses itself and each pair is tested once.
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU<T>(cur_box, block_boxes + i * 4) > iou_threshold) {
t |= 1ULL << i;
}
}
// One mask word per (row box, column chunk) pair.
const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// 3D NMS overlap kernel: same chunked bitmask scheme as nms_kernel, but
// boxes carry 6 coordinates and overlap is computed with devIoU_3d.
template <typename T>
__global__ void nms_kernel_3d(const int n_boxes, const float iou_threshold, const T* dev_boxes,
unsigned long long* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
// Valid boxes in this row/column chunk (the final chunk may be short).
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
// Stage the column chunk's boxes (6 coords each) in shared memory.
__shared__ T block_boxes[threadsPerBlock * 6];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 6 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
block_boxes[threadIdx.x * 6 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
block_boxes[threadIdx.x * 6 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
block_boxes[threadIdx.x * 6 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
block_boxes[threadIdx.x * 6 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
block_boxes[threadIdx.x * 6 + 5] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const T* cur_box = dev_boxes + cur_box_idx * 6;
int i = 0;
unsigned long long t = 0;
int start = 0;
// Diagonal chunk: only compare against later boxes (no self-suppression,
// each pair tested once).
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU_3d<T>(cur_box, block_boxes + i * 6) > iou_threshold) {
t |= 1ULL << i;
}
}
// One mask word per (row box, column chunk) pair.
const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// Host driver for GPU NMS: sorts boxes by score, launches the 2D or 3D
// overlap kernel to build pairwise suppression bitmasks, then performs the
// greedy keep/suppress pass on the host.
//
// dets:          (n_dets, 4) 2D boxes or (n_dets, 6) 3D boxes (CUDA tensor)
// scores:        per-box scores (CUDA tensor)
// iou_threshold: overlap threshold above which lower-scored boxes are dropped
// returns:       long tensor of kept indices into the ORIGINAL (unsorted) dets
at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_threshold) {
  /* dets expected as (n_dets, dim) where dim=4 in 2D, dim=6 in 3D */
  // Tensor::type() is deprecated in recent ATen; use the direct device
  // predicate (consistent with the scalar_type() usage in the dispatch below).
  AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
  AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
  at::hip::HIPGuardMasqueradingAsCUDA device_guard(dets.device());

  bool is_3d = dets.size(1) == 6;

  // Process boxes in descending score order; order_t maps sorted -> original.
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto dets_sorted = dets.index_select(0, order_t);

  int dets_num = dets.size(0);
  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);

  // One 64-bit overlap mask per (box, column-chunk) pair, allocated as kLong
  // and reinterpreted as unsigned long long on both device and host.
  at::Tensor mask =
      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();

  if (is_3d) {
    //std::cout << "performing NMS on 3D boxes in CUDA" << std::endl;
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
          hipLaunchKernelGGL(( nms_kernel_3d<scalar_t>), dim3(blocks), dim3(threads), 0, stream,
              dets_num,
              iou_threshold,
              dets_sorted.data_ptr<scalar_t>(),
              (unsigned long long*)mask.data_ptr<int64_t>());
        });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
          hipLaunchKernelGGL(( nms_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, stream,
              dets_num,
              iou_threshold,
              dets_sorted.data_ptr<scalar_t>(),
              (unsigned long long*)mask.data_ptr<int64_t>());
        });
  }

  // Greedy pass on the host: walk boxes in score order, keep a box unless an
  // earlier kept box already suppressed it, then OR its overlap masks into
  // the running "removed" bitset. (The .to(kCPU) copy synchronizes with the
  // kernel launches above.)
  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = mask_host + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  AT_CUDA_CHECK(hipGetLastError());
  // Translate positions in the sorted order back to original box indices.
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}
\ No newline at end of file
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
// Modifications licensed under:
// SPDX-FileCopyrightText: 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
// SPDX-License-Identifier: Apache-2.0
//
// Parts of this code are from torchvision licensed under
// SPDX-FileCopyrightText: 2016 Soumith Chintala
// SPDX-License-Identifier: BSD-3-Clause
#include <torch/extension.h>
#include "cpu/nms_hip.cpp"
// Python bindings: expose the dispatching `nms` entry point to Python.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nms", &nms, "NMS C++ and/or CUDA");
}
......@@ -272,10 +272,11 @@ def smi_memory_allocated(gpu_id: int = 0) -> int:
Returns:
int: measured GPU memory in bytes
"""
reading = int(sp.check_output(
['nvidia-smi', '--query-gpu=memory.used',
'--format=csv,nounits,noheader'], encoding='utf-8').split('\n')[gpu_id])
return mb2b(reading)
# reading = int(sp.check_output(
# ['nvidia-smi', '--query-gpu=memory.used',
# '--format=csv,nounits,noheader'], encoding='utf-8').split('\n')[gpu_id])
# return mb2b(reading)
return torch.cuda.memory_allocated(device=gpu_id)
class Tracemalloc():
......
......@@ -22,7 +22,8 @@ from typing import Any, Callable, Dict, Optional, Sequence, Hashable, Type, Type
import torch
import pytorch_lightning as pl
from pytorch_lightning.core.memory import ModelSummary
# from pytorch_lightning.core.memory import ModelSummary
from pytorch_lightning.callbacks.model_summary import ModelSummary
from loguru import logger
from nndet.io.load import save_txt
......@@ -79,14 +80,14 @@ class LightningBaseModule(pl.LightningModule):
self.epoch_start_tic = time()
return super().on_epoch_start()
def validation_epoch_end(self, validation_step_outputs):
def on_validation_epoch_end(self):
"""
Print time of epoch
(needed for cluster where progress bar is deactivated)
"""
self.epoch_end_toc = time()
logger.info(f"This epoch took {int(self.epoch_end_toc - self.epoch_start_tic)} s")
return super().validation_epoch_end(validation_step_outputs)
return super().on_validation_epoch_end()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
......
......@@ -131,6 +131,8 @@ class RetinaUNetModule(LightningBaseModuleSWA):
)
self.eval_score_key = "mAP_IoU_0.10_0.50_0.05_MaxDet_100"
self.training_step_outputs = []
self.validation_step_outputs = []
def training_step(self, batch, batch_idx):
"""
......@@ -151,6 +153,7 @@ class RetinaUNetModule(LightningBaseModuleSWA):
batch_num=batch_idx,
)
loss = sum(losses.values())
self.training_step_outputs.append(loss)
return {"loss": loss, **{key: l.detach().item() for key, l in losses.items()}}
def validation_step(self, batch, batch_idx):
......@@ -175,10 +178,12 @@ class RetinaUNetModule(LightningBaseModuleSWA):
loss = sum(losses.values())
self.evaluation_step(prediction=prediction, targets=targets)
return {
output = {
"loss": loss.detach().item(),
**{key: l.detach().item() for key, l in losses.items()},
}
self.validation_step_outputs.append(output)
return output
def evaluation_step(
self,
......@@ -232,30 +237,26 @@ class RetinaUNetModule(LightningBaseModuleSWA):
target=gt_seg,
)
def training_epoch_end(self, training_step_outputs):
def on_train_epoch_end(self):
"""
Log train loss to loguru logger
Log train loss to loguru logger (PyTorch Lightning 2.x version)
"""
# process and log losses
vals = defaultdict(list)
for _val in training_step_outputs:
for _k, _v in _val.items():
if _k == "loss":
vals[_k].append(_v.detach().item())
else:
vals[_k].append(_v)
for _key, _vals in vals.items():
mean_val = np.mean(_vals)
if _key == "loss":
logger.info(f"Train loss reached: {mean_val:0.5f}")
self.log(f"train_{_key}", mean_val, sync_dist=True)
return super().training_epoch_end(training_step_outputs)
# 直接计算平均 loss(training_step_outputs 是 Tensor 列表)
avg_loss = torch.stack(self.training_step_outputs).mean()
logger.info(f"Train loss reached: {avg_loss.item():0.5f}")
self.log(f"train_loss", avg_loss, sync_dist=True)
# 清理内存
self.training_step_outputs.clear()
def validation_epoch_end(self, validation_step_outputs):
def on_validation_epoch_end(self):
"""
Log val loss to loguru logger
Log val loss to loguru logger (PyTorch Lightning 2.x version)
"""
# 从实例变量获取保存的输出
validation_step_outputs = self.validation_step_outputs
# process and log losses
vals = defaultdict(list)
for _val in validation_step_outputs:
......@@ -270,7 +271,9 @@ class RetinaUNetModule(LightningBaseModuleSWA):
# process and log metrics
self.evaluation_end()
return super().validation_epoch_end(validation_step_outputs)
# 清理内存(重要!)
self.validation_step_outputs.clear()
def evaluation_end(self):
"""
......
......@@ -22,7 +22,16 @@ from loguru import logger
import torch
from torch.optim.lr_scheduler import _LRScheduler
from pytorch_lightning.callbacks import StochasticWeightAveraging
from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
# from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
def _get_default_scheduler_config():
return {
"scheduler": None, # 必需: scheduler 实例
"interval": "epoch", # 调用时机: 'epoch' 或 'step'
"frequency": 1, # 每隔多少 interval 调用一次
"monitor": "val_loss", # 监控指标 (用于 ReduceLROnPlateau)
"strict": True, # 如果 monitor 不存在是否报错
"name": None, # TensorBoard 中显示的名称
}
from pytorch_lightning.utilities import rank_zero_warn
from nndet.training.learning_rate import CycleLinear
......@@ -54,7 +63,7 @@ class BaseSWA(StochasticWeightAveraging):
"""
super().__init__(
swa_epoch_start=swa_epoch_start,
swa_lrs=None,
swa_lrs=0.001,
annealing_epochs=10,
annealing_strategy="cos",
avg_fn=avg_fn,
......
......@@ -32,7 +32,7 @@ import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D # noqa: F401 unused import
import matplotlib.pyplot as plt
plt.style.use('seaborn-deep')
plt.style.use('default')
from sklearn.metrics import confusion_matrix
from torch import Tensor
import SimpleITK as sitk
......
......@@ -9,7 +9,8 @@ import numpy as np
from torch import Tensor
from collections import abc
from torch._six import string_classes
# from torch._six import string_classes
string_classes = (str, bytes)
from typing import Sequence, Union, Any, Mapping, Callable, List
np_str_obj_array_pattern = re.compile(r'[SaUO]')
......
......@@ -268,23 +268,25 @@ def _train(
logger.info(f"Using {plugins} plugins for training")
trainer = pl.Trainer(
gpus=list(range(num_gpus)) if num_gpus > 1 else num_gpus,
accelerator=cfg["trainer_cfg"]["accelerator"],
# gpus=list(range(num_gpus)) if num_gpus > 1 else num_gpus,
devices=num_gpus, # 替换 gpus → devices
accelerator="gpu", # 明确用 GPU
# accelerator=cfg["trainer_cfg"]["accelerator"],
precision=cfg["trainer_cfg"]["precision"],
amp_backend=cfg["trainer_cfg"]["amp_backend"],
amp_level=cfg["trainer_cfg"]["amp_level"],
# amp_backend=cfg["trainer_cfg"]["amp_backend"],
# amp_level=cfg["trainer_cfg"]["amp_level"],
benchmark=cfg["trainer_cfg"]["benchmark"],
deterministic=cfg["trainer_cfg"]["deterministic"],
callbacks=callbacks,
logger=pl_logger,
max_epochs=module.max_epochs,
progress_bar_refresh_rate=None if bool(int(os.getenv("det_verbose", 1))) else 0,
reload_dataloaders_every_epoch=False,
enable_progress_bar=None if bool(int(os.getenv("det_verbose", 1))) else 0,
# reload_dataloaders_every_epoch=False,
num_sanity_val_steps=10,
weights_summary='full',
# weights_summary='full',
plugins=plugins,
terminate_on_nan=True, # TODO: make modular
move_metrics_to_cpu=False,
# terminate_on_nan=True, # TODO: make modular
# move_metrics_to_cpu=False,
**trainer_kwargs
)
trainer.fit(module, datamodule=datamodule)
......
......@@ -232,4 +232,5 @@ def env():
if __name__ == '__main__':
env()
# env()
unpack()
......@@ -61,7 +61,7 @@ def get_extensions():
extension = CUDAExtension
sources += source_cuda
define_macros += [('WITH_CUDA', None)]
extra_compile_args["nvcc"] = [
extra_compile_args["hipcc"] = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
......@@ -71,7 +71,7 @@ def get_extensions():
# It's better if pytorch can do this by default ..
CC = os.environ.get("CC", None)
if CC is not None:
extra_compile_args["nvcc"].append("-ccbin={}".format(CC))
extra_compile_args["hipcc"].append("-ccbin={}".format(CC))
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [str(extensions_dir)]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment