Commit bb715355 authored by chenxi226's avatar chenxi226
Browse files

能正常训练

parent 97a58f31
......@@ -5,6 +5,8 @@ __pycache__/
*.vscode
*.simg
*.sif
data_set
models
# C extensions
*.so
......
......@@ -53,7 +53,7 @@ def nms_cpu(boxes, scores, thresh):
return torch.tensor(keep).to(boxes).long()
@autocast(enabled=False)
torch.amp.autocast("cuda",enabled=False)
def nms(boxes: Tensor, scores: Tensor, iou_threshold: float):
"""
Performs non-maximum suppression
......
......@@ -71,7 +71,7 @@ def box_area(boxes: Union[Tensor, ndarray]) -> Union[Tensor, ndarray]:
return box_area_3d(boxes)
@autocast(enabled=False)
torch.amp.autocast("cuda",enabled=False)
def box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 0) -> Tensor:
"""
Return intersection-over-union (Jaccard index) of boxes.
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
// Modifications licensed under:
// SPDX-FileCopyrightText: 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
// SPDX-License-Identifier: Apache-2.0
//
// Parts of this code are from torchvision licensed under
// SPDX-FileCopyrightText: 2016 Soumith Chintala
// SPDX-License-Identifier: BSD-3-Clause
/* adopted from
https://github.com/pytorch/vision/blob/master/torchvision/csrc/nms.h on Nov 15 2019
no cpu support, but could be added with this interface.
*/
//#include "cpu/vision_cpu.h"
#include <torch/types.h>
at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_threshold);
// Dispatch entry point for NMS: routes CUDA tensors to nms_cuda and
// rejects CPU tensors (no CPU kernel is compiled into this extension).
//
// dets:          box tensor (see nms_cuda: (n_dets, 4) in 2D, (n_dets, 6) in 3D)
// scores:        per-box scores used for suppression ordering
// iou_threshold: overlap threshold above which boxes are suppressed
// returns:       long tensor of kept box indices
at::Tensor nms(
    const at::Tensor& dets,
    const at::Tensor& scores,
    const double iou_threshold) {
  if (!dets.device().is_cuda()) {
    AT_ERROR("Not compiled with CPU support");
    //at::Tensor result = nms_cpu(dets, scores, iou_threshold);
    //return result;
  }
  // Empty input: nothing to suppress; preserve the kLong index dtype contract.
  if (dets.numel() == 0) {
    //at::hip::HIPGuardMasqueradingAsCUDA device_guard(dets.device());
    return at::empty({0}, dets.options().dtype(at::kLong));
  }
  return nms_cuda(dets, scores, iou_threshold);
}
......@@ -170,7 +170,7 @@ at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_
if (is_3d) {
//std::cout << "performing NMS on 3D boxes in CUDA" << std::endl;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.type(), "nms_kernel_cuda", [&] {
dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
nms_kernel_3d<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num,
iou_threshold,
......@@ -180,7 +180,7 @@ at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_
}
else {
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
dets_sorted.type(), "nms_kernel_cuda", [&] {
dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
nms_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num,
iou_threshold,
......
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
// Parts of this code are from torchvision licensed under
// SPDX-FileCopyrightText: 2016 Soumith Chintala
// SPDX-License-Identifier: BSD-3-Clause
#pragma once
// Grid-stride loop: thread `i` starts at its global index and advances by
// the total number of launched threads until all `n` elements are covered,
// so any grid size processes the full range.
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = (blockIdx.x * blockDim.x) + threadIdx.x; i < (n); \
i += (blockDim.x * gridDim.x))
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
// Modifications licensed under:
// SPDX-FileCopyrightText: 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
// SPDX-License-Identifier: Apache-2.0
//
// Parts of this code are from torchvision licensed under
// SPDX-FileCopyrightText: 2016 Soumith Chintala
// SPDX-License-Identifier: BSD-3-Clause
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/hip/HIPContext.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
#include <ATen/hip/HIPApplyUtils.cuh>
#include "../hip/hip_helpers.h"
#include <iostream>
#include <vector>
int const threadsPerBlock = sizeof(unsigned long long) * 8;
// Intersection-over-union of two 2D boxes a and b, each stored as
// (y1, x1, y2, x2) with y1 < y2 and x1 < x2.
template <typename T>
__device__ inline float devIoU(T const* const a, T const* const b) {
  // Intersection rectangle; extents clamp to zero when the boxes are disjoint.
  T y_lo = max(a[0], b[0]);
  T y_hi = min(a[2], b[2]);
  T x_lo = max(a[1], b[1]);
  T x_hi = min(a[3], b[3]);
  T h = max(y_hi - y_lo, (T)0);
  T w = max(x_hi - x_lo, (T)0);
  T inter = w * h;
  // Individual box areas.
  T area_a = (a[2] - a[0]) * (a[3] - a[1]);
  T area_b = (b[2] - b[0]) * (b[3] - b[1]);
  return inter / (area_a + area_b - inter);
}
// Intersection-over-union of two 3D boxes a and b, each stored as
// (y1, x1, y2, x2, z1, z2) with y1 < y2 etc.
template <typename T>
__device__ inline float devIoU_3d(T const* const a, T const* const b) {
  // Intersection cuboid; extents clamp to zero when the boxes are disjoint.
  T y_lo = max(a[0], b[0]);
  T y_hi = min(a[2], b[2]);
  T x_lo = max(a[1], b[1]);
  T x_hi = min(a[3], b[3]);
  T z_lo = max(a[4], b[4]);
  T z_hi = min(a[5], b[5]);
  T h = max(y_hi - y_lo, (T)0);
  T w = max(x_hi - x_lo, (T)0);
  T d = max(z_hi - z_lo, (T)0);
  T inter = w * h * d;
  // Individual box volumes.
  T vol_a = (a[2] - a[0]) * (a[3] - a[1]) * (a[5] - a[4]);
  T vol_b = (b[2] - b[0]) * (b[3] - b[1]) * (b[5] - b[4]);
  return inter / (vol_a + vol_b - inter);
}
// 2D NMS overlap kernel. The grid is (col_blocks, col_blocks); each block
// compares one chunk of up to threadsPerBlock (64) "row" boxes against one
// chunk of "column" boxes. Each thread owns one row box and writes a 64-bit
// mask marking which column boxes overlap it above iou_threshold.
template <typename T>
__global__ void nms_kernel(const int n_boxes, const float iou_threshold, const T* dev_boxes,
unsigned long long* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
// Valid boxes in this row/column chunk (the final chunk may be short).
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
// Stage the column chunk's boxes (4 coords each) in shared memory so every
// thread in the block can compare against them without rereading global memory.
__shared__ T block_boxes[threadsPerBlock * 4];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 4 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0];
block_boxes[threadIdx.x * 4 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1];
block_boxes[threadIdx.x * 4 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2];
block_boxes[threadIdx.x * 4 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const T* cur_box = dev_boxes + cur_box_idx * 4;
int i = 0;
unsigned long long t = 0;
int start = 0;
// On the diagonal chunk, only compare against boxes after this one so a
// box never suppresses itself and each pair is tested once.
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU<T>(cur_box, block_boxes + i * 4) > iou_threshold) {
t |= 1ULL << i;
}
}
// One mask word per (row box, column chunk) pair.
const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// 3D NMS overlap kernel: same chunked bitmask scheme as nms_kernel, but
// boxes carry 6 coordinates and overlap is computed with devIoU_3d.
template <typename T>
__global__ void nms_kernel_3d(const int n_boxes, const float iou_threshold, const T* dev_boxes,
unsigned long long* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
// Valid boxes in this row/column chunk (the final chunk may be short).
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
// Stage the column chunk's boxes (6 coords each) in shared memory.
__shared__ T block_boxes[threadsPerBlock * 6];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 6 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
block_boxes[threadIdx.x * 6 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
block_boxes[threadIdx.x * 6 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
block_boxes[threadIdx.x * 6 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
block_boxes[threadIdx.x * 6 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
block_boxes[threadIdx.x * 6 + 5] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const T* cur_box = dev_boxes + cur_box_idx * 6;
int i = 0;
unsigned long long t = 0;
int start = 0;
// Diagonal chunk: only compare against later boxes (no self-suppression,
// each pair tested once).
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU_3d<T>(cur_box, block_boxes + i * 6) > iou_threshold) {
t |= 1ULL << i;
}
}
// One mask word per (row box, column chunk) pair.
const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// Host driver for GPU NMS: sorts boxes by score, launches the 2D or 3D
// overlap kernel to build pairwise suppression bitmasks, then performs the
// greedy keep/suppress pass on the host.
//
// dets:          (n_dets, 4) 2D boxes or (n_dets, 6) 3D boxes (CUDA tensor)
// scores:        per-box scores (CUDA tensor)
// iou_threshold: overlap threshold above which lower-scored boxes are dropped
// returns:       long tensor of kept indices into the ORIGINAL (unsorted) dets
at::Tensor nms_cuda(const at::Tensor& dets, const at::Tensor& scores, float iou_threshold) {
  /* dets expected as (n_dets, dim) where dim=4 in 2D, dim=6 in 3D */
  // Tensor::type() is deprecated in recent ATen; use the direct device
  // predicate (consistent with the scalar_type() usage in the dispatch below).
  AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
  AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
  at::hip::HIPGuardMasqueradingAsCUDA device_guard(dets.device());

  bool is_3d = dets.size(1) == 6;

  // Process boxes in descending score order; order_t maps sorted -> original.
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto dets_sorted = dets.index_select(0, order_t);

  int dets_num = dets.size(0);
  const int col_blocks = at::cuda::ATenCeilDiv(dets_num, threadsPerBlock);

  // One 64-bit overlap mask per (box, column-chunk) pair, allocated as kLong
  // and reinterpreted as unsigned long long on both device and host.
  at::Tensor mask =
      at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));

  dim3 blocks(col_blocks, col_blocks);
  dim3 threads(threadsPerBlock);
  hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();

  if (is_3d) {
    //std::cout << "performing NMS on 3D boxes in CUDA" << std::endl;
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
          hipLaunchKernelGGL(( nms_kernel_3d<scalar_t>), dim3(blocks), dim3(threads), 0, stream,
              dets_num,
              iou_threshold,
              dets_sorted.data_ptr<scalar_t>(),
              (unsigned long long*)mask.data_ptr<int64_t>());
        });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND_HALF(
        dets_sorted.scalar_type(), "nms_kernel_cuda", [&] {
          hipLaunchKernelGGL(( nms_kernel<scalar_t>), dim3(blocks), dim3(threads), 0, stream,
              dets_num,
              iou_threshold,
              dets_sorted.data_ptr<scalar_t>(),
              (unsigned long long*)mask.data_ptr<int64_t>());
        });
  }

  // Greedy pass on the host: walk boxes in score order, keep a box unless an
  // earlier kept box already suppressed it, then OR its overlap masks into
  // the running "removed" bitset. (The .to(kCPU) copy synchronizes with the
  // kernel launches above.)
  at::Tensor mask_cpu = mask.to(at::kCPU);
  unsigned long long* mask_host = (unsigned long long*)mask_cpu.data_ptr<int64_t>();

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep =
      at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < dets_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = mask_host + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  AT_CUDA_CHECK(hipGetLastError());
  // Translate positions in the sorted order back to original box indices.
  return order_t.index(
      {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
           .to(order_t.device(), keep.scalar_type())});
}
\ No newline at end of file
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
// Modifications licensed under:
// SPDX-FileCopyrightText: 2020 Division of Medical Image Computing, German Cancer Research Center (DKFZ), Heidelberg, Germany
// SPDX-License-Identifier: Apache-2.0
//
// Parts of this code are from torchvision licensed under
// SPDX-FileCopyrightText: 2016 Soumith Chintala
// SPDX-License-Identifier: BSD-3-Clause
#include <torch/extension.h>
#include "cpu/nms_hip.cpp"
// Python bindings: expose the dispatching `nms` entry point to Python.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nms", &nms, "NMS C++ and/or CUDA");
}
......@@ -272,10 +272,11 @@ def smi_memory_allocated(gpu_id: int = 0) -> int:
Returns:
int: measured GPU memory in bytes
"""
reading = int(sp.check_output(
['nvidia-smi', '--query-gpu=memory.used',
'--format=csv,nounits,noheader'], encoding='utf-8').split('\n')[gpu_id])
return mb2b(reading)
# reading = int(sp.check_output(
# ['nvidia-smi', '--query-gpu=memory.used',
# '--format=csv,nounits,noheader'], encoding='utf-8').split('\n')[gpu_id])
# return mb2b(reading)
return torch.cuda.memory_allocated(device=gpu_id)
class Tracemalloc():
......
......@@ -22,7 +22,8 @@ from typing import Any, Callable, Dict, Optional, Sequence, Hashable, Type, Type
import torch
import pytorch_lightning as pl
from pytorch_lightning.core.memory import ModelSummary
# from pytorch_lightning.core.memory import ModelSummary
from pytorch_lightning.callbacks.model_summary import ModelSummary
from loguru import logger
from nndet.io.load import save_txt
......@@ -79,14 +80,14 @@ class LightningBaseModule(pl.LightningModule):
self.epoch_start_tic = time()
return super().on_epoch_start()
def validation_epoch_end(self, validation_step_outputs):
def on_validation_epoch_end(self):
"""
Print time of epoch
(needed for cluster where progress bar is deactivated)
"""
self.epoch_end_toc = time()
logger.info(f"This epoch took {int(self.epoch_end_toc - self.epoch_start_tic)} s")
return super().validation_epoch_end(validation_step_outputs)
return super().on_validation_epoch_end()
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
......
......@@ -131,6 +131,8 @@ class RetinaUNetModule(LightningBaseModuleSWA):
)
self.eval_score_key = "mAP_IoU_0.10_0.50_0.05_MaxDet_100"
self.training_step_outputs = []
self.validation_step_outputs = []
def training_step(self, batch, batch_idx):
"""
......@@ -151,6 +153,7 @@ class RetinaUNetModule(LightningBaseModuleSWA):
batch_num=batch_idx,
)
loss = sum(losses.values())
self.training_step_outputs.append(loss)
return {"loss": loss, **{key: l.detach().item() for key, l in losses.items()}}
def validation_step(self, batch, batch_idx):
......@@ -175,10 +178,12 @@ class RetinaUNetModule(LightningBaseModuleSWA):
loss = sum(losses.values())
self.evaluation_step(prediction=prediction, targets=targets)
return {
output = {
"loss": loss.detach().item(),
**{key: l.detach().item() for key, l in losses.items()},
}
self.validation_step_outputs.append(output)
return output
def evaluation_step(
self,
......@@ -232,30 +237,26 @@ class RetinaUNetModule(LightningBaseModuleSWA):
target=gt_seg,
)
def training_epoch_end(self, training_step_outputs):
def on_train_epoch_end(self):
"""
Log train loss to loguru logger
Log train loss to loguru logger (PyTorch Lightning 2.x version)
"""
# process and log losses
vals = defaultdict(list)
for _val in training_step_outputs:
for _k, _v in _val.items():
if _k == "loss":
vals[_k].append(_v.detach().item())
else:
vals[_k].append(_v)
for _key, _vals in vals.items():
mean_val = np.mean(_vals)
if _key == "loss":
logger.info(f"Train loss reached: {mean_val:0.5f}")
self.log(f"train_{_key}", mean_val, sync_dist=True)
return super().training_epoch_end(training_step_outputs)
# 直接计算平均 loss(training_step_outputs 是 Tensor 列表)
avg_loss = torch.stack(self.training_step_outputs).mean()
logger.info(f"Train loss reached: {avg_loss.item():0.5f}")
self.log(f"train_loss", avg_loss, sync_dist=True)
# 清理内存
self.training_step_outputs.clear()
def validation_epoch_end(self, validation_step_outputs):
def on_validation_epoch_end(self):
"""
Log val loss to loguru logger
Log val loss to loguru logger (PyTorch Lightning 2.x version)
"""
# 从实例变量获取保存的输出
validation_step_outputs = self.validation_step_outputs
# process and log losses
vals = defaultdict(list)
for _val in validation_step_outputs:
......@@ -270,7 +271,9 @@ class RetinaUNetModule(LightningBaseModuleSWA):
# process and log metrics
self.evaluation_end()
return super().validation_epoch_end(validation_step_outputs)
# 清理内存(重要!)
self.validation_step_outputs.clear()
def evaluation_end(self):
"""
......
......@@ -22,7 +22,16 @@ from loguru import logger
import torch
from torch.optim.lr_scheduler import _LRScheduler
from pytorch_lightning.callbacks import StochasticWeightAveraging
from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
# from pytorch_lightning.trainer.optimizers import _get_default_scheduler_config
def _get_default_scheduler_config():
return {
"scheduler": None, # 必需: scheduler 实例
"interval": "epoch", # 调用时机: 'epoch' 或 'step'
"frequency": 1, # 每隔多少 interval 调用一次
"monitor": "val_loss", # 监控指标 (用于 ReduceLROnPlateau)
"strict": True, # 如果 monitor 不存在是否报错
"name": None, # TensorBoard 中显示的名称
}
from pytorch_lightning.utilities import rank_zero_warn
from nndet.training.learning_rate import CycleLinear
......@@ -54,7 +63,7 @@ class BaseSWA(StochasticWeightAveraging):
"""
super().__init__(
swa_epoch_start=swa_epoch_start,
swa_lrs=None,
swa_lrs=0.001,
annealing_epochs=10,
annealing_strategy="cos",
avg_fn=avg_fn,
......
......@@ -32,7 +32,7 @@ import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D # noqa: F401 unused import
import matplotlib.pyplot as plt
plt.style.use('seaborn-deep')
plt.style.use('default')
from sklearn.metrics import confusion_matrix
from torch import Tensor
import SimpleITK as sitk
......
......@@ -9,7 +9,8 @@ import numpy as np
from torch import Tensor
from collections import abc
from torch._six import string_classes
# from torch._six import string_classes
string_classes = (str, bytes)
from typing import Sequence, Union, Any, Mapping, Callable, List
np_str_obj_array_pattern = re.compile(r'[SaUO]')
......
......@@ -268,23 +268,25 @@ def _train(
logger.info(f"Using {plugins} plugins for training")
trainer = pl.Trainer(
gpus=list(range(num_gpus)) if num_gpus > 1 else num_gpus,
accelerator=cfg["trainer_cfg"]["accelerator"],
# gpus=list(range(num_gpus)) if num_gpus > 1 else num_gpus,
devices=num_gpus, # 替换 gpus → devices
accelerator="gpu", # 明确用 GPU
# accelerator=cfg["trainer_cfg"]["accelerator"],
precision=cfg["trainer_cfg"]["precision"],
amp_backend=cfg["trainer_cfg"]["amp_backend"],
amp_level=cfg["trainer_cfg"]["amp_level"],
# amp_backend=cfg["trainer_cfg"]["amp_backend"],
# amp_level=cfg["trainer_cfg"]["amp_level"],
benchmark=cfg["trainer_cfg"]["benchmark"],
deterministic=cfg["trainer_cfg"]["deterministic"],
callbacks=callbacks,
logger=pl_logger,
max_epochs=module.max_epochs,
progress_bar_refresh_rate=None if bool(int(os.getenv("det_verbose", 1))) else 0,
reload_dataloaders_every_epoch=False,
enable_progress_bar=None if bool(int(os.getenv("det_verbose", 1))) else 0,
# reload_dataloaders_every_epoch=False,
num_sanity_val_steps=10,
weights_summary='full',
# weights_summary='full',
plugins=plugins,
terminate_on_nan=True, # TODO: make modular
move_metrics_to_cpu=False,
# terminate_on_nan=True, # TODO: make modular
# move_metrics_to_cpu=False,
**trainer_kwargs
)
trainer.fit(module, datamodule=datamodule)
......
......@@ -232,4 +232,5 @@ def env():
if __name__ == '__main__':
env()
# env()
unpack()
......@@ -61,7 +61,7 @@ def get_extensions():
extension = CUDAExtension
sources += source_cuda
define_macros += [('WITH_CUDA', None)]
extra_compile_args["nvcc"] = [
extra_compile_args["hipcc"] = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
......@@ -71,7 +71,7 @@ def get_extensions():
# It's better if pytorch can do this by default ..
CC = os.environ.get("CC", None)
if CC is not None:
extra_compile_args["nvcc"].append("-ccbin={}".format(CC))
extra_compile_args["hipcc"].append("-ccbin={}".format(CC))
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [str(extensions_dir)]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment