Commit 24f88646 authored by bdf, committed by Zaida Zhou

[Feature] Add getJobLimitCapability interface and use it in nms

parent a8f7ae48
@@ -25,6 +25,16 @@
#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y))
// Query the union-task (job) limit of the current MLU context via CNDrv.
inline int32_t getJobLimitCapability() {
  CNcontext drv_ctx;
  // Obtain the current driver context before querying its configuration.
  TORCH_CHECK(CN_SUCCESS == cnCtxGetCurrent(&drv_ctx), "cnCtxGetCurrent fails.");
  CNctxConfigParam ctx_conf_param;
  TORCH_CHECK(
      CN_SUCCESS == cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT,
                                        &ctx_conf_param),
      "cnGetCtxConfigParam fails.");
  return (int32_t)ctx_conf_param.unionLimit;
}
#endif // MMCV_WITH_MLU
#endif // PYTORCH_MLU_HELPER_HPP_
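
As a side note on the error-handling choice: the helper wraps the CNDrv query in TORCH_CHECK, so a failed driver call surfaces as an exception on the PyTorch side. A minimal sketch of the alternative, assuming the same CNDrv calls as above (the name tryGetJobLimitCapability is hypothetical and not part of this commit), would return a sentinel instead of throwing:

// Illustrative alternative (not in the commit): report failure with -1
// rather than throwing; uses only the CNDrv calls shown above.
inline int32_t tryGetJobLimitCapability() {
  CNcontext drv_ctx;
  CNctxConfigParam ctx_conf_param;
  if (cnCtxGetCurrent(&drv_ctx) != CN_SUCCESS) return -1;
  if (cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT,
                          &ctx_conf_param) != CN_SUCCESS) {
    return -1;
  }
  return static_cast<int32_t>(ctx_conf_param.unionLimit);
}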
@@ -16,9 +16,9 @@
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
               const cnrtDataType_t data_type_input, const void *boxes_ptr,
               const void *scores_ptr, const int input_num_boxes,
-              const int input_stride, const int max_output_boxes,
-              const float iou_threshold, const float offset,
-              void *workspace_ptr, void *output_size_ptr, void *output_ptr);
+              const int max_output_boxes, const float iou_threshold,
+              const float offset, void *workspace_ptr, void *output_size_ptr,
+              void *output_ptr);

int selectUnionType(uint32_t use_job, int box_num_per_core) {
  // the box_num_per_core should be at least 256, otherwise the real IO
@@ -30,6 +30,45 @@ int selectUnionType(uint32_t use_job, int box_num_per_core) {
  return use_job;
}
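
// Select the kernel launch dimensions (k_dim) and task type (k_type) for NMS
// from the job limit of the current MLU context.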
static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
int &core_num_per_class,
const int input_box_num) {
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t job_limit = getJobLimitCapability();
uint32_t core_number = job_limit;
int box_num_per_core = (input_box_num + core_number - 1) / core_number;
int use_job = selectUnionType(job_limit, box_num_per_core);
// initiate k_type as Union1
k_dim->x = core_dim;
k_dim->y = 1;
k_dim->z = 1;
*k_type = CNRT_FUNC_TYPE_UNION1;
switch (job_limit) {
case CN_KERNEL_CLASS_BLOCK:
case CN_KERNEL_CLASS_UNION:
case CN_KERNEL_CLASS_UNION2:
case CN_KERNEL_CLASS_UNION4:
case CN_KERNEL_CLASS_UNION8:
case CN_KERNEL_CLASS_UNION16: {
if (use_job < 4) {
k_dim->x = 1;
*k_type = CNRT_FUNC_TYPE_BLOCK;
} else if (use_job == 4) {
k_dim->x = core_dim;
*k_type = CNRT_FUNC_TYPE_UNION1;
} else {
k_dim->x = use_job;
*k_type = (cnrtFunctionType_t)use_job;
}
}; break;
default:
LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
<< " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
}
return CNNL_STATUS_SUCCESS;
}
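
To make the dispatch rule above easier to follow, here is a small standalone model of it (illustrative only, not part of the commit; the names TaskType and pickTaskType are hypothetical, and the real code works with cnrtFunctionType_t): fewer than 4 usable jobs run as a BLOCK task on a single core, exactly 4 as a UNION1 task across one cluster, and more than 4 as a UNIONx task whose x dimension equals the job count.

#include <cstdint>
#include <cstdio>

enum class TaskType { Block, Union1, UnionX };

// Model of policyFunc's switch body: use_job is the job count returned by
// selectUnionType(); core_dim is the number of cores per cluster.
TaskType pickTaskType(int use_job, uint32_t core_dim, uint32_t *x_dim) {
  if (use_job < 4) {          // too little work for a whole cluster
    *x_dim = 1;
    return TaskType::Block;
  } else if (use_job == 4) {  // exactly one cluster
    *x_dim = core_dim;
    return TaskType::Union1;
  }
  *x_dim = use_job;           // several clusters: UnionX over use_job cores
  return TaskType::UnionX;
}

int main() {
  uint32_t x = 0;
  TaskType t = pickTaskType(/*use_job=*/16, /*core_dim=*/4, &x);
  std::printf("task=%d k_dim.x=%u\n", static_cast<int>(t), x);  // task=2 k_dim.x=16
  return 0;
}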
Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
                            int offset) {
  // dimension parameters check
@@ -53,33 +92,14 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
  }

  int input_num_boxes = boxes.size(0);
-  int input_stride = boxes.size(0);
  int max_output_boxes = boxes.size(0);
  cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
  cnrtDim3_t k_dim;
  cnrtJobType_t k_type;
-  uint32_t union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
-  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
-  uint32_t job_limit = union_number * core_dim;
-  uint32_t core_number = union_number * core_dim;
-  int box_num_per_core = (input_num_boxes + core_number - 1) / core_number;
-  // initiate k_type as Union1
-  k_dim.x = core_dim;
-  k_dim.y = 1;
-  k_dim.z = 1;
-  k_type = CNRT_FUNC_TYPE_UNION1;
-  int use_job = selectUnionType(job_limit, box_num_per_core);
-  if (use_job < 4) {
-    k_dim.x = 1;
-    k_type = CNRT_FUNC_TYPE_BLOCK;
-  } else if (use_job == 4) {
-    k_dim.x = core_dim;
-    k_type = CNRT_FUNC_TYPE_UNION1;
-  } else {
-    k_dim.x = use_job;
-    k_type = (cnrtFunctionType_t)use_job;
-  }
+  int core_num_per_class;
+  policyFunc(&k_dim, &k_type, core_num_per_class, input_num_boxes);

  // transpose boxes (n, 4) to (4, n) for better performance
  auto boxes_t = boxes.transpose(0, 1);
@@ -96,6 +116,7 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
  } else {
    space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float);
  }

  auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
  // get compute queue
@@ -112,12 +133,12 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
  auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
  auto output_size_ptr = output_size_impl->cnnlMalloc();

+  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
  CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<<Union" << k_type / core_dim
              << ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
  KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
-            input_num_boxes, input_stride, max_output_boxes, iou_threshold,
-            offset, workspace_ptr, output_size_ptr, output_ptr);
+            input_num_boxes, max_output_boxes, iou_threshold, offset,
+            workspace_ptr, output_size_ptr, output_ptr);
  int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
  return output.slice(0, 0, output_num);
}
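
For completeness, a hedged usage sketch of the launcher (not part of the commit). It assumes, as is conventional for NMS, that the returned tensor holds the indices of the kept boxes, and that NMSMLUKernelLauncher is visible in the same translation unit; the wrapper name keep_boxes and the conversion of the indices to int64 are illustrative choices.

#include <ATen/ATen.h>

// Illustrative caller: run MLU NMS and gather the surviving boxes.
at::Tensor keep_boxes(const at::Tensor &boxes, const at::Tensor &scores,
                      float iou_threshold) {
  // NMSMLUKernelLauncher returns a 1-D tensor already truncated to the valid
  // count via output.slice(0, 0, output_num).
  at::Tensor keep = NMSMLUKernelLauncher(boxes, scores, iou_threshold,
                                         /*offset=*/0);
  // index_select expects integer indices; convert to int64 to be safe.
  return boxes.index_select(0, keep.to(at::kLong));
}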