add Minkowski conv kernel

cfaa1a3a · yanyan · 9ce18407 · cfaa1a3a · cfaa1a3a · cfaa1a3a
Commit cfaa1a3a authored Jun 16, 2020 by yanyan
20 changed files
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
      same "printed page" as the copyright notice for easier
      identification within third-party archives.
-   Copyright 2019 Yan Yan
+   Copyright 2019-2020 Yan Yan
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.

--- a/include/paramsgrid.h
+++ b/include/paramsgrid.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/box_iou.h
+++ b/include/spconv/box_iou.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/fused_conv.h
+++ b/include/spconv/fused_conv.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,10 +13,14 @@
 // limitations under the License.
 #pragma once
 #include <cuda_runtime_api.h>
-#include <tensorview/tensorview.h>
+#include <tensorview/tensor.h>
 #include <torch/script.h>
 namespace spconv {
+enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine };
+using all_fused_conv_algos_t =
+    tv::mp_list_c<int, kFSparseConvNet, kFMinkowskiEngine>;
 void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
                     torch::Tensor filters, torch::Tensor indicesIn,
                     torch::Tensor indicesOut, int nHot);
@@ -26,4 +30,26 @@ void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
                              torch::Tensor dfilters, torch::Tensor indicesIn,
                              torch::Tensor indicesOut, int nHot);
+void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
+                               torch::Tensor filters, torch::Tensor indicesIn,
+                               torch::Tensor indicesOut, int nHot);
+void fused_conv_backward_cuda_minkowski(torch::Tensor features,
+                                        torch::Tensor din, torch::Tensor dout,
+                                        torch::Tensor filters,
+                                        torch::Tensor dfilters,
+                                        torch::Tensor indicesIn,
+                                        torch::Tensor indicesOut, int nHot);
+template <int Algo> struct FusedConvDispatch;
+template <> struct FusedConvDispatch<kFSparseConvNet> {
+  constexpr static auto *fwd = fused_conv_cuda;
+  constexpr static auto *bwd = fused_conv_backward_cuda;
+};
+template <> struct FusedConvDispatch<kFMinkowskiEngine> {
+  constexpr static auto *fwd = fused_conv_cuda_minkowski;
+  constexpr static auto *bwd = fused_conv_backward_cuda_minkowski;
+};
 } // namespace spconv
--- a/include/spconv/fused_spconv_ops.h
+++ b/include/spconv/fused_spconv_ops.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/geometry.h
+++ b/include/spconv/geometry.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/indice.cu.h
+++ b/include/spconv/indice.cu.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/indice.h
+++ b/include/spconv/indice.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/maxpool.h
+++ b/include/spconv/maxpool.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/minkowski.cu.h
+++ b/include/spconv/minkowski.cu.h
+/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu).
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
+ * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
+ * of the code.
+ */
+template <typename Dtype, typename Itype, int BLOCK_SIZE>
+__global__ void matmul(const Dtype *A, const int wA, const int hA,
+                       const Dtype *B, const int wB, const int hB, Dtype *C,
+                       const Itype *in_map, const Itype *out_map) {
+  // Use in_feat as A and kernel as B: C[out_map[y], :] += A[in_map[y], :] * B
+  // Block index
+  const int bx = blockIdx.x;
+  const int by = blockIdx.y;
+  // Thread index
+  const int tx = threadIdx.x;
+  const int ty = threadIdx.y;
+  // Coordinate. y is for rows (map entries), x is for columns (of B and C).
+  const int x = BLOCK_SIZE * bx + tx;
+  const int y = BLOCK_SIZE * by + ty;
+  // Csub accumulates the single element of the block sub-matrix
+  // that is computed by this thread
+  Dtype Csub = 0;
+  const Itype in_row = y < hA ? in_map[y] : 0; // row of A to read (0 if y >= hA)
+  const Itype out_row = y < hA ? out_map[y] : 0; // row of C to update (0 if y >= hA)
+  // Loop over all the sub-matrices (tiles) of A and B, BLOCK_SIZE columns
+  // of A at a time, required to compute the block sub-matrix
+  for (int s = 0; s < wA; s += BLOCK_SIZE) {
+    // Declaration of the shared memory array As used to
+    // store the current sub-matrix of A
+    __shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
+    // Declaration of the shared memory array Bs used to
+    // store the current sub-matrix of B
+    __shared__ Dtype Bs[BLOCK_SIZE][BLOCK_SIZE];
+    // Load the matrices from device memory to shared memory; each thread
+    // loads one element of each matrix. Out-of-range elements are stored
+    // as 0 so partial tiles do not contribute to the sum.
+    As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * in_row + s + tx] : 0;
+    Bs[ty][tx] = ((s + ty) < hB && x < wB) ? B[wB * (s + ty) + x] : 0;
+    // Synchronize to make sure both tiles are fully loaded
+    __syncthreads();
+    // Multiply the two tiles together;
+    // each thread accumulates its own element
+    // of the block sub-matrix
+#pragma unroll
+    for (int k = 0; k < BLOCK_SIZE; ++k) {
+      Csub += As[ty][k] * Bs[k][tx];
+    }
+    // Synchronize to make sure that the preceding
+    // computation is done before loading two new
+    // sub-matrices of A and B in the next iteration
+    __syncthreads();
+  }
+  // Add the result into device memory at the row given by out_map;
+  // each in-range thread contributes one element
+  if (y < hA && x < wB)
+    atomicAdd(&C[wB * out_row + x], Csub);
+  // Non-atomic equivalent: C[wB * out_row + x] += Csub;
+}
+template <typename Dtype, typename Itype, int BLOCK_SIZE>
+__global__ void matmul2(const Dtype *A, const int wA, const int hA,
+                        const Dtype *B, const int wB, const int hB,
+                        const Dtype *D, const int wD, const int hD, Dtype *C,
+                        Dtype *E, const Itype *in_map, const Itype *out_map) {
+  // Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D
+  // Block index
+  const int bx = blockIdx.x;
+  const int by = blockIdx.y;
+  // Thread index
+  const int tx = threadIdx.x;
+  const int ty = threadIdx.y;
+  // Coordinate. y is for rows (map entries), x is for columns.
+  const int x = BLOCK_SIZE * bx + tx;
+  const int y = BLOCK_SIZE * by + ty;
+  const Itype in_row = y < hA ? in_map[y] : 0; // row of D read and row of C updated
+  const Itype out_row = y < hA ? out_map[y] : 0; // row of A read
+  // Csub accumulates the element of C computed by this thread over all
+  // tiles; Esub holds this thread's per-tile partial element of E
+  Dtype Csub = 0;
+  Dtype Esub = 0;
+  // Declaration of the shared memory array As used to
+  // store the current sub-matrix of A
+  __shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
+  // Declaration of the shared memory array BTs used to
+  // store the current sub-matrix of B, transposed
+  __shared__ Dtype BTs[BLOCK_SIZE][BLOCK_SIZE];
+  // Declaration of the shared memory array DTs used to
+  // store the sub-matrix of D (read transposed when forming Esub)
+  __shared__ Dtype DTs[BLOCK_SIZE][BLOCK_SIZE];
+  // The D tile does not depend on s, so it is loaded once before the loop
+  DTs[ty][tx] = (x < wD && y < hD) ? D[wD * in_row + x] : 0;
+  // Loop over all the sub-matrices (tiles) of A and B, BLOCK_SIZE columns
+  // of A at a time, required to compute the block sub-matrix
+  for (int s = 0; s < wA; s += BLOCK_SIZE) {
+    // Load the matrices from device memory to shared memory; each thread
+    // loads one element of each matrix. Out-of-range elements are stored
+    // as 0 so partial tiles do not contribute to the sums.
+    As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * out_row + s + tx] : 0;
+    // Transposed kernel: BTs[ty][tx] holds B[x][s + ty]
+    BTs[ty][tx] = ((s + ty) < wB && x < hB) ? B[wB * x + s + ty] : 0;
+    // Synchronize to make sure the tiles (including DTs) are loaded
+    __syncthreads();
+    // Multiply the A tile by the transposed B tile;
+    // each thread accumulates its own element
+    // of the block sub-matrix of C
+#pragma unroll
+    for (int k = 0; k < BLOCK_SIZE; ++k) {
+      Csub += As[ty][k] * BTs[k][tx];
+    }
+    // Esub is flushed to E once per tile, so reset it to 0 for each tile
+    Esub = 0;
+#pragma unroll
+    for (int k = 0; k < BLOCK_SIZE; ++k) {
+      Esub += DTs[k][ty] * As[k][tx];
+    }
+    // Synchronize to make sure that the preceding
+    // computation is done before loading two new
+    // sub-matrices of A and B in the next iteration
+    __syncthreads();
+    // For the E matrix which requires accumulation of multiple blocks, use
+    // atomic addition. This can be replaced with a more sophisticated
+    // reduction algorithm.
+    if ((bx * BLOCK_SIZE + ty) < wD && (s + tx) < wA)
+      atomicAdd(&E[wA * (bx * BLOCK_SIZE + ty) + (s + tx)], Esub);
+  }
+  // Add the result into device memory at the row given by in_map;
+  // each in-range thread contributes one element of C
+  if (y < hA && x < hB)
+    atomicAdd(&C[hB * in_row + x], Csub);
+}
--- a/include/spconv/nms.h
+++ b/include/spconv/nms.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/nms_functor.h
+++ b/include/spconv/nms_functor.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/nms_gpu.h
+++ b/include/spconv/nms_gpu.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/nms_ops.h
+++ b/include/spconv/nms_ops.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/pillar_scatter_functor.h
+++ b/include/spconv/pillar_scatter_functor.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/pillar_scatter_ops.h
+++ b/include/spconv/pillar_scatter_ops.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/point2voxel.h
+++ b/include/spconv/point2voxel.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/pool_ops.h
+++ b/include/spconv/pool_ops.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/reordering.cu.h
+++ b/include/spconv/reordering.cu.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

--- a/include/spconv/reordering.h
+++ b/include/spconv/reordering.h
-// Copyright 2019 Yan Yan
+// Copyright 2019-2020 Yan Yan
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.