working on tensor core test

01ed382c · yan.yan · 3517290c · 3517290c · 3517290c · 01ed382c
Commit 01ed382c authored Oct 18, 2021 by yan.yan
19 changed files
--- a/src/utils/all.cc
+++ b/src/utils/all.cc
-// Copyright 2019-2020 Yan Yan
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <spconv/box_iou.h>
-#include <spconv/nms.h>
-#include <spconv/point2voxel.h>
-namespace py = pybind11;
-using namespace pybind11::literals;
-PYBIND11_MODULE(spconv_utils, m) {
-  m.doc() = "util pybind11 functions for spconv";
-#ifdef TV_CUDA
-  m.def("non_max_suppression", &spconv::non_max_suppression<double>,
-        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
-        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
-  m.def("non_max_suppression", &spconv::non_max_suppression<float>,
-        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
-        "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
-#endif
-  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>,
-        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
-        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
-  m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>,
-        py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
-        "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
-  m.def("rotate_non_max_suppression_cpu",
-        &spconv::rotate_non_max_suppression_cpu<float>,
-        py::return_value_policy::reference_internal, "bbox iou",
-        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
-        "thresh"_a = 4);
-  m.def("rotate_non_max_suppression_cpu",
-        &spconv::rotate_non_max_suppression_cpu<double>,
-        py::return_value_policy::reference_internal, "bbox iou",
-        "box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
-        "thresh"_a = 4);
-  m.def("rbbox_iou", &spconv::rbbox_iou<double>,
-        py::return_value_policy::reference_internal, "rbbox iou",
-        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-        "standup_thresh"_a = 4);
-  m.def("rbbox_iou", &spconv::rbbox_iou<float>,
-        py::return_value_policy::reference_internal, "rbbox iou",
-        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-        "standup_thresh"_a = 4);
-  m.def("rbbox_intersection", &spconv::rbbox_intersection<double>,
-        py::return_value_policy::reference_internal, "rbbox iou",
-        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-        "standup_thresh"_a = 4);
-  m.def("rbbox_intersection", &spconv::rbbox_intersection<float>,
-        py::return_value_policy::reference_internal, "rbbox iou",
-        "box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
-        "standup_thresh"_a = 4);
-  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
-        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
-        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
-        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
-        "max_points"_a = 9, "max_voxels"_a = 10);
-  m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
-        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
-        "voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
-        "coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
-        "max_points"_a = 9, "max_voxels"_a = 10);
-  m.def("points_to_voxel_3d_np_mean",
-        &spconv::points_to_voxel_3d_np_mean<float, 3>, "matrix tensor_square",
-        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
-        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
-        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
-        "max_voxels"_a = 11);
-  m.def("points_to_voxel_3d_np_mean",
-        &spconv::points_to_voxel_3d_np_mean<double, 3>, "matrix tensor_square",
-        "points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
-        "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
-        "voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
-        "max_voxels"_a = 11);
-  m.def("points_to_voxel_3d_with_filtering",
-        &spconv::points_to_voxel_3d_with_filtering<float, 3>,
-        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
-        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
-        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
-        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
-        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
-        "block_size"_a = 15, "height_threshold"_a = 16,
-        "height_high_threshold"_a = 17);
-  m.def("points_to_voxel_3d_with_filtering",
-        &spconv::points_to_voxel_3d_with_filtering<float, 3>,
-        "matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
-        "voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
-        "maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
-        "coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
-        "max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
-        "block_size"_a = 15, "height_threshold"_a = 16,
-        "height_high_threshold"_a = 17);
-}
\ No newline at end of file
--- a/src/utils/nms.cu
+++ b/src/utils/nms.cu
-// ------------------------------------------------------------------
-// Deformable Convolutional Networks
-// Copyright (c) 2015 Microsoft
-// Licensed under The MIT License
-// Modified from MATLAB Faster R-CNN
-// (https://github.com/shaoqingren/faster_rcnn)
-// ------------------------------------------------------------------
-#include <cuda_runtime.h>
-#include <iostream>
-#include <spconv/nms_gpu.h>
-#include <vector>
-#define CUDA_CHECK(condition)                                                  \
-  /* Code block avoids redefinition of cudaError_t error */                    \
-  do {                                                                         \
-    cudaError_t error = condition;                                             \
-    if (error != cudaSuccess) {                                                \
-      std::cout << cudaGetErrorString(error) << std::endl;                     \
-    }                                                                          \
-  } while (0)
-#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-int const threadsPerBlock = sizeof(unsigned long long) * 8;
-template <typename DType>
-__device__ inline DType devIoU(DType const *const a, DType const *const b) {
-  DType left = max(a[0], b[0]), right = min(a[2], b[2]);
-  DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
-  DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
-  DType interS = width * height;
-  DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
-  DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
-  return interS / (Sa + Sb - interS);
-}
-template <typename DType, int BLOCK_THREADS>
-__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
-                           const DType *dev_boxes,
-                           unsigned long long *dev_mask) {
-  const int row_start = blockIdx.y;
-  const int col_start = blockIdx.x;
-  // if (row_start > col_start) return;
-  const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
-  const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
-  __shared__ DType block_boxes[BLOCK_THREADS * 5];
-  if (threadIdx.x < col_size) {
-#pragma unroll
-    for (int i = 0; i < 5; ++i) {
-      block_boxes[threadIdx.x * 5 + i] =
-          dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
-    }
-  }
-  __syncthreads();
-  if (threadIdx.x < row_size) {
-    const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
-    const DType *cur_box = dev_boxes + cur_box_idx * 5;
-    unsigned long long t = 0;
-    int start = 0;
-    if (row_start == col_start) {
-      start = threadIdx.x + 1;
-    }
-    for (int i = start; i < col_size; i++) {
-      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
-        t |= 1ULL << i;
-      }
-    }
-    const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
-    dev_mask[cur_box_idx * col_blocks + col_start] = t;
-  }
-}
-void _set_device(int device_id) {
-  int current_device;
-  CUDA_CHECK(cudaGetDevice(&current_device));
-  if (current_device == device_id) {
-    return;
-  }
-  // The call to cudaSetDevice must come before any calls to Get, which
-  // may perform initialization using the GPU.
-  CUDA_CHECK(cudaSetDevice(device_id));
-}
-template <typename DType, int BLOCK_THREADS>
-int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
-             int boxes_dim, DType nms_overlap_thresh, int device_id) {
-  _set_device(device_id);
-  DType *boxes_dev = NULL;
-  unsigned long long *mask_dev = NULL;
-  const int col_blocks = DIVUP(boxes_num, BLOCK_THREADS);
-  CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(DType)));
-  CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
-                        boxes_num * boxes_dim * sizeof(DType),
-                        cudaMemcpyHostToDevice));
-  CUDA_CHECK(cudaMalloc(&mask_dev,
-                        boxes_num * col_blocks * sizeof(unsigned long long)));
-  dim3 blocks(DIVUP(boxes_num, BLOCK_THREADS), DIVUP(boxes_num, BLOCK_THREADS));
-  dim3 threads(BLOCK_THREADS);
-  nms_kernel<DType, BLOCK_THREADS>
-      <<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev, mask_dev);
-  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
-  CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
-                        sizeof(unsigned long long) * boxes_num * col_blocks,
-                        cudaMemcpyDeviceToHost));
-  std::vector<unsigned long long> remv(col_blocks);
-  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
-  int num_to_keep = 0;
-  for (int i = 0; i < boxes_num; i++) {
-    int nblock = i / BLOCK_THREADS;
-    int inblock = i % BLOCK_THREADS;
-    if (!(remv[nblock] & (1ULL << inblock))) {
-      keep_out[num_to_keep++] = i;
-      unsigned long long *p = &mask_host[0] + i * col_blocks;
-      for (int j = nblock; j < col_blocks; j++) {
-        remv[j] |= p[j];
-      }
-    }
-  }
-  CUDA_CHECK(cudaFree(boxes_dev));
-  CUDA_CHECK(cudaFree(mask_dev));
-  return num_to_keep;
-}
-// template<>
-template int _nms_gpu<float, threadsPerBlock>(int *keep_out,
-                                              const float *boxes_host,
-                                              int boxes_num, int boxes_dim,
-                                              float nms_overlap_thresh,
-                                              int device_id);
-// template<>
-template int _nms_gpu<double, threadsPerBlock>(int *keep_out,
-                                               const double *boxes_host,
-                                               int boxes_num, int boxes_dim,
-                                               double nms_overlap_thresh,
-                                               int device_id);
\ No newline at end of file
--- a/test/aaa.py
+++ b/test/aaa.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+STR = """
+0.0016176700592041016
+0.002481698989868164
+0.0027854442596435547
+0.0031723976135253906
+0.0017437934875488281
+0.0020503997802734375
+0.001399993896484375
+0.0016183853149414062
+0.0007357597351074219
+0.0008492469787597656
+0.0006558895111083984
+0.0007994174957275391
+0.000335693359375
+0.000347137451171875
+"""
+"""
+0.003921985626220703
+0.0049707889556884766
+0.0052530765533447266
+0.0060312747955322266
+0.0036766529083251953
+0.00421142578125
+0.002129793167114258
+0.0023038387298583984
+0.0013151168823242188
+0.0015285015106201172
+0.0008392333984375
+0.0008127689361572266
+0.0002486705780029297
+0.00030994415283203125
+"""
+STR = """
+0.0006084442138671875
+0.0005354881286621094
+0.0012688636779785156
+0.0012619495391845703
+0.002301931381225586
+0.0019693374633789062
+0.0038712024688720703
+0.002872467041015625
+0.005068302154541016
+0.0047588348388671875
+0.007832765579223633
+0.005643367767333984
+0.005807161331176758
+0.004715442657470703"""
+"""
+0.0004992485046386719
+0.0003979206085205078
+0.0013720989227294922
+0.0015933513641357422
+0.0027768611907958984
+0.0024590492248535156
+0.004837512969970703
+0.004601001739501953
+0.009881019592285156
+0.008889913558959961
+0.017162084579467773
+0.009079217910766602
+0.009355545043945312
+0.0068836212158203125
+"""
+nums = list(map(float, STR.strip().split("\n")))
+print(sum(nums))
\ No newline at end of file
--- a/test/benchmark.py
+++ b/test/benchmark.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import time
 from pathlib import Path
 import numpy as np
 import torch
 from torch import nn
+from cumm import tensorview as tv 
-import spconv
+import spconv.pytorch as spconv
-from spconv.utils import VoxelGeneratorV2
+from spconv.utils import Point2VoxelCPU3d
 def waymo_data(batch_size=1):
-    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
+    gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
-                           150000)
+                           150000, 1)
+    # gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
+    #                        150000)
    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
-    pc = data["pc"]
+    pc = np.ascontiguousarray(data["pc"])
-    data = gen.generate(pc)
+    print(pc.shape)
-    voxels = data["voxels"].reshape(-1, 3)
+    voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
-    coors = data["coordinates"]
+    voxels = voxels_tv.numpy().reshape(-1, 3)
+    coors = indices_tv.numpy()
    N = coors.shape[0]
    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
    return voxels, coors, gen.grid_size
@@ -28,14 +44,25 @@ class Net(nn.Module):
        self.net = spconv.SparseSequential(
            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
                              algo=algo),
+            # spconv.SubMConv3d(32,
+            #                   32,
+            #                   3,
+            #                   bias=False,
+            #                   indice_key="c0",
+            #                   algo=algo),
+            # # nn.BatchNorm1d(32),
+            # # nn.ReLU(),
+            # # spconv.SparseConv3d(64, 64, 2, 2, bias=False,
+            # #                   algo=algo),
+            # spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
+            #                   algo=algo),
            spconv.SubMConv3d(64,
                              64,
                              3,
                              bias=False,
                              indice_key="c0",
                              algo=algo),
-            # nn.BatchNorm1d(32),
-            # nn.ReLU(),
            spconv.SparseMaxPool3d(2, 2),
            spconv.SubMConv3d(64,
                              96,
@@ -137,27 +164,111 @@ class Net(nn.Module):
                                    self.grid)
        return self.net(x)
+class Net2(nn.Module):
+    def __init__(self, shape, algo):
+        super().__init__()
+        self.net = spconv.SparseSequential(
+            spconv.SubMConv3d(3, 256, 3, bias=False, indice_key="c0",
+                              algo=algo),
+            # spconv.SubMConv3d(32,
+            #                   32,
+            #                   3,
+            #                   bias=False,
+            #                   indice_key="c0",
+            #                   algo=algo),
+            # # nn.BatchNorm1d(32),
+            # # nn.ReLU(),
+            # # spconv.SparseConv3d(64, 64, 2, 2, bias=False,
+            # #                   algo=algo),
+            # spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
+            #                   algo=algo),
+            spconv.SubMConv3d(256,
+                              256,
+                              3,
+                              bias=False,
+                              indice_key="c0",
+                              algo=algo),
+            # nn.BatchNorm1d(32),
+            # nn.ReLU(),
+            spconv.SparseMaxPool3d(2, 2),
+            spconv.SubMConv3d(256,
+                              512,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+            spconv.SubMConv3d(512,
+                              512,
+                              3,
+                              bias=False,
+                              indice_key="c1",
+                              algo=algo),
+        )
+        max_batch_size = 1
+        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
+        self.grid = torch.full([max_batch_size, *shape], -1,
+                               dtype=torch.int32).cuda()
+        # self.grid = None
+        self.shape = shape
+    def forward(self, features, coors, batch_size):
+        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
+                                    self.grid)
+        return self.net(x)
 def main():
-    voxels, coors, spatial_shape = waymo_data()
+    import pickle 
+    np.random.seed(50051)
+    torch.manual_seed(50051)
+    # voxels, coors, spatial_shape = waymo_data()
+    # with open("/home/yy/test_spconv.pkl", "wb") as f:
+    #     pickle.dump((voxels, coors, spatial_shape), f)
+    with open("/home/yy/test_spconv.pkl", "rb") as f:
+        (voxels, coors, spatial_shape) = pickle.load(f)
+    print(spatial_shape)
+    print(voxels.shape)
+    # voxels = voxels[:100]
+    # coors = coors[:100]
    voxels_th = torch.from_numpy(voxels).cuda().float()
    coors_th = torch.from_numpy(coors).cuda().int()
+    voxels_th.requires_grad = True
    algo = spconv.ConvAlgo.Native
-    net = Net(spatial_shape[::-1], algo).cuda().eval().float()
+    net = Net(spatial_shape, algo).cuda().eval().float()
    print(coors_th.shape)
    out = net(voxels_th, coors_th, 1)
    print(out.spatial_shape)
+    print(voxels.mean(),  voxels.max(), voxels.min())
+    dout = np.random.uniform(-0.2, 0.2,
+                                out.features.shape).astype(np.float32)
+    dout_t = torch.from_numpy(dout).cuda()
+    print(out.spatial_shape, out.features.mean(),  out.features.max(),  out.features.min())
    times = []
    with torch.no_grad():
        for i in range(20):
+            print("------------")
            torch.cuda.synchronize()
            t = time.time()
-            out = net(voxels_th, coors_th, 1)
+            out_nograd = net(voxels_th, coors_th, 1)
            torch.cuda.synchronize()
            times.append(time.time() - t)
+    print("spconv time", np.mean(times[10:]))
+    times = []
+    for i in range(10):
+        out = net(voxels_th, coors_th, 1)
+        print("------------")
+        torch.cuda.synchronize()
+        t = time.time()
+        out.features.backward(dout_t)
+        torch.cuda.synchronize()
+        times.append(time.time() - t)
    # print((net.grid == -1).float().sum(), net.grid.numel())
    # print("spconv time", time.time() - t)
-    print("spconv time", np.mean(times[10:]))
+    print("spconv bw time", np.mean(times[5:]))
 if __name__ == "__main__":

--- a/test/benchmark_detail.py
+++ b/test/benchmark_detail.py
-import time
-from pathlib import Path
-import numpy as np
-import torch
-from torch import nn
-import spconv
-from spconv.utils import VoxelGeneratorV2
-def waymo_data(batch_size=1):
-    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
-                           150000)
-    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
-    pc = data["pc"]
-    data = gen.generate(pc)
-    voxels = data["voxels"].reshape(-1, 3)
-    coors = data["coordinates"]
-    N = coors.shape[0]
-    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
-    return voxels, coors, gen.grid_size
-class Net(nn.Module):
-    def __init__(self, shape, algo):
-        super().__init__()
-        self.net = spconv.SparseSequential(
-            spconv.SubMConv3d(3,
-                              64,
-                              3,
-                              bias=False,
-                              indice_key="c0",
-                              algo=algo,
-                              name="subm-0-0"),
-            spconv.SubMConv3d(64,
-                              64,
-                              3,
-                              bias=False,
-                              indice_key="c0",
-                              algo=algo,
-                              name="subm-0-1"),
-            # nn.BatchNorm1d(32),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2, name="pool-0"),
-            spconv.SubMConv3d(64,
-                              96,
-                              3,
-                              bias=False,
-                              indice_key="c1",
-                              algo=algo,
-                              name="subm-1-0"),
-            spconv.SubMConv3d(96,
-                              96,
-                              3,
-                              bias=False,
-                              indice_key="c1",
-                              algo=algo,
-                              name="subm-1-1"),
-            # nn.BatchNorm1d(64),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2, name="pool-1"),
-            spconv.SubMConv3d(96,
-                              128,
-                              3,
-                              bias=False,
-                              indice_key="c2",
-                              algo=algo,
-                              name="subm-2-0"),
-            spconv.SubMConv3d(128,
-                              128,
-                              3,
-                              bias=False,
-                              indice_key="c2",
-                              algo=algo,
-                              name="subm-2-1"),
-            # nn.BatchNorm1d(128),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2, name="pool-2"),
-            spconv.SubMConv3d(128,
-                              160,
-                              3,
-                              bias=False,
-                              indice_key="c3",
-                              algo=algo,
-                              name="subm-3-0"),
-            spconv.SubMConv3d(160,
-                              160,
-                              3,
-                              bias=False,
-                              indice_key="c3",
-                              algo=algo,
-                              name="subm-3-1"),
-            # nn.BatchNorm1d(128),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2, name="pool-3"),
-            spconv.SubMConv3d(160,
-                              192,
-                              3,
-                              bias=False,
-                              indice_key="c4",
-                              algo=algo,
-                              name="subm-4-0"),
-            spconv.SubMConv3d(192,
-                              192,
-                              3,
-                              bias=False,
-                              indice_key="c4",
-                              algo=algo,
-                              name="subm-4-1"),
-            # nn.BatchNorm1d(128),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2, name="pool-4"),
-            spconv.SubMConv3d(192,
-                              224,
-                              3,
-                              bias=False,
-                              indice_key="c5",
-                              algo=algo,
-                              name="subm-5-0"),
-            spconv.SubMConv3d(224,
-                              224,
-                              3,
-                              bias=False,
-                              indice_key="c5",
-                              algo=algo,
-                              name="subm-5-1"),
-            # nn.BatchNorm1d(128),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2, name="pool-5"),
-            spconv.SubMConv3d(224,
-                              256,
-                              3,
-                              bias=False,
-                              indice_key="c6",
-                              algo=algo,
-                              name="subm-6-0"),
-            spconv.SubMConv3d(256,
-                              256,
-                              3,
-                              bias=False,
-                              indice_key="c6",
-                              algo=algo,
-                              name="subm-6-1"),
-        )
-        max_batch_size = 1
-        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
-        self.grid = torch.full([max_batch_size, *shape], -1,
-                               dtype=torch.int32).cuda()
-        # self.grid = None
-        self.shape = shape
-    def forward(self, features, coors, batch_size):
-        x = spconv.SparseConvTensor(features,
-                                    coors,
-                                    self.shape,
-                                    batch_size,
-                                    self.grid,
-                                    benchmark=True)
-        return self.net(x)
-def main():
-    dtype = torch.float32
-    voxels, coors, spatial_shape = waymo_data()
-    voxels_th = torch.from_numpy(voxels).cuda().to(dtype)
-    coors_th = torch.from_numpy(coors).cuda().int()
-    algo = spconv.ConvAlgo.Minkowski
-    net = Net(spatial_shape[::-1], algo).cuda().eval().to(dtype)
-    print(coors_th.shape)
-    out = net(voxels_th, coors_th, 1)
-    print(out.spatial_shape)
-    times = []
-    detail_bench = {}
-    detail_ind_gen_bench = {}
-    with torch.no_grad():
-        for i in range(20):
-            torch.cuda.synchronize()
-            t = time.time()
-            out = net(voxels_th, coors_th, 1)
-            for k, v in out.benchmark_record.items():
-                if k not in detail_bench:
-                    detail_bench[k] = []
-                    detail_ind_gen_bench[k] = []
-                detail_bench[k].extend(v["time"])
-                detail_ind_gen_bench[k].extend(v["indice_gen_time"])
-            torch.cuda.synchronize()
-            times.append(time.time() - t)
-    # print((net.grid == -1).float().sum(), net.grid.numel())
-    # print("spconv time", time.time() - t)
-    print("spconv time", np.mean(times[10:]))
-    print(detail_bench["subm-6-0"])
-    print(detail_ind_gen_bench["subm-6-0"])
-if __name__ == "__main__":
-    main()
--- a/test/benchmark_points_to_voxel.py
+++ b/test/benchmark_points_to_voxel.py
-import time
-from pathlib import Path
-import numpy as np
-import torch
-from torch import nn
-import spconv
-from spconv.utils import VoxelGeneratorV2, VoxelGeneratorV3
-def waymo_data_gpu(batch_size=1):
-    print('gpu with total points available per voxel')
-    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
-    points = torch.from_numpy(data['pc']).cuda().float()
-    voxel_size = torch.Tensor([0.1, 0.1,
-                               0.1]).to(points.dtype).to(points.device)
-    coors_range = torch.Tensor([-80, -80, -2, 80, 80,
-                                6]).to(points.dtype).to(points.device)
-    gen = VoxelGeneratorV3(voxel_size,
-                           coors_range,
-                           max_points=200000,
-                           num_features=points.shape[1],
-                           dtype=points.dtype,
-                           device=points.device)
-    voxels, coors = gen.generate(points)
-    times = []
-    with torch.no_grad():
-        for i in range(200):
-            torch.cuda.synchronize()
-            t = time.time()
-            voxels, coors = gen.generate(points)
-            torch.cuda.synchronize()
-            times.append(time.time() - t)
-    print("voxelization time", np.mean(times[100:]))
-    N = coors.shape[0]
-    batch_id = torch.zeros([N, 1], dtype=coors.dtype, device=coors.device)
-    coors = torch.cat([batch_id, coors], dim=1)
-    return voxels, coors, gen.grid_size
-def waymo_data_cpu(max_points_per_voxel=1, batch_size=1):
-    print('cpu with %d max points per voxel' % max_points_per_voxel)
-    gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6],
-                           max_points_per_voxel, 150000)
-    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
-    pc = data["pc"]
-    data = gen.generate(pc)
-    times = []
-    with torch.no_grad():
-        for i in range(200):
-            torch.cuda.synchronize()
-            t = time.time()
-            data = gen.generate(pc)
-            torch.cuda.synchronize()
-            times.append(time.time() - t)
-    print("voxelization time", np.mean(times[100:]))
-    voxels = data["voxels"].reshape(-1, 3)
-    coors = data["coordinates"]
-    N = coors.shape[0]
-    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
-    return voxels, coors, gen.grid_size
-def get_index(coor, grid_size):
-    index = coor[0]
-    for c, g in zip(coor[1:], grid_size):
-        index = index * g + c
-    return index
-def main():
-    voxels_gpu, coors_gpu, grid_size_gpu = waymo_data_gpu()
-    voxels_cpu, coors_cpu, grid_size_cpu = waymo_data_cpu(1)
-    waymo_data_cpu(10)
-    waymo_data_cpu(40)
-    print('...')
-    grid_size_gpu = grid_size_gpu[::-1]
-    grid_size_cpu = grid_size_cpu[::-1]
-    assert len(grid_size_gpu) == len(grid_size_cpu), "mismatch grid size"
-    assert grid_size_gpu[0] == grid_size_cpu[0], "mismatch grid size"
-    assert grid_size_gpu[1] == grid_size_cpu[1], "mismatch grid size"
-    assert grid_size_gpu[2] == grid_size_cpu[2], "mismatch grid size"
-    assert coors_gpu.shape[0] == coors_cpu.shape[0], "mismatch coors shape"
-    index2voxel = dict()
-    for coor, voxel in zip(coors_gpu, voxels_gpu):
-        index = get_index(coor, grid_size_gpu).item()
-        index2voxel[index] = voxel[:3].cpu()
-    for coor, voxel in zip(coors_cpu, voxels_cpu):
-        index = get_index(coor, grid_size_cpu).item()
-        assert index in index2voxel, "mismatch index: " + str(index)
-        assert (index2voxel.pop(index) - voxel[:3]).abs().max() < 0.1, \
-                    "voxel diff should be smaller than voxel_size 0.1"
-    print('Perfect GPU Voxelization!!!')
-if __name__ == "__main__":
-    main()
--- a/test/benchmark_points_to_voxel_gpu.py
+++ b/test/benchmark_points_to_voxel_gpu.py
-import time
-from pathlib import Path
-import numpy as np
-import torch
-from torch import nn
-import spconv
-from spconv.utils import VoxelGeneratorV3
-def waymo_data(batch_size=1):
-    data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
-    points = torch.from_numpy(data['pc']).cuda().float()
-    voxel_size = torch.Tensor([0.1, 0.1,
-                               0.1]).to(points.dtype).to(points.device)
-    coors_range = torch.Tensor([-80, -80, -2, 80, 80,
-                                6]).to(points.dtype).to(points.device)
-    gen = VoxelGeneratorV3(voxel_size, coors_range)
-    voxels, coors = gen.generate(points)
-    N = coors.shape[0]
-    batch_id = torch.zeros([N, 1], dtype=coors.dtype, device=coors.device)
-    coors = torch.cat([batch_id, coors], dim=1)
-    return voxels, coors, gen.grid_size
-class Net(nn.Module):
-    def __init__(self, shape, algo, device):
-        super().__init__()
-        self.device = device
-        self.net = spconv.SparseSequential(
-            spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
-                              algo=algo),
-            spconv.SubMConv3d(64,
-                              64,
-                              3,
-                              bias=False,
-                              indice_key="c0",
-                              algo=algo),
-            # nn.BatchNorm1d(32),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(64,
-                              96,
-                              3,
-                              bias=False,
-                              indice_key="c1",
-                              algo=algo),
-            spconv.SubMConv3d(96,
-                              96,
-                              3,
-                              bias=False,
-                              indice_key="c1",
-                              algo=algo),
-            # nn.BatchNorm1d(64),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(96,
-                              128,
-                              3,
-                              bias=False,
-                              indice_key="c2",
-                              algo=algo),
-            spconv.SubMConv3d(128,
-                              128,
-                              3,
-                              bias=False,
-                              indice_key="c2",
-                              algo=algo),
-            # nn.BatchNorm1d(128),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(128,
-                              160,
-                              3,
-                              bias=False,
-                              indice_key="c3",
-                              algo=algo),
-            spconv.SubMConv3d(160,
-                              160,
-                              3,
-                              bias=False,
-                              indice_key="c3",
-                              algo=algo),
-            # nn.BatchNorm1d(128),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(160,
-                              192,
-                              3,
-                              bias=False,
-                              indice_key="c4",
-                              algo=algo),
-            spconv.SubMConv3d(192,
-                              192,
-                              3,
-                              bias=False,
-                              indice_key="c4",
-                              algo=algo),
-            # nn.BatchNorm1d(128),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(192,
-                              224,
-                              3,
-                              bias=False,
-                              indice_key="c5",
-                              algo=algo),
-            spconv.SubMConv3d(224,
-                              224,
-                              3,
-                              bias=False,
-                              indice_key="c5",
-                              algo=algo),
-            # nn.BatchNorm1d(128),
-            # nn.ReLU(),
-            spconv.SparseMaxPool3d(2, 2),
-            spconv.SubMConv3d(224,
-                              256,
-                              3,
-                              bias=False,
-                              indice_key="c6",
-                              algo=algo),
-            spconv.SubMConv3d(256,
-                              256,
-                              3,
-                              bias=False,
-                              indice_key="c6",
-                              algo=algo),
-        )
-        max_batch_size = 1
-        # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
-        self.grid = torch.full([max_batch_size, *shape],
-                               -1,
-                               dtype=torch.int32,
-                               device=self.device)
-        # self.grid = None
-        self.shape = shape
-    def forward(self, features, coors, batch_size):
-        x = spconv.SparseConvTensor(features, coors, self.shape, batch_size,
-                                    self.grid)
-        return self.net(x)
-def main():
-    voxels, coors, spatial_shape = waymo_data()
-    voxels_th, coors_th = voxels, coors
-    algo = spconv.ConvAlgo.Native
-    net = Net(spatial_shape[::-1], algo,
-              voxels_th.device).cuda(device=voxels_th.device).eval().float()
-    print(coors_th.shape)
-    out = net(voxels_th, coors_th, 1)
-    print(out.spatial_shape)
-    times = []
-    with torch.no_grad():
-        for i in range(20):
-            torch.cuda.synchronize()
-            t = time.time()
-            out = net(voxels_th, coors_th, 1)
-            torch.cuda.synchronize()
-            times.append(time.time() - t)
-    # print((net.grid == -1).float().sum(), net.grid.numel())
-    # print("spconv time", time.time() - t)
-    print("spconv time", np.mean(times[10:]))
-if __name__ == "__main__":
-    main()
--- a/test/src/catch_main.cpp
+++ b/test/src/catch_main.cpp
-// 000-CatchMain.cpp
-// In a Catch project with multiple files, dedicate one file to compile the
-// source code of Catch itself and reuse the resulting object file for linking.
-// Let Catch provide main():
-#define CATCH_CONFIG_MAIN
-#include "catch.hpp"
-// That's it
-// Compile implementation of Catch for use with files that do contain tests:
-// - g++ -std=c++11 -Wall -I$(CATCH_SINGLE_INCLUDE) -c 000-CatchMain.cpp
-// - cl -EHsc -I%CATCH_SINGLE_INCLUDE% -c 000-CatchMain.cpp
--- a/test/src/test_conv_rule.cpp
+++ b/test/src/test_conv_rule.cpp
-#include <algorithm>
-#include <iostream>
-#include <map>
-#include "catch.hpp"
-#include <prettyprint.h>
-#include <string>
-#include <vector>
-#include <exception>
-#include <numeric>
-#include <pybind11/embed.h> // everything needed for embedding
-#include <pybind11/functional.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-#include <tuple>
-#include <pybind11_utils.h>
-#include <spconv/spconv_ops.h>
-namespace py = pybind11;
-TEST_CASE("GetConvIndPair", "[SpConvNet]")
-{
-    using namespace py::literals;
-    py::scoped_interpreter guard{}; // start the interpreter and keep it alive
-    py::exec(R"(
-    from __future__ import print_function
-    import numpy as np
-    import math
-    # import spconv
-    # import torch
-    def get_convolution_output_size(input_size,
-                                    kernel_size,
-                                    stride,
-                                    padding=None,
-                                    rate=None):
-        ndim = len(input_size)
-        if padding is None:
-            padding = [0] * ndim
-        output_size = []
-        for i in range(ndim):
-            output_size.append((input_size[i] + 2 * padding[i] - (
-                (kernel_size[i] - 1) + 1)) // stride[i] + 1)
-        return output_size
-    def get_test_sparse_data(shape,
-                            num_points,
-                            num_channels,
-                            integer=False,
-                            dtype=np.float32):
-        dense_shape = shape
-        ndim = len(dense_shape)
-        # num_points = np.random.randint(10, 100, size=[batch_size, ndim])
-        num_points = np.array(num_points)
-        # num_points = np.array([3, 2])
-        batch_size = len(num_points)
-        batch_indices = []
-        coors_total = np.stack(
-            np.meshgrid(*[np.arange(0, s) for s in shape]), axis=-1)
-        coors_total = coors_total.reshape(-1, ndim)
-        for i in range(batch_size):
-            np.random.shuffle(coors_total)
-            inds_total = coors_total[:num_points[i]]
-            inds_total = np.pad(
-                inds_total, ((0, 0), (0, 1)), mode="constant", constant_values=i)
-            batch_indices.append(inds_total)
-        if integer:
-            sparse_data = np.random.randint(
-                20, 100, size=[num_points.sum(), num_channels]).astype(dtype)
-        else:
-            sparse_data = np.random.uniform(
-                -1, 1, size=[num_points.sum(), num_channels]).astype(dtype)
-        # sparse_data = np.arange(1, num_points.sum() + 1).astype(np.float32).reshape(5, 1)
-        dense_data = np.zeros(
-            [batch_size, num_channels, *dense_shape], dtype=sparse_data.dtype)
-        start = 0
-        for i, inds in enumerate(batch_indices):
-            for j, ind in enumerate(inds):
-                dense_slice = (i, slice(None), *ind[:-1])
-                dense_data[dense_slice] = sparse_data[start + j]
-            start += len(inds)
-        batch_indices = np.concatenate(batch_indices, axis=0)
-        return {
-            "features": sparse_data.astype(dtype),
-            "indices": batch_indices.astype(np.int32),
-            "features_dense": dense_data.astype(dtype),
-        }
-    shape = [50, 30, 30]
-    num_points = [5000] * 1
-    # np.random.seed(np.random.randint(1, 100000))
-    in_channels = 64
-    sparse_dict = get_test_sparse_data(shape, num_points, in_channels)
-    features = np.ascontiguousarray(sparse_dict["features"]).astype(np.float32)
-    indices = np.ascontiguousarray(sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
-    features_dense = sparse_dict["features_dense"]
-    # indices_t = torch.from_numpy(indices)
-    filters = np.random.uniform(0, 1, size=[3, 3, 3, 64, 64]).astype(np.float32)
-    # print(outids.shape)
-    )");
-    SECTION("DebugTest"){
-        auto inds = array2TensorView<int>(py::array(py::globals()["indices"]));
-        auto inds_tensor = torch::from_blob(inds.data(), {inds.dim(0), inds.dim(1)}, torch::dtype(torch::kInt32));
-        auto inds_gpu = inds_tensor.to(torch::Device(torch::kCPU));
-        auto features = array2TensorView<float>(py::array(py::globals()["features"]));
-        auto features_tensor = torch::from_blob(features.data(), {features.dim(0), features.dim(1)}, torch::dtype(torch::kFloat));
-        auto features_gpu = features_tensor.to(torch::Device(torch::kCUDA, 0));
-        auto filters = array2TensorView<float>(py::array(py::globals()["filters"]));
-        auto filters_tensor = torch::from_blob(filters.data(), {filters.dim(0), filters.dim(1), filters.dim(2), filters.dim(3), filters.dim(4)}, torch::dtype(torch::kFloat));
-        auto filters_gpu = filters_tensor.to(torch::Device(torch::kCUDA, 0));
-        auto outputs = spconv::getIndicePair<3>(inds_gpu, 1, {46, 26, 26}, {50, 30, 30}, {3, 3, 3},
-            {1, 1, 1}, {0, 0, 0}, {2, 2, 2}, {0, 0, 0}, 0, 0, 0);
-        // std::cout << outputs[2] << std::endl;
-        /*
-        auto output = spconv::indiceConv<float>(features_gpu, filters_gpu, outputs[1], outputs[2], outputs[0].size(0), false);
-        std::cout << output << std::endl;*/
-    }
-}
\ No newline at end of file
--- a/test/test_conv.py
+++ b/test/test_conv.py
-# Copyright 2019-2020 Yan Yan
+# Copyright 2021 Yan Yan
-#
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -20,9 +20,9 @@ import numpy as np
 import torch
 from torch import nn
-import spconv
+import spconv.pytorch as spconv
 from spconv.test_utils import TestCase, generate_sparse_data, params_grid
+from spconv.constants import FILTER_HWIO
 # import sparseconvnet as scn
@@ -37,7 +37,7 @@ class SparseConv3dTestTorch(nn.Module):
                 stride,
                 padding,
                 dilation,
-                 algo=spconv.ConvAlgo.Minkowski):
+                 algo=spconv.ConvAlgo.Native):
        super().__init__()
        layers = [
            spconv.SparseConv3d(in_channels,
@@ -47,7 +47,6 @@ class SparseConv3dTestTorch(nn.Module):
                                padding=padding,
                                dilation=dilation,
                                bias=False,
-                                use_hash=False,
                                algo=algo)
        ]
        for i in range(1, num_layers):
@@ -59,7 +58,6 @@ class SparseConv3dTestTorch(nn.Module):
                                    padding=padding,
                                    dilation=dilation,
                                    bias=False,
-                                    use_hash=False,
                                    algo=algo))
        self.net = spconv.SparseSequential(*layers, )
        # self.grid = torch.full([3, *shape], -1, dtype=torch.int32).cuda()
@@ -359,6 +357,9 @@ class TestSpConv(TestCase):
        strides = [1, 2, 3]
        paddings = [0, 1, 2]
        dilations = [1, 2, 3]
+        # strides = [1]
+        # paddings = [0]
+        # dilations = [1]
        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
@@ -367,7 +368,6 @@ class TestSpConv(TestCase):
                continue  # don't support this.
            device = torch.device(dev)
            num_points = [1000] * bs
            sparse_dict = generate_sparse_data(shape, num_points, IC)
            features = np.ascontiguousarray(sparse_dict["features"]).astype(
@@ -375,8 +375,13 @@ class TestSpConv(TestCase):
            indices = np.ascontiguousarray(
                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
-            filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+            if FILTER_HWIO:
-                                                    OC]).astype(np.float32)
+                filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+                                                        OC]).astype(np.float32)
+            else:
+                filters = np.random.uniform(0, 1, size=[k, k, k, OC,
+                                                        IC]).astype(np.float32)
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device)
            features_t.requires_grad = True
@@ -387,11 +392,19 @@ class TestSpConv(TestCase):
            net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
                                      d).to(device)
            filters_t = torch.from_numpy(filters).to(device)
-            net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
+            if FILTER_HWIO:
-                                                              2).contiguous()
+                net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
+                                                                2).contiguous()
+            else:
+                net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
+                                                                2).contiguous()
            net.net[0].weight.data[:] = filters_t
            out_ref = net_ref(features_dense_t)
            out = net(features_t, indices_t, bs).dense()
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
            dout = np.random.uniform(-0.2, 0.2,
                                     out_ref.shape).astype(features.dtype)
            dout_t = torch.from_numpy(dout).to(device)
@@ -401,18 +414,21 @@ class TestSpConv(TestCase):
                                                               1).contiguous()
            din_sparse = gather_nd(din_dense, indices_t.long())
            din = features_t.grad.detach()
            din_np = din.cpu().numpy()
            din_sparse_np = din_sparse.cpu().numpy()
-            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
            for layer, layer_ref in zip(net.net, net_ref.net):
                dw = layer.weight.grad.detach().cpu().numpy()
                dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
-                dw = dw.transpose(4, 3, 0, 1, 2)
+                if FILTER_HWIO:
+                    dw = dw.transpose(4, 3, 0, 1, 2)
+                else:
+                    dw = dw.transpose(3, 4, 0, 1, 2)
                self.assertAllClose(dw, dw_ref, atol=1e-4)
+            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
-            out_np = out.detach().cpu().numpy()
-            out_ref_np = out_ref.detach().cpu().numpy()
-            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
    def testSpDeConv3d(self):
        np.random.seed(484)
@@ -454,7 +470,7 @@ class TestSpConv(TestCase):
            net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
                                        d).to(device)
            filters_t = torch.from_numpy(filters).to(device)
-            net_ref.net[0].weight.data[:] = filters_t.permute(3, 4, 0, 1,
+            net_ref.net[0].weight.data[:] = filters_t.permute(4, 3, 0, 1,
                                                              2).contiguous()
            net.net[0].weight.data[:] = filters_t
            out_ref = net_ref(features_dense_t)
@@ -474,7 +490,7 @@ class TestSpConv(TestCase):
            for layer, layer_ref in zip(net.net, net_ref.net):
                dw = layer.weight.grad.detach().cpu().numpy()
                dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
-                dw = dw.transpose(3, 4, 0, 1, 2)
+                dw = dw.transpose(4, 3, 0, 1, 2)
                self.assertAllClose(dw, dw_ref, atol=1e-4)
            out_np = out.detach().cpu().numpy()
@@ -551,12 +567,16 @@ class TestSpConv(TestCase):
        shapes = [[19, 18, 17]]
        batchsizes = [1, 2]
-        in_channels = [62]
+        in_channels = [64]
-        out_channels = [62]
+        out_channels = [64]
        ksizes = [2, 3]
        strides = [1, 2, 3]
        paddings = [0, 1]
        dilations = [1, 2, 3]
+        ksizes = [2]
+        strides = [2]
+        paddings = [0]
+        dilations = [1]
        for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
                devices, shapes, batchsizes, in_channels, out_channels, ksizes,
@@ -565,6 +585,7 @@ class TestSpConv(TestCase):
                continue  # don't support this.
            device = torch.device(dev)
            num_points = [1000] * bs
            # when data contains negative, sparse maxpool is not equal to dense maxpool.
            sparse_dict = generate_sparse_data(shape,
                                               num_points,
@@ -576,8 +597,8 @@ class TestSpConv(TestCase):
            indices = np.ascontiguousarray(
                sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
            features_dense = sparse_dict["features_dense"].astype(np.float32)
-            filters = np.random.uniform(0, 1, size=[k, k, k, IC,
+            filters = np.random.uniform(0, 1, size=[k, k, k, OC,
-                                                    OC]).astype(np.float32)
+                                                    IC]).astype(np.float32)
            indices_t = torch.from_numpy(indices).int().to(device)
            features_t = torch.from_numpy(features).to(device)
            features_t.requires_grad = True
@@ -588,11 +609,15 @@ class TestSpConv(TestCase):
            out_ref = net_ref(features_dense_t)
            out = net(features_t, indices_t, bs)
            outids = out.indices
            outfeatures = out.features
            outids_dev = outids.float()
            out_dense = out.dense(channels_first=False)
            out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
+            out_np = out.detach().cpu().numpy()
+            out_ref_np = out_ref.detach().cpu().numpy()
+            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
            dout_sparse = np.random.uniform(
                -0.2, 0.2, outfeatures.shape).astype(features.dtype)
@@ -607,9 +632,6 @@ class TestSpConv(TestCase):
            din_sparse = gather_nd(din_dense, indices_t.long())
            din = features_t.grad.detach()
-            out_np = out.detach().cpu().numpy()
-            out_ref_np = out_ref.detach().cpu().numpy()
-            self.assertAllClose(out_np, out_ref_np, atol=1e-4)
            din_np = din.cpu().numpy()
            din_sparse_np = din_sparse.cpu().numpy()
            self.assertAllClose(din_np, din_sparse_np, atol=1e-4)

--- a/third_party/catch2/catch.hpp
+++ b/third_party/catch2/catch.hpp
--- a/cutlass @ fd7e058d
+++ b/cutlass @ fd7e058d
-Subproject commit fd7e058d0cb3e4bf743edc530c7778a210cb168b
--- a/mp11 @ 29764aad
+++ b/mp11 @ 29764aad
-Subproject commit 29764aad4881fde809af6a025c12012e47a55515
--- a/pybind11 @ 3b1dbeba
+++ b/pybind11 @ 3b1dbeba
-Subproject commit 3b1dbebabc801c9cf6f0953a4c20b904d444f879
--- a/tools/README.md
+++ b/tools/README.md
+<!--
+ Copyright 2021 Yan Yan
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+## How to debug manylinux build
+```Bash
+docker run --rm -it -e PLAT=manylinux2014_x86_64 -v `pwd`:/io -v $HOME:/myhome scrin/manylinux2014-cuda:cu114-devel bash
+/io/tools/build-wheels.sh
+```
\ No newline at end of file
--- a/tools/build-wheels.sh
+++ b/tools/build-wheels.sh
+#!/bin/bash
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e -u -x
+# Run auditwheel over a single built wheel.
+#   $1 - path of the wheel to process
+#   $2 - directory that receives the repaired wheel
+# `auditwheel show` is used as the probe: when it fails on the file the wheel
+# is reported as skipped, otherwise it is repaired for the platform tag in $PLAT.
+function repair_wheel {
+    wheel="$1"
+    outpath="$2"
+    if auditwheel show "$wheel"; then
+        auditwheel repair "$wheel" --plat "$PLAT" -w "$outpath"
+    else
+        echo "Skipping non-platform wheel $wheel"
+    fi
+}
+# Build-time switches exported to the wheel builds below.
+# NOTE(review): presumably these turn off JIT compilation and target every CUDA arch - confirm against the build scripts.
+export SPCONV_DISABLE_JIT="1"
+export CUMM_CUDA_ARCH_LIST="all"
+# Compile wheels, we only support 3.7-3.10.
+# "/opt/python/cp36-cp36m/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
+for py_tag in cp37-cp37m cp38-cp38 cp39-cp39 cp310-cp310; do
+    "/opt/python/${py_tag}/bin/pip" wheel /io/ --no-deps -w /io/wheelhouse_tmp
+done
+# Bundle external shared libraries into the wheels
+for built_wheel in /io/wheelhouse_tmp/*.whl; do
+    repair_wheel "$built_wheel" /io/dist
+done
+# The unrepaired wheels are only an intermediate product; the repaired ones were written to /io/dist.
+rm -rf /io/wheelhouse_tmp
\ No newline at end of file
--- a/tools/install_windows_cuda.ps1
+++ b/tools/install_windows_cuda.ps1
+## -------------------
+## Constants
+## -------------------
+# Dictionary of known CUDA versions and their download URLs, which do not follow a consistent pattern :(
+# Every entry uses HTTPS: the downloaded file is an installer that this script executes later on.
+$CUDA_KNOWN_URLS = @{
+    "10.2" = "https://developer.download.nvidia.com/compute/cuda/10.2/Prod/network_installers/cuda_10.2.89_win10_network.exe";
+    "11.0" = "https://developer.download.nvidia.com/compute/cuda/11.0.3/network_installers/cuda_11.0.3_win10_network.exe";
+    "11.1" = "https://developer.download.nvidia.com/compute/cuda/11.1.1/network_installers/cuda_11.1.1_win10_network.exe";
+    "11.2" = "https://developer.download.nvidia.com/compute/cuda/11.2.2/network_installers/cuda_11.2.2_win10_network.exe";
+    "11.3" = "https://developer.download.nvidia.com/compute/cuda/11.3.1/network_installers/cuda_11.3.1_win10_network.exe";
+    "11.4" = "https://developer.download.nvidia.com/compute/cuda/11.4.2/network_installers/cuda_11.4.2_win10_network.exe";
+}
+# cuda_runtime.h is in nvcc <= 10.2, but cudart >= 11.0
+# @todo - make this easier to vary per CUDA version.
+# Base names of the installer sub-packages to request; the version suffix is appended later in the script.
+$CUDA_PACKAGES_IN = @(
+    "nvcc";
+    "visual_studio_integration";
+    "curand_dev";
+    "nvrtc_dev";
+    "cudart";
+)
+## -------------------
+## Select CUDA version
+## -------------------
+# The requested CUDA version is supplied by the caller through the `cuda` environment variable.
+$CUDA_VERSION_FULL = $env:cuda
+# Only "<major>.<minor>" is accepted (major has no leading zero); any other value makes the script exit with status 1.
+if(-not ($CUDA_VERSION_FULL -match "^(?<major>[1-9][0-9]*)\.(?<minor>[0-9]+)$")){
+    Write-Output "Invalid CUDA version specified, <major>.<minor> required. '$CUDA_VERSION_FULL'."
+    exit 1
+}
+# The named capture groups of the successful match are read back from $Matches.
+$CUDA_MAJOR = $Matches['major']
+$CUDA_MINOR = $Matches['minor']
+## ------------------------------------------------
+## Select CUDA packages to install from environment
+## ------------------------------------------------
+# Accumulates the installer arguments: one " <package>_<major>.<minor>" entry (leading space included) per requested package.
+$CUDA_PACKAGES = ""
+# for CUDA >= 11 cudart is a required package.
+# if([version]$CUDA_VERSION_FULL -ge [version]"11.0") {
+#     if(-not $CUDA_PACKAGES_IN -contains "cudart") {
+#         $CUDA_PACKAGES_IN += 'cudart'
+#     }
+# }
+# Parsed once so the comparison below is a real version comparison, not a string comparison.
+$requested_version = [version]$CUDA_VERSION_FULL
+$nvcc_name_cutoff = [version]"9.1"
+Foreach ($package in $CUDA_PACKAGES_IN) {
+    # Below 9.1 the entry "nvcc" is requested as "compiler"; from 9.1 onwards "compiler" is requested as "nvcc".
+    $resolved_name = $package
+    if($requested_version -lt $nvcc_name_cutoff){
+        if($package -eq "nvcc"){
+            $resolved_name = "compiler"
+        }
+    } elseif($package -eq "compiler") {
+        $resolved_name = "nvcc"
+    }
+    $CUDA_PACKAGES += " $($resolved_name)_$($CUDA_MAJOR).$($CUDA_MINOR)"
+}
+# Print the final package list.
+Write-Output "$($CUDA_PACKAGES)"
+## -----------------
+## Prepare download
+## -----------------
+# Select the download link if known, otherwise have a guess.
+$CUDA_REPO_PKG_REMOTE=""
+if($CUDA_KNOWN_URLS.containsKey($CUDA_VERSION_FULL)){
+    $CUDA_REPO_PKG_REMOTE=$CUDA_KNOWN_URLS[$CUDA_VERSION_FULL]
+} else{
+    # Guess what the url is given the most recent pattern (at the time of writing, 10.1)
+    # $(...) is required here: "${$CUDA_VERSION_FULL}" would look up a variable whose *name* is
+    # "$CUDA_VERSION_FULL" and print an empty version.
+    Write-Output "note: URL for CUDA $($CUDA_VERSION_FULL) not known, estimating."
+    # HTTPS, because the downloaded installer is executed afterwards.
+    $CUDA_REPO_PKG_REMOTE="https://developer.download.nvidia.com/compute/cuda/$($CUDA_MAJOR).$($CUDA_MINOR)/Prod/network_installers/cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
+}
+# File name the installer is saved under in the current directory.
+$CUDA_REPO_PKG_LOCAL="cuda_$($CUDA_VERSION_FULL)_win10_network.exe"
+## ------------
+## Install CUDA
+## ------------
+# Fetch the CUDA network installer into the current directory.
+Write-Output "Downloading CUDA Network Installer for $($CUDA_VERSION_FULL) from: $($CUDA_REPO_PKG_REMOTE)"
+Invoke-WebRequest $CUDA_REPO_PKG_REMOTE -OutFile $CUDA_REPO_PKG_LOCAL | Out-Null
+# The presence of the output file is the success criterion; without it there is nothing to install.
+if(-not (Test-Path -Path $CUDA_REPO_PKG_LOCAL)){
+    Write-Output "Error: Failed to download $($CUDA_REPO_PKG_LOCAL) from $($CUDA_REPO_PKG_REMOTE)"
+    exit 1
+}
+Write-Output "Downloading Complete"
+# Invoke silent install of CUDA (via network installer)
+Write-Output "Installing CUDA $($CUDA_VERSION_FULL). Subpackages $($CUDA_PACKAGES)"
+# -PassThru hands back the process object so the installer's own exit code can be inspected.
+# ($? only tells whether Start-Process itself succeeded, and Start-Process does not set $LASTEXITCODE.)
+$cuda_installer = Start-Process -Wait -PassThru -FilePath .\"$($CUDA_REPO_PKG_LOCAL)" -ArgumentList "-s $($CUDA_PACKAGES)"
+# Check the return status of the CUDA installer.
+if ($null -eq $cuda_installer) {
+    # Start-Process produced no process object, i.e. the installer was never launched.
+    Write-Output "Error: CUDA installer could not be started."
+    exit 1
+}
+if ($cuda_installer.ExitCode -ne 0) {
+    Write-Output "Error: CUDA installer reported error. $($cuda_installer.ExitCode)"
+    exit 1
+}
+# Installation directory for the selected version, to be forwarded in the action.
+$CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$($CUDA_MAJOR).$($CUDA_MINOR)"
+# NAME of the version-specific variable (CUDA_PATH_V<major>_<minor>), not a path.
+$CUDA_PATH_VX_Y = "CUDA_PATH_V$($CUDA_MAJOR)_$($CUDA_MINOR)"
+# Session-scoped environment variables. Note that $env:CUDA_PATH_VX_Y stores the variable name built above;
+# it is expanded below to form the key of the "<name>=<path>" line.
+$env:CUDA_PATH = $CUDA_PATH
+$env:CUDA_PATH_VX_Y = $CUDA_PATH_VX_Y
+Write-Output "CUDA_PATH $($CUDA_PATH)"
+Write-Output "CUDA_PATH_VX_Y $($CUDA_PATH_VX_Y)"
+# PATH needs updating elsewhere, anything in here won't persist.
+# When running on GitHub Actions, append the values to the files named by GITHUB_ENV / GITHUB_PATH
+# so that later steps receive CUDA_PATH, the versioned variable and the CUDA bin directory.
+if (Test-Path "env:GITHUB_ACTIONS") {
+    Write-Output "Adding CUDA to CUDA_PATH, CUDA_PATH_X_Y and PATH"
+    "CUDA_PATH=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+    "$env:CUDA_PATH_VX_Y=$env:CUDA_PATH" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+    "$env:CUDA_PATH/bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+}
--- a/tools/msvc_setup.ps1
+++ b/tools/msvc_setup.ps1
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Locate Visual Studio with vswhere and load its developer shell (x64 toolchain) into this session.
+# -latest restricts vswhere to a single installation; without it one path per installed
+# Visual Studio is returned and $installPath becomes an array that Enter-VsDevShell cannot use.
+$installPath = &"C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationpath
+# The DevShell module ships inside the installation and provides Enter-VsDevShell.
+Import-Module (Join-Path $installPath "Common7\Tools\Microsoft.VisualStudio.DevShell.dll")
+# -SkipAutomaticLocation is passed so the call does not change the working directory.
+Enter-VsDevShell -VsInstallPath $installPath -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo'
\ No newline at end of file
--- a/version.txt
+++ b/version.txt
+2.0.0
\ No newline at end of file