working on tensor core test

01ed382c · yan.yan · 3517290c · 01ed382c · 01ed382c · 01ed382c
Commit 01ed382c authored Oct 18, 2021 by yan.yan
20 changed files
--- a/spconv/core.py
+++ b/spconv/core.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Optional

 import numpy as np

--- a/spconv/pytorch/cppcore.py
+++ b/spconv/pytorch/cppcore.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cumm import tensorview as tv 
+import torch 
+from typing import Optional, List
+# Lookup table from torch dtypes to the matching ``tv`` (cumm tensorview)
+# dtype constants. torch_tensor_to_tv consults it when no explicit dtype is
+# given; a torch dtype that is missing here raises KeyError at that lookup.
+_TORCH_DTYPE_TO_TV = {
+    torch.float32: tv.float32,
+    torch.float64: tv.float64,
+    torch.float16: tv.float16,
+    torch.int32: tv.int32,
+    torch.int64: tv.int64,
+    torch.int8: tv.int8,
+    torch.int16: tv.int16,
+    torch.uint8: tv.uint8,
+}
+
+def torch_tensor_to_tv(ten: torch.Tensor, dtype: Optional[int] = None, shape: Optional[List[int]] = None):
+    """Wrap the memory of a contiguous torch tensor as a ``tv`` tensor.
+
+    The raw address from ``ten.data_ptr()`` is handed to ``tv.from_blob``.
+    NOTE(review): the result presumably aliases ``ten``'s storage without
+    keeping ``ten`` alive, so callers should hold on to ``ten`` -- confirm
+    against the ``tv.from_blob`` contract.
+
+    Args:
+        ten: source tensor; must be contiguous and live on a "cpu" or "cuda"
+            device.
+        dtype: ``tv`` dtype constant to use. When None it is looked up in
+            ``_TORCH_DTYPE_TO_TV`` from ``ten.dtype``.
+        shape: shape to give the wrapper. When None, ``list(ten.shape)``.
+
+    Returns:
+        The object produced by ``tv.from_blob(ptr, shape, dtype, tv_device)``.
+
+    Raises:
+        AssertionError: if ``ten`` is not contiguous.
+        NotImplementedError: if the device type is neither "cpu" nor "cuda".
+        KeyError: if ``dtype`` is None and ``ten.dtype`` is not in the table.
+    """
+    assert ten.is_contiguous(), "must be contiguous tensor"
+    ptr = ten.data_ptr()
+    device = ten.device
+    if device.type == "cpu":
+        tv_device = -1
+    elif device.type == "cuda":
+        # every CUDA tensor gets flag 0 here, whatever its device index
+        tv_device = 0
+    else:
+        # name the offending device so the failure is diagnosable
+        raise NotImplementedError(
+            f"unsupported device type: {device.type!r}")
+    if shape is None:
+        shape = list(ten.shape)
+    if dtype is None:
+        dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
+    return tv.from_blob(ptr, shape, dtype, tv_device)
+
+def get_current_stream():
+    """Return the ``cuda_stream`` attribute of torch's current CUDA stream."""
+    current = torch.cuda.current_stream()
+    return current.cuda_stream
+
+if __name__ == "__main__":
+    # Manual smoke check: wrap a random 2x2 CPU tensor and print the numpy
+    # view exposed by the resulting tv tensor.
+    sample = torch.rand(2, 2)
+    print(torch_tensor_to_tv(sample).numpy_view())
\ No newline at end of file
--- a/spconv/functional.py
+++ b/spconv/functional.py
-# Copyright 2019-2020 Yan Yan
-#
+# Copyright 2021 Yan Yan
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,7 +16,7 @@ import torch
 from torch import nn
 from torch.autograd import Function

-import spconv.ops as ops
+import spconv.pytorch.ops as ops


 class SparseConvFunction(Function):

--- a/spconv/identity.py
+++ b/spconv/identity.py
--- a/spconv/modules.py
+++ b/spconv/modules.py
-# Copyright 2019-2020 Yan Yan
-#
+# Copyright 2021 Yan Yan
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

+
 import sys
 import time
 from collections import OrderedDict
@@ -19,7 +20,7 @@ from collections import OrderedDict
 import torch
 from torch import nn

-import spconv
+from spconv import pytorch as spconv


 def is_spconv_module(module):
@@ -28,7 +29,7 @@ def is_spconv_module(module):


 def is_sparse_conv(module):
-    from spconv.conv import SparseConvolution
+    from spconv.pytorch.conv import SparseConvolution
    return isinstance(module, SparseConvolution)


@@ -145,7 +146,7 @@ class SparseSequential(SparseModule):
    def fused(self):
        """don't use this. no effect.
        """
-        from spconv.conv import SparseConvolution
+        from spconv.pytorch.conv import SparseConvolution
        mods = [v for k, v in self._modules.items()]
        fused_mods = []
        idx = 0

--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
+# Copyright 2021 Yan Yan
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from enum import Enum
+from cumm import tensorview as tv
+from cumm.gemm.algospec.core import ShuffleStrideType
+
+import torch
+import numpy as np
+import spconv
+from spconv.algo import AlgoHint, ConvAlgo
+from typing import List, Union
+from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
+from spconv.core_cc.csrc.sparse.all import SpconvOps
+from spconv.algo import GEMM# , GATHER, SCATTER
+import time
+from spconv.constants import FILTER_HWIO
+
+
+def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
+    """Compute the per-axis output extent of a strided, dilated convolution.
+
+    All arguments are sequences indexed by axis; only the first
+    ``len(input_size)`` entries of each are read. An axis whose kernel size
+    is ``-1`` yields an output extent of 1.
+
+    Returns:
+        A list with one integer per axis of ``input_size``.
+    """
+
+    def _axis_extent(axis):
+        # The formula is evaluated even for a -1 kernel, so any error it
+        # would raise (e.g. a zero stride) still surfaces.
+        covered = dilation[axis] * (kernel_size[axis] - 1)
+        extent = (input_size[axis] + 2 * padding[axis] - covered -
+                  1) // stride[axis] + 1
+        return 1 if kernel_size[axis] == -1 else extent
+
+    return [_axis_extent(axis) for axis in range(len(input_size))]
+
+
+def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
+                           output_padding):
+    """Compute the per-axis output extent of a transposed convolution.
+
+    All arguments are sequences indexed by axis. ``dilation`` is accepted
+    but not used by the formula.
+
+    Returns:
+        A list with one integer per axis of ``input_size``.
+
+    Raises:
+        ValueError: if any axis has a kernel size of ``-1``.
+    """
+
+    def _axis_extent(axis):
+        if kernel_size[axis] == -1:
+            raise ValueError("deconv don't support kernel_size < 0")
+        upsampled = (input_size[axis] - 1) * stride[axis]
+        return (upsampled - 2 * padding[axis] + kernel_size[axis] +
+                output_padding[axis])
+
+    return [_axis_extent(axis) for axis in range(len(input_size))]
+
+
+def get_indice_pairs(indices: torch.Tensor,
+                     batch_size: int,
+                     spatial_shape: List[int],
+                     algo: ConvAlgo,
+                     ksize: Union[int, List[int]],
+                     stride: Union[int, List[int]],
+                     padding: Union[int, List[int]],
+                     dilation: Union[int, List[int]],
+                     out_padding: Union[int, List[int]],
+                     subm: bool = False,
+                     transpose: bool = False):
+    """Build the input/output index pairs used by the sparse conv kernels.
+
+    Scalar ``ksize`` / ``stride`` / ``padding`` / ``dilation`` /
+    ``out_padding`` are broadcast to lists of length
+    ``ndim = indices.shape[1] - 1``.
+    NOTE(review): this assumes ``indices`` is (N, ndim + 1) with one leading
+    batch column -- confirm against callers.
+
+    Only ``algo == ConvAlgo.Native`` with ``transpose=False`` is accepted; any
+    other combination fails the assert below.
+
+    Returns:
+        A tuple ``(out_inds, pair, indice_num_per_loc)``:
+        ``out_inds`` is ``indices`` itself when ``subm`` is set, otherwise a
+        new ``(num_act_out, indices.shape[1])`` tensor;
+        ``pair`` has shape ``(2, kv, indices.shape[0])`` and is initialised
+        to -1, with ``kv = prod(ksize)``;
+        ``indice_num_per_loc`` has shape ``(kv,)`` and is initialised to 0.
+        The SpconvOps calls receive wrappers over these tensors and
+        presumably fill them in place -- confirm against the SpconvOps
+        implementation.
+    """
+
+    ndim = indices.shape[1] - 1
+    # broadcast scalar hyper-parameters to one value per spatial dimension
+    if not isinstance(ksize, (list, tuple)):
+        ksize = [ksize] * ndim
+    if not isinstance(stride, (list, tuple)):
+        stride = [stride] * ndim
+    if not isinstance(padding, (list, tuple)):
+        padding = [padding] * ndim
+    if not isinstance(dilation, (list, tuple)):
+        dilation = [dilation] * ndim
+    if not isinstance(out_padding, (list, tuple)):
+        out_padding = [out_padding] * ndim
+    # kv: total number of kernel offsets
+    kv: int = int(np.prod(ksize))
+    if not subm:
+        if transpose:
+            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
+                                               padding, dilation, out_padding)
+        else:
+            out_shape = get_conv_output_size(spatial_shape, ksize, stride,
+                                             padding, dilation)
+    else:
+        # in subm mode the output spatial shape is the input spatial shape
+        out_shape = spatial_shape
+    # other algorithms and transposed convolution are not implemented yet
+    assert algo == ConvAlgo.Native and not transpose, "TODO"
+    stream = get_current_stream()
+    pair = torch.full((2, kv, indices.shape[0]),
+                      -1,
+                      dtype=indices.dtype,
+                      device=indices.device)
+    indice_num_per_loc = torch.zeros((kv, ),
+                                     dtype=indices.dtype,
+                                     device=indices.device)
+    # tv wrappers over the torch tensors, as required by the SpconvOps calls
+    inds_tv = torch_tensor_to_tv(indices)
+    pair_tv = torch_tensor_to_tv(pair)
+    indice_num_per_loc_tv = torch_tensor_to_tv(indice_num_per_loc)
+    # torch.cuda.synchronize()
+    # t = time.time()
+
+    if subm:
+        # the input indices are reused as the output indices
+        out_inds = indices
+        # scratch buffer with two int64 slots per output index, handed to the
+        # kernel as tv.custom64
+        hashdata = torch.empty((out_inds.shape[0] * 2, ),
+                               dtype=torch.int64,
+                               device=indices.device)
+        out_inds_tv = torch_tensor_to_tv(out_inds)
+        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
+        SpconvOps.generate_subm_conv_inds(inds_tv,
+                                          hashdata_tv,
+                                          pair_tv,
+                                          out_inds_tv,
+                                          indice_num_per_loc_tv,
+                                          batch_size=batch_size,
+                                          input_dims=spatial_shape,
+                                          ksize=ksize,
+                                          dilation=dilation,
+                                          stream_int=stream)
+        # torch.cuda.synchronize()
+
+        # print("SUBM INDICE GEN", time.time() - t)
+
+    else:
+        # stage-1 scratch: one slot per (kernel offset, input index) plus one
+        indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
+                                        dtype=indices.dtype,
+                                        device=indices.device)
+        indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
+        # stage 1 returns the number of output rows; the output tensors for
+        # stage 2 are sized from it
+        num_act_out = SpconvOps.generate_conv_inds_stage1(
+            inds_tv,
+            pair_tv,
+            indice_pairs_uniq_tv,
+            indice_num_per_loc_tv,
+            batch_size=batch_size,
+            output_dims=out_shape,
+            input_dims=spatial_shape,
+            ksize=ksize,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            stream_int=stream)
+        out_inds = torch.empty((num_act_out, indices.shape[1]),
+                               dtype=indices.dtype,
+                               device=indices.device)
+        hashdata = torch.empty((out_inds.shape[0] * 2, ),
+                               dtype=torch.int64,
+                               device=indices.device)
+        out_inds_tv = torch_tensor_to_tv(out_inds)
+        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
+        SpconvOps.generate_conv_inds_stage2(inds_tv,
+                                            hashdata_tv,
+                                            pair_tv,
+                                            indice_pairs_uniq_tv,
+                                            out_inds_tv,
+                                            num_out_act=num_act_out,
+                                            batch_size=batch_size,
+                                            output_dims=out_shape,
+                                            input_dims=spatial_shape,
+                                            ksize=ksize,
+                                            stride=stride,
+                                            padding=padding,
+                                            dilation=dilation,
+                                            stream_int=stream)
+        # torch.cuda.synchronize()
+
+        # print("INDICE GEN", time.time() - t)
+
+    return out_inds, pair, indice_num_per_loc
+
+
+def indice_conv(features: torch.Tensor,
+                filters: torch.Tensor,
+                indice_pairs: torch.Tensor,
+                indice_pair_num: torch.Tensor,
+                num_activate_out: int,
+                inverse: bool = False,
+                subm: bool = False,
+                algo: ConvAlgo = ConvAlgo.Native):
+    """Sparse convolution forward pass, one gather-GEMM-scatter per offset.
+
+    ``filters`` is flattened to ``(kv, *filters.shape[-2:])``. For each
+    kernel offset ``i`` with a positive pair count, ``GEMM.run_profile`` is
+    called with the first ``nhot`` entries of ``indice_pairs[int(inverse)][i]``
+    as ``a_inds`` and of ``indice_pairs[int(not inverse)][i]`` as ``c_inds``.
+    The first such call uses ``beta=0.0`` and later calls ``beta=1.0``.
+
+    When ``subm`` is set, the centre offset (``kv // 2``) is computed with a
+    dense ``torch.mm`` over all rows and skipped in the loop. Offsets past
+    the centre take their pair count from the mirrored entry ``kv - i - 1``.
+
+    ``algo`` is accepted but not read in this function.
+
+    Returns:
+        ``out_features``: the ``torch.mm`` result when ``subm`` is set,
+        otherwise a zero-initialised ``(num_activate_out, out_channel)``
+        tensor; the GEMM calls receive a wrapper over it as their ``c``.
+
+    Raises:
+        NotImplementedError: if ``features`` is int8 or qint8.
+    """
+    # filters: RSKC
+    # torch.cuda.synchronize()
+    # t = time.time()
+    if features.dtype == torch.int8 or features.dtype == torch.qint8:
+        raise NotImplementedError("work in progress")
+    # the output-channel axis depends on the filter layout flag
+    if FILTER_HWIO:
+        out_channel = filters.shape[-1]
+    else:
+        out_channel = filters.shape[-2]
+    # collapse all leading kernel dimensions into a single offset axis
+    filters = filters.reshape(-1, *filters.shape[-2:])
+    kv = filters.shape[0]
+    kv_center = kv // 2
+    if subm:
+        # centre offset: dense matmul over every row
+        if FILTER_HWIO:
+            out_features = torch.mm(features, filters[kv_center])
+        else:
+            out_features = torch.mm(features, filters[kv_center].T)
+    else:
+        out_features = torch.zeros((num_activate_out, out_channel),
+                                   dtype=features.dtype,
+                                   device=features.device)
+    # a 1-offset subm kernel is fully handled by the matmul above
+    if kv == 1 and subm:
+        return out_features
+
+    stream = get_current_stream()
+    # copies the per-offset pair counts to the host as a Python list
+    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    arch = torch.cuda.get_device_capability()
+    # in subm mode the output already holds the centre result, so every GEMM
+    # below must accumulate (beta=1.0)
+    inited: bool = subm
+    a = torch_tensor_to_tv(features)
+    c = torch_tensor_to_tv(out_features)
+    # offset whose pair count is used for the algorithm lookup/profiling; the
+    # centre is skipped in subm mode, so its neighbour is used instead
+    profile_idx = kv_center
+    if subm:
+        profile_idx = kv_center - 1
+    # profile_idx = first_n
+    nhot_profile = indice_pair_num_cpu[profile_idx]
+
+    # print(nhot_profile, indice_pair_num_cpu)
+    # look up a previously profiled algorithm for this problem description
+    profile_res = GEMM.get_profiled_algo(
+        a.shape,
+        filters.shape[-2:],
+        c.shape,
+        False,
+        False if FILTER_HWIO else True,
+        False,
+        arch=arch,
+        shuffle_type=ShuffleStrideType.ShuffleAC,
+        a_inds_shape=[nhot_profile],
+        c_inds_shape=[nhot_profile],
+        hint=AlgoHint.Fowrard.value)
+
+    # NOTE(review): gather_data_tv, scatter_data_tv and maxnhot are assigned
+    # here but not read anywhere below in this function.
+    gather_data_tv = tv.Tensor()
+    scatter_data_tv = tv.Tensor()
+
+    maxnhot = max(indice_pair_num_cpu)
+    if profile_res is None:
+        # run profile on center
+        inp_indices_th = indice_pairs[int(inverse)][profile_idx, :nhot_profile]
+        out_indices_th = indice_pairs[int(not inverse)][
+            profile_idx, :nhot_profile]
+        inp_indices = torch_tensor_to_tv(inp_indices_th)
+        out_indices = torch_tensor_to_tv(out_indices_th)
+        filter_tv = torch_tensor_to_tv(filters)[profile_idx]
+
+        profile_res, min_time = GEMM.profile_and_cache(
+            a,
+            filter_tv,
+            c,
+            False,
+            False if FILTER_HWIO else True,
+            False,
+            arch=arch,
+            shuffle_type=ShuffleStrideType.ShuffleAC,
+            a_inds=inp_indices,
+            c_inds=out_indices,
+            alpha=1.0,
+            beta=0.0,
+            hint=AlgoHint.Fowrard.value,
+            stream=stream)
+
+    # wrap once and index the tv tensors inside the loop
+    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
+    pair_in = indice_pairs_tv[int(inverse)]
+    pair_out = indice_pairs_tv[int(not inverse)]
+    filters_tv = torch_tensor_to_tv(filters)
+    for i, nhot in enumerate(indice_pair_num_cpu):
+        # the centre offset was already handled by torch.mm in subm mode
+        if subm and i == kv_center:
+            continue
+        # in subm mode, offsets past the centre use the mirrored pair count
+        if subm and i > kv_center:
+            nhot = indice_pair_num_cpu[kv - i - 1]
+        if nhot <= 0:
+            continue
+        inp_indices = pair_in[i].slice_first_axis(0, nhot)
+        out_indices = pair_out[i].slice_first_axis(0, nhot)
+        # inp_indices = torch_tensor_to_tv(inp_indices_th)
+        # out_indices = torch_tensor_to_tv(out_indices_th)
+        b = filters_tv[i]
+        # inp @ filter.T, NC @ KC
+        # overwrite on the first GEMM, accumulate afterwards
+        beta = 1.0 if inited else 0.0
+        algo_desp = GEMM.run_profile(
+            profile_res,
+            a,
+            b,
+            c,
+            False,
+            False if FILTER_HWIO else True,
+            False,
+            arch=arch,
+            stream=stream,
+            shuffle_type=ShuffleStrideType.ShuffleAC,
+            a_inds=inp_indices,
+            c_inds=out_indices,
+            hint=AlgoHint.Fowrard.value,
+            alpha=1.0,
+            beta=beta)
+
+        # gather_times += gather_time
+        inited = True
+    # torch.cuda.synchronize()
+    # print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
+    # print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times)
+
+    # print(indice_pair_num_cpu)
+    # print(time.time() - t)
+    return out_features
+
+
+def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
+                      num_activate_out, inverse, subm):
+    """Placeholder for a fused conv + bias op; always raises.
+
+    Raises:
+        NotImplementedError: unconditionally.
+    """
+    raise NotImplementedError()
+
+
+def indice_conv_backward(features: torch.Tensor,
+                         filters: torch.Tensor,
+                         out_bp: torch.Tensor,
+                         indice_pairs: torch.Tensor,
+                         indice_pair_num: torch.Tensor,
+                         inverse: bool = False,
+                         subm: bool = False,
+                         algo: ConvAlgo = ConvAlgo.Native):
+    """Sparse convolution backward pass: input gradient and filter gradient.
+
+    For every kernel offset with a positive pair count, two GEMMs run:
+
+    * dgrad: ``out_bp`` rows selected by the output-side pairs are multiplied
+      by ``filters[i]`` and written into ``din`` rows selected by the
+      input-side pairs (``ShuffleAC``);
+    * wgrad: gathered ``features`` and ``out_bp`` rows are multiplied into
+      ``dfilters[i]`` (``ShuffleAB``), operand order depending on
+      ``FILTER_HWIO``.
+
+    When ``subm`` is set, the centre offset (``kv // 2``) is handled by dense
+    ``torch.mm`` calls and skipped in the loop. Offsets past the centre take
+    their pair count from the mirrored entry ``kv - i - 1``.
+
+    ``algo`` is accepted but not read in this function.
+
+    Args:
+        features: forward-pass input features; must be contiguous.
+        filters: filter tensor; flattened here to ``(kv, *shape[-2:])`` and
+            required to be contiguous after that reshape.
+        out_bp: gradient with respect to the forward output; must be
+            contiguous.
+        indice_pairs: pair tensor; index ``int(inverse)`` is used as the
+            input side and ``int(not inverse)`` as the output side.
+        indice_pair_num: per-offset pair counts (copied to the host).
+        inverse: swaps the roles of the two halves of ``indice_pairs``.
+        subm: use the centre-offset shortcut described above.
+
+    Returns:
+        ``(din, dfilters)`` where ``dfilters`` is reshaped back to the
+        original ``filters.shape``.
+
+    Raises:
+        AssertionError: if any of the three tensors is not contiguous.
+    """
+    filters_shape = filters.shape
+    # collapse all leading kernel dimensions into a single offset axis
+    filters = filters.reshape(-1, *filters.shape[-2:])
+    kv = filters.shape[0]
+    kv_center = kv // 2
+    assert out_bp.is_contiguous()
+    assert filters.is_contiguous()
+    assert features.is_contiguous()
+
+    dfilters = torch.zeros_like(filters)
+    if subm:
+        # centre offset: dense matmuls over every row
+        if FILTER_HWIO:
+            torch.mm(features.T, out_bp, out=dfilters[kv_center])
+            # TODO can we use torch mm for f16 backward weight?
+            din = torch.mm(out_bp, filters[kv_center].T)
+        else:
+            torch.mm(out_bp.T, features, out=dfilters[kv_center])
+            # TODO can we use torch mm for f16 backward weight?
+            din = torch.mm(out_bp, filters[kv_center])
+    else:
+        din = torch.zeros_like(features)
+    # a 1-offset subm kernel is fully handled by the matmuls above
+    if kv == 1 and subm:
+        return (din, dfilters.reshape(filters_shape))
+
+    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
+    # torch slice (a_th[x]) is very slow, so we need to use tv.Tensor earlier.
+    pair_in = indice_pairs_tv[int(inverse)]
+    pair_out = indice_pairs_tv[int(not inverse)]
+
+    stream = get_current_stream()
+    # copies the per-offset pair counts to the host as a Python list
+    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    arch = torch.cuda.get_device_capability()
+    filters_tv = torch_tensor_to_tv(filters)
+
+    dfilters_tv = torch_tensor_to_tv(dfilters)
+    out_bp_tv = torch_tensor_to_tv(out_bp)
+    features_tv = torch_tensor_to_tv(features)
+
+    din_tv = torch_tensor_to_tv(din)
+
+    # offset whose pair count is used for the algorithm lookup/profiling; the
+    # centre is skipped in subm mode, so its neighbour is used instead
+    profile_idx = kv_center
+    if subm:
+        profile_idx = kv_center - 1
+    nhot_profile = indice_pair_num_cpu[profile_idx]
+
+    profile_res_dgrad = GEMM.get_profiled_algo(
+        out_bp_tv.shape,
+        filters.shape[-2:],
+        din_tv.shape,
+        False,
+        True if FILTER_HWIO else False,
+        False,
+        arch=arch,
+        shuffle_type=ShuffleStrideType.ShuffleAC,
+        a_inds_shape=[nhot_profile],
+        c_inds_shape=[nhot_profile],
+        hint=AlgoHint.BackwardInput.value)
+    if profile_res_dgrad is None:
+        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
+        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
+        filter_tv = filters_tv[profile_idx]
+        # A is out_bp and C is din, so A must be indexed by the output-side
+        # pairs and C by the input-side pairs, exactly as in the main loop
+        # below. With the two swapped, input-side indices were applied to
+        # out_bp, whose row count can differ from that of features.
+        profile_res_dgrad, min_time = GEMM.profile_and_cache(
+            out_bp_tv,
+            filter_tv,
+            din_tv,
+            False,
+            True if FILTER_HWIO else False,
+            False,
+            arch=arch,
+            shuffle_type=ShuffleStrideType.ShuffleAC,
+            a_inds=out_indices,
+            c_inds=inp_indices,
+            alpha=1.0,
+            beta=0.0,
+            hint=AlgoHint.BackwardInput.value,
+            stream=stream)
+    # wgrad operand order follows the filter layout flag
+    if not FILTER_HWIO:
+        a_wgrad = out_bp_tv
+        b_wgrad = features_tv
+    else:
+        a_wgrad = features_tv
+        b_wgrad = out_bp_tv
+    profile_res_wgrad = GEMM.get_profiled_algo(
+        a_wgrad.shape,
+        b_wgrad.shape,
+        filters.shape[-2:],
+        True,
+        False,
+        False,
+        arch=arch,
+        shuffle_type=ShuffleStrideType.ShuffleAB,
+        a_inds_shape=[nhot_profile],
+        b_inds_shape=[nhot_profile],
+        hint=AlgoHint.BackwardWeight.value)
+
+    if profile_res_wgrad is None:
+        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
+        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
+        dfilter_tv = dfilters_tv[profile_idx]
+        if not FILTER_HWIO:
+            a_inds_wgrad = out_indices
+            b_inds_wgrad = inp_indices
+        else:
+            a_inds_wgrad = inp_indices
+            b_inds_wgrad = out_indices
+        profile_res_wgrad, min_time = GEMM.profile_and_cache(
+            a_wgrad,
+            b_wgrad,
+            dfilter_tv,
+            True,
+            False,
+            False,
+            arch=arch,
+            shuffle_type=ShuffleStrideType.ShuffleAB,
+            a_inds=a_inds_wgrad,
+            b_inds=b_inds_wgrad,
+            alpha=1.0,
+            beta=0.0,
+            hint=AlgoHint.BackwardWeight.value,
+            stream=stream)
+    maxnhot = max(indice_pair_num_cpu)
+    # get workspace size for wgrad, sized for the largest pair count
+    if not FILTER_HWIO:
+        a_shape = [maxnhot, out_bp_tv.dim(1)]
+        b_shape = [maxnhot, features_tv.dim(1)]
+    else:
+        b_shape = [maxnhot, out_bp_tv.dim(1)]
+        a_shape = [maxnhot, features_tv.dim(1)]
+    m, n, k = GEMM.extract_mnk(
+        a_shape, b_shape, profile_res_wgrad.algo_desp.trans_a,
+        profile_res_wgrad.algo_desp.trans_b,
+        profile_res_wgrad.algo_desp.trans_c,
+        arch=arch,
+        shuffle_type=ShuffleStrideType.ShuffleAB,
+        a_inds_shape=[maxnhot],
+        b_inds_shape=[maxnhot],
+        hint=AlgoHint.BackwardWeight.value)
+    workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(m, n, k, profile_res_wgrad.splitk)
+    # `workspace` stays bound for the rest of the function so the torch
+    # allocation behind `workspace_tv` outlives the GEMM calls
+    workspace = torch.Tensor()
+
+    workspace_tv = tv.Tensor()
+    if workspace_size > 0:
+        workspace = torch.empty((workspace_size,), dtype=torch.int8, device=features.device)
+        workspace_tv = torch_tensor_to_tv(workspace)
+    # in subm mode din/dfilters already hold the centre result, so every
+    # GEMM below must accumulate (beta=1.0)
+    inited = subm
+    for i, nhot in enumerate(indice_pair_num_cpu):
+        # the centre offset was already handled by torch.mm in subm mode
+        if subm and i == kv_center:
+            continue
+        # in subm mode, offsets past the centre use the mirrored pair count
+        if subm and i > kv_center:
+            nhot = indice_pair_num_cpu[kv - i - 1]
+        if nhot <= 0:
+            continue
+        # overwrite on the first GEMM pair, accumulate afterwards
+        beta = 1.0 if inited else 0.0
+        inp_indices = pair_in[i].slice_first_axis(0, nhot)
+        out_indices = pair_out[i].slice_first_axis(0, nhot)
+        # dgrad: gather out_bp rows, scatter into din rows
+        GEMM.run_profile(profile_res_dgrad,
+                         out_bp_tv,
+                         filters_tv[i],
+                         din_tv,
+                         False,
+                         True if FILTER_HWIO else False,
+                         False,
+                         arch=arch,
+                         stream=stream,
+                         shuffle_type=ShuffleStrideType.ShuffleAC,
+                         a_inds=out_indices,
+                         c_inds=inp_indices,
+                         hint=AlgoHint.BackwardInput.value,
+                         alpha=1.0,
+                         beta=beta)
+
+        # wgrad: out.T @ inp, NK @ NC (operands swapped for HWIO filters)
+        if not FILTER_HWIO:
+            a = out_bp_tv
+            b = features_tv
+            a_inds = out_indices
+            b_inds = inp_indices
+        else:
+            a = features_tv
+            b = out_bp_tv
+            a_inds = inp_indices
+            b_inds = out_indices
+        GEMM.run_profile(profile_res_wgrad,
+                         a,
+                         b,
+                         dfilters_tv[i],
+                         True,
+                         False,
+                         False,
+                         arch=arch,
+                         stream=stream,
+                         shuffle_type=ShuffleStrideType.ShuffleAB,
+                         a_inds=a_inds,
+                         b_inds=b_inds,
+                         hint=AlgoHint.BackwardWeight.value,
+                         alpha=1.0,
+                         beta=beta,
+                         workspace=workspace_tv)
+        inited = True
+
+    return (din, dfilters.reshape(filters_shape))
+
+
+def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
+    """Sparse max-pool forward pass.
+
+    Allocates a zero-initialised ``(num_activate_out, features.shape[-1])``
+    tensor and, for every kernel offset with a positive pair count, calls
+    ``SpconvOps.maxpool_forward`` with the first ``count`` entries of
+    ``indice_pairs[1][offset]`` (output side) and ``indice_pairs[0][offset]``
+    (input side).
+
+    Returns:
+        The allocated output tensor.
+    """
+    pooled = torch.zeros((num_activate_out, features.shape[-1]),
+                         dtype=features.dtype,
+                         device=features.device)
+    cuda_stream = get_current_stream()
+    # per-offset pair counts, copied to the host as a Python list
+    pair_counts = indice_pair_num.cpu().tolist()
+    pooled_tv = torch_tensor_to_tv(pooled)
+    feats_tv = torch_tensor_to_tv(features)
+    for offset, count in enumerate(pair_counts):
+        if count <= 0:
+            # nothing paired at this kernel offset
+            continue
+        src_rows = torch_tensor_to_tv(indice_pairs[0][offset, :count])
+        dst_rows = torch_tensor_to_tv(indice_pairs[1][offset, :count])
+        SpconvOps.maxpool_forward(pooled_tv, feats_tv, dst_rows, src_rows,
+                                  cuda_stream)
+    return pooled
+
+
+def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
+                            indice_pair_num):
+    """Sparse max-pool backward pass.
+
+    Allocates ``din`` as zeros shaped like ``features`` and, for every kernel
+    offset with a positive pair count, calls ``SpconvOps.maxpool_backward``
+    with the forward output, the forward input, the output gradient, ``din``
+    and the first ``nhot`` entries of ``indice_pairs[1][i]`` (output side)
+    and ``indice_pairs[0][i]`` (input side).
+
+    Returns:
+        ``din``, the gradient buffer passed to the kernel calls.
+    """
+    din = torch.zeros_like(features)
+    stream = get_current_stream()
+    # per-offset pair counts, copied to the host as a Python list
+    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
+    out_features_tv = torch_tensor_to_tv(out_features)
+    features_tv = torch_tensor_to_tv(features)
+    out_bp_tv = torch_tensor_to_tv(out_bp)
+    din_tv = torch_tensor_to_tv(din)
+    for i, nhot in enumerate(indice_pair_num_cpu):
+        if nhot <= 0:
+            # nothing paired at this kernel offset
+            continue
+        inp_indices = torch_tensor_to_tv(indice_pairs[0][i, :nhot])
+        out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
+        SpconvOps.maxpool_backward(out_features_tv, features_tv, out_bp_tv,
+                                   din_tv, out_indices, inp_indices, stream)
+
+    return din
+
+
+def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
+    """Placeholder for non-maximum suppression; always raises.
+
+    Raises:
+        NotImplementedError: unconditionally.
+    """
+    raise NotImplementedError()
+
+
+def pillar_scatter(features, coors, shape):
+    """Placeholder for the pillar scatter op; always raises.
+
+    Raises:
+        NotImplementedError: unconditionally.
+    """
+    raise NotImplementedError()
--- a/spconv/pool.py
+++ b/spconv/pool.py
-# Copyright 2019-2020 Yan Yan
-#
+# Copyright 2021 Yan Yan
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,11 +21,12 @@ from torch import nn
 from torch.nn import init
 from torch.nn.parameter import Parameter

-import spconv
-import spconv.functional as Fsp
-from spconv import ops
-from spconv.core import IndiceData
-from spconv.modules import SparseModule
+from spconv import pytorch as spconv
+from spconv.algo import ConvAlgo
+import spconv.pytorch.functional as Fsp
+from spconv.pytorch import ops
+from spconv.pytorch.core import IndiceData
+from spconv.pytorch.modules import SparseModule


 class SparseMaxPool(SparseModule):
@@ -100,13 +101,13 @@ class SparseMaxPool(SparseModule):
            indices,
            batch_size,
            spatial_shape,
+            ConvAlgo.Native,
            self.kernel_size,
            self.stride,
            self.padding,
            self.dilation,
            0,
-            self.subm,
-            grid=input.grid)
+            False)
        if input.benchmark:
            torch.cuda.synchronize()
            interval = time.time() - t

--- a/spconv/spatial.py
+++ b/spconv/spatial.py
-# Copyright 2019-2020 Yan Yan
-#
+# Copyright 2021 Yan Yan
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,8 +21,8 @@ from torch import nn
 from torch.nn import init
 from torch.nn.parameter import Parameter

-import spconv
-from spconv.modules import SparseModule
+from spconv import pytorch as spconv
+from spconv.pytorch.modules import SparseModule


 class RemoveDuplicate(SparseModule):

--- a/spconv/tables.py
+++ b/spconv/tables.py
+# Copyright 2021 Yan Yan
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import torch
 from torch.autograd import Function

-import spconv
+import spconv.pytorch as spconv
 #from torch.nn import Module
-from spconv.modules import SparseModule
+from spconv.pytorch.modules import SparseModule


 class JoinTable(SparseModule):  # Module):

--- a/spconv/test_utils.py
+++ b/spconv/test_utils.py
-# Copyright 2019-2020 Yan Yan
-#
+# Copyright 2021 Yan Yan
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

--- a/spconv/utils/__init__.py
+++ b/spconv/utils/__init__.py
-# Copyright 2019-2020 Yan Yan
-#
+# Copyright 2021 Yan Yan
+# 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+# 
 #     http://www.apache.org/licenses/LICENSE-2.0
-#
+# 
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,372 +13,13 @@
 # limitations under the License.

 import numpy as np
-import torch
-
-from spconv import spconv_utils
-from spconv.spconv_utils import (non_max_suppression_cpu,
-                                 points_to_voxel_3d_np,
-                                 points_to_voxel_3d_np_mean,
-                                 points_to_voxel_3d_with_filtering,
-                                 rbbox_intersection, rbbox_iou,
-                                 rotate_non_max_suppression_cpu)
-
-try:
-    from spconv.spconv_utils import non_max_suppression
-except ImportError:
-    pass
-
-
-def points_to_voxel(points,
-                    voxel_size,
-                    coors_range,
-                    coor_to_voxelidx,
-                    max_points=35,
-                    max_voxels=20000,
-                    full_mean=False,
-                    block_filtering=True,
-                    block_factor=1,
-                    block_size=8,
-                    height_threshold=0.2,
-                    height_high_threshold=3.0,
-                    pad_output=False):
-    """convert 3d points(N, >=3) to voxels. This version calculate
-    everything in one loop. now it takes only 0.8ms(~6k voxels) 
-    with c++ and 3.2ghz cpu.
-
-    Args:
-        points: [N, ndim] float tensor. points[:, :3] contain xyz points and
-            points[:, 3:] contain other information such as reflectivity.
-        voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size
-        coors_range: [6] list/tuple or array, float. indicate voxel range.
-            format: xyzxyz, minmax
-        coor_to_voxelidx: int array. used as a dense map.
-        max_points: int. indicate maximum points contained in a voxel.
-        max_voxels: int. indicate maximum voxels this function create.
-            for voxelnet, 20000 is a good choice. you should shuffle points
-            before call this function because max_voxels may drop some points.
-        full_mean: bool. if true, all empty points in voxel will be filled with mean
-            of exist points.
-        block_filtering: filter voxels by height. used for lidar point cloud.
-            use some visualization tool to see filtered result.
-    Returns:
-        voxels: [M, max_points, ndim] float tensor. only contain points.
-        coordinates: [M, 3] int32 tensor. zyx format.
-        num_points_per_voxel: [M] int32 tensor.
-    """
-    if full_mean:
-        assert block_filtering is False
-    if not isinstance(voxel_size, np.ndarray):
-        voxel_size = np.array(voxel_size, dtype=points.dtype)
-    if not isinstance(coors_range, np.ndarray):
-        coors_range = np.array(coors_range, dtype=points.dtype)
-    voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size
-    voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist())
-    voxelmap_shape = voxelmap_shape[::-1]
-    num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
-    voxels = np.zeros(shape=(max_voxels, max_points, points.shape[-1]),
-                      dtype=points.dtype)
-    voxel_point_mask = np.zeros(shape=(max_voxels, max_points),
-                                dtype=points.dtype)
-    coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
-    res = {
-        "voxels": voxels,
-        "coordinates": coors,
-        "num_points_per_voxel": num_points_per_voxel,
-        "voxel_point_mask": voxel_point_mask,
-    }
-    if full_mean:
-        means = np.zeros(shape=(max_voxels, points.shape[-1]),
-                         dtype=points.dtype)
-        voxel_num = points_to_voxel_3d_np_mean(points, voxels,
-                                               voxel_point_mask, means, coors,
-                                               num_points_per_voxel,
-                                               coor_to_voxelidx,
-                                               voxel_size.tolist(),
-                                               coors_range.tolist(),
-                                               max_points, max_voxels)
-    else:
-        if block_filtering:
-            block_shape = [*voxelmap_shape[1:]]
-            block_shape = [b // block_factor for b in block_shape]
-            mins = np.full(block_shape, 99999999, dtype=points.dtype)
-            maxs = np.full(block_shape, -99999999, dtype=points.dtype)
-            voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
-            voxel_num = points_to_voxel_3d_with_filtering(
-                points, voxels, voxel_point_mask, voxel_mask, mins, maxs,
-                coors, num_points_per_voxel, coor_to_voxelidx,
-                voxel_size.tolist(), coors_range.tolist(), max_points,
-                max_voxels, block_factor, block_size, height_threshold,
-                height_high_threshold)
-            voxel_mask = voxel_mask.astype(np.bool_)
-            coors_ = coors[voxel_mask]
-            if pad_output:
-                res["coordinates"][:voxel_num] = coors_
-                res["voxels"][:voxel_num] = voxels[voxel_mask]
-                res["voxel_point_mask"][:voxel_num] = voxel_point_mask[
-                    voxel_mask]
-
-                res["num_points_per_voxel"][:voxel_num] = num_points_per_voxel[
-                    voxel_mask]
-                res["coordinates"][voxel_num:] = 0
-                res["voxels"][voxel_num:] = 0
-                res["num_points_per_voxel"][voxel_num:] = 0
-                res["voxel_point_mask"][voxel_num:] = 0
-            else:
-                res["coordinates"] = coors_
-                res["voxels"] = voxels[voxel_mask]
-                res["num_points_per_voxel"] = num_points_per_voxel[voxel_mask]
-                res["voxel_point_mask"] = voxel_point_mask[voxel_mask]
-            voxel_num = coors_.shape[0]
-        else:
-            voxel_num = points_to_voxel_3d_np(points, voxels, voxel_point_mask,
-                                              coors, num_points_per_voxel,
-                                              coor_to_voxelidx,
-                                              voxel_size.tolist(),
-                                              coors_range.tolist(), max_points,
-                                              max_voxels)
-    res["voxel_num"] = voxel_num
-    res["voxel_point_mask"] = res["voxel_point_mask"].reshape(
-        -1, max_points, 1)
-    return res
-
-
-class VoxelGenerator:
-    def __init__(self,
-                 voxel_size,
-                 point_cloud_range,
-                 max_num_points,
-                 max_voxels=20000,
-                 full_mean=True):
-        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
-        # [0, -40, -3, 70.4, 40, 1]
-        voxel_size = np.array(voxel_size, dtype=np.float32)
-        grid_size = (point_cloud_range[3:] -
-                     point_cloud_range[:3]) / voxel_size
-        grid_size = np.round(grid_size).astype(np.int64)
-        voxelmap_shape = tuple(np.round(grid_size).astype(np.int32).tolist())
-        voxelmap_shape = voxelmap_shape[::-1]
-
-        self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
-        self._voxel_size = voxel_size
-        self._point_cloud_range = point_cloud_range
-        self._max_num_points = max_num_points
-        self._max_voxels = max_voxels
-        self._grid_size = grid_size
-        self._full_mean = full_mean
-
-    def generate(self, points, max_voxels=None):
-        res = points_to_voxel(points, self._voxel_size,
-                              self._point_cloud_range, self._coor_to_voxelidx,
-                              self._max_num_points, max_voxels
-                              or self._max_voxels, self._full_mean)
-        voxels = res["voxels"]
-        coors = res["coordinates"]
-        num_points_per_voxel = res["num_points_per_voxel"]
-        voxel_num = res["voxel_num"]
-        coors = coors[:voxel_num]
-        voxels = voxels[:voxel_num]
-        num_points_per_voxel = num_points_per_voxel[:voxel_num]
-
-        return (voxels, coors, num_points_per_voxel)
-
-    def generate_multi_gpu(self, points, max_voxels=None):
-        res = points_to_voxel(points, self._voxel_size,
-                              self._point_cloud_range, self._coor_to_voxelidx,
-                              self._max_num_points, max_voxels
-                              or self._max_voxels, self._full_mean)
-        voxels = res["voxels"]
-        coors = res["coordinates"]
-        num_points_per_voxel = res["num_points_per_voxel"]
-        voxel_num = res["voxel_num"]
-        return (voxels, coors, num_points_per_voxel)
-
-    @property
-    def voxel_size(self):
-        return self._voxel_size
-
-    @property
-    def max_num_points_per_voxel(self):
-        return self._max_num_points
-
-    @property
-    def point_cloud_range(self):
-        return self._point_cloud_range
-
-    @property
-    def grid_size(self):
-        return self._grid_size
-
-
-class VoxelGeneratorV2:
-    def __init__(self,
-                 voxel_size,
-                 point_cloud_range,
-                 max_num_points,
-                 max_voxels=20000,
-                 full_mean=False,
-                 block_filtering=False,
-                 block_factor=8,
-                 block_size=3,
-                 height_threshold=0.1,
-                 height_high_threshold=2.0):
-        assert full_mean is False, "don't use this."
-        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
-        # [0, -40, -3, 70.4, 40, 1]
-        voxel_size = np.array(voxel_size, dtype=np.float32)
-        grid_size = (point_cloud_range[3:] -
-                     point_cloud_range[:3]) / voxel_size
-        grid_size = np.round(grid_size).astype(np.int64)
-        if block_filtering:
-            assert block_size > 0
-            assert grid_size[0] % block_factor == 0
-            assert grid_size[1] % block_factor == 0
-
-        voxelmap_shape = tuple(np.round(grid_size).astype(np.int32).tolist())
-        voxelmap_shape = voxelmap_shape[::-1]
-        self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
-        self._voxel_size = voxel_size
-        self._point_cloud_range = point_cloud_range
-        self._max_num_points = max_num_points
-        self._max_voxels = max_voxels
-        self._grid_size = grid_size
-        self._full_mean = full_mean
-        self._block_filtering = block_filtering
-        self._block_factor = block_factor
-        self._height_threshold = height_threshold
-        self._block_size = block_size
-        self._height_high_threshold = height_high_threshold
-
-    def generate(self, points, max_voxels=None):
-        res = points_to_voxel(points, self._voxel_size,
-                              self._point_cloud_range, self._coor_to_voxelidx,
-                              self._max_num_points, max_voxels
-                              or self._max_voxels, self._full_mean,
-                              self._block_filtering, self._block_factor,
-                              self._block_size, self._height_threshold,
-                              self._height_high_threshold)
-        for k, v in res.items():
-            if k != "voxel_num":
-                res[k] = v[:res["voxel_num"]]
-        return res
-
-    def generate_multi_gpu(self, points, max_voxels=None):
-        res = points_to_voxel(points,
-                              self._voxel_size,
-                              self._point_cloud_range,
-                              self._coor_to_voxelidx,
-                              self._max_num_points,
-                              max_voxels or self._max_voxels,
-                              self._full_mean,
-                              self._block_filtering,
-                              self._block_factor,
-                              self._block_size,
-                              self._height_threshold,
-                              self._height_high_threshold,
-                              pad_output=True)
-        return res
-
-    @property
-    def voxel_size(self):
-        return self._voxel_size
-
-    @property
-    def max_num_points_per_voxel(self):
-        return self._max_num_points
-
-    @property
-    def point_cloud_range(self):
-        return self._point_cloud_range
-
-    @property
-    def grid_size(self):
-        return self._grid_size
-
-
-class VoxelGeneratorV3:
-    def __init__(self, voxel_size, point_cloud_range, max_points, num_features,
-                 dtype, device):
-
-        self._max_points = max_points
-
-        self._point_cloud_range = point_cloud_range
-        self._voxel_size = voxel_size
-        self._grid_size = torch.round(
-            (self._point_cloud_range[3:] - self._point_cloud_range[:3]) /
-            self._voxel_size).to(torch.int32)
-        grid_volume = self._grid_size.prod()
-        self._grid_size = self._grid_size.cpu().numpy().tolist()
-        self._ndim = len(self._grid_size)
-
-        self._dtype = dtype
-        self._device = device
-
-        self._point_index = torch.full([max_points + 1],
-                                       grid_volume,
-                                       dtype=torch.int32,
-                                       device=self._device)
-        self._grids = torch.zeros([grid_volume, num_features],
-                                  dtype=self._dtype,
-                                  device=self._device)
-        self._num_points_per_grid = torch.zeros([grid_volume],
-                                                dtype=torch.int32,
-                                                device=self._device)
-        self._voxels = torch.zeros([max_points, num_features],
-                                   dtype=self._dtype,
-                                   device=self._device)
-        self._coors = torch.zeros([max_points, self._ndim],
-                                  dtype=torch.int32,
-                                  device=self._device)
-
-    def generate(self, points):
-        assert points.shape[
-            0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(
-                points.shape[0])
-        points.to(self._dtype).to(self._device)
-        return self.points_to_voxel(points)
-
-    def generate_multi_gpu(self, points):
-        assert points.shape[
-            0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(
-                points.shape[0])
-        points.to(self._dtype).to(self._device)
-        return self.points_to_voxel(points)
-
-    @property
-    def voxel_size(self):
-        return self._voxel_size
-
-    @property
-    def point_cloud_range(self):
-        return self._point_cloud_range
-
-    @property
-    def grid_size(self):
-        return self._grid_size
-
-    def points_to_voxel(self, points):
-        """
-            points: [N, ndim] float tensor. points[:, :3] contain xyz points and
-                points[:, 3:] contain other information such as reflectivity.
-            voxel_size: [3] list/tuple or array or tensor, float. xyz, indicate voxel size
-            coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
-                format: xyzxyz, minmax
-        """
-        indexes = torch.floor((points[:, :3] - self._point_cloud_range[:3]) /
-                              self._voxel_size).to(torch.int32)
-        num_voxel = torch.ops.spconv.points_to_voxel(
-            points, indexes, self._point_index, self._grids,
-            self._num_points_per_grid, self._voxels, self._coors,
-            self._grid_size, self._ndim)
-        voxels = self._voxels[:num_voxel, :]
-        coors = self._coors[:num_voxel, :]
-
-        # xyz --> zyx
-        #coors = coors[::-1]
-        x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape(
-            [-1, 1]), coors[:, 2].reshape([-1, 1])
-        coors = torch.cat([z, y, x], dim=1)
-        # can be skipped
-        #        x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
-        #        voxels = torch.cat([z, y, x, f], dim=1)
-        return voxels, coors
+from cumm import tensorview as tv 
+from spconv.core_cc.csrc.sparse.all.ops1d import Point2Voxel as Point2VoxelGPU1d
+from spconv.core_cc.csrc.sparse.all.ops2d import Point2Voxel as Point2VoxelGPU2d
+from spconv.core_cc.csrc.sparse.all.ops3d import Point2Voxel as Point2VoxelGPU3d
+from spconv.core_cc.csrc.sparse.all.ops4d import Point2Voxel as Point2VoxelGPU4d
+
+from spconv.core_cc.csrc.sparse.all.ops_cpu1d import Point2VoxelCPU as Point2VoxelCPU1d
+from spconv.core_cc.csrc.sparse.all.ops_cpu2d import Point2VoxelCPU as Point2VoxelCPU2d
+from spconv.core_cc.csrc.sparse.all.ops_cpu3d import Point2VoxelCPU as Point2VoxelCPU3d
+from spconv.core_cc.csrc.sparse.all.ops_cpu4d import Point2VoxelCPU as Point2VoxelCPU4d
\ No newline at end of file
--- a/src/cuhash/CMakeLists.txt
+++ b/src/cuhash/CMakeLists.txt
-if(WIN32)
-    add_library(cuhash SHARED hash_functions.cu hash_table.cpp hash_table.cu hash_functions.cpp)
-else()
-    add_library(cuhash STATIC hash_functions.cu hash_table.cpp hash_table.cu hash_functions.cpp)
-endif()
-target_include_directories(cuhash PRIVATE ${ALL_INCLUDE} )
-set_property(TARGET cuhash PROPERTY CUDA_STANDARD 14)
-set_property(TARGET cuhash PROPERTY CXX_STANDARD 14)
-set_target_properties(cuhash PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-set_target_properties(cuhash PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
-if(NOT WIN32)
-    set_property(TARGET cuhash PROPERTY POSITION_INDEPENDENT_CODE ON)
-endif()
-target_link_libraries(cuhash PRIVATE ${ALL_LIBS})
-install (TARGETS cuhash DESTINATION lib)
-
-if (SPCONV_BuildTests)
-    add_executable(cuhash_test main.cc)
-    target_include_directories(cuhash_test PRIVATE ${ALL_INCLUDE} )
-    set_property(TARGET cuhash_test PROPERTY CUDA_STANDARD 14)
-    set_property(TARGET cuhash_test PROPERTY CXX_STANDARD 14)
-    set_target_properties(cuhash_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-    target_link_libraries(cuhash_test PRIVATE ${ALL_LIBS} cuhash)
-    install (TARGETS cuhash_test DESTINATION bin)
-endif()
\ No newline at end of file
--- a/src/cuhash/debugging.cpp
+++ b/src/cuhash/debugging.cpp
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file
- * debugging.cpp
- *
- * @brief Debugging/statistics/performance utilities for hash tables.
- */
-
-#include <cuhash/debugging.h>
-#include <cuhash/definitions.h>
-
-#include <algorithm>
-#include <cstring>
-#include <cuhash/cuda_util.h>
-
-namespace cuhash {
-
-void OutputRetrievalStatistics(const unsigned n_queries,
-                               const unsigned *d_retrieval_probes,
-                               const unsigned n_functions) {
-  unsigned *retrieval_probes = new unsigned[n_queries];
-  CUDA_SAFE_CALL(cudaMemcpy(retrieval_probes, d_retrieval_probes,
-                            sizeof(unsigned) * n_queries,
-                            cudaMemcpyDeviceToHost));
-
-  // Create a histogram showing how many items needed how many probes to be
-  // found.
-  unsigned possible_probes = n_functions + 2;
-  unsigned *histogram = new unsigned[possible_probes];
-  memset(histogram, 0, sizeof(unsigned) * (possible_probes));
-  for (unsigned i = 0; i < n_queries; ++i) {
-    histogram[retrieval_probes[i]]++;
-  }
-
-  // Dump it.
-  char buffer[10000];
-  sprintf(buffer, "Probes for retrieval: ");
-  PrintMessage(buffer);
-  for (unsigned i = 0; i < possible_probes; ++i) {
-    sprintf(buffer, "\t(%u, %u)", i, histogram[i]);
-    PrintMessage(buffer);
-  }
-  delete[] retrieval_probes;
-  delete[] histogram;
-}
-
-void OutputBuildStatistics(const unsigned n,
-                           const unsigned *d_iterations_taken) {
-  // Output how many iterations each thread took until it found an empty slot.
-  unsigned *iterations_taken = new unsigned[n];
-  CUDA_SAFE_CALL(cudaMemcpy(iterations_taken, d_iterations_taken,
-                            sizeof(unsigned) * n, cudaMemcpyDeviceToHost));
-  std::sort(iterations_taken, iterations_taken + n);
-  unsigned total_iterations = 0;
-  unsigned max_iterations_taken = 0;
-  for (unsigned i = 0; i < n; ++i) {
-    total_iterations += iterations_taken[i];
-    max_iterations_taken = std::max(max_iterations_taken, iterations_taken[i]);
-  }
-
-  unsigned current_value = iterations_taken[0];
-  unsigned count = 1;
-  char buffer[10000];
-  sprintf(buffer, "Iterations taken:\n");
-  for (unsigned i = 1; i < n; ++i) {
-    if (iterations_taken[i] != current_value) {
-      sprintf(buffer, "%s\t(%u, %u)\n", buffer, current_value, count);
-      current_value = iterations_taken[i];
-      count = 1;
-    } else {
-      count++;
-    }
-  }
-  sprintf(buffer, "%s\t(%u, %u)", buffer, current_value, count);
-  PrintMessage(buffer);
-  sprintf(buffer, "Total iterations: %u", total_iterations);
-  PrintMessage(buffer);
-  sprintf(buffer, "Avg/Med/Max iterations: (%f %u %u)",
-          (float)total_iterations / n, iterations_taken[n / 2],
-          iterations_taken[n - 1]);
-  PrintMessage(buffer);
-  delete[] iterations_taken;
-
-  // Print the length of the longest eviction chain.
-  sprintf(buffer, "Max iterations: %u", max_iterations_taken);
-  PrintMessage(buffer);
-}
-
-}; // namespace cuhash
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
--- a/src/cuhash/debugging.cu
+++ b/src/cuhash/debugging.cu
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file
- * debugging.cu
- *
- * @brief Debugging/statistics/performance utilities for hash tables.
- */
-
-#include <cuhash/debugging.h>
-#include <cuhash/definitions.h>
-#include <cuhash/hash_table.cuh>
-
-#include <algorithm>
-#include <cuhash/cuda_util.h>
-
-namespace cuhash {
-
-//! Debugging function: Takes statistics on the hash functions' distribution.
-/*! Determines:
- *    - How many unique slots each key has.
- *    - How many keys hash into each slot.
- *    - Whether any keys failed to get a full set of slots.
- */
-__global__ void take_hash_function_statistics_kernel(
-    const unsigned *keys, const unsigned n_entries, const unsigned table_size,
-    const uint2 *constants, const unsigned num_functions,
-    unsigned *num_slots_available, unsigned *num_hashing_in, unsigned *failed) {
-  unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
-                          blockIdx.y * blockDim.x * gridDim.x;
-
-  if (thread_index >= n_entries)
-    return;
-  unsigned key = keys[thread_index];
-
-  // Determine all of the locations the key hashes into.
-  // Also count how many keys hash into each location.
-  unsigned locations[kMaxHashFunctions];
-  for (unsigned i = 0; i < num_functions; ++i) {
-    locations[i] = hash_function_inner(constants[i], key) % table_size;
-
-    if (num_hashing_in != NULL) {
-      atomicAdd(num_hashing_in + locations[i], 1);
-    }
-  }
-
-  // Determine whether all of the locations were different.
-  unsigned num_slots = 1;
-  for (unsigned i = 1; i < num_functions; ++i) {
-    bool matched = false;
-    for (unsigned j = 0; j < i; ++j) {
-      if (locations[i] == locations[j]) {
-        matched = true;
-        break;
-      }
-    }
-    if (!matched) {
-      num_slots++;
-    }
-  }
-
-  if (num_slots_available != NULL) {
-    num_slots_available[thread_index] = num_slots;
-  }
-
-  if (failed != NULL && num_slots != num_functions) {
-    *failed = 1;
-  }
-}
-
-void TakeHashFunctionStatistics(const unsigned num_keys, const unsigned *d_keys,
-                                const unsigned table_size,
-                                const uint2 *constants,
-                                const unsigned kNumHashFunctions) {
-  char buffer[16000];
-  PrintMessage("Hash function constants: ");
-
-  for (unsigned i = 0; i < kNumHashFunctions; ++i) {
-    sprintf(buffer, "\t%10u, %10u", constants[i].x, constants[i].y);
-    PrintMessage(buffer);
-  }
-
-  unsigned *d_num_hashing_in = NULL;
-#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_num_hashing_in, sizeof(unsigned) * table_size));
-  CUDA_SAFE_CALL(
-      cudaMemset(d_num_hashing_in, 0, sizeof(unsigned) * table_size));
-#endif
-
-  unsigned *d_num_slots_available = NULL;
-#ifdef COUNT_HOW_MANY_HAVE_CYCLES
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_num_slots_available, sizeof(unsigned) * num_keys));
-#endif
-  uint2 *d_constants = NULL;
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_constants, sizeof(uint2) * kNumHashFunctions));
-  CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants,
-                            sizeof(uint2) * kNumHashFunctions,
-                            cudaMemcpyHostToDevice));
-
-  take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys),
-                                         kBlockSize>>>(
-      d_keys, num_keys, table_size, d_constants, kNumHashFunctions,
-      d_num_slots_available, d_num_hashing_in, NULL);
-  CUDA_SAFE_CALL(cudaFree(d_constants));
-
-#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
-  unsigned *num_hashing_in = new unsigned[table_size];
-  CUDA_SAFE_CALL(cudaMemcpy(num_hashing_in, d_num_hashing_in,
-                            sizeof(unsigned) * table_size,
-                            cudaMemcpyDeviceToHost));
-
-  /*
-  // Print how many items hash into each slot.
-  // Used to make sure items are spread evenly throughout the table.
-  buffer[0] = '\0';
-  PrintMessage("Num hashing into each: ", true);
-  for (unsigned i = 0; i < table_size; ++i) {
-    sprintf(buffer, "%s\t%2u", buffer, num_hashing_in[i]);
-    if (i % 25 == 24) {
-      PrintMessage(buffer, true);
-      buffer[0] = '\0';
-    }
-  }
-  PrintMessage(buffer,true);
-  */
-
-  // Print a histogram of how many items are hashed into each slot.  Shows
-  // if average number of items hashing into each slot is low.
-  std::sort(num_hashing_in, num_hashing_in + table_size);
-  int count = 1;
-  unsigned previous = num_hashing_in[0];
-  sprintf(buffer, "Num items hashing into a slot:\t");
-  PrintMessage(buffer);
-  for (unsigned i = 1; i < table_size; ++i) {
-    if (num_hashing_in[i] != previous) {
-      sprintf(buffer, "\t(%u, %u)", previous, count);
-      PrintMessage(buffer);
-      previous = num_hashing_in[i];
-      count = 1;
-    } else {
-      count++;
-    }
-  }
-  sprintf(buffer, "\t(%u, %u)", previous, count);
-  PrintMessage(buffer);
-
-  delete[] num_hashing_in;
-  CUDA_SAFE_CALL(cudaFree(d_num_hashing_in));
-#endif
-
-#ifdef COUNT_HOW_MANY_HAVE_CYCLES
-  unsigned *num_slots_available = new unsigned[num_keys];
-  CUDA_SAFE_CALL(cudaMemcpy(num_slots_available, d_num_slots_available,
-                            sizeof(unsigned) * num_keys,
-                            cudaMemcpyDeviceToHost));
-
-  static const unsigned kHistogramSize = kNumHashFunctions + 1;
-  unsigned *histogram = new unsigned[kHistogramSize];
-  memset(histogram, 0, sizeof(unsigned) * kHistogramSize);
-  for (unsigned i = 0; i < num_keys; ++i) {
-    histogram[num_slots_available[i]]++;
-  }
-
-  sprintf(buffer, "Slots assigned to each key: ");
-  for (unsigned i = 1; i < kHistogramSize; ++i) {
-    sprintf(buffer, "%s(%u, %u) ", buffer, i, histogram[i]);
-  }
-  PrintMessage(buffer);
-
-  delete[] histogram;
-  delete[] num_slots_available;
-  CUDA_SAFE_CALL(cudaFree(d_num_slots_available));
-#endif
-}
-
-bool CheckAssignedSameSlot(const unsigned N, const unsigned num_keys,
-                           const unsigned *d_keys, const unsigned table_size,
-                           uint2 *constants) {
-  unsigned *d_cycle_exists = NULL;
-  uint2 *d_constants = NULL;
-
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_cycle_exists, sizeof(unsigned)));
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_constants, sizeof(uint2) * N));
-
-  CUDA_SAFE_CALL(cudaMemset(d_cycle_exists, 0, sizeof(unsigned)));
-  CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants, sizeof(uint2) * N,
-                            cudaMemcpyHostToDevice));
-
-  // Check if all keys were given a full set of N slots by the functions.
-  take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys),
-                                         kBlockSize>>>(
-      d_keys, num_keys, table_size, d_constants, N, NULL, NULL, d_cycle_exists);
-
-  unsigned cycle_exists;
-  CUDA_SAFE_CALL(cudaMemcpy(&cycle_exists, d_cycle_exists, sizeof(unsigned),
-                            cudaMemcpyDeviceToHost));
-
-  CUDA_SAFE_CALL(cudaFree(d_cycle_exists));
-  CUDA_SAFE_CALL(cudaFree(d_constants));
-
-  return (cycle_exists != 0);
-}
-
-void PrintStashContents(const Entry *d_stash) {
-  Entry *stash = new Entry[cuhash::kStashSize];
-  CUDA_SAFE_CALL(cudaMemcpy(stash, d_stash, sizeof(Entry) * cuhash::kStashSize,
-                            cudaMemcpyDeviceToHost));
-  for (unsigned i = 0; i < cuhash::kStashSize; ++i) {
-    if (get_key(stash[i]) != kKeyEmpty) {
-      char buffer[256];
-      sprintf(buffer, "Stash[%u]: %u = %u", i, get_key(stash[i]),
-              get_value(stash[i]));
-      PrintMessage(buffer, true);
-    }
-  }
-  delete[] stash;
-}
-
-}; // namespace cuhash
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
--- a/src/cuhash/hash_functions.cpp
+++ b/src/cuhash/hash_functions.cpp
-// nvcc (cuda) 9.0 with gcc 5.5 don't support random, so compile it in host
-
-#include <random>
-
-namespace cuhash {
-
-std::random_device random_dev;
-
-std::mt19937 random_engine(random_dev());
-std::uniform_int_distribution<unsigned> uint_distribution;
-
-unsigned generate_random_uint32() { return uint_distribution(random_engine); }
-
-} // namespace cuhash
\ No newline at end of file
--- a/src/cuhash/hash_functions.cu
+++ b/src/cuhash/hash_functions.cu
-#include <cassert>
-#include <cuhash/debugging.h>
-#include <cuhash/hash_functions.h>
-#include <cuhash/hash_table.h>
-
-namespace cuhash {
-
-void GenerateFunctions(const unsigned N, const unsigned num_keys,
-                       const unsigned *d_keys, const unsigned table_size,
-                       uint2 *constants) {
-  bool regenerate = true;
-
-  while (regenerate) {
-    regenerate = false;
-
-    // Generate a set of hash function constants for this build attempt.
-    for (unsigned i = 0; i < N; ++i) {
-      // uint_distribution(random_engine) % kPrimeDivisor;
-      // genrand_int32() % kPrimeDivisor;
-      unsigned new_a = generate_random_uint32() % kPrimeDivisor;
-      constants[i].x = (1 > new_a ? 1 : new_a);
-      constants[i].y = generate_random_uint32() % kPrimeDivisor;
-    }
-
-#ifdef FORCEFULLY_GENERATE_NO_CYCLES
-    // Ensure that every key gets N different slots.
-    regenerate =
-        CheckAssignedSameSlot(N, num_keys, d_keys, table_size, constants);
-#endif
-  }
-
-#ifdef TAKE_HASH_FUNCTION_STATISTICS
-  // Examine how well distributed the items are.
-  TakeHashFunctionStatistics(num_keys, d_keys, table_size, constants, N);
-#endif
-}
-
-}; // namespace cuhash
--- a/src/cuhash/hash_table.cpp
+++ b/src/cuhash/hash_table.cpp
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.cpp
- *
- * @brief Implements a basic hash table that stores one value per key.
- */
-
-#include <cuhash/debugging.h>
-#include <cuhash/hash_table.h>
-
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <cuda_runtime_api.h>
-#include <cuhash/cuda_util.h>
-#include <limits>
-
-namespace cuhash {
-
-char buffer[256];  // Scratch space for the sprintf-formatted messages passed to PrintMessage() in this file.
-
-//! @name Internal
-/// @{
-// Returns a launch grid for n items, given kBlockSize as the block dimension.
-// The block count is rounded up so that every item is covered.  When that
-// count exceeds kGridSize, grid.x is capped at kGridSize and grid.y receives
-// the (rounded-up) number of kGridSize-wide rows needed.
-dim3 ComputeGridDim(unsigned n) {
-  const unsigned blocks_needed = (n + kBlockSize - 1) / kBlockSize;
-  if (blocks_needed <= kGridSize) {
-    return dim3(blocks_needed);
-  }
-  return dim3(kGridSize, (blocks_needed + kGridSize - 1) / kGridSize);
-}
-
-// Returns the iteration cap for a build of n items into table_size slots.
-// num_functions is accepted but not consulted by either formula.
-unsigned ComputeMaxIterations(const unsigned n, const unsigned table_size,
-                              const unsigned num_functions) {
-  // lg(n): evaluated in double, then narrowed to float.
-  const float log2_n =
-      static_cast<float>(log(static_cast<double>(n)) / log(2.0));
-
-// #define CONSTANT_ITERATIONS
-#ifdef CONSTANT_ITERATIONS
-  // Cap the iterations at 7lg(N).
-  const unsigned kIterationsPerLg = 7;
-  return kIterationsPerLg * log2_n;
-#else
-  // Empirical formula driven by the natural log of the load factor
-  // (n / table_size).  Works OK in most situations.
-  const float fill_ratio = float(n) / table_size;
-  const float ln_fill_ratio =
-      static_cast<float>(log(fill_ratio) / log(2.71828183));
-
-  return static_cast<unsigned>(
-      4.0 * ceil(-1.0 / (0.028255 + 1.1594772 * ln_fill_ratio) * log2_n));
-#endif
-}
-/// @}
-
-// Constructs an empty table: zero slots, no device allocations, empty stash.
-// num_hash_functions_ is zeroed as well so that it never holds an
-// indeterminate value if Build() or Retrieve() runs before Initialize().
-HashTable::HashTable()
-    : table_size_(0), d_contents_(NULL), stash_count_(0), d_failures_(NULL) {
-  // Assigned in the body rather than the initializer list because the member
-  // declaration order is not visible from this file.
-  num_hash_functions_ = 0;
-  CUDA_CHECK_ERROR("Failed in constructor.\n");
-}
-
-// (Re)initializes the table: releases any previous allocation, validates the
-// arguments, then allocates device storage.
-//
-// Returns false when num_functions is outside 2..5, when space_usage is below
-// kMinimumSpaceUsages[num_functions], or when either device pointer is null
-// after allocation.  Returns true otherwise.
-bool HashTable::Initialize(const unsigned max_table_entries,
-                           const float space_usage,
-                           const unsigned num_functions) {
-  Release();
-
-  // Only 2 through 5 hash functions are implemented.
-  const bool supported_count = num_functions >= 2 && num_functions <= 5;
-  if (!supported_count) {
-    char message[256] = "Number of hash functions must be from 2 to 5; "
-                        "others are unimplemented.";
-    PrintMessage(message, true);
-    return false;
-  }
-
-  // Reject a space usage below the minimum for this function count.
-  const float minimum_space_usage = kMinimumSpaceUsages[num_functions];
-  if (space_usage < minimum_space_usage) {
-    sprintf(buffer, "Minimum possible space usage for %u functions is %f.",
-            num_functions, minimum_space_usage);
-    PrintMessage(buffer);
-    return false;
-  }
-
-  num_hash_functions_ = num_functions;
-  table_size_ = unsigned(ceil(max_table_entries * space_usage));
-
-  // Device storage: the table slots plus kStashSize extra slots, and one
-  // unsigned failure counter.
-  const unsigned slots_to_allocate = table_size_ + kStashSize;
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_contents_, sizeof(Entry) * slots_to_allocate));
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_failures_, sizeof(unsigned)));
-
-  const bool allocated = d_contents_ && d_failures_;
-  if (!allocated) {
-    fprintf(stderr, "Failed to allocate %u slots.\n", slots_to_allocate);
-    return false;
-  }
-  CUDA_CHECK_ERROR("Failed to initialize.\n");
-
-  return true;
-}
-
-// Frees both device allocations and returns the table to its empty state.
-void HashTable::Release() {
-  // Mark the table as empty before touching the device pointers.
-  table_size_ = 0;
-
-  // Free both buffers first, then clear both pointers.
-  CUDA_SAFE_CALL(cudaFree(d_contents_));
-  CUDA_SAFE_CALL(cudaFree(d_failures_));
-  d_contents_ = NULL;
-  d_failures_ = NULL;
-
-  CUDA_CHECK_ERROR("Failed during release.\n");
-}
-
-// Builds the table from n key/value pairs held in device memory.
-//
-// Each attempt draws fresh hash-function and stash constants, clears the
-// table, and runs the cuckoo-hash kernel.  Attempts repeat while the kernel
-// reports failures and the attempt counter stays below kMaxRestartAttempts.
-//
-// Returns true when the last attempt reported zero failures.
-bool HashTable::Build(const unsigned n, const unsigned *d_keys,
-                      const unsigned *d_values) {
-  unsigned max_iterations =
-      ComputeMaxIterations(n, table_size_, num_hash_functions_);
-  unsigned num_failures = 1;
-  unsigned num_attempts = 0;
-
-  // Storage for statistics collection; stays NULL unless TRACK_ITERATIONS is
-  // defined.
-  unsigned *d_iterations_taken = NULL;
-#ifdef TRACK_ITERATIONS
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_iterations_taken, sizeof(unsigned) * n));
-#endif
-
-  // Track how many items ended up in the stash.
-  unsigned *d_stash_count = NULL;
-  CUDA_SAFE_CALL(cudaMalloc((void **)&d_stash_count, sizeof(unsigned)));
-  CUDA_CHECK_ERROR("Failed before main build loop.\n");
-
-  // Main build loop.
-  while (num_failures && ++num_attempts < kMaxRestartAttempts) {
-    CUDA_SAFE_CALL(cudaMemset(d_stash_count, 0, sizeof(unsigned)));
-
-    // Generate new hash functions.
-    if (num_hash_functions_ == 2)
-      constants_2_.Generate(n, d_keys, table_size_);
-    else if (num_hash_functions_ == 3)
-      constants_3_.Generate(n, d_keys, table_size_);
-    else if (num_hash_functions_ == 4)
-      constants_4_.Generate(n, d_keys, table_size_);
-    else
-      constants_5_.Generate(n, d_keys, table_size_);
-
-    // Reduce first, then clamp.  Clamping before the modulo, as in
-    // std::max(1u, r) % kPrimeDivisor, still yields 0 whenever r is a
-    // non-zero multiple of kPrimeDivisor.  GenerateFunctions() uses the same
-    // reduce-then-clamp order for the per-function constants.
-    const unsigned stash_a = generate_random_uint32() % kPrimeDivisor;
-    stash_constants_.x = (stash_a == 0 ? 1 : stash_a);
-    stash_constants_.y = generate_random_uint32() % kPrimeDivisor;
-    stash_count_ = 0;
-
-    // Initialize memory.
-    unsigned slots_in_table = table_size_ + kStashSize;
-    CUDAWrapper::ClearTable(slots_in_table, kEntryEmpty, d_contents_);
-
-    num_failures = 0;
-
-    CUDAWrapper::CallCuckooHash(
-        n, num_hash_functions_, d_keys, d_values, table_size_, constants_2_,
-        constants_3_, constants_4_, constants_5_, max_iterations, d_contents_,
-        stash_constants_, d_stash_count, d_failures_, d_iterations_taken);
-
-    // Check if successful.
-    CUDA_SAFE_CALL(cudaMemcpy(&num_failures, d_failures_, sizeof(unsigned),
-                              cudaMemcpyDeviceToHost));
-
-#ifdef COUNT_UNINSERTED
-    if (num_failures) {
-      printf("Failed to insert %u items.\n", num_failures);
-    }
-#endif
-  }
-
-  // Copy out the stash size.
-  CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, sizeof(unsigned),
-                            cudaMemcpyDeviceToHost));
-  if (stash_count_ && num_failures == 0) {
-    // The stash is in use after a successful build; its contents are dumped
-    // only in _DEBUG builds.
-#ifdef _DEBUG
-    PrintStashContents(d_contents_ + table_size_);
-#endif
-  }
-  CUDA_SAFE_CALL(cudaFree(d_stash_count));
-
-#ifdef TRACK_ITERATIONS
-  if (num_failures == 0) {
-    OutputBuildStatistics(n, d_iterations_taken);
-  }
-  CUDA_SAFE_CALL(cudaFree(d_iterations_taken));
-#endif
-
-  // Dump some info if a restart was required.
-  if (num_attempts >= kMaxRestartAttempts) {
-    sprintf(buffer, "Completely failed to build");
-    PrintMessage(buffer, true);
-  } else if (num_attempts > 1) {
-    sprintf(buffer, "Needed %u attempts to build, you can ignore this message.",
-            num_attempts);
-    PrintMessage(buffer, true);
-  }
-
-  CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
-  return num_failures == 0;
-}
-
-// Looks up n_queries keys from d_keys and writes the results to d_values.
-// Pure forwarding: every argument beyond the three parameters comes from this
-// table's current state.
-void HashTable::Retrieve(const unsigned n_queries, const unsigned *d_keys,
-                         unsigned *d_values) {
-  CUDAWrapper::CallHashRetrieve(
-      n_queries, num_hash_functions_, d_keys, table_size_, d_contents_,
-      constants_2_, constants_3_, constants_4_, constants_5_,
-      stash_constants_, stash_count_, d_values);
-}
-
-}; // namespace cuhash
-
-// Leave this at the end of the file
-// Local Variables:
-// mode:c++
-// c-file-style: "NVIDIA"
-// End:
--- a/src/cuhash/hash_table.cu
+++ b/src/cuhash/hash_table.cu
-// -------------------------------------------------------------
-// cuDPP -- CUDA Data Parallel Primitives library
-// -------------------------------------------------------------
-// $Revision:$
-// $Date:$
-// -------------------------------------------------------------
-// This source code is distributed under the terms of license.txt in
-// the root directory of this source distribution.
-// -------------------------------------------------------------
-
-/**
- * @file hash_table.cu
- *
- * @brief Hides all of the CUDA calls from the actual CPP file.
- */
-
-#include <cuhash/cuda_util.h>
-#include <cuhash/debugging.h>
-#include <cuhash/definitions.h>
-#include <cuhash/hash_table.cuh>
-
-#include <cuda.h>
-
-namespace cuhash {
-
-namespace CUDAWrapper {
-// Launches clear_table<Entry> over slots_in_table slots of d_contents with
-// fill_value, then checks for a CUDA error.
-void ClearTable(const unsigned slots_in_table, const Entry fill_value,
-                Entry *d_contents) {
-  const dim3 grid = ComputeGridDim(slots_in_table);
-  clear_table<Entry><<<grid, kBlockSize>>>(slots_in_table, fill_value,
-                                           d_contents);
-  TV_CHECK_CUDA_ERR_V2("Error occurred during hash table clear.\n");
-}
-
-// Zeroes the failure counter and launches the CuckooHash kernel with the
-// constant set that matches num_hash_functions (2, 3, or 4; any other value
-// selects constants_5).  A CUDA error check follows the launch.
-void CallCuckooHash(const unsigned n, const unsigned num_hash_functions,
-                    const unsigned *d_keys, const unsigned *d_values,
-                    const unsigned table_size, const Functions<2> constants_2,
-                    const Functions<3> constants_3,
-                    const Functions<4> constants_4,
-                    const Functions<5> constants_5,
-                    const unsigned max_iterations, Entry *d_contents,
-                    uint2 stash_constants, unsigned *d_stash_count,
-                    unsigned *d_failures, unsigned *d_iterations_taken) {
-  // Reset the failure counter.  The call is wrapped like the other CUDA
-  // runtime calls in this file so that its result is no longer discarded.
-  CUDA_SAFE_CALL(cudaMemset(d_failures, 0, sizeof(unsigned)));
-
-  // Build the table.  The launch grid depends only on n, so compute it once.
-  const dim3 grid = ComputeGridDim(n);
-  switch (num_hash_functions) {
-  case 2:
-    CuckooHash<<<grid, kBlockSize>>>(
-        n, d_keys, d_values, table_size, constants_2, max_iterations,
-        d_contents, stash_constants, d_stash_count, d_failures,
-        d_iterations_taken);
-    break;
-  case 3:
-    CuckooHash<<<grid, kBlockSize>>>(
-        n, d_keys, d_values, table_size, constants_3, max_iterations,
-        d_contents, stash_constants, d_stash_count, d_failures,
-        d_iterations_taken);
-    break;
-  case 4:
-    CuckooHash<<<grid, kBlockSize>>>(
-        n, d_keys, d_values, table_size, constants_4, max_iterations,
-        d_contents, stash_constants, d_stash_count, d_failures,
-        d_iterations_taken);
-    break;
-  default:
-    CuckooHash<<<grid, kBlockSize>>>(
-        n, d_keys, d_values, table_size, constants_5, max_iterations,
-        d_contents, stash_constants, d_stash_count, d_failures,
-        d_iterations_taken);
-    break;
-  }
-
-  CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
-}
-
-// Launches the hash_retrieve kernel for n_queries keys with the constant set
-// that matches num_hash_functions (2, 3, or 4; any other value selects
-// constants_5), then checks for a CUDA error.  When TRACK_ITERATIONS is
-// defined, a per-query probe buffer is allocated, reported through
-// OutputRetrievalStatistics(), and freed.
-void CallHashRetrieve(const unsigned n_queries,
-                      const unsigned num_hash_functions, const unsigned *d_keys,
-                      const unsigned table_size, const Entry *d_contents,
-                      const Functions<2> constants_2,
-                      const Functions<3> constants_3,
-                      const Functions<4> constants_4,
-                      const Functions<5> constants_5,
-                      const uint2 stash_constants, const unsigned stash_count,
-                      unsigned *d_values) {
-  // Stays NULL unless TRACK_ITERATIONS is defined.
-  unsigned *d_retrieval_probes = NULL;
-#ifdef TRACK_ITERATIONS
-  CUDA_SAFE_CALL(
-      cudaMalloc((void **)&d_retrieval_probes, sizeof(unsigned) * n_queries));
-#endif
-
-  const dim3 grid = ComputeGridDim(n_queries);
-  switch (num_hash_functions) {
-  case 2:
-    hash_retrieve<<<grid, kBlockSize>>>(
-        n_queries, d_keys, table_size, d_contents, constants_2, stash_constants,
-        stash_count, d_values, d_retrieval_probes);
-    break;
-  case 3:
-    hash_retrieve<<<grid, kBlockSize>>>(
-        n_queries, d_keys, table_size, d_contents, constants_3, stash_constants,
-        stash_count, d_values, d_retrieval_probes);
-    break;
-  case 4:
-    hash_retrieve<<<grid, kBlockSize>>>(
-        n_queries, d_keys, table_size, d_contents, constants_4, stash_constants,
-        stash_count, d_values, d_retrieval_probes);
-    break;
-  default:
-    hash_retrieve<<<grid, kBlockSize>>>(
-        n_queries, d_keys, table_size, d_contents, constants_5, stash_constants,
-        stash_count, d_values, d_retrieval_probes);
-    break;
-  }
-
-  CUDA_CHECK_ERROR("Retrieval failed.\n");
-
-#ifdef TRACK_ITERATIONS
-  OutputRetrievalStatistics(n_queries, d_retrieval_probes, num_hash_functions);
-  CUDA_SAFE_CALL(cudaFree(d_retrieval_probes));
-#endif
-}
-}; // namespace CUDAWrapper
-
-}; // namespace cuhash
--- a/src/cuhash/main.cc
+++ b/src/cuhash/main.cc
-#include <cstdio>
-#include <iostream>
-
-#include <cuda.h>
-#include <cuhash/hash_table.h>
-
-// Demo: stores N key/value pairs in a cuhash::HashTable, queries N keys, and
-// prints the build status followed by the retrieved values.
-// Returns 1 if the table cannot be initialized, 0 otherwise.
-int main() {
-  const int N = 10;
-
-  auto table = cuhash::HashTable();
-  // Initialize() reports failure through its return value; stop early rather
-  // than building into a table that was never set up.
-  if (!table.Initialize(N, 2.0)) {
-    std::cout << "Failed to initialize the hash table." << std::endl;
-    return 1;
-  }
-
-  // Data to store in the hash table.
-  int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
-  int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-
-  // Copy to device memory.
-  int *d_keys, *d_vals;
-  cudaMalloc((void **)&d_keys, sizeof(int) * N);
-  cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMalloc((void **)&d_vals, sizeof(int) * N);
-  cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
-
-  // Keys to query the hash table with.
-  int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-  int output[N];
-
-  // Copy to device memory.
-  int *d_input, *d_output;
-  cudaMalloc((void **)&d_input, sizeof(int) * N);
-  cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
-  cudaMalloc((void **)&d_output, sizeof(int) * N);
-  cudaMemset(d_output, 0, sizeof(int) * N);
-  bool s = table.Build(N, (const unsigned int *)d_keys,
-                       (const unsigned int *)d_vals);
-
-  std::cout << s << std::endl;
-  table.Retrieve(N, (const unsigned int *)d_input, (unsigned int *)d_output);
-
-  std::cout << s << std::endl;
-  cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
-  for (int i = 0; i < N; ++i) {
-    printf("%d\n", output[i]);
-  }
-
-  // Release the device buffers allocated above.
-  cudaFree(d_keys);
-  cudaFree(d_vals);
-  cudaFree(d_input);
-  cudaFree(d_output);
-
-  return 0;
-}
\ No newline at end of file
--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
-# Sources that are always built; the CUDA sources are appended when
-# SPCONV_BuildCUDA is set.
-set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc point2voxel_ops.cc)
-if (SPCONV_BuildCUDA)
-    list(APPEND ALL_FILES indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc point2voxel.cu fused_conv.cu)
-endif()
-add_library(spconv SHARED ${ALL_FILES})
-
-# Link OpenMP only when find_package located a C++ OpenMP target.
-find_package(OpenMP)
-if(OpenMP_CXX_FOUND)
-    target_link_libraries(spconv PUBLIC OpenMP::OpenMP_CXX)
-endif()
-
-
-target_include_directories(spconv PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE})
-set_target_properties(spconv PROPERTIES
-    CUDA_STANDARD 14
-    CXX_STANDARD 14
-    CUDA_SEPARABLE_COMPILATION ON)
-
-# cuhash and spgemm are linked only for CUDA builds.
-if (SPCONV_BuildCUDA)
-    target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash spgemm)
-else()
-    target_link_libraries(spconv PRIVATE ${ALL_LIBS})
-endif()
-install (TARGETS spconv DESTINATION lib)