working on c++ only

899008fa · yan.yan · f78575ea · 899008fa · 899008fa · 899008fa
Commit 899008fa authored Jul 20, 2022 by yan.yan
11 changed files
--- a/spconv/csrc/sparse/indices.py
+++ b/spconv/csrc/sparse/indices.py
@@ -248,8 +248,7 @@ class ConvOutLocIter(pccm.ParameterizedClass):
 class SparseConvIndicesKernel(pccm.ParameterizedClass):
    def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
        super().__init__()
-        self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel,
-                            ThrustLib)
+        self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel)
        self.loc_iter = ConvOutLocIter(problem)
        self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
        self.add_param_class("spinds", problem, "ConvProblem")
@@ -271,7 +270,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("indice_pairs",
                 f"{self.dtype_indices}*")  # [2, kernelProd, MaxSize]
        code.arg("indice_pairs_for_uniq",
-                 f"TIndiceUniq*")  # [2, kernelProd, MaxSize]
+                 f"TIndiceUniq*")  # [kernelProd * MaxSize + 1]
        code.arg("indice_num_per_loc", f"int*")  # [kernelProd]

        code.arg("num_indices_in", "int")
@@ -340,7 +339,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("indice_pairs_out_part", f"int*")  # [kernelProd, MaxSize]
        code.arg("num_indices_in", "int")
        code.arg("indices_pair_size", "int")
-        # TODO use block instead of filter_offset?
        code.raw(f"""
        int filter_offset = blockIdx.y;
        auto indice_pairs_out_part_filter = indice_pairs_out_part + filter_offset * indices_pair_size;
@@ -358,6 +356,46 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        """)
        return code

+    @pccm.cuda.cuda_global_function
+    def calc_conv_indices_stage2_bounded(self):
+        """Stage 2 for bounded outputs: pairs whose output key is absent from the
+        hash table are dropped; the rest are re-packed per filter offset via an
+        atomic counter (indice_num_per_loc). Keys are read as TTable::key_type;
+        indice_pairs_in_part_temp is the caller's copy of the input-side pairs.
+        """
+        code = pccm.FunctionCode()
+        code.targ("TTable")
+
+        code.arg("table", f"TTable")  # [N, ndim + 1]
+        code.arg("indice_pairs_uniq_before_sort", f"const typename TTable::key_type*")  # [kernelProd, MaxSize]
+        code.arg("indice_pairs_in_part_temp", f"const int*")  # [kernelProd, MaxSize]
+        code.arg("indice_pairs_in_part", f"int*")  # [kernelProd, MaxSize]
+        code.arg("indice_pairs_out_part", f"int*")  # [kernelProd, MaxSize]
+        code.arg("indice_num_per_loc", f"int*")  # [kernelProd]
+        code.arg("num_indices_in", "int")
+        code.arg("indices_pair_size", "int")
+        code.raw(f"""
+        int filter_offset = blockIdx.y;
+        auto indice_pairs_in_part_filter = indice_pairs_in_part + filter_offset * indices_pair_size;
+        auto indice_pairs_out_part_filter = indice_pairs_out_part + filter_offset * indices_pair_size;
+
+        auto indice_pairs_in_part_temp_filter = indice_pairs_in_part_temp + filter_offset * indices_pair_size;
+        auto indice_pairs_uniq_before_sort_filter = indice_pairs_uniq_before_sort + filter_offset * indices_pair_size;
+
+        for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
+            typename TTable::key_type output_coord_offset = indice_pairs_uniq_before_sort_filter[i];
+            if (output_coord_offset != std::numeric_limits<typename TTable::key_type>::max()){{
+                auto table_offset = table.lookup_offset(output_coord_offset);
+                if (table_offset != -1){{
+                    int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
+                    indice_pairs_in_part_filter[old_num] = indice_pairs_in_part_temp_filter[i];
+                    indice_pairs_out_part_filter[old_num] = table.value_ptr()[table_offset];
+                }}
+            }}
+        }}
+        """)
+        return code
+
    @pccm.cuda.cuda_global_function
    def calc_conv_indices_stage1_mask(self):
        code = pccm.FunctionCode()
@@ -369,7 +407,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("indice_pairs_bwd",
                 f"{self.dtype_indices}*")  # [kernelProd, MaxSize]
        code.arg("indice_pairs_for_uniq",
-                 f"TIndiceUniq*")  # [2, kernelProd, MaxSize]
+                 f"TIndiceUniq*")  # [kernelProd * MaxSize + 1]
        code.arg("indice_num_per_loc", f"int*")  # [kernelProd]

        code.arg("num_indices_in", "int")
@@ -397,6 +435,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
                // indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
                // indice_pairs_bwd[filter_offset_mul_indices_pair_size + input_index] = output_coord_offset;
                // indice_pairs_for_uniq[filter_offset_mul_indices_pair_size + old_num] = output_coord_offset;
+                
                indice_pairs_for_uniq[filter_offset_mul_indices_pair_size + input_index] = output_coord_offset;
                // }}
            }}
@@ -420,7 +459,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("num_indices_in", "int")
        code.arg("num_indices_out", "int")

-        # TODO use block instead of filter_offset?
        code.raw(f"""
        int filter_offset = blockIdx.y;
        uint32_t filter_mask_fwd = (1u << (filter_offset));
@@ -458,7 +496,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code.arg("num_indices_in", "int")
        code.arg("kv", "int")

-        # TODO use block instead of filter_offset?
        code.raw(f"""
        for (int input_index : tv::KernelLoopX<int>(num_indices_in)) {{
            uint32_t mask = 0;
@@ -749,18 +786,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
                indice_pairs_uniq.data_ptr<T>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
                indice_pairs.dim(2), kv, transposed);
        }});
-        // thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
-        // auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
-        // thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
-        // auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
-        // auto num_out_act = new_end - ptr_tr - 1;
-        // return num_out_act;
        """)
        return code  # .ret("int")

    @pccm.cuda.static_function
    def generate_conv_inds_stage1_5(self):
        code = pccm.FunctionCode()
+        code.add_dependency(ThrustLib)
        code.arg("indice_pairs_uniq", "tv::Tensor")
        code.arg("uniq_size", "int64_t")
        code.arg("stream_int", f"std::uintptr_t", "0")
@@ -783,6 +815,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        code = pccm.FunctionCode()
        code.arg("indices, hashdata_k, hashdata_v", "tv::Tensor")
        code.arg("indice_pairs, indice_pairs_uniq, indice_pairs_uniq_before_sort, out_inds", "tv::Tensor")
+        code.arg("indice_num_per_loc", "tv::Tensor")
+
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
@@ -790,6 +824,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
+        code.arg("use_bound_algo", "bool", "false")
+
        code.raw(f"""
        auto custream = reinterpret_cast<cudaStream_t>(stream_int);
        // TODO stream
@@ -798,6 +834,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
        TV_ASSERT_RT_ERR(hashdata_k.dtype() == indice_pairs_uniq.dtype(), "error");
        TV_ASSERT_RT_ERR(hashdata_v.dtype() == tv::int32, "error");
+        auto ctx = tv::Context();
+        ctx.set_cuda_stream(custream);

        // indice_pairs: [2, kv, indices.dim(0)]
        // indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
@@ -805,6 +843,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
        // auto timer = tv::CudaContextTimer<>();
        int64_t uniq_size = indice_pairs.size() / 2 + 1;
        TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= num_out_act, "error");
+        // int num_out_act_bounded = num_out_act;
+        // if (num_out_act_bound > 0){{
+        //     num_out_act_bounded = std::min(num_out_act_bounded, num_out_act);
+        // }}
        TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error");
        tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
        launcher_num_act_in.blocks.y = kv;
@@ -827,11 +869,29 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
            lanucher_build_hash(build_conv_hash_table<table_t>, hash, 
                out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const K>(), 
                loc_iter.layout_npq, num_out_act);
+            if (!use_bound_algo){{
                launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash, 
                    indice_pairs_uniq_before_sort.data_ptr<const K>(),
                    indice_pairs[1].data_ptr<int>(), 
                    indices.dim(0), 
                    indice_pairs.dim(2));
+            }}else{{
+                indice_num_per_loc.zero_(ctx);
+                // copy previous pair in to indice_pairs_uniq
+                // we need to ensure size of indice_pairs_uniq larger than pair in
+                TV_ASSERT_RT_ERR({pccm.literal(self.dtype_indices == dtypes.int32)}, "error");
+                tv::Tensor indice_pairs_in_temp = tv::from_blob(indice_pairs_uniq.raw_data(), {{indice_pairs.dim(1), indice_pairs.dim(2)}}, 
+                    indice_pairs.dtype(), indice_pairs.device());
+                indice_pairs_in_temp.copy_(indice_pairs[0].view(-1), ctx);
+                launcher_num_act_in(calc_conv_indices_stage2_bounded<table_t>, hash, 
+                    indice_pairs_uniq_before_sort.data_ptr<const K>(),
+                    indice_pairs_in_temp.data_ptr<const int>(),
+                    indice_pairs[0].data_ptr<int>(), 
+                    indice_pairs[1].data_ptr<int>(), 
+                    indice_num_per_loc.data_ptr<int>(),
+                    indices.dim(0), 
+                    indice_pairs.dim(2));
+            }}
        }});
        return num_out_act;
        """)
@@ -899,6 +959,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
                 f"tv::array<int, {self.ndim}>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0")
+
        code.raw(f"""
        auto custream = reinterpret_cast<cudaStream_t>(stream_int);
        // TODO stream

--- a/spconv/gencode/__init__.py
+++ b/spconv/gencode/__init__.py
--- a/spconv/gencode/__main__.py
+++ b/spconv/gencode/__main__.py
+import os
+
+import fire
+from cumm.common import CompileInfo
+from cumm.conv.main import ConvMainUnitTest
+from cumm.gemm.main import GemmMainUnitTest
+from pccm.builder.pybind import gen_cmake
+from spconv.core import (IMPLGEMM_SIMT_PARAMS, IMPLGEMM_TURING_PARAMS,
+                         IMPLGEMM_VOLTA_PARAMS, SHUFFLE_SIMT_PARAMS,
+                         SHUFFLE_TURING_PARAMS, SHUFFLE_VOLTA_PARAMS)
+from spconv.csrc.hash.core import HashTable
+from spconv.csrc.sparse.all import SpconvOps
+from spconv.csrc.sparse.alloc import ExternalAllocator
+from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
+                                        ExternalSpconvMatmul, GemmTunerSimple,
+                                        SimpleExternalSpconvMatmul)
+from spconv.csrc.utils import BoxOps
+
+
+def main(include: str,
+         src: str,
+         libname: str = "spconv",
+         prefix: str = "spconvlib"):
+    """Assemble the code-generation units and pass them to ``gen_cmake``.
+
+    Kernel parameter sets flagged ``is_nvrtc`` are excluded before the gemm
+    and conv units are built.
+
+    Args:
+        include: forwarded to ``gen_cmake`` as its third positional argument.
+        src: forwarded to ``gen_cmake`` as its fourth positional argument.
+        libname: forwarded to ``gen_cmake`` as its first positional argument.
+        prefix: forwarded to ``gen_cmake`` as ``namespace_prefix``.
+    """
+    gemm_params = [
+        p for p in (SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS +
+                    SHUFFLE_TURING_PARAMS) if not p.is_nvrtc
+    ]
+    gemm_unit = GemmMainUnitTest(gemm_params)
+    gemm_unit.namespace = "cumm.gemm.main"
+
+    conv_params = [
+        p for p in (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
+                    IMPLGEMM_TURING_PARAMS) if not p.is_nvrtc
+    ]
+    conv_unit = ConvMainUnitTest(conv_params)
+    conv_unit.namespace = "cumm.conv.main"
+
+    gemm_tuner = GemmTunerSimple(gemm_unit)
+    gemm_tuner.namespace = "csrc.sparse.convops.gemmops"
+    conv_tuner = ConvTunerSimple(conv_unit)
+    conv_tuner.namespace = "csrc.sparse.convops.convops"
+    conv_ops = ConvGemmOps(gemm_tuner, conv_tuner)
+    conv_ops.namespace = "csrc.sparse.convops.spops"
+
+    # Units that need no configuration, instantiated in the original order.
+    standalone_units = [
+        SpconvOps(),
+        BoxOps(),
+        HashTable(),
+        CompileInfo(),
+        ExternalAllocator(),
+        ExternalSpconvMatmul(),
+        SimpleExternalSpconvMatmul(),
+    ]
+    units = [gemm_unit, conv_unit, gemm_tuner, conv_tuner, conv_ops,
+             *standalone_units]
+
+    gen_cmake(libname, units, include, src, namespace_prefix=prefix)
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
--- a/spconv/pytorch/conv.py
+++ b/spconv/pytorch/conv.py
@@ -38,20 +38,6 @@ from torch.nn.init import calculate_gain

 FILTER_HWIO = False

-
-def expand_nd(val: Union[int, List[int], Tuple[int, ...]], ndim: int) -> List[int]:
-    if isinstance(val, int):
-        val = [val] * ndim
-    elif isinstance(val, list):
-        assert len(val) == ndim
-    elif isinstance(val, tuple):
-        assert len(val) == ndim
-        return [*val]
-    else:
-        raise NotImplementedError
-    return val
-
-
 class SparseConvolution(SparseModule):
    __constants__ = [
        'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
@@ -82,6 +68,7 @@ class SparseConvolution(SparseModule):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = expand_nd(ndim, kernel_size)
+
        self.stride = expand_nd(ndim, stride)
        kv = int(np.prod(self.kernel_size))
        kv_stride = int(np.prod(self.stride))
@@ -130,7 +117,6 @@ class SparseConvolution(SparseModule):
            # KRSC
            self.weight = Parameter(
                torch.Tensor(out_channels, *self.kernel_size, in_channels))
-
        if bias:
            self.bias = Parameter(torch.Tensor(out_channels))
        else:

--- a/spconv/pytorch/cppcore.py
+++ b/spconv/pytorch/cppcore.py
--- a/spconv/pytorch/ops.py
+++ b/spconv/pytorch/ops.py
--- a/test/apps/testapp.py
+++ b/test/apps/testapp.py
+"""This file can only be used by spconv developers for now:
+"tensorpc" is not an open-source project.
+"""
+
+import tensorpc 
+
+from tensorpc.apps.flow.flowapp import App
+
+
+class TestApp(App):
+    """Empty ``App`` subclass; it defines no behavior of its own."""
+    pass
\ No newline at end of file
--- a/test/benchmark.py
+++ b/test/benchmark.py
@@ -25,7 +25,7 @@ import spconv.pytorch as spconv
 from spconv.utils import Point2VoxelCPU3d

 # torch.backends.cudnn.enabled = False
-def waymo_data(batch_size=1):
+def waymo_data(batch_size=1, num_features=-1):
    gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
                           150000, 1)
    # gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
@@ -35,11 +35,39 @@ def waymo_data(batch_size=1):
    print(pc.shape)
    voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
    voxels = voxels_tv.numpy().reshape(-1, 3)
+    if num_features > 0:
+        voxels = np.zeros((voxels.shape[0], num_features), dtype=voxels.dtype)
    coors = indices_tv.numpy()
    N = coors.shape[0]
    coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
    return voxels, coors, gen.grid_size

+def waymo_data_large(batch_size=1):
+    """Build a larger benchmark input from stacked, shifted point clouds.
+
+    Loads ``data/benchmark-pc.npz``, appends four copies whose column 1 is
+    shifted by 1..4, and runs the result through ``point_to_voxel``.
+    Returns ``(voxels, coors, grid_size)``; ``batch_size`` is unused.
+    """
+    voxel_gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
+                                 1200000, 1)
+    npz_path = Path(__file__).parent / "data" / "benchmark-pc.npz"
+    base_pc = np.ascontiguousarray(np.load(npz_path)["pc"])
+    clouds = [base_pc]
+    for shift in range(1, 5):
+        shifted = base_pc.copy()
+        shifted[:, 1] += shift  # in-place add, same as the original copies
+        clouds.append(shifted)
+    pc = np.concatenate(clouds)
+    print(pc.shape)
+    voxels_tv, indices_tv, _ = voxel_gen.point_to_voxel(tv.from_numpy(pc))
+    voxels = voxels_tv.numpy().reshape(-1, 3)
+    coors = indices_tv.numpy()
+    num_voxels = coors.shape[0]
+    print("num voxels", num_voxels)
+    batch_col = np.full([num_voxels, 1], 0, coors.dtype)
+    return voxels, np.concatenate([batch_col, coors], axis=1), voxel_gen.grid_size
+

 class Net(nn.Module):
    def __init__(self, shape, algo):
@@ -61,6 +89,21 @@ class Net(nn.Module):
            # #                   algo=algo),
            # spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
            #                   algo=algo),
+
+            # spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0",
+            #                   algo=algo),
+            # spconv.SubMConv3d(32,
+            #                   32,
+            #                   3,
+            #                   bias=False,
+            #                   indice_key="c0",
+            #                   algo=algo),
+            # # nn.BatchNorm1d(32),
+            # # nn.ReLU(),
+            # # spconv.SparseConv3d(64, 64, 2, 2, bias=False,
+            # #                   algo=algo),
+            # spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
+            #                   algo=algo),
            spconv.SubMConv3d(64,
                              64,
                              3,
@@ -275,7 +318,7 @@ def main():
    import pickle
    np.random.seed(50051)
    torch.manual_seed(50051)
-    # voxels, coors, spatial_shape = waymo_data()
+    # voxels, coors, spatial_shape = waymo_data(num_features=128)
    # with open("/home/yy/test_spconv.pkl", "wb") as f:
    #     pickle.dump((voxels, coors, spatial_shape), f)
    with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
@@ -312,7 +355,7 @@ def main():
    # MaskImpGemm: 51.0ms
    # MaskSplitImpGemm: 41.1ms
    # algo = None
-    net = Net(spatial_shape, algo).to(device).eval().to(dtype)# .train()
+    net = Net(spatial_shape, algo).to(device).eval().to(dtype).train()
    # net.load_state_dict(net.state_dict())
    spconv.assign_name_for_sparse_modules(net)
    print(coors_th.shape)
@@ -345,18 +388,18 @@ def main():
    print("spconv time", np.mean(times[10:]))
    times = []

-    # for i in range(10):
-    #     out = net(voxels_th, coors_th, 1)
-    #     print("------------")
-    #     torch.cuda.synchronize()
-    #     t = time.time()
-    #     out.features.backward(dout_t)
-    #     torch.cuda.synchronize()
-    #     times.append(time.time() - t)
-
-    # # # print((net.grid == -1).float().sum(), net.grid.numel())
-    # # # print("spconv time", time.time() - t)
-    # print("spconv bw time", np.mean(times[5:]))
+    for i in range(10):
+        out = net(voxels_th, coors_th, 1)
+        print("------------")
+        torch.cuda.synchronize()
+        t = time.time()
+        out.features.backward(dout_t)
+        torch.cuda.synchronize()
+        times.append(time.time() - t)
+
+    # # print((net.grid == -1).float().sum(), net.grid.numel())
+    # # print("spconv time", time.time() - t)
+    print("spconv bw time", np.mean(times[5:]))


 if __name__ == "__main__":

--- a/test/test_all_algo.py
+++ b/test/test_all_algo.py
--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -248,7 +248,7 @@ def test_spconv3d():
        ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
        ConvAlgo.MaskSplitImplicitGemm
    ]
-    # algos = [ConvAlgo.Native]
+    algos = [ConvAlgo.Native]

    for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
            devices, shapes, batchsizes, in_channels, out_channels, ksizes,
@@ -308,7 +308,6 @@ def test_spconv3d():
            filters_t = torch.from_numpy(filters).to(device).to(dtype)
            net_ref.net[0].weight.data[:] = filters_t.permute(
                0, 4, 1, 2, 3).contiguous()
-
        net.net[0].weight.data[:] = filters_t
        out_ref = net_ref(features_dense_t)
        out = net(features_t, indices_t, bs).dense()
@@ -529,4 +528,4 @@ def test_spmaxpool3d():


 if __name__ == "__main__":
-    test_spmaxpool3d()
+    test_spconv3d()
--- a/test/test_multi_impl.py
+++ b/test/test_multi_impl.py
@@ -222,6 +222,7 @@ class NetLight(nn.Module):


 def _test_multi_impl(dtype: torch.dtype):
+    # TODO: PyTorch 1.12 doesn't support half-precision matmul on CPU (CPU backward is skipped for float16)
    # TODO remove or release this when tf32 op is ready
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False
@@ -239,8 +240,6 @@ def _test_multi_impl(dtype: torch.dtype):
            np.float32)
        coors = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
-
-
    device = torch.device("cuda:0")
    device_cpu = torch.device("cpu:0")

@@ -275,17 +274,21 @@ def _test_multi_impl(dtype: torch.dtype):
    dout_t = torch.from_numpy(dout).to(device_cpu).to(dtype)
    dout_t_cu = torch.from_numpy(dout).to(device).to(dtype)

-
-
+    t = time.time()
+    print(1, time.time() - t)
    out_cpu = net_native_cpu(voxels_th, coors_th, 1).dense()
+    if dtype != torch.float16:
        out_cpu.backward(dout_t)
    out = net_native_gpu(voxels_th_cuda, coors_th_cuda, 1).dense()
+    print(2, time.time() - t)

    out.backward(dout_t_cu)
    out_imp = net_imp_gpu(voxels_th_cuda, coors_th_cuda, 1).dense()
+    print(3, time.time() - t)

    out_imp.backward(dout_t_cu)
    out_simp = net_simp_gpu(voxels_th_cuda, coors_th_cuda, 1).dense()
+    print(4, time.time() - t)

    out_simp.backward(dout_t_cu)
    with torch.no_grad():
@@ -297,6 +300,7 @@ def _test_multi_impl(dtype: torch.dtype):
        error_native = torch.linalg.norm(dense_cpu - dense_native).cpu().item()
        error_imp = torch.linalg.norm(dense_cpu - dense_imp).cpu().item()
        error_simp = torch.linalg.norm(dense_cpu - dense_simp).cpu().item()
+    print(5, time.time() - t)

    print("error_native", error_native)
    print("error_imp", error_imp)
@@ -320,15 +324,15 @@ def _test_multi_impl(dtype: torch.dtype):
        native_w = native_params[k]
        imp_w = imp_params[k]
        simp_w = simp_params[k]
-        cpu_w_grad = cpu_w.grad.detach().cuda()
        native_w_grad = native_w.grad.detach()
        imp_w_grad = imp_w.grad.detach()
        simp_w_grad = simp_w.grad.detach()
-
+        if dtype != torch.float16:
+            cpu_w_grad = cpu_w.grad.detach().cuda()
            error_native = torch.linalg.norm(native_w_grad - cpu_w_grad).cpu().item()
        error_imp = torch.linalg.norm(native_w_grad - imp_w_grad).cpu().item()
        error_simp = torch.linalg.norm(native_w_grad - simp_w_grad).cpu().item()
-        print(k, error_native, error_imp, error_simp)
+        print(k, error_imp, error_simp)
        assert error_imp < 1
        assert error_simp < 1