detailed libspconv example

48c8434d · yan.yan · 16a8cb24 · 48c8434d · 48c8434d · 48c8434d
Commit 48c8434d authored Sep 27, 2022 by yan.yan
5 changed files
--- a/README.md
+++ b/README.md
@@ -74,6 +74,7 @@ Update Spconv: you **MUST UNINSTALL** all spconv/cumm/spconv-cuxxx/cumm-cuxxx fi
 * nvrtc support: kernel in old GPUs will be compiled in runtime.
 * [libspconv](docs/PURE_CPP_BUILD.md): pure c++ build of all spconv ops. see [example](example/libspconv/run_build.sh)
 * tf32 kernels, faster fp32 training, disabled by default. set ```import spconv as spconv_core; spconv_core.constants.SPCONV_ALLOW_TF32 = True``` to enable them.
+* all weights are KRSC layout, some old model can't be loaded anymore.


 ## Spconv 2.1 vs Spconv 1.x

--- a/docs/PURE_CPP_BUILD.md
+++ b/docs/PURE_CPP_BUILD.md
@@ -2,6 +2,8 @@

 Spconv provide a way to generate sources to build a C++ library with all op needed for inference and train.

+The generated libspconv has been deployed to tensorrt and run in real automatic driving vehicles.
+
 ## Steps

 1. Install spconv, Install cumm, Install cumm cmake, or add it as a sub directory

--- a/example/libspconv/README.md
+++ b/example/libspconv/README.md
+# libspconv
+
+libspconv + pybindings = "core_cc.so" in spconv python package.
+
 ## libspconv Example

 run ```run_build.sh``` to get ```libspconv.so```.

--- a/example/libspconv/main.cu
+++ b/example/libspconv/main.cu
+/*
+libspconv example for INFERENCE, not TRAINING.
+*/
+
 #include <spconvlib/cumm/gemm/main/GemmMainUnitTest.h>
 #include <spconvlib/spconv/csrc/sparse/all/SpconvOps.h>
+#include <spconvlib/spconv/csrc/sparse/all/ops3d/Point2Voxel.h>
 #include <spconvlib/spconv/csrc/sparse/alloc/StaticAllocator.h>
 #include <spconvlib/spconv/csrc/sparse/convops/spops/ConvGemmOps.h>
 #include <spconvlib/spconv/csrc/sparse/inference/InferenceOps.h>
-#include <spconvlib/spconv/csrc/sparse/all/ops3d/Point2Voxel.h>

 #include <spconvlib/spconv/csrc/sparse/convops/SimpleExternalSpconvMatmul.h>

@@ -20,149 +24,423 @@ using ConvGemmOps =
    spconvlib::spconv::csrc::sparse::convops::spops::ConvGemmOps;
 using SimpleExternalSpconvMatmul =
    spconvlib::spconv::csrc::sparse::convops::SimpleExternalSpconvMatmul;
-using InferenceOps =
-    spconvlib::spconv::csrc::sparse::inference::InferenceOps;
+using InferenceOps = spconvlib::spconv::csrc::sparse::inference::InferenceOps;
 using Point2VoxelGPU3D =
    spconvlib::spconv::csrc::sparse::all::ops3d::Point2Voxel;
+using GemmMain = spconvlib::cumm::gemm::main::GemmMainUnitTest;
+using GemmTunerSimple =
+    spconvlib::spconv::csrc::sparse::convops::spops::GemmTuner;

-int main(int argc, char** argv){
-    tv::ssprint("Hello libspconv!!!");
-    TV_ASSERT_RT_ERR(argc == 2, "usage: main /path/to/benchmark-pc.jarr, you can find it in example/libspconv.")
-    std::string path = argv[1];
-    Point2VoxelGPU3D p2v{{0.1, 0.1, 0.1}, {-80, -80, -2, 80, 80, 6}, 3, 200000, 1};
-    auto pc_jarr = tv::io::load_from_file(path);
-    auto pc = pc_jarr.tensors.at(0).cuda();
-    // you should use point_to_voxel_hash_static in tensorrt and manage hash data in tensorrt workspace.
-    auto p2v_res = p2v.point_to_voxel_hash(pc);
-    tv::Tensor voxels = std::get<0>(p2v_res).cuda().view(-1, 3);
-    auto indices_without_bs = std::get<1>(p2v_res);
-    auto indices = tv::zeros({indices_without_bs.dim(0), 4}, tv::int32, 0);
-    indices.slice(1, 1, 4, 1, false, false).copy_2d_pitched_(indices_without_bs);
-    auto indices_cpu = indices.cpu();
-    auto indices_cpu_data_ptr = indices_cpu.data_ptr<int32_t>();
-    for (int i = 0; i < 5; ++i){
-        auto cur_indices_cpu_data_ptr = indices_cpu_data_ptr + i * 4;
-        tv::ssprint(cur_indices_cpu_data_ptr[0], cur_indices_cpu_data_ptr[1], cur_indices_cpu_data_ptr[2], cur_indices_cpu_data_ptr[3]);
-    }
-    auto num_per_voxel = std::get<2>(p2v_res);
-    tv::ssprint("num voxels", voxels.shape());
-    auto voxels_f16 = tv::zeros(voxels.shape(), tv::float16, 0);
-    auto voxels_f16_ptr = voxels_f16.data_ptr<__half>();
-    auto voxels_ptr = voxels.data_ptr<float>();
-    tv::kernel_1d_map(0, voxels_f16.size(), [=]TV_GPU_LAMBDA(size_t i)mutable{
-        voxels_f16_ptr[i] = __half(voxels_ptr[i]);
-    });
-    // out channels, ksize, in channels
-    tv::Tensor weights = tv::zeros({64, 3, 3, 3, 3}, tv::float16, 0);
-    tv::Tensor bias = tv::zeros({64}, tv::float16, 0);
-
-    int KV = 27;
-    int out_inds_num_limit = 100000; // upper bound of number of output indices.
-    std::vector<int32_t> ksize{3, 3, 3};
-    std::vector<int32_t> padding{1, 1, 1};
-    std::vector<int32_t> dilation{1, 1, 1};
-    std::vector<int32_t> stride{1, 1, 1};
-    int ndim = 3;
-    auto p2v_grid_size = p2v.get_grid_size();
-    std::vector<int32_t> input_dims(p2v_grid_size.begin(),
-                                         p2v_grid_size.end());
-    auto out_dims = SpconvOps::get_conv_output_size(input_dims, ksize, stride, padding, dilation);
-    tv::ssprint(ksize, input_dims, out_dims);
-
-    std::vector<int64_t> output_dims_i64(out_dims.begin(),
-                                         out_dims.end());
-    int64_t out_spatial_volume =
-        std::accumulate(output_dims_i64.begin(), output_dims_i64.end(),
-                        int64_t(1), std::multiplies<int64_t>());
-    bool use_int64_hash_k =
-        out_spatial_volume >= int64_t(std::numeric_limits<int>::max());
-    int num_act_in = voxels.dim(0);
-    bool is_subm = true;
-
-    bool direct_table = true;
-    int batch_size = 1;
-    int transpose = false;
-    bool use_direct_table = direct_table && !is_subm;
-    auto conv_algo = tv::gemm::SparseConvAlgo::kMaskImplicitGemm;
-    auto max_act_out_theory = SpconvOps::get_handcrafted_max_act_out(num_act_in, 
-        ksize, stride, padding, dilation);
-    
-    int workspace_size = SpconvOps::get_indice_gen_workspace_size(
-        KV, num_act_in, out_inds_num_limit, max_act_out_theory, is_subm,
-        use_int64_hash_k, use_direct_table);
-    // you should return workspace size in tensorrt plugin method.
-    tv::Tensor workspace = tv::empty({workspace_size}, tv::uint8, 0);
-    // get tensor map required by pair gen from workspace
-    auto ws_tensors = SpconvOps::get_indice_gen_tensors_from_workspace(
-        workspace.raw_data(), KV, num_act_in, is_subm ? num_act_in : out_inds_num_limit,
-        max_act_out_theory, is_subm, use_int64_hash_k, use_direct_table);
-    // create output tensors and insert them to static allocator
-    int pair_size = is_subm ? num_act_in : out_inds_num_limit;
-    tv::Tensor pair_fwd = tv::empty({KV, pair_size}, tv::int32, 0);
-    bool is_split_mask =
-        conv_algo == tv::gemm::SparseConvAlgo::kMaskSplitImplicitGemm;
-    int mask_count = is_split_mask ? 2 : 1;
-    tv::Tensor pair_mask_fwd = tv::empty({mask_count, pair_size}, tv::int32, 0);
-    tv::Tensor mask_argsort_fwd = tv::empty({mask_count, pair_size}, tv::int32, 0);
-    tv::Tensor out_inds = tv::empty({out_inds_num_limit, ndim + 1}, tv::int32, 0);
-    tv::Tensor indices_kernel_num = tv::zeros({KV}, tv::int32, 0);
-    cudaStream_t stream = 0;
-    ws_tensors.insert({SPCONV_ALLOC_PAIR_FWD, pair_fwd});
-    ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK, pair_mask_fwd});
-    ws_tensors.insert({SPCONV_ALLOC_MASK_ARG_SORT, mask_argsort_fwd});
-    ws_tensors.insert({SPCONV_ALLOC_OUT_INDICES, out_inds});
-    ws_tensors.insert({SPCONV_ALLOC_INDICE_NUM_PER_LOC, indices_kernel_num});
-    StaticAllocator alloc(ws_tensors);
-    auto pair_res = SpconvOps::get_indice_pairs_implicit_gemm(
-        alloc, indices, batch_size, input_dims, static_cast<int>(conv_algo),
-        ksize, stride, padding, dilation, {0, 0, 0},
-        is_subm, transpose, false,
-        reinterpret_cast<std::uintptr_t>(stream), out_inds_num_limit,
-        tv::CUDAKernelTimer(false), use_direct_table);
-    int num_act_out = std::get<1>(pair_res);
-    tv::Tensor out_features = tv::empty({num_act_out, 64}, tv::float16, 0);
-
-    // this function is very slow, don't forget to cache result.
-    auto arch = ConvGemmOps::get_compute_capability();
-    int kv = pair_fwd.dim(0);
-    bool is_mask_split = pair_mask_fwd.dim(0) > 1;
-    int mask_split_cnt = pair_mask_fwd.dim(0);
-    tv::Tensor mask_tensor =
-        tv::zeros({pair_mask_fwd.dim(0)}, tv::uint32, -1);
-    auto mask_tensor_ptr = mask_tensor.data_ptr<uint32_t>();
-    if (is_mask_split) {
-      auto kv_div_2 = kv / 2;
-      auto remain = kv - kv_div_2;
-      uint64_t mask_np_1 = 1;
-      uint64_t first = ((mask_np_1 << remain) - 1);
-      uint64_t second = ((mask_np_1 << kv_div_2) - 1) << remain;
-      mask_tensor_ptr[0] = uint32_t(first);
-      mask_tensor_ptr[1] = uint32_t(second);
+int main(int argc, char **argv) {
+
+  tv::ssprint("Hello libspconv!!!");
+  TV_ASSERT_RT_ERR(argc == 2, "usage: main /path/to/benchmark-pc.jarr, you can "
+                              "find it in example/libspconv.")
+  std::string path = argv[1];
+  int ndim = 3;
+  int max_num_voxels = 200000;
+  int num_point_per_voxel = 1;
+  std::array<float, 3> vsize_xyz{0.1, 0.1, 0.1};
+
+  Point2VoxelGPU3D p2v{vsize_xyz,
+                       {-80, -80, -2, 80, 80, 6},
+                       3,
+                       max_num_voxels,
+                       num_point_per_voxel};
+  // load pc from tv::io::JsonArray, can be created from cumm.tensorview.tvio
+  auto pc_jarr = tv::io::load_from_file(path);
+  auto pc = pc_jarr.tensors.at(0).cuda();
+  // in real inference engine, you need to set a upper bound for a point cloud,
+  // then pad or truncate your point cloud. ignored in this example.
+  auto point_limit = pc.dim(0);
+  // use simple method, workspace tensor is created internally in
+  // Point2VoxelGPU3D.
+  auto p2v_res_simple = p2v.point_to_voxel_hash(pc);
+  // or manage workspace by your self (tensorrt)
+  // outputs
+  tv::Tensor voxels_padded = tv::empty(
+      {max_num_voxels, num_point_per_voxel, pc.dim(1)}, tv::float32, 0);
+  tv::Tensor indices_padded_no_batch = tv::empty(
+      {max_num_voxels, ndim /*batch size not included here*/}, tv::int32, 0);
+  tv::Tensor num_per_voxel_padded = tv::empty({max_num_voxels}, tv::int32, 0);
+  // workspaces
+  tv::Tensor hash_key_value =
+      tv::empty({point_limit * 2 /*you should use limit * 2 as hash size*/},
+                tv::custom128 /*always use int64*2 as key and value type*/,
+                0 /*0 is current GPU, -1 is CPU*/);
+  // internal usage
+  tv::Tensor point_indice_data = tv::empty({point_limit}, tv::int64, 0);
+  // indicate voxel id for every point, -1 if no voxel assigned. used for
+  // segmentation models.
+  tv::Tensor points_voxel_id = tv::empty({point_limit}, tv::int64, 0);
+
+  auto p2v_res = Point2VoxelGPU3D::point_to_voxel_hash_static(
+      pc, voxels_padded, indices_padded_no_batch, num_per_voxel_padded,
+      hash_key_value, point_indice_data, points_voxel_id, vsize_xyz,
+      tv::arrayops::to_std_array(p2v.grid_size),
+      tv::arrayops::to_std_array(p2v.grid_stride),
+      tv::arrayops::to_std_array(p2v.coors_range));
+  tv::Tensor voxels = std::get<0>(p2v_res).cuda().view(-1, 3);
+  int real_num_voxels = voxels.dim(0);
+  // add a batch index to previous indices.
+  auto indices_padded =
+      tv::zeros({indices_padded_no_batch.dim(0), 4}, tv::int32, 0);
+  indices_padded.slice(1, 1, 4, 1, false, false)
+      .copy_2d_pitched_(indices_padded_no_batch);
+  auto indices_cpu = indices_padded.cpu();
+  auto indices_cpu_data_ptr = indices_cpu.data_ptr<int32_t>();
+  for (int i = 0; i < 5; ++i) {
+    auto cur_indices_cpu_data_ptr = indices_cpu_data_ptr + i * 4;
+    tv::ssprint(cur_indices_cpu_data_ptr[0], cur_indices_cpu_data_ptr[1],
+                cur_indices_cpu_data_ptr[2], cur_indices_cpu_data_ptr[3]);
+  }
+
+  auto num_per_voxel = std::get<2>(p2v_res);
+  // convert voxels from f32 to f16.
+  voxels_padded = voxels_padded.view(-1, pc.dim(1));
+  tv::ssprint("num voxels", voxels.shape(), voxels_padded.shape());
+  auto voxels_f16_padded = tv::zeros(voxels_padded.shape(), tv::float16, 0);
+  auto voxels_f16_ptr = voxels_f16_padded.data_ptr<__half>();
+  auto voxels_ptr = voxels_padded.data_ptr<float>();
+  tv::kernel_1d_map(0, voxels_f16_padded.size(),
+                    [=] TV_GPU_LAMBDA(size_t i) mutable {
+                      voxels_f16_ptr[i] = __half(voxels_ptr[i]);
+                    });
+  // create simple weights and bias.
+  std::vector<int32_t> ksize{3, 3, 3};
+  std::vector<int32_t> padding{1, 1, 1};
+  std::vector<int32_t> dilation{1, 1, 1};
+  std::vector<int32_t> stride{1, 1, 1};
+  // weight layout is KRSC, [out channels, *ksize, in channels]
+  int KV = ksize[0] * ksize[1] * ksize[2]; // kernel volume
+
+  tv::Tensor weights = tv::zeros({64, 3, 3, 3, 3}, tv::float16, 0);
+  tv::Tensor bias = tv::zeros({64}, tv::float16, 0);
+  // in inference engine, the number of output points must have a upper bound.
+  // the shape from point2voxel is input shape.
+  auto p2v_grid_size = p2v.get_grid_size();
+  std::vector<int32_t> input_dims(p2v_grid_size.begin(), p2v_grid_size.end());
+  // calc output dims
+  auto out_dims = SpconvOps::get_conv_output_size(input_dims, ksize, stride,
+                                                  padding, dilation);
+  tv::ssprint(ksize, input_dims, out_dims);
+
+  std::vector<int64_t> output_dims_i64(out_dims.begin(), out_dims.end());
+  // if shape is too large, we will use slower int64->int32 hash table instead
+  // of int32->int32 table.
+  int64_t out_spatial_volume =
+      std::accumulate(output_dims_i64.begin(), output_dims_i64.end(),
+                      int64_t(1), std::multiplies<int64_t>());
+  bool use_int64_hash_k =
+      out_spatial_volume >= int64_t(std::numeric_limits<int>::max());
+  int static_num_act_in = voxels_padded.dim(0);
+  // static_num_act_in is just out_inds_num_limit of previous conv layer.
+  // for regular conv, the input tensor has static shape, we should save a CPU
+  // variable of real num_act_out. here we just use num_act_in.
+  int real_num_act_in = real_num_voxels;
+  // you need to slice all static inputs with real_num_act_in in static
+  // inference engine, e.g. tensorrt. here we don't need to do that.
+
+  // our algorithm support upper bound
+  int out_inds_num_limit = 100000; // upper bound of number of output indices.
+
+  int batch_size = 1;
+  // we shouldn't use standard transpose in any code.
+  int transpose = false;
+  cudaStream_t stream = 0;
+  tv::Context ctx;
+  ctx.set_cuda_stream_int(reinterpret_cast<std::uintptr_t>(stream));
+  // get_compute_capability is very slow, don't forget to cache arch result.
+  auto arch = ConvGemmOps::get_compute_capability();
+  // remove pad
+  auto input_features_real =
+      voxels_f16_padded.slice_first_axis(0, real_num_act_in);
+  auto input_indices_real = indices_padded.slice_first_axis(0, real_num_act_in);
+
+  for (int i = 0; i < 2; ++i) {
+    // if your kernel volume > 32, you need to use
+    // tv::gemm::SparseConvAlgo::kNative. otherwise use kMaskImplicitGemm.
+    if (i == 0) {
+      auto conv_algo = tv::gemm::SparseConvAlgo::kNative;
+      bool inverse = false;
+      // native algo code example
+      tv::ssprint("native example");
+      for (int j = 0; j < 2; ++j) {
+        bool is_subm = j == 0;
+        int workspace_size = SpconvOps::get_indice_gen_workspace_size(
+            KV, static_num_act_in, out_inds_num_limit,
+            0 /*used in implicit gemm*/, is_subm, use_int64_hash_k,
+            false /*used in implicit gemm*/);
+        // you should return workspace size in tensorrt plugin method.
+        tv::Tensor workspace = tv::empty({workspace_size}, tv::uint8, 0);
+        // get tensor map required by pair gen from workspace
+        // keep in mind that our indice gen function use a "allocator" to alloc
+        // temp/out tensors, in python we use TorchAllocator which is a simple
+        // dynamic allocator, in c++ (inference engine) we need to use
+        // fixed-size workspace and create a static allocator.
+        auto ws_tensors = SpconvOps::get_indice_gen_tensors_from_workspace(
+            workspace.raw_data(), KV, static_num_act_in,
+            is_subm ? static_num_act_in : out_inds_num_limit, 0, is_subm,
+            use_int64_hash_k, false);
+        // unlike implicit gemm, native pair isn't tight padded.
+        tv::Tensor pair = tv::empty({2, KV, static_num_act_in}, tv::int32, 0);
+        tv::Tensor indices_kernel_num = tv::zeros({KV}, tv::int32, 0);
+        tv::Tensor out_inds = tv::empty(
+            {is_subm ? static_num_act_in : out_inds_num_limit, ndim + 1},
+            tv::int32, 0);
+
+        ws_tensors.insert({SPCONV_ALLOC_PAIR_FWD, pair});
+        ws_tensors.insert(
+            {SPCONV_ALLOC_INDICE_NUM_PER_LOC, indices_kernel_num});
+        ws_tensors.insert({SPCONV_ALLOC_OUT_INDICES, out_inds});
+
+        StaticAllocator alloc(ws_tensors);
+
+        int num_act_out_real = SpconvOps::get_indice_pairs(
+            alloc, input_indices_real, batch_size, out_dims,
+            static_cast<int>(tv::gemm::SparseConvAlgo::kNative), ksize, stride,
+            padding, dilation, {0, 0, 0}, is_subm, false,
+            reinterpret_cast<std::uintptr_t>(stream), out_inds_num_limit,
+            static_num_act_in);
+        tv::Tensor out_features_padded =
+            tv::empty({is_subm ? static_num_act_in : out_inds_num_limit, 64},
+                      tv::float16, 0);
+        auto out_features_real =
+            out_features_padded.slice_first_axis(0, num_act_out_real);
+        // subm contains a regular gemm
+        // so we need to slice input and output
+        GemmTunerSimple gemm_tuner(GemmMain::get_all_algo_desp());
+        if (is_subm) {
+          // unlike regular conv, indices_kernel_num only [0:KV / 2] contains
+          // valid numbers, [KV / 2:] is zero so we need to slice input and
+          // output with real size to tell indice_conv the center size.
+          std::unordered_map<std::string, tv::Tensor> tensor_dict{
+              {SPCONV_ALLOC_FEATURES, input_features_real},
+              {SPCONV_ALLOC_FILTERS, weights},
+              {SPCONV_ALLOC_OUT_FEATURES, out_features_real}};
+          StaticAllocator alloc2(tensor_dict);
+          // the SimpleExternalSpconvMatmul is used to perform bias operations
+          // provided by external bias library such as cublasLt. in pytorch this
+          // class use pytorch matmul.
+          SimpleExternalSpconvMatmul ext_mm(alloc2);
+
+          ConvGemmOps::indice_conv(
+              alloc2, ext_mm, gemm_tuner, true, false, input_features_real,
+              weights, pair, indices_kernel_num, arch, out_features_real.dim(0),
+              inverse, is_subm,
+              static_cast<int>(tv::gemm::SparseConvAlgo::kNative),
+              reinterpret_cast<std::uintptr_t>(stream), bias, 1.0,
+              /*bias alpha, only used for leaky relu*/, 0.0,
+              tv::gemm::Activation::kReLU);
+        } else {
+          // regular conv use numbers in indices_kernel_num to perform gemm
+          // so we don't need to slice.
+          std::unordered_map<std::string, tv::Tensor> tensor_dict{
+              {SPCONV_ALLOC_FEATURES, voxels_f16_padded},
+              {SPCONV_ALLOC_FILTERS, weights},
+              {SPCONV_ALLOC_OUT_FEATURES, out_features_padded}};
+          StaticAllocator alloc2(tensor_dict);
+          // the SimpleExternalSpconvMatmul is used to perform bias operations
+          // provided by external bias library such as cublasLt. in pytorch this
+          // class use pytorch matmul.
+          SimpleExternalSpconvMatmul ext_mm(alloc2);
+          ConvGemmOps::indice_conv(
+              alloc2, ext_mm, gemm_tuner, true, false, voxels_f16_padded,
+              weights, pair, indices_kernel_num, arch,
+              out_features_padded.dim(0), inverse, is_subm,
+              static_cast<int>(tv::gemm::SparseConvAlgo::kNative),
+              reinterpret_cast<std::uintptr_t>(stream), bias, 1.0, 0.0,
+              tv::gemm::Activation::kReLU);
+        }
+      }
    } else {
-      mask_tensor_ptr[0] = 0xffffffff;
-    }
-    std::vector<tv::Tensor> pair_mask_splits;
-    std::vector<tv::Tensor> mask_argsort_splits;
-    for (int i = 0; i < mask_split_cnt; ++i) {
-      pair_mask_splits.push_back(
-          pair_mask_fwd[i]);
-      mask_argsort_splits.push_back(
-          mask_argsort_fwd[i]);
+      auto conv_algo = tv::gemm::SparseConvAlgo::kMaskImplicitGemm;
+      tv::ssprint("implicit gemm example");
+      // implicit gemm code example
+      for (int j = 0; j < 2; ++j) {
+        bool is_subm = j == 0;
+        // direct table: a hash based algorithm that don't need unique. enabled
+        // by default.
+        bool direct_table = true;
+        // only regular conv need direct table.
+        bool use_direct_table = direct_table && !is_subm;
+        auto max_act_out_theory = SpconvOps::get_handcrafted_max_act_out(
+            real_num_act_in, ksize, stride, padding, dilation);
+        // query workspace size.
+        int workspace_size = SpconvOps::get_indice_gen_workspace_size(
+            KV, real_num_act_in, out_inds_num_limit, max_act_out_theory,
+            is_subm, use_int64_hash_k, use_direct_table);
+        // you should return workspace size in tensorrt plugin method.
+        tv::Tensor workspace = tv::empty({workspace_size}, tv::uint8, 0);
+        // get tensor map required by pair gen from workspace
+        // keep in mind that our indice gen function use a "allocator" to alloc
+        // temp/out tensors, in python we use TorchAllocator which is a simple
+        // dynamic allocator, in c++ (inference engine) we need to use
+        // fixed-size workspace and create a static allocator.
+        auto ws_tensors = SpconvOps::get_indice_gen_tensors_from_workspace(
+            workspace.raw_data(), KV, real_num_act_in,
+            is_subm ? real_num_act_in : out_inds_num_limit, max_act_out_theory,
+            is_subm, use_int64_hash_k, use_direct_table);
+        // pair can also have a upper bound.
+        // !!!!!IMPORTANT!!!!!!! if you provide a static (padded) pair_fwd and
+        // other indice data, the output layout is tight pair_fwd_correct =
+        // pair_fwd_padded.view(-1)[:KV * real_pair_size].view(KV,
+        // real_pair_size) this valid for pair_fwd, pair_bwd, pair_mask_fwd,
+        // pair_mask_bwd, mask_argsort_fwd, mask_argsort_bwd.
+        int pair_fwd_size_padded =
+            is_subm ? real_num_act_in : out_inds_num_limit;
+        tv::Tensor pair_fwd_padded =
+            tv::empty({KV, pair_fwd_size_padded}, tv::int32, 0);
+        // you can find equivalent python code of following code in python
+        // package
+        bool is_split_mask =
+            conv_algo == tv::gemm::SparseConvAlgo::kMaskSplitImplicitGemm;
+        int mask_count = is_split_mask ? 2 : 1;
+        tv::Tensor pair_mask_fwd_padded =
+            tv::empty({mask_count, pair_fwd_size_padded}, tv::int32, 0);
+        tv::Tensor mask_argsort_fwd_padded =
+            tv::empty({mask_count, pair_fwd_size_padded}, tv::int32, 0);
+        tv::Tensor out_inds = tv::empty(
+            {is_subm ? static_num_act_in : out_inds_num_limit, ndim + 1},
+            tv::int32, 0);
+        tv::Tensor indices_kernel_num = tv::zeros({KV}, tv::int32, 0);
+        std::tuple<tv::Tensor, int> pair_res;
+        if (is_subm) {
+          // subm out inds equal to input inds, just copy them
+          out_inds.copy_(indices_padded, ctx);
+
+          // subm exmaple
+          // create output tensors and insert them to static allocator
+          // output tensors needed in subm get_indice_pairs_implicit_gemm,
+          // saved to static allocator.
+          ws_tensors.insert({SPCONV_ALLOC_PAIR_FWD, pair_fwd_padded});
+          ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK, pair_mask_fwd_padded});
+          ws_tensors.insert(
+              {SPCONV_ALLOC_MASK_ARG_SORT, mask_argsort_fwd_padded});
+          ws_tensors.insert({SPCONV_ALLOC_OUT_INDICES, out_inds});
+          ws_tensors.insert(
+              {SPCONV_ALLOC_INDICE_NUM_PER_LOC, indices_kernel_num});
+          StaticAllocator alloc(ws_tensors);
+
+          pair_res = SpconvOps::get_indice_pairs_implicit_gemm(
+              alloc, input_indices_real, batch_size, input_dims,
+              static_cast<int>(conv_algo), ksize, stride, padding, dilation,
+              {0, 0, 0}, is_subm, transpose, false /*is_train*/,
+              reinterpret_cast<std::uintptr_t>(stream), out_inds_num_limit,
+              tv::CUDAKernelTimer(false), use_direct_table);
+          // for subm num_act_out always equal to num_act_in_real
+
+        } else {
+          // WARNING be careful with inverse conv, understand python
+          // code first. no inverse example here.
+          // regular conv need more outputs, used for inversed conv.
+          // bwd shape is [KV, static num_act_in (previous num_act_out_bound)]
+          tv::Tensor pair_bwd_padded =
+              tv::empty({KV, static_num_act_in}, tv::int32, 0);
+          tv::Tensor pair_mask_bwd_padded =
+              tv::empty({mask_count, static_num_act_in}, tv::int32, 0);
+          tv::Tensor mask_argsort_bwd_padded =
+              tv::empty({mask_count, static_num_act_in}, tv::int32, 0);
+
+          ws_tensors.insert({SPCONV_ALLOC_PAIR_FWD, pair_fwd_padded});
+          ws_tensors.insert({SPCONV_ALLOC_PAIR_BWD, pair_bwd_padded});
+
+          ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK, pair_mask_fwd_padded});
+          ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK_BWD, pair_mask_bwd_padded});
+
+          ws_tensors.insert(
+              {SPCONV_ALLOC_MASK_ARG_SORT, mask_argsort_fwd_padded});
+          ws_tensors.insert(
+              {SPCONV_ALLOC_MASK_ARG_SORT_BWD, mask_argsort_bwd_padded});
+
+          ws_tensors.insert({SPCONV_ALLOC_OUT_INDICES, out_inds});
+          ws_tensors.insert(
+              {SPCONV_ALLOC_INDICE_NUM_PER_LOC, indices_kernel_num});
+
+          StaticAllocator alloc(ws_tensors);
+
+          pair_res = SpconvOps::get_indice_pairs_implicit_gemm(
+              alloc, input_indices_real, batch_size, input_dims,
+              static_cast<int>(conv_algo), ksize, stride, padding, dilation,
+              {0, 0, 0}, is_subm, transpose, false /*is_train*/,
+              reinterpret_cast<std::uintptr_t>(stream), out_inds_num_limit,
+              tv::CUDAKernelTimer(false), use_direct_table);
+        }
+        // after get pair datas, we can start to do real convolution!
+        // in static inference engine, you need to split pair-gen and conv to
+        // different layers to reuse pair data
+        // here we just use previous result.
+        int num_act_out_real = std::get<1>(pair_res);
+
+        tv::Tensor out_features =
+            tv::empty({is_subm ? real_num_act_in : out_inds_num_limit, 64},
+                      tv::float16, 0);
+        auto out_features_real =
+            out_features.slice_first_axis(0, num_act_out_real);
+
+        bool is_mask_split = pair_mask_fwd_padded.dim(0) > 1;
+        int mask_split_cnt = pair_mask_fwd_padded.dim(0);
+        tv::Tensor mask_tensor =
+            tv::zeros({pair_mask_fwd_padded.dim(0)}, tv::uint32, -1);
+        // create split mask
+        // currently it's a constant for each algo.
+        auto mask_tensor_ptr = mask_tensor.data_ptr<uint32_t>();
+        if (is_mask_split) {
+          auto kv_div_2 = KV / 2;
+          auto remain = KV - kv_div_2;
+          uint64_t mask_np_1 = 1;
+          uint64_t first = ((mask_np_1 << remain) - 1);
+          uint64_t second = ((mask_np_1 << kv_div_2) - 1) << remain;
+          mask_tensor_ptr[0] = uint32_t(first);
+          mask_tensor_ptr[1] = uint32_t(second);
+        } else {
+          mask_tensor_ptr[0] = 0xffffffff;
+        }
+        std::vector<tv::Tensor> pair_mask_splits;
+        std::vector<tv::Tensor> mask_argsort_splits;
+        size_t real_pair_size = KV * num_act_out_real;
+        // keep in mind that pair_mask_fwd_padded is tight padded tensor, so we
+        // must get real tensor before use them if inversed conv, use xxxx_bwd
+        // here instead of fwd.
+        auto pair_fwd_real = pair_fwd_padded.view(-1)
+                                 .slice_first_axis(0, KV * num_act_out_real)
+                                 .view(KV, num_act_out_real);
+        auto pair_mask_fwd_real =
+            pair_mask_fwd_padded.view(-1)
+                .slice_first_axis(0, mask_split_cnt * num_act_out_real)
+                .view(mask_split_cnt, num_act_out_real);
+        auto mask_argsort_fwd_real =
+            mask_argsort_fwd_padded.view(-1)
+                .slice_first_axis(0, mask_split_cnt * num_act_out_real)
+                .view(mask_split_cnt, num_act_out_real);
+
+        for (int i = 0; i < mask_split_cnt; ++i) {
+          pair_mask_splits.push_back(pair_mask_fwd_real[i]);
+          mask_argsort_splits.push_back(mask_argsort_fwd_real[i]);
+        }
+        // create output tensor allocator
+        std::unordered_map<std::string, tv::Tensor> tensor_dict{
+            {SPCONV_ALLOC_FEATURES, input_features_real},
+            {SPCONV_ALLOC_FILTERS, weights},
+            {SPCONV_ALLOC_OUT_FEATURES, out_features}};
+        StaticAllocator alloc2(tensor_dict);
+        ConvTunerSimple tuner(ConvMain::get_all_conv_algo_desp());
+        auto conv_res = ConvGemmOps::implicit_gemm(
+            alloc2, tuner, input_features_real, weights, pair_fwd_real,
+            pair_mask_splits, mask_argsort_splits, num_act_out_real,
+            mask_tensor, arch, false, is_subm,
+            reinterpret_cast<std::uintptr_t>(stream),
+            tv::CUDAKernelTimer(false), false, false, bias,
+            1.0 /*bias alpha, only used for leaky relu*/,
+            0.0 /*unused for now*/, tv::gemm::Activation::kReLU);
+        tv::ssprint("selected conv algo",
+                    std::get<1>(conv_res).algo_desp.__repr__());
+        // FINISH!!!
+      }
+      // calc maximum number of output points.
    }
-    std::unordered_map<std::string, tv::Tensor> tensor_dict{
-        {SPCONV_ALLOC_FEATURES, voxels_f16},
-        {SPCONV_ALLOC_FILTERS, weights},
-        {SPCONV_ALLOC_OUT_FEATURES, out_features}};
-    StaticAllocator alloc2(tensor_dict);
-    ConvTunerSimple tuner(ConvMain::get_all_conv_algo_desp());
-    auto conv_res = ConvGemmOps::implicit_gemm(
-        alloc2, tuner, voxels_f16, weights, pair_fwd,
-        pair_mask_splits, mask_argsort_splits, num_act_out,
-        mask_tensor, arch, false, is_subm,
-        reinterpret_cast<std::uintptr_t>(stream), tv::CUDAKernelTimer(false),
-        false, false, bias, 1.0,
-        0.0, tv::gemm::Activation::kReLU);
-
-    // p2v.point_to_voxel_hash()
-    return 0;
+  }
+  checkCudaErrors(cudaStreamSynchronize(stream));
+  // p2v.point_to_voxel_hash()
+  return 0;
 }
\ No newline at end of file
--- a/example/libspconv/run_build.sh
+++ b/example/libspconv/run_build.sh
@@ -4,7 +4,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

 git clone https://github.com/FindDefinition/cumm.git $SCRIPT_DIR/cumm

-export CUMM_CUDA_VERSION=11.4 # cuda version, required
+export CUMM_CUDA_VERSION=11.4 # cuda version, required but only used for flag selection when build libspconv.
 export CUMM_DISABLE_JIT=1
 export SPCONV_DISABLE_JIT=1
 export CUMM_INCLUDE_PATH="\${CUMM_INCLUDE_PATH}" # if you use cumm as a subdirectory, you need this to find cumm includes.
@@ -16,4 +16,4 @@ python -m spconv.gencode --include=$SCRIPT_DIR/spconv/include --src=$SCRIPT_DIR/
 mkdir -p $SCRIPT_DIR/build
 cd $SCRIPT_DIR/build
 cmake ..
-cmake --build $SCRIPT_DIR/build --config Release -j 8 --verbose
+cmake --build $SCRIPT_DIR/build --config Release -j 8 # --verbose