working on cutlass

23d9faaf · yanyan · 853302aa · 23d9faaf · 23d9faaf · e33d90b3
Commit 23d9faaf authored Jun 08, 2020 by yanyan
13 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ add_subdirectory(third_party/pybind11)
 set(ALL_LIBS ${TORCH_LIBRARIES}) 
 set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)
+set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include)
 if (SPCONV_BuildCUDA)
    set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})

--- a/codeai-devops.yaml
+++ b/codeai-devops.yaml
+global:
+    console_url: localhost:50091
+    envs: 
+        PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python # c++ libprotobuf and python will conflicit
+analyzers: # only one analyzer is allowed for one type for now.
+    PythonAnalyzer:
+    SimpleCPPAnalyzer: # $<astex> devops.devs = ["_ci_dev_xxx"] </astex> is allowed in raw sources.
+        includes: ["*.cpp", "*.cu", "*.cc", "*.h", "*.hpp", "*.hxx", "*.cxx"]
+observers:
+    # run test functions when that function change or marked function change.
+    test:
+        type: TestObserver
+    # run dev functions when that function change or marked function change.
+    dev:
+        type: DevObserver
+        pattern: _ci_dev_.*
+    clangdev:
+        type: CPPDevObserver
+        main_pattern: dev_.*\.(cc|cpp|cxx)
+        pattern: .*\.(cc|cpp|cxx|h|hpp|hxx)
+        compiler: clang++
+        executable: build/codeai_dev
+        includes: [
+            include,
+            /usr/local/cuda/include,
+            /home/yy/anaconda3/include,
+            /home/yy/anaconda3/include/python3.7m,
+            third_party/pybind11/include,
+            third_party/include,
+            /home/yy/library/boost_1_72_0,
+        ]
+        libpaths: [
+            /home/yy/anaconda3/lib,
+        ]
+        libraries: [-lnvinfer, -lpython3.7m, -lcublas, -lcudart, -ljpeg]
+        std: c++2a
+        options: [-Wall, -Wextra]
+    cudadev:
+        type: CPPDevObserver
+        main_pattern: dev_.*\.cu
+        pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
+        compiler: nvcc
+        executable: build/codeai_dev_cuda
+        run_cmd: [$(executable)]
+        sources: []
+        includes: [
+            include,
+            /usr/local/cuda/include,
+            /home/yy/anaconda3/include,
+            /home/yy/anaconda3/include/python3.7m,
+            third_party/pybind11/include,
+            third_party/cutlass/include,
+        ]
+        libpaths: [
+            /usr/local/cuda/lib64,
+            /home/yy/anaconda3/lib,
+        ]
+        libraries: [-lpython3.7m, -lcudart, -lcublas, -ljpeg]
+        std: c++14
+        options: [
+            -Wno-deprecated-declarations,
+            "-gencode=arch=compute_52,code=sm_61",
+            "-gencode=arch=compute_61,code=sm_61",
+            "-gencode=arch=compute_60,code=sm_60",
+            "-gencode=arch=compute_70,code=sm_70",
+            "-gencode=arch=compute_75,code=sm_75",
+        ]
+    torchdev:
+        type: CPPDevObserver
+        main_pattern: torchdev_.*\.(cu|cpp|cc|cxx)
+        pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
+        compiler: clang++
+        executable: build/codeai_dev_torch
+        run_cmd: [$(executable)]
+        fail_cmds: # run cmd when pervious run fail with retcode
+            -6: [gdb, -ex, run, -ex, bt, -ex, quit, $(executable)] # segfault in unix
+        includes: [
+            include,
+            /home/yy/anaconda3/lib/python3.7/site-packages/torch/include,
+            /home/yy/anaconda3/lib/python3.7/site-packages/torch/include/torch/csrc/api/include,
+            /usr/local/cuda/include,
+            /home/yy/anaconda3/include,
+            /home/yy/anaconda3/include/python3.7m,
+            third_party/pybind11/include,
+            third_party/cutlass/include,
+        ]
+        libpaths: [
+            /home/yy/anaconda3/lib/python3.7/site-packages/torch/lib,
+            /usr/local/cuda/lib64,
+            /home/yy/anaconda3/lib,
+        ]
+        libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread, 
+                    "-Wl,--no-as-needed,-lc10", 
+                    "-Wl,--no-as-needed,-ltorch", 
+                    "-Wl,--no-as-needed,-ltorch_cpu", 
+                    "-Wl,--no-as-needed,-lc10_cuda", 
+                    "-Wl,--no-as-needed,-ltorch_cuda"]
+        std: c++2a
+        options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
--- a/cutlass @ e33d90b3
+++ b/cutlass @ e33d90b3
-Subproject commit e33d90b36109f67915a80c532ebbb978b72c7bd2
--- a/include/tensorview/mp_helper.h
+++ b/include/tensorview/mp_helper.h
@@ -9,6 +9,10 @@ template <class... T> struct mp_list {};
 template <class T, T... I>
 using mp_list_c = mp_list<std::integral_constant<T, I>...>;
+template <int... I>
+using mp_list_int_c = mp_list<std::integral_constant<int, I>...>;
 namespace detail {
 template <class... Ts, class F>

--- a/include/tensorview/tensor.h
+++ b/include/tensorview/tensor.h
@@ -418,7 +418,8 @@ bool dispatch_container_noexcept(Iterator begin, Iterator end, F &&f) {
        return;
      }
      if (count >= val_lst_size) {
-        TV_THROW_INVALID_ARG("iterator length invalid:", val_lst_size);
+        equal = false;
+        return;
      }
      constexpr auto c = decltype(E)::value;
      if (c != *iter) {

--- a/setup.py
+++ b/setup.py
@@ -54,7 +54,7 @@ class CMakeBuild(build_ext):
                      '-DCMAKE_PREFIX_PATH={}'.format(LIBTORCH_ROOT),
                      '-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
                      '-DSPCONV_BuildTests=OFF',
-                      '-DPYTORCH_VERSION={}'.format(PYTORCH_VERSION_NUMBER)
+                      '-DPYTORCH_VERSION={}'.format(PYTORCH_VERSION_NUMBER),
                      ] #  -arch=sm_61
        if not torch.cuda.is_available() and SPCONV_FORCE_BUILD_CUDA is None:
            cmake_args += ['-DSPCONV_BuildCUDA=OFF']

--- a/src/spconv/CMakeLists.txt
+++ b/src/spconv/CMakeLists.txt
@@ -10,7 +10,7 @@ if(OpenMP_CXX_FOUND)
 endif()
-target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
+target_include_directories(spconv PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE} )
 set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
 set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
 set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

--- a/src/spconv/indice.cu
+++ b/src/spconv/indice.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include <ATen/ATen.h>
+#include <boost/mp11.hpp>
 #include <chrono>
 #include <cuhash/hash_table.h>
 #include <limits>
@@ -79,7 +80,7 @@ int create_conv_indice_pair_p1_cuda(
                                  pa, di, ou);
              TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
            }
-            // tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
+        // tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
 #ifdef TV_LOG_KERNEL_INFO
            cudaFuncAttributes attr;
            checkCudaErrors(cudaFuncGetAttributes(
@@ -226,6 +227,18 @@ int create_submconv_indice_pair_cuda(
        spatialVolume *= outSpatialShape[i];
      }
      auto dispatcher = tv::DispatchIntNoexcept<tv::mp_list_c<int, 1, 3, 5>>();
+      namespace mp11 = boost::mp11;
+      using kernel2_candidates_t =
+          mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>,
+                           tv::mp_list_int_c<1, 3, 5>>;
+      using kernel3_candidates_t =
+          mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>,
+                           tv::mp_list_int_c<1, 3, 5>,
+                           tv::mp_list_int_c<1, 3, 5>>;
+      using kernel3_candidates_final_t = mp11::mp_push_back<kernel3_candidates_t>;
+      auto dispatcher2 = tv::DispatchContainerNoexcept<kernel2_candidates_t>();
+      auto dispatcher3 = tv::DispatchContainerNoexcept<kernel3_candidates_final_t>();
      if (useHash) {
        auto table = cuhash::HashTable();
@@ -263,43 +276,33 @@ int create_submconv_indice_pair_cuda(
          if (NDim == 2) {
            tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
                                           outSpatialShape.end());
+            dispatcher2(kernelSize.begin(), kernelSize.end(), [&](auto K) {
-            dispatcher(kernelSize[0], [&](auto K0C) {
+              constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
-              dispatcher(kernelSize[1], [&](auto K1C) {
+              constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
-                constexpr int K0 = decltype(K0C)::value;
+              found = true;
-                constexpr int K1 = decltype(K1C)::value;
+              getSubMIndicePairsHashUnrollKernel2<Index, K0, K1>
-                found = true;
+                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                getSubMIndicePairsHashUnrollKernel2<Index, K0, K1>
+                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                    <<<tv::cuda::getBlocks(numActIn),
+                                  tv::torch2tv<Index>(indicePairs),
-                       tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                                  tv::torch2tv<Index>(indiceNum), ou_,
-                        tv::torch2tv<Index>(indicesIn),
+                                  spatialVolume, tableSize, tableData,
-                        tv::torch2tv<Index>(indicePairs),
+                                  constants, stash_constants, stash_count);
-                        tv::torch2tv<Index>(indiceNum), ou_, spatialVolume,
-                        tableSize, tableData, constants, stash_constants,
-                        stash_count);
-              });
            });
          } else if (NDim == 3) {
            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
                                           outSpatialShape.end());
+            dispatcher3(kernelSize.begin(), kernelSize.end(), [&](auto K) {
-            dispatcher(kernelSize[0], [&](auto K0C) {
+              constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
-              dispatcher(kernelSize[1], [&](auto K1C) {
+              constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
-                dispatcher(kernelSize[2], [&](auto K2C) {
+              constexpr int K2 = mp11::mp_at_c<decltype(K), 2>::value;
-                  constexpr int K0 = decltype(K0C)::value;
+              found = true;
-                  constexpr int K1 = decltype(K1C)::value;
+              getSubMIndicePairsHashUnrollKernel3<Index, K0, K1, K2>
-                  constexpr int K2 = decltype(K2C)::value;
+                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                  found = true;
+                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                  getSubMIndicePairsHashUnrollKernel3<Index, K0, K1, K2>
+                                  tv::torch2tv<Index>(indicePairs),
-                      <<<tv::cuda::getBlocks(numActIn),
+                                  tv::torch2tv<Index>(indiceNum), ou_,
-                         tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                                  spatialVolume, tableSize, tableData,
-                          tv::torch2tv<Index>(indicesIn),
+                                  constants, stash_constants, stash_count);
-                          tv::torch2tv<Index>(indicePairs),
-                          tv::torch2tv<Index>(indiceNum), ou_, spatialVolume,
-                          tableSize, tableData, constants, stash_constants,
-                          stash_count);
-                });
-              });
            });
          }
        }
@@ -338,24 +341,35 @@ int create_submconv_indice_pair_cuda(
          if (NDim == 2) {
            tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
                                           outSpatialShape.end());
+            dispatcher2(kernelSize.begin(), kernelSize.end(), [&](auto K) {
-            dispatcher(kernelSize[0], [&](auto K0C) {
+              constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
-              dispatcher(kernelSize[1], [&](auto K1C) {
+              constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
-                constexpr int K0 = decltype(K0C)::value;
+              found = true;
-                constexpr int K1 = decltype(K1C)::value;
+              getSubMIndicePairsUnrollKernel2<Index, IndexGrid, K0, K1>
-                found = true;
+                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
-                getSubMIndicePairsUnrollKernel2<Index, IndexGrid, K0, K1>
+                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
-                    <<<tv::cuda::getBlocks(numActIn),
+                                  tv::torch2tv<IndexGrid>(gridsOut),
-                       tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
+                                  tv::torch2tv<Index>(indicePairs),
-                        tv::torch2tv<Index>(indicesIn),
+                                  tv::torch2tv<Index>(indiceNum), ou_,
-                        tv::torch2tv<IndexGrid>(gridsOut),
+                                  spatialVolume);
-                        tv::torch2tv<Index>(indicePairs),
-                        tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
-              });
            });
          } else if (NDim == 3) {
            tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
                                           outSpatialShape.end());
+            dispatcher3(kernelSize.begin(), kernelSize.end(), [&](auto K) {
+              constexpr int K0 = mp11::mp_at_c<decltype(K), 0>::value;
+              constexpr int K1 = mp11::mp_at_c<decltype(K), 1>::value;
+              constexpr int K2 = mp11::mp_at_c<decltype(K), 2>::value;
+              found = true;
+              getSubMIndicePairsUnrollKernel3<Index, IndexGrid, K0, K1, K2>
+                  <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
+                     0, stream>>>(tv::torch2tv<Index>(indicesIn),
+                                  tv::torch2tv<IndexGrid>(gridsOut),
+                                  tv::torch2tv<Index>(indicePairs),
+                                  tv::torch2tv<Index>(indiceNum), ou_,
+                                  spatialVolume);
+            });
+            /*
            dispatcher(kernelSize[0], [&](auto K0C) {
              dispatcher(kernelSize[1], [&](auto K1C) {
                dispatcher(kernelSize[2], [&](auto K2C) {
@@ -372,7 +386,7 @@ int create_submconv_indice_pair_cuda(
                          tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
                });
              });
-            });
+            });*/
          }
        }
        if (!found) {
@@ -396,7 +410,8 @@ int create_submconv_indice_pair_cuda(
        resetGridSubMKernel<Index, IndexGrid, NDim>
            <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
               stream>>>(indicesIn.data_ptr<Index>(),
-                         tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn, spatialVolume);
+                         tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn,
+                         spatialVolume);
        TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
      }
    });

--- a/src/spconv/reordering.cu
+++ b/src/spconv/reordering.cu
@@ -35,7 +35,7 @@ using half_vec_t =
 template <typename T>
 using half_vec_sadd_t =
    std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
-using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
+using kernel_block_t = tv::mp_list_c<int, 64, 32, 16, 8>;
 void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                        torch::Tensor indices, int size) {
@@ -45,6 +45,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
  auto stream = at::cuda::getCurrentCUDAStream();
  auto dtype = features.scalar_type();
  auto inds_dtype = indices.scalar_type();
+  // auto timer = spconv::CudaContextTimer<>();
  tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
    using T = decltype(TValue);
    using vecload_type_t = half_vec_t<T>;
@@ -53,8 +54,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
      bool notFound = true;
      constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
-      tv::mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
+      tv::mp_for_each<kernel_block_t>([&](auto NumTLP) {
-                                       &notFound](auto NumTLP) {
        constexpr int NumILP = NumTLP / 4;
        // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
        int nHotBlock = (size / NumTLP) * NumTLP;
@@ -87,6 +87,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
                               features.data_ptr<T>(),
                               indices.data_ptr<Index>() + nHotBlock,
                               size - nHotBlock, numPlanes / vecloadFactor);
 #ifdef TV_LOG_KERNEL_INFO
              cudaFuncAttributes attr;
              checkCudaErrors(cudaFuncGetAttributes(
@@ -143,8 +144,7 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
      constexpr int vecloadFactor =
          sizeof(vecload_type_t) / sizeof(T); // important for half.
-      tv::mp_for_each<kernel_block_t>([=, &outFeatures, &buffer, &indices,
+      tv::mp_for_each<kernel_block_t>([&](auto NumTLP) {
-                                       &notFound](auto NumTLP) {
        // constexpr int NumILP = NumTLP / (64 / (NumTLP /
        // vecloadFactor));
        constexpr int NumILP = NumTLP / 4;

--- a/src/spgemm/CMakeLists.txt
+++ b/src/spgemm/CMakeLists.txt
--- a/src/spgemm/torchdev_cutlass.cu
+++ b/src/spgemm/torchdev_cutlass.cu
--- a/test/benchmark.py
+++ b/test/benchmark.py
@@ -70,7 +70,7 @@ class Net(nn.Module):
 def main():
    voxels, coors, spatial_shape = waymo_data()
    voxels_th = torch.from_numpy(voxels).cuda().float()
-    coors_th = torch.from_numpy(coors).cuda()
+    coors_th = torch.from_numpy(coors).cuda().int()
    net = Net(spatial_shape[::-1]).cuda().eval().float()
    print(coors_th.shape)
    out = net(voxels_th, coors_th, 1)

--- a/test/test_conv.py
+++ b/test/test_conv.py
@@ -349,7 +349,7 @@ def scatter_nd(indices, updates, shape):
 class TestSpConv(TestCase):
    def testSpConv3d(self):
        np.random.seed(484)
-        devices = ["cpu:0"]
+        devices = ["cuda:0"]
        shapes = [[19, 18, 17]]
        batchsizes = [1, 2]
@@ -752,8 +752,8 @@ def main_subm(algo, dtype=torch.float32):
 if __name__ == '__main__':
-    main(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
+    main_subm(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
-    main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
+    # main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
    # TestCase().assertAllClose(out_my, out_ref)
    # unittest.main()
    # TestSpConv().testSpConv3d()