Commit 60781119 authored by yanyan's avatar yanyan
Browse files

still working on cutlass

parent 23d9faaf
...@@ -44,11 +44,13 @@ set(ALL_LIBS ${TORCH_LIBRARIES}) ...@@ -44,11 +44,13 @@ set(ALL_LIBS ${TORCH_LIBRARIES})
set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include) set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)
set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include) set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include)
set(CUTLASS_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include)
if (SPCONV_BuildCUDA) if (SPCONV_BuildCUDA)
set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS}) set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})
set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
add_subdirectory(src/cuhash) add_subdirectory(src/cuhash)
add_subdirectory(src/spgemm)
endif() endif()
add_subdirectory(src/spconv) add_subdirectory(src/spconv)
add_subdirectory(src/utils) add_subdirectory(src/utils)
......
...@@ -65,7 +65,7 @@ observers: ...@@ -65,7 +65,7 @@ observers:
std: c++14 std: c++14
options: [ options: [
-Wno-deprecated-declarations, -Wno-deprecated-declarations,
"-gencode=arch=compute_52,code=sm_61", "-gencode=arch=compute_52,code=sm_52",
"-gencode=arch=compute_61,code=sm_61", "-gencode=arch=compute_61,code=sm_61",
"-gencode=arch=compute_60,code=sm_60", "-gencode=arch=compute_60,code=sm_60",
"-gencode=arch=compute_70,code=sm_70", "-gencode=arch=compute_70,code=sm_70",
...@@ -76,7 +76,7 @@ observers: ...@@ -76,7 +76,7 @@ observers:
type: CPPDevObserver type: CPPDevObserver
main_pattern: torchdev_.*\.(cu|cpp|cc|cxx) main_pattern: torchdev_.*\.(cu|cpp|cc|cxx)
pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu) pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
compiler: clang++ compiler: nvcc
executable: build/codeai_dev_torch executable: build/codeai_dev_torch
run_cmd: [$(executable)] run_cmd: [$(executable)]
fail_cmds: # run cmd when pervious run fail with retcode fail_cmds: # run cmd when pervious run fail with retcode
...@@ -97,13 +97,20 @@ observers: ...@@ -97,13 +97,20 @@ observers:
/home/yy/anaconda3/lib, /home/yy/anaconda3/lib,
] ]
libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread, libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread,
"-Wl,--no-as-needed,-lc10", "-Xcompiler=\"-Wl,--no-as-needed,-lc10\"",
"-Wl,--no-as-needed,-ltorch", "-Xcompiler=\"-Wl,--no-as-needed,-ltorch\"",
"-Wl,--no-as-needed,-ltorch_cpu", "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cpu\"",
"-Wl,--no-as-needed,-lc10_cuda", "-Xcompiler=\"-Wl,--no-as-needed,-lc10_cuda\"",
"-Wl,--no-as-needed,-ltorch_cuda"] "-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cuda\""]
std: c++2a std: c++14
options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0] # options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
options: [
-Wno-deprecated-declarations,
--expt-relaxed-constexpr,
"-gencode=arch=compute_61,code=sm_61",
-D_GLIBCXX_USE_CXX11_ABI=0,
]
...@@ -20,8 +20,8 @@ ...@@ -20,8 +20,8 @@
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
namespace spconv { namespace spconv {
template <typename Index, unsigned NDim, int KernelMaxVolume = 256, template <typename Index, unsigned NDim, bool UseDeconv,
typename Index1D = int> int KernelMaxVolume = 256, typename Index1D = int>
__global__ void prepareIndicePairsKernel( __global__ void prepareIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs, tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique, tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
...@@ -47,54 +47,19 @@ __global__ void prepareIndicePairsKernel( ...@@ -47,54 +47,19 @@ __global__ void prepareIndicePairsKernel(
auto indicePairsDim2 = indicePairs.dim(2); auto indicePairsDim2 = indicePairs.dim(2);
Index index; Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) { for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>( if (UseDeconv) {
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), // nvcc will optimize this fake "if constexpr"
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), // after cuda 11 released, we will start to use real if constexpr.
validPoints); numValidPoints = getValidOutPosTranspose<Index, NDim>(
for (Index i = 0; i < numValidPoints; ++i) { indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
pointPtr = validPoints + i * (NDim + 1); stride.data(), padding.data(), dilation.data(),
auto offset = pointPtr[NDim]; outSpatialShape.data(), validPoints);
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); } else {
indicePairs(0, offset, oldNum) = ix; numValidPoints = getValidOutPos<Index, NDim>(
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs( indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
pointPtr, outSpatialShape.data(), 0) + stride.data(), padding.data(), dilation.data(),
spatialVolume * indicesIn(ix, 0); outSpatialShape.data(), validPoints);
indicePairs(1, offset, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
} }
}
}
template <typename Index, unsigned NDim, int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (Index i = 0; i < numValidPoints; ++i) { for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1); pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim]; auto offset = pointPtr[NDim];
...@@ -191,6 +156,29 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut, ...@@ -191,6 +156,29 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
} }
} }
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsLimitedKernel(tv::TensorView<IndexGrid> gridsOut, int numActIn,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum) {
Index index, val;
int kernelVolume = indicePairs.dim(0);
for (int ix : tv::KernelLoopX<int>(numActIn)) {
for (int i = 0; i < kernelVolume; ++i) {
index = indicePairs(i, 1, ix);
if (index != -1) {
val = gridsOut[index];
if (val != -1) {
auto oldNum = atomicAdd(indiceNum.data() + i, Index(1));
indicePairs(i, 0, oldNum) = indicePairs(i, 0, ix);
indicePairs(i, 1, oldNum) = val;
}
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel( __global__ void prepareSubMGridKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut, tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
...@@ -542,7 +530,6 @@ __global__ void getSubMIndicePairsHashUnrollKernel2( ...@@ -542,7 +530,6 @@ __global__ void getSubMIndicePairsHashUnrollKernel2(
} }
} }
template <typename Index, typename IndexGrid, unsigned NDim> template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique, __global__ void resetGridKernel(const Index *indicePairUnique,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<IndexGrid> gridsOut,
...@@ -567,8 +554,8 @@ resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut, ...@@ -567,8 +554,8 @@ resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
Index index; Index index;
for (int ix : tv::KernelLoopX<int>(numAct)) { for (int ix : tv::KernelLoopX<int>(numAct)) {
indsPtr = indices + ix * (NDim + 1); indsPtr = indices + ix * (NDim + 1);
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1, index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
outSpatialShape.data(), 0); indsPtr + 1, outSpatialShape.data(), 0);
gridsOut[index + spatialVolume * indsPtr[0]] = -1; gridsOut[index + spatialVolume * indsPtr[0]] = -1;
} }
} }
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
#ifndef SPARSE_REORDERING_FUNCTOR_H_ #ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_ #define SPARSE_REORDERING_FUNCTOR_H_
#include <cuda_runtime_api.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <torch/script.h> #include <torch/script.h>
namespace spconv { namespace spconv {
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
...@@ -35,6 +35,13 @@ void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features, ...@@ -35,6 +35,13 @@ void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures, void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size); torch::Tensor indices, int size);
void sparse_gather_cuda(cudaStream_t s, torch::Tensor buffer,
torch::Tensor features, torch::Tensor indices,
int size);
void sparse_scatter_add_cuda(cudaStream_t s, torch::Tensor buffer,
torch::Tensor outFeatures, torch::Tensor indices,
int size);
} // namespace spconv } // namespace spconv
#endif #endif
\ No newline at end of file
#include <cutlass/gemm/device/gemm.h>
#include <type_traits>
namespace spconv {
template <typename T>
using determine_acc_t =
std::conditional_t<std::is_same<T, cutlass::half_t>::value, float, T>;
template <typename T, bool TransA, bool TransB, bool TransC>
cudaError_t cutlassGemm(cudaStream_t s, int M, int N, int K, T alpha,
T const *A, int lda, T const *B, int ldb, T beta, T *C,
int ldc) {
// Define type definition for single-precision CUTLASS GEMM with column-major
// input matrices and 128x128x8 threadblock tile size (chosen by default).
//
// To keep the interface manageable, several helpers are defined for plausible
// compositions including the following example for single-precision GEMM.
// Typical values are used as default template arguments. See
// `cutlass/gemm/device/default_gemm_configuration.h` for more details.
//
// To view the full gemm device API interface, see
// `cutlass/gemm/device/gemm.h`
using TAcc = determine_acc_t<T>;
using ColumnMajor = cutlass::layout::ColumnMajor;
using RowMajor = cutlass::layout::RowMajor;
using LayoutA = std::conditional_t<TransA, ColumnMajor, RowMajor>;
using LayoutB = std::conditional_t<TransB, ColumnMajor, RowMajor>;
using LayoutC = std::conditional_t<TransC, ColumnMajor, RowMajor>;
using CutlassGemm = cutlass::gemm::device::Gemm<T, // Data-type of A matrix
LayoutA, // Layout of A matrix
T, // Data-type of B matrix
LayoutB, // Layout of B matrix
T, // Data-type of C matrix
LayoutC,
TAcc>; // Layout of C matrix
// Define a CUTLASS GEMM type
CutlassGemm gemm_operator;
// Construct the CUTLASS GEMM arguments object.
//
// One of CUTLASS's design patterns is to define gemm argument objects that
// are constructible in host code and passed to kernels by value. These may
// include pointers, strides, scalars, and other arguments needed by Gemm and
// its components.
//
// The benefits of this pattern are (1.) a structured, composable strategy for
// passing host-constructible arguments to kernels and (2.) minimized
// initialization overhead on kernel entry.
//
typename CutlassGemm::Arguments args(
{M, N, K}, // Gemm Problem dimensions
{A, lda}, // Tensor-ref for source matrix A
{B, ldb}, // Tensor-ref for source matrix B
{C, ldc}, // Tensor-ref for source matrix C
{C, ldc}, // Tensor-ref for destination matrix D (may be different memory
// than source C matrix)
{alpha, beta}); // Scalars used in the Epilogue
//
// Launch the CUTLASS GEMM kernel.
//
cutlass::Status status = gemm_operator(args, nullptr, s);
//
// Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
//
if (status != cutlass::Status::kSuccess) {
return cudaErrorUnknown;
}
// Return success, if no errors were encountered.
return cudaSuccess;
}
} // namespace spconv
#include <cuda_runtime_api.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {
void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b);
void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
torch::Tensor b);
} // namespace spconv
\ No newline at end of file
#include <tensorview/tensor.h> #include <tensorview/tensor.h>
namespace spconv { namespace spconv {
enum HashTypes { enum HashTypes { kDenseMap = 0, kCUDPPHash = 1 };
kDenseMap = 0,
kCUDPPHash = 1
};
template <int Impl>
struct HashMap;
template<> template <int Impl> struct HashMap;
struct HashMap<kDenseMap>{
}; template <> struct HashMap<kDenseMap> {};
} } // namespace spconv
\ No newline at end of file \ No newline at end of file
...@@ -12,7 +12,6 @@ using mp_list_c = mp_list<std::integral_constant<T, I>...>; ...@@ -12,7 +12,6 @@ using mp_list_c = mp_list<std::integral_constant<T, I>...>;
template <int... I> template <int... I>
using mp_list_int_c = mp_list<std::integral_constant<int, I>...>; using mp_list_int_c = mp_list<std::integral_constant<int, I>...>;
namespace detail { namespace detail {
template <class... Ts, class F> template <class... Ts, class F>
......
...@@ -524,7 +524,6 @@ struct DispatchIntNoexcept<T<Args...>> { ...@@ -524,7 +524,6 @@ struct DispatchIntNoexcept<T<Args...>> {
} }
}; };
constexpr size_t kTensorMaxDim = 10; constexpr size_t kTensorMaxDim = 10;
using TensorShape = ShapeBase<kTensorMaxDim, int64_t>; using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;
......
...@@ -69,7 +69,7 @@ class SparseConvTensor(object): ...@@ -69,7 +69,7 @@ class SparseConvTensor(object):
self.batch_size = batch_size self.batch_size = batch_size
self.indice_dict = {} self.indice_dict = {}
if grid is None: if grid is None:
grid = torch.Tensor() # empty tensor grid = torch.Tensor() # empty tensor
self.grid = grid self.grid = grid
@classmethod @classmethod
......
...@@ -89,11 +89,11 @@ def get_indice_pairs(indices, ...@@ -89,11 +89,11 @@ def get_indice_pairs(indices,
out_shape = spatial_shape out_shape = spatial_shape
if grid is None: if grid is None:
grid = torch.Tensor() grid = torch.Tensor()
res = torch.ops.spconv.get_indice_pairs(indices, grid, batch_size, out_shape, res = torch.ops.spconv.get_indice_pairs(indices, grid, batch_size,
spatial_shape, ksize, stride, out_shape, spatial_shape, ksize,
padding, dilation, out_padding, stride, padding, dilation,
int(subm), int(transpose), out_padding, int(subm),
int(use_hash)) int(transpose), int(use_hash))
return res return res
......
...@@ -67,8 +67,16 @@ class SparseMaxPool(SparseModule): ...@@ -67,8 +67,16 @@ class SparseMaxPool(SparseModule):
else: else:
out_spatial_shape = spatial_shape out_spatial_shape = spatial_shape
outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs( outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
indices, batch_size, spatial_shape, self.kernel_size, self.stride, indices,
self.padding, self.dilation, 0, self.subm, grid=input.grid) batch_size,
spatial_shape,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
0,
self.subm,
grid=input.grid)
out_features = Fsp.indice_maxpool(features, indice_pairs.to(device), out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device), indice_pairs_num.to(device),
......
...@@ -15,7 +15,7 @@ set_property(TARGET spconv PROPERTY CUDA_STANDARD 14) ...@@ -15,7 +15,7 @@ set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv PROPERTY CXX_STANDARD 14) set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON) set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
if (SPCONV_BuildCUDA) if (SPCONV_BuildCUDA)
target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash) target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash spgemm)
else() else()
target_link_libraries(spconv PRIVATE ${ALL_LIBS}) target_link_libraries(spconv PRIVATE ${ALL_LIBS})
endif() endif()
......
...@@ -61,17 +61,9 @@ int create_conv_indice_pair_p1_cuda( ...@@ -61,17 +61,9 @@ int create_conv_indice_pair_p1_cuda(
tv::DispatchInt<max_kernel_vol_t>()( tv::DispatchInt<max_kernel_vol_t>()(
kernelVolume, std::less_equal<int>(), [&](auto I2) { kernelVolume, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = decltype(I2)::value; constexpr int MaxKernelVolume = decltype(I2)::value;
if (transpose) { tv::dispatch_int<0, 1>(int(transpose), [&](auto I) {
prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume> constexpr bool UseDeconv = decltype(I)::value;
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, prepareIndicePairsKernel<Index, NDim, UseDeconv, MaxKernelVolume>
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st,
pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
} else {
prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, <<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn), 0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indicePairs),
...@@ -79,8 +71,7 @@ int create_conv_indice_pair_p1_cuda( ...@@ -79,8 +71,7 @@ int create_conv_indice_pair_p1_cuda(
tv::torch2tv<Index>(indicePairUnique), ks, st, tv::torch2tv<Index>(indicePairUnique), ks, st,
pa, di, ou); pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed"); TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
} });
// tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
#ifdef TV_LOG_KERNEL_INFO #ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr; cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes( checkCudaErrors(cudaFuncGetAttributes(
...@@ -230,15 +221,17 @@ int create_submconv_indice_pair_cuda( ...@@ -230,15 +221,17 @@ int create_submconv_indice_pair_cuda(
namespace mp11 = boost::mp11; namespace mp11 = boost::mp11;
using kernel2_candidates_t = using kernel2_candidates_t =
mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>, mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
tv::mp_list_int_c<1, 3, 5>>; tv::mp_list_c<int, 1, 3, 5>>;
using kernel3_candidates_t = using kernel3_candidates_t =
mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>, mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
tv::mp_list_int_c<1, 3, 5>, tv::mp_list_c<int, 1, 3, 5>,
tv::mp_list_int_c<1, 3, 5>>; tv::mp_list_c<int, 1, 3, 5>>;
using kernel3_candidates_final_t = mp11::mp_push_back<kernel3_candidates_t>; using kernel3_candidates_final_t =
mp11::mp_push_back<kernel3_candidates_t>;
auto dispatcher2 = tv::DispatchContainerNoexcept<kernel2_candidates_t>(); auto dispatcher2 = tv::DispatchContainerNoexcept<kernel2_candidates_t>();
auto dispatcher3 = tv::DispatchContainerNoexcept<kernel3_candidates_final_t>(); auto dispatcher3 =
tv::DispatchContainerNoexcept<kernel3_candidates_final_t>();
if (useHash) { if (useHash) {
auto table = cuhash::HashTable(); auto table = cuhash::HashTable();
......
...@@ -37,12 +37,12 @@ using half_vec_sadd_t = ...@@ -37,12 +37,12 @@ using half_vec_sadd_t =
std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>; std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16, 8>; using kernel_block_t = tv::mp_list_c<int, 64, 32, 16, 8>;
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, void sparse_gather_cuda(cudaStream_t stream, torch::Tensor buffer,
torch::Tensor indices, int size) { torch::Tensor features, torch::Tensor indices,
int size) {
if (size <= 0) if (size <= 0)
return; return;
int numPlanes = features.size(1); int numPlanes = features.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = features.scalar_type(); auto dtype = features.scalar_type();
auto inds_dtype = indices.scalar_type(); auto inds_dtype = indices.scalar_type();
// auto timer = spconv::CudaContextTimer<>(); // auto timer = spconv::CudaContextTimer<>();
...@@ -126,12 +126,18 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, ...@@ -126,12 +126,18 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
}); });
} }
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) { torch::Tensor indices, int size) {
auto stream = at::cuda::getCurrentCUDAStream();
return sparse_gather_cuda(stream, buffer, features, indices, size);
}
void sparse_scatter_add_cuda(cudaStream_t stream, torch::Tensor buffer,
torch::Tensor outFeatures, torch::Tensor indices,
int size) {
if (size <= 0) if (size <= 0)
return; return;
int numPlanes = outFeatures.size(1); int numPlanes = outFeatures.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = outFeatures.scalar_type(); auto dtype = outFeatures.scalar_type();
auto inds_dtype = indices.scalar_type(); auto inds_dtype = indices.scalar_type();
...@@ -216,6 +222,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -216,6 +222,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
}); });
} }
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size) {
auto stream = at::cuda::getCurrentCUDAStream();
return sparse_scatter_add_cuda(stream, buffer, outFeatures, indices, size);
}
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) { torch::Tensor indices, int size) {
// indices: [volume, inds_stride] // indices: [volume, inds_stride]
......
#include <spconv/spconv_ops.h> #include <spconv/spconv_ops.h>
#include <spgemm/gemm_th.h>
namespace spconv { namespace spconv {
std::vector<torch::Tensor> std::vector<torch::Tensor>
...@@ -48,9 +50,9 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize, ...@@ -48,9 +50,9 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
gridSize = batchSize; gridSize = batchSize;
} }
bool resetGrid = gridOut.numel() != 0; bool resetGrid = gridOut.numel() != 0;
if (!resetGrid){ if (!resetGrid) {
gridOut = torch::full( gridOut = torch::full({gridSize}, -1,
{gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device())); torch::dtype(torch::kInt32).device(indices.device()));
} }
gridOut = gridOut.view({batchSize, -1}); gridOut = gridOut.view({batchSize, -1});
int64_t numActOut = -1; int64_t numActOut = -1;
...@@ -104,7 +106,7 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize, ...@@ -104,7 +106,7 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
padding, dilation, outSpatialShape, transpose, resetGrid, useHash); padding, dilation, outSpatialShape, transpose, resetGrid, useHash);
} }
#ifdef TV_CUDA #ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) { else if (indices.device().type() == torch::kCUDA) {
numActOut = create_conv_indice_pair_p1_cuda( numActOut = create_conv_indice_pair_p1_cuda(
indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride, indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape, transpose); padding, dilation, outSpatialShape, transpose);
...@@ -191,7 +193,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters, ...@@ -191,7 +193,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double totalGatherTime = 0; double totalGatherTime = 0;
double totalGEMMTime = 0; double totalGEMMTime = 0;
double totalSAddTime = 0; double totalSAddTime = 0;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0, std::vector<int>(indicePairNumCpu.data_ptr<int>(), // tv::ssprint("first subm gemm time", timer.report() / 1000.0,
// std::vector<int>(indicePairNumCpu.data_ptr<int>(),
// indicePairNumCpu.data_ptr<int>() + kernelVolume)); // indicePairNumCpu.data_ptr<int>() + kernelVolume));
for (int i = 0; i < kernelVolume; ++i) { for (int i = 0; i < kernelVolume; ++i) {
......
set(ALL_FILES ${ALL_FILES} gemm.cu)
add_library(spgemm SHARED ${ALL_FILES})
target_include_directories(spgemm PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE} ${CUTLASS_INCLUDE} )
set_property(TARGET spgemm PROPERTY CUDA_STANDARD 14)
set_property(TARGET spgemm PROPERTY CXX_STANDARD 14)
set_target_properties(spgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(spgemm PRIVATE ${ALL_LIBS})
install (TARGETS spgemm DESTINATION lib)
#include <spgemm/gemm.h>
#include <spgemm/gemm_th.h>
namespace spconv {
template <typename T>
using determine_half_t =
std::conditional_t<std::is_same<T, at::Half>::value, cutlass::half_t, T>;
void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
torch::Tensor b) {
TV_ASSERT_RT_ERR(c.dtype() == a.dtype() && c.dtype() == b.dtype(),
"dtype must be same");
TV_ASSERT_RT_ERR(c.is_contiguous() && b.is_contiguous() && a.is_contiguous(),
"error");
auto M = a.size(0);
auto K = a.size(1);
auto N = b.size(1);
TV_ASSERT_RT_ERR(b.size(0) == K && c.size(0) == M && c.size(1) == N, "error");
tv::dispatch_torch<float, at::Half>(c.scalar_type(), [&](auto I) {
using T = decltype(I);
using HalfT = determine_half_t<T>;
auto status = cutlassGemm<HalfT, false, false, false>(
stream, M, N, K, HalfT(1.0), reinterpret_cast<HalfT *>(a.data_ptr<T>()),
a.size(1), reinterpret_cast<HalfT *>(b.data_ptr<T>()), b.size(1),
HalfT(0.0), reinterpret_cast<HalfT *>(c.data_ptr<T>()), c.size(1));
TV_ASSERT_RT_ERR(status == cudaSuccess, "error");
});
}
void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b) {
auto stream = at::cuda::getCurrentCUDAStream();
return cutlass_mm_out(stream, c, a, b);
}
} // namespace spconv
\ No newline at end of file
#define TV_CUDA
#include <cutlass/gemm/device/gemm.h>
#include <spgemm/gemm.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/tensor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
int main() {
auto M = 100000;
auto N = 128;
auto K = 128;
auto a =
torch::rand({M, K}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
auto b = torch::rand({K, N}, a.options());
auto c = torch::zeros({a.size(0), b.size(1)}, a.options());
auto c2 = torch::zeros({a.size(0), b.size(1)}, a.options());
torch::mm_out(c, a, b);
auto status = spconv::cutlassGemm<float, false, false, false>(
0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
auto err = torch::norm(c2 - c);
tv::ssprint(status, "linalg norm", err);
tv::ssprint((c.view({-1}) == 0).sum());
auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < 10; ++i) {
torch::mm_out(c, a, b);
tv::ssprint("mm", timer.report() / 1000.0);
spconv::cutlassGemm<float, false, false, false>(
0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
tv::ssprint("cutlass_mm", timer.report() / 1000.0);
}
return 0;
}
\ No newline at end of file
import torch import time
import spconv from pathlib import Path
import numpy as np
from spconv.utils import VoxelGeneratorV2 import numpy as np
from pathlib import Path import torch
from torch import nn from torch import nn
import time
import spconv
from spconv.utils import VoxelGeneratorV2
def waymo_data(batch_size=1): def waymo_data(batch_size=1):
gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1, 150000) gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
150000)
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz") data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
pc = data["pc"] pc = data["pc"]
data = gen.generate(pc) data = gen.generate(pc)
...@@ -17,9 +21,9 @@ def waymo_data(batch_size=1): ...@@ -17,9 +21,9 @@ def waymo_data(batch_size=1):
coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1) coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
return voxels, coors, gen.grid_size return voxels, coors, gen.grid_size
class Net(nn.Module): class Net(nn.Module):
def __init__(self, def __init__(self, shape):
shape):
super().__init__() super().__init__()
self.net = spconv.SparseSequential( self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0"), spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0"),
...@@ -57,7 +61,8 @@ class Net(nn.Module): ...@@ -57,7 +61,8 @@ class Net(nn.Module):
) )
max_batch_size = 1 max_batch_size = 1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster. # grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
self.grid = torch.full([max_batch_size, *shape], -1, dtype=torch.int32).cuda() self.grid = torch.full([max_batch_size, *shape], -1,
dtype=torch.int32).cuda()
# self.grid = None # self.grid = None
self.shape = shape self.shape = shape
...@@ -87,5 +92,6 @@ def main(): ...@@ -87,5 +92,6 @@ def main():
# print("spconv time", time.time() - t) # print("spconv time", time.time() - t)
print("spconv time", np.mean(times[10:])) print("spconv time", np.mean(times[10:]))
if __name__ == "__main__": if __name__ == "__main__":
main() main()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment