Commit 60781119 authored by yanyan's avatar yanyan
Browse files

still working on cutlass

parent 23d9faaf
......@@ -44,11 +44,13 @@ set(ALL_LIBS ${TORCH_LIBRARIES})
set(ALL_INCLUDE ${PROJECT_SOURCE_DIR}/include)
set(MP11_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/mp11/include)
set(CUTLASS_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include)
if (SPCONV_BuildCUDA)
set(ALL_LIBS ${ALL_LIBS} ${CUDA_CUDART} ${CUDA_CUBLAS})
set(ALL_INCLUDE ${ALL_INCLUDE} ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
add_subdirectory(src/cuhash)
add_subdirectory(src/spgemm)
endif()
add_subdirectory(src/spconv)
add_subdirectory(src/utils)
......
......@@ -65,7 +65,7 @@ observers:
std: c++14
options: [
-Wno-deprecated-declarations,
"-gencode=arch=compute_52,code=sm_61",
"-gencode=arch=compute_52,code=sm_52",
"-gencode=arch=compute_61,code=sm_61",
"-gencode=arch=compute_60,code=sm_60",
"-gencode=arch=compute_70,code=sm_70",
......@@ -76,7 +76,7 @@ observers:
type: CPPDevObserver
main_pattern: torchdev_.*\.(cu|cpp|cc|cxx)
pattern: .*\.(cc|cpp|cxx|h|hpp|hxx|cu)
compiler: clang++
compiler: nvcc
executable: build/codeai_dev_torch
run_cmd: [$(executable)]
fail_cmds: # run cmd when pervious run fail with retcode
......@@ -97,13 +97,20 @@ observers:
/home/yy/anaconda3/lib,
]
libraries: [-lpython3.7m, -lcublas, -lcudart, -ljpeg, -lpthread,
"-Wl,--no-as-needed,-lc10",
"-Wl,--no-as-needed,-ltorch",
"-Wl,--no-as-needed,-ltorch_cpu",
"-Wl,--no-as-needed,-lc10_cuda",
"-Wl,--no-as-needed,-ltorch_cuda"]
std: c++2a
options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
"-Xcompiler=\"-Wl,--no-as-needed,-lc10\"",
"-Xcompiler=\"-Wl,--no-as-needed,-ltorch\"",
"-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cpu\"",
"-Xcompiler=\"-Wl,--no-as-needed,-lc10_cuda\"",
"-Xcompiler=\"-Wl,--no-as-needed,-ltorch_cuda\""]
std: c++14
# options: [--cuda-gpu-arch=sm_61, -Wno-deprecated-declarations, -D_GLIBCXX_USE_CXX11_ABI=0]
options: [
-Wno-deprecated-declarations,
--expt-relaxed-constexpr,
"-gencode=arch=compute_61,code=sm_61",
-D_GLIBCXX_USE_CXX11_ABI=0,
]
......@@ -20,8 +20,8 @@
#include <tensorview/tensorview.h>
namespace spconv {
template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
typename Index1D = int>
template <typename Index, unsigned NDim, bool UseDeconv,
int KernelMaxVolume = 256, typename Index1D = int>
__global__ void prepareIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
......@@ -47,54 +47,19 @@ __global__ void prepareIndicePairsKernel(
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(0, offset, oldNum) = ix;
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
indicePairs(1, offset, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
if (UseDeconv) {
// nvcc will optimize this fake "if constexpr"
// after cuda 11 released, we will start to use real if constexpr.
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(),
outSpatialShape.data(), validPoints);
} else {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(),
outSpatialShape.data(), validPoints);
}
}
}
template <typename Index, unsigned NDim, int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
......@@ -191,6 +156,29 @@ assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsLimitedKernel(tv::TensorView<IndexGrid> gridsOut, int numActIn,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum) {
Index index, val;
int kernelVolume = indicePairs.dim(0);
for (int ix : tv::KernelLoopX<int>(numActIn)) {
for (int i = 0; i < kernelVolume; ++i) {
index = indicePairs(i, 1, ix);
if (index != -1) {
val = gridsOut[index];
if (val != -1) {
auto oldNum = atomicAdd(indiceNum.data() + i, Index(1));
indicePairs(i, 0, oldNum) = indicePairs(i, 0, ix);
indicePairs(i, 1, oldNum) = val;
}
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
......@@ -542,7 +530,6 @@ __global__ void getSubMIndicePairsHashUnrollKernel2(
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
tv::TensorView<IndexGrid> gridsOut,
......@@ -567,8 +554,8 @@ resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
Index index;
for (int ix : tv::KernelLoopX<int>(numAct)) {
indsPtr = indices + ix * (NDim + 1);
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1,
outSpatialShape.data(), 0);
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
indsPtr + 1, outSpatialShape.data(), 0);
gridsOut[index + spatialVolume * indsPtr[0]] = -1;
}
}
......
......@@ -14,9 +14,9 @@
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <cuda_runtime_api.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
......@@ -35,6 +35,13 @@ void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size);
void sparse_gather_cuda(cudaStream_t s, torch::Tensor buffer,
torch::Tensor features, torch::Tensor indices,
int size);
void sparse_scatter_add_cuda(cudaStream_t s, torch::Tensor buffer,
torch::Tensor outFeatures, torch::Tensor indices,
int size);
} // namespace spconv
#endif
\ No newline at end of file
#include <cutlass/gemm/device/gemm.h>
#include <type_traits>
namespace spconv {
template <typename T>
using determine_acc_t =
std::conditional_t<std::is_same<T, cutlass::half_t>::value, float, T>;
template <typename T, bool TransA, bool TransB, bool TransC>
cudaError_t cutlassGemm(cudaStream_t s, int M, int N, int K, T alpha,
T const *A, int lda, T const *B, int ldb, T beta, T *C,
int ldc) {
// Define type definition for single-precision CUTLASS GEMM with column-major
// input matrices and 128x128x8 threadblock tile size (chosen by default).
//
// To keep the interface manageable, several helpers are defined for plausible
// compositions including the following example for single-precision GEMM.
// Typical values are used as default template arguments. See
// `cutlass/gemm/device/default_gemm_configuration.h` for more details.
//
// To view the full gemm device API interface, see
// `cutlass/gemm/device/gemm.h`
using TAcc = determine_acc_t<T>;
using ColumnMajor = cutlass::layout::ColumnMajor;
using RowMajor = cutlass::layout::RowMajor;
using LayoutA = std::conditional_t<TransA, ColumnMajor, RowMajor>;
using LayoutB = std::conditional_t<TransB, ColumnMajor, RowMajor>;
using LayoutC = std::conditional_t<TransC, ColumnMajor, RowMajor>;
using CutlassGemm = cutlass::gemm::device::Gemm<T, // Data-type of A matrix
LayoutA, // Layout of A matrix
T, // Data-type of B matrix
LayoutB, // Layout of B matrix
T, // Data-type of C matrix
LayoutC,
TAcc>; // Layout of C matrix
// Define a CUTLASS GEMM type
CutlassGemm gemm_operator;
// Construct the CUTLASS GEMM arguments object.
//
// One of CUTLASS's design patterns is to define gemm argument objects that
// are constructible in host code and passed to kernels by value. These may
// include pointers, strides, scalars, and other arguments needed by Gemm and
// its components.
//
// The benefits of this pattern are (1.) a structured, composable strategy for
// passing host-constructible arguments to kernels and (2.) minimized
// initialization overhead on kernel entry.
//
typename CutlassGemm::Arguments args(
{M, N, K}, // Gemm Problem dimensions
{A, lda}, // Tensor-ref for source matrix A
{B, ldb}, // Tensor-ref for source matrix B
{C, ldc}, // Tensor-ref for source matrix C
{C, ldc}, // Tensor-ref for destination matrix D (may be different memory
// than source C matrix)
{alpha, beta}); // Scalars used in the Epilogue
//
// Launch the CUTLASS GEMM kernel.
//
cutlass::Status status = gemm_operator(args, nullptr, s);
//
// Return a cudaError_t if the CUTLASS GEMM operator returned an error code.
//
if (status != cutlass::Status::kSuccess) {
return cudaErrorUnknown;
}
// Return success, if no errors were encountered.
return cudaSuccess;
}
} // namespace spconv
#include <cuda_runtime_api.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {
void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b);
void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
torch::Tensor b);
} // namespace spconv
\ No newline at end of file
#include <tensorview/tensor.h>
namespace spconv {
enum HashTypes {
kDenseMap = 0,
kCUDPPHash = 1
};
template <int Impl>
struct HashMap;
enum HashTypes { kDenseMap = 0, kCUDPPHash = 1 };
template<>
struct HashMap<kDenseMap>{
template <int Impl> struct HashMap;
};
template <> struct HashMap<kDenseMap> {};
}
\ No newline at end of file
} // namespace spconv
\ No newline at end of file
......@@ -12,7 +12,6 @@ using mp_list_c = mp_list<std::integral_constant<T, I>...>;
template <int... I>
using mp_list_int_c = mp_list<std::integral_constant<int, I>...>;
namespace detail {
template <class... Ts, class F>
......
......@@ -524,7 +524,6 @@ struct DispatchIntNoexcept<T<Args...>> {
}
};
constexpr size_t kTensorMaxDim = 10;
using TensorShape = ShapeBase<kTensorMaxDim, int64_t>;
......
......@@ -69,7 +69,7 @@ class SparseConvTensor(object):
self.batch_size = batch_size
self.indice_dict = {}
if grid is None:
grid = torch.Tensor() # empty tensor
grid = torch.Tensor() # empty tensor
self.grid = grid
@classmethod
......
......@@ -89,11 +89,11 @@ def get_indice_pairs(indices,
out_shape = spatial_shape
if grid is None:
grid = torch.Tensor()
res = torch.ops.spconv.get_indice_pairs(indices, grid, batch_size, out_shape,
spatial_shape, ksize, stride,
padding, dilation, out_padding,
int(subm), int(transpose),
int(use_hash))
res = torch.ops.spconv.get_indice_pairs(indices, grid, batch_size,
out_shape, spatial_shape, ksize,
stride, padding, dilation,
out_padding, int(subm),
int(transpose), int(use_hash))
return res
......
......@@ -67,8 +67,16 @@ class SparseMaxPool(SparseModule):
else:
out_spatial_shape = spatial_shape
outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
indices, batch_size, spatial_shape, self.kernel_size, self.stride,
self.padding, self.dilation, 0, self.subm, grid=input.grid)
indices,
batch_size,
spatial_shape,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
0,
self.subm,
grid=input.grid)
out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device),
......
......@@ -15,7 +15,7 @@ set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
if (SPCONV_BuildCUDA)
target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash)
target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash spgemm)
else()
target_link_libraries(spconv PRIVATE ${ALL_LIBS})
endif()
......
......@@ -61,17 +61,9 @@ int create_conv_indice_pair_p1_cuda(
tv::DispatchInt<max_kernel_vol_t>()(
kernelVolume, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = decltype(I2)::value;
if (transpose) {
prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st,
pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
} else {
prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
tv::dispatch_int<0, 1>(int(transpose), [&](auto I) {
constexpr bool UseDeconv = decltype(I)::value;
prepareIndicePairsKernel<Index, NDim, UseDeconv, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
......@@ -79,8 +71,7 @@ int create_conv_indice_pair_p1_cuda(
tv::torch2tv<Index>(indicePairUnique), ks, st,
pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
}
// tv::ssprint("prepareIndicePairsKernel", timer.report() / 1000.0);
});
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
......@@ -230,15 +221,17 @@ int create_submconv_indice_pair_cuda(
namespace mp11 = boost::mp11;
using kernel2_candidates_t =
mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>,
tv::mp_list_int_c<1, 3, 5>>;
mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
tv::mp_list_c<int, 1, 3, 5>>;
using kernel3_candidates_t =
mp11::mp_product<tv::mp_list, tv::mp_list_int_c<1, 3, 5>,
tv::mp_list_int_c<1, 3, 5>,
tv::mp_list_int_c<1, 3, 5>>;
using kernel3_candidates_final_t = mp11::mp_push_back<kernel3_candidates_t>;
mp11::mp_product<tv::mp_list, tv::mp_list_c<int, 1, 3, 5>,
tv::mp_list_c<int, 1, 3, 5>,
tv::mp_list_c<int, 1, 3, 5>>;
using kernel3_candidates_final_t =
mp11::mp_push_back<kernel3_candidates_t>;
auto dispatcher2 = tv::DispatchContainerNoexcept<kernel2_candidates_t>();
auto dispatcher3 = tv::DispatchContainerNoexcept<kernel3_candidates_final_t>();
auto dispatcher3 =
tv::DispatchContainerNoexcept<kernel3_candidates_final_t>();
if (useHash) {
auto table = cuhash::HashTable();
......
......@@ -37,12 +37,12 @@ using half_vec_sadd_t =
std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16, 8>;
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
void sparse_gather_cuda(cudaStream_t stream, torch::Tensor buffer,
torch::Tensor features, torch::Tensor indices,
int size) {
if (size <= 0)
return;
int numPlanes = features.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = features.scalar_type();
auto inds_dtype = indices.scalar_type();
// auto timer = spconv::CudaContextTimer<>();
......@@ -126,12 +126,18 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
});
}
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size) {
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
auto stream = at::cuda::getCurrentCUDAStream();
return sparse_gather_cuda(stream, buffer, features, indices, size);
}
void sparse_scatter_add_cuda(cudaStream_t stream, torch::Tensor buffer,
torch::Tensor outFeatures, torch::Tensor indices,
int size) {
if (size <= 0)
return;
int numPlanes = outFeatures.size(1);
auto stream = at::cuda::getCurrentCUDAStream();
auto dtype = outFeatures.scalar_type();
auto inds_dtype = indices.scalar_type();
......@@ -216,6 +222,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
});
}
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size) {
auto stream = at::cuda::getCurrentCUDAStream();
return sparse_scatter_add_cuda(stream, buffer, outFeatures, indices, size);
}
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) {
// indices: [volume, inds_stride]
......
#include <spconv/spconv_ops.h>
#include <spgemm/gemm_th.h>
namespace spconv {
std::vector<torch::Tensor>
......@@ -48,9 +50,9 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
gridSize = batchSize;
}
bool resetGrid = gridOut.numel() != 0;
if (!resetGrid){
gridOut = torch::full(
{gridSize}, -1, torch::dtype(torch::kInt32).device(indices.device()));
if (!resetGrid) {
gridOut = torch::full({gridSize}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
}
gridOut = gridOut.view({batchSize, -1});
int64_t numActOut = -1;
......@@ -104,7 +106,7 @@ getIndicePairs(torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
padding, dilation, outSpatialShape, transpose, resetGrid, useHash);
}
#ifdef TV_CUDA
else if (indices.device().type() == torch::kCUDA) {
else if (indices.device().type() == torch::kCUDA) {
numActOut = create_conv_indice_pair_p1_cuda(
indices, indicePairs, indiceNum, indicePairUnique, kernelSize, stride,
padding, dilation, outSpatialShape, transpose);
......@@ -191,7 +193,8 @@ torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
// tv::ssprint("first subm gemm time", timer.report() / 1000.0, std::vector<int>(indicePairNumCpu.data_ptr<int>(),
// tv::ssprint("first subm gemm time", timer.report() / 1000.0,
// std::vector<int>(indicePairNumCpu.data_ptr<int>(),
// indicePairNumCpu.data_ptr<int>() + kernelVolume));
for (int i = 0; i < kernelVolume; ++i) {
......
set(ALL_FILES ${ALL_FILES} gemm.cu)
add_library(spgemm SHARED ${ALL_FILES})
target_include_directories(spgemm PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE} ${CUTLASS_INCLUDE} )
set_property(TARGET spgemm PROPERTY CUDA_STANDARD 14)
set_property(TARGET spgemm PROPERTY CXX_STANDARD 14)
set_target_properties(spgemm PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(spgemm PRIVATE ${ALL_LIBS})
install (TARGETS spgemm DESTINATION lib)
#include <spgemm/gemm.h>
#include <spgemm/gemm_th.h>
namespace spconv {
template <typename T>
using determine_half_t =
std::conditional_t<std::is_same<T, at::Half>::value, cutlass::half_t, T>;
void cutlass_mm_out(cudaStream_t stream, torch::Tensor c, torch::Tensor a,
torch::Tensor b) {
TV_ASSERT_RT_ERR(c.dtype() == a.dtype() && c.dtype() == b.dtype(),
"dtype must be same");
TV_ASSERT_RT_ERR(c.is_contiguous() && b.is_contiguous() && a.is_contiguous(),
"error");
auto M = a.size(0);
auto K = a.size(1);
auto N = b.size(1);
TV_ASSERT_RT_ERR(b.size(0) == K && c.size(0) == M && c.size(1) == N, "error");
tv::dispatch_torch<float, at::Half>(c.scalar_type(), [&](auto I) {
using T = decltype(I);
using HalfT = determine_half_t<T>;
auto status = cutlassGemm<HalfT, false, false, false>(
stream, M, N, K, HalfT(1.0), reinterpret_cast<HalfT *>(a.data_ptr<T>()),
a.size(1), reinterpret_cast<HalfT *>(b.data_ptr<T>()), b.size(1),
HalfT(0.0), reinterpret_cast<HalfT *>(c.data_ptr<T>()), c.size(1));
TV_ASSERT_RT_ERR(status == cudaSuccess, "error");
});
}
void cutlass_mm_out(torch::Tensor c, torch::Tensor a, torch::Tensor b) {
auto stream = at::cuda::getCurrentCUDAStream();
return cutlass_mm_out(stream, c, a, b);
}
} // namespace spconv
\ No newline at end of file
#define TV_CUDA
#include <cutlass/gemm/device/gemm.h>
#include <spgemm/gemm.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/tensor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
int main() {
auto M = 100000;
auto N = 128;
auto K = 128;
auto a =
torch::rand({M, K}, torch::dtype(torch::kFloat32).device(torch::kCUDA));
auto b = torch::rand({K, N}, a.options());
auto c = torch::zeros({a.size(0), b.size(1)}, a.options());
auto c2 = torch::zeros({a.size(0), b.size(1)}, a.options());
torch::mm_out(c, a, b);
auto status = spconv::cutlassGemm<float, false, false, false>(
0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
auto err = torch::norm(c2 - c);
tv::ssprint(status, "linalg norm", err);
tv::ssprint((c.view({-1}) == 0).sum());
auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < 10; ++i) {
torch::mm_out(c, a, b);
tv::ssprint("mm", timer.report() / 1000.0);
spconv::cutlassGemm<float, false, false, false>(
0, M, N, K, 1.0, a.data_ptr<float>(), a.size(1), b.data_ptr<float>(),
b.size(1), 0.0, c2.data_ptr<float>(), c2.size(1));
tv::ssprint("cutlass_mm", timer.report() / 1000.0);
}
return 0;
}
\ No newline at end of file
import torch
import spconv
import numpy as np
from spconv.utils import VoxelGeneratorV2
from pathlib import Path
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
import time
import spconv
from spconv.utils import VoxelGeneratorV2
def waymo_data(batch_size=1):
gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1, 150000)
gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
150000)
data = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
pc = data["pc"]
data = gen.generate(pc)
......@@ -17,9 +21,9 @@ def waymo_data(batch_size=1):
coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
return voxels, coors, gen.grid_size
class Net(nn.Module):
def __init__(self,
shape):
def __init__(self, shape):
super().__init__()
self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0"),
......@@ -57,7 +61,8 @@ class Net(nn.Module):
)
max_batch_size = 1
# grid (dense map) is used for indice generation. use pre-allocated grid can run faster.
self.grid = torch.full([max_batch_size, *shape], -1, dtype=torch.int32).cuda()
self.grid = torch.full([max_batch_size, *shape], -1,
dtype=torch.int32).cuda()
# self.grid = None
self.shape = shape
......@@ -87,5 +92,6 @@ def main():
# print("spconv time", time.time() - t)
print("spconv time", np.mean(times[10:]))
if __name__ == "__main__":
main()
\ No newline at end of file
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment