Commit 02eb1d56 authored by yanyan's avatar yanyan
Browse files

try to increase half performance

parent 75b46f90
...@@ -17,6 +17,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags ...@@ -17,6 +17,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) # avoid gnu++11 be added to CXX flags
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
add_compile_definitions(TV_DEBUG) add_compile_definitions(TV_DEBUG)
endif() endif()
# add_compile_definitions(TV_LOG_KERNEL_INFO)
find_package(Torch REQUIRED) find_package(Torch REQUIRED)
# set(CMAKE_VERBOSE_MAKEFILE ON) # set(CMAKE_VERBOSE_MAKEFILE ON)
......
## Performance Guide
### 1. Regular sparse conv is very slow
Regular sparse convolution will greatly increase the number of active points. for 3x3x3 3D convolution, we can get at most 27x active points, which means next convolution will perform 27x slower!
This problem can be solved by using submanifold convolution (SubMConv3d). This kind of sparse convolution doesn't generate new active points.
**NEVER** use SparseConv3d except downsample data, **NEVER** use SparseConv3dTranspose, use SparseInverseConv3d instead.
### 2. Large Spatial Shape cost too much GPU memory
Our implementation use dense map to generate indices in GPU for sparse convolution, which means if your spatial shape is ```[batchSize=4, 1600, 1600, 40]```, it will cost ~2GB GPU memory.
To solve this problem, you can use CPU algorithm (hash map) for first layer that has large shape, then convert generated indices to GPU and use GPU algorithm for downsampled data.
Another way is use cuda hash. Unfortunately this library isn't stable enough, it should only be used when the spatial shape is very large.
### 3. Stacked submanifold convolution can share same indice data
When you using stacked subm convolution, there is no need to generate indice data again, but this can't be done automatically. you need to specify a unique key ```indice_key="c0"``` and use it for all stacked subm convolution.
### 4. Different convolution algorithm may lead to different performance
There are three kind of algorithm: ```Native```, ```Batch```, ```BatchGemmGather```.
* ```Native```: should be used for all submanifold convolutions. should be used when there are too much active points.
* ```Batch```: **cost more GPU memory** should be used when number of active points is small.
* ```BatchGemmGather```: **cost more GPU memory** can be used for regular convolution.
\ No newline at end of file
...@@ -272,6 +272,8 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer, ...@@ -272,6 +272,8 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
int numPlanes) { int numPlanes) {
int ILPStrideX[NumILP]; int ILPStrideX[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T); constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
constexpr int vecloadHalf2Factor = sizeof(VecType) / sizeof(__half2);
#pragma unroll #pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
...@@ -288,16 +290,65 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer, ...@@ -288,16 +290,65 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
reinterpret_cast<VecType *>(outFeatures)[idx]; reinterpret_cast<VecType *>(outFeatures)[idx];
reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>( reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]; buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
if (std::is_same<T, at::Half>::value) {
#pragma unroll
for (int i = 0; i < vecloadHalf2Factor; i++) {
reinterpret_cast<__half2 *>(buf)[i] =
__hadd2(reinterpret_cast<__half2 *>(buf)[i],
reinterpret_cast<__half2 *>(buf2)[i]);
}
} else {
#pragma unroll #pragma unroll
for (int i = 0; i < vecloadFactor; i++) { for (int i = 0; i < vecloadFactor; i++) {
buf[i] += buf2[i]; buf[i] += buf2[i];
} }
}
reinterpret_cast<VecType *>(outFeatures)[idx] = reinterpret_cast<VecType *>(outFeatures)[idx] =
reinterpret_cast<VecType *>(buf)[0]; reinterpret_cast<VecType *>(buf)[0];
} }
} }
} }
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
outFeatures += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
outFeatures[indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y] += buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
outFeatures += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
Index idx;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
reinterpret_cast<__half2 *>(outFeatures)[idx] =
__hadd2(reinterpret_cast<__half2 *>(outFeatures)[idx],
reinterpret_cast<__half2 *>(buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP> template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer, __global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
const Index *indices, int size, const Index *indices, int size,
......
#include "NvInfer.h"
#include <memory>
#include <tensorview/tensor.h>
#include <unordered_map>
#include <vector>
namespace trt {
template <typename T> tv::DType trt_dtype_to_tv(T trt_dtype) {
switch (trt_dtype) {
case nvinfer1::DataType::kFLOAT:
return tv::float32;
case nvinfer1::DataType::kHALF:
return tv::float16;
case nvinfer1::DataType::kINT32:
return tv::int32;
case nvinfer1::DataType::kINT8:
return tv::int8;
default:;
}
TV_THROW_INVALID_ARG("unknown trt dtype");
}
struct InferDeleter {
template <typename T> void operator()(T *obj) const {
if (obj) {
obj->destroy();
}
}
};
template <typename T> using trt_unique_ptr_t = std::unique_ptr<T, InferDeleter>;
class Logger : public nvinfer1::ILogger {
public:
Logger(Severity severity = Severity::kWARNING)
: reportableSeverity(severity) {}
void log(Severity severity, const char *msg) override {
// suppress messages with severity enum value greater than the reportable
if (severity > reportableSeverity)
return;
switch (severity) {
case Severity::kINTERNAL_ERROR:
std::cerr << "INTERNAL_ERROR: ";
break;
case Severity::kERROR:
std::cerr << "ERROR: ";
break;
case Severity::kWARNING:
std::cerr << "WARNING: ";
break;
case Severity::kINFO:
std::cerr << "INFO: ";
break;
default:
std::cerr << "UNKNOWN: ";
break;
}
std::cerr << msg << std::endl;
}
Severity reportableSeverity;
};
class InferenceContext {
public:
explicit InferenceContext(const std::string& engine_bin, int device)
: logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) {
TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id");
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (device >= deviceCount) {
TV_THROW_INVALID_ARG("you provide device ", device, " but you only have ",
deviceCount, " device.");
}
cudaSetDevice(device);
auto runtime = trt_unique_ptr_t<nvinfer1::IRuntime>(
nvinfer1::createInferRuntime(logger_));
engine_ =
trt_unique_ptr_t<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(
engine_bin.c_str(), engine_bin.size(), nullptr));
ctx_ = trt_unique_ptr_t<nvinfer1::IExecutionContext>(
engine_->createExecutionContext());
max_batch_size_ = engine_->getMaxBatchSize();
for (int i = 0; i < engine_->getNbBindings(); ++i) {
auto dims = engine_->getBindingDimensions(i);
std::vector<int> shape_vec(dims.d, dims.d + dims.nbDims);
shape_vec.insert(shape_vec.begin(), {max_batch_size_});
tv::TensorShape shape(shape_vec);
std::string name = engine_->getBindingName(i);
auto trt_dtype = engine_->getBindingDataType(i);
auto tv_dtype = trt_dtype_to_tv(trt_dtype);
bool isInput = engine_->bindingIsInput(i);
name_to_idx_[name] = i;
idx_to_name_[i] = name;
name_to_host_mem_.insert({name, tv::Tensor(shape, tv_dtype, -1)});
name_to_dev_mem_.insert({name, tv::Tensor(shape, tv_dtype, 0)});
if (isInput)
inp_idxes_.push_back(i);
else
out_idxes_.push_back(i);
bindings_.push_back(name_to_dev_mem_[name].raw_data());
}
checkCudaErrors(cudaStreamCreate(&stream_));
}
std::unordered_map<std::string, tv::Tensor>
operator()(std::vector<tv::Tensor> inputs) {
TV_ASSERT_INVALID_ARG(inputs.size() == inp_idxes_.size(), "must provide",
inp_idxes_.size(), "inputs, but got", inputs.size());
// inference batch size
int bs = inputs[0].dim(0);
for (auto &inp : inputs) {
TV_ASSERT_INVALID_ARG(inp.dim(0) == bs,
"batch sizes of all input must same");
}
TV_ASSERT_INVALID_ARG(bs <= max_batch_size_, "your batchsize too large", bs,
max_batch_size_);
for (int i = 0; i < inputs.size(); ++i) {
auto &dev_mem = name_to_dev_mem_[idx_to_name_[i]];
auto shape_inp = inputs[i].shape().subshape(1);
auto shape_dev = dev_mem.shape().subshape(1);
TV_ASSERT_INVALID_ARG(shape_inp == shape_dev,
"shape except batch must same", shape_inp,
shape_dev);
dev_mem.slice_first_axis(0, bs).copy_(inputs[i].slice_first_axis(0, bs),
stream_);
}
ctx_->enqueue(bs, bindings_.data(), stream_, nullptr);
for (int i : out_idxes_) {
name_to_host_mem_[idx_to_name_[i]].slice_first_axis(0, bs).copy_(
name_to_dev_mem_[idx_to_name_[i]].slice_first_axis(0, bs), stream_);
}
checkCudaErrors(cudaStreamSynchronize(stream_));
std::unordered_map<std::string, tv::Tensor> output_map;
for (int i = 0; i < out_idxes_.size(); ++i) {
auto name = idx_to_name_[out_idxes_[i]];
output_map[name] = name_to_host_mem_[name].slice_first_axis(0, bs);
}
return output_map;
}
std::unordered_map<std::string, tv::Tensor>
operator()(std::unordered_map<std::string, tv::Tensor> inputs) {
std::vector<tv::Tensor> inputs_vec(inp_idxes_.size());
int count = 0;
for (auto &p : inputs) {
auto iter = name_to_idx_.find(p.first);
TV_ASSERT_INVALID_ARG(iter != name_to_idx_.end(), "cant find your name",
p.first);
inputs_vec[name_to_idx_[p.first]] = p.second;
}
TV_ASSERT_INVALID_ARG(count == inp_idxes_.size(), "your inp not enough");
return (*this)(inputs_vec);
}
tv::Tensor operator[](std::string name){
auto iter = name_to_host_mem_.find(name);
if (iter == name_to_host_mem_.end()){
TV_THROW_INVALID_ARG(name, "not found.");
}
return iter->second;
}
std::string repr() {
std::stringstream ss;
ss << "InferenceContext[gpu=" << device_ << "]";
ss << "\n Inputs:";
std::string name;
for (auto &i : inp_idxes_) {
name = idx_to_name_[i];
auto &mem = name_to_host_mem_[name];
ss << "\n " << name << "[" << tv::detail::typeString(mem.dtype())
<< "]: " << mem.shape();
}
ss << "\n Outputs:";
for (auto &i : out_idxes_) {
name = idx_to_name_[i];
auto &mem = name_to_host_mem_[name];
ss << "\n " << name << "[" << tv::detail::typeString(mem.dtype())
<< "]: " << mem.shape();
}
return ss.str();
}
private:
Logger logger_;
trt_unique_ptr_t<nvinfer1::ICudaEngine> engine_;
trt_unique_ptr_t<nvinfer1::IExecutionContext> ctx_;
std::unordered_map<std::string, tv::Tensor> name_to_dev_mem_;
std::unordered_map<std::string, tv::Tensor> name_to_host_mem_;
std::unordered_map<std::string, int> name_to_idx_;
std::unordered_map<int, std::string> idx_to_name_;
std::vector<int> inp_idxes_;
std::vector<int> out_idxes_;
std::vector<void *> bindings_;
cudaStream_t stream_;
int max_batch_size_;
int device_;
};
} // namespace trtplus
...@@ -63,7 +63,7 @@ class CMakeBuild(build_ext): ...@@ -63,7 +63,7 @@ class CMakeBuild(build_ext):
# must add following flags to use at::Half # must add following flags to use at::Half
# but will remove raw half operators. # but will remove raw half operators.
cuda_flags += ["-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__"] cuda_flags += ["-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__"]
cuda_flags += ["-D__CUDA_NO_HALF2_OPERATORS__"] # cuda_flags += ["-D__CUDA_NO_HALF2_OPERATORS__"]
cmake_args += ['-DCMAKE_CUDA_FLAGS=' + " ".join(cuda_flags)] cmake_args += ['-DCMAKE_CUDA_FLAGS=' + " ".join(cuda_flags)]
cfg = 'Debug' if self.debug else 'Release' cfg = 'Debug' if self.debug else 'Release'
assert cfg == "Release", "pytorch ops don't support debug build." assert cfg == "Release", "pytorch ops don't support debug build."
......
...@@ -61,8 +61,6 @@ class SparseConvTensor(object): ...@@ -61,8 +61,6 @@ class SparseConvTensor(object):
""" """
self.features = features self.features = features
self.indices = indices self.indices = indices
if self.indices.dtype != torch.int32:
self.indices = self.indices.int()
self.spatial_shape = spatial_shape self.spatial_shape = spatial_shape
self.batch_size = batch_size self.batch_size = batch_size
self.indice_dict = {} self.indice_dict = {}
......
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
namespace spconv { namespace spconv {
using max_kernel_vol_t = tv::mp_list_c<int, 16, 32, 256, 4096>; using max_kernel_vol_t = tv::mp_list_c<int, 9, 16, 27, 32, 128, 256, 4096>;
int create_conv_indice_pair_p1_cuda( int create_conv_indice_pair_p1_cuda(
torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum, torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
...@@ -76,6 +76,12 @@ int create_conv_indice_pair_p1_cuda( ...@@ -76,6 +76,12 @@ int create_conv_indice_pair_p1_cuda(
pa, di, ou); pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed"); TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
} }
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim, MaxKernelVolume, ">", attr.numRegs);
#endif
}); });
}); });
}); });
...@@ -158,6 +164,15 @@ int create_conv_indice_pair_p2_cuda( ...@@ -158,6 +164,15 @@ int create_conv_indice_pair_p2_cuda(
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), ou); tv::torch2tv<Index>(indicePairUnique), ou);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed"); TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>));
tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim, ">", attr.numRegs);
cudaFuncAttributes attr2;
checkCudaErrors(cudaFuncGetAttributes(&attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>));
tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">", attr2.numRegs);
#endif
} }
if (resetGrid && (!useHash)) { if (resetGrid && (!useHash)) {
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
namespace spconv { namespace spconv {
using float_types_t = tv::mp_list<float, double, at::Half>; using float_types_t = tv::mp_list<float, double, at::Half>;
using int_types_t = tv::mp_list<int32_t, int64_t>; using int_types_t = tv::mp_list<int32_t, int64_t>;
void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features, void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) { torch::Tensor indices, int size) {
int numPlanes = features.size(1); int numPlanes = features.size(1);
...@@ -57,9 +56,10 @@ void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -57,9 +56,10 @@ void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
Index *indices_data = indices.data_ptr<Index>(); Index *indices_data = indices.data_ptr<Index>();
const T *buffer_data = buffer.data_ptr<T>(); const T *buffer_data = buffer.data_ptr<T>();
T *features_data = outFeatures.data_ptr<T>(); T *features_data = outFeatures.data_ptr<T>();
at::parallel_for(0, size, 0, [&](int64_t begin, int64_t end) {
const T *buf = buffer.data_ptr<T>(); const T *buf = buffer.data_ptr<T>();
T *out = outFeatures.data_ptr<T>(); T *out = outFeatures.data_ptr<T>();
for (int i = 0; i < size; ++i) { for (int i = begin; i < end; ++i) {
buf = buffer_data + i * numPlanes; buf = buffer_data + i * numPlanes;
out = features_data + indices_data[i] * numPlanes; out = features_data + indices_data[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j) { for (int j = 0; j < numPlanes; ++j) {
...@@ -68,6 +68,7 @@ void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -68,6 +68,7 @@ void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
} }
}); });
}); });
});
} }
} // namespace spconv } // namespace spconv
...@@ -29,6 +29,11 @@ namespace spconv { ...@@ -29,6 +29,11 @@ namespace spconv {
using float_types_t = tv::mp_list<float, double, at::Half>; using float_types_t = tv::mp_list<float, double, at::Half>;
using int_types_t = tv::mp_list<int32_t, int64_t>; using int_types_t = tv::mp_list<int32_t, int64_t>;
template <typename T>
using half_vec_t = std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
template <typename T>
using half_vec_sadd_t = std::conditional_t<std::is_same<T, at::Half>::value, int4, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size) { torch::Tensor indices, int size) {
...@@ -40,9 +45,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, ...@@ -40,9 +45,7 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
auto inds_dtype = indices.scalar_type(); auto inds_dtype = indices.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) { tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue); using T = decltype(TValue);
using vecload_type_t = using vecload_type_t = half_vec_t<T>;
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) { tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue); using Index = decltype(IndexValue);
bool notFound = true; bool notFound = true;
...@@ -63,7 +66,12 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, ...@@ -63,7 +66,12 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(), stream>>>(buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock, indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, gatherVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>));
tv::ssprint("gatherVecBlockKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
if (size - nHotBlock > 0) { if (size - nHotBlock > 0) {
...@@ -74,6 +82,12 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, ...@@ -74,6 +82,12 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
features.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>() + nHotBlock, indices.data_ptr<Index>() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor); size - nHotBlock, numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, gatherVecKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>));
tv::ssprint("gatherVecKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -89,6 +103,12 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, ...@@ -89,6 +103,12 @@ void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>( dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
buffer.data_ptr<T>(), features.data_ptr<T>(), buffer.data_ptr<T>(), features.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes); indices.data_ptr<Index>(), size, numPlanes);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, gatherGenericKernel<T, Index, NumTLP, NumILP>));
tv::ssprint("gatherGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
}); });
...@@ -106,9 +126,7 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -106,9 +126,7 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) { tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue); using T = decltype(TValue);
using vecload_type_t = using vecload_type_t = half_vec_sadd_t<T>;
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) { tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue); using Index = decltype(IndexValue);
bool notFound = true; bool notFound = true;
...@@ -131,6 +149,13 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -131,6 +149,13 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(), stream>>>(outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), nHotBlock, indices.data_ptr<Index>(), nHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>));
tv::ssprint("scatterAddVecBlockKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
if (size - nHotBlock > 0) { if (size - nHotBlock > 0) {
...@@ -140,6 +165,11 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -140,6 +165,11 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
buffer.data_ptr<T>() + nHotBlock * numPlanes, buffer.data_ptr<T>() + nHotBlock * numPlanes,
indices.data_ptr<Index>() + nHotBlock, indices.data_ptr<Index>() + nHotBlock,
size - nHotBlock, numPlanes); size - nHotBlock, numPlanes);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
tv::ssprint("scatterAddGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -155,6 +185,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures, ...@@ -155,6 +185,12 @@ void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>( dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), buffer.data_ptr<T>(), outFeatures.data_ptr<T>(), buffer.data_ptr<T>(),
indices.data_ptr<Index>(), size, numPlanes); indices.data_ptr<Index>(), size, numPlanes);
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(&attr, scatterAddGenericKernel<T, Index, int(NumTLP), NumILP>));
tv::ssprint("notfound scatterAddGenericKernel<", tv::type_s<T>, tv::type_s<Index>, int(NumTLP), NumILP, ">", attr.numRegs);
#endif
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
}); });
...@@ -176,9 +212,7 @@ void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features, ...@@ -176,9 +212,7 @@ void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
int feature_stride = buffer.size(1); int feature_stride = buffer.size(1);
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) { tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue); using T = decltype(TValue);
using vecload_type_t = using vecload_type_t = half_vec_t<T>;
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) { tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue); using Index = decltype(IndexValue);
bool notFound = true; bool notFound = true;
...@@ -251,9 +285,7 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer, ...@@ -251,9 +285,7 @@ void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) { tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = decltype(TValue); using T = decltype(TValue);
using vecload_type_t = using vecload_type_t = half_vec_sadd_t<T>;
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) { tv::DispatchTorch<int_types_t>()(inds_dtype, [&](auto IndexValue) {
using Index = decltype(IndexValue); using Index = decltype(IndexValue);
bool notFound = true; bool notFound = true;
......
...@@ -37,7 +37,7 @@ class SparseConv3dTestTorch(nn.Module): ...@@ -37,7 +37,7 @@ class SparseConv3dTestTorch(nn.Module):
stride, stride,
padding, padding,
dilation, dilation,
algo=spconv.ConvAlgo.BatchGemmGather): algo=spconv.ConvAlgo.Native):
super().__init__() super().__init__()
layers = [ layers = [
spconv.SparseConv3d(in_channels, spconv.SparseConv3d(in_channels,
...@@ -349,7 +349,7 @@ def scatter_nd(indices, updates, shape): ...@@ -349,7 +349,7 @@ def scatter_nd(indices, updates, shape):
class TestSpConv(TestCase): class TestSpConv(TestCase):
def testSpConv3d(self): def testSpConv3d(self):
np.random.seed(484) np.random.seed(484)
devices = ["cuda:0"] devices = ["cpu:0"]
shapes = [[19, 18, 17]] shapes = [[19, 18, 17]]
batchsizes = [1, 2] batchsizes = [1, 2]
...@@ -615,13 +615,13 @@ class TestSpConv(TestCase): ...@@ -615,13 +615,13 @@ class TestSpConv(TestCase):
self.assertAllClose(din_np, din_sparse_np, atol=1e-4) self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
def main(algo=spconv.ConvAlgo.Native): def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
# function for develop. # function for develop.
np.random.seed(484) np.random.seed(484)
# devices = ["cuda:0"] # devices = ["cuda:0"]
devices = ["cuda:0"] devices = ["cuda:0"]
shapes = [[400, 400, 15]] shapes = [[400, 400, 15]]
batchsizes = [1] batchsizes = [2]
in_channels = [32] in_channels = [32]
out_channels = [64] out_channels = [64]
...@@ -648,15 +648,15 @@ def main(algo=spconv.ConvAlgo.Native): ...@@ -648,15 +648,15 @@ def main(algo=spconv.ConvAlgo.Native):
indices_t = torch.from_numpy(indices) indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC,
OC]).astype(np.float32) OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device).float() indices_t = torch.from_numpy(indices).int().to(device).to(dtype)
features_t = torch.from_numpy(features).to(device).float() features_t = torch.from_numpy(features).to(device).to(dtype)
features_dense_t = torch.from_numpy(features_dense).to(device).float() features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d, net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d,
algo=algo).to(device).float() algo=algo).to(device).to(dtype)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device).float() d).to(device).to(dtype)
filters_t = torch.from_numpy(filters).to(device).float() filters_t = torch.from_numpy(filters).to(device).to(dtype)
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous() 2).contiguous()
net.net[0].weight[:] = filters_t net.net[0].weight[:] = filters_t
...@@ -682,7 +682,7 @@ def main(algo=spconv.ConvAlgo.Native): ...@@ -682,7 +682,7 @@ def main(algo=spconv.ConvAlgo.Native):
out_numpy.sum()) out_numpy.sum())
def main_subm(algo): def main_subm(algo, dtype=torch.float32):
# function for develop. # function for develop.
np.random.seed(484) np.random.seed(484)
torch.manual_seed(50051) torch.manual_seed(50051)
...@@ -703,7 +703,7 @@ def main_subm(algo): ...@@ -703,7 +703,7 @@ def main_subm(algo):
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue continue
device = torch.device(dev) device = torch.device(dev)
num_points = [240000] * bs num_points = [120000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
...@@ -715,15 +715,15 @@ def main_subm(algo): ...@@ -715,15 +715,15 @@ def main_subm(algo):
indices_t = torch.from_numpy(indices) indices_t = torch.from_numpy(indices)
filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC,
OC]).astype(np.float32) OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device).float() indices_t = torch.from_numpy(indices).int().to(device).to(dtype)
features_t = torch.from_numpy(features).to(device).float() features_t = torch.from_numpy(features).to(device).to(dtype)
features_dense_t = torch.from_numpy(features_dense).to(device).float() features_dense_t = torch.from_numpy(features_dense).to(device).to(dtype)
net = SubMConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d, net = SubMConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d,
algo=algo).to(device).float() algo=algo).to(device).to(dtype)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device).float() d).to(device).to(dtype)
filters_t = torch.from_numpy(filters).to(device).float() filters_t = torch.from_numpy(filters).to(device).to(dtype)
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous() 2).contiguous()
net.net[0].weight[:] = filters_t net.net[0].weight[:] = filters_t
...@@ -741,17 +741,17 @@ def main_subm(algo): ...@@ -741,17 +741,17 @@ def main_subm(algo):
# print(out.indices) # print(out.indices)
out = out.dense() out = out.dense()
out_numpy = out.detach().cpu().numpy() out_numpy = out.detach().cpu().numpy()
print( # print(
np.linalg.norm(out.detach().cpu().numpy() - # np.linalg.norm(out.detach().cpu().numpy() -
out_ref.detach().cpu().numpy())) # out_ref.detach().cpu().numpy()))
print(out_numpy.min(), out_numpy.max(), out_numpy.mean(), print(out_numpy.min(), out_numpy.max(), out_numpy.mean(),
out_numpy.sum()) out_numpy.sum())
return out_numpy return out_numpy
if __name__ == '__main__': if __name__ == '__main__':
# main_subm(algo=spconv.ConvAlgo.BatchGemmGather) main(algo=spconv.ConvAlgo.Native, dtype=torch.float32)
# out_ref = main_subm(algo=spconv.ConvAlgo.Native) main(algo=spconv.ConvAlgo.Native, dtype=torch.half)
# TestCase().assertAllClose(out_my, out_ref) # TestCase().assertAllClose(out_my, out_ref)
# unittest.main() # unittest.main()
TestSpConv().testSpConv3d() # TestSpConv().testSpConv3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment