Commit 3517290c authored by yanyan's avatar yanyan
Browse files

format code, add benchmark per layer

parent 540a2209
......@@ -21,15 +21,16 @@
namespace spconv {
template <bool UseDeconv, typename Index, unsigned NDim> struct ConvIndiceDispatch;
template <bool UseDeconv, typename Index, unsigned NDim>
struct ConvIndiceDispatch;
template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<true, Index, NDim>{
constexpr static auto* func = getValidOutPosTranspose<Index, NDim>;
struct ConvIndiceDispatch<true, Index, NDim> {
constexpr static auto *func = getValidOutPosTranspose<Index, NDim>;
};
template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<false, Index, NDim>{
constexpr static auto* func = getValidOutPos<Index, NDim>;
struct ConvIndiceDispatch<false, Index, NDim> {
constexpr static auto *func = getValidOutPos<Index, NDim>;
};
template <typename Index, unsigned NDim, bool UseDeconv,
......@@ -61,8 +62,8 @@ __global__ void prepareIndicePairsKernel(
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = ConvIndiceDispatch<UseDeconv, Index, NDim>::func(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(),
outSpatialShape.data(), validPoints);
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
......
......@@ -89,7 +89,6 @@ __global__ void matmul(const Dtype *A, const int wA, const int hA,
// C[wB * out_row + x] += Csub;
}
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void matmul2(const Dtype *A, const int wA, const int hA,
const Dtype *B, const int wB, const int hB,
......
......@@ -7,10 +7,8 @@
namespace spconv {
template <typename Index, unsigned NDim>
__global__ void scatterPointToGridKernel(
tv::TensorView<const float> points,
tv::TensorView<const Index> indexes,
tv::TensorView<float> grids,
tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<const float> points, tv::TensorView<const Index> indexes,
tv::TensorView<float> grids, tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndex,
const tv::SimpleVector<Index, NDim> gridShape) {
Index index;
......@@ -24,14 +22,15 @@ __global__ void scatterPointToGridKernel(
atomicAdd(numPointsPerGrid.data() + index, Index(1));
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
atomicAdd(grids.data() + index * numFeatures + k, *(points.data() + ix * numFeatures + k));
atomicAdd(grids.data() + index * numFeatures + k,
*(points.data() + ix * numFeatures + k));
}
}
}
template <typename Index, unsigned NDim>
__global__ void gatherPointFromGridKernel(
tv::TensorView<const float> grids,
__global__ void
gatherPointFromGridKernel(tv::TensorView<const float> grids,
tv::TensorView<const Index> numPointsPerGrid,
tv::TensorView<const Index> pointIndexUnique,
tv::TensorView<float> voxels,
......@@ -47,14 +46,13 @@ __global__ void gatherPointFromGridKernel(
for (int k = 0; k != numFeatures; ++k) {
voxels(ix, k) = grids(index, k) / numPointsPerGrid(index);
}
index = tv::rowArrayIdxInv<Index, NDim>(
index, coors.data() + ix * NDim, gridShape.data());
index = tv::rowArrayIdxInv<Index, NDim>(index, coors.data() + ix * NDim,
gridShape.data());
}
}
template <typename Index>
__global__ void resetGridKernel(
tv::TensorView<float> grids,
__global__ void resetGridKernel(tv::TensorView<float> grids,
tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndexUnique) {
Index index;
......@@ -72,8 +70,8 @@ __global__ void resetGridKernel(
}
template <typename Index>
__global__ void resetPointIndexKernel(
tv::TensorView<Index> pointIndex, const Index gridVolume) {
__global__ void resetPointIndexKernel(tv::TensorView<Index> pointIndex,
const Index gridVolume) {
int num_max_points = pointIndex.dim(0) - 1;
for (int ix : tv::KernelLoopX<int>(num_max_points)) {
......
......@@ -21,15 +21,10 @@
namespace spconv {
int64_t
pointsToVoxel(torch::Tensor points,
torch::Tensor indexes,
torch::Tensor pointIndex,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor voxels,
torch::Tensor coors,
std::vector<int64_t> gridShape,
int64_t pointsToVoxel(torch::Tensor points, torch::Tensor indexes,
torch::Tensor pointIndex, torch::Tensor grids,
torch::Tensor numPointsPerGrid, torch::Tensor voxels,
torch::Tensor coors, std::vector<int64_t> gridShape,
const int64_t ndim);
} // namespace spconv
......@@ -3,20 +3,16 @@
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
void scatter_point_to_grid_cuda(
torch::Tensor points,
torch::Tensor indexes,
void scatter_point_to_grid_cuda(torch::Tensor points, torch::Tensor indexes,
torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
std::vector<int64_t> gridShape,
const int ndim);
std::vector<int64_t> gridShape, const int ndim);
void gather_point_from_grid_cuda(
torch::Tensor grids, torch::Tensor numPointsPerGrid,
void gather_point_from_grid_cuda(torch::Tensor grids,
torch::Tensor numPointsPerGrid,
torch::Tensor pointIndex,
torch::Tensor pointIndexUnique,
torch::Tensor voxels, torch::Tensor coors,
......
......@@ -23,9 +23,15 @@
namespace spconv {
enum ConvAlgo { kNative = 0, kBatch, kBatchGemmGather, kSparseConvNet, kMinkowskiEngine };
using all_conv_algos_t =
tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather, kSparseConvNet, kMinkowskiEngine>;
enum ConvAlgo {
kNative = 0,
kBatch,
kBatchGemmGather,
kSparseConvNet,
kMinkowskiEngine
};
using all_conv_algos_t = tv::mp_list_c<int, kNative, kBatch, kBatchGemmGather,
kSparseConvNet, kMinkowskiEngine>;
// torch.jit's doc says only support int64, so we need to convert to int32.
std::vector<torch::Tensor>
......
......@@ -7,9 +7,10 @@ Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou,
Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute
(Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert,
Samy Bengio, Johnny Mariethoz)
From Caffe2:
......@@ -53,8 +54,8 @@ modification, are permitted provided that the following conditions are met:
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
and IDIAP Research Institute nor the names of its contributors may be
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories
America and IDIAP Research Institute nor the names of its contributors may be
used to endorse or promote products derived from this software without
specific prior written permission.
......@@ -97,47 +98,67 @@ struct _identity final {
return std::forward<T>(arg);
}
};
template<class Func, class Enable = void>
template <class Func, class Enable = void>
struct function_takes_identity_argument : std::false_type {};
#if defined(_MSC_VER)
// For some weird reason, MSVC shows a compiler error when using guts::void_t instead of std::void_t.
// But we're only building on MSVC versions that have std::void_t, so let's just use that one.
template<class Func>
struct function_takes_identity_argument<Func, std::void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
// For some weird reason, MSVC shows a compiler error when using guts::void_t
// instead of std::void_t. But we're only building on MSVC versions that have
// std::void_t, so let's just use that one.
template <class Func>
struct function_takes_identity_argument<
Func, std::void_t<decltype(std::declval<Func>()(_identity()))>>
: std::true_type {};
#else
template<class Func>
struct function_takes_identity_argument<Func, void_t<decltype(std::declval<Func>()(_identity()))>> : std::true_type {};
template <class Func>
struct function_takes_identity_argument<
Func, void_t<decltype(std::declval<Func>()(_identity()))>>
: std::true_type {};
#endif
template<bool Condition>
struct _if_constexpr;
template<>
struct _if_constexpr<true> final {
template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
// The _identity instance passed in can be used to delay evaluation of an expression,
// because the compiler can't know that it's just the identity we're passing in.
template <bool Condition> struct _if_constexpr;
template <> struct _if_constexpr<true> final {
template <
class ThenCallback, class ElseCallback,
std::enable_if_t<function_takes_identity_argument<ThenCallback>::value,
void *> = nullptr>
static decltype(auto) call(ThenCallback &&thenCallback,
ElseCallback && /* elseCallback */) {
// The _identity instance passed in can be used to delay evaluation of an
// expression, because the compiler can't know that it's just the identity
// we're passing in.
return thenCallback(_identity());
}
template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& thenCallback, ElseCallback&& /* elseCallback */) {
template <
class ThenCallback, class ElseCallback,
std::enable_if_t<!function_takes_identity_argument<ThenCallback>::value,
void *> = nullptr>
static decltype(auto) call(ThenCallback &&thenCallback,
ElseCallback && /* elseCallback */) {
return thenCallback();
}
};
template<>
struct _if_constexpr<false> final {
template<class ThenCallback, class ElseCallback, std::enable_if_t<function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
// The _identity instance passed in can be used to delay evaluation of an expression,
// because the compiler can't know that it's just the identity we're passing in.
template <> struct _if_constexpr<false> final {
template <
class ThenCallback, class ElseCallback,
std::enable_if_t<function_takes_identity_argument<ElseCallback>::value,
void *> = nullptr>
static decltype(auto) call(ThenCallback && /* thenCallback */,
ElseCallback &&elseCallback) {
// The _identity instance passed in can be used to delay evaluation of an
// expression, because the compiler can't know that it's just the identity
// we're passing in.
return elseCallback(_identity());
}
template<class ThenCallback, class ElseCallback, std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value, void*> = nullptr>
static decltype(auto) call(ThenCallback&& /* thenCallback */, ElseCallback&& elseCallback) {
template <
class ThenCallback, class ElseCallback,
std::enable_if_t<!function_takes_identity_argument<ElseCallback>::value,
void *> = nullptr>
static decltype(auto) call(ThenCallback && /* thenCallback */,
ElseCallback &&elseCallback) {
return elseCallback();
}
};
......@@ -173,33 +194,40 @@ struct _if_constexpr<false> final {
* template <class T>
* int func(T t) {
* return if_constexpr<std::is_same<T, MyClass1>::value>(
* [&](auto _) { return _(t).value; }, // this code is invalid for T == MyClass2, so a regular non-constexpr if statement wouldn't compile
* [&](auto _) { return _(t).val; } // this code is invalid for T == MyClass1
* [&](auto _) { return _(t).value; }, // this code is invalid for T ==
* MyClass2, so a regular non-constexpr if statement wouldn't compile
* [&](auto _) { return _(t).val; } // this code is invalid for T ==
* MyClass1
* );
* }
*
* Note: The _ argument passed in Example 3 is the identity function, i.e. it does nothing.
* It is used to force the compiler to delay type checking, because the compiler
* doesn't know what kind of _ is passed in. Without it, the compiler would fail
* when you try to access t.value but the member doesn't exist.
* Note: The _ argument passed in Example 3 is the identity function, i.e. it
* does nothing. It is used to force the compiler to delay type checking,
* because the compiler doesn't know what kind of _ is passed in. Without it,
* the compiler would fail when you try to access t.value but the member doesn't
* exist.
*
* Note: In Example 3, both branches return int, so func() returns int. This is not necessary.
* If func() had a return type of "auto", then both branches could return different
* types, say func<MyClass1>() could return int and func<MyClass2>() could return string.
* Note: In Example 3, both branches return int, so func() returns int. This is
* not necessary. If func() had a return type of "auto", then both branches
* could return different types, say func<MyClass1>() could return int and
* func<MyClass2>() could return string.
*/
template<bool Condition, class ThenCallback, class ElseCallback>
decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCallback) {
template <bool Condition, class ThenCallback, class ElseCallback>
decltype(auto) if_constexpr(ThenCallback &&thenCallback,
ElseCallback &&elseCallback) {
#if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
// This will give us better error messages.
if constexpr(Condition) {
if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
// If we have C++17, just use it's "if constexpr" feature instead of wrapping
// it. This will give us better error messages.
if constexpr (Condition) {
if constexpr (detail::function_takes_identity_argument<
ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else {
return std::forward<ThenCallback>(thenCallback)();
}
} else {
if constexpr (detail::function_takes_identity_argument<ElseCallback>::value) {
if constexpr (detail::function_takes_identity_argument<
ElseCallback>::value) {
return std::forward<ElseCallback>(elseCallback)(detail::_identity());
} else {
return std::forward<ElseCallback>(elseCallback)();
......@@ -207,18 +235,20 @@ decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCall
}
#else
// C++14 implementation of if constexpr
return detail::_if_constexpr<Condition>::call(std::forward<ThenCallback>(thenCallback),
return detail::_if_constexpr<Condition>::call(
std::forward<ThenCallback>(thenCallback),
std::forward<ElseCallback>(elseCallback));
#endif
}
template<bool Condition, class ThenCallback>
decltype(auto) if_constexpr(ThenCallback&& thenCallback) {
template <bool Condition, class ThenCallback>
decltype(auto) if_constexpr(ThenCallback &&thenCallback) {
#if defined(__cpp_if_constexpr)
// If we have C++17, just use it's "if constexpr" feature instead of wrapping it.
// This will give us better error messages.
if constexpr(Condition) {
if constexpr (detail::function_takes_identity_argument<ThenCallback>::value) {
// If we have C++17, just use it's "if constexpr" feature instead of wrapping
// it. This will give us better error messages.
if constexpr (Condition) {
if constexpr (detail::function_takes_identity_argument<
ThenCallback>::value) {
return std::forward<ThenCallback>(thenCallback)(detail::_identity());
} else {
return std::forward<ThenCallback>(thenCallback)();
......@@ -226,9 +256,9 @@ decltype(auto) if_constexpr(ThenCallback&& thenCallback) {
}
#else
// C++14 implementation of if constexpr
return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback), [] (auto) {});
return if_constexpr<Condition>(std::forward<ThenCallback>(thenCallback),
[](auto) {});
#endif
}
}
} // namespace tv
......@@ -22,13 +22,13 @@ If you can use libtorch, dont use tv::Tensor.
*/
#pragma once
#include "cc17.h"
#include "mp_helper.h"
#include "tensorview.h"
#include <cstring>
#include <iomanip>
#include <memory>
#include <type_traits>
#include "cc17.h"
#ifdef TV_CUDA
#include <cuda_fp16.h>
#include <cuda_runtime.h>
......@@ -632,25 +632,30 @@ struct Tensor {
tview() const {
static_assert(Rank == -1 || Rank > 0, "error");
TV_ASSERT_RT_ERR(dtype_ == type_v<T>, "error");
return if_constexpr<(Rank > 0)>([&](auto _){
return if_constexpr<(Rank > 0)>(
[&](auto _) {
TV_ASSERT_RT_ERR(Rank == ndim(), "error");
ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank), stride(Rank);
ShapeBase<_(Rank) == -1 ? TV_MAX_DIM : Rank, Tindex> shape(Rank),
stride(Rank);
for (int i = 0; i < Rank; ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
}, [&](auto _){
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
shape, stride);
},
[&](auto _) {
ShapeBase<TV_MAX_DIM, Tindex> shape(_(ndim())), stride(ndim());
for (int i = 0; i < int(ndim()); ++i) {
shape[i] = shape_[i];
stride[i] = stride_[i];
}
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits, Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()), shape,
stride);
return TensorView<const std::remove_const_t<T>, Rank, PtrTraits,
Tindex>(
reinterpret_cast<const std::remove_const_t<T> *>(data<T>()),
shape, stride);
});
}
template <class... Inds> Tensor view(Inds... newShapes) const {
......
......@@ -36,22 +36,21 @@ template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
return res;
}
template <int Count, typename F>
double benchmark(F&& f, int start=int(Count) * 0.3){
double benchmark(F &&f, int start = int(Count) * 0.3) {
// std::vector<TimeT::rep> times;
auto res = typename TimeT::rep();
int count = 0;
cudaDeviceSynchronize();
for (int i = 0; i < Count; ++i){
for (int i = 0; i < Count; ++i) {
std::forward<F>(f)();
auto time = report();
if (i >= start){
if (i >= start) {
// times.push_back(time)
res += time;
count += 1;
}
}
return res / double(count);
}
private:
......
......@@ -14,9 +14,9 @@
#pragma once
#include "mp_helper.h"
#include <tensorview/tensorview.h>
#include <tensorview/tensor.h>
#include <ATen/ATen.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
#ifdef TV_CUDA
#include <ATen/cuda/CUDAContext.h>
......
......@@ -22,6 +22,7 @@ from spconv import ops, utils
from spconv.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d)
from spconv.core import SparseConvTensor
from spconv.identity import Identity
from spconv.modules import SparseModule, SparseSequential
from spconv.ops import ConvAlgo
......@@ -35,85 +36,6 @@ _LIB_PATH = str(Path(__file__).parent / _LIB_FILE_NAME)
torch.ops.load_library(_LIB_PATH)
def scatter_nd(indices, updates, shape):
"""pytorch edition of tensorflow scatter_nd.
this function don't contain except handle code. so use this carefully
when indice repeats, don't support repeat add which is supported
in tensorflow.
"""
ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
ndim = indices.shape[-1]
output_shape = list(indices.shape[:-1]) + shape[indices.shape[-1]:]
flatted_indices = indices.view(-1, ndim)
slices = [flatted_indices[:, i] for i in range(ndim)]
slices += [Ellipsis]
ret[slices] = updates.view(*output_shape)
return ret
class SparseConvTensor(object):
def __init__(self, features, indices, spatial_shape, batch_size,
grid=None):
"""
Args:
features: [num_points, num_features] feature tensor
indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
spatial_shape: spatial shape of your sparse data
batch_size: batch size of your sparse data
grid: pre-allocated grid tensor. should be used when the volume of spatial shape
is very large.
"""
self.features = features
self.indices = indices
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
if grid is None:
grid = torch.Tensor() # empty tensor
self.grid = grid
@classmethod
def from_dense(cls, x: torch.Tensor):
"""create sparse tensor fron channel last dense tensor by to_sparse
x must be NHWC tensor, channel last
"""
x = x.to_sparse(x.ndim - 1)
spatial_shape = x.shape[1:-1]
batch_size = x.shape[0]
indices_th = x.indices().permute(1, 0).contiguous().int()
features_th = x.values()
return cls(features_th, indices_th, spatial_shape, batch_size)
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key):
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(
self.indices.to(self.features.device).long(), self.features,
output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
@property
def sparity(self):
return self.indices.shape[0] / np.prod(
self.spatial_shape) / self.batch_size
class ToDense(SparseModule):
"""convert SparseConvTensor to NCHW dense tensor.
"""
......
......@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter
import spconv
import spconv.functional as Fsp
from spconv import ops
from spconv.core import IndiceData, SparseConvTensor
from spconv.modules import SparseModule
......@@ -72,8 +73,9 @@ class SparseConvolution(SparseModule):
indice_key=None,
fused_bn=False,
use_hash=False,
algo=ops.ConvAlgo.Native):
super(SparseConvolution, self).__init__()
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvolution, self).__init__(name=name)
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
......@@ -123,8 +125,8 @@ class SparseConvolution(SparseModule):
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def forward(self, input):
assert isinstance(input, spconv.SparseConvTensor)
def forward(self, input: SparseConvTensor):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
......@@ -143,29 +145,58 @@ class SparseConvolution(SparseModule):
out_spatial_shape = spatial_shape
# input.update_grid(out_spatial_shape)
# t = time.time()
out_tensor = input.shadow_copy()
if input.benchmark:
if self.name is None:
raise ValueError(
"you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
)
if self.name not in input.benchmark_record:
input.benchmark_record[self.name] = {
"type": "SparseConvolution",
"indice_gen_time": [],
"time": [],
"num_points": [],
"num_out_points": [],
"params": {
"kernel_size": self.kernel_size,
"stride": self.stride,
"padding": self.padding,
"dilation": self.dilation,
"output_padding": self.output_padding,
"subm": self.subm,
"transposed": self.transposed,
"input_channels": self.in_channels,
"out_channels": self.out_channels,
}
}
if self.conv1x1:
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels))
if self.bias is not None:
features += self.bias
out_tensor = spconv.SparseConvTensor(features, input.indices,
input.spatial_shape,
input.batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
out_tensor.features = features
return out_tensor
datas = input.find_indice_pair(self.indice_key)
if self.inverse:
assert datas is not None and self.indice_key is not None
_, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
outids = datas.indices
indice_pairs = datas.indice_pairs
indice_pair_num = datas.indice_pair_num
out_spatial_shape = datas.out_spatial_shape
assert indice_pair_num.shape[0] == np.prod(
self.kernel_size
), "inverse conv must have same kernel size as its couple conv"
else:
if self.indice_key is not None and datas is not None:
outids, _, indice_pairs, indice_pair_num, _ = datas
outids = datas.out_indices
indice_pairs = datas.indice_pairs
indice_pair_num = datas.indice_pair_num
else:
if input.benchmark:
torch.cuda.synchronize()
t = time.time()
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
indices,
batch_size,
......@@ -179,10 +210,19 @@ class SparseConvolution(SparseModule):
self.transposed,
grid=input.grid,
use_hash=self.use_hash)
input.indice_dict[self.indice_key] = (outids, indices,
indice_pairs,
indice_pair_num,
spatial_shape)
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
out_tensor.benchmark_record[
self.name]["indice_gen_time"].append(interval)
indice_data = IndiceData(outids, indices, indice_pairs,
indice_pair_num, spatial_shape)
input.indice_dict[self.indice_key] = indice_data
if input.benchmark:
torch.cuda.synchronize()
t = time.time()
if self.fused_bn:
assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight,
......@@ -210,10 +250,18 @@ class SparseConvolution(SparseModule):
if self.bias is not None:
out_features += self.bias
out_tensor = spconv.SparseConvTensor(out_features, outids,
out_spatial_shape, batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
out_tensor.benchmark_record[self.name]["time"].append(interval)
out_tensor.benchmark_record[self.name]["num_points"].append(
features.shape[0])
out_tensor.benchmark_record[self.name]["num_out_points"].append(
out_features.shape[0])
out_tensor.features = out_features
out_tensor.indices = outids
out_tensor.spatial_shape = out_spatial_shape
return out_tensor
......@@ -229,7 +277,8 @@ class SparseConv2d(SparseConvolution):
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv2d, self).__init__(2,
in_channels,
out_channels,
......@@ -241,7 +290,8 @@ class SparseConv2d(SparseConvolution):
bias,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseConv3d(SparseConvolution):
......@@ -256,7 +306,8 @@ class SparseConv3d(SparseConvolution):
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv3d, self).__init__(3,
in_channels,
out_channels,
......@@ -268,7 +319,8 @@ class SparseConv3d(SparseConvolution):
bias,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseConv4d(SparseConvolution):
......@@ -283,7 +335,8 @@ class SparseConv4d(SparseConvolution):
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv4d, self).__init__(4,
in_channels,
out_channels,
......@@ -295,7 +348,8 @@ class SparseConv4d(SparseConvolution):
bias,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseConvTranspose2d(SparseConvolution):
......@@ -310,7 +364,8 @@ class SparseConvTranspose2d(SparseConvolution):
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose2d, self).__init__(2,
in_channels,
out_channels,
......@@ -323,7 +378,8 @@ class SparseConvTranspose2d(SparseConvolution):
transposed=True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseConvTranspose3d(SparseConvolution):
......@@ -338,7 +394,8 @@ class SparseConvTranspose3d(SparseConvolution):
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose3d, self).__init__(3,
in_channels,
out_channels,
......@@ -351,7 +408,8 @@ class SparseConvTranspose3d(SparseConvolution):
transposed=True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseInverseConv2d(SparseConvolution):
......@@ -361,7 +419,8 @@ class SparseInverseConv2d(SparseConvolution):
kernel_size,
indice_key,
bias=True,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseInverseConv2d, self).__init__(2,
in_channels,
out_channels,
......@@ -369,7 +428,8 @@ class SparseInverseConv2d(SparseConvolution):
bias=bias,
inverse=True,
indice_key=indice_key,
algo=algo)
algo=algo,
name=name)
class SparseInverseConv3d(SparseConvolution):
......@@ -379,7 +439,8 @@ class SparseInverseConv3d(SparseConvolution):
kernel_size,
indice_key,
bias=True,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseInverseConv3d, self).__init__(3,
in_channels,
out_channels,
......@@ -387,7 +448,8 @@ class SparseInverseConv3d(SparseConvolution):
bias=bias,
inverse=True,
indice_key=indice_key,
algo=algo)
algo=algo,
name=name)
class SubMConv2d(SparseConvolution):
......@@ -402,7 +464,8 @@ class SubMConv2d(SparseConvolution):
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv2d, self).__init__(2,
in_channels,
out_channels,
......@@ -415,7 +478,8 @@ class SubMConv2d(SparseConvolution):
True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SubMConv3d(SparseConvolution):
......@@ -430,7 +494,8 @@ class SubMConv3d(SparseConvolution):
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv3d, self).__init__(3,
in_channels,
out_channels,
......@@ -443,7 +508,8 @@ class SubMConv3d(SparseConvolution):
True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SubMConv4d(SparseConvolution):
......@@ -458,7 +524,8 @@ class SubMConv4d(SparseConvolution):
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv4d, self).__init__(4,
in_channels,
out_channels,
......@@ -471,4 +538,5 @@ class SubMConv4d(SparseConvolution):
True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
from typing import Optional
import numpy as np
import torch
class IndiceData(object):
    """Plain record of the index data cached for one sparse convolution.

    The five constructor arguments are stored unchanged under attributes of
    the same name; no copying or validation is performed.
    """

    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
                 out_spatial_shape):
        # Coordinates before and after the convolution.
        self.indices = indices
        self.out_indices = out_indices
        # Pairing tables that go with those coordinates.
        self.indice_pair_num = indice_pair_num
        self.indice_pairs = indice_pairs
        # Spatial shape recorded together with the pairs.
        self.out_spatial_shape = out_spatial_shape
def scatter_nd(indices, updates, shape):
    """PyTorch counterpart of TensorFlow's ``scatter_nd``.

    Allocates a zero tensor of size ``shape`` and writes ``updates`` into it at
    the positions named by ``indices``.

    Args:
        indices: integer tensor whose last dimension has length ``ndim``; each
            row addresses the first ``ndim`` dimensions of the result.
        updates: values to write. Must be viewable as
            ``indices.shape[:-1] + shape[ndim:]``.
        shape: size of the result. Any sequence of ints is accepted (list,
            tuple or ``torch.Size``).

    Returns:
        A tensor of size ``shape`` with the dtype and device of ``updates``.

    Note:
        No argument validation is done. Repeated indices are NOT accumulated
        (TensorFlow adds them); one of the colliding writes simply wins.
    """
    # Normalise so that the list concatenation below works for tuples and
    # torch.Size as well as lists.
    shape = list(shape)
    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
    ndim = indices.shape[-1]
    trailing_shape = shape[ndim:]
    output_shape = list(indices.shape[:-1]) + trailing_shape
    flatted_indices = indices.view(-1, ndim)
    # Multidimensional indexing must use a tuple; a plain list of index
    # tensors is deprecated and is slated to be interpreted as one tensor.
    slices = tuple(flatted_indices[:, i] for i in range(ndim)) + (Ellipsis, )
    # The view checks that ``updates`` has the documented size; the reshape
    # collapses the leading index dimensions so the value lines up with the
    # flattened index rows.
    values = updates.view(*output_shape).reshape(flatted_indices.shape[0],
                                                 *trailing_shape)
    ret[slices] = values
    return ret
class SparseConvTensor(object):
    """Sparse tensor made of a feature matrix plus the coordinates of its rows."""

    def __init__(self,
                 features,
                 indices,
                 spatial_shape,
                 batch_size,
                 grid=None,
                 benchmark=False):
        """
        Args:
            features: [num_points, num_features] feature tensor.
            indices: [num_points, ndim + 1] index tensor; the batch index is
                stored in indices[:, 0].
            spatial_shape: spatial shape of the sparse data.
            batch_size: batch size of the sparse data.
            grid: pre-allocated grid tensor. Should be supplied when the
                volume of the spatial shape is very large.
            benchmark: whether to enable benchmarking. When enabled, sparse
                operators record their measurements on the tensor.
        """
        self.features = features
        self.indices = indices
        self.spatial_shape = spatial_shape
        self.batch_size = batch_size
        # Cache of per-key index data, filled in by the layers.
        self.indice_dict = {}
        # An empty tensor stands in for "no pre-allocated grid".
        self.grid = torch.Tensor() if grid is None else grid
        self.benchmark = benchmark
        self.benchmark_record = {}

    @classmethod
    def from_dense(cls, x: torch.Tensor):
        """Build a sparse tensor from a channel-last dense tensor via to_sparse.

        x must be an NHWC-style tensor (channels in the last dimension).
        """
        sparse = x.to_sparse(x.ndim - 1)
        coords = sparse.indices().permute(1, 0).contiguous().int()
        values = sparse.values()
        return cls(values, coords, sparse.shape[1:-1], sparse.shape[0])

    @property
    def spatial_size(self):
        """Product of the entries of ``spatial_shape``."""
        return np.prod(self.spatial_shape)

    def find_indice_pair(self, key) -> Optional[IndiceData]:
        """Return the index data cached under ``key``.

        Yields None when ``key`` is None or nothing is stored for it.
        """
        if key is None:
            return None
        return self.indice_dict.get(key)

    def dense(self, channels_first=True):
        """Scatter the features into a dense tensor.

        The result is laid out as [batch, *spatial_shape, channels]. With
        ``channels_first`` the channel dimension is moved to position 1 and
        the result is made contiguous.
        """
        num_channels = self.features.shape[1]
        full_shape = [self.batch_size]
        full_shape += list(self.spatial_shape)
        full_shape += [num_channels]
        coords = self.indices.to(self.features.device).long()
        res = scatter_nd(coords, self.features, full_shape)
        if channels_first:
            ndim = len(self.spatial_shape)
            # batch, channels, then the spatial dimensions in order
            order = [0, ndim + 1] + list(range(1, ndim + 1))
            return res.permute(*order).contiguous()
        return res

    @property
    def sparity(self):
        """Number of stored points divided by spatial volume and batch size."""
        per_sample = self.indices.shape[0] / np.prod(self.spatial_shape)
        return per_sample / self.batch_size

    def shadow_copy(self) -> "SparseConvTensor":
        """Create a new spconv tensor that refers to the very same members.

        Nothing is cloned: the tensors, the index cache and the benchmark
        record are shared with the original object.
        """
        duplicate = SparseConvTensor(self.features, self.indices,
                                     self.spatial_shape, self.batch_size,
                                     self.grid, self.benchmark)
        duplicate.indice_dict = self.indice_dict
        duplicate.benchmark_record = self.benchmark_record
        return duplicate
......@@ -49,7 +49,9 @@ def _mean_update(vals, m_vals, t):
class SparseModule(nn.Module):
    """Placeholder base class.

    Every module that subclasses this one will be handed the sparse tensor
    itself when it runs inside a SparseSequential.
    """

    def __init__(self, name=None):
        super(SparseModule, self).__init__()
        # Optional label for this module; None when not assigned.
        self.name = name
class SparseSequential(SparseModule):
......
......@@ -26,6 +26,7 @@ class ConvAlgo(Enum):
SparseConvNet = 3
Minkowski = 4 # https://github.com/StanfordVL/MinkowskiEngine/blob/master/src/convolution.cu
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
ndim = len(input_size)
output_size = []
......
......@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter
import spconv
import spconv.functional as Fsp
from spconv import ops
from spconv.core import IndiceData
from spconv.modules import SparseModule
......@@ -34,8 +35,10 @@ class SparseMaxPool(SparseModule):
stride=None,
padding=0,
dilation=1,
subm=False):
super(SparseMaxPool, self).__init__()
indice_key=None,
subm=False,
name=None):
super(SparseMaxPool, self).__init__(name=name)
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if stride is None:
......@@ -52,6 +55,7 @@ class SparseMaxPool(SparseModule):
self.padding = padding
self.subm = subm
self.dilation = dilation
self.indice_key = indice_key
def forward(self, input):
assert isinstance(input, spconv.SparseConvTensor)
......@@ -66,6 +70,32 @@ class SparseMaxPool(SparseModule):
self.dilation)
else:
out_spatial_shape = spatial_shape
out_tensor = input.shadow_copy()
if input.benchmark:
if self.name is None:
raise ValueError(
"you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
)
if self.name not in input.benchmark_record:
input.benchmark_record[self.name] = {
"type": "SparseMaxPool",
"indice_gen_time": [],
"time": [],
"num_points": [],
"num_out_points": [],
"params": {
"kernel_size": self.kernel_size,
"stride": self.stride,
"padding": self.padding,
"dilation": self.dilation,
"channels": features.shape[1],
}
}
if input.benchmark:
torch.cuda.synchronize()
t = time.time()
outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
indices,
batch_size,
......@@ -77,24 +107,65 @@ class SparseMaxPool(SparseModule):
0,
self.subm,
grid=input.grid)
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
out_tensor.benchmark_record[self.name]["indice_gen_time"].append(
interval)
t = time.time()
if self.indice_key is not None:
datas = input.find_indice_pair(self.indice_key)
if datas is None:
indice_data = IndiceData(outids, indices, indice_pairs,
indice_pairs_num, spatial_shape)
input.indice_dict[self.indice_key] = indice_data
else:
raise ValueError("indice data exists")
out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device),
outids.shape[0])
out_tensor = spconv.SparseConvTensor(out_features, outids,
out_spatial_shape, batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
out_tensor.benchmark_record[self.name]["time"].append(interval)
out_tensor.benchmark_record[self.name]["num_points"].append(
features.shape[0])
out_tensor.benchmark_record[self.name]["num_out_points"].append(
out_features.shape[0])
out_tensor.features = out_features
out_tensor.indices = outids
out_tensor.spatial_shape = out_spatial_shape
return out_tensor
class SparseMaxPool2d(SparseMaxPool):
    """SparseMaxPool fixed to two spatial dimensions."""

    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=1,
                 name=None,
                 indice_key=None):
        """
        Args:
            kernel_size, stride, padding, dilation: forwarded positionally
                to SparseMaxPool after the dimension count (2).
            name: module name, forwarded to the base class.
            indice_key: forwarded to the base class. Declared last so that
                existing positional callers keep working.
        """
        super(SparseMaxPool2d, self).__init__(2,
                                              kernel_size,
                                              stride,
                                              padding,
                                              dilation,
                                              indice_key=indice_key,
                                              name=name)
class SparseMaxPool3d(SparseMaxPool):
    """SparseMaxPool fixed to three spatial dimensions."""

    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=1,
                 name=None,
                 indice_key=None):
        """
        Args:
            kernel_size, stride, padding, dilation: forwarded positionally
                to SparseMaxPool after the dimension count (3).
            name: module name, forwarded to the base class.
            indice_key: forwarded to the base class. Declared last so that
                existing positional callers keep working.
        """
        super(SparseMaxPool3d, self).__init__(3,
                                              kernel_size,
                                              stride,
                                              padding,
                                              dilation,
                                              indice_key=indice_key,
                                              name=name)
......@@ -24,6 +24,7 @@ from torch.nn.parameter import Parameter
import spconv
from spconv.modules import SparseModule
class RemoveDuplicate(SparseModule):
def forward(self, x: spconv.SparseConvTensor):
inds = x.indices
......@@ -39,5 +40,6 @@ class RemoveDuplicate(SparseModule):
_, unique_inds = torch.unique(indices_index)
new_inds = inds[unique_inds]
new_features = x.features[unique_inds]
res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape, x.batch_size, x.grid)
res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape,
x.batch_size, x.grid)
return res
......@@ -294,20 +294,18 @@ class VoxelGeneratorV2:
def grid_size(self):
return self._grid_size
class VoxelGeneratorV3:
def __init__(self,
voxel_size,
point_cloud_range,
max_points,
num_features,
dtype,
device):
def __init__(self, voxel_size, point_cloud_range, max_points, num_features,
dtype, device):
self._max_points = max_points
self._point_cloud_range = point_cloud_range
self._voxel_size = voxel_size
self._grid_size = torch.round((self._point_cloud_range[3:] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
self._grid_size = torch.round(
(self._point_cloud_range[3:] - self._point_cloud_range[:3]) /
self._voxel_size).to(torch.int32)
grid_volume = self._grid_size.prod()
self._grid_size = self._grid_size.cpu().numpy().tolist()
self._ndim = len(self._grid_size)
......@@ -315,19 +313,34 @@ class VoxelGeneratorV3:
self._dtype = dtype
self._device = device
self._point_index = torch.full([max_points + 1], grid_volume, dtype=torch.int32, device=self._device)
self._grids = torch.zeros([grid_volume, num_features], dtype=self._dtype, device=self._device)
self._num_points_per_grid = torch.zeros([grid_volume], dtype=torch.int32, device=self._device)
self._voxels = torch.zeros([max_points, num_features], dtype=self._dtype, device=self._device)
self._coors = torch.zeros([max_points, self._ndim], dtype=torch.int32, device=self._device)
self._point_index = torch.full([max_points + 1],
grid_volume,
dtype=torch.int32,
device=self._device)
self._grids = torch.zeros([grid_volume, num_features],
dtype=self._dtype,
device=self._device)
self._num_points_per_grid = torch.zeros([grid_volume],
dtype=torch.int32,
device=self._device)
self._voxels = torch.zeros([max_points, num_features],
dtype=self._dtype,
device=self._device)
self._coors = torch.zeros([max_points, self._ndim],
dtype=torch.int32,
device=self._device)
def generate(self, points):
    """Convert ``points`` to the configured dtype/device and voxelize them.

    Args:
        points: tensor whose first dimension is the number of points; it
            must not exceed ``self._max_points``.

    Returns:
        Whatever ``self.points_to_voxel`` returns for the converted points.

    Raises:
        AssertionError: if ``points`` holds more than ``self._max_points``
            rows.
    """
    assert points.shape[
        0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(
            points.shape[0])
    # Tensor.to() returns a new tensor and does not modify its input, so the
    # result has to be re-bound for the conversion to take effect.
    points = points.to(self._dtype).to(self._device)
    return self.points_to_voxel(points)
def generate_multi_gpu(self, points):
    """Convert ``points`` to the configured dtype/device and voxelize them.

    The visible body performs the same steps as ``generate``.

    Args:
        points: tensor whose first dimension is the number of points; it
            must not exceed ``self._max_points``.

    Returns:
        Whatever ``self.points_to_voxel`` returns for the converted points.

    Raises:
        AssertionError: if ``points`` holds more than ``self._max_points``
            rows.
    """
    assert points.shape[
        0] <= self._max_points, 'please enlarge max_points to not smaller than ' + str(
            points.shape[0])
    # Tensor.to() returns a new tensor and does not modify its input, so the
    # result has to be re-bound for the conversion to take effect.
    points = points.to(self._dtype).to(self._device)
    return self.points_to_voxel(points)
......@@ -351,23 +364,21 @@ class VoxelGeneratorV3:
coors_range: [6] list/tuple or array or tensor, float. indicate voxel range.
format: xyzxyz, minmax
"""
indexes = torch.floor((points[:, :3] - self._point_cloud_range[:3]) / self._voxel_size).to(torch.int32)
num_voxel = torch.ops.spconv.points_to_voxel(points, indexes,
self._point_index,
self._grids,
self._num_points_per_grid,
self._voxels,
self._coors,
self._grid_size,
self._ndim)
indexes = torch.floor((points[:, :3] - self._point_cloud_range[:3]) /
self._voxel_size).to(torch.int32)
num_voxel = torch.ops.spconv.points_to_voxel(
points, indexes, self._point_index, self._grids,
self._num_points_per_grid, self._voxels, self._coors,
self._grid_size, self._ndim)
voxels = self._voxels[:num_voxel, :]
coors = self._coors[:num_voxel, :]
# xyz --> zyx
#coors = coors[::-1]
x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]), coors[:, 2].reshape([-1, 1])
x, y, z = coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape(
[-1, 1]), coors[:, 2].reshape([-1, 1])
coors = torch.cat([z, y, x], dim=1)
# can be skipped
# x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
# voxels = torch.cat([z, y, x, f], dim=1)
# x, y, z, f = voxels[:, 0].reshape([-1, 1]), voxels[:, 1].reshape([-1, 1]), voxels[:, 2].reshape([-1, 1]), voxels[:, 3:]
# voxels = torch.cat([z, y, x, f], dim=1)
return voxels, coors
......@@ -15,8 +15,8 @@
#include <ATen/ATen.h>
#include <spconv/fused_conv.cu.h>
#include <spconv/fused_conv.h>
#include <tensorview/torch_utils.h>
#include <spconv/minkowski.cu.h>
#include <tensorview/torch_utils.h>
namespace spconv {
void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
......@@ -81,10 +81,9 @@ void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
int step = (nHot + num_div - 1) / num_div;
dim3 threads(shared_mem_size, shared_mem_size);
tv::dispatch_torch<float>(dtype, [&](auto I) {
using T = decltype(I);
tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue){
tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
constexpr int ShmemSize = decltype(ShSizeValue)::value;
for (int s = 0; s < num_div; s++) {
int remainder = nHot - step * s;
......@@ -93,16 +92,18 @@ void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
(curr_num_active + threads.y - 1) / threads.y);
matmul<T, int32_t, ShmemSize><<<grid, threads, 0, stream>>>(
features.data_ptr<T>(), in_nchannel, curr_num_active,
filters.data_ptr<T>(), out_nchannel,
in_nchannel, output.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
filters.data_ptr<T>(), out_nchannel, in_nchannel,
output.data_ptr<T>(), indicesIn.data_ptr<int32_t>(),
indicesOut.data_ptr<int32_t>());
}
});
});
}
void fused_conv_backward_cuda_minkowski(torch::Tensor features, torch::Tensor din,
torch::Tensor dout, torch::Tensor filters,
torch::Tensor dfilters, torch::Tensor indicesIn,
void fused_conv_backward_cuda_minkowski(torch::Tensor features,
torch::Tensor din, torch::Tensor dout,
torch::Tensor filters,
torch::Tensor dfilters,
torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot) {
auto dtype = features.scalar_type();
auto in_nchannel = features.size(1);
......@@ -131,7 +132,7 @@ void fused_conv_backward_cuda_minkowski(torch::Tensor features, torch::Tensor di
tv::dispatch_torch<float>(dtype, [&](auto I) {
using T = decltype(I);
tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue){
tv::DispatchInt<shmem_sizes_t>()(shared_mem_size, [&](auto ShSizeValue) {
constexpr int ShmemSize = decltype(ShSizeValue)::value;
for (int s = 0; s < num_div; s++) {
int remainder = nHot - step * s;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment