Unverified commit 333536f6, authored by Wenwei Zhang, committed by GitHub

Release v1.0.0rc1

parents 9c7270d0 f747daab
 # Copyright (c) OpenMMLab. All rights reserved.
 from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.ops import SparseModule, SparseSequential
 from torch import nn

-from mmdet3d.ops import spconv
 from mmdet.models.backbones.resnet import BasicBlock, Bottleneck


-class SparseBottleneck(Bottleneck, spconv.SparseModule):
+class SparseBottleneck(Bottleneck, SparseModule):
     """Sparse bottleneck block for PartA^2.

     Bottleneck block implemented with submanifold sparse convolution.
@@ -32,7 +32,7 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
                  conv_cfg=None,
                  norm_cfg=None):
-        spconv.SparseModule.__init__(self)
+        SparseModule.__init__(self)
         Bottleneck.__init__(
             self,
             inplanes,
@@ -65,7 +65,7 @@ class SparseBottleneck(Bottleneck, spconv.SparseModule):
         return out


-class SparseBasicBlock(BasicBlock, spconv.SparseModule):
+class SparseBasicBlock(BasicBlock, SparseModule):
     """Sparse basic block for PartA^2.

     Sparse basic block implemented with submanifold sparse convolution.
@@ -90,7 +90,7 @@ class SparseBasicBlock(BasicBlock, spconv.SparseModule):
                  downsample=None,
                  conv_cfg=None,
                  norm_cfg=None):
-        spconv.SparseModule.__init__(self)
+        SparseModule.__init__(self)
         BasicBlock.__init__(
             self,
             inplanes,
@@ -182,5 +182,5 @@ def make_sparse_convmodule(in_channels,
         elif layer == 'act':
             layers.append(nn.ReLU(inplace=True))
-    layers = spconv.SparseSequential(*layers)
+    layers = SparseSequential(*layers)
     return layers
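
The hunks above swap the bundled mmdet3d.ops.spconv package for the sparse primitives now provided by mmcv.ops. As a rough, non-authoritative sketch of the new imports in use (module names and defaults assumed from the definitions later in this commit), a conv-norm-act stack similar to what make_sparse_convmodule assembles could look like:

# Minimal sketch (assumed API): a submanifold conv block built from the
# mmcv.ops sparse primitives that replace mmdet3d.ops.spconv.
import torch.nn as nn
from mmcv.ops import SparseSequential, SubMConv3d

block = SparseSequential(
    SubMConv3d(16, 32, kernel_size=3, padding=1, indice_key='subm1', bias=False),
    nn.BatchNorm1d(32, eps=1e-3, momentum=0.01),  # applied to the .features tensor
    nn.ReLU(inplace=True),
)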
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d)
from .modules import SparseModule, SparseSequential
from .pool import SparseMaxPool2d, SparseMaxPool3d
from .structure import SparseConvTensor, scatter_nd
__all__ = [
'SparseConv2d',
'SparseConv3d',
'SubMConv2d',
'SubMConv3d',
'SparseConvTranspose2d',
'SparseConvTranspose3d',
'SparseInverseConv2d',
'SparseInverseConv3d',
'SparseModule',
'SparseSequential',
'SparseMaxPool2d',
'SparseMaxPool3d',
'SparseConvTensor',
'scatter_nd',
]
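
For orientation, a minimal usage sketch of the API exported above, assuming the package is importable under this path (it may differ after the migration to mmcv.ops): voxel features plus int32 coordinates with the batch index first, wrapped in a SparseConvTensor and pushed through a submanifold conv.

# Sketch under assumptions: indices are int32, shaped [num_active, 1 + ndim],
# matching SparseConvTensor(features, indices, spatial_shape, batch_size)
# defined in this package.
import torch
from mmdet3d.ops.spconv import SparseConvTensor, SubMConv3d  # assumed import path

features = torch.randn(4, 16).cuda()            # [num_active, in_channels]
indices = torch.tensor([[0, 1, 2, 3],           # (batch_idx, z, y, x)
                        [0, 1, 2, 4],
                        [1, 0, 0, 0],
                        [1, 7, 8, 9]], dtype=torch.int32).cuda()
x = SparseConvTensor(features, indices, [10, 10, 10], 2)
conv = SubMConv3d(16, 32, kernel_size=3, padding=1, indice_key='subm0').cuda()
out = conv(x)                                    # out.features: [num_active, 32]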
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np
import torch
from mmcv.cnn import CONV_LAYERS
from torch.nn import init
from torch.nn.parameter import Parameter
from . import functional as Fsp
from . import ops
from .modules import SparseModule
from .structure import SparseConvTensor
def _calculate_fan_in_and_fan_out_hwio(tensor):
dimensions = tensor.ndimension()
if dimensions < 2:
raise ValueError('fan in and fan out can not be computed for '
                 'tensor with fewer than 2 dimensions')
if dimensions == 2: # Linear
fan_in = tensor.size(-2)
fan_out = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
receptive_field_size = 1
if tensor.dim() > 2:
receptive_field_size = tensor[..., 0, 0].numel()
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
return fan_in, fan_out
class SparseConvolution(SparseModule):
def __init__(self,
ndim,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
subm=False,
output_padding=0,
transposed=False,
inverse=False,
indice_key=None,
fused_bn=False):
super(SparseConvolution, self).__init__()
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if not isinstance(stride, (list, tuple)):
stride = [stride] * ndim
if not isinstance(padding, (list, tuple)):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
if not isinstance(output_padding, (list, tuple)):
output_padding = [output_padding] * ndim
for d, s in zip(dilation, stride):
assert any([s == 1, d == 1]), \
    'stride > 1 and dilation > 1 at the same time is not supported'
self.ndim = ndim
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.conv1x1 = np.prod(kernel_size) == 1
self.stride = stride
self.padding = padding
self.dilation = dilation
self.transposed = transposed
self.inverse = inverse
self.output_padding = output_padding
self.groups = groups
self.subm = subm
self.indice_key = indice_key
self.fused_bn = fused_bn
self.weight = Parameter(
torch.Tensor(*kernel_size, in_channels, out_channels))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
self.reset_parameters()
def reset_parameters(self):
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def forward(self, input):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
spatial_shape = input.spatial_shape
batch_size = input.batch_size
if not self.subm:
if self.transposed:
out_spatial_shape = ops.get_deconv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation, self.output_padding)
else:
out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
# input.update_grid(out_spatial_shape)
# t = time.time()
if self.conv1x1:
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels))
if self.bias is not None:
features += self.bias
out_tensor = SparseConvTensor(features, input.indices,
input.spatial_shape,
input.batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
data = input.find_indice_pair(self.indice_key)
if self.inverse:
assert data is not None and self.indice_key is not None
_, outids, indice_pairs, indice_pair_num, out_spatial_shape = data
assert indice_pairs.shape[0] == np.prod(
self.kernel_size
), 'inverse conv must have same kernel size as its couple conv'
else:
if self.indice_key is not None and data is not None:
outids, _, indice_pairs, indice_pair_num, _ = data
else:
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
indices,
batch_size,
spatial_shape,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
self.output_padding,
self.subm,
self.transposed,
grid=input.grid)
input.indice_dict[self.indice_key] = (outids, indices,
indice_pairs,
indice_pair_num,
spatial_shape)
if self.fused_bn:
assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight,
self.bias,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0], self.inverse,
self.subm)
else:
if self.subm:
out_features = Fsp.indice_subm_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
else:
if self.inverse:
out_features = Fsp.indice_inverse_conv(
features, self.weight, indice_pairs.to(device),
indice_pair_num, outids.shape[0])
else:
out_features = Fsp.indice_conv(features, self.weight,
indice_pairs.to(device),
indice_pair_num,
outids.shape[0])
if self.bias is not None:
out_features += self.bias
out_tensor = SparseConvTensor(out_features, outids, out_spatial_shape,
batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
return out_tensor
@CONV_LAYERS.register_module(force=True)
class SparseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConvTranspose2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseConvTranspose3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SparseConvTranspose3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
transposed=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseInverseConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key,
bias=True):
super(SparseInverseConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SparseInverseConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
indice_key,
bias=True):
super(SparseInverseConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
bias=bias,
inverse=True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SubMConv2d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv2d, self).__init__(
2,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SubMConv3d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv3d, self).__init__(
3,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
@CONV_LAYERS.register_module(force=True)
class SubMConv4d(SparseConvolution):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
indice_key=None):
super(SubMConv4d, self).__init__(
4,
in_channels,
out_channels,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
True,
indice_key=indice_key)
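
SparseConvolution.forward caches (outids, indices, indice_pairs, indice_pair_num, spatial_shape) in the input tensor's indice_dict under indice_key, so layers that share a key reuse the same index pairs, and a SparseInverseConv* must name the key of the strided conv it undoes (their kernel volumes must match). A hedged sketch of that pairing, with an assumed import path and arbitrary channel sizes:

# Sketch: a strided SparseConv3d downsamples, and a SparseInverseConv3d with
# the same indice_key ('down1') and kernel size restores the original indices.
import torch.nn as nn
from mmdet3d.ops.spconv import (SparseConv3d, SparseInverseConv3d,  # assumed path
                                SparseSequential)

net = SparseSequential(
    SparseConv3d(16, 32, kernel_size=3, stride=2, padding=1, indice_key='down1'),
    nn.ReLU(inplace=True),
    SparseInverseConv3d(32, 16, kernel_size=3, indice_key='down1'),
)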
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from torch.autograd import Function
from . import ops as ops
class SparseConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False)
return input_bp, filters_bp, None, None, None
class SparseInverseConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, True, False)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
True, False)
return input_bp, filters_bp, None, None, None
class SubMConvFunction(Function):
@staticmethod
def forward(ctx, features, filters, indice_pairs, indice_pair_num,
num_activate_out):
ctx.save_for_backward(indice_pairs, indice_pair_num, features, filters)
return ops.indice_conv(features, filters, indice_pairs,
indice_pair_num, num_activate_out, False, True)
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, filters = ctx.saved_tensors
input_bp, filters_bp = ops.indice_conv_backward(
features, filters, grad_output, indice_pairs, indice_pair_num,
False, True)
return input_bp, filters_bp, None, None, None
class SparseMaxPoolFunction(Function):
@staticmethod
def forward(ctx, features, indice_pairs, indice_pair_num,
num_activate_out):
out = ops.indice_maxpool(features, indice_pairs, indice_pair_num,
num_activate_out)
ctx.save_for_backward(indice_pairs, indice_pair_num, features, out)
return out
@staticmethod
def backward(ctx, grad_output):
indice_pairs, indice_pair_num, features, out = ctx.saved_tensors
input_bp = ops.indice_maxpool_backward(features, out, grad_output,
indice_pairs, indice_pair_num)
return input_bp, None, None, None
indice_conv = SparseConvFunction.apply
indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class T>
int getTotalSize(std::vector<T> arg) {
return arg.size();
}
template <class T, class... TArgs>
int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename T>
int getSize(std::vector<T> arg) {
return arg.size();
}
template <int Idx, class TT, class T>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class T, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
#endif
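
paramsGrid enumerates the Cartesian product of its argument vectors as tuples, using the nested counters as an odometer with the last axis varying fastest. Functionally this matches Python's itertools.product; a small equivalence sketch:

# Python analogue of a call like paramsGrid({1, 2}, {0.1, 0.2, 0.3}):
# every (int, float) combination, last argument varying fastest.
from itertools import product

grid = list(product([1, 2], [0.1, 0.2, 0.3]))
# [(1, 0.1), (1, 0.2), (1, 0.3), (2, 0.1), (2, 0.2), (2, 0.3)]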
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace pretty_print {
namespace detail {
// SFINAE type trait to detect whether T::const_iterator exists.
struct sfinae_base {
using yes = char;
using no = yes[2];
};
template <typename T>
struct has_const_iterator : private sfinae_base {
private:
template <typename C>
static yes &test(typename C::const_iterator *);
template <typename C>
static no &test(...);
public:
static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
using type = T;
};
template <typename T>
struct has_begin_end : private sfinae_base {
private:
template <typename C>
static yes &
f(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
const>(&C::begin)),
typename C::const_iterator (C::*)() const>::value>::type *);
template <typename C>
static no &f(...);
template <typename C>
static yes &g(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (
C::*)() const>(&C::end)),
typename C::const_iterator (C::*)() const>::value,
void>::type *);
template <typename C>
static no &g(...);
public:
static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};
} // namespace detail
// Holds the delimiter values for a specific character type
template <typename TChar>
struct delimiters_values {
using char_type = TChar;
const char_type *prefix;
const char_type *delimiter;
const char_type *postfix;
};
// Defines the delimiter values for a specific container and character type
template <typename T, typename TChar>
struct delimiters {
using type = delimiters_values<TChar>;
static const type values;
};
// Functor to print containers. You can use this directly if you want
// to specify a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
template <typename T, typename TChar = char,
typename TCharTraits = ::std::char_traits<TChar>,
typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
using delimiters_type = TDelimiters;
using ostream_type = std::basic_ostream<TChar, TCharTraits>;
template <typename U>
struct printer {
static void print_body(const U &c, ostream_type &stream) {
using std::begin;
using std::end;
auto it = begin(c);
const auto the_end = end(c);
if (it != the_end) {
for (;;) {
stream << *it;
if (++it == the_end) break;
if (delimiters_type::values.delimiter != NULL)
stream << delimiters_type::values.delimiter;
}
}
}
};
print_container_helper(const T &container) : container_(container) {}
inline void operator()(ostream_type &stream) const {
if (delimiters_type::values.prefix != NULL)
stream << delimiters_type::values.prefix;
printer<T>::print_body(container_, stream);
if (delimiters_type::values.postfix != NULL)
stream << delimiters_type::values.postfix;
}
private:
const T &container_;
};
// Specialization for pairs
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::pair<T1, T2>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
stream << c.first;
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << c.second;
}
};
// Specialization for tuples
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::tuple<Args...>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
using element_type = std::tuple<Args...>;
template <std::size_t I>
struct Int {};
static void print_body(const element_type &c, ostream_type &stream) {
tuple_print(c, stream, Int<0>());
}
static void tuple_print(const element_type &, ostream_type &,
Int<sizeof...(Args)>) {}
static void tuple_print(
const element_type &c, ostream_type &stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>,
std::nullptr_t>::type) {
stream << std::get<0>(c);
tuple_print(c, stream, Int<1>());
}
template <std::size_t N>
static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << std::get<N>(c);
tuple_print(c, stream, Int<N + 1>());
}
};
// Prints a print_container_helper to the specified stream.
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &stream,
const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
helper(stream);
return stream;
}
// Basic is_container template; specialize to derive from std::true_type for all
// desired container types
template <typename T>
struct is_container
: public std::integral_constant<bool,
detail::has_const_iterator<T>::value &&
detail::has_begin_end<T>::beg_value &&
detail::has_begin_end<T>::end_value> {};
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};
template <std::size_t N>
struct is_container<char[N]> : std::false_type {};
template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};
template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};
template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters
template <typename T>
struct delimiters<T, char> {
static const delimiters_values<char> values;
};
template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
template <typename T>
struct delimiters<T, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
L"]"};
// Delimiters for (multi)set and unordered_(multi)set
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple
template <typename T1, typename T2>
struct delimiters<std::pair<T1, T2>, char> {
static const delimiters_values<char> values;
};
template <typename T1, typename T2>
const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
"(", ", ", ")"};
template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T1, typename T2>
const delimiters_values<wchar_t>
delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
template <typename... Args>
struct delimiters<std::tuple<Args...>, char> {
static const delimiters_values<char> values;
};
template <typename... Args>
const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
"(", ", ", ")"};
template <typename... Args>
struct delimiters<::std::tuple<Args...>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename... Args>
const delimiters_values<wchar_t>
delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
struct custom_delims_base {
virtual ~custom_delims_base() {}
virtual std::ostream &stream(::std::ostream &) = 0;
virtual std::wostream &stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
custom_delims_wrapper(const T &t_) : t(t_) {}
std::ostream &stream(std::ostream &s) {
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
t);
}
std::wostream &stream(std::wostream &s) {
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
Delims>(t);
}
private:
const T &t;
};
template <typename Delims>
struct custom_delims {
template <typename Container>
custom_delims(const Container &c)
: base(new custom_delims_wrapper<Container, Delims>(c)) {}
std::unique_ptr<custom_delims_base> base;
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
return p.base->stream(s);
}
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template <typename T>
struct array_wrapper_n {
typedef const T *const_iterator;
typedef T value_type;
array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
inline const_iterator begin() const { return _array; }
inline const_iterator end() const { return _array + _n; }
private:
const T *const _array;
size_t _n;
};
// A wrapper for hash-table based containers that offer local iterators to each
// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket
// 4 of container m.)
template <typename T>
struct bucket_print_wrapper {
typedef typename T::const_local_iterator const_iterator;
typedef typename T::size_type size_type;
const_iterator begin() const { return m_map.cbegin(n); }
const_iterator end() const { return m_map.cend(n); }
bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
private:
const T &m_map;
const size_type n;
};
} // namespace pretty_print
// Global accessor functions for the convenience wrappers
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
size_t n) {
return pretty_print::array_wrapper_n<T>(a, n);
}
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
typename T::size_type n) {
return pretty_print::bucket_print_wrapper<T>(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std {
// Prints a container to the stream using default delimiters
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
return stream
<< ::pretty_print::print_container_helper<T, TChar, TCharTraits>(
container);
}
} // namespace std
#endif // H_PRETTY_PRINT
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <iostream>
#include <pybind11/embed.h> // everything needed for embedding
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <tensorview/tensorview.h>
namespace py = pybind11;
template <typename T, typename TPyObject>
std::vector<T> array2Vector(TPyObject arr){
py::array arr_np = arr;
size_t size = arr.attr("size").template cast<size_t>();
py::array_t<T> arr_cc = arr_np;
std::vector<T> data(arr_cc.data(), arr_cc.data() + size);
return data;
}
template <typename T>
std::vector<T> arrayT2Vector(py::array_t<T> arr)
{
std::vector<T> data(arr.data(), arr.data() + arr.size());
return data;
}
template <typename T, typename TPyObject>
tv::TensorView<T> array2TensorView(TPyObject arr){
py::array arr_np = arr;
py::array_t<T> arr_cc = arr_np;
tv::Shape shape;
for (int i = 0; i < arr_cc.ndim(); ++i){
shape.push_back(arr_cc.shape(i));
}
return tv::TensorView<T>(arr_cc.mutable_data(), shape);
}
template <typename T>
tv::TensorView<T> arrayT2TensorView(py::array_t<T> arr){
tv::Shape shape;
for (int i = 0; i < arr.ndim(); ++i){
shape.push_back(arr.shape(i));
}
return tv::TensorView<T>(arr.mutable_data(), shape);
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FUSED_SPARSE_CONV_OP_H_
#define FUSED_SPARSE_CONV_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit only supports int64 scalars, so the _inverse/_subM flags arrive
// as int64_t and are converted to bool below.
template <typename T>
torch::Tensor fusedIndiceConvBatchNorm(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center offset of a subm conv needs no gather/scatter-add,
            // so it is handled with a single dense GEMM below.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, possibly due to the int->long conversion:
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv
#endif
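
The loop above is the usual gather-GEMM-scatter formulation of sparse convolution: for each kernel offset, gather the input rows recorded for that offset, multiply them by the offset's filter slice, and scatter-add the products into the corresponding output rows, with the submanifold center offset handled as one dense GEMM over all features. A plain PyTorch sketch of the same pattern (names and shapes illustrative, not the library API):

# Reference sketch of gather -> GEMM -> scatter-add sparse convolution.
# indice_pairs: int tensor [K, 2, L]; indice_pair_num: [K]; filters: [K, Cin, Cout].
import torch

def indice_conv_reference(features, filters, indice_pairs, indice_pair_num, num_out):
    out = features.new_zeros(num_out, filters.shape[-1])
    for k in range(indice_pairs.shape[0]):          # one GEMM per kernel offset
        n = int(indice_pair_num[k])
        if n == 0:
            continue
        inp_idx = indice_pairs[k, 0, :n].long()
        out_idx = indice_pairs[k, 1, :n].long()
        gathered = features[inp_idx]                 # gather
        partial = gathered @ filters[k]              # GEMM
        out.index_add_(0, out_idx, partial)          # scatter-add
    return out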
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <tensorview/tensorview.h>
#include <iostream>
#include <limits>
namespace spconv {
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
const Index *kernelSize,
const Index *stride, const Index *padding,
const Index *dilation,
const Index *outSpatialShape, Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
stride[i] + padding[i]) /
stride[i];
uppers[i] = (input_pos[i] + padding[i]) / stride[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
const Index *input_pos, const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation, const Index *outSpatialShape,
Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = input_pos[i] * stride[i] - padding[i];
uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (val - lowers[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
// indicesOut: num_active * kernelVolume * (NDim + 1)
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index index = 0;
for (int j = 0; j < numActIn; ++j) {
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
gridsOut[index] = j;
}
for (int j = 0; j < numActIn; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
if (gridsOut[index] > -1) {
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
}
return numActIn;
}
} // namespace spconv
#endif
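
In words, getIndicePairsConv visits every active input voxel, enumerates the output positions it can reach under each kernel offset, assigns a fresh output index the first time a position is seen (via the dense gridsOut lookup), and appends an (input, output) pair to that offset's list. A toy 1-D Python sketch of the same bookkeeping (dilation fixed to 1, names invented):

# Toy 1-D index-pair builder mirroring getIndicePairsConv's bookkeeping.
def build_indice_pairs_1d(active_in, kernel_size, stride, padding, out_len):
    grid = {}                                   # dense grid: out position -> out id
    out_positions = []                          # newly created active output positions
    pairs = [[] for _ in range(kernel_size)]    # per-offset (in_id, out_id) pairs
    for in_id, x in enumerate(active_in):
        for k in range(kernel_size):            # kernel offset: x == o*stride - padding + k
            num = x + padding - k
            if num % stride:
                continue
            o = num // stride
            if not (0 <= o < out_len):
                continue
            if o not in grid:                    # first time this output is seen
                grid[o] = len(out_positions)
                out_positions.append(o)
            pairs[k].append((in_id, grid[o]))
    return out_positions, pairs

# e.g. active inputs at x = [0, 4, 5], kernel 3, stride 2, padding 1, out_len 3
print(build_indice_pairs_1d([0, 4, 5], 3, 2, 1, 3))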
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <spconv/geometry.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/tensorview.h>
namespace spconv {
template <typename Index, typename IndexGrid, unsigned NDim,
int KernelMaxVolume = 256>
__global__ void prepareIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(offset, 0, oldNum) = ix;
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
spatialVolume * indicesIn(ix, 0);
indicePairs(offset, 1, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim,
int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(offset, 0, oldNum) = ix;
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
spatialVolume * indicesIn(ix, 0);
indicePairs(offset, 1, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
int numAct, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
Index index;
auto indicesOutPtr = indicesOut.data();
for (int ix : tv::KernelLoopX<int>(numAct)) {
index = indicePairUnique[ix];
gridsOut[index] = ix;
index = tv::rowArrayIdxInv<Index, NDim>(
index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
indicesOut[ix * (NDim + 1)] = index % batchSize;
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignIndicePairsKernel(
tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
int numActIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
Index index;
int kernelVolume = indicePairs.dim(0);
for (int ix : tv::KernelLoopX<int>(numActIn)) {
for (int i = 0; i < kernelVolume; ++i) {
index = indicePairs(i, 1, ix);
if (index > -1) {
indicePairs(i, 1, ix) = gridsOut[index];
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
outSpatialShape.data()) +
spatialVolume * indicesIn(ix, 0);
gridsOut[index] = ix;
}
}
template <typename Index, typename IndexGrid, unsigned NDim,
int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (int i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
spatialVolume * indicesIn(ix, 0);
if (gridsOut[index] > -1) {
auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(offset, 1, oldNum) = gridsOut[index];
indicePairs(offset, 0, oldNum) = ix;
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
tv::TensorView<IndexGrid> gridsOut,
int numAct) {
for (int ix : tv::KernelLoopX<int>(numAct)) {
gridsOut[indicePairUnique[ix]] = -1;
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridSubMKernel(
const Index *indices, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
int outSpatialShapeReg[NDim];
for (int i = 0; i < NDim; ++i) {
outSpatialShapeReg[i] = outSpatialShape[i];
}
Index spatialVolume = 1;
auto indsPtr = indices;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index index;
for (int ix : tv::KernelLoopX<int>(numAct)) {
indsPtr = indices + ix * (NDim + 1);
index = tv::rowArrayIdx<Index, NDim>(indsPtr + 1, outSpatialShapeReg);
gridsOut[index + spatialVolume * indsPtr[0]] = -1;
}
}
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
} // namespace functor
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct SparseMaxPoolForwardFunctor {
void operator()(const Device& d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename T, typename Index>
struct SparseMaxPoolBackwardFunctor {
void operator()(const Device& d, tv::TensorView<const T> outFeatures,
tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size);
};
} // namespace functor
} // namespace spconv
#endif
#ifndef MP_HELPER_H_
#define MP_HELPER_H_
#include <type_traits>
#include <utility>
namespace spconv {
template <class... T>
struct mp_list {};
template <class T, T... I>
using mp_list_c = mp_list<std::integral_constant<T, I>...>;
namespace detail {
template <class... T, class F>
constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
}
template <class F>
constexpr F mp_for_each_impl(mp_list<>, F &&f) {
return std::forward<F>(f);
}
} // namespace detail
namespace detail {
template <class A, template <class...> class B>
struct mp_rename_impl {
// An error "no type named 'type'" here means that the first argument to
// mp_rename is not a list
};
template <template <class...> class A, class... T, template <class...> class B>
struct mp_rename_impl<A<T...>, B> {
using type = B<T...>;
};
} // namespace detail
template <class A, template <class...> class B>
using mp_rename = typename detail::mp_rename_impl<A, B>::type;
template <class L, class F>
constexpr F mp_for_each(F &&f) {
return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
}
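// Illustrative usage (not part of the original header): iterate over a
// compile-time list of integral constants, e.g. to dispatch on a template
// parameter such as NDim:
//   mp_for_each<mp_list_c<int, 1, 2, 3, 4>>([&](auto I) {
//     constexpr int kNDim = decltype(I)::value;
//     // instantiate code templated on kNDim here
//   });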
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using Eigen matrices as arguments.
// must include pybind11/stl.h if using STL containers as arguments.
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <algorithm>
// #include <vector>
#include <math.h>
#include <iostream>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
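  // CPU voxelization: each point is mapped to an integer voxel coordinate
  // (stored in reversed axis order), voxels receive indices in order of first
  // appearance via coor_to_voxelidx, and at most max_points points per voxel /
  // max_voxels voxels overall are kept. coor_to_voxelidx is restored to -1 at
  // the end so the caller can reuse the buffer.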
auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
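// Same as points_to_voxel_3d_np, but additionally maintains a running mean of
// the features in each voxel and fills the unused point slots of every voxel
// with that mean afterwards.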
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
py::array_t<DType> voxels,
py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
}
}
return voxel_num;
}
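// Variant that also tracks the per-feature minimum (height) and maximum (maxs)
// inside each voxel; after voxelization, height is overwritten with the
// max - min range for every feature.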
template <typename DType, int NDim>
int points_to_voxel_3d_np_height(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto height_rw = height.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
height_rw(voxelidx, k) =
std::min(points_rw(i, k), height_rw(voxelidx, k));
maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
for (int k = 0; k < num_features; ++k) {
height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
}
}
return voxel_num;
}
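// Flags points lying in blocks whose vertical extent (max z - min z) is below
// eps by writing 0 into mask, presumably to drop near-flat (e.g. ground-like)
// regions before voxelization.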
template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
py::array_t<DType> height, py::array_t<DType> maxs,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_voxels, DType eps) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto mask_rw = mask.mutable_unchecked<1>();
  auto height_rw = height.template mutable_unchecked<1>();
auto maxs_rw = maxs.template mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
}
height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
}
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if ((maxs_rw(voxelidx) - height_rw(voxelidx)) < eps) {
      mask_rw(i) = 0;
    }
  }
  return voxel_num;
}
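// Voxelization combined with block-based height filtering: the point cloud's
// min/max z is tracked on a grid downsampled by block_factor, and voxel_mask
// marks voxels whose block_size x block_size block neighborhood has a height
// range larger than height_threshold.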
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> voxel_mask, py::array_t<DType> mins,
py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
DType height_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
DType max_value, min_value;
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int block_shape_H = grid_size[1] / block_factor;
int block_shape_W = grid_size[0] / block_factor;
int voxelidx, num;
int block_coor[2];
int startx, stopx, starty, stopy;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) =
std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor[1] = coors_rw(i, 1);
coor[2] = coors_rw(i, 2);
coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0] - block_size / 2);
stopx =
std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
}
voxel_mask_rw(i) = (max_value - min_value) > height_threshold;
}
return voxel_num;
}
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_POOL_OP_H_
#define SPARSE_POOL_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/maxpool.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
template <typename T>
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numAct) {
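  // For each kernel offset with nHot valid index pairs, the forward functor
  // reduces the gathered input rows into the matching output rows (max pooling
  // over the active sites covered by that offset).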
auto device = features.device().type();
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numAct, numInPlanes}, options);
double totalTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
if (device == torch::kCPU) {
functor::SparseMaxPoolForwardFunctor<tv::CPU, T, int> forwardFtor;
forwardFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else {
functor::SparseMaxPoolForwardFunctor<tv::GPU, T, int> forwardFtor;
forwardFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR();
}
// totalTime += timer.report() / 1000.0;
}
// std::cout << "maxpool forward time " << totalTime << std::endl;
return output;
}
template <typename T>
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum) {
auto device = features.device().type();
auto numInPlanes = features.size(1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
auto kernelVolume = indicePairs.size(0);
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0) {
continue;
}
if (device == torch::kCPU) {
functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, int> backwardFtor;
backwardFtor(tv::CPU(), tv::torch2tv<const T>(outFeatures),
tv::torch2tv<const T>(features),
tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
} else {
functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, int> backwardFtor;
backwardFtor(tv::TorchGPU(), tv::torch2tv<const T>(outFeatures),
tv::torch2tv<const T>(features),
tv::torch2tv<const T>(outGrad), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const int>(indicePairs).subview(i), nHot);
TV_CHECK_CUDA_ERR();
}
}
return inputGrad;
}
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <tensorview/helper_kernel.cu.h>
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace spconv {
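// Gather kernels copy the feature rows selected by `indices` into a contiguous
// buffer so one dense GEMM can be applied per kernel offset; the scatter-add
// kernels accumulate the GEMM results back into the output rows. NumTLP/NumILP
// control thread- and instruction-level parallelism, and the Vec variants use
// vectorized (e.g. int4) loads and stores.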
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
Index inds[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size)
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
features[inds[ilp] + iy];
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void gatherVecKernel(T *buffer, const T *features,
const Index *indices, int size, int numPlanes) {
int ILPStrideX[NumILP];
Index inds[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size)
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP,
typename VecType = int4>
__global__ void gatherVecBlockKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes) {
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
features += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
reinterpret_cast<VecType *>(
buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
reinterpret_cast<const VecType *>(
features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
threadIdx.x];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
Index inds[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size) {
outFeatures[inds[ilp] + iy] +=
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP,
typename VecType = int4>
__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
outFeatures += blockIdx.x * NumTLP;
buffer += blockIdx.x * NumTLP;
T buf[vecloadFactor];
T buf2[vecloadFactor];
Index idx;
for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(buf)[0] =
reinterpret_cast<VecType *>(outFeatures)[idx];
reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
buf[i] += buf2[i];
}
reinterpret_cast<VecType *>(outFeatures)[idx] =
reinterpret_cast<VecType *>(buf)[0];
}
}
}
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct SparseGatherFunctor {
void operator()(const Device& d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename T, typename Index>
struct SparseScatterAddFunctor {
void operator()(const Device& d, tv::TensorView<T> out_features,
tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size,
bool stable = false);
};
} // namespace functor
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_OP_H_
#define SPARSE_CONV_OP_H_
#include <cuda_runtime_api.h>
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <torch/script.h>
#include <torch_utils.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit only supports int64 arguments, so the int64 sizes passed in are
// converted to int32 before use.
template <unsigned NDim>
std::vector<torch::Tensor> getIndicePair(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
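  // Builds the "rule book" for one sparse convolution layer. indicePairs has
  // shape [kernelVolume, 2, numAct]: for kernel offset k, indicePairs[k][0][n]
  // is the n-th input index and indicePairs[k][1][n] the matching output
  // index, while indiceNum[k] counts the valid pairs. gridOut is a dense
  // batchSize * outputVolume scratch buffer mapping output positions to
  // output indices.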
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor gridOut =
torch::full({batchSize * outputVolume}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
// std::cout << "full time " << timer.report() / 1000.0 << std::endl;
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
template <unsigned NDim>
std::vector<torch::Tensor> getIndicePairPreGrid(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
// auto timer = spconv::CudaContextTimer<>();
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1; // batchIdx + xyz
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
// std::cout << "full time " << timer.report() / 1000.0 << std::endl;
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, true);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose, true);
gridOut.fill_(-1);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::GPU, int, int, NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::GPU, int, int, NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
true);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
template <typename T>
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM) {
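  // Gather-GEMM-ScatterAdd: for each kernel offset, gather the nHot active
  // input rows into inputBuffer, multiply them by that offset's
  // (numInPlanes x numOutPlanes) filter slice, and scatter-add the products
  // into the output rows given by the rule book.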
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
  if (subM) {  // the center offset of subm conv doesn't need gather and
               // scatter-add: a plain GEMM on the full feature tensor suffices.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
      /* slower than SparseGatherFunctor, possibly due to the int->long conversion
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
{nHot}, indicePairOptions); torch::index_select_out(inputBufferBlob,
features, 0, indicePairBlob);*/
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
template <typename T>
std::vector<torch::Tensor> indiceConvBackward(torch::Tensor features,
torch::Tensor filters,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t _inverse, int64_t _subM) {
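  // Backward pass reuses the rule book: per kernel offset, the filter gradient
  // is gathered_inputs^T @ gathered_outGrad and the input gradient is
  // scatter-added from gathered_outGrad @ filters[i]^T. The subm center offset
  // is handled with plain GEMMs, mirroring the forward pass.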
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(tv::CPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBuffer),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(tv::TorchGPU(), tv::torch2tv<T>(outputBuffer),
tv::torch2tv<const T>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer.data_ptr<T>(),
{nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(inputGrad),
tv::torch2tv<const T>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template <typename T>
torch::Tensor indiceConvDevelopDontUse(torch::Tensor features,
torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse,
int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto totalActsTen = indicePairNumCpu.sum();
auto totalActs = indicePairNumCpu.data_ptr<int>()[0];
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
std::vector<int> indicePairNumVec(
indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
int subRuleMaxSize =
*std::max_element(indicePairNumVec.begin(), indicePairNumVec.end());
if (subM) {
indicePairMaxSize = subRuleMaxSize;
}
auto timer = spconv::CudaContextTimer<>();
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({kernelVolume, indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
std::cout << "create time " << timer.report() / 1000.0 << std::endl;
  if (subM) {  // the center offset of subm conv doesn't need gather and
               // scatter-add: a plain GEMM on the full feature tensor suffices.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
// auto timer = spconv::CudaContextTimer<>();
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
//
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, T, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::GPU, T, int> gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<T>(inputBufferBlob),
tv::torch2tv<const T>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
// }
// for (int i = 0; i < kernelVolume; ++i) {
// totalGatherTime += timer.report() / 1000.0;
// auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
// {nHot, numOutPlanes}, options);
}
// totalGatherTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options);
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
}
// totalGEMMTime += timer.report() / 1000.0;
// totalGEMMTime += timer.report() / 1000.0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
auto outputBufferBlob = torch::from_blob(outputBuffer[i].data_ptr<T>(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(inputBuffer[i].data_ptr<T>(),
{nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, T, int> scatterFtor;
scatterFtor(tv::CPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
} else {
functor::SparseScatterAddFunctor<tv::GPU, T, int> scatterFtor;
scatterFtor(tv::TorchGPU(), tv::torch2tv<T>(output),
tv::torch2tv<const T>(outputBufferBlob),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot, true);
TV_CHECK_CUDA_ERR();
}
// totalSAddTime += timer.report() / 1000.0;
}
// totalSAddTime += timer.report() / 1000.0;
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv
#endif
#pragma once
// from tensorflow
namespace tv {
namespace detail {
template <typename T>
class KernelLoop {
struct Iterator {
__forceinline__ __device__ Iterator(T index, T delta)
: index_(index), delta_(delta) {}
__forceinline__ __device__ T operator*() const { return index_; }
__forceinline__ __device__ Iterator &operator++() {
index_ += delta_;
return *this;
}
__forceinline__ __device__ bool operator!=(const Iterator &other) const {
bool greater = index_ > other.index_;
bool less = index_ < other.index_;
// Anything past an end iterator (delta_ == 0) is equal.
// In range-based for loops, this optimizes to 'return less'.
if (!other.delta_) {
return less;
}
if (!delta_) {
return greater;
}
return less || greater;
}
private:
T index_;
const T delta_;
};
public:
__forceinline__ __device__ KernelLoop(T begin, T delta, T end)
: begin_(begin), delta_(delta), end_(end) {}
__forceinline__ __device__ Iterator begin() const {
return Iterator{begin_, delta_};
}
__forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
private:
T begin_;
T delta_;
T end_;
};
} // namespace detail
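// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }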
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
gridDim.x * blockDim.x * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
gridDim.y * blockDim.y * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
gridDim.z * blockDim.z * NumILP, count);
}
} // namespace tv
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv
{
namespace launch
{
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) { return (a + b - 1) / b; }
// Use 1024 threads per block, which requires cuda sm_2x or above
constexpr int CUDA_NUM_THREADS = 1024;
// CUDA: number of blocks for threads.
inline int getBlocks(const int N)
{
TV_ASSERT_RT_ERR(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
return DivUp(N, CUDA_NUM_THREADS);
}
} // namespace launch
} // namespace tv