Commit a6abf55d authored by yan.yan
Browse files

Merge branch 'develop'

parents fad30002 79a3eaf2
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from cumm.common import TensorViewKernel, ThrustLib
from cumm.conv.bases import ConvOpType, NHWC
from cumm.conv.params import ConvProblem
from cumm import dtypes
import pccm
from ccimport import compat
from .pointops import Point2Voxel, Point2VoxelCPU
from .indices import SparseConvIndicesKernel, CudaCommonKernel
from .maxpool import IndiceMaxPool
class SpconvOps(pccm.Class):
    """pccm code generator for the Python-visible sparse-convolution ops.

    For every supported spatial rank the constructor registers the
    point-to-voxel generators and a ``SparseConvIndicesKernel``.  The index
    generation methods emit C++ dispatchers that read the rank at run time
    and forward to the matching ``SpconvIndices{ndim}D`` implementation.
    """

    def __init__(self):
        super().__init__()
        # Spatial ranks for which specialisations are generated; the
        # dispatchers below emit one ``if (ndim == k)`` branch per entry.
        self.ndims = [1, 2, 3, 4]
        for ndim in self.ndims:
            p2v = Point2Voxel(dtypes.float32, ndim)
            p2v_cpu = Point2VoxelCPU(dtypes.float32, ndim)
            self.add_param_class(f"ops{ndim}d", p2v, f"Point2Voxel{ndim}D")
            self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu, f"Point2Voxel{ndim}DCPU")
            problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
            indices = SparseConvIndicesKernel(problem, dtypes.int32)
            # The index kernels are attached through add_impl_only_param_class
            # and tied to the functions listed here.
            # NOTE(review): presumably this keeps SpconvIndices{ndim}D out of
            # the public header - confirm against pccm.
            cuda_funcs = [
                self.generate_subm_conv_inds,
                self.generate_conv_inds_stage1,
                self.generate_conv_inds_stage1_5,
                self.generate_conv_inds_stage2,
                self.sort_1d_by_key,
            ]
            self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_conv_inds_stage1(self):
        """Emit the rank dispatcher for stage 1 of index generation.

        The generated C++ derives ``ndim`` from ``indices.dim(1) - 1``,
        asserts that every per-axis vector has that length, copies the
        vectors into fixed-size ``tv::array`` values and forwards to
        ``SpconvIndices{ndim}D::generate_conv_inds_stage1``.  An unsupported
        rank raises the "unknown ndim" runtime error.

        Returns:
            The ``pccm.FunctionCode``; no return type is declared here.
        """
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
        code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
""")
        for ndim in self.ndims:
            code.raw(f"""
if (ndim == {ndim}){{
tv::array<int, {ndim}> output_dims_, input_dims_;
tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
for (int i = 0; i < {ndim}; ++i){{
output_dims_[i] = output_dims[i];
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
stride_[i] = stride[i];
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
indice_pairs, indice_pairs_uniq, indice_num_per_loc,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_, transposed, stream_int);
}}
""")
        # Reached only when none of the generated branches matched.
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_conv_inds_stage1_5(self):
        """Emit the rank dispatcher for stage 1.5 (sort + unique).

        Unlike the other stages the rank is passed in explicitly as ``ndim``.
        The generated C++ forwards ``indice_pairs_uniq``, ``uniq_size`` and
        ``stream_int`` to ``SpconvIndices{ndim}D::generate_conv_inds_stage1_5``
        and returns its ``int`` result.
        """
        code = pccm.FunctionCode()
        code.arg("indice_pairs_uniq", "tv::Tensor")
        code.arg("ndim", "int")
        code.arg("uniq_size", "int64_t")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
        for ndim in self.ndims:
            code.raw(f"""
if (ndim == {ndim}){{
return SpconvIndices{ndim}D::generate_conv_inds_stage1_5(indice_pairs_uniq, uniq_size, stream_int);
}}
""")
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code.ret("int")

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_conv_inds_stage2(self):
        """Emit the rank dispatcher for stage 2 of index generation.

        Same validation and vector-to-array conversion as stage 1; forwards
        to ``SpconvIndices{ndim}D::generate_conv_inds_stage2`` and returns
        its ``int`` result.
        """
        code = pccm.FunctionCode()
        code.arg("indices, hashdata", "tv::Tensor")
        code.arg("indice_pairs, indice_pairs_uniq, out_inds", "tv::Tensor")
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
        code.arg("transposed", f"bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
        code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
""")
        for ndim in self.ndims:
            code.raw(f"""
if (ndim == {ndim}){{
tv::array<int, {ndim}> output_dims_, input_dims_;
tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
for (int i = 0; i < {ndim}; ++i){{
output_dims_[i] = output_dims[i];
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
stride_[i] = stride[i];
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_conv_inds_stage2(indices, hashdata,
indice_pairs, indice_pairs_uniq, out_inds, num_out_act,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_, transposed, stream_int);
}}
""")
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code.ret("int")

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_subm_conv_inds(self):
        """Emit the rank dispatcher for submanifold index generation.

        The generated C++ derives ``ndim`` from ``indices.dim(1) - 1``,
        checks the lengths of ``input_dims``, ``ksize`` and ``dilation``,
        converts them to ``tv::array`` and forwards to
        ``SpconvIndices{ndim}D::generate_subm_conv_inds``, returning its
        ``int`` result.  ``indice_pair_mask`` defaults to an empty tensor.
        """
        code = pccm.FunctionCode()
        code.arg("indices, hashdata", "tv::Tensor")
        code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("input_dims", f"std::vector<int>")
        code.arg("ksize, dilation", f"std::vector<int>")
        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
        code.arg("backward", "bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0")
        code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(input_dims.size() == ndim &&
ksize.size() == ndim && dilation.size() == ndim, "your params size not equal to ndim", ndim);
""")
        for ndim in self.ndims:
            code.raw(f"""
if (ndim == {ndim}){{
tv::array<int, {ndim}> input_dims_;
tv::array<int, {ndim}> ksize_, dilation_;
for (int i = 0; i < {ndim}; ++i){{
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_subm_conv_inds(indices, hashdata,
indice_pairs, out_inds, indice_num_per_loc,
batch_size, input_dims_,
ksize_, dilation_, indice_pair_mask, backward,
stream_int);
}}
""")
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code.ret("int")

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def maxpool_forward(self):
        """Emit a thin wrapper that forwards to ``IndiceMaxPool::forward``."""
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("inp", "tv::Tensor")
        code.arg("out_inds", "tv::Tensor")
        code.arg("in_inds", "tv::Tensor")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.add_dependency(IndiceMaxPool)
        code.raw(f"""
return IndiceMaxPool::forward(out, inp, out_inds, in_inds, stream);
""")
        return code

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def maxpool_backward(self):
        """Emit a thin wrapper that forwards to ``IndiceMaxPool::backward``."""
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("inp", "tv::Tensor")
        code.arg("dout", "tv::Tensor")
        code.arg("dinp", "tv::Tensor")
        code.arg("out_inds", "tv::Tensor")
        code.arg("in_inds", "tv::Tensor")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.add_dependency(IndiceMaxPool)
        code.raw(f"""
return IndiceMaxPool::backward(out, inp, dout, dinp, out_inds, in_inds, stream);
""")
        return code

    @pccm.pybind.mark
    @pccm.cuda.static_function
    def sort_1d_by_key(self):
        """Emit a key sort that also returns the applied permutation.

        The generated C++ allocates an int32 tensor with ``data.dim(0)``
        entries, fills it with ``0..n-1`` via ``arange_kernel`` and calls
        ``thrust::sort_by_key`` with ``data`` as keys, so ``data`` is sorted
        in place and the returned tensor holds the original positions.
        Supported key dtypes: int32, uint32, int64, uint64.

        ``stream`` is an optional trailing argument (default ``0``, i.e. the
        previous behaviour).  Both the fill kernel and the thrust call are
        issued on it, so the op can be ordered with the other stream-aware
        ops of this class instead of always using the default stream.
        """
        code = pccm.FunctionCode()
        code.add_dependency(ThrustLib, TensorViewKernel)
        code.add_param_class("cudakers", CudaCommonKernel())
        code.arg("data", "tv::Tensor")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream);
tv::Tensor indices({{data.dim(0)}}, tv::int32, 0);
tv::cuda::Launch launcher(data.dim(0), custream);
launcher(cudakers::arange_kernel<int32_t>, indices.data_ptr<int32_t>(), indices.dim(0));
tv::dispatch<int32_t, uint32_t, int64_t, uint64_t>(data.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
thrust::device_ptr<T> ptr_tr(data.data_ptr<T>());
thrust::device_ptr<int32_t> ptr_k(indices.data_ptr<int32_t>());
auto thrust_ctx = thrust::cuda::par.on(custream);
thrust::sort_by_key(thrust_ctx, ptr_tr, ptr_tr + data.dim(0), ptr_k);
}});
return indices;
""")
        return code.ret("tv::Tensor")
import torch
import time
# Ad-hoc GPU timing script: measures torch's sort on two int32 CUDA tensors
# and torch.mm on fixed-size float matrices, printing elapsed wall-clock
# seconds.
# NOTE(review): indentation was lost in this listing, so the exact extent of
# each loop body cannot be confirmed from here.
def main():
# Two random int32 tensors with 130000 entries each, moved to the GPU.
arr = torch.randint(0, 130000, size=[130000]).to(torch.int32).cuda()
arr2 = torch.randint(0, 130000, size=[130000]).to(torch.int32).cuda()
# Drain pending GPU work before starting the clock.
torch.cuda.synchronize()
# Only referenced by the commented-out scatter lines below.
ar = torch.arange(arr.shape[0]).cuda()
t = time.time()
for i in range(10):
xx, indices = arr.sort()
# thh = torch.empty_like(indices)
xx2, indices2 = arr2.sort()
# thh[indices] = ar
# Wait for the queued sorts so the printed time covers their execution.
torch.cuda.synchronize()
print(time.time() - t)
# NOTE(review): this timestamp is never read; t is reassigned before the
# next print.
t = time.time()
# print(indices[:10], thh[:10])
# Operands for the matrix-multiply timing: (130000 x 864) @ (864 x 32),
# written into the preallocated (130000 x 32) output c.
a = torch.rand(130000, 27 * 32).cuda().float()
b = torch.rand(27 * 32, 32).cuda().float()
c = torch.rand(130000, 32).cuda().float()
for i in range(10):
torch.cuda.synchronize()
t = time.time()
torch.mm(a, b, out=c)
# thh[indices] = ar
torch.cuda.synchronize()
print(time.time() - t)
# Run the benchmark only when executed as a script.
if __name__ == "__main__":
main()
\ No newline at end of file
#!/home/yy/library/anaconda3/bin/python
import sys
from pathlib import Path
import ctypes
# _cudart = ctypes.CDLL('libcudart.so')
print(str(Path(__file__).parent.parent.parent.parent))
sys.path.append(str(Path(__file__).parent.parent.parent.parent))
from spconv import tensorview as tv
from spconv.sparse import build
import numpy as np
from pathlib import Path
from spconv.spconv_ops_cc.sparse.all.ops import Point2Voxel
from spconv.spconv_ops_cc.sparse.all import SpconvOps
import time
# Manual smoke/timing script for the generated Point2Voxel op.  It voxelises
# a point cloud, times point_to_voxel_hash, and allocates the buffers used by
# the index-generation calls that are currently commented out below.
# NOTE(review): indentation was lost in this listing, so the exact extent of
# the timing loop body cannot be confirmed from here.
def main():
# Developer-local path; the archive must contain a "pc" array.
data = np.load("/home/yy/OneDrive/dev/spconv/test/data/benchmark-pc.npz")["pc"].astype(np.float32)
print(data.shape, data.dtype)
# NOTE(review): argument meaning is not visible here - presumably voxel
# size, coordinate range and capacity limits; confirm against Point2Voxel.
p2v = Point2Voxel([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3, 150000, 1)
gs = p2v.grid_size # zyx
print(gs)
# return
data_tv = tv.from_numpy(data).cuda()
# Repeated runs, each timed with time.time().
for i in range(6):
t = time.time()
voxels, indices, num_per_voxel = p2v.point_to_voxel_hash(data_tv)
print(time.time() - t)
voxels, indices, num_per_voxel = p2v.point_to_voxel_hash(data_tv)
print(voxels.shape, gs)
gs_xyz = gs
indices_np = indices.cpu().numpy()
# indices_offset = indices_np[:, 0] * gs_xyz[1] * gs_xyz[2] + indices_np[:, 1] * gs_xyz[2] + indices_np[:, 2]
# uq = np.unique(indices_offset)
# print(uq.shape, indices_offset.shape, gs_xyz)
# return
# 3x3x3 kernel; kv is the number of kernel offsets (27).
ksize = [3] * 3
kv = int(np.prod(ksize))
# Prepend a zero column (column 0) to the voxel coordinates.
# presumably the batch index, given the "_with_bs" name - TODO confirm
indices_with_bs = np.zeros((indices_np.shape[0], 4), dtype=np.int32)
indices_with_bs[:, 1:] = indices_np
print(indices_with_bs.mean(), indices_with_bs.max(), indices_with_bs.min())
indices = tv.from_numpy(indices_with_bs).cuda()
# Buffers sized for the commented-out index-generation calls below.
out_indices = tv.zeros([indices.dim(0) * kv, 4], tv.int32, 0)
indice_num_per_loc = tv.zeros([kv], tv.int32, 0)
points = voxels.view([-1, 3])
hashdata = tv.zeros([points.dim(0) * kv * 2], tv.custom64, 0)
hashdata_subm = tv.zeros([points.dim(0) * 2], tv.custom64, 0)
# Pair table initialised to -1.
indice_pairs = tv.full([2, kv, indices.dim(0)], -1, tv.int32, 0)
indice_pairs_uniq = tv.zeros([indice_pairs.size // 2 + 1], tv.int32, 0)
# for i in range(10):
# indice_pairs.fill_int_(-1)
# np.random.shuffle(indices_with_bs)
# indices = tv.from_numpy(indices_with_bs).cuda()
# indice_num_per_loc.zero_()
# out_act = SpconvOps.generate_conv_inds(indices, hashdata, indice_pairs,
# indice_pairs_uniq, out_indices, indice_num_per_loc,
# 1, gs, gs, [3, 3, 3], [1, 1, 1], [1, 1, 1], [1, 1, 1])
# indice_num_per_loc.zero_()
# out_act = SpconvOps.generate_subm_conv_inds(indices, hashdata_subm, indice_pairs,
# out_indices, indice_num_per_loc,
# 1, gs, ksize, [1, 1, 1])
# indice_num_per_loc_cpu = indice_num_per_loc.cpu().numpy()
# indice_pairs_cpu = indice_pairs.cpu().numpy()
# indice_pairs_cpu_flat = indice_pairs_cpu.reshape(-1)
# uq, count = np.unique(indice_pairs_cpu_flat, return_counts=True)
# print(out_act, indice_pairs_cpu.shape, indice_pairs_cpu.mean(), indice_num_per_loc_cpu.tolist())
# print(indice_pairs_cpu[:, 13, :2])
# print(uq, count)
# Run the script body only when executed directly.
if __name__ == "__main__":
main()
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from cumm.conv.bases import ConvEnum
from cumm.gemm.core.metaarray import MetaArray, seq
from cumm import dtypes
import pccm
from cumm.gemm.layout import TensorGeneric, to_stride
from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib
from cumm.gemm import codeops
from typing import List
from cumm.conv.params import ConvProblem
import numpy as np
class CudaCommonKernel(pccm.ParameterizedClass):
    """Small reusable CUDA ``__global__`` kernels (arange and fill).

    A parameterized class is used rather than ``pccm.Class`` because a CUDA
    global function cannot be placed in a class body.
    """

    @pccm.cuda.cuda_global_function
    def arange_kernel(self):
        """Emit a kernel writing ``data[i] = T(i)`` for every ``i < size``."""
        kernel = pccm.FunctionCode()
        kernel.targ("T")
        for arg_name, arg_type in (("data", "T*"), ("size", "int")):
            kernel.arg(arg_name, arg_type)
        kernel.raw(f"""
for (int i : tv::KernelLoopX<int>(size)) {{
data[i] = T(i);
}}
""")
        return kernel

    @pccm.cuda.cuda_global_function
    def fill_kernel(self):
        """Emit a kernel writing ``data[i] = T(val)`` for every ``i < size``."""
        kernel = pccm.FunctionCode()
        kernel.targ("T")
        for arg_name, arg_type in (("data", "T*"), ("val", "T"), ("size", "int")):
            kernel.arg(arg_name, arg_type)
        kernel.raw(f"""
for (int i : tv::KernelLoopX<int>(size)) {{
data[i] = T(val);
}}
""")
        return kernel
class ConvOutLocIter(pccm.ParameterizedClass):
    """Generator for a host/device helper mapping conv input/output coords.

    The generated C++ object stores the ``ConvProblem``, a per-axis filter
    position ``count_`` and two layouts (``layout_npq`` built from
    ``{N, output_dims...}`` and ``layout_rs`` built from ``ksize``).  The
    ``query_*`` members translate a coordinate for the current filter
    position and report whether the result lies inside the tensor bounds.
    """

    # TODO add conv transpose
    def __init__(self, problem: ConvProblem):
        super().__init__()
        rank = problem.ndim
        self.add_dependency(TensorView)
        self.add_param_class("lociter", problem, "ConvProblem")
        # One extra leading axis in the NPQ layout (the ctor feeds problem.N).
        npq_layout = TensorGeneric(rank + 1, False)
        rs_layout = TensorGeneric(rank, False)
        self.add_param_class("lociter", npq_layout, "LayoutNPQ")
        self.add_param_class("lociter_rs", rs_layout, "LayoutRS")
        self.ndim = rank
        members = (
            ("problem_", "ConvProblem"),
            ("count_", f"tv::array<int, {self.ndim}>"),
            ("layout_npq", "LayoutNPQ"),
            ("layout_rs", "LayoutRS"),
        )
        for member_name, member_type in members:
            self.add_member(member_name, member_type)

    @pccm.cuda.constructor(host=True, device=True, forceinline=True)
    def ctor(self):
        """Emit the constructor: copy the problem, zero ``count_``, build layouts."""
        fn = pccm.FunctionCode()
        fn.arg("problem", "ConvProblem const&")
        fn.ctor_init("problem_", "problem")
        zero_list = ", ".join("0" for _ in range(self.ndim))
        fn.ctor_init("count_", f"{{{zero_list}}}")
        out_dims = codeops.unpack("problem.output_dims", range(self.ndim))
        filter_dims = codeops.unpack("problem.ksize", range(self.ndim))
        fn.ctor_init("layout_npq", f"LayoutNPQ::from_shape({{problem.N, {out_dims}}})")
        fn.ctor_init("layout_rs", f"LayoutRS::from_shape({{{filter_dims}}})")
        return fn

    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               name="operator++")
    def increment(self):
        """Emit ``operator++``: advance ``count_`` with carry, last axis first.

        An axis that reaches ``ksize`` is reset to 0 and the next axis is
        incremented; when every axis wraps the counter is all zeros again.
        """
        fn = pccm.FunctionCode()
        for axis in reversed(range(self.ndim)):
            fn.raw(f"""
if (++count_[{axis}] < problem_.ksize[{axis}]){{
return *this;
}}
count_[{axis}] = 0;
""")
        fn.raw("return *this;")
        return fn.ret(f"{self.class_name}&")

    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True)
    def set_filter_offset(self):
        """Emit a setter that decodes a flat filter offset into ``count_``."""
        fn = pccm.FunctionCode()
        fn.arg("filter_offset", "int")
        fn.raw(f"""
layout_rs.inverse(filter_offset, count_);
""")
        return fn

    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def nhw_to_npq(self):
        """Emit the input-to-output coordinate transform.

        Per axis: ``(nhw + padding - r * dilation) / stride``; the template
        flag ``NoStride`` replaces the divisor by 1.  Element 0 of the
        coordinate is passed through unchanged.
        """
        fn = pccm.FunctionCode()
        fn.arg("nhw_offset", "const int*")
        fn.nontype_targ("NoStride", "bool")
        for axis in range(self.ndim):
            fn.raw(f"""
int r_{axis} = count_[{axis}];
int h_{axis} = (nhw_offset[{axis + 1}] + problem_.padding[{axis}] -
r_{axis} * problem_.dilation[{axis}]) / (NoStride ? 1 : problem_.stride[{axis}]);
""")
        coords = codeops.unpack_str("h", range(self.ndim))
        fn.raw(f"""
return {{nhw_offset[0], {coords}}};
""")
        return fn.ret(f"tv::array<int, {self.ndim + 1}>")

    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def npq_to_nhw(self):
        """Emit the output-to-input coordinate transform.

        Per axis: ``npq * stride - padding + r * dilation``; element 0 of
        the coordinate is passed through unchanged.
        """
        fn = pccm.FunctionCode()
        fn.arg("npq_offset", "const int*")
        for axis in range(self.ndim):
            fn.raw(f"""
int r_{axis} = count_[{axis}];
int h_{axis} = npq_offset[{axis + 1}] * problem_.stride[{axis}] - problem_.padding[{axis}] + r_{axis} * problem_.dilation[{axis}];
""")
        coords = codeops.unpack_str("h", range(self.ndim))
        fn.raw(f"""
return {{npq_offset[0], {coords}}};
""")
        return fn.ret(f"tv::array<int, {self.ndim + 1}>")

    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def query_npq(self):
        """Emit the strided input-to-output query.

        Writes the output coordinate to ``npq_offset`` and returns true only
        if element 0 is below ``N``, every axis is inside ``output_dims``
        and every unstrided coordinate is an exact multiple of the stride.
        """
        fn = pccm.FunctionCode()
        fn.arg("nhw_offset", "const int*")
        fn.arg("npq_offset", f"tv::array<int, {self.ndim + 1}>&")
        fn.ret("bool")
        fn.raw(f"""
auto npq_no_stride = nhw_to_npq<true>(nhw_offset);
npq_offset[0] = npq_no_stride[0];
""")
        for axis in range(self.ndim):
            fn.raw(f"npq_offset[{axis + 1}] = npq_no_stride[{axis + 1}] / problem_.stride[{axis}];")
        in_bounds = " && ".join(
            f"npq_offset[{axis + 1}] >= 0 && "
            f"npq_offset[{axis + 1}] < problem_.output_dims[{axis}]"
            for axis in range(self.ndim))
        # The remainder test rejects coordinates that fall between strides.
        on_stride = " && ".join(
            f"!(npq_no_stride[{axis + 1}] % problem_.stride[{axis}])"
            for axis in range(self.ndim))
        fn.raw(f"""
return npq_no_stride[0] < problem_.N &&
{in_bounds} &&
{on_stride};
""")
        return fn

    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def query_npq_no_stride(self):
        """Emit the stride-free input-to-output query.

        Stores ``nhw_to_npq<true>`` in ``npq_offset`` and checks it against
        ``N`` and ``output_dims``.
        """
        fn = pccm.FunctionCode()
        fn.arg("nhw_offset", "const int*")
        fn.arg("npq_offset", f"tv::array<int, {self.ndim + 1}>&")
        fn.ret("bool")
        fn.raw(f"""
npq_offset = nhw_to_npq<true>(nhw_offset);
""")
        in_bounds = " && ".join(
            f"npq_offset[{axis + 1}] >= 0 && "
            f"npq_offset[{axis + 1}] < problem_.output_dims[{axis}]"
            for axis in range(self.ndim))
        fn.raw(f"""
return npq_offset[0] < problem_.N &&
{in_bounds};
""")
        return fn

    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def query_nhw(self):
        """Emit the output-to-input query, bounded by ``input_dims``."""
        fn = pccm.FunctionCode()
        fn.arg("npq_offset", "const int*")
        fn.arg("nhw_offset", f"tv::array<int, {self.ndim + 1}>&")
        fn.ret("bool")
        fn.raw(f"""
nhw_offset = npq_to_nhw(npq_offset);
""")
        in_bounds = " && ".join(
            f"nhw_offset[{axis + 1}] >= 0 && "
            f"nhw_offset[{axis + 1}] < problem_.input_dims[{axis}]"
            for axis in range(self.ndim))
        fn.raw(f"""
return nhw_offset[0] < problem_.N &&
{in_bounds};
""")
        return fn

    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def query_nhw_out(self):
        """Emit the output-to-input query, bounded by ``output_dims``.

        Same transform as ``query_nhw``; only the bounds differ.
        """
        fn = pccm.FunctionCode()
        fn.arg("npq_offset", "const int*")
        fn.arg("nhw_offset", f"tv::array<int, {self.ndim + 1}>&")
        fn.ret("bool")
        fn.raw(f"""
nhw_offset = npq_to_nhw(npq_offset);
""")
        in_bounds = " && ".join(
            f"nhw_offset[{axis + 1}] >= 0 && "
            f"nhw_offset[{axis + 1}] < problem_.output_dims[{axis}]"
            for axis in range(self.ndim))
        fn.raw(f"""
return nhw_offset[0] < problem_.N &&
{in_bounds};
""")
        return fn
class SparseConvIndicesKernel(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
    """Register dependencies and parameter classes for the index kernels.

    Args:
        problem: convolution problem; also used to build the
            ``ConvOutLocIter`` exposed to the generated code as ``ConvLocIter``.
        dtype_indices: element type of the pair buffers; must compare equal
            to ``dtypes.int32`` or ``dtypes.int64``.
    """
    super().__init__()
    self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel, ThrustLib)
    loc_iter = ConvOutLocIter(problem)
    self.loc_iter = loc_iter
    self.add_param_class("spinds", loc_iter, "ConvLocIter")
    self.add_param_class("spinds", problem, "ConvProblem")
    self.add_param_class("cudakers", CudaCommonKernel())
    self.ndim = problem.ndim
    # The "uniq" buffer currently uses the same element type as the pairs.
    self.dtype_indices = self.dtype_indices_uniq = dtype_indices
    is_supported = dtype_indices == dtypes.int32 or dtype_indices == dtypes.int64
    assert is_supported
@pccm.cuda.cuda_global_function
def calc_conv_indices_stage1(self):
    """Emit the stage-1 kernel that collects candidate (input, output) pairs.

    ``blockIdx.y`` selects the filter offset.  For every input row the
    kernel asks ``loc_iter`` for the matching output coordinate
    (``query_nhw_out`` when ``transposed``, ``query_npq`` otherwise).  On a
    valid hit it atomically bumps ``indice_num_per_loc[filter_offset]`` and,
    while the slot is below ``indices_pair_size``, stores the input row in
    the first half of ``indice_pairs`` and the flattened output offset both
    in the second half and in ``indice_pairs_for_uniq``.
    """
    idx_t = self.dtype_indices
    coord_len = self.ndim + 1  # values read per row of indices_in
    kernel = pccm.FunctionCode()
    signature = (
        ("loc_iter", "ConvLocIter"),
        ("indices_in", "const int*"),
        ("indice_pairs", f"{idx_t}*"),
        ("indice_pairs_for_uniq", f"{idx_t}*"),
        ("indice_num_per_loc", "int*"),
        ("num_indices_in", "int"),
        ("indices_pair_size", "int"),
        ("RS", "int"),
        ("transposed", "bool"),
    )
    for arg_name, arg_type in signature:
        kernel.arg(arg_name, arg_type)
    kernel.raw(f"""
int filter_offset = blockIdx.y;
loc_iter.set_filter_offset(filter_offset);
int indices_pair_size_mul_RS = indices_pair_size * RS;
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
tv::array<int, {coord_len}> npq_offset;
bool valid;
if (transposed){{
valid = loc_iter.query_nhw_out(indices_in + i * {coord_len}, npq_offset);
}}else{{
valid = loc_iter.query_npq(indices_in + i * {coord_len}, npq_offset);
}}
if (valid){{
int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
{idx_t} offset = loc_iter.layout_npq(npq_offset);
if (old_num < indices_pair_size){{
indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + old_num] = offset;
indice_pairs_for_uniq[filter_offset_mul_indices_pair_size + old_num] = offset;
}}
}}
}}
""")
    return kernel
@pccm.cuda.cuda_global_function
def build_conv_hash_table(self):
    """Emit the kernel that fills the offset-to-row hash table.

    For each ``i < num_indices`` the flattened offset
    ``indice_pairs_for_uniq[i]`` is decoded through ``layout_npq.inverse``
    into row ``i`` of ``indices_out`` and inserted into ``table`` with
    value ``i``.
    """
    idx_t = self.dtype_indices
    kernel = pccm.FunctionCode()
    kernel.targ("TTable")
    signature = (
        ("table", "TTable"),
        ("indices_out", "int*"),
        ("indice_pairs_for_uniq", f"const {idx_t}*"),
        ("layout_npq", "spinds::LayoutNPQ"),
        ("num_indices", "int"),
    )
    for arg_name, arg_type in signature:
        kernel.arg(arg_name, arg_type)
    kernel.raw(f"""
for (int i : tv::KernelLoopX<int>(num_indices)) {{
{idx_t} index = indice_pairs_for_uniq[i];
layout_npq.inverse(index, indices_out + {self.ndim + 1} * i);
table.insert(index, i);
}}
""")
    return kernel
@pccm.cuda.cuda_global_function
def calc_conv_indices_stage2(self):
    """Emit the stage-2 kernel that rewrites offsets into table values.

    ``blockIdx.y`` selects the filter offset.  Every entry greater than -1
    in that slice of ``indice_pairs_out_part`` is looked up in ``table``
    and, when found, replaced in place by the stored value.

    NOTE(review): the buffer is declared ``int*`` while each element is
    read into a ``dtype_indices`` variable; the two agree for int32 only -
    confirm before enabling int64.
    """
    kernel = pccm.FunctionCode()
    kernel.targ("TTable")
    signature = (
        ("table", "TTable"),
        ("indice_pairs_out_part", "int*"),
        ("num_indices_in", "int"),
        ("indices_pair_size", "int"),
    )
    for arg_name, arg_type in signature:
        kernel.arg(arg_name, arg_type)
    # TODO use block instead of filter_offset?
    kernel.raw(f"""
int filter_offset = blockIdx.y;
auto indice_pairs_out_part_filter = indice_pairs_out_part + filter_offset * indices_pair_size;
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
{self.dtype_indices} index = indice_pairs_out_part_filter[i];
if (index > -1){{
auto ptr = table.lookup_ptr(index);
if (ptr){{
indice_pairs_out_part_filter[i] = ptr->second;
}}
}}
}}
""")
    return kernel
@pccm.cuda.cuda_global_function
def build_subm_conv_hash_table(self):
    """Emit the kernel that hashes every input coordinate for subm conv.

    Row ``i`` of ``indices_in`` is flattened with ``layout_npq`` and
    inserted into ``table`` with value ``i``.
    """
    kernel = pccm.FunctionCode()
    kernel.targ("TTable")
    signature = (
        ("table", "TTable"),
        ("indices_in", "const int*"),
        ("layout_npq", "spinds::LayoutNPQ"),
        ("num_indices", "int"),
    )
    for arg_name, arg_type in signature:
        kernel.arg(arg_name, arg_type)
    kernel.raw(f"""
for (int i : tv::KernelLoopX<int>(num_indices)) {{
{self.dtype_indices} index = layout_npq(indices_in + i * {self.ndim + 1});
table.insert(index, i);
}}
""")
    return kernel
@pccm.cuda.cuda_global_function
def clean_indices_uniq(self):
    """Emit the kernel that resets the uniq buffer.

    Every one of the ``size`` entries is set to the maximum value
    representable by the index type.
    """
    idx_t = self.dtype_indices
    kernel = pccm.FunctionCode()
    kernel.arg("indice_pairs_for_uniq", f"{idx_t}*")
    kernel.arg("size", f"{idx_t}")
    kernel.raw(f"""
for ({idx_t} i : tv::KernelLoopX<{idx_t}>(size)) {{
indice_pairs_for_uniq[i] = std::numeric_limits<{idx_t}>::max();
}}
""")
    return kernel
@pccm.cuda.cuda_global_function
def calc_subm_conv_indices(self):
    """Emit the submanifold pair-generation kernel.

    ``blockIdx.y`` selects the filter offset.  At the centre offset
    (``RS / 2``) every row is paired with itself.  Otherwise each input
    coordinate is mapped with ``query_npq_no_stride`` and looked up in
    ``table``; a hit claims a slot via an atomic increment of
    ``indice_num_per_loc[filter_offset]`` and writes the pair both at this
    offset and, with the roles swapped, at the mirrored offset
    ``RS - 1 - filter_offset``.
    """
    idx_t = self.dtype_indices
    coord_len = self.ndim + 1
    kernel = pccm.FunctionCode()
    kernel.targ("TTable")
    signature = (
        ("loc_iter", "ConvLocIter"),
        ("table", "TTable"),
        ("indices_in", "const int*"),
        ("indice_pairs", f"{idx_t}*"),
        ("indice_num_per_loc", "int*"),
        ("num_indices_in", "int"),
        ("indices_pair_size", "int"),
        ("RS", "int"),
    )
    for arg_name, arg_type in signature:
        kernel.arg(arg_name, arg_type)
    kernel.raw(f"""
int filter_offset = blockIdx.y;
loc_iter.set_filter_offset(filter_offset);
int indices_pair_size_mul_RS = indices_pair_size * RS;
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
if (filter_offset == (RS / 2)){{
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + i] = i;
}}
}} else {{
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
tv::array<int, {coord_len}> npq_offset;
if (loc_iter.query_npq_no_stride(indices_in + i * {coord_len}, npq_offset)){{
{idx_t} offset = loc_iter.layout_npq(npq_offset);
auto item = table.lookup(offset); // performance bound
if (!item.empty()){{
int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + old_num] = item.second;
indice_pairs[filter_offset_mul_indices_pair_size_1 + old_num] = item.second;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size_1 + old_num] = i;
}}
}}
}}
}}
""")
    return kernel
@pccm.cuda.cuda_global_function
def calc_subm_conv_indices_mask(self):
    """Emit the submanifold pair kernel that also builds a per-row bit mask.

    ``blockIdx.y`` selects the filter offset.  The centre offset
    (``RS / 2``) pairs every row with itself and leaves ``mask`` untouched.
    For other offsets each output row is mapped with ``query_nhw`` and
    looked up in ``table``; on a hit bit ``filter_offset`` is OR-ed into
    the output row's mask, bit ``RS - 1 - filter_offset`` into the input
    row's mask, and the pair is written at both the offset and its mirror
    in both halves of ``indice_pairs``.  Slots are addressed by row index,
    not by an atomic counter.

    NOTE(review): the masks are 32-bit, so presumably RS <= 32 is required
    - confirm with the caller.
    """
    idx_t = self.dtype_indices
    coord_len = self.ndim + 1
    kernel = pccm.FunctionCode()
    kernel.targ("TTable")
    signature = (
        ("loc_iter", "ConvLocIter"),
        ("table", "TTable"),
        ("indices_in", "const int*"),
        ("indice_pairs", f"{idx_t}*"),
        ("mask", "uint32_t*"),
        ("num_indices", "int"),
        ("indices_pair_size", "int"),
        ("RS", "int"),
    )
    for arg_name, arg_type in signature:
        kernel.arg(arg_name, arg_type)
    kernel.raw(f"""
int filter_offset = blockIdx.y;
uint32_t filter_mask_out = (1u << (filter_offset));
uint32_t filter_mask_in = (1u << (RS - 1 - filter_offset));
// uint32_t filter_mask_center = (1u << (RS / 2));
loc_iter.set_filter_offset(filter_offset);
int indices_pair_size_mul_RS = indices_pair_size * RS;
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
if (filter_offset == (RS / 2)){{
for (int i : tv::KernelLoopX<int>(num_indices)) {{
// atomicOr(mask + i, filter_mask_center);
indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + i] = i;
}}
}} else {{
for (int output_index : tv::KernelLoopX<int>(num_indices)) {{
// find input offset from output offset
tv::array<int, {coord_len}> nhw_offset;
// table: input indice coord to output index (or output indice coord to input index)
if (loc_iter.query_nhw(indices_in + output_index * {coord_len}, nhw_offset)){{
{idx_t} offset = loc_iter.layout_npq(nhw_offset);
auto item = table.lookup(offset);
if (!item.empty()) {{
auto input_index = item.second; // we find a input indice idx.
atomicOr(mask + output_index, filter_mask_out);
atomicOr(mask + input_index, filter_mask_in);
// for this output, we set correct input idx.
indice_pairs[filter_offset_mul_indices_pair_size + output_index] = input_index;
// the output in "input location" connect this output idx in another location.
indice_pairs[filter_offset_mul_indices_pair_size_1 + input_index] = output_index;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + input_index] = output_index;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size_1 + output_index] = input_index;
}}
}}
}}
}}
""")
    return kernel
@pccm.cuda.cuda_global_function
def calc_subm_conv_indices_split_mask(self):
    """Emit the submanifold pair kernel with two separate bit masks.

    Same pairing scheme as the single-mask kernel, except that the output
    row's bit (``filter_offset``) goes to ``mask1`` and the input row's
    bit (``RS - 1 - filter_offset``) goes to ``mask2``.  The second half
    of ``indice_pairs`` is addressed through ``indice_ptr_inv``, which
    starts ``indices_pair_size * RS`` elements in.

    NOTE(review): the masks are 32-bit, so presumably RS <= 32 is required
    - confirm with the caller.
    """
    idx_t = self.dtype_indices
    coord_len = self.ndim + 1
    kernel = pccm.FunctionCode()
    kernel.targ("TTable")
    signature = (
        ("loc_iter", "ConvLocIter"),
        ("table", "TTable"),
        ("indices_in", "const int*"),
        ("indice_pairs", f"{idx_t}*"),
        ("mask1", "uint32_t*"),
        ("mask2", "uint32_t*"),
        ("num_indices", "int"),
        ("indices_pair_size", "int"),
        ("RS", "int"),
    )
    for arg_name, arg_type in signature:
        kernel.arg(arg_name, arg_type)
    kernel.raw(f"""
int filter_offset = blockIdx.y;
uint32_t filter_mask_out = (1u << (filter_offset));
uint32_t filter_mask_in = (1u << (RS - 1 - filter_offset));
// uint32_t filter_mask_center = (1u << (RS / 2));
loc_iter.set_filter_offset(filter_offset);
auto indice_ptr_inv = indice_pairs + indices_pair_size * RS;
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
if (filter_offset == (RS / 2)){{
for (int i : tv::KernelLoopX<int>(num_indices)) {{
indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
indice_ptr_inv[filter_offset_mul_indices_pair_size + i] = i;
}}
}} else {{
for (int output_index : tv::KernelLoopX<int>(num_indices)) {{
// find input offset from output offset
tv::array<int, {coord_len}> nhw_offset;
// table: input indice coord to output index (or output indice coord to input index)
if (loc_iter.query_nhw(indices_in + output_index * {coord_len}, nhw_offset)){{
{idx_t} offset = loc_iter.layout_npq(nhw_offset);
auto item = table.lookup(offset);
if (!item.empty()) {{
auto input_index = item.second; // we find a input indice idx.
atomicOr(mask1 + output_index, filter_mask_out);
atomicOr(mask2 + input_index, filter_mask_in);
// for this output, we set correct input idx.
indice_pairs[filter_offset_mul_indices_pair_size + output_index] = input_index;
// the output in "input location" connect this output idx in another location.
indice_pairs[filter_offset_mul_indices_pair_size_1 + input_index] = output_index;
indice_ptr_inv[filter_offset_mul_indices_pair_size + input_index] = output_index;
indice_ptr_inv[filter_offset_mul_indices_pair_size_1 + output_index] = input_index;
}}
}}
}}
}}
""")
    return kernel
@pccm.cuda.static_function
def generate_conv_inds_stage1(self):
    """Emit the host launcher for stage 1 of index generation.

    The generated C++ checks the buffer sizes against the kernel volume,
    resets ``indice_pairs_uniq`` with ``clean_indices_uniq`` and launches
    ``calc_conv_indices_stage1`` with one grid row per filter offset, both
    on the stream given by ``stream_int``.  No return type is declared;
    the sort/unique step lives in stage 1.5 and is only kept here as
    commented-out C++.
    """
    idx_t = self.dtype_indices
    array_t = f"tv::array<int, {self.ndim}>"
    host = pccm.FunctionCode()
    host.arg("indices", "tv::Tensor")
    host.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
    host.arg("batch_size", "int")
    host.arg("output_dims, input_dims", array_t)
    host.arg("ksize, stride, padding, dilation", array_t)
    host.arg("transposed", "bool", "false")
    host.arg("stream_int", "std::uintptr_t", "0")
    host.raw(f"""
// TODO stream
// TODO handle num input == 0
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= uniq_size, "error");
TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
int64_t expected_out_size = indices.dim(0) * kv;
tv::cuda::Launch launcher_num_act_in(indices.dim(0), reinterpret_cast<cudaStream_t>(stream_int));
// tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch launcher_clean_uniq(uniq_size, reinterpret_cast<cudaStream_t>(stream_int));
launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<{idx_t}>(), uniq_size);
launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<{idx_t}>(),
indice_pairs_uniq.data_ptr<{idx_t}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv, transposed);
// thrust::device_ptr<{idx_t}> ptr_tr(indice_pairs_uniq.data_ptr<{idx_t}>());
// auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
// thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto num_out_act = new_end - ptr_tr - 1;
// return num_out_act;
""")
    return host
@pccm.cuda.static_function
def generate_conv_inds_stage1_5(self):
    """Emit the host-side C++ that sorts and de-duplicates ``indice_pairs_uniq``.

    The generated function runs ``thrust::sort`` followed by
    ``thrust::unique`` over the first ``uniq_size`` elements of
    ``indice_pairs_uniq`` on the stream given by ``stream_int`` and
    returns the number of unique elements minus one.

    NOTE(review): the ``- 1`` presumably discounts a sentinel value left in
    the buffer by the stage-1 reset kernel -- confirm against
    ``clean_indices_uniq``.  The iterator difference is narrowed to the
    declared ``int`` return type.

    Returns:
        The ``pccm.FunctionCode`` holding the C++ signature and body.
    """
    code = pccm.FunctionCode()
    code.arg("indice_pairs_uniq", "tv::Tensor")
    code.arg("uniq_size", "int64_t")
    code.arg("stream_int", f"std::uintptr_t", "0")
    code.raw(f"""
thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto num_out_act = new_end - ptr_tr - 1;
return num_out_act;
""")
    return code.ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage2(self):
    """Emit the host-side C++ for stage 2 of regular conv index generation.

    The generated function:

    * checks ``indice_pairs.dim(1) == prod(ksize)``, that
      ``indice_pairs_uniq`` and ``out_inds`` have at least ``num_out_act``
      rows, that ``out_inds.dim(1)`` is ``ndim + 1`` and that ``hashdata``
      has at least ``num_out_act`` entries;
    * slices ``indice_pairs_uniq`` to its first ``num_out_act`` entries;
    * wraps ``hashdata`` in a ``tv::hash::LinearHashTable``, clears it and
      launches ``build_conv_hash_table`` with ``out_inds`` and the sliced
      unique indices;
    * launches ``calc_conv_indices_stage2`` on ``indice_pairs[1]`` with
      ``blocks.y`` set to ``prod(ksize)``;
    * returns ``num_out_act`` unchanged.

    NOTE(review): the emitted body declares ``timer`` and ``uniq_size`` but
    no later statement in it reads either of them.

    Returns:
        The ``pccm.FunctionCode`` holding the C++ signature and body.
    """
    code = pccm.FunctionCode()
    # The order of the arg() calls is the parameter order of the emitted
    # C++ function.
    code.arg("indices, hashdata", "tv::Tensor")
    code.arg("indice_pairs, indice_pairs_uniq, out_inds", "tv::Tensor")
    code.arg("num_out_act", "int")
    code.arg("batch_size", "int")
    code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
    code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
    code.arg("transposed", f"bool", "false")
    code.arg("stream_int", f"std::uintptr_t", "0")
    code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream_int);
// TODO stream
// TODO handle num input == 0
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
// out_inds: [MaxSize, {self.ndim + 1}]
auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= num_out_act, "error");
TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
// TODO handle invalid num_out_act
indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
tv::cuda::Launch lanucher_build_hash(num_out_act, custream);
using V = {self.dtype_indices};
using KeyType = {self.dtype_indices};
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
hash.clear(custream);
lanucher_build_hash(build_conv_hash_table<table_t>, hash,
out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const {self.dtype_indices}>(),
loc_iter.layout_npq, num_out_act);
launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash,
indice_pairs[1].data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2));
return num_out_act;
""")
    return code.ret("int")
@pccm.cuda.static_function
def generate_subm_conv_inds(self):
    """Emit the host-side C++ that builds index pairs for submanifold conv.

    The generated function:

    * requires every ``ksize[i]`` to be odd and derives ``stride = 1`` and
      ``padding = (ksize / 2) * dilation`` per axis; ``input_dims`` is used
      for both the input and the output extent of the ``ConvProblem``;
    * when ``indice_pair_mask`` is non-empty, requires
      ``prod(ksize) < 32``;
    * wraps ``hashdata`` (at least ``indices.dim(0)`` entries) in a
      ``tv::hash::LinearHashTable``, clears it and fills it with
      ``build_subm_conv_hash_table``;
    * launches one of three kernels with ``blocks.y = kv / 2 + 1``:
      ``calc_subm_conv_indices_split_mask`` when the mask is 2-D with a
      leading dimension of 2, ``calc_subm_conv_indices_mask`` when it is
      1-D, and ``calc_subm_conv_indices`` (which also receives
      ``indice_num_per_loc``) when no mask is given;
    * returns ``indices.dim(0)``.

    Before the masked kernels run, the mask (or its first row) is filled
    with ``1 << (kv / 2)``; in the split case the second row is zeroed.
    NOTE(review): ``kv / 2`` is presumably the index of the centre kernel
    offset -- confirm against the mask kernels.

    NOTE(review): the ``backward`` argument is declared but never
    referenced in the emitted body.

    Returns:
        The ``pccm.FunctionCode`` holding the C++ signature and body.
    """
    code = pccm.FunctionCode()
    # The order of the arg() calls is the parameter order of the emitted
    # C++ function.
    code.arg("indices, hashdata", "tv::Tensor")
    code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
    code.arg("batch_size", "int")
    code.arg("input_dims", f"tv::array<int, {self.ndim}>")
    code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
    # The fourth argument is the Python-side annotation of the default.
    code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
    code.arg("backward", "bool", "false")
    code.arg("stream_int", f"std::uintptr_t", "0")
    code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream_int);
auto ctx = tv::Context();
ctx.set_cuda_stream(custream);
if (!indice_pair_mask.empty()){{
TV_ASSERT_INVALID_ARG(tv::arrayops::prod(ksize) < 32, "for now only support 32bit mask");
}}
// TODO stream
// TODO handle num input == 0
tv::array<int, {self.ndim}> stride, padding;
for (int i = 0; i < {self.ndim}; ++i){{
TV_ASSERT_RT_ERR(ksize[i] % 2 == 1, "subm only support odd ksize");
stride[i] = 1;
padding[i] = (ksize[i] / 2) * dilation[i];
}}
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// out_inds: [MaxSize, {self.ndim + 1}]
// auto timer = tv::CudaContextTimer<>();
TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
launcher_num_act_in.blocks.y = (kv / 2) + 1;
// launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, input_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch lanucher_build_hash(indices.dim(0), custream);
using V = {self.dtype_indices};
using KeyType = {self.dtype_indices};
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
TV_ASSERT_RT_ERR(hashdata.dim(0) >= indices.dim(0), "hash size not enough");
table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
hash.clear(custream);
// tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
lanucher_build_hash(build_subm_conv_hash_table<table_t>, hash, indices.data_ptr<const int>(),
loc_iter.layout_npq, indices.dim(0));
// tv::ssprint("build_hash time", timer.report() / 1000.0);
if (!indice_pair_mask.empty()){{
if (indice_pair_mask.ndim() == 2 && indice_pair_mask.dim(0) == 2){{
auto mask_0 = indice_pair_mask[0];
tv::cuda::Launch lanucher_fill(mask_0.size(), custream);
lanucher_fill(cudakers::fill_kernel<int>, mask_0.data_ptr<int>(), (1 << (kv / 2)), mask_0.size());
indice_pair_mask[1].zero_(ctx);
auto kernel = &calc_subm_conv_indices_split_mask<table_t>;
launcher_num_act_in(kernel, loc_iter, hash,
indices.data_ptr<int>(), indice_pairs.data_ptr<int>(),
indice_pair_mask[0].data_ptr<uint32_t>(), indice_pair_mask[1].data_ptr<uint32_t>(),
indices.dim(0), indice_pairs.dim(2), kv);
}}else{{
tv::cuda::Launch lanucher_fill(indice_pair_mask.size(), custream);
lanucher_fill(cudakers::fill_kernel<int>, indice_pair_mask.data_ptr<int>(), (1 << (kv / 2)), indice_pair_mask.size());
TV_ASSERT_RT_ERR(indice_pair_mask.ndim() == 1, "error");
launcher_num_act_in(calc_subm_conv_indices_mask<table_t>, loc_iter, hash,
indices.data_ptr<int>(), indice_pairs.data_ptr<int>(),
indice_pair_mask.data_ptr<uint32_t>(), indices.dim(0), indice_pairs.dim(2), kv);
}}
}}else{{
launcher_num_act_in(calc_subm_conv_indices<table_t>, loc_iter, hash, indices.data_ptr<int>(),
indice_pairs.data_ptr<int>(),
indice_num_per_loc.data_ptr<int>(), indices.dim(0), indice_pairs.dim(2), kv);
}}
// tv::ssprint("gem subm conv inds time", timer.report() / 1000.0);
return indices.dim(0);
""")
    return code.ret("int")
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from cumm.conv.bases import ConvEnum
from cumm.gemm.core.metaarray import MetaArray, seq
from cumm import dtypes
import pccm
from cumm.gemm.layout import TensorGeneric, to_stride
from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic
from cumm.gemm import codeops
from typing import List
from cumm.conv.params import ConvProblem
from cumm.gemm.mask_iters import MaskTileIterator, MaskTileIteratorParams
import numpy as np
from cumm.gemm import (thread_map)
class IndiceMaxPool(pccm.Class):
    """Code generator for max pooling driven by (input, output) index pairs.

    Emits two CUDA kernels (forward and backward) and the two host
    functions that choose a launch shape and start them.
    """

    # TODO optimize this function
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorViewKernel, TensorView, GemmBasic)

    def _launch_setup_code(self):
        """Return the C++ that both host functions share.

        The text opens the dtype dispatch over ``out.dtype()`` and selects
        the launch shape: ``NumFeatures`` comes from the candidate list
        512..16 compared against ``out.dim(1)`` (16 when none matches),
        threads are ``(NumFeatures, 512 / NumFeatures)`` and blocks cover
        ``out.dim(1)`` by ``nhot`` with ``tv::div_up``.  The caller appends
        the kernel launch and closes the dispatch lambda.
        """
        return f"""
auto nhot = out_inds.dim(0);
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
constexpr int MaxThreads = 512;
tv::cuda::Launch launcher(1);
bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
// if out.dim(1) > value in list above, run this function.
// if a value is found, other value won't be executed.
int NumFeatures = TV_DECLTYPE(V)::value;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}});
if (!found){{
int NumFeatures = 16;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}}
"""

    @pccm.cuda.cuda_global_function
    def forward_kernel(self):
        """Emit the kernel that folds input rows into output rows by max.

        For pair ``i`` the row ``in_indices[i]`` of ``in_features`` is
        compared element-wise with row ``out_indices[i]`` of
        ``out_features``; the output element is overwritten when the input
        is larger.  The write is a plain store, not an atomic.
        """
        kernel = pccm.FunctionCode()
        kernel.targ("T")
        for arg_name, arg_type in (
                ("out_features", "T*"),
                ("in_features", "const T*"),
                ("out_indices", "const int*"),
                ("in_indices", "const int*"),
                ("size", "int"),
                ("num_features", "int"),
        ):
            kernel.arg(arg_name, arg_type)
        kernel.raw(f"""
for (int i : tv::KernelLoopY<int>(size)) {{
int in_idx = in_indices[i];
int out_idx = out_indices[i];
auto in_ptr = in_features + in_idx * num_features;
auto out_ptr = out_features + out_idx * num_features;
for (int j : tv::KernelLoopX<int>(num_features)) {{
auto in = in_ptr[j];
auto out = out_ptr[j];
if (in > out){{
out_ptr[j] = in;
}}
}}
}}
""")
        return kernel

    @pccm.cuda.cuda_global_function
    def backward_kernel(self):
        """Emit the kernel that routes output gradients back to the inputs.

        For pair ``i`` and feature ``j`` the gradient ``dout`` is added to
        ``din`` only where the input value equals the pooled output value.
        The accumulation is a plain read-modify-write, not an atomic.
        """
        kernel = pccm.FunctionCode()
        kernel.targ("T")
        for arg_name, arg_type in (
                ("out_features", "const T*"),
                ("in_features", "const T*"),
                ("dout_features", "const T*"),
                ("din_features", "T*"),
                ("out_indices", "const int*"),
                ("in_indices", "const int*"),
                ("size", "int"),
                ("num_features", "int"),
        ):
            kernel.arg(arg_name, arg_type)
        kernel.raw(f"""
for (int i : tv::KernelLoopY<int>(size)) {{
int in_idx_offset = in_indices[i] * num_features;
int out_idx_offset = out_indices[i] * num_features;
auto in_ptr = in_features + in_idx_offset;
auto out_ptr = out_features + out_idx_offset;
auto din_ptr = din_features + in_idx_offset;
auto dout_ptr = dout_features + out_idx_offset;
for (int j : tv::KernelLoopX<int>(num_features)) {{
auto in = in_ptr[j];
auto out = out_ptr[j];
if (in == out){{
din_ptr[j] = din_ptr[j] + dout_ptr[j];
}}
}}
}}
""")
        return kernel

    @pccm.cuda.static_function
    def forward(self):
        """Emit the host function that launches ``forward_kernel``.

        The number of pairs is ``out_inds.dim(0)`` and the feature width is
        ``out.dim(1)``; ``stream`` is an integer reinterpreted as a
        ``cudaStream_t``.
        """
        host = pccm.FunctionCode()
        for tensor_name in ("out", "in", "out_inds", "in_inds"):
            host.arg(tensor_name, "tv::Tensor")
        host.arg("stream", "std::uintptr_t", "0")
        host.raw(self._launch_setup_code() + f"""launcher(forward_kernel<T>, out.data_ptr<T>(), in.data_ptr<const T>(),
out_inds.data_ptr<const int>(), in_inds.data_ptr<const int>(), nhot, out.dim(1));
}});
""")
        return host

    @pccm.cuda.static_function
    def backward(self):
        """Emit the host function that launches ``backward_kernel``.

        Uses the same launch shape selection as ``forward``; ``din`` is the
        only tensor accessed through a mutable pointer.
        """
        host = pccm.FunctionCode()
        for tensor_name in ("out", "in", "dout", "din", "out_inds", "in_inds"):
            host.arg(tensor_name, "tv::Tensor")
        host.arg("stream", "std::uintptr_t", "0")
        host.raw(self._launch_setup_code() + f"""launcher(backward_kernel<T>, out.data_ptr<const T>(), in.data_ptr<const T>(),
dout.data_ptr<const T>(), din.data_ptr<T>(),
out_inds.data_ptr<const int>(), in_inds.data_ptr<const int>(), nhot, out.dim(1));
}});
""")
        return host
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from cumm.gemm.core.metaarray import MetaArray, seq
from cumm import dtypes
import pccm
from cumm.gemm.layout import TensorGeneric, to_stride
from cumm.common import TensorView, TensorViewHashKernel
from cumm.gemm import codeops
from typing import List
from cumm.conv.params import ConvProblem
import numpy as np
class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
    """CUDA kernels used by the hash-based point-to-voxel conversion.

    This class does not support multi-threaded use; create one
    point-to-voxel object per thread.
    """

    def __init__(self, dtype: dtypes.DType, ndim: int, layout: TensorGeneric, zyx: bool = True):
        super().__init__()
        self.add_dependency(TensorView, TensorViewHashKernel)
        self.add_param_class("layout_ns", layout, "Layout")
        self.dtype = dtype
        self.ndim = ndim
        self.zyx = zyx

    @pccm.cuda.cuda_global_function
    def build_hash_table(self):
        """Emit the kernel that hashes every in-range point by its voxel id.

        For each point the per-axis cell index is
        ``floor((coordinate - coors_range[j]) / vsize[j])``; the flattened
        id is the sum of ``grid_stride[j] * c``.  Points inside
        ``grid_bound`` are inserted into the table and their id is stored
        in ``points_indice_data``; the others get ``-1`` there.
        """
        # if zyx, the coors_range and grid_bound is zyx too,
        # generated indices is zyx.
        point_xyz = f"{self.ndim - 1} - j" if self.zyx else "j"
        kernel = pccm.FunctionCode()
        kernel.targ("TTable")
        kernel.arg("table", "TTable")
        kernel.arg("points", f"{self.dtype} const*")
        kernel.arg("points_indice_data", "int64_t *")
        kernel.arg("point_stride", "int")
        kernel.arg("vsize", f"tv::array<float, {self.ndim}>")
        kernel.arg("coors_range", f"tv::array<float, {self.ndim * 2}>")
        kernel.arg("grid_bound", f"tv::array<int, {self.ndim}>")
        kernel.arg("grid_stride", f"tv::array<int, {self.ndim}>")
        kernel.arg("num_points", "int")
        kernel.raw(f"""
for (int i : tv::KernelLoopX<int>(num_points)){{
bool failed = false;
int c;
int64_t prod = 0;
#pragma unroll
for (int j = 0; j < {self.ndim}; ++j) {{
c = floor((points[i * point_stride + {point_xyz}] - coors_range[j]) /
vsize[j]);
if ((c < 0 || c >= grid_bound[j])) {{
failed = true;
}}
prod += grid_stride[j] * c;
}}
if (!failed){{
points_indice_data[i] = prod;
table.insert(prod, i);
}}else{{
points_indice_data[i] = -1;
}}
}}
""")
        return kernel

    @pccm.cuda.cuda_global_function
    def assign_table(self):
        """Emit the kernel that numbers the occupied hash slots.

        Every non-empty slot takes the next value of ``count`` as its voxel
        index.  The counter keeps increasing past ``max_voxels``; only
        slots whose index is below ``max_voxels`` have their coordinates
        written to ``indices`` through ``layout.inverse``.
        """
        kernel = pccm.FunctionCode()
        kernel.targ("TTable")
        kernel.arg("table", "TTable")
        kernel.arg("indices", "int*")
        kernel.arg("count", "int*")
        kernel.arg("layout", "Layout")
        kernel.arg("max_voxels", "int")
        kernel.raw(f"""
auto data = table.data();
for (int i : tv::KernelLoopX<int>(table.size())){{
auto &item = data[i];
if (!item.empty()) {{
item.second = tv::cuda::atomicAggInc(count);
if (item.second < max_voxels){{
layout.inverse(item.first, indices + item.second * {self.ndim});
}}
}}
}}
""")
        return kernel

    @pccm.cuda.cuda_global_function
    def generate_voxel(self):
        """Emit the kernel that copies points into their voxel rows.

        Points whose stored id is ``-1`` are skipped.  For the others the
        voxel index is looked up in the table and ``num_per_voxel`` is
        incremented with ``atomicAdd``; the point is copied only when the
        previous count is below ``max_points_per_voxel``.  Because the
        increment happens before that check, the stored count can end up
        larger than ``max_points_per_voxel``.
        """
        kernel = pccm.FunctionCode()
        kernel.targ("TTable")
        kernel.arg("table", "TTable")
        kernel.arg("points", f"{self.dtype} const*")
        kernel.arg("points_indice_data", "const int64_t*")
        kernel.arg("voxels", f"{self.dtype} *")
        kernel.arg("num_per_voxel", "int *")
        kernel.arg("point_stride", "int")
        kernel.arg("max_points_per_voxel", "int")
        kernel.arg("max_voxels", "int")
        kernel.arg("vsize", f"tv::array<float, {self.ndim}>")
        kernel.arg("coors_range", f"tv::array<float, {self.ndim * 2}>")
        kernel.arg("grid_bound", f"tv::array<int, {self.ndim}>")
        kernel.arg("grid_stride", f"tv::array<int, {self.ndim}>")
        kernel.arg("num_points", "int")
        kernel.raw(f"""
int voxel_stride0 = point_stride * max_points_per_voxel;
for (int i : tv::KernelLoopX<int>(num_points)){{
int64_t prod = points_indice_data[i];
if (prod != -1){{
auto voxel_index_pair = table.lookup(prod);
if (!voxel_index_pair.empty() &&
voxel_index_pair.second < max_voxels) {{
int old = atomicAdd(num_per_voxel + voxel_index_pair.second, 1);
if (old < max_points_per_voxel) {{
for (int j = 0; j < point_stride; ++j) {{
voxels[voxel_index_pair.second * voxel_stride0 + old * point_stride + j] = points[i * point_stride + j];
}}
}}
}}
}}
}}
""")
        return kernel
class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
    """Code generator for the GPU (hash table based) point-to-voxel converter.

    The generated C++ class owns pre-allocated ``voxels``, ``indices`` and
    ``num_per_voxel`` buffers sized by the constructor arguments, plus
    ``hashdata`` and ``point_indice_data`` scratch buffers that grow on
    demand.
    """

    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
        """Declare members and attach the kernel class.

        Args:
            dtype: element type of the points and of the voxel buffer.
            ndim: number of spatial dimensions.
            zyx: when true, ranges, voxel sizes and generated indices are
                stored in reversed (zyx) axis order.
        """
        super().__init__()
        self.add_dependency(TensorView)
        layout = TensorGeneric(ndim, True)
        self.add_param_class("layout_ns", layout, "Layout")
        self.dtype = dtype
        self.ndim = ndim
        self.zyx = zyx
        # The kernels are only needed by the implementation of
        # point_to_voxel_hash.
        cuda_funcs = [self.point_to_voxel_hash]
        self.add_impl_only_param_class(cuda_funcs, "kernel", Point2VoxelKernel(dtype, ndim, layout, zyx))
        self.add_pybind_member("hashdata", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
        self.add_pybind_member("point_indice_data", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
        self.add_pybind_member("voxels", "tv::Tensor", readwrite=False)
        self.add_pybind_member("indices", "tv::Tensor", readwrite=False)
        self.add_pybind_member("num_per_voxel", "tv::Tensor", readwrite=False)
        self.add_member("vsize", f"tv::array<float, {self.ndim}>")
        self.add_member("coors_range", f"tv::array<float, {self.ndim * 2}>")
        self.add_member("grid_size", f"tv::array<int, {self.ndim}>")
        self.add_member("grid_stride", f"tv::array<int, {self.ndim}>")

    @pccm.pybind.mark_prop_getter(prop_name="grid_size")
    @pccm.member_function
    def get_grid_size(self):
        """Emit the getter that copies ``grid_size`` into a ``std::array``."""
        code = pccm.FunctionCode()
        code.raw(f"""
std::array<int, {self.ndim}> res;
for (int i = 0; i < {self.ndim}; ++i){{
res[i] = grid_size[i];
}}
return res;
""")
        return code.ret(f"std::array<int, {self.ndim}>")

    @pccm.pybind.mark
    @pccm.constructor
    def ctor(self):
        """Emit the constructor.

        The emitted code stores the voxel size and coordinate range
        (reversed when ``zyx`` is set), derives ``grid_size`` by rounding
        ``(max - min) / vsize`` per axis, computes ``grid_stride`` with the
        last axis varying fastest and allocates the output buffers with
        ``tv::zeros(..., 0)``.
        """
        code = pccm.FunctionCode()
        code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
        code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
        code.arg("num_point_features", f"int")
        code.arg("max_num_voxels, max_num_points_per_voxel", f"int")
        if self.zyx:
            code.raw(f"""
for (int i = 0; i < {self.ndim}; ++i){{
vsize[{self.ndim - 1} - i] = vsize_xyz[i];
coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
}}
""")
        else:
            code.raw(f"""
for (int i = 0; i < {self.ndim}; ++i){{
vsize[i] = vsize_xyz[i];
coors_range[i] = coors_range_xyz[i];
coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
}}
""")
        # if zyx, grid_size is zyx.
        code.raw(f"""
int64_t prod = 1;
for (size_t i = 0; i < {self.ndim}; ++i) {{
grid_size[i] =
std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
}}
for (int i = {self.ndim} - 1; i >= 0; --i) {{
grid_stride[i] = prod;
prod *= grid_size[i];
}}
voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, 0);
indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, 0);
num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, 0);
hashdata = tv::zeros({{1}}, tv::custom128, 0);
point_indice_data = tv::zeros({{1}}, tv::int64, 0);
""")
        return code

    @pccm.pybind.mark
    @pccm.cuda.member_function
    def point_to_voxel_hash(self):
        """Emit the method that voxelizes a point tensor on the GPU.

        The emitted code grows ``hashdata`` to ``2 * points.dim(0)`` slots
        and ``point_indice_data`` to ``points.dim(0)`` entries when they
        are too small, then runs three kernels: ``build_hash_table``,
        ``assign_table`` and ``generate_voxel``.  The voxel counter is
        copied to the host between the last two.

        ``assign_table`` increments that counter for every occupied hash
        slot, including voxels beyond the pre-allocated capacity
        ``voxels.dim(0)``.  The count is therefore clamped to the capacity
        before it is used as the end of the returned slices; without the
        clamp a point cloud that touches more voxels than
        ``max_num_voxels`` would slice past the end of the buffers.

        NOTE: the ``generate_voxel`` kernel increments ``num_per_voxel``
        before checking the per-voxel limit, so the returned counts can
        exceed ``voxels.dim(1)``.

        Returns:
            The ``pccm.FunctionCode``; the emitted method returns the tuple
            ``(voxels, indices, num_per_voxel)`` sliced to the voxel count.
        """
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
        code.arg("clear_voxels", "bool", "true")
        code.raw(f"""
TV_ASSERT_INVALID_ARG(points.ndim() == 2 && points.dim(1) >= {self.ndim}, "error");
using V = int64_t;
using KeyType = int64_t;
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
if (clear_voxels){{
voxels.zero_();
}}
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
// int64_t expected_hash_data_num = int64_t(tv::hash::align_to_power2(points.dim(0) * 2));
int64_t expected_hash_data_num = points.dim(0) * 2;
if (hashdata.dim(0) < expected_hash_data_num){{
hashdata = tv::zeros({{expected_hash_data_num}}, tv::custom128, 0);
}}
if (point_indice_data.dim(0) < points.dim(0)){{
point_indice_data = tv::zeros({{points.dim(0)}}, tv::int64, 0);
}}
// auto timer = tv::CudaContextTimer<>();
num_per_voxel.zero_();
table_t hash = table_t(hashdata.data_ptr<pair_t>(), expected_hash_data_num);
hash.clear();
// tv::ssprint("clear time", timer.report());
auto launcher = tv::cuda::Launch(points.dim(0));
launcher(kernel::build_hash_table<table_t>, hash, points.data_ptr<const {self.dtype}>(),
point_indice_data.data_ptr<int64_t>(),
points.dim(1), vsize, coors_range, grid_size, grid_stride, points.dim(0));
// tv::ssprint("build_hash_table", timer.report());
auto table_launcher = tv::cuda::Launch(hash.size());
tv::Tensor count = tv::zeros({{1}}, tv::int32, 0);
Layout layout = Layout::from_shape(grid_size);
table_launcher(kernel::assign_table<table_t>, hash, indices.data_ptr<int>(),
count.data_ptr<int>(),
layout, voxels.dim(0));
auto count_cpu = count.cpu();
int count_val = count_cpu.item<int32_t>();
// assign_table counts every occupied hash slot, even past the voxel capacity,
// so clamp before slicing the pre-allocated buffers.
if (count_val > voxels.dim(0)){{
count_val = voxels.dim(0);
}}
// tv::ssprint("assign_table", timer.report());
launcher(kernel::generate_voxel<table_t>, hash, points.data_ptr<const {self.dtype}>(),
point_indice_data.data_ptr<const int64_t>(), voxels.data_ptr<{self.dtype}>(),
num_per_voxel.data_ptr<int>(), points.dim(1), voxels.dim(1),
voxels.dim(0), vsize, coors_range,
grid_size, grid_stride, points.dim(0));
// tv::ssprint("generate_voxel", timer.report());
return std::make_tuple(voxels.slice_first_axis(0, count_val),
indices.slice_first_axis(0, count_val),
num_per_voxel.slice_first_axis(0, count_val));
""")
        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
    """Code generator for the CPU point-to-voxel converter.

    The generated C++ class keeps a dense grid (``densehashdata``) that maps
    a voxel coordinate to its row in the output buffers, ``-1`` meaning
    "no voxel yet".
    """

    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
        super().__init__()
        self.add_dependency(TensorView)
        self.add_param_class("layout_ns", TensorGeneric(ndim, True), "Layout")
        self.dtype = dtype
        self.ndim = ndim
        self.zyx = zyx
        self.add_pybind_member("densehashdata", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
        for exposed_name in ("voxels", "indices", "num_per_voxel"):
            self.add_pybind_member(exposed_name, "tv::Tensor", readwrite=False)
        self.add_member("mean_per_voxel", "tv::Tensor")
        self.add_member("vsize", f"tv::array<float, {self.ndim}>")
        self.add_member("coors_range", f"tv::array<float, {self.ndim * 2}>")
        self.add_member("grid_size", f"tv::array<int, {self.ndim}>")
        self.add_member("grid_stride", f"tv::array<int, {self.ndim}>")

    @pccm.pybind.mark_prop_getter(prop_name="grid_size")
    @pccm.member_function
    def get_grid_size(self):
        """Emit the getter that returns ``grid_size`` as a ``std::array``."""
        result_type = f"std::array<int, {self.ndim}>"
        getter = pccm.FunctionCode()
        getter.raw(f"""
std::array<int, {self.ndim}> res;
for (int i = 0; i < {self.ndim}; ++i){{
res[i] = grid_size[i];
}}
return res;
""")
        return getter.ret(result_type)

    @pccm.pybind.mark
    @pccm.constructor
    def ctor(self):
        """Emit the constructor.

        The emitted code stores voxel size and range (axis order reversed
        when ``zyx`` is set), derives ``grid_size`` and ``grid_stride``,
        allocates the host buffers with ``tv::zeros(..., -1)`` and fills
        ``densehashdata`` with ``-1``.
        """
        ctor_code = pccm.FunctionCode()
        ctor_code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
        ctor_code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
        ctor_code.arg("num_point_features", "int")
        ctor_code.arg("max_num_voxels, max_num_points_per_voxel", "int")
        if not self.zyx:
            ctor_code.raw(f"""
for (int i = 0; i < {self.ndim}; ++i){{
vsize[i] = vsize_xyz[i];
coors_range[i] = coors_range_xyz[i];
coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
}}
""")
        else:
            ctor_code.raw(f"""
for (int i = 0; i < {self.ndim}; ++i){{
vsize[{self.ndim - 1} - i] = vsize_xyz[i];
coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
}}
""")
        ctor_code.raw(f"""
int64_t prod = 1;
for (size_t i = 0; i < {self.ndim}; ++i) {{
grid_size[i] =
std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
}}
for (int i = {self.ndim} - 1; i >= 0; --i) {{
grid_stride[i] = prod;
prod *= grid_size[i];
}}
voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, -1);
indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, -1);
num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, -1);
mean_per_voxel = tv::zeros({{max_num_voxels, num_point_features}}, tv::DType({self.dtype.tv_dtype}), -1);
tv::TensorShape grid_shape(grid_size.data(), grid_size.data() + {self.ndim});
densehashdata = tv::zeros(grid_shape, tv::int32, -1);
auto densehashdata_ptr = densehashdata.data_ptr<int>();
for (int i= 0; i < densehashdata.size(); ++i){{
densehashdata_ptr[i] = -1;
}}
""")
        return ctor_code

    def point_to_voxel_template(self, mean: bool = False):
        """Build the body shared by the two public voxelization methods.

        The emitted loop assigns each in-range point to a voxel row
        (creating the row while fewer than ``max_num_voxels`` exist),
        copies at most ``max_num_points_per_voxel`` points per voxel and
        finally resets the touched ``densehashdata`` cells to ``-1``.

        Args:
            mean: when true the emitted code also zeroes
                ``mean_per_voxel``, maintains a running mean per voxel and
                fills the unused point slots of every voxel with it.

        Returns:
            The ``pccm.FunctionCode``; the emitted method returns the tuple
            ``(voxels, indices, num_per_voxel)`` sliced to the voxel count.
        """
        # Column of the input point that feeds grid axis j.
        point_xyz = f"{self.ndim - 1} - j" if self.zyx else "j"
        # Index expressions for the dense grid lookup, built once.
        coor_args = codeops.unpack("coor", range(self.ndim))
        stored_coor_args = codeops.unpack("coors_rw", range(self.ndim), left="(i, ", right=")")
        mean_flag = pccm.boolean(mean)

        body = pccm.FunctionCode()
        body.arg("points", "tv::Tensor")
        body.arg("clear_voxels", "bool", "true")
        body.raw(f"""
auto max_num_voxels = voxels.dim(0);
auto max_num_points_per_voxel = voxels.dim(1);
num_per_voxel.zero_();
if (clear_voxels){{
voxels.zero_();
}}
""")
        # The view is declared in both modes; only the reset is conditional.
        if mean:
            body.raw("mean_per_voxel.zero_();")
        body.raw(f"auto means_rw = mean_per_voxel.tview<{self.dtype}, 2>();")
        body.raw(f"""
int res_voxel_num = 0;
int num_features = points.dim(1);
auto N = points.dim(0);
int c;
TV_ASSERT_RT_ERR(num_features == voxels.dim(2), "your points num features doesn't equal to voxel.");
tv::dispatch<float, double>(points.dtype(), [&](auto I){{
using T = decltype(I);
auto points_rw = points.tview<T, 2>();
auto coors_rw = indices.tview<int, 2>();
auto voxels_rw = voxels.tview<{self.dtype}, 3>();
auto num_points_per_voxel_rw = num_per_voxel.tview<int, 1>();
int coor[{self.ndim}];
auto coor_to_voxelidx_rw = densehashdata.tview<int, {self.ndim}>();
int voxelidx, num;
bool failed;
int voxel_num = 0;
for (int i = 0; i < N; ++i) {{
failed = false;
for (int j = 0; j < {self.ndim}; ++j) {{
c = floor((points_rw(i, {point_xyz}) - coors_range[j]) / vsize[j]);
if ((c < 0 || c >= grid_size[j])) {{
failed = true;
break;
}}
coor[j] = c;
}}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw({coor_args});
if (voxelidx == -1) {{
voxelidx = voxel_num;
if (voxel_num >= max_num_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw({coor_args}) = voxelidx;
for (int k = 0; k < {self.ndim}; ++k) {{
coors_rw(voxelidx, k) = coor[k];
}}
}}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_num_points_per_voxel) {{
// voxel_point_mask_rw(voxelidx, num) = {self.dtype}(1);
for (int k = 0; k < num_features; ++k) {{
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}}
num_points_per_voxel_rw(voxelidx) += 1;
if TV_IF_CONSTEXPR ({mean_flag}){{
for (int k = 0; k < num_features; ++k) {{
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / {self.dtype}(num + 1);
}}
}}
}}
}}
for (int i = 0; i < voxel_num; ++i) {{
coor_to_voxelidx_rw({stored_coor_args}) = -1;
if TV_IF_CONSTEXPR ({mean_flag}){{
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_num_points_per_voxel; ++j) {{
for (int k = 0; k < num_features; ++k) {{
voxels_rw(i, j, k) = means_rw(i, k);
}}
}}
}}
}}
res_voxel_num = voxel_num;
}});
return std::make_tuple(voxels.slice_first_axis(0, res_voxel_num),
indices.slice_first_axis(0, res_voxel_num),
num_per_voxel.slice_first_axis(0, res_voxel_num));
""")
        return body.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")

    @pccm.pybind.mark
    @pccm.member_function
    def point_to_voxel(self):
        """Emit the plain voxelization method (no mean filling)."""
        return self.point_to_voxel_template(mean=False)

    @pccm.pybind.mark
    @pccm.member_function
    def point_to_voxel_empty_mean(self):
        """Emit the variant that fills empty point slots with the voxel mean."""
        return self.point_to_voxel_template(mean=True)
# Copyright 2019 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
import torch
import spconv
class ConvAlgo(Enum):
    """Selectable implementations of the sparse convolution op.

    The integer values are what ``indice_conv`` and
    ``indice_conv_backward`` pass on as their ``algo`` argument
    (``Native`` is their default).
    """
    Native = 0  # small memory cost, faster when number of points is large.
    Batch = 1  # high memory cost, faster when number of points is small (< 50000)
    BatchGemmGather = 2  # high memory cost, faster when number of points medium
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    """Return the per-axis output extent of a convolution.

    Each axis uses
    ``(in + 2 * pad - dilation * (k - 1) - 1) // stride + 1``.
    A kernel size of ``-1`` on an axis marks it as collapsed: its output
    extent is ``1``.

    Args:
        input_size: spatial extent of the input, one entry per axis.
        kernel_size, stride, padding, dilation: per-axis sequences indexed
            like ``input_size``.

    Returns:
        A new list with one output extent per axis of ``input_size``.
    """

    def _axis_extent(axis):
        kernel = kernel_size[axis]
        # The formula is evaluated even for a collapsed axis, as before.
        extent = (input_size[axis] + 2 * padding[axis] - dilation[axis] *
                  (kernel - 1) - 1) // stride[axis] + 1
        return 1 if kernel == -1 else extent

    return [_axis_extent(axis) for axis in range(len(input_size))]
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
                           output_padding):
    """Return the per-axis output extent of a transposed convolution.

    Each axis uses the standard transposed-convolution formula
    ``(in - 1) * stride - 2 * pad + dilation * (k - 1) + 1 + output_pad``.
    The previous implementation ignored ``dilation`` and therefore
    returned an extent that was too small whenever ``dilation > 1``; for
    ``dilation == 1`` the result is unchanged.

    Args:
        input_size: spatial extent of the input, one entry per axis.
        kernel_size, stride, padding, dilation, output_padding: per-axis
            sequences indexed like ``input_size``.

    Returns:
        A new list with one output extent per axis of ``input_size``.

    Raises:
        ValueError: if any kernel size is ``-1``.
    """
    output_size = []
    for i in range(len(input_size)):
        if kernel_size[i] == -1:
            raise ValueError("deconv don't support kernel_size < 0")
        # dilation * (k - 1) + 1 is the extent covered by the dilated kernel.
        kernel_extent = dilation[i] * (kernel_size[i] - 1) + 1
        size = ((input_size[i] - 1) * stride[i] - 2 * padding[i] +
                kernel_extent + output_padding[i])
        output_size.append(size)
    return output_size
def get_indice_pairs(indices,
                     batch_size,
                     spatial_shape,
                     ksize=3,
                     stride=1,
                     padding=0,
                     dilation=1,
                     out_padding=0,
                     subm=False,
                     transpose=False,
                     grid=None,
                     use_hash=False):
    """Compute input/output index pairs through the spconv torch ops.

    Scalar ``ksize``, ``stride``, ``padding``, ``dilation`` and
    ``out_padding`` are expanded to one value per spatial axis, where the
    number of axes is ``indices.shape[1] - 1``.  The output shape is the
    input shape for submanifold convolution and otherwise comes from
    ``get_deconv_output_size`` (``transpose``) or ``get_conv_output_size``.

    Without ``grid`` the generic ``torch.ops.spconv.get_indice_pairs`` op is
    called; with ``grid`` the 2-D or 3-D grid variant is used.

    Returns:
        Whatever the selected torch op returns.

    Raises:
        AssertionError: if an axis has both stride and dilation != 1.
        NotImplementedError: if ``grid`` is given and the number of spatial
            axes is neither 2 nor 3.
    """
    ndim = indices.shape[1] - 1

    def _per_axis(value):
        # Lists and tuples are taken as already being per-axis.
        return value if isinstance(value, (list, tuple)) else [value] * ndim

    ksize = _per_axis(ksize)
    stride = _per_axis(stride)
    padding = _per_axis(padding)
    dilation = _per_axis(dilation)
    out_padding = _per_axis(out_padding)

    for axis_dilation, axis_stride in zip(dilation, stride):
        assert axis_stride == 1 or axis_dilation == 1, "don't support this."

    if subm:
        out_shape = spatial_shape
    elif transpose:
        out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                           padding, dilation, out_padding)
    else:
        out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                         padding, dilation)

    if grid is None:
        return torch.ops.spconv.get_indice_pairs(
            indices, batch_size, out_shape, spatial_shape, ksize, stride,
            padding, dilation, out_padding, int(subm), int(transpose),
            int(use_hash))

    if ndim not in (2, 3):
        raise NotImplementedError
    # Only the op that is actually needed is looked up.
    grid_op = getattr(torch.ops.spconv, f"get_indice_pairs_grid_{ndim}d")
    return grid_op(indices, grid, batch_size, out_shape, spatial_shape, ksize,
                   stride, padding, dilation, out_padding, int(subm),
                   int(transpose), int(use_hash))
def indice_conv(features,
                filters,
                indice_pairs,
                indice_pair_num,
                num_activate_out,
                inverse=False,
                subm=False,
                algo=ConvAlgo.Native.value):
    """Run the sparse convolution forward op.

    Forwards all arguments to ``torch.ops.spconv.indice_conv``; ``inverse``
    and ``subm`` are converted to ``int`` first and ``algo`` is passed
    through unchanged.
    """
    conv_op = torch.ops.spconv.indice_conv
    return conv_op(features, filters, indice_pairs, indice_pair_num,
                   num_activate_out, int(inverse), int(subm), algo)
def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
                      num_activate_out, inverse, subm):
    """Run the fused sparse convolution op that also takes a bias.

    Forwards to ``torch.ops.spconv.fused_indice_conv_bn`` with ``inverse``
    and ``subm`` converted to ``int``.
    """
    fused_op = torch.ops.spconv.fused_indice_conv_bn
    return fused_op(features, filters, bias, indice_pairs, indice_pair_num,
                    num_activate_out, int(inverse), int(subm))
def indice_conv_backward(features,
                         filters,
                         out_bp,
                         indice_pairs,
                         indice_pair_num,
                         inverse=False,
                         subm=False,
                         algo=ConvAlgo.Native.value):
    """Forward the arguments to ``torch.ops.spconv.indice_conv_backward``.

    ``inverse`` and ``subm`` are converted to ints before the call; the op's
    result is returned as-is.
    """
    return torch.ops.spconv.indice_conv_backward(features, filters, out_bp,
                                                 indice_pairs, indice_pair_num,
                                                 int(inverse), int(subm), algo)
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    """Return the result of the compiled ``torch.ops.spconv.indice_maxpool`` op."""
    return torch.ops.spconv.indice_maxpool(features, indice_pairs,
                                           indice_pair_num, num_activate_out)
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
                            indice_pair_num):
    """Return the result of ``torch.ops.spconv.indice_maxpool_backward``."""
    return torch.ops.spconv.indice_maxpool_backward(features, out_features,
                                                    out_bp, indice_pairs,
                                                    indice_pair_num)
def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
    """Return the result of the compiled ``torch.ops.spconv.nms`` op.

    All arguments are passed through unchanged.
    """
    res = torch.ops.spconv.nms(boxes, scores, pre_max_size, post_max_size,
                               thresh, eps)
    return res
def pillar_scatter(features, coors, shape):
    """Dispatch to the pillar-scatter op matching ``features.dtype``.

    ``torch.float32`` uses ``pillar_scatter_float`` and ``torch.half`` uses
    ``pillar_scatter_half``.

    Raises:
        NotImplementedError: for any other dtype.
    """
    dtype = features.dtype
    if dtype != torch.float32 and dtype != torch.half:
        raise NotImplementedError
    if dtype == torch.float32:
        return torch.ops.spconv.pillar_scatter_float(features, coors, shape)
    return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
import platform
from pathlib import Path
import numpy as np
import torch
from spconv.pytorch import ops
from spconv.pytorch.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d)
from spconv.pytorch.core import SparseConvTensor
from spconv.pytorch.identity import Identity
from spconv.pytorch.modules import SparseModule, SparseSequential
from spconv.pytorch.ops import ConvAlgo
from spconv.pytorch.pool import SparseMaxPool2d, SparseMaxPool3d
from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable
class ToDense(SparseModule):
    """convert SparseConvTensor to NCHW dense tensor.
    """
    def forward(self, x: SparseConvTensor):
        # Delegates to SparseConvTensor.dense() with its default arguments.
        return x.dense()
class RemoveGrid(SparseModule):
    """remove pre-allocated grid buffer.
    """
    def forward(self, x: SparseConvTensor):
        # Mutates the incoming tensor in place and returns the same object.
        x.grid = None
        return x
# Copyright 2019 Yan Yan
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -14,6 +14,7 @@
import math
import time
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
......@@ -21,11 +22,13 @@ from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import spconv
import spconv.functional as Fsp
from spconv import ops
from spconv.modules import SparseModule
from spconv import pytorch as spconv
from spconv.algo import ConvAlgo
import spconv.pytorch.functional as Fsp
from spconv.pytorch import ops
from spconv.pytorch.core import IndiceData, SparseConvTensor
from spconv.pytorch.modules import SparseModule
from spconv.constants import FILTER_HWIO
def _calculate_fan_in_and_fan_out_hwio(tensor):
dimensions = tensor.ndimension()
......@@ -38,8 +41,12 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
fan_in = tensor.size(-2)
fan_out = tensor.size(-1)
else:
if FILTER_HWIO:
num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-1)
num_output_fmaps = tensor.size(-2)
receptive_field_size = 1
if tensor.dim() > 2:
receptive_field_size = tensor[..., 0, 0].numel()
......@@ -56,24 +63,24 @@ class SparseConvolution(SparseModule):
]
def __init__(self,
ndim,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
subm=False,
output_padding=0,
transposed=False,
inverse=False,
indice_key=None,
fused_bn=False,
use_hash=False,
algo=ops.ConvAlgo.Native):
super(SparseConvolution, self).__init__()
ndim: int,
in_channels: int,
out_channels: int,
kernel_size: Union[int, List[int], Tuple[int, ...]]=3,
stride: Union[int, List[int], Tuple[int, ...]]=1,
padding: Union[int, List[int], Tuple[int, ...]]=0,
dilation: Union[int, List[int], Tuple[int, ...]]=1,
groups: Union[int, List[int], Tuple[int, ...]]=1,
bias: bool=True,
subm: bool=False,
output_padding: Union[int, List[int], Tuple[int, ...]]=0,
transposed: bool=False,
inverse: bool=False,
indice_key: Optional[str]=None,
fused_bn: bool=False,
algo: ops.ConvAlgo=ops.ConvAlgo.Native,
name=None):
super(SparseConvolution, self).__init__(name=name)
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
......@@ -104,11 +111,13 @@ class SparseConvolution(SparseModule):
self.subm = subm
self.indice_key = indice_key
self.fused_bn = fused_bn
self.use_hash = use_hash
self.algo = algo.value
self.algo = algo
if FILTER_HWIO:
self.weight = Parameter(
torch.Tensor(*kernel_size, in_channels, out_channels))
else:
self.weight = Parameter(
torch.Tensor(*kernel_size, out_channels, in_channels))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
......@@ -117,14 +126,15 @@ class SparseConvolution(SparseModule):
def reset_parameters(self):
n = self.in_channels
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
# init.uniform_(self.weight, 0, 0.001)
init.kaiming_uniform_(self.weight, a=math.sqrt(0.005))
if self.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def forward(self, input):
assert isinstance(input, spconv.SparseConvTensor)
def forward(self, input: SparseConvTensor):
assert isinstance(input, SparseConvTensor)
features = input.features
device = features.device
indices = input.indices
......@@ -143,47 +153,91 @@ class SparseConvolution(SparseModule):
out_spatial_shape = spatial_shape
# input.update_grid(out_spatial_shape)
# t = time.time()
out_tensor = input.shadow_copy()
if input.benchmark:
if self.name is None:
raise ValueError(
"you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
)
if self.name not in input.benchmark_record:
input.benchmark_record[self.name] = {
"type": "SparseConvolution",
"indice_gen_time": [],
"time": [],
"num_points": [],
"num_out_points": [],
"params": {
"kernel_size": self.kernel_size,
"stride": self.stride,
"padding": self.padding,
"dilation": self.dilation,
"output_padding": self.output_padding,
"subm": self.subm,
"transposed": self.transposed,
"input_channels": self.in_channels,
"out_channels": self.out_channels,
}
}
if self.conv1x1:
if FILTER_HWIO:
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels))
self.weight.view(self.out_channels, self.in_channels).T)
else:
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels).T)
if self.bias is not None:
features += self.bias
out_tensor = spconv.SparseConvTensor(features, input.indices,
input.spatial_shape,
input.batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
out_tensor.features = features
return out_tensor
datas = input.find_indice_pair(self.indice_key)
if self.inverse:
assert datas is not None and self.indice_key is not None
_, outids, indice_pairs, indice_pair_num, out_spatial_shape = datas
outids = datas.indices
indice_pairs = datas.indice_pairs
indice_pair_num = datas.indice_pair_num
out_spatial_shape = datas.out_spatial_shape
assert indice_pair_num.shape[0] == np.prod(
self.kernel_size
), "inverse conv must have same kernel size as its couple conv"
else:
if self.indice_key is not None and datas is not None:
outids, _, indice_pairs, indice_pair_num, _ = datas
outids = datas.out_indices
indice_pairs = datas.indice_pairs
indice_pair_num = datas.indice_pair_num
else:
if input.benchmark:
torch.cuda.synchronize()
t = time.time()
outids, indice_pairs, indice_pair_num = ops.get_indice_pairs(
indices,
batch_size,
spatial_shape,
self.algo,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
self.output_padding,
self.subm,
self.transposed,
grid=input.grid,
use_hash=self.use_hash)
input.indice_dict[self.indice_key] = (outids, indices,
indice_pairs,
indice_pair_num,
spatial_shape)
self.transposed)
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
out_tensor.benchmark_record[
self.name]["indice_gen_time"].append(interval)
indice_data = IndiceData(outids, indices, indice_pairs,
indice_pair_num, spatial_shape)
input.indice_dict[self.indice_key] = indice_data
if input.benchmark:
torch.cuda.synchronize()
t = time.time()
if self.fused_bn:
raise NotImplementedError
assert self.bias is not None
out_features = ops.fused_indice_conv(features, self.weight,
self.bias,
......@@ -210,12 +264,46 @@ class SparseConvolution(SparseModule):
if self.bias is not None:
out_features += self.bias
out_tensor = spconv.SparseConvTensor(out_features, outids,
out_spatial_shape, batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
out_tensor.benchmark_record[self.name]["time"].append(interval)
out_tensor.benchmark_record[self.name]["num_points"].append(
features.shape[0])
out_tensor.benchmark_record[self.name]["num_out_points"].append(
out_features.shape[0])
out_tensor.features = out_features
out_tensor.indices = outids
out_tensor.spatial_shape = out_spatial_shape
return out_tensor
class SparseConv1d(SparseConvolution):
    """``SparseConvolution`` fixed to one spatial dimension."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super().__init__(ndim=1,
                         in_channels=in_channels,
                         out_channels=out_channels,
                         kernel_size=kernel_size,
                         stride=stride,
                         padding=padding,
                         dilation=dilation,
                         groups=groups,
                         bias=bias,
                         indice_key=indice_key,
                         algo=algo,
                         name=name)
class SparseConv2d(SparseConvolution):
def __init__(self,
......@@ -228,8 +316,8 @@ class SparseConv2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv2d, self).__init__(2,
in_channels,
out_channels,
......@@ -240,8 +328,8 @@ class SparseConv2d(SparseConvolution):
groups,
bias,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseConv3d(SparseConvolution):
......@@ -255,8 +343,8 @@ class SparseConv3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv3d, self).__init__(3,
in_channels,
out_channels,
......@@ -267,8 +355,8 @@ class SparseConv3d(SparseConvolution):
groups,
bias,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseConv4d(SparseConvolution):
......@@ -282,8 +370,8 @@ class SparseConv4d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv4d, self).__init__(4,
in_channels,
out_channels,
......@@ -294,8 +382,36 @@ class SparseConv4d(SparseConvolution):
groups,
bias,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseConvTranspose1d(SparseConvolution):
    """``SparseConvolution`` with ``transposed=True`` and one spatial dimension."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super().__init__(ndim=1,
                         in_channels=in_channels,
                         out_channels=out_channels,
                         kernel_size=kernel_size,
                         stride=stride,
                         padding=padding,
                         dilation=dilation,
                         groups=groups,
                         bias=bias,
                         transposed=True,
                         indice_key=indice_key,
                         algo=algo,
                         name=name)
class SparseConvTranspose2d(SparseConvolution):
......@@ -309,8 +425,8 @@ class SparseConvTranspose2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose2d, self).__init__(2,
in_channels,
out_channels,
......@@ -322,8 +438,8 @@ class SparseConvTranspose2d(SparseConvolution):
bias,
transposed=True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseConvTranspose3d(SparseConvolution):
......@@ -337,8 +453,8 @@ class SparseConvTranspose3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose3d, self).__init__(3,
in_channels,
out_channels,
......@@ -350,8 +466,55 @@ class SparseConvTranspose3d(SparseConvolution):
bias,
transposed=True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SparseConvTranspose4d(SparseConvolution):
    """``SparseConvolution`` with ``transposed=True`` and four spatial dimensions."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super().__init__(ndim=4,
                         in_channels=in_channels,
                         out_channels=out_channels,
                         kernel_size=kernel_size,
                         stride=stride,
                         padding=padding,
                         dilation=dilation,
                         groups=groups,
                         bias=bias,
                         transposed=True,
                         indice_key=indice_key,
                         algo=algo,
                         name=name)
class SparseInverseConv1d(SparseConvolution):
    """``SparseConvolution`` with ``inverse=True`` and one spatial dimension.

    ``indice_key`` is a required argument here.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 indice_key,
                 bias=True,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super().__init__(ndim=1,
                         in_channels=in_channels,
                         out_channels=out_channels,
                         kernel_size=kernel_size,
                         bias=bias,
                         inverse=True,
                         indice_key=indice_key,
                         algo=algo,
                         name=name)
class SparseInverseConv2d(SparseConvolution):
......@@ -361,7 +524,8 @@ class SparseInverseConv2d(SparseConvolution):
kernel_size,
indice_key,
bias=True,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseInverseConv2d, self).__init__(2,
in_channels,
out_channels,
......@@ -369,7 +533,8 @@ class SparseInverseConv2d(SparseConvolution):
bias=bias,
inverse=True,
indice_key=indice_key,
algo=algo)
algo=algo,
name=name)
class SparseInverseConv3d(SparseConvolution):
......@@ -379,7 +544,8 @@ class SparseInverseConv3d(SparseConvolution):
kernel_size,
indice_key,
bias=True,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SparseInverseConv3d, self).__init__(3,
in_channels,
out_channels,
......@@ -387,7 +553,54 @@ class SparseInverseConv3d(SparseConvolution):
bias=bias,
inverse=True,
indice_key=indice_key,
algo=algo)
algo=algo,
name=name)
class SparseInverseConv4d(SparseConvolution):
    """``SparseConvolution`` with ``inverse=True`` and four spatial dimensions.

    ``indice_key`` is a required argument here.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 indice_key,
                 bias=True,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super().__init__(ndim=4,
                         in_channels=in_channels,
                         out_channels=out_channels,
                         kernel_size=kernel_size,
                         bias=bias,
                         inverse=True,
                         indice_key=indice_key,
                         algo=algo,
                         name=name)
class SubMConv1d(SparseConvolution):
    """``SparseConvolution`` with ``subm=True`` and one spatial dimension."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super().__init__(ndim=1,
                         in_channels=in_channels,
                         out_channels=out_channels,
                         kernel_size=kernel_size,
                         stride=stride,
                         padding=padding,
                         dilation=dilation,
                         groups=groups,
                         bias=bias,
                         subm=True,
                         indice_key=indice_key,
                         algo=algo,
                         name=name)
class SubMConv2d(SparseConvolution):
......@@ -401,8 +614,8 @@ class SubMConv2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv2d, self).__init__(2,
in_channels,
out_channels,
......@@ -414,8 +627,8 @@ class SubMConv2d(SparseConvolution):
bias,
True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SubMConv3d(SparseConvolution):
......@@ -429,8 +642,8 @@ class SubMConv3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv3d, self).__init__(3,
in_channels,
out_channels,
......@@ -442,8 +655,8 @@ class SubMConv3d(SparseConvolution):
bias,
True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
class SubMConv4d(SparseConvolution):
......@@ -457,8 +670,8 @@ class SubMConv4d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native):
algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv4d, self).__init__(4,
in_channels,
out_channels,
......@@ -470,5 +683,5 @@ class SubMConv4d(SparseConvolution):
bias,
True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo)
algo=algo,
name=name)
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import re

# Compare the numeric (major, minor) release rather than the raw version
# text: when __version__ is a plain str, "1.10.0" >= "1.8.0" is False because
# strings are ordered character by character, which would pick the legacy
# implementation on a newer torch.
_version_match = re.match(r"(\d+)\.(\d+)", str(torch.__version__))
if _version_match is not None:
    _torch_release = (int(_version_match.group(1)),
                      int(_version_match.group(2)))
else:
    # Unparseable version string: fall back to the implementation that does
    # not need torch.fx.
    _torch_release = (0, 0)

if _torch_release >= (1, 8):
    from .core_fx import *
else:
    from .core import *
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import numpy as np
import torch
import torch.fx
from torch.fx.symbolic_trace import ProxyableClassMeta
class IndiceData(object):
    """Plain attribute holder for the values cached under an indice key.

    Stores the five constructor arguments unchanged under attributes of the
    same names.
    """

    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
                 out_spatial_shape):
        self.out_indices, self.indices = out_indices, indices
        self.indice_pairs, self.indice_pair_num = (indice_pairs,
                                                   indice_pair_num)
        self.out_spatial_shape = out_spatial_shape
def scatter_nd(indices, updates, shape):
    """PyTorch version of TensorFlow's ``scatter_nd``.

    Allocates a zero tensor of ``shape`` (same dtype/device as ``updates``)
    and writes ``updates`` at the positions given by the last axis of
    ``indices``.

    This function performs no validation, so use it carefully. When an index
    repeats, values are NOT accumulated (TensorFlow adds them).

    Args:
        indices: integer tensor whose last axis holds the leading coordinates
            of each write position.
        updates: values to write.
        shape: list or tuple giving the shape of the result.

    Returns:
        The newly allocated tensor with ``updates`` scattered into it.
    """
    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
    ndim = indices.shape[-1]
    # list(...) so a tuple ``shape`` does not break the concatenation.
    output_shape = list(indices.shape[:-1]) + list(shape[ndim:])
    flatted_indices = indices.view(-1, ndim)
    # Multi-dimensional indexing must use a tuple; indexing with a list of
    # tensors is deprecated sequence indexing.
    slices = tuple(flatted_indices[:, i] for i in range(ndim)) + (Ellipsis, )
    ret[slices] = updates.view(*output_shape)
    return ret
class SparseConvTensor(metaclass=ProxyableClassMeta):
def __init__(self,
features,
indices,
spatial_shape,
batch_size,
grid=None,
voxel_num=None,
benchmark=False):
"""
Args:
features: [num_points, num_features] feature tensor
indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
spatial_shape: spatial shape of your sparse data
batch_size: batch size of your sparse data
grid: pre-allocated grid tensor. should be used when the volume of spatial shape
is very large.
benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
SparseConvTensor.
"""
self._features = features
self.indices = indices
self.spatial_shape = spatial_shape
self.batch_size = batch_size
self.indice_dict = {}
if grid is None:
grid = torch.Tensor() # empty tensor
self.grid = grid
self.voxel_num = voxel_num # for tensorrt
self.benchmark = benchmark
self.benchmark_record = {}
def replace_feature(self, feature):
"""we need to replace x.features = F.relu(x) with x = x.replace_feature(F.relu(x.features))
due to limit of torch.fx
"""
new_spt = SparseConvTensor(feature, self.indices, self.spatial_shape, self.batch_size, self.grid, self.voxel_num, self.indice_dict)
new_spt.benchmark = self.benchmark
new_spt.benchmark_record = self.benchmark_record
return new_spt
@property
def features(self):
return self._features
@features.setter
def features(self, val):
msg = ("you can't set feature directly, use 'x = x.replace_feature(your_new_feature)'"
" to generate new SparseConvTensor instead.")
raise ValueError(msg)
@classmethod
def from_dense(cls, x: torch.Tensor):
"""create sparse tensor fron channel last dense tensor by to_sparse
x must be NHWC tensor, channel last
"""
x = x.to_sparse(x.ndim - 1)
spatial_shape = x.shape[1:-1]
batch_size = x.shape[0]
indices_th = x.indices().permute(1, 0).contiguous().int()
features_th = x.values()
return cls(features_th, indices_th, spatial_shape, batch_size)
@property
def spatial_size(self):
return np.prod(self.spatial_shape)
def find_indice_pair(self, key) -> Optional[IndiceData]:
if key is None:
return None
if key in self.indice_dict:
return self.indice_dict[key]
return None
def dense(self, channels_first=True):
output_shape = [self.batch_size] + list(
self.spatial_shape) + [self.features.shape[1]]
res = scatter_nd(
self.indices.to(self.features.device).long(), self.features,
output_shape)
if not channels_first:
return res
ndim = len(self.spatial_shape)
trans_params = list(range(0, ndim + 1))
trans_params.insert(1, ndim + 1)
return res.permute(*trans_params).contiguous()
# remove this due to limit of torch.fx
# @property
# def sparity(self):
# return self.indices.shape[0] / np.prod(
# self.spatial_shape) / self.batch_size
def shadow_copy(self) -> "SparseConvTensor":
"""create a new spconv tensor with all member unchanged"""
tensor = SparseConvTensor(self.features, self.indices,
self.spatial_shape, self.batch_size,
self.grid, self.benchmark)
tensor.benchmark_record = self.benchmark_record
tensor.indice_dict = self.indice_dict
tensor.voxel_num = self.voxel_num
return tensor
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import numpy as np
import torch
class IndiceData(object):
    """Plain attribute holder for the values cached under an indice key.

    Stores the five constructor arguments unchanged under attributes of the
    same names.
    """

    def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
                 out_spatial_shape):
        self.out_indices, self.indices = out_indices, indices
        self.indice_pairs, self.indice_pair_num = (indice_pairs,
                                                   indice_pair_num)
        self.out_spatial_shape = out_spatial_shape
def scatter_nd(indices, updates, shape):
    """PyTorch version of TensorFlow's ``scatter_nd``.

    Allocates a zero tensor of ``shape`` (same dtype/device as ``updates``)
    and writes ``updates`` at the positions given by the last axis of
    ``indices``.

    This function performs no validation, so use it carefully. When an index
    repeats, values are NOT accumulated (TensorFlow adds them).

    Args:
        indices: integer tensor whose last axis holds the leading coordinates
            of each write position.
        updates: values to write.
        shape: list or tuple giving the shape of the result.

    Returns:
        The newly allocated tensor with ``updates`` scattered into it.
    """
    ret = torch.zeros(*shape, dtype=updates.dtype, device=updates.device)
    ndim = indices.shape[-1]
    # list(...) so a tuple ``shape`` does not break the concatenation.
    output_shape = list(indices.shape[:-1]) + list(shape[ndim:])
    flatted_indices = indices.view(-1, ndim)
    # Multi-dimensional indexing must use a tuple; indexing with a list of
    # tensors is deprecated sequence indexing.
    slices = tuple(flatted_indices[:, i] for i in range(ndim)) + (Ellipsis, )
    ret[slices] = updates.view(*output_shape)
    return ret
class SparseConvTensor(object):
    """Sparse tensor made of per-point features plus their integer indices.

    ``features`` is read-only; use :meth:`replace_feature` to obtain a tensor
    with new features.
    """

    def __init__(self,
                 features,
                 indices,
                 spatial_shape,
                 batch_size,
                 grid=None,
                 voxel_num=None,
                 benchmark=False):
        """
        Args:
            features: [num_points, num_features] feature tensor
            indices: [num_points, ndim + 1] indice tensor. batch index saved in indices[:, 0]
            spatial_shape: spatial shape of your sparse data
            batch_size: batch size of your sparse data
            grid: pre-allocated grid tensor. should be used when the volume of spatial shape
                is very large.
            voxel_num: stored as-is on the tensor.
            benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
                SparseConvTensor.
        """
        self._features = features
        self.indices = indices
        self.spatial_shape = spatial_shape
        self.batch_size = batch_size
        self.indice_dict = {}
        if grid is None:
            grid = torch.Tensor()  # empty tensor
        self.grid = grid
        self.voxel_num = voxel_num
        self.benchmark = benchmark
        self.benchmark_record = {}

    def replace_feature(self, feature):
        """we need to replace x.features = F.relu(x) with x = x.replace_feature(F.relu(x.features))
        due to limit of torch.fx

        Returns a new tensor with ``feature`` as its features. Every other
        member, including the cached ``indice_dict`` and the benchmark state,
        is shared with ``self``.
        """
        # Keyword arguments: the constructor has no ``indice_dict`` parameter,
        # so passing it positionally would land in the ``benchmark`` slot and
        # the index cache would be silently dropped.
        new_spt = SparseConvTensor(feature,
                                   self.indices,
                                   self.spatial_shape,
                                   self.batch_size,
                                   grid=self.grid,
                                   voxel_num=self.voxel_num,
                                   benchmark=self.benchmark)
        new_spt.indice_dict = self.indice_dict
        new_spt.benchmark_record = self.benchmark_record
        return new_spt

    @property
    def features(self):
        return self._features

    @features.setter
    def features(self, val):
        # The attribute is ``features``; the previous hint named a
        # non-existent ``x.feature``.
        msg = ("you can't set feature directly, use 'x = x.replace_feature(F.relu(x.features))'"
               " to generate new SparseConvTensor instead.")
        raise ValueError(msg)

    @classmethod
    def from_dense(cls, x: torch.Tensor):
        """create sparse tensor from channel last dense tensor by to_sparse
        x must be NHWC tensor, channel last
        """
        x = x.to_sparse(x.ndim - 1)
        spatial_shape = x.shape[1:-1]
        batch_size = x.shape[0]
        indices_th = x.indices().permute(1, 0).contiguous().int()
        features_th = x.values()
        return cls(features_th, indices_th, spatial_shape, batch_size)

    @property
    def spatial_size(self):
        """Product of the entries of ``spatial_shape``."""
        return np.prod(self.spatial_shape)

    def find_indice_pair(self, key) -> Optional["IndiceData"]:
        """Return the entry cached under ``key``, or None if absent.

        A ``None`` key always yields None, even if ``indice_dict`` happens to
        contain a ``None`` key.
        """
        if key is None:
            return None
        return self.indice_dict.get(key)

    def dense(self, channels_first=True):
        """Scatter the features into a dense tensor.

        The result has shape [batch, *spatial_shape, channels]; with
        ``channels_first`` the channel axis is moved to position 1 and the
        result is made contiguous.
        """
        output_shape = [self.batch_size] + list(
            self.spatial_shape) + [self.features.shape[1]]
        res = scatter_nd(
            self.indices.to(self.features.device).long(), self.features,
            output_shape)
        if not channels_first:
            return res
        ndim = len(self.spatial_shape)
        # Permutation (0, ndim + 1, 1, ..., ndim): channel axis right after batch.
        trans_params = list(range(0, ndim + 1))
        trans_params.insert(1, ndim + 1)
        return res.permute(*trans_params).contiguous()

    # @property
    # def sparity(self):
    #     return self.indices.shape[0] / np.prod(
    #         self.spatial_shape) / self.batch_size

    def shadow_copy(self) -> "SparseConvTensor":
        """create a new spconv tensor with all member unchanged"""
        # Keyword arguments: passed positionally, ``benchmark`` would fill the
        # ``voxel_num`` slot, leaving voxel_num wrong and benchmark=False.
        tensor = SparseConvTensor(self.features,
                                  self.indices,
                                  self.spatial_shape,
                                  self.batch_size,
                                  grid=self.grid,
                                  voxel_num=self.voxel_num,
                                  benchmark=self.benchmark)
        tensor.benchmark_record = self.benchmark_record
        tensor.indice_dict = self.indice_dict
        return tensor
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from cumm import tensorview as tv
import torch
from typing import Optional, List
# Maps torch dtypes to the matching ``tv`` (cumm tensorview) dtype constants.
# torch_tensor_to_tv looks a tensor's dtype up here when the caller does not
# pass an explicit dtype; dtypes missing from this table raise KeyError there.
_TORCH_DTYPE_TO_TV = {
    torch.float32: tv.float32,
    torch.float64: tv.float64,
    torch.float16: tv.float16,
    torch.int32: tv.int32,
    torch.int64: tv.int64,
    torch.int8: tv.int8,
    torch.int16: tv.int16,
    torch.uint8: tv.uint8,
}
def torch_tensor_to_tv(ten: torch.Tensor, dtype: Optional[int] = None, shape: Optional[List[int]] = None):
    """Wrap a torch tensor's memory as a ``tv`` tensor via ``tv.from_blob``.

    Args:
        ten: contiguous tensor on a "cpu" or "cuda" device.
        dtype: tv dtype to use; defaults to the table entry for ``ten.dtype``.
        shape: shape to use; defaults to ``list(ten.shape)``.

    Raises:
        AssertionError: if ``ten`` is not contiguous.
        NotImplementedError: for any other device type.
    """
    assert ten.is_contiguous(), "must be contiguous tensor"
    ptr = ten.data_ptr()
    device_kind = ten.device.type
    if device_kind != "cpu" and device_kind != "cuda":
        raise NotImplementedError
    # -1 is passed for cpu tensors and 0 for cuda tensors.
    tv_device = -1 if device_kind == "cpu" else 0
    resolved_shape = list(ten.shape) if shape is None else shape
    resolved_dtype = _TORCH_DTYPE_TO_TV[ten.dtype] if dtype is None else dtype
    return tv.from_blob(ptr, resolved_shape, resolved_dtype, tv_device)
def get_current_stream():
    """Return the ``cuda_stream`` attribute of torch's current CUDA stream."""
    return torch.cuda.current_stream().cuda_stream
if __name__ == "__main__":
    # Manual smoke test: wrap a small random tensor and print its numpy view.
    a = torch.rand(2, 2)
    atv = torch_tensor_to_tv(a)
    print(atv.numpy_view())
\ No newline at end of file
# Copyright 2019 Yan Yan
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -16,7 +16,7 @@ import torch
from torch import nn
from torch.autograd import Function
import spconv.ops as ops
import spconv.pytorch.ops as ops
class SparseConvFunction(Function):
......
# Copyright 2019 Yan Yan
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
from collections import OrderedDict
......@@ -19,7 +20,7 @@ from collections import OrderedDict
import torch
from torch import nn
import spconv
from spconv import pytorch as spconv
def is_spconv_module(module):
......@@ -28,7 +29,7 @@ def is_spconv_module(module):
def is_sparse_conv(module):
from spconv.conv import SparseConvolution
from spconv.pytorch.conv import SparseConvolution
return isinstance(module, SparseConvolution)
......@@ -49,7 +50,9 @@ def _mean_update(vals, m_vals, t):
class SparseModule(nn.Module):
""" place holder, all module subclass from this will take sptensor in SparseSequential.
"""
pass
def __init__(self, name=None):
super().__init__()
self.name = name
class SparseSequential(SparseModule):
......@@ -140,50 +143,3 @@ class SparseSequential(SparseModule):
input = module(input)
return input
def fused(self):
    """don't use this. no effect.

    Builds a new ``SparseSequential`` in which every sparse convolution that
    is directly followed by an ``nn.BatchNorm1d`` is replaced by a single
    ``SparseConvolution(fused_bn=True)`` whose weight and bias have the batch
    norm's running statistics and affine parameters folded in. All other
    modules are carried over unchanged.

    NOTE(review): the weight scaling broadcasts over the LAST weight axis, so
    it presumes output channels are stored last - confirm against the filter
    layout in use.
    """
    from spconv.conv import SparseConvolution
    mods = list(self._modules.values())
    fused_mods = []
    idx = 0
    while idx < len(mods):
        mod = mods[idx]
        has_bn_next = (idx < len(mods) - 1
                       and isinstance(mods[idx + 1], nn.BatchNorm1d))
        if not (is_sparse_conv(mod) and has_bn_next):
            fused_mods.append(mod)
            idx += 1
            continue

        bn = mods[idx + 1]
        conv = SparseConvolution(
            ndim=mod.ndim,
            in_channels=mod.in_channels,
            out_channels=mod.out_channels,
            kernel_size=mod.kernel_size,
            stride=mod.stride,
            padding=mod.padding,
            dilation=mod.dilation,
            groups=mod.groups,
            bias=True,
            subm=mod.subm,
            output_padding=mod.output_padding,
            transposed=mod.transposed,
            inverse=mod.inverse,
            indice_key=mod.indice_key,
            fused_bn=True,
        )
        conv.load_state_dict(mod.state_dict(), False)
        conv.to(mod.weight.device)
        if mod.bias is None:
            # The source conv had no bias, so start the fold from zero.
            # A bias loaded from the source conv must be kept, not discarded.
            conv.bias.data.zero_()
        # Batch norm divides by sqrt(var + eps); eps belongs inside the root.
        scale = bn.weight.data / torch.sqrt(bn.running_var + bn.eps)
        conv.weight.data[:] = conv.weight.data * scale
        conv.bias.data[:] = (conv.bias.data -
                             bn.running_mean) * scale + bn.bias.data
        fused_mods.append(conv)
        idx += 2
    return SparseSequential(*fused_mods)
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
from enum import Enum
from cumm import tensorview as tv
from cumm.gemm.algospec.core import ShuffleStrideType
import torch
import numpy as np
import spconv
from spconv.algo import AlgoHint, ConvAlgo
from typing import List, Union
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from spconv.core_cc.csrc.sparse.all import SpconvOps
from spconv.algo import GEMM # , GATHER, SCATTER
import time
from spconv.constants import FILTER_HWIO
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    """Return the per-dimension output size of a convolution.

    All arguments are per-dimension sequences indexed in step with
    ``input_size``. A kernel size of -1 yields an output size of 1 for that
    dimension.
    """
    output_size = []
    for axis, extent in enumerate(input_size):
        kernel = kernel_size[axis]
        size = (extent + 2 * padding[axis] - dilation[axis] *
                (kernel - 1) - 1) // stride[axis] + 1
        output_size.append(1 if kernel == -1 else size)
    return output_size
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
                           output_padding):
    """Return the per-dimension output size of a transposed convolution.

    Uses the standard transposed-convolution relation
    ``(in - 1) * stride - 2 * padding + dilation * (kernel - 1)
    + output_padding + 1``. With ``dilation == 1`` this equals the previous
    ``... + kernel + output_padding`` form; the previous form ignored
    ``dilation`` entirely.

    Raises:
        ValueError: if a kernel size is -1.
    """
    ndim = len(input_size)
    output_size = []
    for i in range(ndim):
        if kernel_size[i] == -1:
            raise ValueError("deconv don't support kernel_size < 0")
        size = ((input_size[i] - 1) * stride[i] - 2 * padding[i] +
                dilation[i] * (kernel_size[i] - 1) + output_padding[i] + 1)
        output_size.append(size)
    return output_size
def get_indice_pairs(indices: torch.Tensor,
                     batch_size: int,
                     spatial_shape: List[int],
                     algo: ConvAlgo,
                     ksize: List[int],
                     stride: List[int],
                     padding: List[int],
                     dilation: List[int],
                     out_padding: List[int],
                     subm: bool = False,
                     transpose: bool = False):
    """Build the output indices and input/output index pairs for a sparse conv.

    Args:
        indices: 2-D index tensor; presumably column 0 is the batch index
            followed by ``ndim`` spatial coordinates (``ndim`` is derived as
            ``indices.shape[1] - 1``) -- TODO confirm against callers.
        batch_size: forwarded to the native index kernels.
        spatial_shape: input spatial extents.
        algo: must be ``ConvAlgo.Native``; anything else trips the assert.
        ksize, stride, padding, dilation: per-axis convolution parameters.
        out_padding: only used for the transposed output-size computation.
        subm: when true the output indices are the input indices and the
            output shape equals ``spatial_shape``.
        transpose: selects the transposed output-size formula and is
            forwarded to the native kernels as ``transposed``.

    Returns:
        ``(out_inds, pair, indice_num_per_loc)`` where ``pair`` has shape
        ``(2, kv, indices.shape[0])`` (initialised to -1) and
        ``indice_num_per_loc`` has shape ``(kv,)`` (initialised to 0), with
        ``kv`` the product of ``ksize``. Both are filled by the native
        kernels.
    """
    # torch.cuda.synchronize()
    # t = time.time()
    ndim = indices.shape[1] - 1
    # kv: number of kernel offsets (product of the kernel extents).
    kv: int = functools.reduce(lambda x, y: x * y, ksize, 1)
    if not subm:
        if transpose:
            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                               padding, dilation, out_padding)
        else:
            out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                             padding, dilation)
    else:
        out_shape = spatial_shape
    assert algo == ConvAlgo.Native, "TODO"
    stream = get_current_stream()
    # Buffers written by the native kernels: -1 marks an unused pair slot.
    pair = torch.full((2, kv, indices.shape[0]),
                      -1,
                      dtype=indices.dtype,
                      device=indices.device)
    indice_num_per_loc = torch.zeros((kv, ),
                                     dtype=indices.dtype,
                                     device=indices.device)
    inds_tv = torch_tensor_to_tv(indices)
    pair_tv = torch_tensor_to_tv(pair)
    indice_num_per_loc_tv = torch_tensor_to_tv(indice_num_per_loc)
    if subm:
        # Submanifold: the output index set is the input index set itself.
        out_inds = indices
        # int64 scratch, two slots per output index, handed to the kernel as
        # tv.custom64 (presumably hash-table storage -- TODO confirm).
        hashdata = torch.empty((out_inds.shape[0] * 2, ),
                               dtype=torch.int64,
                               device=indices.device)
        out_inds_tv = torch_tensor_to_tv(out_inds)
        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
        SpconvOps.generate_subm_conv_inds(inds_tv,
                                          hashdata_tv,
                                          pair_tv,
                                          out_inds_tv,
                                          indice_num_per_loc_tv,
                                          batch_size=batch_size,
                                          input_dims=spatial_shape,
                                          ksize=ksize,
                                          dilation=dilation,
                                          stream_int=stream)
        # torch.cuda.synchronize()
        # print("SUBM", time.time() - t)
    else:
        # Stage 1 fills indice_pairs_uniq; one slot per (offset, input) pair
        # plus one extra element.
        indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
                                        dtype=indices.dtype,
                                        device=indices.device)
        indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
        SpconvOps.generate_conv_inds_stage1(inds_tv,
                                            pair_tv,
                                            indice_pairs_uniq_tv,
                                            indice_num_per_loc_tv,
                                            batch_size=batch_size,
                                            output_dims=out_shape,
                                            input_dims=spatial_shape,
                                            ksize=ksize,
                                            stride=stride,
                                            padding=padding,
                                            dilation=dilation,
                                            transposed=transpose,
                                            stream_int=stream)
        # Deduplicate on the torch side. The count is reduced by one;
        # presumably one unique value is a sentinel for invalid slots --
        # TODO confirm against the stage-1 kernel.
        uniq_res = indice_pairs_uniq.unique()
        num_act_out = uniq_res.shape[0] - 1
        uniq_res_tv = torch_tensor_to_tv(uniq_res)
        # num_act_out = SpconvOps.generate_conv_inds_stage1_5(
        # indice_pairs_uniq_tv,
        # ndim,
        # uniq_size=indice_pairs_uniq_tv.size,
        # stream_int=stream)
        # uniq_res_tv = indice_pairs_uniq_tv.slice_first_axis(0, num_act_out)
        out_inds = torch.empty((num_act_out, indices.shape[1]),
                               dtype=indices.dtype,
                               device=indices.device)
        hashdata = torch.empty((out_inds.shape[0] * 2, ),
                               dtype=torch.int64,
                               device=indices.device)
        out_inds_tv = torch_tensor_to_tv(out_inds)
        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
        # Stage 2 writes out_inds and finalises pair from the unique values.
        SpconvOps.generate_conv_inds_stage2(inds_tv,
                                            hashdata_tv,
                                            pair_tv,
                                            uniq_res_tv,
                                            out_inds_tv,
                                            num_out_act=num_act_out,
                                            batch_size=batch_size,
                                            output_dims=out_shape,
                                            input_dims=spatial_shape,
                                            ksize=ksize,
                                            stride=stride,
                                            padding=padding,
                                            dilation=dilation,
                                            transposed=transpose,
                                            stream_int=stream)
        # torch.cuda.synchronize()
        # print("REGU", time.time() - t)
    return out_inds, pair, indice_num_per_loc
def indice_conv(features: torch.Tensor,
                filters: torch.Tensor,
                indice_pairs: torch.Tensor,
                indice_pair_num: torch.Tensor,
                num_activate_out: int,
                inverse: bool = False,
                subm: bool = False,
                algo: ConvAlgo = ConvAlgo.Native):
    """Forward sparse convolution: one indexed GEMM per kernel offset.

    Args:
        features: 2-D input features (rows are active points).
        filters: weights; all leading dims are flattened into ``kv`` kernel
            offsets. The last two dims are (in, out) channels when
            ``FILTER_HWIO`` is set and (out, in) otherwise.
        indice_pairs: index pairs as produced by ``get_indice_pairs``;
            ``indice_pairs[int(inverse)]`` is read as the input side and
            ``indice_pairs[int(not inverse)]`` as the output side.
        indice_pair_num: number of valid pairs per kernel offset.
        num_activate_out: number of output rows (only used when ``subm`` is
            false; for ``subm`` the output has one row per input row).
        inverse: swaps the roles of the two halves of ``indice_pairs``.
        subm: submanifold mode. The center offset is computed with a dense
            ``torch.mm`` and offsets past the center reuse the pair count of
            their mirrored offset.
        algo: accepted for interface compatibility; not read here.

    Returns:
        The output feature tensor.

    Raises:
        NotImplementedError: for int8 / qint8 features.
    """
    # filters: RSKC
    if features.dtype == torch.int8 or features.dtype == torch.qint8:
        raise NotImplementedError("work in progress")
    out_channel = filters.shape[-1] if FILTER_HWIO else filters.shape[-2]
    filters = filters.reshape(-1, *filters.shape[-2:])
    kv = filters.shape[0]
    kv_center = kv // 2
    if subm:
        center = filters[kv_center] if FILTER_HWIO else filters[kv_center].T
        out_features = torch.mm(features, center)
    else:
        out_features = torch.zeros((num_activate_out, out_channel),
                                   dtype=features.dtype,
                                   device=features.device)
    if kv == 1 and subm:
        return out_features

    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    # Pick the kernel offset used to profile the GEMM. The default offset may
    # have no pairs at all; profiling on empty index lists is meaningless, so
    # fall back to the most populated offset whose own count drives the loop
    # below (in subm mode those are the offsets before the center).
    profile_idx = kv_center - 1 if subm else kv_center
    nhot_profile = indice_pair_num_cpu[profile_idx]
    if nhot_profile <= 0:
        candidates = range(kv_center) if subm else range(kv)
        profile_idx = max(candidates, key=lambda j: indice_pair_num_cpu[j])
        nhot_profile = indice_pair_num_cpu[profile_idx]
        if nhot_profile <= 0:
            # No offset has any pair: the accumulation loop would be a no-op.
            return out_features

    stream = get_current_stream()
    arch = torch.cuda.get_device_capability()
    # The filter operand is transposed unless it is stored as (in, out).
    trans_b = not FILTER_HWIO
    a = torch_tensor_to_tv(features)
    c = torch_tensor_to_tv(out_features)
    filters_tv = torch_tensor_to_tv(filters)
    # Slice on the tv.Tensor side; per-offset torch slicing is slow.
    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    pair_in = indice_pairs_tv[int(inverse)]
    pair_out = indice_pairs_tv[int(not inverse)]

    profile_res = GEMM.get_profiled_algo(
        a.shape,
        filters.shape[-2:],
        c.shape,
        False,
        trans_b,
        False,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAC,
        a_inds_shape=[nhot_profile],
        c_inds_shape=[nhot_profile],
        hint=AlgoHint.Fowrard.value)
    if profile_res is None:
        # Nothing cached for this problem: profile on the chosen offset.
        profile_res, _ = GEMM.profile_and_cache(
            a,
            filters_tv[profile_idx],
            c,
            False,
            trans_b,
            False,
            arch=arch,
            shuffle_type=ShuffleStrideType.ShuffleAC,
            a_inds=pair_in[profile_idx].slice_first_axis(0, nhot_profile),
            c_inds=pair_out[profile_idx].slice_first_axis(0, nhot_profile),
            alpha=1.0,
            beta=0.0,
            hint=AlgoHint.Fowrard.value,
            stream=stream)

    # In subm mode the output already holds the center contribution, so every
    # GEMM accumulates; otherwise the first GEMM runs with beta == 0.
    inited = subm
    for i, nhot in enumerate(indice_pair_num_cpu):
        if subm:
            if i == kv_center:
                continue
            if i > kv_center:
                # Offsets past the center reuse the mirrored offset's count.
                nhot = indice_pair_num_cpu[kv - i - 1]
        if nhot <= 0:
            continue
        # inp @ filter.T, NC @ KC
        GEMM.run_profile(profile_res,
                         a,
                         filters_tv[i],
                         c,
                         False,
                         trans_b,
                         False,
                         arch=arch,
                         stream=stream,
                         shuffle_type=ShuffleStrideType.ShuffleAC,
                         a_inds=pair_in[i].slice_first_axis(0, nhot),
                         c_inds=pair_out[i].slice_first_axis(0, nhot),
                         hint=AlgoHint.Fowrard.value,
                         alpha=1.0,
                         beta=1.0 if inited else 0.0)
        inited = True
    return out_features
def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
                      num_activate_out, inverse, subm):
    """Placeholder entry point; every call raises ``NotImplementedError``."""
    raise NotImplementedError()
def indice_conv_backward(features: torch.Tensor,
                         filters: torch.Tensor,
                         out_bp: torch.Tensor,
                         indice_pairs: torch.Tensor,
                         indice_pair_num: torch.Tensor,
                         inverse: bool = False,
                         subm: bool = False,
                         algo: ConvAlgo = ConvAlgo.Native):
    """Backward pass of ``indice_conv``: input gradient and filter gradient.

    Args:
        features: forward input features; must be contiguous.
        filters: forward weights; must be contiguous. Leading dims are
            flattened into ``kv`` kernel offsets for the computation and the
            original shape is restored on the returned gradient.
        out_bp: gradient w.r.t. the forward output; must be contiguous.
        indice_pairs: index pairs; ``indice_pairs[int(inverse)]`` is read as
            the input side, ``indice_pairs[int(not inverse)]`` as the output
            side.
        indice_pair_num: number of valid pairs per kernel offset.
        inverse: swaps the roles of the two halves of ``indice_pairs``.
        subm: submanifold mode; the center offset is handled with dense
            ``torch.mm`` calls and offsets past the center reuse the pair
            count of their mirrored offset.
        algo: not read in this function.

    Returns:
        ``(din, dfilters)`` with ``din`` shaped like ``features`` and
        ``dfilters`` shaped like the original ``filters``.
    """
    # torch.cuda.synchronize()
    # t = time.time()
    num_activate_out = out_bp.shape[0]
    out_channel = out_bp.shape[-1]
    filters_shape = filters.shape
    filters = filters.reshape(-1, *filters.shape[-2:])
    kv = filters.shape[0]
    kv_center = kv // 2
    assert out_bp.is_contiguous()
    assert filters.is_contiguous()
    assert features.is_contiguous()
    if subm:
        # Center offset: dense matmuls over all rows, written straight into
        # the center slice of dfilters and used to initialise din.
        dfilters = torch.zeros_like(filters)
        if FILTER_HWIO:
            torch.mm(features.T, out_bp, out=dfilters[kv_center])
            # TODO can we use torch mm for f16 backward weight?
            din = torch.mm(out_bp, filters[kv_center].T)
        else:
            torch.mm(out_bp.T, features, out=dfilters[kv_center])
            # TODO can we use torch mm for f16 backward weight?
            din = torch.mm(out_bp, filters[kv_center])
    else:
        dfilters = torch.zeros_like(filters)
        din = torch.zeros_like(features)
    if kv == 1 and subm:
        # Only the center offset exists; the dense result is already final.
        return (din, dfilters.reshape(filters_shape))
    inited: bool = subm
    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    # torch slice (a_th[x]) is very slow, so we need to use tv.Tensor earlier.
    pair_in = indice_pairs_tv[int(inverse)]
    pair_out = indice_pairs_tv[int(not inverse)]
    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    arch = torch.cuda.get_device_capability()
    filters_tv = torch_tensor_to_tv(filters)
    dfilters_tv = torch_tensor_to_tv(dfilters)
    out_bp_tv = torch_tensor_to_tv(out_bp)
    features_tv = torch_tensor_to_tv(features)
    din_tv = torch_tensor_to_tv(din)
    # Kernel offset used for profiling: the center, or the offset just before
    # it in subm mode (the center itself is never run through GEMM there).
    profile_idx = kv_center
    if subm:
        profile_idx = kv_center - 1
    # profile_idx = first_n
    nhot_profile = indice_pair_num_cpu[profile_idx]
    # print(nhot_profile, indice_pair_num_cpu)
    # Input-gradient GEMM: look up a cached algorithm, profile if missing.
    profile_res_dgrad = GEMM.get_profiled_algo(
        out_bp_tv.shape,
        filters.shape[-2:],
        din_tv.shape,
        False,
        True if FILTER_HWIO else False,
        False,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAC,
        a_inds_shape=[nhot_profile],
        c_inds_shape=[nhot_profile],
        hint=AlgoHint.BackwardInput.value)
    if profile_res_dgrad is None:
        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
        filter_tv = filters_tv[profile_idx]
        profile_res_dgrad, min_time = GEMM.profile_and_cache(
            out_bp_tv,
            filter_tv,
            din_tv,
            False,
            True if FILTER_HWIO else False,
            False,
            arch=arch,
            shuffle_type=ShuffleStrideType.ShuffleAC,
            a_inds=out_indices,
            c_inds=inp_indices,
            alpha=1.0,
            beta=0.0,
            hint=AlgoHint.BackwardInput.value,
            stream=stream)
    # Weight-gradient GEMM operands: the operand order follows the filter
    # layout, mirroring the dense center computation above.
    if not FILTER_HWIO:
        a_wgrad = out_bp_tv
        b_wgrad = features_tv
    else:
        a_wgrad = features_tv
        b_wgrad = out_bp_tv
    profile_res_wgrad = GEMM.get_profiled_algo(
        a_wgrad.shape,
        b_wgrad.shape,
        filters.shape[-2:],
        True,
        False,
        False,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAB,
        a_inds_shape=[nhot_profile],
        b_inds_shape=[nhot_profile],
        hint=AlgoHint.BackwardWeight.value)
    if profile_res_wgrad is None:
        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
        dfilter_tv = dfilters_tv[profile_idx]
        if not FILTER_HWIO:
            a_inds_wgrad = out_indices
            b_inds_wgrad = inp_indices
        else:
            a_inds_wgrad = inp_indices
            b_inds_wgrad = out_indices
        profile_res_wgrad, min_time = GEMM.profile_and_cache(
            a_wgrad,
            b_wgrad,
            dfilter_tv,
            True,
            False,
            False,
            arch=arch,
            shuffle_type=ShuffleStrideType.ShuffleAB,
            a_inds=a_inds_wgrad,
            b_inds=b_inds_wgrad,
            alpha=1.0,
            beta=0.0,
            hint=AlgoHint.BackwardWeight.value,
            stream=stream)
    # print(profile_res_wgrad.algo_desp, profile_res_wgrad.splitk, min_time)
    maxnhot = max(indice_pair_num_cpu)
    # get workspace size for wgrad
    # The workspace is sized for the largest per-offset pair count so a
    # single buffer can be shared by every wgrad GEMM in the loop below.
    if not FILTER_HWIO:
        a_shape = [maxnhot, out_bp_tv.dim(1)]
        b_shape = [maxnhot, features_tv.dim(1)]
    else:
        b_shape = [maxnhot, out_bp_tv.dim(1)]
        a_shape = [maxnhot, features_tv.dim(1)]
    m, n, k = GEMM.extract_mnk(a_shape,
                               b_shape,
                               profile_res_wgrad.algo_desp.trans_a,
                               profile_res_wgrad.algo_desp.trans_b,
                               profile_res_wgrad.algo_desp.trans_c,
                               arch=arch,
                               shuffle_type=ShuffleStrideType.ShuffleAB,
                               a_inds_shape=[maxnhot],
                               b_inds_shape=[maxnhot],
                               hint=AlgoHint.BackwardWeight.value)
    workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(
        m, n, k, profile_res_wgrad.splitk)
    # Empty placeholders are passed when no workspace is required.
    workspace = torch.Tensor()
    workspace_tv = tv.Tensor()
    if workspace_size > 0:
        workspace = torch.empty((workspace_size, ),
                                dtype=torch.int8,
                                device=features.device)
        workspace_tv = torch_tensor_to_tv(workspace)
    # print(workspace_size, m, n, k, profile_res_wgrad.splitk)
    # torch.cuda.synchronize()
    # di_time = time.time() - t
    # t = time.time()
    # In subm mode din already holds the center contribution, so every GEMM
    # accumulates (beta == 1); otherwise the first executed GEMM overwrites.
    inited = subm
    for i, nhot in enumerate(indice_pair_num_cpu):
        if subm and i == kv_center:
            continue
        if subm and i > kv_center:
            # Offsets past the center reuse the mirrored offset's pair count.
            nhot = indice_pair_num_cpu[kv - i - 1]
        if nhot <= 0:
            continue
        # NOTE(review): the same beta is also passed to the wgrad GEMM below,
        # although each offset writes its own dfilters_tv[i] slice -- confirm
        # that accumulating into the zero-initialised slice is intended.
        beta = 1.0 if inited else 0.0
        inp_indices = pair_in[i].slice_first_axis(0, nhot)
        out_indices = pair_out[i].slice_first_axis(0, nhot)
        # out.T @ inp, NK @ NC
        # print(features_tv.shape, out_bp_tv.shape)
        # Input gradient for this offset, written into din at inp_indices.
        GEMM.run_profile(profile_res_dgrad,
                         out_bp_tv,
                         filters_tv[i],
                         din_tv,
                         False,
                         True if FILTER_HWIO else False,
                         False,
                         arch=arch,
                         stream=stream,
                         shuffle_type=ShuffleStrideType.ShuffleAC,
                         a_inds=out_indices,
                         c_inds=inp_indices,
                         hint=AlgoHint.BackwardInput.value,
                         alpha=1.0,
                         beta=beta)
        if not FILTER_HWIO:
            a = out_bp_tv
            b = features_tv
            a_inds = out_indices
            b_inds = inp_indices
        else:
            a = features_tv
            b = out_bp_tv
            a_inds = inp_indices
            b_inds = out_indices
        # Filter gradient for this offset, written into dfilters_tv[i].
        GEMM.run_profile(profile_res_wgrad,
                         a,
                         b,
                         dfilters_tv[i],
                         True,
                         False,
                         False,
                         arch=arch,
                         stream=stream,
                         shuffle_type=ShuffleStrideType.ShuffleAB,
                         a_inds=a_inds,
                         b_inds=b_inds,
                         hint=AlgoHint.BackwardWeight.value,
                         alpha=1.0,
                         beta=beta,
                         workspace=workspace_tv)
        inited = True
    # torch.cuda.synchronize()
    # dw_time = time.time() - t
    # # print(dw_time + di_time, di_time, dw_time, profile_res_wgrad.splitk, profile_res_wgrad.algo_desp, dfilters.shape)
    # # print(dw_time + di_time)
    # print("BWG", time.time() - t)
    return (din, dfilters.reshape(filters_shape))
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    """Sparse max pooling forward.

    Args:
        features: 2-D input features (rows are active points).
        indice_pairs: index pairs; ``indice_pairs[0]`` is the input side and
            ``indice_pairs[1]`` the output side, one row per kernel offset.
        indice_pair_num: number of valid pairs per kernel offset.
        num_activate_out: number of output rows.

    Returns:
        A ``(num_activate_out, features.shape[-1])`` tensor, zero-initialised
        and then updated by ``SpconvOps.maxpool_forward`` once per non-empty
        kernel offset.
    """
    out_features = torch.zeros((num_activate_out, features.shape[-1]),
                               dtype=features.dtype,
                               device=features.device)
    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    out_features_tv = torch_tensor_to_tv(out_features)
    features_tv = torch_tensor_to_tv(features)
    # Convert the pair tensor once and slice on the tv.Tensor side; slicing
    # the torch tensor for every kernel offset is slow.
    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    pair_in = indice_pairs_tv[0]
    pair_out = indice_pairs_tv[1]
    for i, nhot in enumerate(indice_pair_num_cpu):
        if nhot <= 0:
            continue
        inp_indices = pair_in[i].slice_first_axis(0, nhot)
        out_indices = pair_out[i].slice_first_axis(0, nhot)
        SpconvOps.maxpool_forward(out_features_tv, features_tv, out_indices,
                                  inp_indices, stream)
    return out_features
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
                            indice_pair_num):
    """Sparse max pooling backward.

    Args:
        features: forward input features.
        out_features: forward output features.
        out_bp: gradient w.r.t. ``out_features``.
        indice_pairs: index pairs; ``indice_pairs[0]`` is the input side and
            ``indice_pairs[1]`` the output side, one row per kernel offset.
        indice_pair_num: number of valid pairs per kernel offset.

    Returns:
        ``din``, a tensor shaped like ``features``, zero-initialised and then
        updated by ``SpconvOps.maxpool_backward`` once per non-empty kernel
        offset.
    """
    din = torch.zeros_like(features)
    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    out_features_tv = torch_tensor_to_tv(out_features)
    features_tv = torch_tensor_to_tv(features)
    out_bp_tv = torch_tensor_to_tv(out_bp)
    din_tv = torch_tensor_to_tv(din)
    # Convert the pair tensor once and slice on the tv.Tensor side; slicing
    # the torch tensor for every kernel offset is slow.
    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    pair_in = indice_pairs_tv[0]
    pair_out = indice_pairs_tv[1]
    for i, nhot in enumerate(indice_pair_num_cpu):
        if nhot <= 0:
            continue
        inp_indices = pair_in[i].slice_first_axis(0, nhot)
        out_indices = pair_out[i].slice_first_axis(0, nhot)
        SpconvOps.maxpool_backward(out_features_tv, features_tv, out_bp_tv,
                                   din_tv, out_indices, inp_indices, stream)
    return din
def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
    """Placeholder entry point; every call raises ``NotImplementedError``."""
    raise NotImplementedError()
def pillar_scatter(features, coors, shape):
    """Placeholder entry point; every call raises ``NotImplementedError``."""
    raise NotImplementedError()
# Copyright 2019 Yan Yan
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -21,10 +21,12 @@ from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import spconv
import spconv.functional as Fsp
from spconv import ops
from spconv.modules import SparseModule
from spconv import pytorch as spconv
from spconv.algo import ConvAlgo
import spconv.pytorch.functional as Fsp
from spconv.pytorch import ops
from spconv.pytorch.core import IndiceData
from spconv.pytorch.modules import SparseModule
class SparseMaxPool(SparseModule):
......@@ -34,8 +36,10 @@ class SparseMaxPool(SparseModule):
stride=None,
padding=0,
dilation=1,
subm=False):
super(SparseMaxPool, self).__init__()
indice_key=None,
subm=False,
name=None):
super(SparseMaxPool, self).__init__(name=name)
if not isinstance(kernel_size, (list, tuple)):
kernel_size = [kernel_size] * ndim
if stride is None:
......@@ -46,13 +50,13 @@ class SparseMaxPool(SparseModule):
padding = [padding] * ndim
if not isinstance(dilation, (list, tuple)):
dilation = [dilation] * ndim
self.ndim = ndim
self.kernel_size = kernel_size
self.stride = stride
self.padding = padding
self.subm = subm
self.dilation = dilation
self.indice_key = indice_key
def forward(self, input):
assert isinstance(input, spconv.SparseConvTensor)
......@@ -67,27 +71,130 @@ class SparseMaxPool(SparseModule):
self.dilation)
else:
out_spatial_shape = spatial_shape
out_tensor = input.shadow_copy()
if input.benchmark:
if self.name is None:
raise ValueError(
"you need to assign name to spmodules before benchmark (spconv.utils.bench.assign_name_to_spmod)"
)
if self.name not in input.benchmark_record:
input.benchmark_record[self.name] = {
"type": "SparseMaxPool",
"indice_gen_time": [],
"time": [],
"num_points": [],
"num_out_points": [],
"params": {
"kernel_size": self.kernel_size,
"stride": self.stride,
"padding": self.padding,
"dilation": self.dilation,
"channels": features.shape[1],
}
}
if input.benchmark:
torch.cuda.synchronize()
t = time.time()
outids, indice_pairs, indice_pairs_num = ops.get_indice_pairs(
indices, batch_size, spatial_shape, self.kernel_size, self.stride,
self.padding, self.dilation, 0, self.subm)
indices,
batch_size,
spatial_shape,
ConvAlgo.Native,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
0,
False)
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
out_tensor.benchmark_record[self.name]["indice_gen_time"].append(
interval)
t = time.time()
if self.indice_key is not None:
datas = input.find_indice_pair(self.indice_key)
if datas is None:
indice_data = IndiceData(outids, indices, indice_pairs,
indice_pairs_num, spatial_shape)
input.indice_dict[self.indice_key] = indice_data
else:
raise ValueError("indice data exists")
out_features = Fsp.indice_maxpool(features, indice_pairs.to(device),
indice_pairs_num.to(device),
outids.shape[0])
out_tensor = spconv.SparseConvTensor(out_features, outids,
out_spatial_shape, batch_size)
out_tensor.indice_dict = input.indice_dict
out_tensor.grid = input.grid
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
out_tensor.benchmark_record[self.name]["time"].append(interval)
out_tensor.benchmark_record[self.name]["num_points"].append(
features.shape[0])
out_tensor.benchmark_record[self.name]["num_out_points"].append(
out_features.shape[0])
out_tensor.features = out_features
out_tensor.indices = outids
out_tensor.spatial_shape = out_spatial_shape
return out_tensor
class SparseMaxPool1d(SparseMaxPool):
    """``SparseMaxPool`` with the number of dimensions fixed to 1."""

    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=1,
                 name=None):
        ndim = 1
        super().__init__(ndim,
                         kernel_size,
                         stride,
                         padding,
                         dilation,
                         name=name)
class SparseMaxPool2d(SparseMaxPool):
    """``SparseMaxPool`` with the number of dimensions fixed to 2."""

    # The block previously carried two ``__init__`` definitions; the first
    # (without ``name``) was dead code shadowed by this one and is removed.
    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=1,
                 name=None):
        super().__init__(2,
                         kernel_size,
                         stride,
                         padding,
                         dilation,
                         name=name)
class SparseMaxPool3d(SparseMaxPool):
    """``SparseMaxPool`` with the number of dimensions fixed to 3."""

    # The block previously carried two ``__init__`` definitions; the first
    # (without ``name``) was dead code shadowed by this one and is removed.
    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=1,
                 name=None):
        super().__init__(3,
                         kernel_size,
                         stride,
                         padding,
                         dilation,
                         name=name)
class SparseMaxPool4d(SparseMaxPool):
    """``SparseMaxPool`` with the number of dimensions fixed to 4."""

    def __init__(self,
                 kernel_size,
                 stride=None,
                 padding=0,
                 dilation=1,
                 name=None):
        ndim = 4
        super().__init__(ndim,
                         kernel_size,
                         stride,
                         padding,
                         dilation,
                         name=name)
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import time
import numpy as np
import torch
from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
from spconv import pytorch as spconv
from spconv.pytorch.modules import SparseModule
class RemoveDuplicate(SparseModule):
    """Drop rows of a sparse tensor whose indices repeat an earlier row.

    For every distinct index row the first occurrence (lowest row number) is
    kept; the surviving rows are ordered by their linearised index.
    """

    def forward(self, x: spconv.SparseConvTensor):
        """Return a new ``SparseConvTensor`` without duplicated indices.

        ``x`` is left untouched. The previous implementation accumulated the
        linear index into a view of ``x.indices`` (corrupting its last
        column) and unpacked the single tensor returned by ``torch.unique``
        into two names.
        """
        inds = x.indices
        spatial_shape = [x.batch_size, *x.spatial_shape]
        # Row-major strides over (batch, *spatial_shape).
        spatial_stride = [0] * len(spatial_shape)
        val = 1
        for i in range(inds.shape[1] - 1, -1, -1):
            spatial_stride[i] = val
            val *= spatial_shape[i]
        # Linearise every index row into a fresh int64 tensor: int64 avoids
        # overflow for large grids, and out-of-place adds keep x.indices
        # intact.
        num = inds.shape[0]
        indices_index = torch.zeros((num, ),
                                    dtype=torch.int64,
                                    device=inds.device)
        for i, axis_stride in enumerate(spatial_stride):
            indices_index = indices_index + axis_stride * inds[:, i].to(
                torch.int64)
        # Map each row to the rank of its linear index among the distinct
        # values, then order rows by (rank, row number). The composite keys
        # are all distinct, so the sort needs no stability guarantee.
        _, inverse = torch.unique(indices_index, return_inverse=True)
        position = torch.arange(num, dtype=torch.int64, device=inds.device)
        _, order = torch.sort(inverse * num + position)
        group = inverse[order]
        # The first row of every group is the earliest occurrence.
        is_first = torch.ones((num, ), dtype=torch.bool, device=inds.device)
        is_first[1:] = group[1:] != group[:-1]
        unique_inds = order[is_first]
        new_inds = inds[unique_inds]
        new_features = x.features[unique_inds]
        res = spconv.SparseConvTensor(new_features, new_inds, x.spatial_shape,
                                      x.batch_size, x.grid)
        return res
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment