Commit 01ed382c authored by yan.yan
Browse files

working on tensor core test

parent 3517290c
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2Voxel:
    """Type stub for the ``Point2Voxel`` binding (signatures only; bodies are ``...``).

    NOTE(review): the descriptions in this stub are inferred from names and
    annotations -- confirm them against the bound implementation.
    """
    # Tensor attributes exposed by the binding; shapes and dtypes are not
    # visible from this stub.
    hashdata: Tensor
    point_indice_data: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor
    # Read-only property yielding a list of ints (axis order is not stated here).
    @property
    def grid_size(self) -> List[int]: ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
        """Create a converter from the given voxelization settings.

        Args:
            vsize_xyz: voxel size as a list of floats; presumably one entry
                per axis in x, y, z order -- TODO confirm.
            coors_range_xyz: coordinate range as a list of floats; presumably
                the per-axis minima followed by the maxima -- TODO confirm.
            num_point_features: presumably the number of values stored per point.
            max_num_voxels: presumably the cap on the number of voxels produced.
            max_num_points_per_voxel: presumably the cap on points kept per voxel.
        """
        ...
    def point_to_voxel_hash(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Convert ``points`` to voxels (hash-based variant, going by the method name).

        Args:
            points: input point tensor.
            clear_voxels: defaults to ``True``; presumably resets the voxel
                buffers before the conversion -- TODO confirm.

        Returns:
            A tuple of three tensors; presumably
            ``(voxels, indices, num_per_voxel)`` -- TODO confirm.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Type stub for the ``Point2VoxelCPU`` binding (signatures only; bodies are ``...``).

    NOTE(review): the descriptions in this stub are inferred from names and
    annotations -- confirm them against the bound implementation.
    """
    # Tensor attributes exposed by the binding; shapes and dtypes are not
    # visible from this stub.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor
    # Read-only property yielding a list of ints (axis order is not stated here).
    @property
    def grid_size(self) -> List[int]: ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
        """Create a converter from the given voxelization settings.

        Args:
            vsize_xyz: voxel size as a list of floats; presumably one entry
                per axis in x, y, z order -- TODO confirm.
            coors_range_xyz: coordinate range as a list of floats; presumably
                the per-axis minima followed by the maxima -- TODO confirm.
            num_point_features: presumably the number of values stored per point.
            max_num_voxels: presumably the cap on the number of voxels produced.
            max_num_points_per_voxel: presumably the cap on points kept per voxel.
        """
        ...
    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Convert ``points`` to voxels.

        Args:
            points: input point tensor.
            clear_voxels: defaults to ``True``; presumably resets the voxel
                buffers before the conversion -- TODO confirm.

        Returns:
            A tuple of three tensors; presumably
            ``(voxels, indices, num_per_voxel)`` -- TODO confirm.
        """
        ...
    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Variant of ``point_to_voxel`` with the same signature.

        NOTE(review): the name suggests empty point slots are filled with a
        per-voxel mean -- confirm against the implementation.

        Args:
            points: input point tensor.
            clear_voxels: defaults to ``True``; presumably resets the voxel
                buffers before the conversion -- TODO confirm.

        Returns:
            A tuple of three tensors.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Type stub for the ``Point2VoxelCPU`` binding (signatures only; bodies are ``...``).

    NOTE(review): the descriptions in this stub are inferred from names and
    annotations -- confirm them against the bound implementation.
    """
    # Tensor attributes exposed by the binding; shapes and dtypes are not
    # visible from this stub.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor
    # Read-only property yielding a list of ints (axis order is not stated here).
    @property
    def grid_size(self) -> List[int]: ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
        """Create a converter from the given voxelization settings.

        Args:
            vsize_xyz: voxel size as a list of floats; presumably one entry
                per axis in x, y, z order -- TODO confirm.
            coors_range_xyz: coordinate range as a list of floats; presumably
                the per-axis minima followed by the maxima -- TODO confirm.
            num_point_features: presumably the number of values stored per point.
            max_num_voxels: presumably the cap on the number of voxels produced.
            max_num_points_per_voxel: presumably the cap on points kept per voxel.
        """
        ...
    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Convert ``points`` to voxels.

        Args:
            points: input point tensor.
            clear_voxels: defaults to ``True``; presumably resets the voxel
                buffers before the conversion -- TODO confirm.

        Returns:
            A tuple of three tensors; presumably
            ``(voxels, indices, num_per_voxel)`` -- TODO confirm.
        """
        ...
    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Variant of ``point_to_voxel`` with the same signature.

        NOTE(review): the name suggests empty point slots are filled with a
        per-voxel mean -- confirm against the implementation.

        Args:
            points: input point tensor.
            clear_voxels: defaults to ``True``; presumably resets the voxel
                buffers before the conversion -- TODO confirm.

        Returns:
            A tuple of three tensors.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Type stub for the ``Point2VoxelCPU`` binding (signatures only; bodies are ``...``).

    NOTE(review): the descriptions in this stub are inferred from names and
    annotations -- confirm them against the bound implementation.
    """
    # Tensor attributes exposed by the binding; shapes and dtypes are not
    # visible from this stub.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor
    # Read-only property yielding a list of ints (axis order is not stated here).
    @property
    def grid_size(self) -> List[int]: ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
        """Create a converter from the given voxelization settings.

        Args:
            vsize_xyz: voxel size as a list of floats; presumably one entry
                per axis in x, y, z order -- TODO confirm.
            coors_range_xyz: coordinate range as a list of floats; presumably
                the per-axis minima followed by the maxima -- TODO confirm.
            num_point_features: presumably the number of values stored per point.
            max_num_voxels: presumably the cap on the number of voxels produced.
            max_num_points_per_voxel: presumably the cap on points kept per voxel.
        """
        ...
    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Convert ``points`` to voxels.

        Args:
            points: input point tensor.
            clear_voxels: defaults to ``True``; presumably resets the voxel
                buffers before the conversion -- TODO confirm.

        Returns:
            A tuple of three tensors; presumably
            ``(voxels, indices, num_per_voxel)`` -- TODO confirm.
        """
        ...
    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Variant of ``point_to_voxel`` with the same signature.

        NOTE(review): the name suggests empty point slots are filled with a
        per-voxel mean -- confirm against the implementation.

        Args:
            points: input point tensor.
            clear_voxels: defaults to ``True``; presumably resets the voxel
                buffers before the conversion -- TODO confirm.

        Returns:
            A tuple of three tensors.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class Point2VoxelCPU:
    """Type stub for the ``Point2VoxelCPU`` binding (signatures only; bodies are ``...``).

    NOTE(review): the descriptions in this stub are inferred from names and
    annotations -- confirm them against the bound implementation.
    """
    # Tensor attributes exposed by the binding; shapes and dtypes are not
    # visible from this stub.
    densehashdata: Tensor
    voxels: Tensor
    indices: Tensor
    num_per_voxel: Tensor
    # Read-only property yielding a list of ints (axis order is not stated here).
    @property
    def grid_size(self) -> List[int]: ...
    def __init__(self, vsize_xyz: List[float], coors_range_xyz: List[float], num_point_features: int, max_num_voxels: int, max_num_points_per_voxel: int) -> None:
        """Create a converter from the given voxelization settings.

        Args:
            vsize_xyz: voxel size as a list of floats; presumably one entry
                per axis in x, y, z order -- TODO confirm.
            coors_range_xyz: coordinate range as a list of floats; presumably
                the per-axis minima followed by the maxima -- TODO confirm.
            num_point_features: presumably the number of values stored per point.
            max_num_voxels: presumably the cap on the number of voxels produced.
            max_num_points_per_voxel: presumably the cap on points kept per voxel.
        """
        ...
    def point_to_voxel(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Convert ``points`` to voxels.

        Args:
            points: input point tensor.
            clear_voxels: defaults to ``True``; presumably resets the voxel
                buffers before the conversion -- TODO confirm.

        Returns:
            A tuple of three tensors; presumably
            ``(voxels, indices, num_per_voxel)`` -- TODO confirm.
        """
        ...
    def point_to_voxel_empty_mean(self, points: Tensor, clear_voxels: bool = True) -> Tuple[Tensor, Tensor, Tensor]:
        """Variant of ``point_to_voxel`` with the same signature.

        NOTE(review): the name suggests empty point slots are filled with a
        per-voxel mean -- confirm against the implementation.

        Args:
            points: input point tensor.
            clear_voxels: defaults to ``True``; presumably resets the voxel
                buffers before the conversion -- TODO confirm.

        Returns:
            A tuple of three tensors.
        """
        ...
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class ScatterAll:
    """Type stub for the ``ScatterAll`` binding (signatures only; bodies are ``...``).

    NOTE(review): parameter descriptions are inferred from names and
    annotations -- confirm them against the bound implementation.
    """
    def __init__(self) -> None: ...
    # Returns a list of 4-int tuples; presumably one tuple per available
    # scatter kernel configuration -- TODO confirm the field order.
    @staticmethod
    def get_all_scatter_params() -> List[Tuple[int, int, int, int]]: ...
    def supported_scatter(self, tile_m: int, tile_k_bytes: int, bytes_per_access: int, num_threads: int, channel_size: int, dtype: int) -> bool:
        """Report whether a scatter configuration is supported.

        Args:
            tile_m: tile size along M (int).
            tile_k_bytes: tile size along K; presumably expressed in bytes.
            bytes_per_access: presumably the width of one memory access in bytes.
            num_threads: presumably the thread count of the kernel configuration.
            channel_size: presumably the channel count of the data.
            dtype: data type passed as an int code.

        Returns:
            ``True`` when the combination is supported.
        """
        ...
    @staticmethod
    def stream_synchronize(stream: int = 0) -> None:
        """Synchronize a stream.

        Args:
            stream: stream handle passed as an int; defaults to ``0``.
        """
        ...
    def scatter(self, output: Tensor, input: Tensor, indices: Tensor, tile_m: int, tile_k_bytes: int, bytes_per_access: int, num_threads: int, stream: int = 0) -> None:
        """Scatter ``input`` into ``output`` using an explicit kernel configuration.

        Args:
            output: destination tensor.
            input: source tensor.
            indices: index tensor; presumably selects the destination rows.
            tile_m: tile size along M (int).
            tile_k_bytes: tile size along K; presumably expressed in bytes.
            bytes_per_access: presumably the width of one memory access in bytes.
            num_threads: presumably the thread count of the kernel configuration.
            stream: stream handle passed as an int; defaults to ``0``.
        """
        ...
    def scatter2(self, output: Tensor, input: Tensor, indices: Tensor, size: int, stream: int = 0) -> None:
        """Scatter ``input`` into ``output`` without an explicit kernel configuration.

        Args:
            output: destination tensor.
            input: source tensor.
            indices: index tensor; presumably selects the destination rows.
            size: int size argument; presumably the number of indices used.
            stream: stream handle passed as an int; defaults to ``0``.
        """
        ...
class GatherAll:
    """Type stub for the ``GatherAll`` binding (signatures only; bodies are ``...``).

    NOTE(review): parameter descriptions are inferred from names and
    annotations -- confirm them against the bound implementation.
    """
    def __init__(self) -> None: ...
    # Returns a list of 4-int tuples; presumably one tuple per available
    # gather kernel configuration -- TODO confirm the field order.
    @staticmethod
    def get_all_gather_params() -> List[Tuple[int, int, int, int]]: ...
    @staticmethod
    def supported(bytes_per_access: int, channel_size: int, dtype: int) -> bool:
        """Report whether a gather configuration is supported.

        Args:
            bytes_per_access: presumably the width of one memory access in bytes.
            channel_size: presumably the channel count of the data.
            dtype: data type passed as an int code.

        Returns:
            ``True`` when the combination is supported.
        """
        ...
    @staticmethod
    def stream_synchronize(stream: int = 0) -> None:
        """Synchronize a stream.

        Args:
            stream: stream handle passed as an int; defaults to ``0``.
        """
        ...
    def gather(self, output: Tensor, input: Tensor, indices: Tensor, tile_m: int, tile_k_bytes: int, bytes_per_access: int, num_threads: int, stream: int = 0) -> None:
        """Gather from ``input`` into ``output`` using an explicit kernel configuration.

        Args:
            output: destination tensor.
            input: source tensor.
            indices: index tensor; presumably selects the source rows.
            tile_m: tile size along M (int).
            tile_k_bytes: tile size along K; presumably expressed in bytes.
            bytes_per_access: presumably the width of one memory access in bytes.
            num_threads: presumably the thread count of the kernel configuration.
            stream: stream handle passed as an int; defaults to ``0``.
        """
        ...
    def gather2(self, output: Tensor, input: Tensor, indices: Tensor, size: int, stream: int = 0) -> None:
        """Gather from ``input`` into ``output`` without an explicit kernel configuration.

        Args:
            output: destination tensor.
            input: source tensor.
            indices: index tensor; presumably selects the source rows.
            size: int size argument; presumably the number of indices used.
            stream: stream handle passed as an int; defaults to ``0``.
        """
        ...
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class GemmAlgoDesp:
    """Type stub for the GEMM algorithm descriptor binding (signatures only).

    NOTE(review): field and parameter descriptions are inferred from names
    and annotations -- confirm them against the bound implementation.
    """
    # Operand data types, stored as int codes.
    dtype_a: int
    dtype_b: int
    dtype_c: int
    # Three-int tile shapes; presumably ordered (m, n, k) -- TODO confirm.
    tile_shape: Tuple[int, int, int]
    warp_tile_shape: Tuple[int, int, int]
    num_stage: int
    # presumably the accumulator and compute dtypes as int codes -- TODO confirm.
    dacc: int
    dcomp: int
    algo: str
    tensorop: List[int]
    # Raw int fields; the bool properties split_k_serial / split_k_parallel
    # below presumably wrap them -- TODO confirm.
    split_k_serial_: int
    split_k_parallel_: int
    shuffle_type: str
    element_per_access_a: int
    element_per_access_b: int
    element_per_access_c: int
    def __init__(self) -> None: ...
    def __repr__(self) -> str: ...
    # Bool property with a setter.
    @property
    def split_k_serial(self) -> bool: ...
    @split_k_serial.setter
    def split_k_serial(self, val: bool) -> None:
        """Set ``split_k_serial``.

        Args:
            val: new flag value.
        """
        ...
    # Bool property with a setter.
    @property
    def split_k_parallel(self) -> bool: ...
    @split_k_parallel.setter
    def split_k_parallel(self, val: bool) -> None:
        """Set ``split_k_parallel``.

        Args:
            val: new flag value.
        """
        ...
    # Returns None; presumably raises when the descriptor is inconsistent.
    def check_valid(self) -> None: ...
    # Bool properties with setters; presumably the transpose flags of the
    # A, B and C operands -- TODO confirm.
    @property
    def trans_a(self) -> bool: ...
    @trans_a.setter
    def trans_a(self, val: bool) -> None:
        """Set ``trans_a``.

        Args:
            val: new flag value.
        """
        ...
    @property
    def trans_b(self) -> bool: ...
    @trans_b.setter
    def trans_b(self, val: bool) -> None:
        """Set ``trans_b``.

        Args:
            val: new flag value.
        """
        ...
    @property
    def trans_c(self) -> bool: ...
    @trans_c.setter
    def trans_c(self, val: bool) -> None:
        """Set ``trans_c``.

        Args:
            val: new flag value.
        """
        ...
    def query_workspace_size(self, m: int, n: int, k: int, split_k_slices: int) -> int:
        """Return the workspace size needed for a problem.

        Args:
            m: problem size M.
            n: problem size N.
            k: problem size K.
            split_k_slices: number of split-K slices.

        Returns:
            The workspace size as an int; the unit is not stated here
            (presumably bytes -- TODO confirm).
        """
        ...
    def supported(self, m: int, n: int, k: int) -> bool:
        """Report whether this algorithm supports the given problem size.

        Args:
            m: problem size M.
            n: problem size N.
            k: problem size K.
        """
        ...
    def supported_ldx(self, lda: int, ldb: int, ldc: int) -> bool:
        """Report whether this algorithm supports the given leading dimensions.

        Args:
            lda: leading dimension of A.
            ldb: leading dimension of B.
            ldc: leading dimension of C.
        """
        ...
class GemmParams:
    """Type stub for the GEMM launch-parameter binding (signatures only).

    NOTE(review): field descriptions are inferred from names and annotations
    -- confirm them against the bound implementation.
    """
    algo_desp: GemmAlgoDesp
    split_k_slices: int
    # Tensor fields declared with a default-constructed Tensor.
    workspace: Tensor = Tensor()
    # presumably optional index tensors for the A/B/C operands -- TODO confirm.
    a_inds: Tensor = Tensor()
    b_inds: Tensor = Tensor()
    c_inds: Tensor = Tensor()
    # presumably the usual GEMM scaling factors -- TODO confirm.
    alpha: float
    beta: float
    # Stream handle stored as an int.
    stream: int
    def __init__(self) -> None: ...
    # Returns None; presumably raises when the parameters are inconsistent.
    def check_valid(self) -> None: ...
    # Operand tensors, exposed as properties with setters.
    @property
    def a(self) -> Tensor: ...
    @a.setter
    def a(self, val: Tensor) -> None:
        """Set operand ``a``.

        Args:
            val: tensor to use as ``a``.
        """
        ...
    @property
    def b(self) -> Tensor: ...
    @b.setter
    def b(self, val: Tensor) -> None:
        """Set operand ``b``.

        Args:
            val: tensor to use as ``b``.
        """
        ...
    @property
    def c(self) -> Tensor: ...
    @c.setter
    def c(self, val: Tensor) -> None:
        """Set operand ``c``.

        Args:
            val: tensor to use as ``c``.
        """
        ...
class GemmMainUnitTest:
    """Type stub for the GEMM entry-point binding; every member is a static method.

    NOTE(review): parameter descriptions are inferred from names and
    annotations -- confirm them against the bound implementation.
    """
    # Returns a list of GemmAlgoDesp; presumably every compiled algorithm.
    @staticmethod
    def get_all_algo_desp() -> List[GemmAlgoDesp]: ...
    @staticmethod
    def extract_mnk(a_shape: List[int], b_shape: List[int], trans_a: bool, trans_b: bool, trans_c: bool, shuffle_type: str = "NS", a_inds_shape: List[int] = [], b_inds_shape: List[int] = [], c_inds_shape: List[int] = []) -> Tuple[int, int, int]:
        """Derive three ints from operand shapes and layout flags.

        Args:
            a_shape: shape of operand A.
            b_shape: shape of operand B.
            trans_a: transpose flag for A.
            trans_b: transpose flag for B.
            trans_c: transpose flag for C.
            shuffle_type: defaults to ``"NS"``; the accepted values are not
                visible from this stub.
            a_inds_shape: shape of the A index tensor; defaults to empty.
            b_inds_shape: shape of the B index tensor; defaults to empty.
            c_inds_shape: shape of the C index tensor; defaults to empty.

        Returns:
            A 3-int tuple; presumably ``(m, n, k)`` going by the method name.
        """
        ...
    @staticmethod
    def align_to_power2(val: int) -> int:
        """Return an int derived from ``val``.

        NOTE(review): presumably rounds to a power of two; the rounding
        direction is not visible from this stub.

        Args:
            val: input value.
        """
        ...
    @staticmethod
    def device_synchronize() -> None: ...
    @staticmethod
    def stream_synchronize(stream: int) -> None:
        """Synchronize a stream.

        Args:
            stream: stream handle passed as an int (no default here).
        """
        ...
    @staticmethod
    def simple_select_tile_shape(m: int, n: int, k: int, tile_ms: List[int], tile_ns: List[int], tile_ks: List[int], tile_shape_to_algos: Dict[int, List[int]], large_k_first: bool) -> List[int]:
        """Select candidates for a problem size from the available tile shapes.

        Args:
            m: problem size M.
            n: problem size N.
            k: problem size K.
            tile_ms: candidate tile sizes along M.
            tile_ns: candidate tile sizes along N.
            tile_ks: candidate tile sizes along K.
            tile_shape_to_algos: mapping from an int key to a list of ints;
                presumably an encoded tile shape to algorithm indices.
            large_k_first: presumably prefers larger K tiles when ``True``.

        Returns:
            A list of ints; presumably indices of the selected algorithms.
        """
        ...
    @staticmethod
    def matmul2(params: GemmParams) -> None:
        """Run a matmul described by ``params``.

        Args:
            params: a ``GemmParams`` instance.
        """
        ...
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from cumm.common import TensorViewKernel, ThrustLib
from cumm.conv.bases import ConvOpType, NHWC
from cumm.conv.params import ConvProblem
from cumm import dtypes
import pccm
from .pointops import Point2Voxel, Point2VoxelCPU
from .indices import SparseConvIndicesKernel, CudaCommonKernel
from .maxpool import IndiceMaxPool
# pccm class that registers the per-dimension Point2Voxel / Point2VoxelCPU /
# SparseConvIndicesKernel parameter classes and defines static, pybind-marked
# functions whose generated C++ dispatches on the runtime number of dimensions.
class SpconvOps(pccm.Class):
    def __init__(self):
        super().__init__()
        # Dimensionalities for which specialisations are registered; every
        # generated dispatcher below emits one `if (ndim == N)` branch per entry.
        self.ndims = [1, 2, 3, 4]
        for ndim in self.ndims:
            p2v = Point2Voxel(dtypes.float32, ndim)
            p2v_cpu = Point2VoxelCPU(dtypes.float32, ndim)
            self.add_param_class(f"ops{ndim}d", p2v, f"Point2Voxel{ndim}D")
            self.add_param_class(f"ops_cpu{ndim}d", p2v_cpu, f"Point2Voxel{ndim}DCPU")
            problem = ConvProblem(ndim, ConvOpType.kForward, NHWC, NHWC, NHWC)
            indices = SparseConvIndicesKernel(problem, dtypes.int32)
            # self.add_param_class("ops", indices, "SpconvIndices")
            # The indices kernel is registered only for the functions listed
            # here, under the alias SpconvIndices{ndim}D used in their C++.
            cuda_funcs = [self.generate_conv_inds, self.generate_subm_conv_inds,
                          self.generate_conv_inds_stage1, self.generate_conv_inds_stage2, self.sort_1d_by_key]
            self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d", indices, f"SpconvIndices{ndim}D")

    # Builds the static function `generate_conv_inds` (C++ return type int).
    # The generated code takes ndim from indices.dim(1) - 1, asserts that every
    # vector parameter has that length, copies the vectors into fixed-size
    # tv::array values and forwards to SpconvIndices{ndim}D::generate_conv_inds.
    # It throws when ndim matches none of self.ndims.
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_conv_inds(self):
        code = pccm.FunctionCode()
        code.arg("indices, hashdata", "tv::Tensor")
        code.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
        code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
""")
        # One dispatch branch per supported dimensionality.
        for ndim in self.ndims:
            code.raw(f"""
if (ndim == {ndim}){{
tv::array<int, {ndim}> output_dims_, input_dims_;
tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
for (int i = 0; i < {ndim}; ++i){{
output_dims_[i] = output_dims[i];
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
stride_[i] = stride[i];
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_conv_inds(indices, hashdata,
indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
}}
""")
        # Reached only when no branch above returned.
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code.ret("int")

    # Builds the static function `generate_conv_inds_stage1` (C++ return type
    # int); same validate / convert / dispatch structure as generate_conv_inds,
    # forwarding to SpconvIndices{ndim}D::generate_conv_inds_stage1.
    # NOTE(review): `stream_int` is declared as an argument but is not passed
    # on in the generated call, unlike generate_subm_conv_inds -- confirm
    # whether this is intended.
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_conv_inds_stage1(self):
        code = pccm.FunctionCode()
        code.arg("indices", "tv::Tensor")
        code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
        code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
""")
        # One dispatch branch per supported dimensionality.
        for ndim in self.ndims:
            code.raw(f"""
if (ndim == {ndim}){{
tv::array<int, {ndim}> output_dims_, input_dims_;
tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
for (int i = 0; i < {ndim}; ++i){{
output_dims_[i] = output_dims[i];
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
stride_[i] = stride[i];
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_conv_inds_stage1(indices,
indice_pairs, indice_pairs_uniq, indice_num_per_loc,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
}}
""")
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code.ret("int")

    # Builds the static function `generate_conv_inds_stage2` (C++ return type
    # int); same structure, forwarding to
    # SpconvIndices{ndim}D::generate_conv_inds_stage2 with the extra
    # `num_out_act` argument.
    # NOTE(review): `stream_int` is declared but not forwarded here either.
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_conv_inds_stage2(self):
        code = pccm.FunctionCode()
        code.arg("indices, hashdata", "tv::Tensor")
        code.arg("indice_pairs, indice_pairs_uniq, out_inds", "tv::Tensor")
        code.arg("num_out_act", "int")
        code.arg("batch_size", "int")
        code.arg("output_dims, input_dims", f"std::vector<int>")
        code.arg("ksize, stride, padding, dilation", f"std::vector<int>")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int")
        code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(output_dims.size() == ndim && input_dims.size() == ndim &&
ksize.size() == ndim && stride.size() == ndim && dilation.size() == ndim &&
padding.size() == ndim, "your params size not equal to ndim", ndim);
""")
        # One dispatch branch per supported dimensionality.
        for ndim in self.ndims:
            code.raw(f"""
if (ndim == {ndim}){{
tv::array<int, {ndim}> output_dims_, input_dims_;
tv::array<int, {ndim}> ksize_, stride_, padding_, dilation_;
for (int i = 0; i < {ndim}; ++i){{
output_dims_[i] = output_dims[i];
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
stride_[i] = stride[i];
padding_[i] = padding[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_conv_inds_stage2(indices, hashdata,
indice_pairs, indice_pairs_uniq, out_inds, num_out_act,
batch_size, output_dims_, input_dims_,
ksize_, stride_, padding_, dilation_);
}}
""")
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code.ret("int")

    # Builds the static function `generate_subm_conv_inds` (C++ return type
    # int). Only input_dims, ksize and dilation are validated and converted;
    # the optional mask tensor, the `backward` flag and `stream_int` are
    # forwarded to SpconvIndices{ndim}D::generate_subm_conv_inds.
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def generate_subm_conv_inds(self):
        code = pccm.FunctionCode()
        code.arg("indices, hashdata", "tv::Tensor")
        code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
        code.arg("batch_size", "int")
        code.arg("input_dims", f"std::vector<int>")
        code.arg("ksize, dilation", f"std::vector<int>")
        code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
        code.arg("backward", "bool", "false")
        code.arg("stream_int", f"std::uintptr_t", "0", pyanno="int = 0")
        code.raw(f"""
int ndim = indices.dim(1) - 1;
TV_ASSERT_RT_ERR(input_dims.size() == ndim &&
ksize.size() == ndim && dilation.size() == ndim, "your params size not equal to ndim", ndim);
""")
        # One dispatch branch per supported dimensionality.
        for ndim in self.ndims:
            code.raw(f"""
if (ndim == {ndim}){{
tv::array<int, {ndim}> input_dims_;
tv::array<int, {ndim}> ksize_, dilation_;
for (int i = 0; i < {ndim}; ++i){{
input_dims_[i] = input_dims[i];
ksize_[i] = ksize[i];
dilation_[i] = dilation[i];
}}
return SpconvIndices{ndim}D::generate_subm_conv_inds(indices, hashdata,
indice_pairs, out_inds, indice_num_per_loc,
batch_size, input_dims_,
ksize_, dilation_, indice_pair_mask, backward,
stream_int);
}}
""")
        code.raw(f"""TV_THROW_RT_ERR("unknown ndim", ndim);""")
        return code.ret("int")

    # Builds the static function `maxpool_forward`: a thin wrapper that
    # forwards all arguments to IndiceMaxPool::forward. No return type is set
    # on the FunctionCode.
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def maxpool_forward(self):
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("inp", "tv::Tensor")
        code.arg("out_inds", "tv::Tensor")
        code.arg("in_inds", "tv::Tensor")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.add_dependency(IndiceMaxPool)
        code.raw(f"""
return IndiceMaxPool::forward(out, inp, out_inds, in_inds, stream);
""")
        return code

    # Builds the static function `maxpool_backward`: a thin wrapper that
    # forwards all arguments to IndiceMaxPool::backward. No return type is set
    # on the FunctionCode.
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def maxpool_backward(self):
        code = pccm.FunctionCode()
        code.arg("out", "tv::Tensor")
        code.arg("inp", "tv::Tensor")
        code.arg("dout", "tv::Tensor")
        code.arg("dinp", "tv::Tensor")
        code.arg("out_inds", "tv::Tensor")
        code.arg("in_inds", "tv::Tensor")
        code.arg("stream", "std::uintptr_t", "0", pyanno="int")
        code.add_dependency(IndiceMaxPool)
        code.raw(f"""
return IndiceMaxPool::backward(out, inp, dout, dinp, out_inds, in_inds, stream);
""")
        return code

    # Builds the static function `sort_1d_by_key` (C++ return type tv::Tensor).
    # The generated code creates an int32 tensor with data.dim(0) elements,
    # fills it with arange_kernel, then calls thrust::sort_by_key with `data`
    # as the keys and that tensor as the values, and returns the tensor.
    # Dispatch covers int32/uint32/int64/uint64 element types.
    # NOTE(review): the thrust policy is hard-coded to `par.on(0)` and the
    # function takes no stream argument.
    @pccm.pybind.mark
    @pccm.cuda.static_function
    def sort_1d_by_key(self):
        code = pccm.FunctionCode()
        code.add_dependency(ThrustLib, TensorViewKernel)
        code.add_param_class("cudakers", CudaCommonKernel())
        code.arg("data", "tv::Tensor")
        code.raw(f"""
tv::Tensor indices({{data.dim(0)}}, tv::int32, 0);
tv::cuda::Launch launcher(data.dim(0));
launcher(cudakers::arange_kernel<int32_t>, indices.data_ptr<int32_t>(), indices.dim(0));
tv::dispatch<int32_t, uint32_t, int64_t, uint64_t>(data.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
thrust::device_ptr<T> ptr_tr(data.data_ptr<T>());
thrust::device_ptr<int32_t> ptr_k(indices.data_ptr<int32_t>());
auto thrust_ctx = thrust::cuda::par.on(0);
thrust::sort_by_key(thrust_ctx, ptr_tr, ptr_tr + data.dim(0), ptr_k);
}});
return indices;
""")
        return code.ret("tv::Tensor")
import torch
import time
def main():
    """Time ``Tensor.sort`` and ``torch.mm`` on CUDA tensors and print the elapsed seconds."""
    # Two independent int32 CUDA tensors of 130000 random integers each.
    arr = torch.randint(0, 130000, size=[130000]).to(torch.int32).cuda()
    arr2 = torch.randint(0, 130000, size=[130000]).to(torch.int32).cuda()
    # Wait for the setup work so it is not counted in the first measurement.
    torch.cuda.synchronize()
    # `ar` is referenced only by the commented-out lines below.
    ar = torch.arange(arr.shape[0]).cuda()
    t = time.time()
    for i in range(10):
        xx, indices = arr.sort()
        # thh = torch.empty_like(indices)
        xx2, indices2 = arr2.sort()
        # thh[indices] = ar
        # Synchronize before reading the clock so the sorts are included.
        torch.cuda.synchronize()
        print(time.time() - t)
        t = time.time()
    # print(indices[:10], thh[:10])
    # Operands for the matmul timing; `c` receives the result through `out=`.
    a = torch.rand(130000, 27 * 32).cuda().float()
    b = torch.rand(27 * 32, 32).cuda().float()
    c = torch.rand(130000, 32).cuda().float()
    for i in range(10):
        # Synchronize on both sides of the call so each print covers one mm.
        torch.cuda.synchronize()
        t = time.time()
        torch.mm(a, b, out=c)
        # thh[indices] = ar
        torch.cuda.synchronize()
        print(time.time() - t)
if __name__ == "__main__":
main()
\ No newline at end of file
#!/home/yy/library/anaconda3/bin/python
import sys
from pathlib import Path
import ctypes
# _cudart = ctypes.CDLL('libcudart.so')
print(str(Path(__file__).parent.parent.parent.parent))
sys.path.append(str(Path(__file__).parent.parent.parent.parent))
from spconv import tensorview as tv
from spconv.sparse import build
import numpy as np
from pathlib import Path
from spconv.spconv_ops_cc.sparse.all.ops import Point2Voxel
from spconv.spconv_ops_cc.sparse.all import SpconvOps
import time
def main():
    """Time ``Point2Voxel.point_to_voxel_hash`` on a saved point cloud and allocate index buffers.

    Prints the input shape/dtype, the grid size, six timings, the voxel
    shape, and statistics of the batch-prefixed indices. The buffers
    allocated at the end are consumed only by the commented-out calls.
    """
    archive = np.load("/home/yy/OneDrive/dev/spconv/test/data/benchmark-pc.npz")
    data = archive["pc"].astype(np.float32)
    print(data.shape, data.dtype)

    voxel_size = [0.1, 0.1, 0.1]
    coors_range = [-80, -80, -2, 80, 80, 6]
    p2v = Point2Voxel(voxel_size, coors_range, 3, 150000, 1)
    gs = p2v.grid_size  # zyx
    print(gs)
    # return
    data_tv = tv.from_numpy(data).cuda()

    # Six timed runs of the conversion.
    for _ in range(6):
        started = time.time()
        voxels, indices, num_per_voxel = p2v.point_to_voxel_hash(data_tv)
        print(time.time() - started)
    # One more untimed run whose outputs are used below.
    voxels, indices, num_per_voxel = p2v.point_to_voxel_hash(data_tv)
    print(voxels.shape, gs)
    gs_xyz = gs
    indices_np = indices.cpu().numpy()
    # indices_offset = indices_np[:, 0] * gs_xyz[1] * gs_xyz[2] + indices_np[:, 1] * gs_xyz[2] + indices_np[:, 2]
    # uq = np.unique(indices_offset)
    # print(uq.shape, indices_offset.shape, gs_xyz)
    # return
    ksize = [3] * 3
    kv = int(np.prod(ksize))

    # Prefix every index row with a zero column (column 0 stays 0).
    num_rows = indices_np.shape[0]
    indices_with_bs = np.zeros((num_rows, 4), dtype=np.int32)
    indices_with_bs[:, 1:] = indices_np
    print(indices_with_bs.mean(), indices_with_bs.max(), indices_with_bs.min())
    indices = tv.from_numpy(indices_with_bs).cuda()

    out_indices = tv.zeros([indices.dim(0) * kv, 4], tv.int32, 0)
    indice_num_per_loc = tv.zeros([kv], tv.int32, 0)
    points = voxels.view([-1, 3])
    hashdata = tv.zeros([points.dim(0) * kv * 2], tv.custom64, 0)
    hashdata_subm = tv.zeros([points.dim(0) * 2], tv.custom64, 0)
    indice_pairs = tv.full([2, kv, indices.dim(0)], -1, tv.int32, 0)
    indice_pairs_uniq = tv.zeros([indice_pairs.size // 2 + 1], tv.int32, 0)
    # for i in range(10):
    # indice_pairs.fill_int_(-1)
    # np.random.shuffle(indices_with_bs)
    # indices = tv.from_numpy(indices_with_bs).cuda()
    # indice_num_per_loc.zero_()
    # out_act = SpconvOps.generate_conv_inds(indices, hashdata, indice_pairs,
    # indice_pairs_uniq, out_indices, indice_num_per_loc,
    # 1, gs, gs, [3, 3, 3], [1, 1, 1], [1, 1, 1], [1, 1, 1])
    # indice_num_per_loc.zero_()
    # out_act = SpconvOps.generate_subm_conv_inds(indices, hashdata_subm, indice_pairs,
    # out_indices, indice_num_per_loc,
    # 1, gs, ksize, [1, 1, 1])
    # indice_num_per_loc_cpu = indice_num_per_loc.cpu().numpy()
    # indice_pairs_cpu = indice_pairs.cpu().numpy()
    # indice_pairs_cpu_flat = indice_pairs_cpu.reshape(-1)
    # uq, count = np.unique(indice_pairs_cpu_flat, return_counts=True)
    # print(out_act, indice_pairs_cpu.shape, indice_pairs_cpu.mean(), indice_num_per_loc_cpu.tolist())
    # print(indice_pairs_cpu[:, 13, :2])
    # print(uq, count)
if __name__ == "__main__":
main()
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from cumm.conv.bases import ConvEnum
from cumm.gemm.core.metaarray import MetaArray, seq
from cumm import dtypes
import pccm
from cumm.gemm.layout import TensorGeneric, to_stride
from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib
from cumm.gemm import codeops
from typing import List
from cumm.conv.params import ConvProblem
import numpy as np
class CudaCommonKernel(pccm.ParameterizedClass):
    # Holder for small templated CUDA __global__ helper kernels.
    # A ParameterizedClass is used instead of a plain Class because a CUDA
    # global function cannot be placed inside a class body.

    # Kernel template `arange_kernel<T>(T* data, int size)`: writes T(i) to
    # data[i] for every i visited by tv::KernelLoopX<int>(size).
    @pccm.cuda.cuda_global_function
    def arange_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("data", f"T*")
        code.arg("size", f"int")
        code.raw(f"""
for (int i : tv::KernelLoopX<int>(size)) {{
data[i] = T(i);
}}
""")
        return code

    # Kernel template `fill_kernel<T>(T* data, T val, int size)`: writes
    # T(val) to data[i] for every i visited by tv::KernelLoopX<int>(size).
    @pccm.cuda.cuda_global_function
    def fill_kernel(self):
        code = pccm.FunctionCode()
        code.targ("T")
        code.arg("data", f"T*")
        code.arg("val", f"T")
        code.arg("size", f"int")
        code.raw(f"""
for (int i : tv::KernelLoopX<int>(size)) {{
data[i] = T(val);
}}
""")
        return code
class ConvOutLocIter(pccm.ParameterizedClass):
    # Generates a host/device C++ helper that keeps a per-dimension filter
    # position (`count_`) for a ConvProblem and converts coordinates between
    # input (nhw) and output (npq) space for that filter position.
    # TODO add conv transpose
    def __init__(self, problem: ConvProblem):
        super().__init__()
        self.add_dependency(TensorView)
        self.add_param_class("lociter", problem, "ConvProblem")
        # Layout over (N, output dims...) and layout over the kernel dims.
        layout_npq = TensorGeneric(problem.ndim + 1, False)
        layout_rs = TensorGeneric(problem.ndim, False)
        self.add_param_class("lociter", layout_npq, "LayoutNPQ")
        self.add_param_class("lociter_rs", layout_rs, "LayoutRS")
        self.ndim = problem.ndim
        self.add_member("problem_", f"ConvProblem")
        self.add_member("count_", f"tv::array<int, {self.ndim}>")
        self.add_member("layout_npq", f"LayoutNPQ")
        self.add_member("layout_rs", f"LayoutRS")

    # Constructor: stores the problem, zero-initialises `count_` and builds
    # both layouts from the problem's N / output_dims and ksize.
    # NOTE(review): codeops.unpack presumably expands to
    # "problem.output_dims[0], problem.output_dims[1], ..." -- confirm.
    @pccm.cuda.constructor(host=True, device=True, forceinline=True)
    def ctor(self):
        code = pccm.FunctionCode()
        code.arg("problem", f"ConvProblem const&")
        code.ctor_init("problem_", f"problem")
        zeros = ", ".join(["0"] * self.ndim)
        code.ctor_init("count_", f"{{{zeros}}}")
        pqs = codeops.unpack("problem.output_dims", range(self.ndim))
        rss = codeops.unpack("problem.ksize", range(self.ndim))
        code.ctor_init("layout_npq", f"LayoutNPQ::from_shape({{problem.N, {pqs}}})")
        code.ctor_init("layout_rs", f"LayoutRS::from_shape({{{rss}}})")
        return code

    # operator++: advances `count_` like an odometer with the last dimension
    # moving fastest. A dimension that reaches its ksize is reset to 0 and the
    # carry moves to the previous dimension; after the final position every
    # entry is back at 0.
    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               name="operator++")
    def increment(self):
        code = pccm.FunctionCode()
        for i in range(self.ndim - 1, -1, -1):
            code.raw(f"""
if (++count_[{i}] < problem_.ksize[{i}]){{
return *this;
}}
count_[{i}] = 0;
""")
        code.raw("return *this;")
        return code.ret(f"{self.class_name}&")

    # Sets `count_` from a linear filter offset via layout_rs.inverse.
    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True)
    def set_filter_offset(self):
        code = pccm.FunctionCode()
        code.arg("filter_offset", "int")
        code.raw(f"""
layout_rs.inverse(filter_offset, count_);
""")
        return code

    # Maps an input coordinate to an output coordinate for the current filter
    # position: h = (in + padding - r * dilation) / stride, where the division
    # by stride is skipped when the template flag NoStride is true. Element 0
    # (the batch index) is passed through unchanged.
    # NOTE(review): codeops.unpack_str("h", ...) presumably expands to
    # "h_0, h_1, ..." -- confirm.
    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def nhw_to_npq(self):
        code = pccm.FunctionCode()
        code.arg("nhw_offset", "const int*")
        code.nontype_targ("NoStride", "bool")
        for i in range(self.ndim):
            code.raw(f"""
int r_{i} = count_[{i}];
int h_{i} = (nhw_offset[{i + 1}] + problem_.padding[{i}] -
r_{i} * problem_.dilation[{i}]) / (NoStride ? 1 : problem_.stride[{i}]);
""")
        h0h1h2 = codeops.unpack_str("h", range(self.ndim))
        code.raw(f"""
return {{nhw_offset[0], {h0h1h2}}};
""")
        return code.ret(f"tv::array<int, {self.ndim + 1}>")

    # Maps an output coordinate to an input coordinate for the current filter
    # position: h = out * stride - padding + r * dilation. Element 0 is passed
    # through unchanged.
    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def npq_to_nhw(self):
        code = pccm.FunctionCode()
        code.arg("npq_offset", "const int*")
        for i in range(self.ndim):
            code.raw(f"""
int r_{i} = count_[{i}];
int h_{i} = npq_offset[{i + 1}] * problem_.stride[{i}] - problem_.padding[{i}] + r_{i} * problem_.dilation[{i}];
""")
        h0h1h2 = codeops.unpack_str("h", range(self.ndim))
        code.raw(f"""
return {{npq_offset[0], {h0h1h2}}};
""")
        return code.ret(f"tv::array<int, {self.ndim + 1}>")

    # Writes the output coordinate for `nhw_offset` into `npq_offset` and
    # returns whether it is valid: batch index below problem_.N, every spatial
    # coordinate inside [0, output_dims), and every un-strided coordinate an
    # exact multiple of the stride.
    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def query_npq(self):
        code = pccm.FunctionCode()
        code.arg("nhw_offset", "const int*")
        code.arg("npq_offset", f"tv::array<int, {self.ndim + 1}>&")
        code.ret("bool")
        code.raw(f"""
auto npq_no_stride = nhw_to_npq<true>(nhw_offset);
npq_offset[0] = npq_no_stride[0];
""")
        hw_valid = [] # type: List[str]
        stride_valid = [] # type: List[str]
        for i in range(self.ndim):
            # Emit the per-dimension division and collect the C++ validity terms.
            code.raw(f"npq_offset[{i + 1}] = npq_no_stride[{i + 1}] / problem_.stride[{i}];")
            hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
                             f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
            stride_valid.append(f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
        code.raw(f"""
return npq_no_stride[0] < problem_.N &&
{' && '.join(hw_valid)} &&
{' && '.join(stride_valid)};
""")
        return code

    # Same as query_npq but without any stride handling: `npq_offset` is the
    # NoStride result of nhw_to_npq and only the batch and bounds checks apply.
    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def query_npq_no_stride(self):
        code = pccm.FunctionCode()
        code.arg("nhw_offset", "const int*")
        code.arg("npq_offset", f"tv::array<int, {self.ndim + 1}>&")
        code.ret("bool")
        code.raw(f"""
npq_offset = nhw_to_npq<true>(nhw_offset);
""")
        hw_valid = [] # type: List[str]
        for i in range(self.ndim):
            hw_valid.append((f"npq_offset[{i + 1}] >= 0 && "
                             f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
        code.raw(f"""
return npq_offset[0] < problem_.N &&
{' && '.join(hw_valid)};
""")
        return code

    # Writes the input coordinate for `npq_offset` into `nhw_offset` and
    # returns whether the batch index is below problem_.N and every spatial
    # coordinate lies inside [0, input_dims).
    @pccm.cuda.member_function(host=True,
                               device=True,
                               forceinline=True,
                               const=True)
    def query_nhw(self):
        code = pccm.FunctionCode()
        code.arg("npq_offset", "const int*")
        code.arg("nhw_offset", f"tv::array<int, {self.ndim + 1}>&")
        code.ret("bool")
        code.raw(f"""
nhw_offset = npq_to_nhw(npq_offset);
""")
        hw_valid = [] # type: List[str]
        for i in range(self.ndim):
            hw_valid.append((f"nhw_offset[{i + 1}] >= 0 && "
                             f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]"))
        code.raw(f"""
return nhw_offset[0] < problem_.N &&
{' && '.join(hw_valid)};
""")
        return code
class SparseConvIndicesKernel(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
    # Registers dependencies and parameter classes for the index kernels and
    # records the index dtype, which must be int32 or int64.
    super().__init__()
    self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel, ThrustLib)

    # Location iterator built from the same problem; exposed to the
    # generated code under the alias ConvLocIter.
    loc_iter = ConvOutLocIter(problem)
    self.loc_iter = loc_iter
    self.add_param_class("spinds", loc_iter, "ConvLocIter")
    self.add_param_class("spinds", problem, "ConvProblem")
    self.add_param_class("cudakers", CudaCommonKernel())

    self.ndim = problem.ndim
    # The dtype used for de-duplication is the same as the index dtype.
    self.dtype_indices = self.dtype_indices_uniq = dtype_indices
    assert any(dtype_indices == allowed
               for allowed in (dtypes.int32, dtypes.int64))
# Kernel `calc_conv_indices_stage1`. blockIdx.y selects the filter offset.
# For every input index i whose output location (loc_iter.query_npq) is
# valid, a slot is reserved by atomically incrementing
# indice_num_per_loc[filter_offset]. If the slot is below indices_pair_size,
# the kernel writes i into the first half of indice_pairs, and the
# linearised output location into the second half (offset by
# indices_pair_size * RS) and into indice_pairs_for_uniq.
# The counter is incremented even when the slot is out of range, so it can
# end up larger than indices_pair_size.
@pccm.cuda.cuda_global_function
def calc_conv_indices_stage1(self):
    code = pccm.FunctionCode()
    code.arg("loc_iter", f"ConvLocIter") # location iterator, passed by value
    code.arg("indices_in", f"const int*") # [N, ndim + 1]
    code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
    # NOTE(review): indexed as [kernelProd, MaxSize] below -- confirm the real shape.
    code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*")
    code.arg("indice_num_per_loc", f"int*") # [kernelProd]
    code.arg("num_indices_in", "int")
    code.arg("indices_pair_size", "int")
    code.arg("RS", "int")
    # code.arg("bool", "transposed")
    code.raw(f"""
int filter_offset = blockIdx.y;
loc_iter.set_filter_offset(filter_offset);
int indices_pair_size_mul_RS = indices_pair_size * RS;
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
tv::array<int, {self.ndim + 1}> npq_offset;
if (loc_iter.query_npq(indices_in + i * {self.ndim + 1}, npq_offset)){{
int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
{self.dtype_indices} offset = loc_iter.layout_npq(npq_offset);
if (old_num < indices_pair_size){{
indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + old_num] = offset;
indice_pairs_for_uniq[filter_offset_mul_indices_pair_size + old_num] = offset;
}}
}}
}}
""")
    return code
@pccm.cuda.cuda_global_function
def build_conv_hash_table(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_out", f"int*") # [N, ndim + 1]
code.arg("indice_pairs_for_uniq", f"const {self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("layout_npq", f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize]
code.arg("num_indices", "int")
code.raw(f"""
for (int i : tv::KernelLoopX<int>(num_indices)) {{
{self.dtype_indices} index = indice_pairs_for_uniq[i];
layout_npq.inverse(index, indices_out + {self.ndim + 1} * i);
table.insert(index, i);
}}
""")
return code
@pccm.cuda.cuda_global_function
def calc_conv_indices_stage2(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indice_pairs_out_part", f"int*") # [2, kernelProd, MaxSize]
code.arg("num_indices_in", "int")
code.arg("indices_pair_size", "int")
# TODO use block instead of filter_offset?
code.raw(f"""
int filter_offset = blockIdx.y;
auto indice_pairs_out_part_filter = indice_pairs_out_part + filter_offset * indices_pair_size;
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
{self.dtype_indices} index = indice_pairs_out_part_filter[i];
if (index > -1){{
auto ptr = table.lookup_ptr(index);
if (ptr){{
indice_pairs_out_part_filter[i] = ptr->second;
}}
}}
}}
""")
return code
@pccm.cuda.cuda_global_function
def build_subm_conv_hash_table(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("layout_npq", f"spinds::LayoutNPQ")
code.arg("num_indices", "int")
code.raw(f"""
for (int i : tv::KernelLoopX<int>(num_indices)) {{
{self.dtype_indices} index = layout_npq(indices_in + i * {self.ndim + 1});
table.insert(index, i);
}}
""")
return code
@pccm.cuda.cuda_global_function
def clean_indices_uniq(self):
code = pccm.FunctionCode()
code.arg("indice_pairs_for_uniq", f"{self.dtype_indices}*")
code.arg("size", f"{self.dtype_indices}")
code.raw(f"""
for ({self.dtype_indices} i : tv::KernelLoopX<{self.dtype_indices}>(size)) {{
indice_pairs_for_uniq[i] = std::numeric_limits<{self.dtype_indices}>::max();
}}
""")
return code
@pccm.cuda.cuda_global_function
def calc_subm_conv_indices(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_num_per_loc", f"int*") # [kernelProd]
code.arg("num_indices_in", "int")
code.arg("indices_pair_size", "int")
code.arg("RS", "int")
code.raw(f"""
int filter_offset = blockIdx.y;
loc_iter.set_filter_offset(filter_offset);
int indices_pair_size_mul_RS = indices_pair_size * RS;
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
if (filter_offset == (RS / 2)){{
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + i] = i;
}}
}} else {{
for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
tv::array<int, {self.ndim + 1}> npq_offset;
if (loc_iter.query_npq_no_stride(indices_in + i * {self.ndim + 1}, npq_offset)){{
{self.dtype_indices} offset = loc_iter.layout_npq(npq_offset);
auto item = table.lookup(offset); // performance bound
if (!item.empty()){{
int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + old_num] = item.second;
indice_pairs[filter_offset_mul_indices_pair_size_1 + old_num] = item.second;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size_1 + old_num] = i;
}}
}}
}}
}}
""")
return code
@pccm.cuda.cuda_global_function
def calc_subm_conv_indices_mask(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("mask", f"uint32_t*") # [kernelProd]
code.arg("num_indices", "int")
code.arg("indices_pair_size", "int")
code.arg("RS", "int")
code.raw(f"""
int filter_offset = blockIdx.y;
uint32_t filter_mask_out = (1u << (filter_offset));
uint32_t filter_mask_in = (1u << (RS - 1 - filter_offset));
uint32_t filter_mask_center = (1u << (RS / 2));
loc_iter.set_filter_offset(filter_offset);
int indices_pair_size_mul_RS = indices_pair_size * RS;
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
if (filter_offset == (RS / 2)){{
for (int i : tv::KernelLoopX<int>(num_indices)) {{
// atomicOr(mask + i, filter_mask_center);
indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + i] = i;
}}
}} else {{
for (int output_index : tv::KernelLoopX<int>(num_indices)) {{
// find input offset from output offset
tv::array<int, {self.ndim + 1}> nhw_offset;
// table: input indice coord to output index (or output indice coord to input index)
if (loc_iter.query_nhw(indices_in + output_index * {self.ndim + 1}, nhw_offset)){{
{self.dtype_indices} offset = loc_iter.layout_npq(nhw_offset);
auto item = table.lookup(offset);
if (!item.empty()) {{
auto input_index = item.second; // we find a input indice idx.
atomicOr(mask + output_index, filter_mask_out);
atomicOr(mask + input_index, filter_mask_in);
// for this output, we set correct input idx.
indice_pairs[filter_offset_mul_indices_pair_size + output_index] = input_index;
// the output in "input location" connect this output idx in another location.
indice_pairs[filter_offset_mul_indices_pair_size_1 + input_index] = output_index;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size + input_index] = output_index;
indice_pairs[indices_pair_size_mul_RS + filter_offset_mul_indices_pair_size_1 + output_index] = input_index;
}}
}}
}}
}}
""")
return code
@pccm.cuda.cuda_global_function
def calc_subm_conv_indices_split_mask(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs", f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("mask1", f"uint32_t*") # [kernelProd]
code.arg("mask2", f"uint32_t*") # [kernelProd]
code.arg("num_indices", "int")
code.arg("indices_pair_size", "int")
code.arg("RS", "int")
code.raw(f"""
int filter_offset = blockIdx.y;
uint32_t filter_mask_out = (1u << (filter_offset));
uint32_t filter_mask_in = (1u << (RS - 1 - filter_offset));
uint32_t filter_mask_center = (1u << (RS / 2));
loc_iter.set_filter_offset(filter_offset);
auto indice_ptr_inv = indice_pairs + indices_pair_size * RS;
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
int filter_offset_mul_indices_pair_size_1 = (RS - 1 - filter_offset) * indices_pair_size;
if (filter_offset == (RS / 2)){{
for (int i : tv::KernelLoopX<int>(num_indices)) {{
indice_pairs[filter_offset_mul_indices_pair_size + i] = i;
indice_ptr_inv[filter_offset_mul_indices_pair_size + i] = i;
}}
}} else {{
for (int output_index : tv::KernelLoopX<int>(num_indices)) {{
// find input offset from output offset
tv::array<int, {self.ndim + 1}> nhw_offset;
// table: input indice coord to output index (or output indice coord to input index)
if (loc_iter.query_nhw(indices_in + output_index * {self.ndim + 1}, nhw_offset)){{
{self.dtype_indices} offset = loc_iter.layout_npq(nhw_offset);
auto item = table.lookup(offset);
if (!item.empty()) {{
auto input_index = item.second; // we find a input indice idx.
atomicOr(mask1 + output_index, filter_mask_out);
atomicOr(mask2 + input_index, filter_mask_in);
// for this output, we set correct input idx.
indice_pairs[filter_offset_mul_indices_pair_size + output_index] = input_index;
// the output in "input location" connect this output idx in another location.
indice_pairs[filter_offset_mul_indices_pair_size_1 + input_index] = output_index;
indice_ptr_inv[filter_offset_mul_indices_pair_size + input_index] = output_index;
indice_ptr_inv[filter_offset_mul_indices_pair_size_1 + output_index] = input_index;
}}
}}
}}
}}
""")
return code
@pccm.cuda.static_function
def generate_conv_inds(self):
code = pccm.FunctionCode()
code.arg("indices, hashdata", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, out_inds, indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.raw(f"""
// TODO stream
// TODO handle num input == 0
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
// out_inds: [MaxSize, {self.ndim + 1}]
auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
int64_t expected_out_size = indices.dim(0) * kv;
TV_ASSERT_RT_ERR(out_inds.dim(0) == expected_out_size && out_inds.dim(1) == {self.ndim + 1}, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0));
// tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch launcher_clean_uniq(uniq_size);
launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), uniq_size);
tv::ssprint("clean time", timer.report() / 1000.0);
launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv);
tv::ssprint("calc_conv_indices_stage1 time", timer.report() / 1000.0, uniq_size);
thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
auto thrust_ctx = thrust::cuda::par.on(0);
thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto num_out_act = new_end - ptr_tr - 1;
tv::ssprint("unique time", num_out_act, timer.report() / 1000.0);
// return num_out_act;
// TODO handle invalid num_out_act
indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
tv::cuda::Launch lanucher_build_hash(num_out_act);
using V = {self.dtype_indices};
using KeyType = {self.dtype_indices};
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
hash.clear();
tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
lanucher_build_hash(build_conv_hash_table<table_t>, hash, out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const {self.dtype_indices}>(),
loc_iter.layout_npq, num_out_act);
tv::ssprint("build_hash time", num_out_act, timer.report() / 1000.0);
launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash, indice_pairs[1].data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2));
tv::ssprint("gem conv inds time", timer.report() / 1000.0);
return num_out_act;
""")
return code.ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage1(self):
code = pccm.FunctionCode()
code.arg("indices", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
// TODO stream
// TODO handle num input == 0
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= uniq_size, "error");
TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
int64_t expected_out_size = indices.dim(0) * kv;
tv::cuda::Launch launcher_num_act_in(indices.dim(0), reinterpret_cast<cudaStream_t>(stream_int));
// tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch launcher_clean_uniq(uniq_size, reinterpret_cast<cudaStream_t>(stream_int));
launcher_clean_uniq(clean_indices_uniq, indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), uniq_size);
launcher_num_act_in(calc_conv_indices_stage1, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<{self.dtype_indices}>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv);
thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
auto num_out_act = new_end - ptr_tr - 1;
return num_out_act;
""")
return code.ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage2(self):
code = pccm.FunctionCode()
code.arg("indices, hashdata", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, out_inds", "tv::Tensor")
code.arg("num_out_act", "int")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, stride, padding, dilation", f"tv::array<int, {self.ndim}>")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream_int);
// TODO stream
// TODO handle num input == 0
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
// out_inds: [MaxSize, {self.ndim + 1}]
auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) == uniq_size, "error");
TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
// TODO handle invalid num_out_act
indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
tv::cuda::Launch lanucher_build_hash(num_out_act, custream);
using V = {self.dtype_indices};
using KeyType = {self.dtype_indices};
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
TV_ASSERT_RT_ERR(hashdata.dim(0) >= num_out_act, "hash size not enough");
table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
hash.clear(custream);
lanucher_build_hash(build_conv_hash_table<table_t>, hash,
out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const {self.dtype_indices}>(),
loc_iter.layout_npq, num_out_act);
launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash,
indice_pairs[1].data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2));
return num_out_act;
""")
return code.ret("int")
@pccm.cuda.static_function
def generate_subm_conv_inds(self):
code = pccm.FunctionCode()
code.arg("indices, hashdata", "tv::Tensor")
code.arg("indice_pairs, out_inds, indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int")
code.arg("input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
code.arg("indice_pair_mask", "tv::Tensor", "tv::Tensor()", "cumm.tensorview.Tensor = Tensor()")
code.arg("backward", "bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream_int);
auto ctx = tv::Context();
ctx.set_cuda_stream(custream);
if (!indice_pair_mask.empty()){{
TV_ASSERT_INVALID_ARG(tv::arrayops::prod(ksize) < 32, "for now only support 32bit mask");
}}
// TODO stream
// TODO handle num input == 0
tv::array<int, {self.ndim}> stride, padding;
for (int i = 0; i < {self.ndim}; ++i){{
TV_ASSERT_RT_ERR(ksize[i] % 2 == 1, "subm only support odd ksize");
stride[i] = 1;
padding[i] = (ksize[i] / 2) * dilation[i];
}}
int kv = tv::arrayops::prod(ksize);
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
// indice_pairs: [2, kv, indices.dim(0)]
// out_inds: [MaxSize, {self.ndim + 1}]
// auto timer = tv::CudaContextTimer<>();
TV_ASSERT_RT_ERR(indice_num_per_loc.dim(0) == kv, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
launcher_num_act_in.blocks.y = (kv / 2) + 1;
// launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, input_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch lanucher_build_hash(indices.dim(0), custream);
using V = {self.dtype_indices};
using KeyType = {self.dtype_indices};
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
TV_ASSERT_RT_ERR(hashdata.dim(0) >= indices.dim(0), "hash size not enough");
table_t hash = table_t(hashdata.data_ptr<pair_t>(), hashdata.dim(0));
hash.clear(custream);
// tv::ssprint("clear hash time", hashdata.dim(0), timer.report() / 1000.0);
lanucher_build_hash(build_subm_conv_hash_table<table_t>, hash, indices.data_ptr<const int>(),
loc_iter.layout_npq, indices.dim(0));
// tv::ssprint("build_hash time", timer.report() / 1000.0);
if (!indice_pair_mask.empty()){{
if (indice_pair_mask.ndim() == 2 && indice_pair_mask.dim(0) == 2){{
auto mask_0 = indice_pair_mask[0];
tv::cuda::Launch lanucher_fill(mask_0.size(), custream);
lanucher_fill(cudakers::fill_kernel<int>, mask_0.data_ptr<int>(), (1 << (kv / 2)), mask_0.size());
indice_pair_mask[1].zero_(ctx);
auto kernel = &calc_subm_conv_indices_split_mask<table_t>;
launcher_num_act_in(kernel, loc_iter, hash,
indices.data_ptr<int>(), indice_pairs.data_ptr<int>(),
indice_pair_mask[0].data_ptr<uint32_t>(), indice_pair_mask[1].data_ptr<uint32_t>(),
indices.dim(0), indice_pairs.dim(2), kv);
}}else{{
tv::cuda::Launch lanucher_fill(indice_pair_mask.size(), custream);
lanucher_fill(cudakers::fill_kernel<int>, indice_pair_mask.data_ptr<int>(), (1 << (kv / 2)), indice_pair_mask.size());
TV_ASSERT_RT_ERR(indice_pair_mask.ndim() == 1, "error");
launcher_num_act_in(calc_subm_conv_indices_mask<table_t>, loc_iter, hash,
indices.data_ptr<int>(), indice_pairs.data_ptr<int>(),
indice_pair_mask.data_ptr<uint32_t>(), indices.dim(0), indice_pairs.dim(2), kv);
}}
}}else{{
launcher_num_act_in(calc_subm_conv_indices<table_t>, loc_iter, hash, indices.data_ptr<int>(),
indice_pairs.data_ptr<int>(),
indice_num_per_loc.data_ptr<int>(), indices.dim(0), indice_pairs.dim(2), kv);
}}
// tv::ssprint("gem subm conv inds time", timer.report() / 1000.0);
return indices.dim(0);
""")
return code.ret("int")
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from cumm.conv.bases import ConvEnum
from cumm.gemm.core.metaarray import MetaArray, seq
from cumm import dtypes
import pccm
from cumm.gemm.layout import TensorGeneric, to_stride
from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic
from cumm.gemm import codeops
from typing import List
from cumm.conv.params import ConvProblem
import numpy as np
class IndiceMaxPool(pccm.Class):
    """Generate CUDA max-pool forward/backward code driven by index pairs.

    Each pair ``(in_indices[i], out_indices[i])`` links one input feature row
    to one output feature row.
    """

    # TODO optimize this function
    def __init__(self):
        super().__init__()
        self.add_dependency(TensorViewKernel, TensorView, GemmBasic)

    @pccm.cuda.cuda_global_function
    def forward_kernel(self):
        """Kernel: for every pair, raise each output feature to the input value.

        An output element is overwritten only when the paired input element
        is strictly greater than its current value.
        """
        kernel = pccm.FunctionCode()
        kernel.targ("T")
        for arg_name, arg_type in (
            ("out_features", "T*"),
            ("in_features", "const T*"),
            ("out_indices", "const int*"),
            ("in_indices", "const int*"),
            ("size", "int"),
            ("num_features", "int"),
        ):
            kernel.arg(arg_name, arg_type)
        kernel.raw(f"""
for (int i : tv::KernelLoopY<int>(size)) {{
int in_idx = in_indices[i];
int out_idx = out_indices[i];
auto in_ptr = in_features + in_idx * num_features;
auto out_ptr = out_features + out_idx * num_features;
for (int j : tv::KernelLoopX<int>(num_features)) {{
auto in = in_ptr[j];
auto out = out_ptr[j];
if (in > out){{
out_ptr[j] = in;
}}
}}
}}
""")
        return kernel

    @pccm.cuda.cuda_global_function
    def backward_kernel(self):
        """Kernel: route output gradients to the inputs that equal the output.

        For every pair and feature, ``dout`` is added to ``din`` when the
        input feature value equals the output feature value.
        """
        kernel = pccm.FunctionCode()
        kernel.targ("T")
        for arg_name, arg_type in (
            ("out_features", "const T*"),
            ("in_features", "const T*"),
            ("dout_features", "const T*"),
            ("din_features", "T*"),
            ("out_indices", "const int*"),
            ("in_indices", "const int*"),
            ("size", "int"),
            ("num_features", "int"),
        ):
            kernel.arg(arg_name, arg_type)
        kernel.raw(f"""
for (int i : tv::KernelLoopY<int>(size)) {{
int in_idx_offset = in_indices[i] * num_features;
int out_idx_offset = out_indices[i] * num_features;
auto in_ptr = in_features + in_idx_offset;
auto out_ptr = out_features + out_idx_offset;
auto din_ptr = din_features + in_idx_offset;
auto dout_ptr = dout_features + out_idx_offset;
for (int j : tv::KernelLoopX<int>(num_features)) {{
auto in = in_ptr[j];
auto out = out_ptr[j];
if (in == out){{
din_ptr[j] = din_ptr[j] + dout_ptr[j];
}}
}}
}}
""")
        return kernel

    @pccm.cuda.static_function
    def forward(self):
        """Host function: launch ``forward_kernel`` for the dtype of ``out``.

        The thread block is ``(NumFeatures, 512 / NumFeatures)`` where
        ``NumFeatures`` is the first value in 512, 256, 128, 64, 32, 16 that
        ``out.dim(1)`` is greater than or equal to, falling back to 16.
        """
        host = pccm.FunctionCode()
        for tensor_name in ("out", "in", "out_inds", "in_inds"):
            host.arg(tensor_name, "tv::Tensor")
        host.arg("stream", "std::uintptr_t", "0")
        host.raw(f"""
auto nhot = out_inds.dim(0);
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
constexpr int MaxThreads = 512;
tv::cuda::Launch launcher(1);
bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
// if out.dim(1) > value in list above, run this function.
// if a value is found, other value won't be executed.
constexpr int NumFeatures = TV_DECLTYPE(V)::value;
constexpr int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}});
if (!found){{
constexpr int NumFeatures = 16;
constexpr int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}}
launcher(forward_kernel<T>, out.data_ptr<T>(), in.data_ptr<const T>(),
out_inds.data_ptr<const int>(), in_inds.data_ptr<const int>(), nhot, out.dim(1));
}});
""")
        return host

    @pccm.cuda.static_function
    def backward(self):
        """Host function: launch ``backward_kernel`` for the dtype of ``out``.

        Uses the same launch-shape selection as ``forward``.
        """
        host = pccm.FunctionCode()
        for tensor_name in ("out", "in", "dout", "din", "out_inds", "in_inds"):
            host.arg(tensor_name, "tv::Tensor")
        host.arg("stream", "std::uintptr_t", "0")
        host.raw(f"""
auto nhot = out_inds.dim(0);
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
constexpr int MaxThreads = 512;
tv::cuda::Launch launcher(1);
bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
// if out.dim(1) > value in list above, run this function.
// if a value is found, other value won't be executed.
constexpr int NumFeatures = TV_DECLTYPE(V)::value;
constexpr int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}});
if (!found){{
constexpr int NumFeatures = 16;
constexpr int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), NumFeatures), tv::div_up(nhot, Num0));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}}
launcher(backward_kernel<T>, out.data_ptr<const T>(), in.data_ptr<const T>(),
dout.data_ptr<const T>(), din.data_ptr<T>(),
out_inds.data_ptr<const int>(), in_inds.data_ptr<const int>(), nhot, out.dim(1));
}});
""")
        return host
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
from cumm.gemm.core.metaarray import MetaArray, seq
from cumm import dtypes
import pccm
from cumm.gemm.layout import TensorGeneric, to_stride
from cumm.common import TensorView, TensorViewHashKernel
from cumm.gemm import codeops
from typing import List
from cumm.conv.params import ConvProblem
import numpy as np
class Point2VoxelKernel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
    """CUDA kernels used by the hash-based point-to-voxel conversion.

    This class does not support multi-threaded use; create one p2v instance
    per thread.
    """

    def __init__(self, dtype: dtypes.DType, ndim: int, layout: TensorGeneric, zyx: bool = True):
        """Store generation parameters and register ``layout`` as ``Layout``.

        Args:
            dtype: element type of the point and voxel buffers.
            ndim: number of spatial dimensions of the voxel grid.
            layout: layout object exposed to the generated code as ``Layout``.
            zyx: when true, grid axis ``j`` reads point coordinate
                ``ndim - 1 - j``; otherwise it reads coordinate ``j``.
        """
        super().__init__()
        self.add_dependency(TensorView, TensorViewHashKernel)
        self.add_param_class("layout_ns", layout, "Layout")
        self.dtype = dtype
        self.ndim = ndim
        self.zyx = zyx

    @pccm.cuda.cuda_global_function
    def build_hash_table(self):
        """Kernel: compute each point's linear voxel key and insert it into the table.

        Points that fall outside ``grid_bound`` on any axis get ``-1`` in
        ``points_indice_data`` and are not inserted. The per-axis product
        ``grid_stride[j] * c`` is evaluated in 64-bit so the key cannot
        overflow 32-bit ``int`` arithmetic on large grids.
        """
        code = pccm.FunctionCode()
        code.targ("TTable")
        code.arg("table", "TTable")
        code.arg("points", f"{self.dtype} const*")
        code.arg("points_indice_data", "int64_t *")
        code.arg("point_stride", "int")
        code.arg("vsize", f"tv::array<float, {self.ndim}>")
        code.arg("coors_range", f"tv::array<float, {self.ndim * 2}>")
        code.arg("grid_bound", f"tv::array<int, {self.ndim}>")
        code.arg("grid_stride", f"tv::array<int, {self.ndim}>")
        code.arg("num_points", "int")
        point_xyz = f"{self.ndim - 1} - j"
        if not self.zyx:
            point_xyz = "j"
        # if zyx, the coors_range and grid_bound is zyx too,
        # generated indices is zyx.
        code.raw(f"""
for (int i : tv::KernelLoopX<int>(num_points)){{
bool failed = false;
int c;
int64_t prod = 0;
#pragma unroll
for (int j = 0; j < {self.ndim}; ++j) {{
c = floor((points[i * point_stride + {point_xyz}] - coors_range[j]) /
vsize[j]);
if ((c < 0 || c >= grid_bound[j])) {{
failed = true;
}}
prod += grid_stride[j] * int64_t(c);
}}
if (!failed){{
points_indice_data[i] = prod;
table.insert(prod, i);
}}else{{
points_indice_data[i] = -1;
}}
}}
""")
        return code

    @pccm.cuda.cuda_global_function
    def assign_table(self):
        """Kernel: give every occupied table entry a running voxel number.

        The number comes from an atomic increment of ``count`` and replaces
        the entry's value. Coordinates are written to ``indices`` (via
        ``layout.inverse`` on the key) only while the number is below
        ``max_voxels``.
        """
        code = pccm.FunctionCode()
        code.targ("TTable")
        code.arg("table", "TTable")
        code.arg("indices", "int*")
        code.arg("count", "int*")
        code.arg("layout", "Layout")
        code.arg("max_voxels", "int")
        code.raw(f"""
auto data = table.data();
for (int i : tv::KernelLoopX<int>(table.size())){{
auto &item = data[i];
if (!item.empty()) {{
item.second = tv::cuda::atomicAggInc(count);
if (item.second < max_voxels){{
layout.inverse(item.first, indices + item.second * {self.ndim});
}}
}}
}}
""")
        return code

    @pccm.cuda.cuda_global_function
    def generate_voxel(self):
        """Kernel: copy each in-range point into its voxel's next free slot.

        The voxel number is looked up from the point's key. The slot is
        reserved with ``atomicAdd`` on ``num_per_voxel``; the counter keeps
        growing past ``max_points_per_voxel`` but only slots below that
        limit receive point data.
        """
        code = pccm.FunctionCode()
        code.targ("TTable")
        code.arg("table", "TTable")
        code.arg("points", f"{self.dtype} const*")
        code.arg("points_indice_data", "const int64_t*")
        code.arg("voxels", f"{self.dtype} *")
        code.arg("num_per_voxel", "int *")
        code.arg("point_stride", "int")
        code.arg("max_points_per_voxel", "int")
        code.arg("max_voxels", "int")
        code.arg("vsize", f"tv::array<float, {self.ndim}>")
        code.arg("coors_range", f"tv::array<float, {self.ndim * 2}>")
        code.arg("grid_bound", f"tv::array<int, {self.ndim}>")
        code.arg("grid_stride", f"tv::array<int, {self.ndim}>")
        code.arg("num_points", "int")
        code.raw(f"""
int voxel_stride0 = point_stride * max_points_per_voxel;
for (int i : tv::KernelLoopX<int>(num_points)){{
int64_t prod = points_indice_data[i];
if (prod != -1){{
auto voxel_index_pair = table.lookup(prod);
if (!voxel_index_pair.empty() &&
voxel_index_pair.second < max_voxels) {{
int old = atomicAdd(num_per_voxel + voxel_index_pair.second, 1);
if (old < max_points_per_voxel) {{
for (int j = 0; j < point_stride; ++j) {{
voxels[voxel_index_pair.second * voxel_stride0 + old * point_stride + j] = points[i * point_stride + j];
}}
}}
}}
}}
}}
""")
        return code
class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
    """Declare the members and pybind attributes of the generated class.

    Args:
        dtype: element type of the voxel buffer.
        ndim: number of spatial dimensions of the voxel grid.
        zyx: forwarded to ``Point2VoxelKernel`` and used by the constructor
            code to decide the axis order.
    """
    super().__init__()
    self.add_dependency(TensorView)
    layout = TensorGeneric(ndim, True)
    self.add_param_class("layout_ns", layout, "Layout")
    self.dtype = dtype
    self.ndim = ndim
    self.zyx = zyx
    # The kernel class is only needed by the implementation of
    # point_to_voxel_hash.
    self.add_impl_only_param_class([self.point_to_voxel_hash], "kernel", Point2VoxelKernel(dtype, ndim, layout, zyx))
    # Read-only tensors exposed to Python; the first two carry an explicit
    # Python annotation.
    for annotated_name in ("hashdata", "point_indice_data"):
        self.add_pybind_member(annotated_name, "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
    for plain_name in ("voxels", "indices", "num_per_voxel"):
        self.add_pybind_member(plain_name, "tv::Tensor", readwrite=False)
    # Grid geometry members that are not exposed through pybind.
    self.add_member("vsize", f"tv::array<float, {self.ndim}>")
    self.add_member("coors_range", f"tv::array<float, {self.ndim * 2}>")
    self.add_member("grid_size", f"tv::array<int, {self.ndim}>")
    self.add_member("grid_stride", f"tv::array<int, {self.ndim}>")
@pccm.pybind.mark_prop_getter(prop_name="grid_size")
@pccm.member_function
def get_grid_size(self):
    """Getter for the ``grid_size`` property.

    The generated code copies the ``grid_size`` member element by element
    into a ``std::array<int, ndim>`` and returns it.
    """
    getter = pccm.FunctionCode()
    getter.raw(f"""
std::array<int, {self.ndim}> res;
for (int i = 0; i < {self.ndim}; ++i){{
res[i] = grid_size[i];
}}
return res;
""")
    return getter.ret(f"std::array<int, {self.ndim}>")
@pccm.pybind.mark
@pccm.constructor
def ctor(self):
code = pccm.FunctionCode()
code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
code.arg("num_point_features", f"int")
code.arg("max_num_voxels, max_num_points_per_voxel", f"int")
if self.zyx:
code.raw(f"""
for (int i = 0; i < {self.ndim}; ++i){{
vsize[{self.ndim - 1} - i] = vsize_xyz[i];
coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
}}
""")
else:
code.raw(f"""
for (int i = 0; i < {self.ndim}; ++i){{
vsize[i] = vsize_xyz[i];
coors_range[i] = coors_range_xyz[i];
coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
}}
""")
# if zyx, grid_size is zyx.
code.raw(f"""
int64_t prod = 1;
for (size_t i = 0; i < {self.ndim}; ++i) {{
grid_size[i] =
std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
}}
for (int i = {self.ndim} - 1; i >= 0; --i) {{
grid_stride[i] = prod;
prod *= grid_size[i];
}}
voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, 0);
indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, 0);
num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, 0);
hashdata = tv::zeros({{1}}, tv::custom128, 0);
point_indice_data = tv::zeros({{1}}, tv::int64, 0);
""")
return code
@pccm.pybind.mark
@pccm.cuda.member_function
def point_to_voxel_hash(self):
code = pccm.FunctionCode()
code.arg("points", "tv::Tensor")
code.arg("clear_voxels", "bool", "true")
code.raw(f"""
TV_ASSERT_INVALID_ARG(points.ndim() == 2 && points.dim(1) >= {self.ndim}, "error");
using V = int64_t;
using KeyType = int64_t;
constexpr KeyType kEmptyKey = std::numeric_limits<KeyType>::max();
if (clear_voxels){{
voxels.zero_();
}}
using table_t =
tv::hash::LinearHashTable<KeyType, V, tv::hash::Murmur3Hash<KeyType>,
kEmptyKey, false>;
using pair_t = typename table_t::value_type;
// int64_t expected_hash_data_num = int64_t(tv::hash::align_to_power2(points.dim(0) * 2));
int64_t expected_hash_data_num = points.dim(0) * 2;
if (hashdata.dim(0) < expected_hash_data_num){{
hashdata = tv::zeros({{expected_hash_data_num}}, tv::custom128, 0);
}}
if (point_indice_data.dim(0) < points.dim(0)){{
point_indice_data = tv::zeros({{points.dim(0)}}, tv::int64, 0);
}}
// auto timer = tv::CudaContextTimer<>();
num_per_voxel.zero_();
table_t hash = table_t(hashdata.data_ptr<pair_t>(), expected_hash_data_num);
hash.clear();
// tv::ssprint("clear time", timer.report());
auto launcher = tv::cuda::Launch(points.dim(0));
launcher(kernel::build_hash_table<table_t>, hash, points.data_ptr<const {self.dtype}>(),
point_indice_data.data_ptr<int64_t>(),
points.dim(1), vsize, coors_range, grid_size, grid_stride, points.dim(0));
// tv::ssprint("build_hash_table", timer.report());
auto table_launcher = tv::cuda::Launch(hash.size());
tv::Tensor count = tv::zeros({{1}}, tv::int32, 0);
Layout layout = Layout::from_shape(grid_size);
table_launcher(kernel::assign_table<table_t>, hash, indices.data_ptr<int>(),
count.data_ptr<int>(),
layout, voxels.dim(0));
auto count_cpu = count.cpu();
int count_val = count_cpu.item<int32_t>();
// tv::ssprint("assign_table", timer.report());
launcher(kernel::generate_voxel<table_t>, hash, points.data_ptr<const {self.dtype}>(),
point_indice_data.data_ptr<const int64_t>(), voxels.data_ptr<{self.dtype}>(),
num_per_voxel.data_ptr<int>(), points.dim(1), voxels.dim(1),
voxels.dim(0), vsize, coors_range,
grid_size, grid_stride, points.dim(0));
// tv::ssprint("generate_voxel", timer.report());
return std::make_tuple(voxels.slice_first_axis(0, count_val),
indices.slice_first_axis(0, count_val),
num_per_voxel.slice_first_axis(0, count_val));
""")
return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")
class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
    """pccm generator for the CPU point-to-voxel converter bound to Python.

    The generated C++ class keeps a dense ``densehashdata`` grid that maps a
    voxel coordinate to its slot in ``voxels`` (``-1`` meaning unused),
    together with the ``voxels``, ``indices`` and ``num_per_voxel`` output
    buffers and a private ``mean_per_voxel`` scratch tensor.

    Args:
        dtype: element type of the voxel feature buffers.
        ndim: number of spatial dimensions.
        zyx: when True the constructor stores voxel size and range in
            reversed axis order relative to the ``*_xyz`` arguments, and
            point coordinates are read in reversed order as well.
    """

    def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
        super().__init__()
        self.add_dependency(TensorView)
        layout = TensorGeneric(ndim, True)
        self.add_param_class("layout_ns", layout, "Layout")
        self.dtype = dtype
        self.ndim = ndim
        self.zyx = zyx

        self.add_pybind_member("densehashdata", "tv::Tensor", readwrite=False, pyanno="cumm.tensorview.Tensor")
        self.add_pybind_member("voxels", "tv::Tensor", readwrite=False)
        self.add_pybind_member("indices", "tv::Tensor", readwrite=False)
        self.add_pybind_member("num_per_voxel", "tv::Tensor", readwrite=False)

        self.add_member("mean_per_voxel", "tv::Tensor")
        self.add_member("vsize", f"tv::array<float, {self.ndim}>")
        self.add_member("coors_range", f"tv::array<float, {self.ndim * 2}>")
        self.add_member("grid_size", f"tv::array<int, {self.ndim}>")
        self.add_member("grid_stride", f"tv::array<int, {self.ndim}>")

    @pccm.pybind.mark_prop_getter(prop_name="grid_size")
    @pccm.member_function
    def get_grid_size(self):
        """Emit the ``grid_size`` property getter (copies into a std::array)."""
        code = pccm.FunctionCode()
        code.raw(f"""
        std::array<int, {self.ndim}> res;
        for (int i = 0; i < {self.ndim}; ++i){{
            res[i] = grid_size[i];
        }}
        return res;
        """)
        return code.ret(f"std::array<int, {self.ndim}>")

    @pccm.pybind.mark
    @pccm.constructor
    def ctor(self):
        """Emit the constructor: store geometry and allocate the buffers.

        Besides the voxel/index/count/mean tensors, the generated code
        allocates ``densehashdata`` with the grid shape and fills every
        entry with ``-1`` so that all cells start out unassigned.
        """
        code = pccm.FunctionCode()
        code.arg("vsize_xyz", f"std::array<float, {self.ndim}>")
        code.arg("coors_range_xyz", f"std::array<float, {self.ndim * 2}>")
        code.arg("num_point_features", "int")
        code.arg("max_num_voxels, max_num_points_per_voxel", "int")
        if self.zyx:
            code.raw(f"""
            for (int i = 0; i < {self.ndim}; ++i){{
                vsize[{self.ndim - 1} - i] = vsize_xyz[i];
                coors_range[{self.ndim - 1} - i] = coors_range_xyz[i];
                coors_range[{2 * self.ndim - 1} - i] = coors_range_xyz[i + {self.ndim}];
            }}
            """)
        else:
            code.raw(f"""
            for (int i = 0; i < {self.ndim}; ++i){{
                vsize[i] = vsize_xyz[i];
                coors_range[i] = coors_range_xyz[i];
                coors_range[i + {self.ndim}] = coors_range_xyz[i + {self.ndim}];
            }}
            """)
        code.raw(f"""
        int64_t prod = 1;
        for (size_t i = 0; i < {self.ndim}; ++i) {{
            grid_size[i] =
                std::round((coors_range[{self.ndim} + i] - coors_range[i]) / vsize[i]);
        }}
        for (int i = {self.ndim} - 1; i >= 0; --i) {{
            grid_stride[i] = prod;
            prod *= grid_size[i];
        }}
        voxels = tv::zeros({{max_num_voxels, max_num_points_per_voxel, num_point_features}}, tv::type_v<{self.dtype}>, -1);
        indices = tv::zeros({{max_num_voxels, {self.ndim}}}, tv::int32, -1);
        num_per_voxel = tv::zeros({{max_num_voxels}}, tv::int32, -1);
        mean_per_voxel = tv::zeros({{max_num_voxels, num_point_features}}, tv::DType({self.dtype.tv_dtype}), -1);
        tv::TensorShape grid_shape(grid_size.data(), grid_size.data() + {self.ndim});
        densehashdata = tv::zeros(grid_shape, tv::int32, -1);
        auto densehashdata_ptr = densehashdata.data_ptr<int>();
        for (int i= 0; i < densehashdata.size(); ++i){{
            densehashdata_ptr[i] = -1;
        }}
        """)
        return code

    def point_to_voxel_template(self, mean: bool = False):
        """Build the body shared by ``point_to_voxel`` and its mean variant.

        The generated function walks the points once. Points whose voxel
        coordinate falls outside ``grid_size`` are skipped; a new voxel slot
        is claimed the first time a cell is seen (until ``max_num_voxels``
        is reached); each voxel stores at most ``max_num_points_per_voxel``
        points. Afterwards the touched ``densehashdata`` cells are reset to
        ``-1``.

        Args:
            mean: when True, a running per-voxel mean is maintained and the
                unused point slots of every produced voxel are filled with
                that mean.

        Returns:
            The ``pccm.FunctionCode`` with its return type set to a tuple of
            three tensors (voxels, indices, num_per_voxel slices).
        """
        code = pccm.FunctionCode()
        code.arg("points", "tv::Tensor")
        code.arg("clear_voxels", "bool", "true")
        # Index of the point column to read for grid axis ``j``.
        point_xyz = f"{self.ndim - 1} - j" if self.zyx else "j"
        # Validate the rank before any ``points.dim(1)`` access, mirroring
        # the check done by the CUDA implementation.
        code.raw(f"""
        TV_ASSERT_INVALID_ARG(points.ndim() == 2 && points.dim(1) >= {self.ndim}, "error");
        auto max_num_voxels = voxels.dim(0);
        auto max_num_points_per_voxel = voxels.dim(1);
        num_per_voxel.zero_();
        if (clear_voxels){{
            voxels.zero_();
        }}
        """)
        if mean:
            code.raw("mean_per_voxel.zero_();")
        # The view is declared in both variants; only the mean variant
        # needs the buffer cleared first.
        code.raw(f"auto means_rw = mean_per_voxel.tview<{self.dtype}, 2>();")
        code.raw(f"""
        int res_voxel_num = 0;
        int num_features = points.dim(1);
        auto N = points.dim(0);
        int c;
        TV_ASSERT_RT_ERR(num_features == voxels.dim(2), "your points num features doesn't equal to voxel.");
        constexpr bool kUseMean = {pccm.boolean(mean)};
        tv::dispatch<float, double>(points.dtype(), [&](auto I){{
            using T = decltype(I);
            auto points_rw = points.tview<T, 2>();
            auto coors_rw = indices.tview<int, 2>();
            auto voxels_rw = voxels.tview<{self.dtype}, 3>();
            auto num_points_per_voxel_rw = num_per_voxel.tview<int, 1>();
            int coor[{self.ndim}];
            auto coor_to_voxelidx_rw = densehashdata.tview<int, {self.ndim}>();
            int voxelidx, num;
            bool failed;
            int voxel_num = 0;
            for (int i = 0; i < N; ++i) {{
                failed = false;
                for (int j = 0; j < {self.ndim}; ++j) {{
                    c = floor((points_rw(i, {point_xyz}) - coors_range[j]) / vsize[j]);
                    if ((c < 0 || c >= grid_size[j])) {{
                        failed = true;
                        break;
                    }}
                    coor[j] = c;
                }}
                if (failed)
                    continue;
                voxelidx = coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))});
                if (voxelidx == -1) {{
                    voxelidx = voxel_num;
                    if (voxel_num >= max_num_voxels)
                        continue;
                    voxel_num += 1;
                    coor_to_voxelidx_rw({codeops.unpack("coor", range(self.ndim))}) = voxelidx;
                    for (int k = 0; k < {self.ndim}; ++k) {{
                        coors_rw(voxelidx, k) = coor[k];
                    }}
                }}
                num = num_points_per_voxel_rw(voxelidx);
                if (num < max_num_points_per_voxel) {{
                    // voxel_point_mask_rw(voxelidx, num) = {self.dtype}(1);
                    for (int k = 0; k < num_features; ++k) {{
                        voxels_rw(voxelidx, num, k) = points_rw(i, k);
                    }}
                    num_points_per_voxel_rw(voxelidx) += 1;
                    if TV_IF_CONSTEXPR (kUseMean){{
                        for (int k = 0; k < num_features; ++k) {{
                            means_rw(voxelidx, k) +=
                                (points_rw(i, k) - means_rw(voxelidx, k)) / {self.dtype}(num + 1);
                        }}
                    }}
                }}
            }}
            for (int i = 0; i < voxel_num; ++i) {{
                coor_to_voxelidx_rw({codeops.unpack("coors_rw", range(self.ndim), left="(i, ", right=")")}) = -1;
                if TV_IF_CONSTEXPR (kUseMean){{
                    num = num_points_per_voxel_rw(i);
                    for (int j = num; j < max_num_points_per_voxel; ++j) {{
                        for (int k = 0; k < num_features; ++k) {{
                            voxels_rw(i, j, k) = means_rw(i, k);
                        }}
                    }}
                }}
            }}
            res_voxel_num = voxel_num;
        }});
        return std::make_tuple(voxels.slice_first_axis(0, res_voxel_num),
                               indices.slice_first_axis(0, res_voxel_num),
                               num_per_voxel.slice_first_axis(0, res_voxel_num));
        """)
        return code.ret("std::tuple<tv::Tensor, tv::Tensor, tv::Tensor>")

    @pccm.pybind.mark
    @pccm.member_function
    def point_to_voxel(self):
        """Emit ``point_to_voxel``: the template without mean filling."""
        return self.point_to_voxel_template(False)

    @pccm.pybind.mark
    @pccm.member_function
    def point_to_voxel_empty_mean(self):
        """Emit ``point_to_voxel_empty_mean``: empty slots get the voxel mean."""
        return self.point_to_voxel_template(True)
# Copyright 2019-2020 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
import torch
import spconv
class ConvAlgo(Enum):
    """Integer identifiers for the available sparse convolution algorithms.

    The ``.value`` of a member is what the wrappers in this module pass as
    the ``algo`` argument of the native ops (see the ``algo`` defaults of
    ``indice_conv`` and ``indice_conv_backward``).
    """
    Native = 0  # small memory cost, faster when number of points is large.
    Batch = 1  # high memory cost, faster when number of points is small (< 50000)
    BatchGemmGather = 2  # high memory cost, faster when number of points medium
    SparseConvNet = 3
    Minkowski = 4  # https://github.com/StanfordVL/MinkowskiEngine/blob/master/src/convolution.cu
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    """Return the per-axis output extent of a (non-transposed) convolution.

    Every argument is a sequence indexed by axis. An axis whose kernel size
    is ``-1`` always yields an output extent of ``1``; otherwise the usual
    ``(in + 2*pad - dilation*(k - 1) - 1) // stride + 1`` formula is used.
    """

    def _axis_extent(axis):
        # The extent is evaluated unconditionally, exactly like the loop
        # form, and only then overridden for the ``-1`` kernel marker.
        extent = (input_size[axis] + 2 * padding[axis] - dilation[axis] *
                  (kernel_size[axis] - 1) - 1) // stride[axis] + 1
        return 1 if kernel_size[axis] == -1 else extent

    return [_axis_extent(axis) for axis in range(len(input_size))]
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
                           output_padding):
    """Return the per-axis output extent of a transposed convolution.

    Every argument is a sequence indexed by axis. ``dilation`` is accepted
    but does not enter the computation.

    Raises:
        ValueError: if any axis has a kernel size of ``-1``.
    """
    result = []
    for axis in range(len(input_size)):
        kernel = kernel_size[axis]
        if kernel == -1:
            raise ValueError("deconv don't support kernel_size < 0")
        extent = ((input_size[axis] - 1) * stride[axis] - 2 * padding[axis]
                  + kernel + output_padding[axis])
        result.append(extent)
    return result
def get_indice_pairs(indices,
                     batch_size,
                     spatial_shape,
                     ksize=3,
                     stride=1,
                     padding=0,
                     dilation=1,
                     out_padding=0,
                     subm=False,
                     transpose=False,
                     grid=None,
                     use_hash=False):
    """Normalise the convolution geometry and call the native pair builder.

    Scalar ``ksize``/``stride``/``padding``/``dilation``/``out_padding``
    values are broadcast to one entry per spatial axis, where the number of
    axes is ``indices.shape[1] - 1``. The output shape equals
    ``spatial_shape`` for submanifold convolutions and is otherwise derived
    with the (de)convolution size helpers. The result of
    ``torch.ops.spconv.get_indice_pairs`` is returned unchanged.
    """
    ndim = indices.shape[1] - 1

    def _per_axis(value):
        # Lists and tuples are taken as already being per-axis.
        if isinstance(value, (list, tuple)):
            return value
        return [value] * ndim

    ksize = _per_axis(ksize)
    stride = _per_axis(stride)
    padding = _per_axis(padding)
    dilation = _per_axis(dilation)
    out_padding = _per_axis(out_padding)

    # Stride and dilation may not both differ from 1 on the same axis.
    for d, s in zip(dilation, stride):
        assert any([s == 1, d == 1]), "don't support this."

    if subm:
        out_shape = spatial_shape
    elif transpose:
        out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                           padding, dilation, out_padding)
    else:
        out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                         padding, dilation)

    if grid is None:
        grid = torch.Tensor()

    native_op = torch.ops.spconv.get_indice_pairs
    return native_op(indices, grid, batch_size, out_shape, spatial_shape,
                     ksize, stride, padding, dilation, out_padding,
                     int(subm), int(transpose), int(use_hash))
def indice_conv(features,
                filters,
                indice_pairs,
                indice_pair_num,
                num_activate_out,
                inverse=False,
                subm=False,
                algo=ConvAlgo.Native.value):
    """Forward to ``torch.ops.spconv.indice_conv``.

    ``inverse`` and ``subm`` are converted to ``int`` before the call; all
    other arguments are passed through as given.
    """
    native_op = torch.ops.spconv.indice_conv
    inverse_flag = int(inverse)
    subm_flag = int(subm)
    return native_op(features, filters, indice_pairs, indice_pair_num,
                     num_activate_out, inverse_flag, subm_flag, algo)
def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
                      num_activate_out, inverse, subm):
    """Forward to ``torch.ops.spconv.fused_indice_conv_bn``.

    ``inverse`` and ``subm`` are converted to ``int`` before the call.
    """
    native_op = torch.ops.spconv.fused_indice_conv_bn
    inverse_flag = int(inverse)
    subm_flag = int(subm)
    return native_op(features, filters, bias, indice_pairs, indice_pair_num,
                     num_activate_out, inverse_flag, subm_flag)
def indice_conv_backward(features,
                         filters,
                         out_bp,
                         indice_pairs,
                         indice_pair_num,
                         inverse=False,
                         subm=False,
                         algo=ConvAlgo.Native.value):
    """Forward to ``torch.ops.spconv.indice_conv_backward``.

    ``inverse`` and ``subm`` are converted to ``int`` before the call; all
    other arguments are passed through as given.
    """
    native_op = torch.ops.spconv.indice_conv_backward
    inverse_flag = int(inverse)
    subm_flag = int(subm)
    return native_op(features, filters, out_bp, indice_pairs,
                     indice_pair_num, inverse_flag, subm_flag, algo)
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    """Forward all arguments to ``torch.ops.spconv.indice_maxpool``."""
    native_op = torch.ops.spconv.indice_maxpool
    pooled = native_op(features, indice_pairs, indice_pair_num,
                       num_activate_out)
    return pooled
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
                            indice_pair_num):
    """Forward all arguments to ``torch.ops.spconv.indice_maxpool_backward``."""
    native_op = torch.ops.spconv.indice_maxpool_backward
    grads = native_op(features, out_features, out_bp, indice_pairs,
                      indice_pair_num)
    return grads
def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
    """Forward all arguments to ``torch.ops.spconv.nms`` and return its result."""
    native_op = torch.ops.spconv.nms
    return native_op(boxes, scores, pre_max_size, post_max_size, thresh, eps)
def pillar_scatter(features, coors, shape):
    """Dispatch to the dtype-specific native pillar-scatter op.

    ``torch.float32`` features go to ``pillar_scatter_float`` and
    ``torch.half`` features go to ``pillar_scatter_half``; ``coors`` and
    ``shape`` are passed through unchanged.

    Raises:
        NotImplementedError: if ``features.dtype`` is neither float32 nor
            half. The message names the offending dtype.
    """
    dtype = features.dtype
    if dtype == torch.float32:
        return torch.ops.spconv.pillar_scatter_float(features, coors, shape)
    if dtype == torch.half:
        return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
    # Tell the caller which dtype was rejected instead of raising bare.
    raise NotImplementedError(
        f"pillar_scatter only supports float32 and float16 features, "
        f"got {dtype}")
import platform
from pathlib import Path
import numpy as np
import torch
from spconv.pytorch import ops
from spconv.pytorch.conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
SparseConvTranspose3d, SparseInverseConv2d,
SparseInverseConv3d, SubMConv2d, SubMConv3d)
from spconv.pytorch.core import SparseConvTensor
from spconv.pytorch.identity import Identity
from spconv.pytorch.modules import SparseModule, SparseSequential
from spconv.pytorch.ops import ConvAlgo
from spconv.pytorch.pool import SparseMaxPool2d, SparseMaxPool3d
from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable
class ToDense(SparseModule):
    """Module that converts a SparseConvTensor to an NCHW dense tensor."""

    def forward(self, x: SparseConvTensor):
        """Return the result of ``x.dense()``."""
        dense_output = x.dense()
        return dense_output
class RemoveGrid(SparseModule):
    """Module that drops the pre-allocated grid buffer of its input."""

    def forward(self, x: SparseConvTensor):
        """Set ``x.grid`` to ``None`` in place and return the same ``x``."""
        setattr(x, "grid", None)
        return x
# Copyright 2019-2020 Yan Yan
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -21,12 +21,13 @@ from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import spconv
import spconv.functional as Fsp
from spconv import ops
from spconv.core import IndiceData, SparseConvTensor
from spconv.modules import SparseModule
from spconv import pytorch as spconv
from spconv.algo import ConvAlgo
import spconv.pytorch.functional as Fsp
from spconv.pytorch import ops
from spconv.pytorch.core import IndiceData, SparseConvTensor
from spconv.pytorch.modules import SparseModule
from spconv.constants import FILTER_HWIO
def _calculate_fan_in_and_fan_out_hwio(tensor):
dimensions = tensor.ndimension()
......@@ -39,8 +40,8 @@ def _calculate_fan_in_and_fan_out_hwio(tensor):
fan_in = tensor.size(-2)
fan_out = tensor.size(-1)
else:
num_input_fmaps = tensor.size(-2)
num_output_fmaps = tensor.size(-1)
num_input_fmaps = tensor.size(-1)
num_output_fmaps = tensor.size(-2)
receptive_field_size = 1
if tensor.dim() > 2:
receptive_field_size = tensor[..., 0, 0].numel()
......@@ -72,7 +73,6 @@ class SparseConvolution(SparseModule):
inverse=False,
indice_key=None,
fused_bn=False,
use_hash=False,
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvolution, self).__init__(name=name)
......@@ -106,20 +106,25 @@ class SparseConvolution(SparseModule):
self.subm = subm
self.indice_key = indice_key
self.fused_bn = fused_bn
self.use_hash = use_hash
self.algo = algo.value
self.algo = algo
if FILTER_HWIO:
self.weight = Parameter(
torch.Tensor(*kernel_size, in_channels, out_channels))
else:
self.weight = Parameter(
torch.Tensor(*kernel_size, out_channels, in_channels))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
self.register_parameter('bias', None)
# self.workspace_for_splitk = torch.zeros((GLOBAL_MAXIMUM_SPLITK,), dtype=torch.int8)
# self.register_buffer("workspace_for_splitk", self.workspace_for_splitk)
self.reset_parameters()
def reset_parameters(self):
n = self.in_channels
init.kaiming_uniform_(self.weight, a=math.sqrt(5))
# init.uniform_(self.weight, 0, 0.001)
init.kaiming_uniform_(self.weight, a=math.sqrt(0.005))
if self.bias is not None:
fan_in, _ = _calculate_fan_in_and_fan_out_hwio(self.weight)
bound = 1 / math.sqrt(fan_in)
......@@ -171,9 +176,15 @@ class SparseConvolution(SparseModule):
}
}
if self.conv1x1:
if FILTER_HWIO:
features = torch.mm(
input.features,
self.weight.view(self.out_channels, self.in_channels).T)
else:
features = torch.mm(
input.features,
self.weight.view(self.in_channels, self.out_channels))
self.weight.view(self.in_channels, self.out_channels).T)
if self.bias is not None:
features += self.bias
out_tensor.features = features
......@@ -201,15 +212,14 @@ class SparseConvolution(SparseModule):
indices,
batch_size,
spatial_shape,
self.algo,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
self.output_padding,
self.subm,
self.transposed,
grid=input.grid,
use_hash=self.use_hash)
self.transposed)
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
......@@ -264,6 +274,32 @@ class SparseConvolution(SparseModule):
out_tensor.spatial_shape = out_spatial_shape
return out_tensor
class SparseConv1d(SparseConvolution):
    """``SparseConvolution`` preset whose leading positional argument is 1.

    NOTE(review): the leading argument is presumably the number of spatial
    dimensions; the base-class signature is not fully visible here.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        # Positional order is kept exactly as the base class expects it.
        super().__init__(
            1,
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            dilation,
            groups,
            bias,
            indice_key=indice_key,
            algo=algo,
            name=name,
        )
class SparseConv2d(SparseConvolution):
def __init__(self,
......@@ -276,7 +312,6 @@ class SparseConv2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv2d, self).__init__(2,
......@@ -289,7 +324,6 @@ class SparseConv2d(SparseConvolution):
groups,
bias,
indice_key=indice_key,
use_hash=use_hash,
algo=algo,
name=name)
......@@ -305,7 +339,6 @@ class SparseConv3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv3d, self).__init__(3,
......@@ -318,7 +351,6 @@ class SparseConv3d(SparseConvolution):
groups,
bias,
indice_key=indice_key,
use_hash=use_hash,
algo=algo,
name=name)
......@@ -334,7 +366,6 @@ class SparseConv4d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConv4d, self).__init__(4,
......@@ -347,7 +378,6 @@ class SparseConv4d(SparseConvolution):
groups,
bias,
indice_key=indice_key,
use_hash=use_hash,
algo=algo,
name=name)
......@@ -363,7 +393,6 @@ class SparseConvTranspose2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose2d, self).__init__(2,
......@@ -377,7 +406,6 @@ class SparseConvTranspose2d(SparseConvolution):
bias,
transposed=True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo,
name=name)
......@@ -393,7 +421,6 @@ class SparseConvTranspose3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native,
name=None):
super(SparseConvTranspose3d, self).__init__(3,
......@@ -407,7 +434,25 @@ class SparseConvTranspose3d(SparseConvolution):
bias,
transposed=True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo,
name=name)
class SparseInverseConv1d(SparseConvolution):
    """``SparseConvolution`` preset with leading argument 1 and ``inverse=True``.

    Unlike the forward convolution presets, ``indice_key`` is a required
    argument here.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 indice_key,
                 bias=True,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super().__init__(
            1,
            in_channels,
            out_channels,
            kernel_size,
            bias=bias,
            inverse=True,
            indice_key=indice_key,
            algo=algo,
            name=name,
        )
......@@ -451,6 +496,52 @@ class SparseInverseConv3d(SparseConvolution):
algo=algo,
name=name)
class SparseInverseConv4d(SparseConvolution):
    """``SparseConvolution`` preset with leading argument 4 and ``inverse=True``.

    Unlike the forward convolution presets, ``indice_key`` is a required
    argument here.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 indice_key,
                 bias=True,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        super().__init__(
            4,
            in_channels,
            out_channels,
            kernel_size,
            bias=bias,
            inverse=True,
            indice_key=indice_key,
            algo=algo,
            name=name,
        )
class SubMConv1d(SparseConvolution):
    """``SparseConvolution`` preset with leading argument 1 and a trailing ``True``.

    NOTE(review): the positional ``True`` passed after ``bias`` is
    presumably the base class's ``subm`` flag; confirm against the
    ``SparseConvolution`` signature.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 indice_key=None,
                 algo=ops.ConvAlgo.Native,
                 name=None):
        # Positional order is kept exactly as the base class expects it.
        super().__init__(
            1,
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            dilation,
            groups,
            bias,
            True,
            indice_key=indice_key,
            algo=algo,
            name=name,
        )
class SubMConv2d(SparseConvolution):
def __init__(self,
......@@ -463,7 +554,6 @@ class SubMConv2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv2d, self).__init__(2,
......@@ -477,7 +567,6 @@ class SubMConv2d(SparseConvolution):
bias,
True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo,
name=name)
......@@ -493,7 +582,6 @@ class SubMConv3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv3d, self).__init__(3,
......@@ -507,7 +595,6 @@ class SubMConv3d(SparseConvolution):
bias,
True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo,
name=name)
......@@ -523,7 +610,6 @@ class SubMConv4d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False,
algo=ops.ConvAlgo.Native,
name=None):
super(SubMConv4d, self).__init__(4,
......@@ -537,6 +623,5 @@ class SubMConv4d(SparseConvolution):
bias,
True,
indice_key=indice_key,
use_hash=use_hash,
algo=algo,
name=name)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment