Commit 21bb00ae authored by Yan Yan's avatar Yan Yan
Browse files

still working on c++ only

parent 899008fa
<!--
Copyright 2022 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
TODO
\ No newline at end of file
...@@ -175,7 +175,7 @@ if disable_jit is not None and disable_jit == "1": ...@@ -175,7 +175,7 @@ if disable_jit is not None and disable_jit == "1":
std = "c++14" std = "c++14"
else: else:
std = "c++17" std = "c++17"
if CUMM_CPU_ONLY_BUILD: if not CUMM_CPU_ONLY_BUILD:
gemmtuner = GemmTunerSimple(cu) gemmtuner = GemmTunerSimple(cu)
gemmtuner.namespace = "csrc.sparse.convops.gemmops" gemmtuner.namespace = "csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(convcu) convtuner = ConvTunerSimple(convcu)
......
...@@ -62,8 +62,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable( ...@@ -62,8 +62,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
CompileInfo(), CompileInfo(),
ExternalAllocator(), ExternalAllocator(),
ExternalSpconvMatmul(), ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(), SimpleExternalSpconvMatmul(), # for debug, won't be included in release
] ]
pccm.builder.build_pybind(cus, pccm.builder.build_pybind(cus,
PACKAGE_ROOT / "core_cc", PACKAGE_ROOT / "core_cc",
......
...@@ -64,7 +64,7 @@ SPCONV_DEBUG_CPP_ONLY = project_is_editable(PACKAGE_NAME) ...@@ -64,7 +64,7 @@ SPCONV_DEBUG_CPP_ONLY = project_is_editable(PACKAGE_NAME)
class AllocKeys: class AllocKeys:
Pair = "Pair" PairBwd = "PairBwd"
IndiceNumPerLoc = "IndiceNumPerLoc" IndiceNumPerLoc = "IndiceNumPerLoc"
PairMask = "PairMask" PairMask = "PairMask"
MaskArgSort = "MaskArgSort" MaskArgSort = "MaskArgSort"
...@@ -103,3 +103,5 @@ SPCONV_CPP_INDICE_PAIRS = True ...@@ -103,3 +103,5 @@ SPCONV_CPP_INDICE_PAIRS = True
SPCONV_CPP_INDICE_PAIRS_IGEMM = True SPCONV_CPP_INDICE_PAIRS_IGEMM = True
SPCONV_CPP_GEMM = True SPCONV_CPP_GEMM = True
SPCONV_FX_TRACE_MODE = os.getenv("SPCONV_FX_TRACE_MODE", "0") == "1"
\ No newline at end of file
...@@ -240,6 +240,28 @@ class SpconvOps: ...@@ -240,6 +240,28 @@ class SpconvOps:
""" """
... ...
@staticmethod @staticmethod
def avgpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, count_out: Tensor, stream: int = 0) -> None:
"""
Stub signature for the average-pool forward op; the body here is only a
placeholder (`...`), so the actual implementation is not visible in this stub.

NOTE(review): presumably bound to the avgpool implicit-gemm forward CUDA
code -- confirm against the pybind registration.

Args:
out: tensor argument; nothing is returned, so results are presumably
    written into it in place -- TODO confirm.
inp: tensor argument, presumably the input features -- TODO confirm.
inds: tensor argument, presumably the pooling indices -- TODO confirm.
count_out: tensor argument, presumably receives a per-output count -- TODO confirm.
stream: integer stream handle; defaults to 0.
"""
...
@staticmethod
def avgpool_implicit_gemm_backward(dout: Tensor, dinp: Tensor, inds: Tensor, count_out: Tensor, stream: int = 0) -> None:
"""
Stub signature for the average-pool backward op; the body here is only a
placeholder (`...`), so the actual implementation is not visible in this stub.

NOTE(review): presumably bound to the avgpool implicit-gemm backward CUDA
code -- confirm against the pybind registration.

Args:
dout: tensor argument, presumably the gradient w.r.t. the pooled output -- TODO confirm.
dinp: tensor argument; nothing is returned, so the input gradient is
    presumably written into it in place -- TODO confirm.
inds: tensor argument, presumably the (backward) pooling indices -- TODO confirm.
count_out: tensor argument, presumably the per-output counts produced by
    the forward pass -- TODO confirm.
stream: integer stream handle; defaults to 0.
"""
...
@staticmethod
def maxpool_forward_cpu(out: Tensor, inp: Tensor, out_inds: Tensor, in_inds: Tensor) -> None: def maxpool_forward_cpu(out: Tensor, inp: Tensor, out_inds: Tensor, in_inds: Tensor) -> None:
""" """
Args: Args:
...@@ -280,15 +302,6 @@ class SpconvOps: ...@@ -280,15 +302,6 @@ class SpconvOps:
""" """
... ...
@staticmethod @staticmethod
def sort_1d_by_key(data: Tensor, indices: Tensor = Tensor(), stream: int = 0) -> Tensor:
"""
Args:
data:
indices:
stream:
"""
...
@staticmethod
def sort_1d_by_key_allocator(data: Tensor, alloc_func, indices: Tensor = Tensor(), stream: int = 0) -> Tensor: def sort_1d_by_key_allocator(data: Tensor, alloc_func, indices: Tensor = Tensor(), stream: int = 0) -> Tensor:
""" """
Args: Args:
...@@ -348,6 +361,24 @@ class SpconvOps: ...@@ -348,6 +361,24 @@ class SpconvOps:
""" """
... ...
@staticmethod @staticmethod
def maximum_value_int(data: Tensor, value: int, stream_int: int) -> None:
"""
Stub signature; the body here is only a placeholder (`...`), so the actual
implementation is not visible in this stub.

NOTE(review): the name suggests an element-wise maximum of `data` against
`value` -- confirm against the implementation.

Args:
data: tensor argument; nothing is returned, so it is presumably modified
    in place -- TODO confirm.
value: integer argument.
stream_int: integer stream handle; required (no default value).
"""
...
@staticmethod
def sort_1d_by_key(data: Tensor, indices: Tensor = Tensor(), stream: int = 0) -> Tensor:
"""
Stub signature; the body here is only a placeholder (`...`), so the actual
implementation is not visible in this stub.

Args:
data: tensor argument, presumably the 1-D keys to sort -- TODO confirm.
indices: tensor argument; defaults to a default-constructed `Tensor()`.
    NOTE(review): presumably an optional pre-allocated index buffer -- confirm.
stream: integer stream handle; defaults to 0.

Returns:
A `Tensor`; its exact meaning is not visible here.
"""
...
@staticmethod
def calc_point2voxel_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]: def calc_point2voxel_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
""" """
Args: Args:
...@@ -407,6 +438,18 @@ class SpconvOps: ...@@ -407,6 +438,18 @@ class SpconvOps:
""" """
... ...
@staticmethod @staticmethod
def get_indice_gen_tensors_from_workspace(workspace, kv: int, num_act_in: int, num_act_out_bound: int, subm: bool, use_int64_hash_k: bool) -> Dict[str, Tensor]:
"""
Stub signature; the body here is only a placeholder (`...`), so the actual
implementation is not visible in this stub.

NOTE(review): the name suggests it carves the named tensors used for indice
generation out of a pre-allocated workspace buffer -- confirm.

Args:
workspace: untyped argument, presumably the workspace buffer -- TODO confirm.
kv: integer argument, presumably the kernel volume -- TODO confirm.
num_act_in: integer argument, presumably the number of active inputs -- TODO confirm.
num_act_out_bound: integer argument, presumably an upper bound on active outputs -- TODO confirm.
subm: boolean flag, presumably selects submanifold convolution -- TODO confirm.
use_int64_hash_k: boolean flag, presumably selects 64-bit hash keys -- TODO confirm.

Returns:
A dict mapping string names to `Tensor` objects.
"""
...
@staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]: def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]:
""" """
Args: Args:
...@@ -428,7 +471,7 @@ class SpconvOps: ...@@ -428,7 +471,7 @@ class SpconvOps:
""" """
... ...
@staticmethod @staticmethod
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> int: def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1, num_input_act_bound: int = -1) -> int:
""" """
Args: Args:
allocator: allocator:
...@@ -445,5 +488,6 @@ class SpconvOps: ...@@ -445,5 +488,6 @@ class SpconvOps:
transposed: transposed:
stream_int: stream_int:
num_out_act_bound: num_out_act_bound:
num_input_act_bound:
""" """
... ...
...@@ -2,29 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty ...@@ -2,29 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
from pccm.stubs import EnumValue, EnumClassValue from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor from cumm.tensorview import Tensor
class ExternalAllocator: class ExternalAllocator:
def zeros(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: def zeros(self, name: str, shape: List[int], dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor:
""" """
Args: Args:
name: name:
shape: shape:
dtype: dtype:
device: device:
is_temp_memory:
stream: stream:
is_temp_memory:
""" """
... ...
def empty(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: def empty(self, name: str, shape: List[int], dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor:
""" """
Args: Args:
name: name:
shape: shape:
dtype: dtype:
device: device:
is_temp_memory:
stream: stream:
is_temp_memory:
""" """
... ...
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor:
""" """
Args: Args:
name: name:
...@@ -32,11 +32,11 @@ class ExternalAllocator: ...@@ -32,11 +32,11 @@ class ExternalAllocator:
value: value:
dtype: dtype:
device: device:
is_temp_memory:
stream: stream:
is_temp_memory:
""" """
... ...
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor: def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor:
""" """
Args: Args:
name: name:
...@@ -44,8 +44,8 @@ class ExternalAllocator: ...@@ -44,8 +44,8 @@ class ExternalAllocator:
value: value:
dtype: dtype:
device: device:
is_temp_memory:
stream: stream:
is_temp_memory:
""" """
... ...
def get_tensor_by_name(self, name: str) -> Tensor: def get_tensor_by_name(self, name: str) -> Tensor:
......
This diff is collapsed.
...@@ -2,7 +2,8 @@ import pccm ...@@ -2,7 +2,8 @@ import pccm
from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib
from spconv.constants import AllocKeys from spconv.constants import AllocKeys
from cumm.constants import CUMM_CPU_ONLY_BUILD
from .indices import CudaCommonKernel
class ExternalAllocatorGuard(pccm.Class): class ExternalAllocatorGuard(pccm.Class):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
...@@ -53,8 +54,8 @@ class ExternalAllocator(pccm.Class): ...@@ -53,8 +54,8 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>") code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
...@@ -66,8 +67,8 @@ class ExternalAllocator(pccm.Class): ...@@ -66,8 +67,8 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>") code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
...@@ -80,8 +81,8 @@ class ExternalAllocator(pccm.Class): ...@@ -80,8 +81,8 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "int") code.arg("value", "int")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
...@@ -94,8 +95,9 @@ class ExternalAllocator(pccm.Class): ...@@ -94,8 +95,9 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "float") code.arg("value", "float")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True) @pccm.pybind.mark(virtual=True)
...@@ -129,7 +131,7 @@ class ExternalAllocator(pccm.Class): ...@@ -129,7 +131,7 @@ class ExternalAllocator(pccm.Class):
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
// "" means temp memory // "" means temp memory
auto ten = zeros(name, shape, dtype, device, true, stream); auto ten = zeros(name, shape, dtype, device, stream, true);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{ return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten); this->free(ten);
}}); }});
...@@ -145,7 +147,7 @@ class ExternalAllocator(pccm.Class): ...@@ -145,7 +147,7 @@ class ExternalAllocator(pccm.Class):
code.arg("name", "std::string", "\"\"") code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
auto ten = empty(name, shape, dtype, device, true, stream); auto ten = empty(name, shape, dtype, device, stream, true);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{ return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten); this->free(ten);
}}); }});
...@@ -162,7 +164,7 @@ class ExternalAllocator(pccm.Class): ...@@ -162,7 +164,7 @@ class ExternalAllocator(pccm.Class):
code.arg("name", "std::string", "\"\"") code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
auto ten = full_int(name, shape, value, dtype, device, true, stream); auto ten = full_int(name, shape, value, dtype, device, stream, true);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{ return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten); this->free(ten);
}}); }});
...@@ -179,7 +181,7 @@ class ExternalAllocator(pccm.Class): ...@@ -179,7 +181,7 @@ class ExternalAllocator(pccm.Class):
code.arg("name", "std::string", "\"\"") code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.raw(f""" code.raw(f"""
auto ten = full_float(name, shape, value, dtype, device, true, stream); auto ten = full_float(name, shape, value, dtype, device, stream, true);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{ return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
this->free(t); this->free(t);
}}); }});
...@@ -222,8 +224,10 @@ class ThrustAllocator(pccm.Class): ...@@ -222,8 +224,10 @@ class ThrustAllocator(pccm.Class):
""") """)
return code return code
class StaticAllocator(ExternalAllocator): class StaticAllocator(ExternalAllocator):
"""a simple allocator for tensorrt plugin. """a static allocator for tensorrt plugin.
""" """
def __init__(self): def __init__(self):
super().__init__() super().__init__()
...@@ -232,6 +236,7 @@ class StaticAllocator(ExternalAllocator): ...@@ -232,6 +236,7 @@ class StaticAllocator(ExternalAllocator):
self.add_member("repr_", "std::string") self.add_member("repr_", "std::string")
self.add_member("thrust_tmp_tensor_", "tv::Tensor") self.add_member("thrust_tmp_tensor_", "tv::Tensor")
self.grow = 1.5 self.grow = 1.5
self.cuda_common_kernel = CudaCommonKernel()
@pccm.pybind.mark @pccm.pybind.mark
@pccm.constructor @pccm.constructor
...@@ -242,7 +247,22 @@ class StaticAllocator(ExternalAllocator): ...@@ -242,7 +247,22 @@ class StaticAllocator(ExternalAllocator):
code.raw(f""" code.raw(f"""
std::stringstream ss; std::stringstream ss;
for (auto& p : tensor_dict){{ for (auto& p : tensor_dict){{
tv::ssprint(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n"); tv::sstream_print(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
}}
repr_ = ss.str();
""")
return code
@pccm.pybind.mark
@pccm.member_function
def set_new_tensor_dict(self):
code = pccm.code()
code.arg("tensor_dict", "std::unordered_map<std::string, tv::Tensor>")
code.raw(f"""
tensor_dict_ = tensor_dict;
std::stringstream ss;
for (auto& p : tensor_dict){{
tv::sstream_print(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
}} }}
repr_ = ss.str(); repr_ = ss.str();
""") """)
...@@ -255,12 +275,21 @@ class StaticAllocator(ExternalAllocator): ...@@ -255,12 +275,21 @@ class StaticAllocator(ExternalAllocator):
code.arg("shape", "std::vector<int64_t>") code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.raw(f""" code.raw(f"""
auto res = get_tensor_by_name(name); auto res = get_tensor_by_name(name);
size_t total = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int64_t>()); size_t total = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int64_t>());
TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype)) TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype)) / 8
&& res.device() == device, "alloc failed", shape, res.shape()); && res.device() == device, "alloc failed, tensor size too small", shape, res.shape());
return tv::from_blob(res.raw_data(), shape, dtype, device);
// if (is_temp_memory){{
// }}else{{
// // size must exactly match
// TV_ASSERT_RT_ERR(res.nbytes() == total * tv::bit_size(tv::DType(dtype)) / 8
// && res.device() == device, "alloc failed, named memory size must match", shape, res.shape());
// }}
return tv::from_blob(res.raw_data(), shape, tv::DType(dtype), device);
""") """)
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
...@@ -273,16 +302,22 @@ class StaticAllocator(ExternalAllocator): ...@@ -273,16 +302,22 @@ class StaticAllocator(ExternalAllocator):
code.arg("shape", "std::vector<int64_t>") code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
code.raw(f""" code.raw(f"""
auto tvctx = tv::Context(); auto tvctx = tv::Context();
""")
if not CUMM_CPU_ONLY_BUILD:
code.raw(f"""
tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream)); tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
auto blob = _get_raw_and_check(name, shape, dtype, device); """)
code.raw(f"""
auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
return blob.zero_(tvctx); return blob.zero_(tvctx);
""") """)
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
@pccm.pybind.mark @pccm.pybind.mark
@pccm.member_function(virtual=True) @pccm.member_function(virtual=True)
def empty(self): def empty(self):
...@@ -291,8 +326,8 @@ class StaticAllocator(ExternalAllocator): ...@@ -291,8 +326,8 @@ class StaticAllocator(ExternalAllocator):
code.arg("shape", "std::vector<int64_t>") code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
code.raw(f""" code.raw(f"""
if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{ if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{
// thrust tmp shouldn't inside tensor_dict. use a simple method to allocate // thrust tmp shouldn't inside tensor_dict. use a simple method to allocate
...@@ -300,23 +335,28 @@ class StaticAllocator(ExternalAllocator): ...@@ -300,23 +335,28 @@ class StaticAllocator(ExternalAllocator):
// so we can just use one tensor // so we can just use one tensor
tv::Tensor res = thrust_tmp_tensor_; tv::Tensor res = thrust_tmp_tensor_;
if (res.empty()){{ if (res.empty()){{
res = tv::empty(shape, dtype, device); res = tv::empty(shape, tv::DType(dtype), device);
thrust_tmp_tensor_ = res; thrust_tmp_tensor_ = res;
}} }}
if (shape[0] > thrust_tmp_tensor_.dim(0)){{ if (shape[0] > thrust_tmp_tensor_.dim(0)){{
res = tv::empty({{int64_t(shape[0] * {self.grow})}}, dtype, device); res = tv::empty({{int64_t(shape[0] * {self.grow})}}, tv::DType(dtype), device);
thrust_tmp_tensor_ = res; thrust_tmp_tensor_ = res;
}} }}
return res; return res;
}}else{{ }}else{{
auto blob = _get_raw_and_check(name, shape, dtype, device); auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
return blob; return blob;
}} }}
""") """)
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
# cpu only build can't use pccm.cuda
__CUDA_DECORATOR = pccm.member_function
if not CUMM_CPU_ONLY_BUILD:
__CUDA_DECORATOR = pccm.cuda.member_function
@pccm.pybind.mark @pccm.pybind.mark
@pccm.member_function(virtual=True) @__CUDA_DECORATOR
def full_int(self): def full_int(self):
code = pccm.code() code = pccm.code()
code.arg("name", "std::string") code.arg("name", "std::string")
...@@ -324,17 +364,36 @@ class StaticAllocator(ExternalAllocator): ...@@ -324,17 +364,36 @@ class StaticAllocator(ExternalAllocator):
code.arg("value", "int") code.arg("value", "int")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
code.raw(f""" code.raw(f"""
auto tvctx = tv::Context(); auto tvctx = tv::Context();
auto blob = _get_raw_and_check(name, shape, dtype, device); auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
return blob.fill_(tvctx, value);
""")
if not CUMM_CPU_ONLY_BUILD:
code.add_param_class("cudakers", self.cuda_common_kernel)
code.raw(f"""
tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
using ints_t = std::tuple<int32_t, int16_t, int8_t, int64_t, uint32_t, uint64_t, uint16_t, uint8_t>;
tv::Dispatch<ints_t>()(blob.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
tv::cuda::Launch lanucher_fill(blob.size(), reinterpret_cast<cudaStream_t>(stream));
lanucher_fill(cudakers::fill_kernel<T>, blob.data_ptr<T>(), value, blob.size());
}});
""")
else:
code.raw(f"""
blob.fill_(value);
""")
code.raw(f"""
return blob;
""") """)
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
@pccm.pybind.mark @pccm.pybind.mark
@pccm.member_function(virtual=True) @__CUDA_DECORATOR
def full_float(self): def full_float(self):
code = pccm.code() code = pccm.code()
code.arg("name", "std::string") code.arg("name", "std::string")
...@@ -342,11 +401,29 @@ class StaticAllocator(ExternalAllocator): ...@@ -342,11 +401,29 @@ class StaticAllocator(ExternalAllocator):
code.arg("value", "float") code.arg("value", "float")
code.arg("dtype", "int") code.arg("dtype", "int")
code.arg("device", "int") code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0") code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
code.raw(f"""
auto tvctx = tv::Context();
auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
""")
if not CUMM_CPU_ONLY_BUILD:
code.add_param_class("cudakers", self.cuda_common_kernel)
code.raw(f""" code.raw(f"""
auto blob = _get_raw_and_check(name, shape, dtype, device); tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
return blob.fill_(tvctx, value); using dtypes_t = std::tuple<float, double>;
tv::Dispatch<dtypes_t>()(blob.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
tv::cuda::Launch lanucher_fill(blob.size(), reinterpret_cast<cudaStream_t>(stream));
lanucher_fill(cudakers::fill_kernel<T>, blob.data_ptr<T>(), value, blob.size());
}});
""")
else:
code.raw(f"""
blob.fill_(value);
""")
code.raw(f"""
return blob;
""") """)
return code.ret("tv::Tensor") return code.ret("tv::Tensor")
...@@ -364,6 +441,7 @@ class StaticAllocator(ExternalAllocator): ...@@ -364,6 +441,7 @@ class StaticAllocator(ExternalAllocator):
@pccm.pybind.mark @pccm.pybind.mark
@pccm.member_function(virtual=True) @pccm.member_function(virtual=True)
def free(self): def free(self):
# nothing here because this is a static allocator
code = pccm.code() code = pccm.code()
code.arg("ten", "tv::Tensor") code.arg("ten", "tv::Tensor")
return code return code
......
...@@ -78,11 +78,9 @@ class ExternalSpconvMatmul(pccm.Class): ...@@ -78,11 +78,9 @@ class ExternalSpconvMatmul(pccm.Class):
return code return code
class SimpleExternalSpconvMatmul(ExternalSpconvMatmul): class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
"""a helper class to warp matmul operations """implement gemm in cuda via cublasLt. (only support forward)
because we don't want to implement matmul should be used with tensorrt plugin.
(link to cublas/mkl/pytorch) in python package.
""" """
def __init__(self): def __init__(self):
super().__init__() super().__init__()
self.add_dependency(TensorView, ExternalAllocator) self.add_dependency(TensorView, ExternalAllocator)
...@@ -311,7 +309,7 @@ class SimpleExternalSpconvMatmul(ExternalSpconvMatmul): ...@@ -311,7 +309,7 @@ class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
TV_THROW_RT_ERR("unsupported"); TV_THROW_RT_ERR("unsupported");
}} }}
check_cublas_status(cublasLtMatmul( check_cublas_status(cublasLtMatmul(
handle, operationDesc, alpha_storage, a.raw_data(), Adesc, b.raw_data(), handle, operationDesc, alpha_storage, a.const_raw_data(), Adesc, b.const_raw_data(),
Bdesc, beta_storage, c.raw_data(), Cdesc, c.raw_data(), Cdesc, Bdesc, beta_storage, c.raw_data(), Cdesc, c.raw_data(), Cdesc,
&heuristicResult.algo, nullptr, 0, stream)); &heuristicResult.algo, nullptr, 0, stream));
if (preference) if (preference)
...@@ -1417,11 +1415,12 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1417,11 +1415,12 @@ class ConvGemmOps(pccm.ParameterizedClass):
is_KC_not_CK, kv_center, out_channel); is_KC_not_CK, kv_center, out_channel);
}}else{{ }}else{{
out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)}, out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)},
{{num_activate_out, out_channel}}, features.dtype(), features.device()); {{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
}} }}
if (kv == 1 && subm){{ if (kv == 1 && subm){{
return; return;
}} }}
auto indice_pair_num_cpu = indice_pair_num.cpu(); auto indice_pair_num_cpu = indice_pair_num.cpu();
auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>(); auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>();
int maxnhot = 0; int maxnhot = 0;
...@@ -1618,7 +1617,7 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1618,7 +1617,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
int kv_center = kv / 2; int kv_center = kv / 2;
tv::Tensor din; tv::Tensor din;
auto dfilters = allocator.zeros({pccm.literal(AllocKeys.DFilters)}, auto dfilters = allocator.zeros({pccm.literal(AllocKeys.DFilters)},
prev_filter_shape_vec, features.dtype(), features.device()); prev_filter_shape_vec, features.dtype(), features.device(), stream_int);
dfilters = dfilters.view(filters.shape()); dfilters = dfilters.view(filters.shape());
if (subm){{ if (subm){{
din = ext_mm.indice_conv_bwd_init_gemm({pccm.literal(AllocKeys.Features)}, din = ext_mm.indice_conv_bwd_init_gemm({pccm.literal(AllocKeys.Features)},
...@@ -1628,7 +1627,7 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1628,7 +1627,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
is_KC_not_CK, kv_center); is_KC_not_CK, kv_center);
}}else{{ }}else{{
din = allocator.zeros({pccm.literal(AllocKeys.DIn)}, din = allocator.zeros({pccm.literal(AllocKeys.DIn)},
features.shape_vector(), features.dtype(), features.device()); features.shape_vector(), features.dtype(), features.device(), stream_int);
}} }}
if (kv == 1 && subm){{ if (kv == 1 && subm){{
return; return;
...@@ -1922,10 +1921,10 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1922,10 +1921,10 @@ class ConvGemmOps(pccm.ParameterizedClass):
tv::Tensor out_features; tv::Tensor out_features;
if (is_subm){{ if (is_subm){{
out_features = allocator.empty({pccm.literal(AllocKeys.OutFeatures)}, out_features = allocator.empty({pccm.literal(AllocKeys.OutFeatures)},
{{num_activate_out, out_channel}}, features.dtype(), features.device()); {{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
}}else{{ }}else{{
out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)}, out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)},
{{num_activate_out, out_channel}}, features.dtype(), features.device()); {{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
}} }}
auto arch = get_compute_capability(); auto arch = get_compute_capability();
constexpr auto kForwardInt = static_cast<int>(tv::gemm::ConvOpType::kForward); constexpr auto kForwardInt = static_cast<int>(tv::gemm::ConvOpType::kForward);
...@@ -1966,7 +1965,7 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -1966,7 +1965,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
if (is_train){{ if (is_train){{
mask_output_fwd = allocator.empty({pccm.literal(AllocKeys.MaskOutputFwd)}, mask_output_fwd = allocator.empty({pccm.literal(AllocKeys.MaskOutputFwd)},
{{num_split, tv::div_up(num_activate_out, mask_width)}}, {{num_split, tv::div_up(num_activate_out, mask_width)}},
tv::uint32, features.device()); tv::uint32, features.device(), stream_int);
for (int i = 0; i < num_split; ++i){{ for (int i = 0; i < num_split; ++i){{
mask_output_fwd_splits.push_back(mask_output_fwd[i]); mask_output_fwd_splits.push_back(mask_output_fwd[i]);
}} }}
...@@ -2042,13 +2041,13 @@ class ConvGemmOps(pccm.ParameterizedClass): ...@@ -2042,13 +2041,13 @@ class ConvGemmOps(pccm.ParameterizedClass):
tv::Tensor din; tv::Tensor din;
if (is_subm){{ if (is_subm){{
din = allocator.empty({pccm.literal(AllocKeys.DIn)}, din = allocator.empty({pccm.literal(AllocKeys.DIn)},
features.shape_vector(), features.dtype(), features.device()); features.shape_vector(), features.dtype(), features.device(), stream_int);
}}else{{ }}else{{
din = allocator.zeros({pccm.literal(AllocKeys.DIn)}, din = allocator.zeros({pccm.literal(AllocKeys.DIn)},
features.shape_vector(), features.dtype(), features.device()); features.shape_vector(), features.dtype(), features.device(), stream_int);
}} }}
tv::Tensor dfilters = allocator.zeros({pccm.literal(AllocKeys.DFilters)}, tv::Tensor dfilters = allocator.zeros({pccm.literal(AllocKeys.DFilters)},
filters_shape_vec, filters.dtype(), filters.device()); filters_shape_vec, filters.dtype(), filters.device(), stream_int);
dfilters = dfilters.view(out_channel, -1, in_channel); dfilters = dfilters.view(out_channel, -1, in_channel);
constexpr auto kForwardInt = static_cast<int>(tv::gemm::ConvOpType::kForward); constexpr auto kForwardInt = static_cast<int>(tv::gemm::ConvOpType::kForward);
......
This diff is collapsed.
...@@ -180,6 +180,85 @@ class IndiceMaxPool(pccm.Class): ...@@ -180,6 +180,85 @@ class IndiceMaxPool(pccm.Class):
""") """)
return code return code
@pccm.cuda.cuda_global_function
def forward_avgpool_implicit_gemm_kernel(self):
# Builds the code object for a CUDA global function, templated on the
# element type T, that average-pools input feature rows into output rows.
code = pccm.FunctionCode()
code.targ("T")
# Kernel arguments: output/input feature pointers, the index table,
# an optional per-row counter (may be nullptr, see the check in the body),
# and the three sizes used for addressing.
code.arg("out_features", f"T*")
code.arg("in_features", f"const T*")
code.arg("indices", "const int*")
code.arg("count_out", "int*")
code.arg("num_features", "int")
code.arg("RS", "int")
code.arg("num_indices", "int")
# Kernel body, as written below:
#   * outer loop (KernelLoopY) walks rows i in [0, num_indices);
#   * the first inner loop reads indices[i + k * num_indices] for
#     k in [0, RS) and counts the entries that are not -1; that count is
#     stored to count_out[i] when count_out is non-null;
#   * the feature loop (KernelLoopX) walks columns j in [0, num_features),
#     sums in_features[in_idx * num_features + j] over the valid entries
#     (invalid ones contribute T(0)) and writes sum / count to
#     out_features[i * num_features + j], or T(0) when count is 0.
# The doubled braces are f-string escapes for literal C++ braces.
code.raw(f"""
for (int i : tv::KernelLoopY<int>(num_indices)) {{
auto out_ptr = out_features + i * num_features;
auto indices_ptr = indices + i;
int in_idx = 0;
int count = 0;
for (int k = 0; k < RS; ++k){{
in_idx = indices_ptr[0];
count += int(in_idx != -1);
indices_ptr += num_indices;
}}
if (count_out != nullptr){{
count_out[i] = count;
}}
for (int j : tv::KernelLoopX<int>(num_features)) {{
indices_ptr = indices + i;
int in_idx;
T in, in_temp;
in = T(0);
for (int k = 0; k < RS; ++k){{
in_idx = indices_ptr[0];
bool valid = in_idx != -1;
in_temp = valid ? in_features[in_idx * num_features + j] : T(0);
in += in_temp;
indices_ptr += num_indices;
}}
out_ptr[j] = count > 0 ? in / T(count) : T(0);
}}
}}
""")
return code
@pccm.cuda.cuda_global_function
def backward_avgpool_implicit_gemm_kernel(self):
    # Builds the code object for a CUDA global function, templated on the
    # element type T, that computes the input gradient of average pooling.
    #
    # The matching forward kernel writes out = sum(in) / count, so the
    # gradient reaching an input row from output row `out_idx` is
    # dout[out_idx] / count_out[out_idx]. The previous body multiplied by
    # the count instead of dividing, and mixed `int` and `T` operands in
    # one conditional expression.
    code = pccm.FunctionCode()
    code.targ("T")
    # Kernel arguments: output-gradient (read) and input-gradient (written)
    # feature pointers, the backward index table, the per-output counts,
    # and the three sizes used for addressing.
    code.arg("dout_features", "const T*")
    code.arg("din_features", "T*")
    code.arg("indices_bwd", "const int*")
    code.arg("count_out", "const int*")
    code.arg("num_features", "int")
    code.arg("RS", "int")
    code.arg("num_indices", "int")
    # Kernel body:
    #   * outer loop (KernelLoopY) walks input rows i in [0, num_indices);
    #   * feature loop (KernelLoopX) walks columns j in [0, num_features);
    #   * for k in [0, RS) the entry indices_bwd[i + k * num_indices] names
    #     an output row, or is -1 when the slot is empty; each valid output
    #     contributes dout / count, and a non-positive count is skipped so
    #     the division is always well defined.
    # The doubled braces are f-string escapes for literal C++ braces.
    code.raw(f"""
    for (int i : tv::KernelLoopY<int>(num_indices)) {{
        auto din_ptr = din_features + i * num_features;
        for (int j : tv::KernelLoopX<int>(num_features)) {{
            auto indices_ptr = indices_bwd + i;
            T sum_val = T(0);
            for (int k = 0; k < RS; ++k){{
                int out_idx = indices_ptr[0];
                if (out_idx != -1){{
                    int count = count_out[out_idx];
                    if (count > 0){{
                        // derivative of a mean over `count` inputs is 1 / count
                        sum_val += dout_features[out_idx * num_features + j] / T(count);
                    }}
                }}
                indices_ptr += num_indices;
            }}
            din_ptr[j] = sum_val;
        }}
    }}
    """)
    return code
@pccm.cuda.static_function @pccm.cuda.static_function
def forward(self): def forward(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
...@@ -348,6 +427,92 @@ class IndiceMaxPool(pccm.Class): ...@@ -348,6 +427,92 @@ class IndiceMaxPool(pccm.Class):
""") """)
return code return code
@pccm.cuda.static_function
def forward_avgpool_implicit_gemm(self):
    """Emit the host-side launcher for the average-pool forward kernel.

    The generated function checks that ``inds`` has ``out.dim(0)`` columns
    and that ``in`` has ``out.dim(1)`` columns, then dispatches on
    ``out.dtype()`` over float, double, half and bfloat16. The block shape
    is ``(NumFeatures, 512 / NumFeatures)``, where ``NumFeatures`` is the
    first of 512, 256, 128, 64, 32, 16 that ``out.dim(1)`` is greater than
    or equal to, falling back to 16. The kernel is launched on ``stream``
    with ``num_features = out.dim(1)``, ``RS = inds.dim(0)`` and
    ``num_indices = inds.dim(1)``.
    """
    launcher_code = pccm.FunctionCode()
    # Every tensor argument shares the same C++ type.
    for tensor_name in ("out", "in", "inds", "count_out"):
        launcher_code.arg(tensor_name, "tv::Tensor")
    launcher_code.arg("stream", "std::uintptr_t", "0")
    launcher_code.raw(f"""
auto nhot = out.dim(0);
tv::check_shape(inds, {{-1, nhot}});
tv::check_shape(in, {{-1, out.dim(1)}});
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
constexpr int MaxThreads = 512;
tv::cuda::Launch launcher(1);
bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
// if out.dim(1) > value in list above, run this function.
// if a value is found, other value won't be executed.
int NumFeatures = TV_DECLTYPE(V)::value;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}});
if (!found){{
int NumFeatures = 16;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}}
launcher(forward_avgpool_implicit_gemm_kernel<T>, out.data_ptr<T>(), in.data_ptr<const T>(),
inds.data_ptr<const int>(), count_out.data_ptr<int>(), out.dim(1), inds.dim(0), inds.dim(1));
}});
""")
    return launcher_code
@pccm.cuda.static_function
def backward_avgpool_implicit_gemm(self):
    """Emit the host-side launcher for the average-pool backward kernel.

    The generated function requires a non-empty ``count_out``. It checks
    that ``inds`` has ``din.dim(0)`` columns and that ``din`` has
    ``dout.dim(1)`` columns, then dispatches on ``dout.dtype()`` over
    float, double, half and bfloat16. The block shape is
    ``(NumFeatures, 512 / NumFeatures)``, where ``NumFeatures`` is the first
    of 512, 256, 128, 64, 32, 16 that ``dout.dim(1)`` is greater than or
    equal to, falling back to 16. The kernel is launched on ``stream`` with
    ``num_features = dout.dim(1)``, ``RS = inds.dim(0)`` and
    ``num_indices = inds.dim(1)``.

    The unused local ``num_act_out`` was removed: it was never read, held
    the channel count rather than a row count, and narrowed to ``int``.
    """
    code = pccm.FunctionCode()
    code.arg("dout", "tv::Tensor")
    code.arg("din", "tv::Tensor")
    code.arg("inds", "tv::Tensor")
    code.arg("count_out", "tv::Tensor")
    code.arg("stream", "std::uintptr_t", "0")
    code.raw(f"""
auto nhot = din.dim(0);
TV_ASSERT_RT_ERR(!count_out.empty(), "count out must not empty")
tv::check_shape(inds, {{-1, nhot}});
tv::check_shape(din, {{-1, dout.dim(1)}});
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(dout.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
constexpr int MaxThreads = 512;
tv::cuda::Launch launcher(1);
bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(dout.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
// if out.dim(1) > value in list above, run this function.
// if a value is found, other value won't be executed.
int NumFeatures = TV_DECLTYPE(V)::value;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(dout.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}});
if (!found){{
int NumFeatures = 16;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(dout.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}}
launcher(backward_avgpool_implicit_gemm_kernel<T>,
dout.data_ptr<const T>(), din.data_ptr<T>(),
inds.data_ptr<const int>(), count_out.data_ptr<const int>(),
dout.dim(1), inds.dim(0), inds.dim(1));
}});
""")
    return code
class IndiceMaxPoolCPU(pccm.Class): class IndiceMaxPoolCPU(pccm.Class):
def __init__(self): def __init__(self):
......
...@@ -297,7 +297,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin): ...@@ -297,7 +297,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
self.add_dependency(TensorView) self.add_dependency(TensorView)
self.p2v_c = Point2VoxelCommon(dtype, ndim, zyx) self.p2v_c = Point2VoxelCommon(dtype, ndim, zyx)
self.add_param_class("p2v_c", self.p2v_c, "Point2VoxelCommon") self.add_param_class("p2v_c", self.p2v_c, "Point2VoxelCommon")
layout = TensorGeneric(ndim, True) layout = TensorGeneric(ndim, False)
self.add_param_class("layout_ns", layout, "Layout") self.add_param_class("layout_ns", layout, "Layout")
self.dtype = dtype self.dtype = dtype
self.ndim = ndim self.ndim = ndim
...@@ -489,7 +489,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin): ...@@ -489,7 +489,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True): def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
super().__init__() super().__init__()
self.add_dependency(TensorView) self.add_dependency(TensorView)
layout = TensorGeneric(ndim, True) layout = TensorGeneric(ndim, False)
self.add_param_class("layout_ns", layout, "Layout") self.add_param_class("layout_ns", layout, "Layout")
self.dtype = dtype self.dtype = dtype
self.ndim = ndim self.ndim = ndim
......
# Copyright 2022 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -10,33 +10,41 @@ from spconv.core import (IMPLGEMM_SIMT_PARAMS, IMPLGEMM_TURING_PARAMS, ...@@ -10,33 +10,41 @@ from spconv.core import (IMPLGEMM_SIMT_PARAMS, IMPLGEMM_TURING_PARAMS,
SHUFFLE_TURING_PARAMS, SHUFFLE_VOLTA_PARAMS) SHUFFLE_TURING_PARAMS, SHUFFLE_VOLTA_PARAMS)
from spconv.csrc.hash.core import HashTable from spconv.csrc.hash.core import HashTable
from spconv.csrc.sparse.all import SpconvOps from spconv.csrc.sparse.all import SpconvOps
from spconv.csrc.sparse.alloc import ExternalAllocator from spconv.csrc.sparse.alloc import ExternalAllocator, StaticAllocator
from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple, from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
ExternalSpconvMatmul, GemmTunerSimple, ExternalSpconvMatmul, GemmTunerSimple,
SimpleExternalSpconvMatmul) SimpleExternalSpconvMatmul)
from spconv.csrc.utils import BoxOps from spconv.csrc.utils import BoxOps
from cumm.gemm.algospec.core import (GemmAlgo, ShuffleStrideType)
from cumm.conv.bases import ConvLayout, ConvLayoutType, ConvOpType
def main(include: str, def main(include: str,
src: str, src: str,
libname: str = "spconv", libname: str = "spconv",
prefix: str = "spconvlib"): prefix: str = "spconvlib",
inference_only: bool = False):
all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle)) all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
if inference_only:
all_shuffle = list(filter(lambda x: x.shuffle_stride != ShuffleStrideType.ShuffleAB, all_shuffle))
cu = GemmMainUnitTest(all_shuffle) cu = GemmMainUnitTest(all_shuffle)
cu.namespace = "cumm.gemm.main" cu.namespace = "cumm.gemm.main"
all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
IMPLGEMM_TURING_PARAMS) IMPLGEMM_TURING_PARAMS)
# all_imp = IMPLGEMM_SIMT_PARAMS # all_imp = IMPLGEMM_SIMT_PARAMS
all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp)) all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
if inference_only:
all_imp = list(filter(lambda x: x.op_type == ConvOpType.kForward, all_imp))
convcu = ConvMainUnitTest(all_imp) convcu = ConvMainUnitTest(all_imp)
convcu.namespace = "cumm.conv.main" convcu.namespace = "cumm.conv.main"
gemmtuner = GemmTunerSimple(cu) gemmtuner = GemmTunerSimple(cu)
gemmtuner.namespace = "csrc.sparse.convops.gemmops" gemmtuner.namespace = "spconv.csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(convcu) convtuner = ConvTunerSimple(convcu)
convtuner.namespace = "csrc.sparse.convops.convops" convtuner.namespace = "spconv.csrc.sparse.convops.convops"
convops = ConvGemmOps(gemmtuner, convtuner) convops = ConvGemmOps(gemmtuner, convtuner)
convops.namespace = "csrc.sparse.convops.spops" convops.namespace = "spconv.csrc.sparse.convops.spops"
cus = [ cus = [
cu, cu,
...@@ -51,6 +59,7 @@ def main(include: str, ...@@ -51,6 +59,7 @@ def main(include: str,
ExternalAllocator(), ExternalAllocator(),
ExternalSpconvMatmul(), ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(), SimpleExternalSpconvMatmul(),
StaticAllocator(),
] ]
gen_cmake(libname, cus, include, src, namespace_prefix=prefix) gen_cmake(libname, cus, include, src, namespace_prefix=prefix)
......
...@@ -17,7 +17,9 @@ from spconv.pytorch.modules import (SparseModule, SparseSequential, ...@@ -17,7 +17,9 @@ from spconv.pytorch.modules import (SparseModule, SparseSequential,
assign_name_for_sparse_modules) assign_name_for_sparse_modules)
from spconv.pytorch.ops import ConvAlgo from spconv.pytorch.ops import ConvAlgo
from spconv.pytorch.pool import (SparseMaxPool1d, SparseMaxPool2d, from spconv.pytorch.pool import (SparseMaxPool1d, SparseMaxPool2d,
SparseMaxPool3d, SparseMaxPool4d) SparseMaxPool3d, SparseMaxPool4d,
SparseAvgPool1d, SparseAvgPool2d,
SparseAvgPool3d)
from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable
......
...@@ -38,6 +38,9 @@ from torch.nn.init import calculate_gain ...@@ -38,6 +38,9 @@ from torch.nn.init import calculate_gain
FILTER_HWIO = False FILTER_HWIO = False
_MAX_NUM_VOXELS_DURING_TRAINING = "max_num_voxels_during_training"
class SparseConvolution(SparseModule): class SparseConvolution(SparseModule):
__constants__ = [ __constants__ = [
'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse', 'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
...@@ -61,6 +64,7 @@ class SparseConvolution(SparseModule): ...@@ -61,6 +64,7 @@ class SparseConvolution(SparseModule):
indice_key: Optional[str] = None, indice_key: Optional[str] = None,
algo: Optional[ConvAlgo] = None, algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None, fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None): name=None):
super(SparseConvolution, self).__init__(name=name) super(SparseConvolution, self).__init__(name=name)
assert groups == 1, "don't support groups for now" assert groups == 1, "don't support groups for now"
...@@ -89,6 +93,12 @@ class SparseConvolution(SparseModule): ...@@ -89,6 +93,12 @@ class SparseConvolution(SparseModule):
self.groups = groups self.groups = groups
self.subm = subm self.subm = subm
self.indice_key = indice_key self.indice_key = indice_key
if record_voxel_count and not self.subm and not self.inverse:
# When record_voxel_count is set, the maximum voxel count is tracked
# during both training and inference.
self.register_buffer(_MAX_NUM_VOXELS_DURING_TRAINING,
torch.zeros(1, dtype=torch.int32))
self.record_voxel_count = record_voxel_count
if algo is None: if algo is None:
if kv <= 32 and not CPU_ONLY_BUILD: if kv <= 32 and not CPU_ONLY_BUILD:
if kv < 8: if kv < 8:
...@@ -122,37 +132,46 @@ class SparseConvolution(SparseModule): ...@@ -122,37 +132,46 @@ class SparseConvolution(SparseModule):
else: else:
self.register_parameter('bias', None) self.register_parameter('bias', None)
self.reset_parameters() self.reset_parameters()
if hasattr(self, "_register_load_state_dict_pre_hook"):
self._register_load_state_dict_pre_hook(self._load_weight_different_layout) self._register_load_state_dict_pre_hook(
self._load_weight_different_layout)
def _load_weight_different_layout(self, state_dict, prefix, local_metadata,
                                  strict, missing_keys, unexpected_keys,
                                  error_msgs):
    """Adapt an incoming ``state_dict`` in place before it is loaded.

    Registered as a ``load_state_dict`` pre-hook. It does two things:

    * When this layer records voxel counts (``record_voxel_count`` set and
      the layer is neither submanifold nor inverse) and the checkpoint has
      no entry for the voxel-count buffer, a zero ``int32`` tensor is
      inserted.
    * When ``SAVED_WEIGHT_LAYOUT`` is set, ``state_dict[prefix + "weight"]``
      is permuted according to that layout and this layer's ``algo``.

    The buffer lookup uses the fully prefixed key. The previous check
    tested the bare buffer name, which cannot match for a module loaded
    with a non-empty ``prefix``, so a saved count was always overwritten
    with zeros.

    Args:
        state_dict: mapping of fully prefixed keys to tensors; mutated.
        prefix: key prefix of this module inside ``state_dict``.
        local_metadata, strict, missing_keys, unexpected_keys, error_msgs:
            unused; present to match the pre-hook signature.
    """
    voxel_key = prefix + _MAX_NUM_VOXELS_DURING_TRAINING
    tracks_voxels = (self.record_voxel_count and not self.subm
                     and not self.inverse)
    if tracks_voxels and voxel_key not in state_dict:
        state_dict[voxel_key] = torch.zeros(1, dtype=torch.int32)
    if not SAVED_WEIGHT_LAYOUT:
        return
    key = prefix + "weight"
    assert key in state_dict
    ndim = self.ndim
    # NOTE(review): for "RSKC"/"RSCK" the weight is permuted here and then
    # permuted again in the KRSC branch below -- confirm that applying
    # both permutations is intended.
    if SAVED_WEIGHT_LAYOUT == "RSKC":
        state_dict[key] = state_dict[key].permute(ndim, *range(ndim),
                                                  ndim + 1).contiguous()
    elif SAVED_WEIGHT_LAYOUT == "RSCK":
        state_dict[key] = state_dict[key].permute(ndim + 1, *range(ndim),
                                                  ndim).contiguous()
    if ALL_WEIGHT_IS_KRSC or self.algo != ConvAlgo.Native:
        # in spconv 2.2, we only support KRSC layout.
        if SAVED_WEIGHT_LAYOUT == "RSKC":
            state_dict[key] = state_dict[key].permute(
                ndim, *range(ndim), ndim + 1).contiguous()
        elif SAVED_WEIGHT_LAYOUT == "RSCK":
            state_dict[key] = state_dict[key].permute(
                ndim + 1, *range(ndim), ndim).contiguous()
    else:
        if self.algo == ConvAlgo.Native:
            # to RSCK
            if SAVED_WEIGHT_LAYOUT == "RSKC":
                state_dict[key] = state_dict[key].permute(
                    *range(ndim), ndim + 1, ndim).contiguous()
            elif SAVED_WEIGHT_LAYOUT == "KRSC":
                state_dict[key] = state_dict[key].permute(
                    *range(1, ndim + 1), 0, ndim + 1).contiguous()
def extra_repr(self): def extra_repr(self):
s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}' s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
...@@ -218,6 +237,9 @@ class SparseConvolution(SparseModule): ...@@ -218,6 +237,9 @@ class SparseConvolution(SparseModule):
bound = 1 / math.sqrt(fan_in) bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound) init.uniform_(self.bias, -bound, bound)
def is_inverseable(self):
    """Return True when the layer is not submanifold and has an indice key."""
    if self.subm:
        return False
    return self.indice_key is not None
def forward(self, input: SparseConvTensor): def forward(self, input: SparseConvTensor):
assert isinstance(input, SparseConvTensor) assert isinstance(input, SparseConvTensor)
assert input.features.shape[ assert input.features.shape[
...@@ -410,7 +432,6 @@ class SparseConvolution(SparseModule): ...@@ -410,7 +432,6 @@ class SparseConvolution(SparseModule):
self._check_subm_reuse_valid(input, spatial_shape, self._check_subm_reuse_valid(input, spatial_shape,
datas) datas)
else: else:
with input._timer.namespace("gen_pairs"): with input._timer.namespace("gen_pairs"):
# we need to gen bwd indices for regular conv # we need to gen bwd indices for regular conv
# because it may be inversed. # because it may be inversed.
...@@ -491,6 +512,14 @@ class SparseConvolution(SparseModule): ...@@ -491,6 +512,14 @@ class SparseConvolution(SparseModule):
features.shape[0]) features.shape[0])
out_tensor.benchmark_record[self.name]["num_out_points"].append( out_tensor.benchmark_record[self.name]["num_out_points"].append(
out_features.shape[0]) out_features.shape[0])
if not self.subm and not self.inverse and self.record_voxel_count:
if hasattr(self,
_MAX_NUM_VOXELS_DURING_TRAINING):
ops.maximum_value_int_(
getattr(
self,
_MAX_NUM_VOXELS_DURING_TRAINING),
outids.shape[0])
out_tensor = out_tensor.replace_feature(out_features) out_tensor = out_tensor.replace_feature(out_features)
out_tensor.indices = outids out_tensor.indices = outids
out_tensor.indice_dict = indice_dict out_tensor.indice_dict = indice_dict
...@@ -534,8 +563,10 @@ class SparseConv1d(SparseConvolution): ...@@ -534,8 +563,10 @@ class SparseConv1d(SparseConvolution):
indice_key=None, indice_key=None,
algo: Optional[ConvAlgo] = None, algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None, fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None): name=None):
super(SparseConv1d, self).__init__(1, super(SparseConv1d,
self).__init__(1,
in_channels, in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
...@@ -547,6 +578,7 @@ class SparseConv1d(SparseConvolution): ...@@ -547,6 +578,7 @@ class SparseConv1d(SparseConvolution):
indice_key=indice_key, indice_key=indice_key,
algo=algo, algo=algo,
fp32_accum=fp32_accum, fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name) name=name)
...@@ -563,8 +595,10 @@ class SparseConv2d(SparseConvolution): ...@@ -563,8 +595,10 @@ class SparseConv2d(SparseConvolution):
indice_key=None, indice_key=None,
algo: Optional[ConvAlgo] = None, algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None, fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None): name=None):
super(SparseConv2d, self).__init__(2, super(SparseConv2d,
self).__init__(2,
in_channels, in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
...@@ -576,6 +610,7 @@ class SparseConv2d(SparseConvolution): ...@@ -576,6 +610,7 @@ class SparseConv2d(SparseConvolution):
indice_key=indice_key, indice_key=indice_key,
algo=algo, algo=algo,
fp32_accum=fp32_accum, fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name) name=name)
...@@ -592,8 +627,10 @@ class SparseConv3d(SparseConvolution): ...@@ -592,8 +627,10 @@ class SparseConv3d(SparseConvolution):
indice_key=None, indice_key=None,
algo: Optional[ConvAlgo] = None, algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None, fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None): name=None):
super(SparseConv3d, self).__init__(3, super(SparseConv3d,
self).__init__(3,
in_channels, in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
...@@ -605,6 +642,7 @@ class SparseConv3d(SparseConvolution): ...@@ -605,6 +642,7 @@ class SparseConv3d(SparseConvolution):
indice_key=indice_key, indice_key=indice_key,
algo=algo, algo=algo,
fp32_accum=fp32_accum, fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name) name=name)
...@@ -621,8 +659,10 @@ class SparseConv4d(SparseConvolution): ...@@ -621,8 +659,10 @@ class SparseConv4d(SparseConvolution):
indice_key=None, indice_key=None,
algo: Optional[ConvAlgo] = None, algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None, fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None): name=None):
super(SparseConv4d, self).__init__(4, super(SparseConv4d,
self).__init__(4,
in_channels, in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
...@@ -634,6 +674,7 @@ class SparseConv4d(SparseConvolution): ...@@ -634,6 +674,7 @@ class SparseConv4d(SparseConvolution):
indice_key=indice_key, indice_key=indice_key,
algo=algo, algo=algo,
fp32_accum=fp32_accum, fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name) name=name)
...@@ -650,8 +691,10 @@ class SparseConvTranspose1d(SparseConvolution): ...@@ -650,8 +691,10 @@ class SparseConvTranspose1d(SparseConvolution):
indice_key=None, indice_key=None,
algo: Optional[ConvAlgo] = None, algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None, fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None): name=None):
super(SparseConvTranspose1d, self).__init__(1, super(SparseConvTranspose1d,
self).__init__(1,
in_channels, in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
...@@ -664,6 +707,7 @@ class SparseConvTranspose1d(SparseConvolution): ...@@ -664,6 +707,7 @@ class SparseConvTranspose1d(SparseConvolution):
indice_key=indice_key, indice_key=indice_key,
algo=algo, algo=algo,
fp32_accum=fp32_accum, fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name) name=name)
...@@ -680,8 +724,10 @@ class SparseConvTranspose2d(SparseConvolution): ...@@ -680,8 +724,10 @@ class SparseConvTranspose2d(SparseConvolution):
indice_key=None, indice_key=None,
algo: Optional[ConvAlgo] = None, algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None, fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None): name=None):
super(SparseConvTranspose2d, self).__init__(2, super(SparseConvTranspose2d,
self).__init__(2,
in_channels, in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
...@@ -694,6 +740,7 @@ class SparseConvTranspose2d(SparseConvolution): ...@@ -694,6 +740,7 @@ class SparseConvTranspose2d(SparseConvolution):
indice_key=indice_key, indice_key=indice_key,
algo=algo, algo=algo,
fp32_accum=fp32_accum, fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name) name=name)
...@@ -710,8 +757,10 @@ class SparseConvTranspose3d(SparseConvolution): ...@@ -710,8 +757,10 @@ class SparseConvTranspose3d(SparseConvolution):
indice_key=None, indice_key=None,
algo: Optional[ConvAlgo] = None, algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None, fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None): name=None):
super(SparseConvTranspose3d, self).__init__(3, super(SparseConvTranspose3d,
self).__init__(3,
in_channels, in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
...@@ -724,6 +773,7 @@ class SparseConvTranspose3d(SparseConvolution): ...@@ -724,6 +773,7 @@ class SparseConvTranspose3d(SparseConvolution):
indice_key=indice_key, indice_key=indice_key,
algo=algo, algo=algo,
fp32_accum=fp32_accum, fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name) name=name)
...@@ -740,8 +790,10 @@ class SparseConvTranspose4d(SparseConvolution): ...@@ -740,8 +790,10 @@ class SparseConvTranspose4d(SparseConvolution):
indice_key=None, indice_key=None,
algo: Optional[ConvAlgo] = None, algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None, fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None): name=None):
super(SparseConvTranspose4d, self).__init__(4, super(SparseConvTranspose4d,
self).__init__(4,
in_channels, in_channels,
out_channels, out_channels,
kernel_size, kernel_size,
...@@ -754,6 +806,7 @@ class SparseConvTranspose4d(SparseConvolution): ...@@ -754,6 +806,7 @@ class SparseConvTranspose4d(SparseConvolution):
indice_key=indice_key, indice_key=indice_key,
algo=algo, algo=algo,
fp32_accum=fp32_accum, fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name) name=name)
......
...@@ -12,13 +12,14 @@ ...@@ -12,13 +12,14 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from typing import List, Optional, Tuple, Union, Dict from typing import Any, List, Optional, Tuple, Union, Dict
import numpy as np import numpy as np
import torch import torch
from spconv.core import ConvAlgo from spconv.core import ConvAlgo
from spconv.pytorch.constants import PYTORCH_VERSION from spconv.pytorch.constants import PYTORCH_VERSION
from spconv.tools import CUDAKernelTimer from spconv.tools import CUDAKernelTimer
from spconv.constants import SPCONV_FX_TRACE_MODE
if PYTORCH_VERSION >= [1, 8, 0]: if PYTORCH_VERSION >= [1, 8, 0]:
try: try:
...@@ -59,7 +60,8 @@ class ThrustSortAllocator: ...@@ -59,7 +60,8 @@ class ThrustSortAllocator:
class IndiceData(object): class IndiceData(object):
def __init__(self, out_indices, indices, indice_pairs, indice_pair_num, def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo, spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo,
ksize: List[int], stride: List[int], dilation: List[int], padding: List[int]): ksize: List[int], stride: List[int], dilation: List[int], padding: List[int],
voxel_num: Optional[Any] = None):
self.out_indices = out_indices self.out_indices = out_indices
self.indices = indices self.indices = indices
self.indice_pairs = indice_pairs self.indice_pairs = indice_pairs
...@@ -72,6 +74,8 @@ class IndiceData(object): ...@@ -72,6 +74,8 @@ class IndiceData(object):
self.stride = stride self.stride = stride
self.dilation = dilation self.dilation = dilation
self.padding = padding self.padding = padding
# voxel_num is only used in tensorrt conversion.
self.voxel_num = voxel_num
class ImplicitGemmIndiceData(object): class ImplicitGemmIndiceData(object):
...@@ -83,7 +87,9 @@ class ImplicitGemmIndiceData(object): ...@@ -83,7 +87,9 @@ class ImplicitGemmIndiceData(object):
mask_argsort_bwd_splits: List[torch.Tensor], mask_argsort_bwd_splits: List[torch.Tensor],
masks: List[np.ndarray], spatial_shape, masks: List[np.ndarray], spatial_shape,
out_spatial_shape, is_subm: bool, algo: ConvAlgo, out_spatial_shape, is_subm: bool, algo: ConvAlgo,
ksize: List[int], stride: List[int], dilation: List[int], padding: List[int]): ksize: List[int], stride: List[int], dilation: List[int], padding: List[int],
in_voxel_num: Optional[Any] = None,
out_voxel_num: Optional[Any] = None):
self.out_indices = out_indices self.out_indices = out_indices
self.indices = indices self.indices = indices
self.pair_fwd = pair_fwd self.pair_fwd = pair_fwd
...@@ -101,6 +107,9 @@ class ImplicitGemmIndiceData(object): ...@@ -101,6 +107,9 @@ class ImplicitGemmIndiceData(object):
self.stride = stride self.stride = stride
self.dilation = dilation self.dilation = dilation
self.padding = padding self.padding = padding
# in/out voxel_num is only used in tensorrt conversion.
self.in_voxel_num = in_voxel_num
self.out_voxel_num = out_voxel_num
def scatter_nd(indices, updates, shape): def scatter_nd(indices, updates, shape):
...@@ -147,6 +156,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta): ...@@ -147,6 +156,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
force_algo: force conv/pool layers use this algo, should only used for debug. force_algo: force conv/pool layers use this algo, should only used for debug.
""" """
ndim = indices.shape[1] - 1 ndim = indices.shape[1] - 1
if not SPCONV_FX_TRACE_MODE:
assert features.ndim == 2 assert features.ndim == 2
assert indices.ndim == 2 assert indices.ndim == 2
assert len(spatial_shape) == ndim, "spatial shape must equal to ndim" assert len(spatial_shape) == ndim, "spatial shape must equal to ndim"
......
...@@ -103,7 +103,7 @@ class TorchAllocator(ExternalAllocator): ...@@ -103,7 +103,7 @@ class TorchAllocator(ExternalAllocator):
self.allocated: Dict[Union[str, int], torch.Tensor] = {} self.allocated: Dict[Union[str, int], torch.Tensor] = {}
def zeros(self, name: str, shape: List[int], dtype: int, def zeros(self, name: str, shape: List[int], dtype: int,
device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor: device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
# TODO free memory by name if its already free by pointer. # TODO free memory by name if its already free by pointer.
# provide a name if you want to access it after c++ function exit. # provide a name if you want to access it after c++ function exit.
torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
...@@ -126,7 +126,7 @@ class TorchAllocator(ExternalAllocator): ...@@ -126,7 +126,7 @@ class TorchAllocator(ExternalAllocator):
return ten_tv return ten_tv
def empty(self, name: str, shape: List[int], dtype: int, def empty(self, name: str, shape: List[int], dtype: int,
device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor: device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
dtype_bkp = dtype dtype_bkp = dtype
if dtype in _TORCH_UINT_WORKAROUNDS: if dtype in _TORCH_UINT_WORKAROUNDS:
...@@ -147,7 +147,7 @@ class TorchAllocator(ExternalAllocator): ...@@ -147,7 +147,7 @@ class TorchAllocator(ExternalAllocator):
return ten_tv return ten_tv
def full_int(self, name: str, shape: List[int], value: int, dtype: int, def full_int(self, name: str, shape: List[int], value: int, dtype: int,
device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor: device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
if dtype in _TORCH_UINT_WORKAROUNDS and value < 0: if dtype in _TORCH_UINT_WORKAROUNDS and value < 0:
raise NotImplementedError("you can't use full for unsigned dtypes") raise NotImplementedError("you can't use full for unsigned dtypes")
torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
...@@ -171,7 +171,7 @@ class TorchAllocator(ExternalAllocator): ...@@ -171,7 +171,7 @@ class TorchAllocator(ExternalAllocator):
return ten_tv return ten_tv
def full_float(self, name: str, shape: List[int], value: float, dtype: int, def full_float(self, name: str, shape: List[int], value: float, dtype: int,
device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor: device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
if dtype in _TORCH_UINT_WORKAROUNDS and value < 0: if dtype in _TORCH_UINT_WORKAROUNDS and value < 0:
raise NotImplementedError("you can't use full for unsigned dtypes") raise NotImplementedError("you can't use full for unsigned dtypes")
torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
......
...@@ -361,6 +361,25 @@ class SparseMaxPoolImplicitGemmFunction(Function): ...@@ -361,6 +361,25 @@ class SparseMaxPoolImplicitGemmFunction(Function):
features, out, grad_output, indice_pairs_bwd) features, out, grad_output, indice_pairs_bwd)
return input_bp, None, None, None return input_bp, None, None, None
class SparseAvgPoolImplicitGemmFunction(Function):
    """Autograd function for implicit-gemm sparse average pooling.

    ``forward`` calls ``ops.indice_avgpool_implicit_gemm`` and returns the
    pooled features. ``backward`` calls
    ``ops.indice_avgpool_implicit_gemm_backward`` with the incoming
    gradient, the backward indice pairs and the count returned by the
    forward op.

    Only the tensors that ``backward`` reads are saved on ``ctx``. The
    previous version also saved ``features`` and ``out``, which were never
    used and so were kept alive until backward for no reason.
    """

    @staticmethod
    @_TORCH_CUSTOM_FWD
    def forward(ctx, features: torch.Tensor, indice_pairs_fwd: torch.Tensor,
                indice_pairs_bwd: torch.Tensor, num_activate_out: int,
                calc_count):
        out, count = ops.indice_avgpool_implicit_gemm(features,
                                                      indice_pairs_fwd,
                                                      num_activate_out,
                                                      calc_count)
        ctx.save_for_backward(indice_pairs_bwd, count)
        return out

    @staticmethod
    @once_differentiable
    @_TORCH_CUSTOM_BWD
    def backward(ctx, grad_output):
        indice_pairs_bwd, count = ctx.saved_tensors
        input_bp = ops.indice_avgpool_implicit_gemm_backward(
            grad_output, indice_pairs_bwd, count)
        # One gradient slot per forward input; only `features` has one.
        return input_bp, None, None, None, None
indice_conv = SparseConvFunction.apply indice_conv = SparseConvFunction.apply
implicit_gemm = SparseImplicitGemmFunction.apply implicit_gemm = SparseImplicitGemmFunction.apply
...@@ -368,6 +387,7 @@ indice_inverse_conv = SparseInverseConvFunction.apply ...@@ -368,6 +387,7 @@ indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply indice_maxpool = SparseMaxPoolFunction.apply
indice_maxpool_implicit_gemm = SparseMaxPoolImplicitGemmFunction.apply indice_maxpool_implicit_gemm = SparseMaxPoolImplicitGemmFunction.apply
# Functional entry point for implicit-gemm average pooling: binds the
# autograd Function's ``apply`` so it can be called like a plain function.
indice_avgpool_implicit_gemm = SparseAvgPoolImplicitGemmFunction.apply
def _indice_to_scalar(indices: torch.Tensor, shape: List[int]): def _indice_to_scalar(indices: torch.Tensor, shape: List[int]):
......
...@@ -132,12 +132,11 @@ class SparseSequential(SparseModule): ...@@ -132,12 +132,11 @@ class SparseSequential(SparseModule):
if isinstance(input, list): if isinstance(input, list):
input = module(input) input = module(input)
else: else:
assert isinstance(input, spconv.SparseConvTensor) # assert isinstance(input, spconv.SparseConvTensor)
# self._sparity_dict[k] = input.sparity # self._sparity_dict[k] = input.sparity
input = module(input) input = module(input)
else: else:
if isinstance(input, spconv.SparseConvTensor): if isinstance(input, spconv.SparseConvTensor):
print(input.features.shape)
if input.indices.shape[0] != 0: if input.indices.shape[0] != 0:
input = input.replace_feature(module(input.features)) input = input.replace_feature(module(input.features))
else: else:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment