Commit 21bb00ae authored by Yan Yan's avatar Yan Yan
Browse files

still working on c++ only

parent 899008fa
<!--
Copyright 2022 Yan Yan
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
TODO
\ No newline at end of file
......@@ -175,7 +175,7 @@ if disable_jit is not None and disable_jit == "1":
std = "c++14"
else:
std = "c++17"
if CUMM_CPU_ONLY_BUILD:
if not CUMM_CPU_ONLY_BUILD:
gemmtuner = GemmTunerSimple(cu)
gemmtuner.namespace = "csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(convcu)
......
......@@ -62,8 +62,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
CompileInfo(),
ExternalAllocator(),
ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(), # for debug, won't be included in release
]
pccm.builder.build_pybind(cus,
PACKAGE_ROOT / "core_cc",
......
......@@ -64,7 +64,7 @@ SPCONV_DEBUG_CPP_ONLY = project_is_editable(PACKAGE_NAME)
class AllocKeys:
Pair = "Pair"
PairBwd = "PairBwd"
IndiceNumPerLoc = "IndiceNumPerLoc"
PairMask = "PairMask"
MaskArgSort = "MaskArgSort"
......@@ -103,3 +103,5 @@ SPCONV_CPP_INDICE_PAIRS = True
SPCONV_CPP_INDICE_PAIRS_IGEMM = True
SPCONV_CPP_GEMM = True
SPCONV_FX_TRACE_MODE = os.getenv("SPCONV_FX_TRACE_MODE", "0") == "1"
\ No newline at end of file
......@@ -240,6 +240,28 @@ class SpconvOps:
"""
...
@staticmethod
def avgpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, count_out: Tensor, stream: int = 0) -> None:
"""
Args:
out:
inp:
inds:
count_out:
stream:
"""
...
@staticmethod
def avgpool_implicit_gemm_backward(dout: Tensor, dinp: Tensor, inds: Tensor, count_out: Tensor, stream: int = 0) -> None:
"""
Args:
dout:
dinp:
inds:
count_out:
stream:
"""
...
@staticmethod
def maxpool_forward_cpu(out: Tensor, inp: Tensor, out_inds: Tensor, in_inds: Tensor) -> None:
"""
Args:
......@@ -280,15 +302,6 @@ class SpconvOps:
"""
...
@staticmethod
def sort_1d_by_key(data: Tensor, indices: Tensor = Tensor(), stream: int = 0) -> Tensor:
"""
Args:
data:
indices:
stream:
"""
...
@staticmethod
def sort_1d_by_key_allocator(data: Tensor, alloc_func, indices: Tensor = Tensor(), stream: int = 0) -> Tensor:
"""
Args:
......@@ -348,6 +361,24 @@ class SpconvOps:
"""
...
@staticmethod
def maximum_value_int(data: Tensor, value: int, stream_int: int) -> None:
"""
Args:
data:
value:
stream_int:
"""
...
@staticmethod
def sort_1d_by_key(data: Tensor, indices: Tensor = Tensor(), stream: int = 0) -> Tensor:
"""
Args:
data:
indices:
stream:
"""
...
@staticmethod
def calc_point2voxel_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
"""
Args:
......@@ -407,6 +438,18 @@ class SpconvOps:
"""
...
@staticmethod
def get_indice_gen_tensors_from_workspace(workspace, kv: int, num_act_in: int, num_act_out_bound: int, subm: bool, use_int64_hash_k: bool) -> Dict[str, Tensor]:
"""
Args:
workspace:
kv:
num_act_in:
num_act_out_bound:
subm:
use_int64_hash_k:
"""
...
@staticmethod
def get_indice_pairs_implicit_gemm(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, is_train: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> Tuple[Tensor, int]:
"""
Args:
......@@ -428,7 +471,7 @@ class SpconvOps:
"""
...
@staticmethod
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1) -> int:
def get_indice_pairs(allocator, indices: Tensor, batch_size: int, input_dims: List[int], algo: int, ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], out_padding: List[int], subm: bool, transposed: bool, stream_int: int = 0, num_out_act_bound: int = -1, num_input_act_bound: int = -1) -> int:
"""
Args:
allocator:
......@@ -445,5 +488,6 @@ class SpconvOps:
transposed:
stream_int:
num_out_act_bound:
num_input_act_bound:
"""
...
......@@ -2,29 +2,29 @@ from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Ty
from pccm.stubs import EnumValue, EnumClassValue
from cumm.tensorview import Tensor
class ExternalAllocator:
def zeros(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
def zeros(self, name: str, shape: List[int], dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor:
"""
Args:
name:
shape:
dtype:
device:
is_temp_memory:
stream:
is_temp_memory:
"""
...
def empty(self, name: str, shape: List[int], dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
def empty(self, name: str, shape: List[int], dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor:
"""
Args:
name:
shape:
dtype:
device:
is_temp_memory:
stream:
is_temp_memory:
"""
...
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
def full_int(self, name: str, shape: List[int], value: int, dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor:
"""
Args:
name:
......@@ -32,11 +32,11 @@ class ExternalAllocator:
value:
dtype:
device:
is_temp_memory:
stream:
is_temp_memory:
"""
...
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, is_temp_memory: bool = False, stream: int = 0) -> Tensor:
def full_float(self, name: str, shape: List[int], value: float, dtype: int, device: int, stream: int = 0, is_temp_memory: bool = False) -> Tensor:
"""
Args:
name:
......@@ -44,8 +44,8 @@ class ExternalAllocator:
value:
dtype:
device:
is_temp_memory:
stream:
is_temp_memory:
"""
...
def get_tensor_by_name(self, name: str) -> Tensor:
......
This diff is collapsed.
......@@ -2,7 +2,8 @@ import pccm
from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib
from spconv.constants import AllocKeys
from cumm.constants import CUMM_CPU_ONLY_BUILD
from .indices import CudaCommonKernel
class ExternalAllocatorGuard(pccm.Class):
def __init__(self):
super().__init__()
......@@ -53,8 +54,8 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
return code.ret("tv::Tensor")
......@@ -66,8 +67,8 @@ class ExternalAllocator(pccm.Class):
code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
return code.ret("tv::Tensor")
......@@ -80,8 +81,8 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "int")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
return code.ret("tv::Tensor")
......@@ -94,8 +95,9 @@ class ExternalAllocator(pccm.Class):
code.arg("value", "float")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
return code.ret("tv::Tensor")
@pccm.pybind.mark(virtual=True)
......@@ -129,7 +131,7 @@ class ExternalAllocator(pccm.Class):
code.arg("stream", "std::uintptr_t", "0")
code.raw(f"""
// "" means temp memory
auto ten = zeros(name, shape, dtype, device, true, stream);
auto ten = zeros(name, shape, dtype, device, stream, true);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
}});
......@@ -145,7 +147,7 @@ class ExternalAllocator(pccm.Class):
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f"""
auto ten = empty(name, shape, dtype, device, true, stream);
auto ten = empty(name, shape, dtype, device, stream, true);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
}});
......@@ -162,7 +164,7 @@ class ExternalAllocator(pccm.Class):
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f"""
auto ten = full_int(name, shape, value, dtype, device, true, stream);
auto ten = full_int(name, shape, value, dtype, device, stream, true);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor ten){{
this->free(ten);
}});
......@@ -179,7 +181,7 @@ class ExternalAllocator(pccm.Class):
code.arg("name", "std::string", "\"\"")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f"""
auto ten = full_float(name, shape, value, dtype, device, true, stream);
auto ten = full_float(name, shape, value, dtype, device, stream, true);
return std::make_{self.ptr_type}<ExternalAllocatorGuard>(ten, [this](tv::Tensor t){{
this->free(t);
}});
......@@ -222,8 +224,10 @@ class ThrustAllocator(pccm.Class):
""")
return code
class StaticAllocator(ExternalAllocator):
"""a simple allocator for tensorrt plugin.
"""a static allocator for tensorrt plugin.
"""
def __init__(self):
super().__init__()
......@@ -232,6 +236,7 @@ class StaticAllocator(ExternalAllocator):
self.add_member("repr_", "std::string")
self.add_member("thrust_tmp_tensor_", "tv::Tensor")
self.grow = 1.5
self.cuda_common_kernel = CudaCommonKernel()
@pccm.pybind.mark
@pccm.constructor
......@@ -242,7 +247,22 @@ class StaticAllocator(ExternalAllocator):
code.raw(f"""
std::stringstream ss;
for (auto& p : tensor_dict){{
tv::ssprint(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
tv::sstream_print(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
}}
repr_ = ss.str();
""")
return code
@pccm.pybind.mark
@pccm.member_function
def set_new_tensor_dict(self):
code = pccm.code()
code.arg("tensor_dict", "std::unordered_map<std::string, tv::Tensor>")
code.raw(f"""
tensor_dict_ = tensor_dict;
std::stringstream ss;
for (auto& p : tensor_dict){{
tv::sstream_print(ss, p.first, p.second.shape(), tv::dtype_str(p.second.dtype()), "\\n");
}}
repr_ = ss.str();
""")
......@@ -255,12 +275,21 @@ class StaticAllocator(ExternalAllocator):
code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.raw(f"""
auto res = get_tensor_by_name(name);
size_t total = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int64_t>());
TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype))
&& res.device() == device, "alloc failed", shape, res.shape());
return tv::from_blob(res.raw_data(), shape, dtype, device);
TV_ASSERT_RT_ERR(res.nbytes() >= total * tv::bit_size(tv::DType(dtype)) / 8
&& res.device() == device, "alloc failed, tensor size too small", shape, res.shape());
// if (is_temp_memory){{
// }}else{{
// // size must exactly match
// TV_ASSERT_RT_ERR(res.nbytes() == total * tv::bit_size(tv::DType(dtype)) / 8
// && res.device() == device, "alloc failed, named memory size must match", shape, res.shape());
// }}
return tv::from_blob(res.raw_data(), shape, tv::DType(dtype), device);
""")
return code.ret("tv::Tensor")
......@@ -273,16 +302,22 @@ class StaticAllocator(ExternalAllocator):
code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
code.raw(f"""
auto tvctx = tv::Context();
""")
if not CUMM_CPU_ONLY_BUILD:
code.raw(f"""
tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
auto blob = _get_raw_and_check(name, shape, dtype, device);
""")
code.raw(f"""
auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
return blob.zero_(tvctx);
""")
return code.ret("tv::Tensor")
@pccm.pybind.mark
@pccm.member_function(virtual=True)
def empty(self):
......@@ -291,8 +326,8 @@ class StaticAllocator(ExternalAllocator):
code.arg("shape", "std::vector<int64_t>")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
code.raw(f"""
if (name == {pccm.literal(AllocKeys.ThrustTemp)}){{
// thrust tmp shouldn't inside tensor_dict. use a simple method to allocate
......@@ -300,23 +335,28 @@ class StaticAllocator(ExternalAllocator):
// so we can just use one tensor
tv::Tensor res = thrust_tmp_tensor_;
if (res.empty()){{
res = tv::empty(shape, dtype, device);
res = tv::empty(shape, tv::DType(dtype), device);
thrust_tmp_tensor_ = res;
}}
if (shape[0] > thrust_tmp_tensor_.dim(0)){{
res = tv::empty({{int64_t(shape[0] * {self.grow})}}, dtype, device);
res = tv::empty({{int64_t(shape[0] * {self.grow})}}, tv::DType(dtype), device);
thrust_tmp_tensor_ = res;
}}
return res;
}}else{{
auto blob = _get_raw_and_check(name, shape, dtype, device);
auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
return blob;
}}
""")
return code.ret("tv::Tensor")
# cpu only build can't use pccm.cuda
__CUDA_DECORATOR = pccm.member_function
if not CUMM_CPU_ONLY_BUILD:
__CUDA_DECORATOR = pccm.cuda.member_function
@pccm.pybind.mark
@pccm.member_function(virtual=True)
@__CUDA_DECORATOR
def full_int(self):
code = pccm.code()
code.arg("name", "std::string")
......@@ -324,17 +364,36 @@ class StaticAllocator(ExternalAllocator):
code.arg("value", "int")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
code.raw(f"""
auto tvctx = tv::Context();
auto blob = _get_raw_and_check(name, shape, dtype, device);
return blob.fill_(tvctx, value);
auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
""")
if not CUMM_CPU_ONLY_BUILD:
code.add_param_class("cudakers", self.cuda_common_kernel)
code.raw(f"""
tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
using ints_t = std::tuple<int32_t, int16_t, int8_t, int64_t, uint32_t, uint64_t, uint16_t, uint8_t>;
tv::Dispatch<ints_t>()(blob.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
tv::cuda::Launch lanucher_fill(blob.size(), reinterpret_cast<cudaStream_t>(stream));
lanucher_fill(cudakers::fill_kernel<T>, blob.data_ptr<T>(), value, blob.size());
}});
""")
else:
code.raw(f"""
blob.fill_(value);
""")
code.raw(f"""
return blob;
""")
return code.ret("tv::Tensor")
@pccm.pybind.mark
@pccm.member_function(virtual=True)
@__CUDA_DECORATOR
def full_float(self):
code = pccm.code()
code.arg("name", "std::string")
......@@ -342,11 +401,29 @@ class StaticAllocator(ExternalAllocator):
code.arg("value", "float")
code.arg("dtype", "int")
code.arg("device", "int")
code.arg("is_temp_memory", "bool", "false")
code.arg("stream", "std::uintptr_t", "0")
code.arg("is_temp_memory", "bool", "false")
code.raw(f"""
auto tvctx = tv::Context();
auto blob = _get_raw_and_check(name, shape, dtype, device, is_temp_memory);
""")
if not CUMM_CPU_ONLY_BUILD:
code.add_param_class("cudakers", self.cuda_common_kernel)
code.raw(f"""
auto blob = _get_raw_and_check(name, shape, dtype, device);
return blob.fill_(tvctx, value);
tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream));
using dtypes_t = std::tuple<float, double>;
tv::Dispatch<dtypes_t>()(blob.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
tv::cuda::Launch lanucher_fill(blob.size(), reinterpret_cast<cudaStream_t>(stream));
lanucher_fill(cudakers::fill_kernel<T>, blob.data_ptr<T>(), value, blob.size());
}});
""")
else:
code.raw(f"""
blob.fill_(value);
""")
code.raw(f"""
return blob;
""")
return code.ret("tv::Tensor")
......@@ -364,6 +441,7 @@ class StaticAllocator(ExternalAllocator):
@pccm.pybind.mark
@pccm.member_function(virtual=True)
def free(self):
    # Intentionally emits an empty body: a static allocator hands out views
    # of pre-registered tensors, so there is nothing to release here.
    free_code = pccm.code()
    free_code.arg("ten", "tv::Tensor")
    return free_code
......
......@@ -78,11 +78,9 @@ class ExternalSpconvMatmul(pccm.Class):
return code
class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
"""a helper class to warp matmul operations
because we don't want to implement matmul
(link to cublas/mkl/pytorch) in python package.
"""implement gemm in cuda via cublasLt. (only support forward)
should be used with tensorrt plugin.
"""
def __init__(self):
super().__init__()
self.add_dependency(TensorView, ExternalAllocator)
......@@ -311,7 +309,7 @@ class SimpleExternalSpconvMatmul(ExternalSpconvMatmul):
TV_THROW_RT_ERR("unsupported");
}}
check_cublas_status(cublasLtMatmul(
handle, operationDesc, alpha_storage, a.raw_data(), Adesc, b.raw_data(),
handle, operationDesc, alpha_storage, a.const_raw_data(), Adesc, b.const_raw_data(),
Bdesc, beta_storage, c.raw_data(), Cdesc, c.raw_data(), Cdesc,
&heuristicResult.algo, nullptr, 0, stream));
if (preference)
......@@ -1417,11 +1415,12 @@ class ConvGemmOps(pccm.ParameterizedClass):
is_KC_not_CK, kv_center, out_channel);
}}else{{
out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)},
{{num_activate_out, out_channel}}, features.dtype(), features.device());
{{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
}}
if (kv == 1 && subm){{
return;
}}
auto indice_pair_num_cpu = indice_pair_num.cpu();
auto indice_pair_num_cpu_ptr = indice_pair_num_cpu.data_ptr<int>();
int maxnhot = 0;
......@@ -1618,7 +1617,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
int kv_center = kv / 2;
tv::Tensor din;
auto dfilters = allocator.zeros({pccm.literal(AllocKeys.DFilters)},
prev_filter_shape_vec, features.dtype(), features.device());
prev_filter_shape_vec, features.dtype(), features.device(), stream_int);
dfilters = dfilters.view(filters.shape());
if (subm){{
din = ext_mm.indice_conv_bwd_init_gemm({pccm.literal(AllocKeys.Features)},
......@@ -1628,7 +1627,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
is_KC_not_CK, kv_center);
}}else{{
din = allocator.zeros({pccm.literal(AllocKeys.DIn)},
features.shape_vector(), features.dtype(), features.device());
features.shape_vector(), features.dtype(), features.device(), stream_int);
}}
if (kv == 1 && subm){{
return;
......@@ -1922,10 +1921,10 @@ class ConvGemmOps(pccm.ParameterizedClass):
tv::Tensor out_features;
if (is_subm){{
out_features = allocator.empty({pccm.literal(AllocKeys.OutFeatures)},
{{num_activate_out, out_channel}}, features.dtype(), features.device());
{{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
}}else{{
out_features = allocator.zeros({pccm.literal(AllocKeys.OutFeatures)},
{{num_activate_out, out_channel}}, features.dtype(), features.device());
{{num_activate_out, out_channel}}, features.dtype(), features.device(), stream_int);
}}
auto arch = get_compute_capability();
constexpr auto kForwardInt = static_cast<int>(tv::gemm::ConvOpType::kForward);
......@@ -1966,7 +1965,7 @@ class ConvGemmOps(pccm.ParameterizedClass):
if (is_train){{
mask_output_fwd = allocator.empty({pccm.literal(AllocKeys.MaskOutputFwd)},
{{num_split, tv::div_up(num_activate_out, mask_width)}},
tv::uint32, features.device());
tv::uint32, features.device(), stream_int);
for (int i = 0; i < num_split; ++i){{
mask_output_fwd_splits.push_back(mask_output_fwd[i]);
}}
......@@ -2042,13 +2041,13 @@ class ConvGemmOps(pccm.ParameterizedClass):
tv::Tensor din;
if (is_subm){{
din = allocator.empty({pccm.literal(AllocKeys.DIn)},
features.shape_vector(), features.dtype(), features.device());
features.shape_vector(), features.dtype(), features.device(), stream_int);
}}else{{
din = allocator.zeros({pccm.literal(AllocKeys.DIn)},
features.shape_vector(), features.dtype(), features.device());
features.shape_vector(), features.dtype(), features.device(), stream_int);
}}
tv::Tensor dfilters = allocator.zeros({pccm.literal(AllocKeys.DFilters)},
filters_shape_vec, filters.dtype(), filters.device());
filters_shape_vec, filters.dtype(), filters.device(), stream_int);
dfilters = dfilters.view(out_channel, -1, in_channel);
constexpr auto kForwardInt = static_cast<int>(tv::gemm::ConvOpType::kForward);
......
This diff is collapsed.
......@@ -180,6 +180,85 @@ class IndiceMaxPool(pccm.Class):
""")
return code
@pccm.cuda.cuda_global_function
def forward_avgpool_implicit_gemm_kernel(self):
    # Emits the templated CUDA kernel for the average-pool forward pass.
    # For every index i of the Y loop it counts how many of the RS entries
    # indices[k * num_indices + i] are not -1, stores that count in
    # count_out[i] when count_out is non-null, and writes the mean of the
    # gathered input rows (or 0 when the count is 0) to the output row.
    kernel = pccm.FunctionCode()
    kernel.targ("T")
    kernel_params = (
        ("out_features", "T*"),
        ("in_features", "const T*"),
        ("indices", "const int*"),
        ("count_out", "int*"),
        ("num_features", "int"),
        ("RS", "int"),
        ("num_indices", "int"),
    )
    for param_name, param_type in kernel_params:
        kernel.arg(param_name, param_type)
    kernel.raw(f"""
for (int i : tv::KernelLoopY<int>(num_indices)) {{
auto out_ptr = out_features + i * num_features;
auto indices_ptr = indices + i;
int in_idx = 0;
int count = 0;
for (int k = 0; k < RS; ++k){{
in_idx = indices_ptr[0];
count += int(in_idx != -1);
indices_ptr += num_indices;
}}
if (count_out != nullptr){{
count_out[i] = count;
}}
for (int j : tv::KernelLoopX<int>(num_features)) {{
indices_ptr = indices + i;
int in_idx;
T in, in_temp;
in = T(0);
for (int k = 0; k < RS; ++k){{
in_idx = indices_ptr[0];
bool valid = in_idx != -1;
in_temp = valid ? in_features[in_idx * num_features + j] : T(0);
in += in_temp;
indices_ptr += num_indices;
}}
out_ptr[j] = count > 0 ? in / T(count) : T(0);
}}
}}
""")
    return kernel
@pccm.cuda.cuda_global_function
def backward_avgpool_implicit_gemm_kernel(self):
    # Emits the templated CUDA kernel for the average-pool backward pass.
    # For every input row i (Y loop) and feature j (X loop) it walks the RS
    # entries indices_bwd[k * num_indices + i]; each entry that is not -1
    # names an output row whose gradient contributes to din[i, j].
    #
    # The forward kernel writes out = sum(in) / count, so each contributing
    # output gradient has to be DIVIDED by that output's count. The previous
    # code multiplied by it, which scaled gradients by count^2 relative to
    # the correct value. Invalid slots use a divisor of 1 (their dout is
    # already 0), and a non-positive count contributes 0, mirroring the
    # `count > 0` guard in the forward kernel.
    code = pccm.FunctionCode()
    code.targ("T")
    code.arg("dout_features", f"const T*")
    code.arg("din_features", f"T*")
    code.arg("indices_bwd", "const int*")
    code.arg("count_out", "const int*")
    code.arg("num_features", "int")
    code.arg("RS", "int")
    code.arg("num_indices", "int")
    code.raw(f"""
for (int i : tv::KernelLoopY<int>(num_indices)) {{
auto din_ptr = din_features + i * num_features;
for (int j : tv::KernelLoopX<int>(num_features)) {{
auto indices_ptr = indices_bwd + i;
int out_idx = 0;
T sum_val = T(0);
for (int k = 0; k < RS; ++k){{
out_idx = indices_ptr[0];
bool valid = out_idx != -1;
T dout = valid ? dout_features[out_idx * num_features + j] : T(0);
int count = valid ? count_out[out_idx] : 1;
sum_val += count > 0 ? dout / T(count) : T(0);
indices_ptr += num_indices;
}}
din_ptr[j] = sum_val;
}}
}}
""")
    return code
@pccm.cuda.static_function
def forward(self):
code = pccm.FunctionCode()
......@@ -348,6 +427,92 @@ class IndiceMaxPool(pccm.Class):
""")
return code
@pccm.cuda.static_function
def forward_avgpool_implicit_gemm(self):
    # Emits the host-side launcher for the average-pool forward kernel.
    # The generated code checks that inds is (-1, out.dim(0)) and that `in`
    # has out.dim(1) columns, dispatches on out.dtype(), picks a 2-D launch
    # shape with at most 512 threads per block based on out.dim(1), and
    # launches forward_avgpool_implicit_gemm_kernel on the given stream.
    launcher_code = pccm.FunctionCode()
    for tensor_name in ("out", "in", "inds", "count_out"):
        launcher_code.arg(tensor_name, "tv::Tensor")
    launcher_code.arg("stream", "std::uintptr_t", "0")
    launcher_code.raw(f"""
auto nhot = out.dim(0);
tv::check_shape(inds, {{-1, nhot}});
tv::check_shape(in, {{-1, out.dim(1)}});
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
constexpr int MaxThreads = 512;
tv::cuda::Launch launcher(1);
bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(out.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
// if out.dim(1) > value in list above, run this function.
// if a value is found, other value won't be executed.
int NumFeatures = TV_DECLTYPE(V)::value;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}});
if (!found){{
int NumFeatures = 16;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(out.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}}
launcher(forward_avgpool_implicit_gemm_kernel<T>, out.data_ptr<T>(), in.data_ptr<const T>(),
inds.data_ptr<const int>(), count_out.data_ptr<int>(), out.dim(1), inds.dim(0), inds.dim(1));
}});
""")
    return launcher_code
@pccm.cuda.static_function
def backward_avgpool_implicit_gemm(self):
    # Emits the host-side launcher for the average-pool backward kernel.
    # The generated code requires a non-empty count_out, checks that inds is
    # (-1, din.dim(0)) and that din has dout.dim(1) columns, dispatches on
    # dout.dtype(), picks a 2-D launch shape with at most 512 threads per
    # block based on dout.dim(1), and launches
    # backward_avgpool_implicit_gemm_kernel on the given stream.
    #
    # Changes from the previous version: the unused local `num_act_out`
    # (which actually held the channel count, dout.dim(1)) is removed, and
    # the assertion statement is now terminated with ';' like the other
    # TV_ASSERT_RT_ERR uses in this code base.
    code = pccm.FunctionCode()
    code.arg("dout", "tv::Tensor")
    code.arg("din", "tv::Tensor")
    code.arg("inds", "tv::Tensor")
    code.arg("count_out", "tv::Tensor")
    code.arg("stream", "std::uintptr_t", "0")
    code.raw(f"""
auto nhot = din.dim(0);
TV_ASSERT_RT_ERR(!count_out.empty(), "count out must not empty");
tv::check_shape(inds, {{-1, nhot}});
tv::check_shape(din, {{-1, dout.dim(1)}});
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(dout.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
constexpr int MaxThreads = 512;
tv::cuda::Launch launcher(1);
bool found = tv::dispatch_int_noexcept<512, 256, 128, 64, 32, 16>(dout.dim(1), [](int my, int expect){{return my >= expect;}}, [&](auto V){{
// if out.dim(1) > value in list above, run this function.
// if a value is found, other value won't be executed.
int NumFeatures = TV_DECLTYPE(V)::value;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(dout.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}});
if (!found){{
int NumFeatures = 16;
int Num0 = MaxThreads / NumFeatures;
dim3 blocks(tv::div_up(dout.dim(1), int64_t(NumFeatures)), tv::div_up(nhot, int64_t(Num0)));
dim3 threads(NumFeatures, Num0);
launcher = tv::cuda::Launch(blocks, threads, cudastream);
}}
launcher(backward_avgpool_implicit_gemm_kernel<T>,
dout.data_ptr<const T>(), din.data_ptr<T>(),
inds.data_ptr<const int>(), count_out.data_ptr<const int>(),
dout.dim(1), inds.dim(0), inds.dim(1));
}});
""")
    return code
class IndiceMaxPoolCPU(pccm.Class):
def __init__(self):
......
......@@ -297,7 +297,7 @@ class Point2Voxel(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
self.add_dependency(TensorView)
self.p2v_c = Point2VoxelCommon(dtype, ndim, zyx)
self.add_param_class("p2v_c", self.p2v_c, "Point2VoxelCommon")
layout = TensorGeneric(ndim, True)
layout = TensorGeneric(ndim, False)
self.add_param_class("layout_ns", layout, "Layout")
self.dtype = dtype
self.ndim = ndim
......@@ -489,7 +489,7 @@ class Point2VoxelCPU(pccm.ParameterizedClass, pccm.pybind.PybindClassMixin):
def __init__(self, dtype: dtypes.DType, ndim: int, zyx: bool = True):
super().__init__()
self.add_dependency(TensorView)
layout = TensorGeneric(ndim, True)
layout = TensorGeneric(ndim, False)
self.add_param_class("layout_ns", layout, "Layout")
self.dtype = dtype
self.ndim = ndim
......
# Copyright 2022 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -10,33 +10,41 @@ from spconv.core import (IMPLGEMM_SIMT_PARAMS, IMPLGEMM_TURING_PARAMS,
SHUFFLE_TURING_PARAMS, SHUFFLE_VOLTA_PARAMS)
from spconv.csrc.hash.core import HashTable
from spconv.csrc.sparse.all import SpconvOps
from spconv.csrc.sparse.alloc import ExternalAllocator
from spconv.csrc.sparse.alloc import ExternalAllocator, StaticAllocator
from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
ExternalSpconvMatmul, GemmTunerSimple,
SimpleExternalSpconvMatmul)
from spconv.csrc.utils import BoxOps
from cumm.gemm.algospec.core import (GemmAlgo, ShuffleStrideType)
from cumm.conv.bases import ConvLayout, ConvLayoutType, ConvOpType
def main(include: str,
src: str,
libname: str = "spconv",
prefix: str = "spconvlib"):
prefix: str = "spconvlib",
inference_only: bool = False):
all_shuffle = SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS
all_shuffle = list(filter(lambda x: not x.is_nvrtc, all_shuffle))
if inference_only:
all_shuffle = list(filter(lambda x: x.shuffle_stride != ShuffleStrideType.ShuffleAB, all_shuffle))
cu = GemmMainUnitTest(all_shuffle)
cu.namespace = "cumm.gemm.main"
all_imp = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
IMPLGEMM_TURING_PARAMS)
# all_imp = IMPLGEMM_SIMT_PARAMS
all_imp = list(filter(lambda x: not x.is_nvrtc, all_imp))
if inference_only:
all_imp = list(filter(lambda x: x.op_type == ConvOpType.kForward, all_imp))
convcu = ConvMainUnitTest(all_imp)
convcu.namespace = "cumm.conv.main"
gemmtuner = GemmTunerSimple(cu)
gemmtuner.namespace = "csrc.sparse.convops.gemmops"
gemmtuner.namespace = "spconv.csrc.sparse.convops.gemmops"
convtuner = ConvTunerSimple(convcu)
convtuner.namespace = "csrc.sparse.convops.convops"
convtuner.namespace = "spconv.csrc.sparse.convops.convops"
convops = ConvGemmOps(gemmtuner, convtuner)
convops.namespace = "csrc.sparse.convops.spops"
convops.namespace = "spconv.csrc.sparse.convops.spops"
cus = [
cu,
......@@ -51,6 +59,7 @@ def main(include: str,
ExternalAllocator(),
ExternalSpconvMatmul(),
SimpleExternalSpconvMatmul(),
StaticAllocator(),
]
gen_cmake(libname, cus, include, src, namespace_prefix=prefix)
......
......@@ -17,7 +17,9 @@ from spconv.pytorch.modules import (SparseModule, SparseSequential,
assign_name_for_sparse_modules)
from spconv.pytorch.ops import ConvAlgo
from spconv.pytorch.pool import (SparseMaxPool1d, SparseMaxPool2d,
SparseMaxPool3d, SparseMaxPool4d)
SparseMaxPool3d, SparseMaxPool4d,
SparseAvgPool1d, SparseAvgPool2d,
SparseAvgPool3d)
from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable
......
......@@ -38,6 +38,9 @@ from torch.nn.init import calculate_gain
FILTER_HWIO = False
_MAX_NUM_VOXELS_DURING_TRAINING = "max_num_voxels_during_training"
class SparseConvolution(SparseModule):
__constants__ = [
'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
......@@ -61,6 +64,7 @@ class SparseConvolution(SparseModule):
indice_key: Optional[str] = None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None):
super(SparseConvolution, self).__init__(name=name)
assert groups == 1, "don't support groups for now"
......@@ -89,6 +93,12 @@ class SparseConvolution(SparseModule):
self.groups = groups
self.subm = subm
self.indice_key = indice_key
if record_voxel_count and not self.subm and not self.inverse:
# Record the maximum voxel count in both inference and training when the
# record_voxel_count flag is set.
self.register_buffer(_MAX_NUM_VOXELS_DURING_TRAINING,
torch.zeros(1, dtype=torch.int32))
self.record_voxel_count = record_voxel_count
if algo is None:
if kv <= 32 and not CPU_ONLY_BUILD:
if kv < 8:
......@@ -122,37 +132,46 @@ class SparseConvolution(SparseModule):
else:
self.register_parameter('bias', None)
self.reset_parameters()
self._register_load_state_dict_pre_hook(self._load_weight_different_layout)
def _load_weight_different_layout(
self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
if hasattr(self, "_register_load_state_dict_pre_hook"):
self._register_load_state_dict_pre_hook(
self._load_weight_different_layout)
def _load_weight_different_layout(self, state_dict, prefix, local_metadata,
                                  strict, missing_keys, unexpected_keys,
                                  error_msgs):
    """Adapt an incoming ``state_dict`` to this layer before it is loaded.

    This method is registered through
    ``_register_load_state_dict_pre_hook``. It mutates ``state_dict`` in
    place and returns nothing.

    Two adjustments are made:

    1. When ``record_voxel_count`` is enabled on a layer that is neither
       submanifold nor inverse, and the checkpoint has no entry for this
       module's voxel-count buffer, a zero ``int32`` tensor of shape
       ``(1,)`` is inserted under that key.
    2. When ``SAVED_WEIGHT_LAYOUT`` is set, the ``weight`` entry is
       permuted according to that layout string, ``ALL_WEIGHT_IS_KRSC``
       and ``self.algo``.

    Args:
        state_dict: mapping of checkpoint keys to tensors; modified in place.
        prefix: key prefix for this module's entries in ``state_dict``.
        local_metadata, strict, missing_keys, unexpected_keys, error_msgs:
            part of the hook signature; not used here.

    Raises:
        AssertionError: if ``SAVED_WEIGHT_LAYOUT`` is set and
            ``prefix + "weight"`` is missing from ``state_dict``.
    """
    # The buffer is stored under the prefixed key, so the membership test
    # must use the same prefixed key. Testing the bare name would never
    # match for a nested module and the loaded value would be clobbered.
    voxel_key = prefix + _MAX_NUM_VOXELS_DURING_TRAINING
    if (self.record_voxel_count and not self.subm and not self.inverse
            and voxel_key not in state_dict):
        state_dict[voxel_key] = torch.zeros(1, dtype=torch.int32)
    if not SAVED_WEIGHT_LAYOUT:
        return
    key = prefix + "weight"
    assert key in state_dict
    ndim = self.ndim
    if SAVED_WEIGHT_LAYOUT == "RSKC":
        state_dict[key] = state_dict[key].permute(ndim, *range(ndim),
                                                  ndim + 1).contiguous()
    elif SAVED_WEIGHT_LAYOUT == "RSCK":
        state_dict[key] = state_dict[key].permute(ndim + 1, *range(ndim),
                                                  ndim).contiguous()
    # NOTE(review): for "RSKC"/"RSCK" the weight has already been permuted
    # once above; confirm that applying a further permutation below is
    # intended.
    if ALL_WEIGHT_IS_KRSC or self.algo != ConvAlgo.Native:
        # in spconv 2.2, we only support KRSC layout.
        if SAVED_WEIGHT_LAYOUT == "RSKC":
            state_dict[key] = state_dict[key].permute(
                ndim, *range(ndim), ndim + 1).contiguous()
        elif SAVED_WEIGHT_LAYOUT == "RSCK":
            state_dict[key] = state_dict[key].permute(
                ndim + 1, *range(ndim), ndim).contiguous()
    else:
        if self.algo == ConvAlgo.Native:
            # to RSCK
            if SAVED_WEIGHT_LAYOUT == "RSKC":
                state_dict[key] = state_dict[key].permute(
                    *range(ndim), ndim + 1, ndim).contiguous()
            elif SAVED_WEIGHT_LAYOUT == "KRSC":
                state_dict[key] = state_dict[key].permute(
                    *range(1, ndim + 1), 0, ndim + 1).contiguous()
def extra_repr(self):
s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
......@@ -218,6 +237,9 @@ class SparseConvolution(SparseModule):
bound = 1 / math.sqrt(fan_in)
init.uniform_(self.bias, -bound, bound)
def is_inverseable(self):
    """Return True when ``indice_key`` is set and the layer is not submanifold.

    A layer without an ``indice_key`` always reports False; otherwise the
    result is the negation of ``self.subm``.
    """
    if self.indice_key is None:
        return False
    return not self.subm
def forward(self, input: SparseConvTensor):
assert isinstance(input, SparseConvTensor)
assert input.features.shape[
......@@ -410,7 +432,6 @@ class SparseConvolution(SparseModule):
self._check_subm_reuse_valid(input, spatial_shape,
datas)
else:
with input._timer.namespace("gen_pairs"):
# we need to gen bwd indices for regular conv
# because it may be inversed.
......@@ -491,6 +512,14 @@ class SparseConvolution(SparseModule):
features.shape[0])
out_tensor.benchmark_record[self.name]["num_out_points"].append(
out_features.shape[0])
if not self.subm and not self.inverse and self.record_voxel_count:
if hasattr(self,
_MAX_NUM_VOXELS_DURING_TRAINING):
ops.maximum_value_int_(
getattr(
self,
_MAX_NUM_VOXELS_DURING_TRAINING),
outids.shape[0])
out_tensor = out_tensor.replace_feature(out_features)
out_tensor.indices = outids
out_tensor.indice_dict = indice_dict
......@@ -534,8 +563,10 @@ class SparseConv1d(SparseConvolution):
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None):
super(SparseConv1d, self).__init__(1,
super(SparseConv1d,
self).__init__(1,
in_channels,
out_channels,
kernel_size,
......@@ -547,6 +578,7 @@ class SparseConv1d(SparseConvolution):
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name)
......@@ -563,8 +595,10 @@ class SparseConv2d(SparseConvolution):
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None):
super(SparseConv2d, self).__init__(2,
super(SparseConv2d,
self).__init__(2,
in_channels,
out_channels,
kernel_size,
......@@ -576,6 +610,7 @@ class SparseConv2d(SparseConvolution):
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name)
......@@ -592,8 +627,10 @@ class SparseConv3d(SparseConvolution):
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None):
super(SparseConv3d, self).__init__(3,
super(SparseConv3d,
self).__init__(3,
in_channels,
out_channels,
kernel_size,
......@@ -605,6 +642,7 @@ class SparseConv3d(SparseConvolution):
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name)
......@@ -621,8 +659,10 @@ class SparseConv4d(SparseConvolution):
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None):
super(SparseConv4d, self).__init__(4,
super(SparseConv4d,
self).__init__(4,
in_channels,
out_channels,
kernel_size,
......@@ -634,6 +674,7 @@ class SparseConv4d(SparseConvolution):
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name)
......@@ -650,8 +691,10 @@ class SparseConvTranspose1d(SparseConvolution):
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None):
super(SparseConvTranspose1d, self).__init__(1,
super(SparseConvTranspose1d,
self).__init__(1,
in_channels,
out_channels,
kernel_size,
......@@ -664,6 +707,7 @@ class SparseConvTranspose1d(SparseConvolution):
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name)
......@@ -680,8 +724,10 @@ class SparseConvTranspose2d(SparseConvolution):
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None):
super(SparseConvTranspose2d, self).__init__(2,
super(SparseConvTranspose2d,
self).__init__(2,
in_channels,
out_channels,
kernel_size,
......@@ -694,6 +740,7 @@ class SparseConvTranspose2d(SparseConvolution):
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name)
......@@ -710,8 +757,10 @@ class SparseConvTranspose3d(SparseConvolution):
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None):
super(SparseConvTranspose3d, self).__init__(3,
super(SparseConvTranspose3d,
self).__init__(3,
in_channels,
out_channels,
kernel_size,
......@@ -724,6 +773,7 @@ class SparseConvTranspose3d(SparseConvolution):
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name)
......@@ -740,8 +790,10 @@ class SparseConvTranspose4d(SparseConvolution):
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
record_voxel_count: bool = False,
name=None):
super(SparseConvTranspose4d, self).__init__(4,
super(SparseConvTranspose4d,
self).__init__(4,
in_channels,
out_channels,
kernel_size,
......@@ -754,6 +806,7 @@ class SparseConvTranspose4d(SparseConvolution):
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
record_voxel_count=record_voxel_count,
name=name)
......
......@@ -12,13 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Optional, Tuple, Union, Dict
from typing import Any, List, Optional, Tuple, Union, Dict
import numpy as np
import torch
from spconv.core import ConvAlgo
from spconv.pytorch.constants import PYTORCH_VERSION
from spconv.tools import CUDAKernelTimer
from spconv.constants import SPCONV_FX_TRACE_MODE
if PYTORCH_VERSION >= [1, 8, 0]:
try:
......@@ -59,7 +60,8 @@ class ThrustSortAllocator:
class IndiceData(object):
def __init__(self, out_indices, indices, indice_pairs, indice_pair_num,
spatial_shape, out_spatial_shape, is_subm: bool, algo: ConvAlgo,
ksize: List[int], stride: List[int], dilation: List[int], padding: List[int]):
ksize: List[int], stride: List[int], dilation: List[int], padding: List[int],
voxel_num: Optional[Any] = None):
self.out_indices = out_indices
self.indices = indices
self.indice_pairs = indice_pairs
......@@ -72,6 +74,8 @@ class IndiceData(object):
self.stride = stride
self.dilation = dilation
self.padding = padding
# voxel_num is only used in tensorrt conversion.
self.voxel_num = voxel_num
class ImplicitGemmIndiceData(object):
......@@ -83,7 +87,9 @@ class ImplicitGemmIndiceData(object):
mask_argsort_bwd_splits: List[torch.Tensor],
masks: List[np.ndarray], spatial_shape,
out_spatial_shape, is_subm: bool, algo: ConvAlgo,
ksize: List[int], stride: List[int], dilation: List[int], padding: List[int]):
ksize: List[int], stride: List[int], dilation: List[int], padding: List[int],
in_voxel_num: Optional[Any] = None,
out_voxel_num: Optional[Any] = None):
self.out_indices = out_indices
self.indices = indices
self.pair_fwd = pair_fwd
......@@ -101,6 +107,9 @@ class ImplicitGemmIndiceData(object):
self.stride = stride
self.dilation = dilation
self.padding = padding
# in/out voxel_num is only used in tensorrt conversion.
self.in_voxel_num = in_voxel_num
self.out_voxel_num = out_voxel_num
def scatter_nd(indices, updates, shape):
......@@ -147,6 +156,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
force_algo: force conv/pool layers to use this algo; should only be used for debugging.
"""
ndim = indices.shape[1] - 1
if not SPCONV_FX_TRACE_MODE:
assert features.ndim == 2
assert indices.ndim == 2
assert len(spatial_shape) == ndim, "spatial shape must equal to ndim"
......
......@@ -103,7 +103,7 @@ class TorchAllocator(ExternalAllocator):
self.allocated: Dict[Union[str, int], torch.Tensor] = {}
def zeros(self, name: str, shape: List[int], dtype: int,
device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor:
device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
# TODO: free memory by name if it's already freed by pointer.
# Provide a name if you want to access it after the C++ function exits.
torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
......@@ -126,7 +126,7 @@ class TorchAllocator(ExternalAllocator):
return ten_tv
def empty(self, name: str, shape: List[int], dtype: int,
device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor:
device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
dtype_bkp = dtype
if dtype in _TORCH_UINT_WORKAROUNDS:
......@@ -147,7 +147,7 @@ class TorchAllocator(ExternalAllocator):
return ten_tv
def full_int(self, name: str, shape: List[int], value: int, dtype: int,
device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor:
device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
if dtype in _TORCH_UINT_WORKAROUNDS and value < 0:
raise NotImplementedError("you can't use full for unsigned dtypes")
torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
......@@ -171,7 +171,7 @@ class TorchAllocator(ExternalAllocator):
return ten_tv
def full_float(self, name: str, shape: List[int], value: float, dtype: int,
device: int, is_temp_memory: bool = False, stream: int = 0) -> tv.Tensor:
device: int, stream: int = 0, is_temp_memory: bool = False) -> tv.Tensor:
if dtype in _TORCH_UINT_WORKAROUNDS and value < 0:
raise NotImplementedError("you can't use full for unsigned dtypes")
torch_uint_workaround = dtype in _TORCH_UINT_WORKAROUNDS
......
......@@ -361,6 +361,25 @@ class SparseMaxPoolImplicitGemmFunction(Function):
features, out, grad_output, indice_pairs_bwd)
return input_bp, None, None, None
class SparseAvgPoolImplicitGemmFunction(Function):
    """Autograd wrapper around the implicit-gemm sparse average-pool ops.

    ``forward`` delegates to ``ops.indice_avgpool_implicit_gemm`` and
    ``backward`` to ``ops.indice_avgpool_implicit_gemm_backward``. Only the
    gradient with respect to ``features`` is produced; every other forward
    argument receives ``None``.
    """

    @staticmethod
    @_TORCH_CUSTOM_FWD
    def forward(ctx, features: torch.Tensor, indice_pairs_fwd: torch.Tensor,
                indice_pairs_bwd: torch.Tensor, num_activate_out: int,
                calc_count):
        """Run the forward average pool and stash what backward needs.

        Args:
            ctx: autograd context.
            features: input feature tensor.
            indice_pairs_fwd: index pairs passed to the forward op.
            indice_pairs_bwd: index pairs kept for the backward op.
            num_activate_out: forwarded unchanged to the forward op.
            calc_count: forwarded unchanged to the forward op.

        Returns:
            The pooled output tensor returned by the forward op.
        """
        out, count = ops.indice_avgpool_implicit_gemm(
            features, indice_pairs_fwd, num_activate_out, calc_count)
        # backward only reads the backward pairs and ``count``; saving
        # ``features`` and ``out`` as well would just keep them alive longer.
        # NOTE(review): ``count`` is presumably the per-output contribution
        # count; confirm against the ops implementation.
        ctx.save_for_backward(indice_pairs_bwd, count)
        return out

    @staticmethod
    @once_differentiable
    @_TORCH_CUSTOM_BWD
    def backward(ctx, grad_output):
        """Compute the gradient with respect to ``features``.

        Returns:
            A 5-tuple matching the forward arguments: the feature gradient
            followed by four ``None`` entries.
        """
        indice_pairs_bwd, count = ctx.saved_tensors
        input_bp = ops.indice_avgpool_implicit_gemm_backward(
            grad_output, indice_pairs_bwd, count)
        return input_bp, None, None, None, None
indice_conv = SparseConvFunction.apply
implicit_gemm = SparseImplicitGemmFunction.apply
......@@ -368,6 +387,7 @@ indice_inverse_conv = SparseInverseConvFunction.apply
indice_subm_conv = SubMConvFunction.apply
indice_maxpool = SparseMaxPoolFunction.apply
indice_maxpool_implicit_gemm = SparseMaxPoolImplicitGemmFunction.apply
indice_avgpool_implicit_gemm = SparseAvgPoolImplicitGemmFunction.apply
def _indice_to_scalar(indices: torch.Tensor, shape: List[int]):
......
......@@ -132,12 +132,11 @@ class SparseSequential(SparseModule):
if isinstance(input, list):
input = module(input)
else:
assert isinstance(input, spconv.SparseConvTensor)
# assert isinstance(input, spconv.SparseConvTensor)
# self._sparity_dict[k] = input.sparity
input = module(input)
else:
if isinstance(input, spconv.SparseConvTensor):
print(input.features.shape)
if input.indices.shape[0] != 0:
input = input.replace_feature(module(input.features))
else:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment