Commit e2df774f authored by yan.yan's avatar yan.yan
Browse files

fix #532 overflow in huge dim

parent 1f5ce924
......@@ -116,7 +116,7 @@ jobs:
strategy:
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] # this version is only used for upload.
cuda-version: ['102', '113', '114', '116', '117', '118']
cuda-version: ['102', '113', '114', '116', '117', '118', '']
steps:
- uses: actions/checkout@master
......
# Changelog
## [2.2.5] - 2022-11-05
### Fixed
- Fix overflow when shape is too large
## [2.2.4] - 2022-10-13
### Added
- Add prebuilt for CUDA 11.8 (RTX 4090 and H100) and CUDA 11.6.
......
......@@ -41,8 +41,8 @@
[pypi-url-118]: https://pypi.org/project/spconv-cu118/
[pypi-download-118]: https://img.shields.io/pypi/dm/spconv-cu118
[pypi-url-116]: https://pypi.org/project/spconv-cu118/
[pypi-download-116]: https://img.shields.io/pypi/dm/spconv-cu118
[pypi-url-116]: https://pypi.org/project/spconv-cu116/
[pypi-download-116]: https://img.shields.io/pypi/dm/spconv-cu116
# SpConv: Spatially Sparse Convolution Library
[![Build Status](https://github.com/traveller59/spconv/workflows/build/badge.svg)](https://github.com/traveller59/spconv/actions?query=workflow%3Abuild)
......@@ -57,7 +57,9 @@
| CUDA 11.4 | [![PyPI Version][pypi-ver-114]][pypi-url-114] | ```pip install spconv-cu114```| [![pypi monthly download][pypi-download-114]][pypi-url-114]|
| CUDA 11.6 | [![PyPI Version][pypi-ver-116]][pypi-url-116] | ```pip install spconv-cu116```| [![pypi monthly download][pypi-download-116]][pypi-url-116]|
| CUDA 11.7 | [![PyPI Version][pypi-ver-117]][pypi-url-117] | ```pip install spconv-cu117```| [![pypi monthly download][pypi-download-117]][pypi-url-117]|
| CUDA 11.8 | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]|
| CUDA 11.8* | [![PyPI Version][pypi-ver-118]][pypi-url-118] | ```pip install spconv-cu118```| [![pypi monthly download][pypi-download-118]][pypi-url-118]|
*: sm_89 and sm_90 is added in CUDA 11.8. If you use RTX 4090 or H100, you should use this version.
<!-- | CUDA 12.0 | [![PyPI Version][pypi-ver-120]][pypi-url-120] | ```pip install spconv-cu120```| [![pypi monthly download][pypi-download-120]][pypi-url-120]| -->
......
[build-system]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.3.5"]
requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm>=0.3.7"]
# requires = ["setuptools>=41.0", "wheel", "pccm>=0.4.0", "cumm @ file:///io/dist/cumm_cu118-0.3.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl"]
build-backend = "setuptools.build_meta"
......@@ -39,9 +39,9 @@ if cuda_ver:
cuda_ver_str = cuda_ver.replace(".", "") # 10.2 to 102
RELEASE_NAME += "-cu{}".format(cuda_ver_str)
deps = ["cumm-cu{}>=0.3.4".format(cuda_ver_str)]
deps = ["cumm-cu{}>=0.3.7".format(cuda_ver_str)]
else:
deps = ["cumm>=0.3.4"]
deps = ["cumm>=0.3.7"]
......
......@@ -618,7 +618,6 @@ class SimpleConv:
]
self.prebuilt_desps = prebuilt_desps
self.prebuilt_desp_names = {str(d) for d in prebuilt_desps}
self.prebuilt_desp_names.clear()
self.lock = Lock()
self.static_key_to_desps = group_by(self.get_static_key, all_desps)
......
......@@ -1677,7 +1677,7 @@ class SpconvOps(pccm.Class):
}}
std::vector<int64_t> output_dims_i64(out_shape.begin(), out_shape.end());
int64_t out_spatial_volume = std::accumulate(output_dims_i64.begin(),
output_dims_i64.end(), int64_t(1), std::multiplies<int64_t>());
output_dims_i64.end(), int64_t(1), std::multiplies<int64_t>()) * batch_size;
bool use_int64_hash_k = out_spatial_volume >= int64_t(std::numeric_limits<int>::max());
tv::DType indice_uniq_dtype = use_int64_hash_k ? tv::int64 : tv::int32;
TV_ASSERT_RT_ERR(conv_algo == tv::gemm::SparseConvAlgo::kMaskImplicitGemm ||
......@@ -2022,7 +2022,7 @@ Your Conv Params: )" << "\\n";
}}
std::vector<int64_t> output_dims_i64(out_shape.begin(), out_shape.end());
int64_t out_spatial_volume = std::accumulate(output_dims_i64.begin(),
output_dims_i64.end(), int64_t(1), std::multiplies<int64_t>());
output_dims_i64.end(), int64_t(1), std::multiplies<int64_t>()) * batch_size;
bool use_int64_hash_k = out_spatial_volume >= int64_t(std::numeric_limits<int>::max());
tv::DType indice_uniq_dtype = use_int64_hash_k ? tv::int64 : tv::int32;
......
......@@ -76,10 +76,13 @@ class CudaCommonKernel(pccm.ParameterizedClass):
class ConvOutLocIter(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem):
def __init__(self, problem: ConvProblem, use_i64: bool = False):
super().__init__()
self.add_dependency(TensorView)
self.add_param_class("lociter", problem, "ConvProblem")
if use_i64:
layout_npq = TensorGeneric(problem.ndim + 1, False, dtypes.int64)
else:
layout_npq = TensorGeneric(problem.ndim + 1, False)
layout_rs = TensorGeneric(problem.ndim, False)
......@@ -271,7 +274,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
super().__init__()
self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel)
self.loc_iter = ConvOutLocIter(problem)
self.loc_iter_64 = ConvOutLocIter(problem, True)
self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
self.add_param_class("spinds64", self.loc_iter_64, "ConvLocIter64")
self.add_param_class("spinds", problem, "ConvProblem")
self.add_param_class("cudakers", CudaCommonKernel())
self.add_include("tensorview/hash/ops.h")
......@@ -285,8 +291,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_conv_indices_stage1(self):
code = pccm.FunctionCode()
code.targ("TIndiceUniq")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.targ("TConvLocIter")
code.arg("loc_iter", f"TConvLocIter") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
......@@ -330,15 +336,14 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def build_conv_hash_table(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.targ("TLayoutNPQ")
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_out", f"int*") # [N, ndim + 1]
code.arg(
"indice_pairs_for_uniq",
f"const typename TTable::key_type*") # [2, kernelProd, MaxSize]
code.arg("layout_npq",
f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize]
code.arg("layout_npq", f"TLayoutNPQ") # [N, ndim + 1]
code.arg("num_indices", "int")
......@@ -355,13 +360,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def arange_hash_table_and_assign_out(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.targ("TLayoutNPQ")
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_out", f"int*") # [N, ndim + 1]
code.arg("count", f"int*") # [N, ndim + 1]
code.arg("limit", f"int") # [N, ndim + 1]
code.arg("layout_npq", f"TLayoutNPQ") # [N, ndim + 1]
code.arg("layout_npq",
f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize]
code.raw(f"""
auto key_ptr = table.key_ptr();
......@@ -387,7 +392,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("out_indices_offset", f"typename TTable::key_type *") # [N, ndim + 1]
code.arg("out_indices_offset",
f"typename TTable::key_type *") # [N, ndim + 1]
code.arg("count", f"int*") # [N, ndim + 1]
code.arg("limit", f"int") # [N, ndim + 1]
......@@ -411,12 +417,11 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def assign_out_indices(self):
code = pccm.FunctionCode()
code.targ("T")
code.targ("TLayoutNPQ")
code.arg("indices_out", f"int*") # [N, ndim + 1]
code.arg("out_indices_offset", f"const T*") # [N, ndim + 1]
code.arg("layout_npq",
f"spinds::LayoutNPQ") # [2, kernelProd, MaxSize]
code.arg("layout_npq", f"TLayoutNPQ") # [N, ndim + 1]
code.arg("size", f"int") # [N, ndim + 1]
code.raw(f"""
for (auto i : tv::KernelLoopX<int>(size)) {{
layout_npq.inverse(out_indices_offset[i], indices_out + {self.ndim + 1} * i);
......@@ -424,7 +429,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""")
return code
@pccm.cuda.cuda_global_function
def calc_conv_indices_stage2(self):
code = pccm.FunctionCode()
......@@ -497,9 +501,9 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_conv_indices_stage1_mask(self):
code = pccm.FunctionCode()
code.targ("TIndiceUniq")
code.targ("TConvLocIter")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("loc_iter", f"TConvLocIter") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs_bwd",
f"{self.dtype_indices}*") # [kernelProd, MaxSize]
......@@ -545,9 +549,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code = pccm.FunctionCode()
code.targ("TIndiceUniq")
code.targ("TTable")
code.arg("table", f"TTable") # [N, ndim + 1]
code.targ("TConvLocIter")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("loc_iter", f"TConvLocIter") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("indice_pairs_bwd",
......@@ -710,10 +715,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def build_subm_conv_hash_table(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.targ("TLayoutNPQ")
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
code.arg("layout_npq", f"spinds::LayoutNPQ")
code.arg("layout_npq", f"TLayoutNPQ")
code.arg("num_indices", "int")
......@@ -741,7 +748,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_subm_conv_indices(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.targ("TConvLocIter")
code.arg("loc_iter", f"TConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
......@@ -790,7 +798,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_subm_conv_indices_mask(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.targ("TConvLocIter")
code.arg("loc_iter", f"TConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
......@@ -857,7 +866,9 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
def calc_subm_conv_indices_split_mask(self):
code = pccm.FunctionCode()
code.targ("TTable")
code.arg("loc_iter", f"ConvLocIter") # [N, ndim + 1]
code.targ("TConvLocIter")
code.arg("loc_iter", f"TConvLocIter") # [N, ndim + 1]
code.arg("table", f"TTable") # [N, ndim + 1]
code.arg("indices_in", f"const int*") # [N, ndim + 1]
......@@ -952,15 +963,19 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
bool use_int32 = problem.check_npq_not_overflow();
tv::cuda::Launch launcher_clean_uniq(uniq_size, reinterpret_cast<cudaStream_t>(stream_int));
""")
for x in codeops.dispatch_ints(code, [0, 1], "int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
tv::dispatch<int32_t, int64_t>(indice_pairs_uniq.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
TV_ASSERT_RT_ERR(input_dims.op<tv::arrayops::prod>() < std::numeric_limits<T>::max(),
"kernel volume must smaller than max value of T");
launcher_clean_uniq(clean_indices_uniq<T>, indice_pairs_uniq.data_ptr<T>(), uniq_size);
launcher_num_act_in(calc_conv_indices_stage1<T>, loc_iter, indices.data_ptr<const int>(),
launcher_num_act_in(calc_conv_indices_stage1<T, {loc_type}>, loc_iter, indices.data_ptr<const int>(),
indice_pairs.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<T>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv, transposed);
......@@ -1029,12 +1044,17 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
bool use_int32 = problem.check_npq_not_overflow();
// TODO handle invalid num_out_act
indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
tv::cuda::Launch lanucher_build_hash(num_out_act, custream);
tv::dispatch<int32_t, int64_t>(hashdata_k.dtype(), [&](auto I){{
""")
with code.block(
"",
"tv::dispatch<int32_t, int64_t>(hashdata_k.dtype(), [&](auto I){",
"});"):
code.raw(f"""
using V = {self.dtype_indices};
using K = TV_DECLTYPE(I);
using table_t =
......@@ -1044,9 +1064,17 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
table_t hash = table_t(hashdata_k.data_ptr<K>(), hashdata_v.data_ptr<V>(), hashdata_k.dim(0));
tv::hash::clear_map_split(hash, custream);
// hash.clear(custream);
lanucher_build_hash(build_conv_hash_table<table_t>, hash,
""")
for x in codeops.dispatch_ints(code, [0, 1],
"int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
lanucher_build_hash(build_conv_hash_table<table_t, std::decay_t<decltype(loc_iter.layout_npq)>>, hash,
out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const K>(),
loc_iter.layout_npq, num_out_act);
""")
code.raw(f"""
if (!use_bound_algo){{
launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash,
indice_pairs_uniq_before_sort.data_ptr<const K>(),
......@@ -1070,7 +1098,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
indices.dim(0),
indice_pairs.dim(2));
}}
}});
""")
code.raw(f"""
return num_out_act;
""")
return code.ret("int")
......@@ -1108,14 +1137,20 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
bool use_int32 = problem.check_npq_not_overflow();
tv::cuda::Launch launcher_clean_uniq(uniq_size, reinterpret_cast<cudaStream_t>(stream_int));
""")
for x in codeops.dispatch_ints(code, [0, 1], "int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
tv::dispatch<int32_t, int64_t>(indice_pairs_uniq.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
TV_ASSERT_RT_ERR(input_dims.op<tv::arrayops::prod>() < std::numeric_limits<T>::max(),
"kernel volume must smaller than max value of T");
launcher_clean_uniq(clean_indices_uniq<T>, indice_pairs_uniq.data_ptr<T>(), uniq_size);
launcher_num_act_in(calc_conv_indices_stage1_mask<T>, loc_iter, indices.data_ptr<const int>(),
launcher_num_act_in(calc_conv_indices_stage1_mask<T, {loc_type}>, loc_iter, indices.data_ptr<const int>(),
indice_pairs_bwd.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<T>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
kv, transposed);
......@@ -1123,13 +1158,11 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""")
return code # .ret("int")
@pccm.cuda.static_function
def generate_conv_inds_mask_stage1_direct_table(self):
code = pccm.FunctionCode()
code.arg("indices, hashdata_k, hashdata_v", "tv::Tensor")
code.arg("indice_pairs_bwd, indice_pairs_uniq",
"tv::Tensor")
code.arg("indice_pairs_bwd, indice_pairs_uniq", "tv::Tensor")
code.arg("indice_num_per_loc", "tv::Tensor")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
......@@ -1158,9 +1191,15 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// tv::cuda::Launch launcher_num_act_in_2(indices.dim(0));
launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch launcher_clean_uniq(uniq_size, reinterpret_cast<cudaStream_t>(stream_int));
tv::dispatch<int32_t, int64_t>(indice_pairs_uniq.dtype(), [&](auto I){{
bool use_int32 = problem.check_npq_not_overflow();
""")
with code.block(
"",
"tv::dispatch<int32_t, int64_t>(indice_pairs_uniq.dtype(), [&](auto I){",
"});"):
code.raw(f"""
using V = {self.dtype_indices};
using K = TV_DECLTYPE(I);
using table_t =
......@@ -1172,17 +1211,21 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
TV_ASSERT_RT_ERR(input_dims.op<tv::arrayops::prod>() < std::numeric_limits<T>::max(),
"kernel volume must smaller than max value of T");
launcher_clean_uniq(clean_indices_uniq<T>, indice_pairs_uniq.data_ptr<T>(), uniq_size);
launcher_num_act_in(calc_conv_indices_stage1_mask_direct_table<T, table_t>, table,
""")
for x in codeops.dispatch_ints(code, [0, 1],
"int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
launcher_num_act_in(calc_conv_indices_stage1_mask_direct_table<T, table_t, {loc_type}>, table,
loc_iter, indices.data_ptr<const int>(),
indice_pairs_bwd.data_ptr<{self.dtype_indices}>(),
indice_pairs_uniq.data_ptr<T>(), indice_num_per_loc.data_ptr<int>(),
indices.dim(0),
kv, transposed);
}});
""")
return code
def generate_conv_inds_stage2_mask_template(self, is_direct_table: bool):
"""here indice_pairs_uniq may be bounded, some
points may be dropped.
......@@ -1233,8 +1276,9 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
tv::cuda::Launch launcher_num_act_in_no_y(num_act_in, custream);
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
tv::cuda::Launch lanucher_build_hash(num_out_act, custream);
bool use_int32 = problem.check_npq_not_overflow();
// TODO handle invalid num_out_act
""")
......@@ -1242,7 +1286,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.raw(f"""
indice_pairs_uniq = indice_pairs_uniq.slice_first_axis(0, num_out_act);
""")
with code.block("", start="tv::dispatch<int32_t, int64_t>(hashdata_k.dtype(), [&](auto I){",
with code.block(
"",
start=
"tv::dispatch<int32_t, int64_t>(hashdata_k.dtype(), [&](auto I){",
end="});"):
code.raw(f"""
using V = {self.dtype_indices};
......@@ -1254,10 +1301,16 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
table_t hash = table_t(hashdata_k.data_ptr<K>(), hashdata_v.data_ptr<V>(), hashdata_k.dim(0));
""")
if not is_direct_table:
# direct table built in stage 1.
code.raw(f"""
tv::hash::clear_map_split(hash, custream);
lanucher_build_hash(build_conv_hash_table<table_t>, hash,
""")
# direct table built in stage 1.
for x in codeops.dispatch_ints(code, [0, 1],
"int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
lanucher_build_hash(build_conv_hash_table<table_t, std::decay_t<decltype(loc_iter.layout_npq)>>, hash,
out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const K>(),
loc_iter.layout_npq, num_out_act);
""")
......@@ -1314,9 +1367,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
"""
code = pccm.FunctionCode()
code.arg("hashdata_k, hashdata_v, uniq_cnt", "tv::Tensor")
code.arg(
"out_inds",
"tv::Tensor")
code.arg("out_inds", "tv::Tensor")
code.arg("num_out_bound", "int")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
......@@ -1328,12 +1379,17 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
auto custream = reinterpret_cast<cudaStream_t>(stream_int);
tv::cuda::Launch lanucher_build_hash(hashdata_k.size(), custream);
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
bool use_int32 = problem.check_npq_not_overflow();
auto tvctx = tv::Context();
tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream_int));
if (num_out_bound <= 0){{
num_out_bound = hashdata_k.size();
}}
""")
for x in codeops.dispatch_ints(code, [0, 1], "int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
tv::dispatch<int32_t, int64_t>(hashdata_k.dtype(), [&](auto I){{
using V = {self.dtype_indices};
using K = TV_DECLTYPE(I);
......@@ -1341,10 +1397,12 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
tv::hash::LinearHashTableSplit<K, V, tv::hash::Murmur3Hash<K>,
tv::hash::default_empty_key_v<K>, false>;
table_t table = table_t(hashdata_k.data_ptr<K>(), hashdata_v.data_ptr<V>(), hashdata_k.dim(0));
lanucher_build_hash(arange_hash_table_and_assign_out<table_t>, table,
lanucher_build_hash(arange_hash_table_and_assign_out<table_t, std::decay_t<decltype(loc_iter.layout_npq)>>, table,
out_inds.data_ptr<int>(), uniq_cnt.data_ptr<int>(), num_out_bound,
loc_iter.layout_npq);
}});
""")
code.raw(f"""
auto uniq_cnt_cpu = uniq_cnt.cpu(tvctx);
return std::min(uniq_cnt_cpu.data_ptr<int>()[0], num_out_bound);
""")
......@@ -1355,7 +1413,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
"""unique by hash
"""
code = pccm.FunctionCode()
code.arg("hashdata_k, hashdata_v, uniq_cnt, out_indices_offset", "tv::Tensor")
code.arg("hashdata_k, hashdata_v, uniq_cnt, out_indices_offset",
"tv::Tensor")
code.arg("num_out_bound", "int")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
......@@ -1400,12 +1459,18 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
tv::cuda::Launch lanucher_build_hash(out_inds.dim(0), custream);
TV_ASSERT_RT_ERR(out_indices_offset.dim(0) >= out_inds.dim(0), "error");
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
bool use_int32 = problem.check_npq_not_overflow();
auto tvctx = tv::Context();
tvctx.set_cuda_stream(reinterpret_cast<cudaStream_t>(stream_int));
""")
for x in codeops.dispatch_ints(code, [0, 1], "int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
tv::dispatch<int32_t, int64_t>(out_indices_offset.dtype(), [&](auto I){{
using K = TV_DECLTYPE(I);
lanucher_build_hash(assign_out_indices<K>, out_inds.data_ptr<int>(),
lanucher_build_hash(assign_out_indices<K, std::decay_t<decltype(loc_iter.layout_npq)>>, out_inds.data_ptr<int>(),
out_indices_offset.data_ptr<const K>(),
loc_iter.layout_npq, out_inds.dim(0));
}});
......@@ -1451,9 +1516,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
launcher_num_act_in.blocks.y = (kv / 2) + 1;
// launcher_num_act_in.blocks.y = kv;
ConvProblem problem(batch_size, 1, 1, input_dims, input_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
bool use_int32 = problem.check_npq_not_overflow();
tv::cuda::Launch lanucher_build_hash(num_act_in_real, custream);
""")
for x in codeops.dispatch_ints(code, [0, 1], "int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
tv::dispatch<int32_t, int64_t>(hashdata_k.dtype(), [&](auto I){{
using V = {self.dtype_indices};
using K = TV_DECLTYPE(I);
......@@ -1466,8 +1535,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
TV_ASSERT_RT_ERR(hashdata_k.dim(0) >= num_act_in_real, "hash size not enough");
table_t hash = table_t(hashdata_k.data_ptr<K>(), hashdata_v.data_ptr<V>(), hashdata_k.dim(0));
tv::hash::clear_map_split(hash, custream);
lanucher_build_hash(build_subm_conv_hash_table<table_t>, hash, indices.data_ptr<const int>(),
lanucher_build_hash(build_subm_conv_hash_table<table_t, std::decay_t<decltype(loc_iter.layout_npq)>>, hash, indices.data_ptr<const int>(),
loc_iter.layout_npq, num_act_in_real);
if (!indice_pair_mask.empty()){{
TV_ASSERT_RT_ERR(indice_pairs.ndim() == 3, "error");
......@@ -1480,24 +1548,25 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
tv::cuda::Launch lanucher_fill(num_act_in_real, custream);
lanucher_fill(cudakers::fill_kernel<uint32_t>, mask_0.data_ptr<uint32_t>(), (1 << (kv / 2)), indices.dim(0));
mask_1.zero_(ctx);
auto kernel = &calc_subm_conv_indices_split_mask<table_t>;
auto kernel = &calc_subm_conv_indices_split_mask<table_t, {loc_type}>;
launcher_num_act_in(kernel, loc_iter, hash,
indices.data_ptr<const int>(), indice_pairs.data_ptr<int>(),
mask_0.data_ptr<uint32_t>(), mask_1.data_ptr<uint32_t>(),
indices.dim(0), indice_pairs.dim(2), kv, is_train);
}}else{{
// indice_pair_mask: [1, num_act_in]
tv::cuda::Launch lanucher_fill(num_act_in_real, custream);
lanucher_fill(cudakers::fill_kernel<uint32_t>, indice_pair_mask.data_ptr<uint32_t>(), (1 << (kv / 2)), indices.dim(0));
TV_ASSERT_RT_ERR(indice_pair_mask.dim(0) == 1, "error");
launcher_num_act_in(calc_subm_conv_indices_mask<table_t>, loc_iter, hash,
launcher_num_act_in(calc_subm_conv_indices_mask<table_t, {loc_type}>, loc_iter, hash,
indices.data_ptr<const int>(), indice_pairs.data_ptr<int>(),
indice_pair_mask.data_ptr<uint32_t>(), indices.dim(0), indice_pairs.dim(2), kv, is_train);
}}
}}else{{
TV_ASSERT_RT_ERR(indice_pairs.ndim() == 3, "error");
TV_ASSERT_RT_ERR(indice_pairs.dim(0) == 2, "error");
launcher_num_act_in(calc_subm_conv_indices<table_t>, loc_iter, hash, indices.data_ptr<const int>(),
launcher_num_act_in(calc_subm_conv_indices<table_t, {loc_type}>, loc_iter, hash, indices.data_ptr<const int>(),
indice_pairs.data_ptr<int>(),
indice_num_per_loc.data_ptr<int>(), indices.dim(0), indice_pairs.dim(2), kv);
}}
......@@ -1515,7 +1584,9 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
self.add_dependency(TensorView)
self.add_include("unordered_map")
self.loc_iter = ConvOutLocIter(problem)
self.loc_iter_64 = ConvOutLocIter(problem, True)
self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
self.add_param_class("spinds64", self.loc_iter_64, "ConvLocIter64")
self.add_param_class("spinds", problem, "ConvProblem")
self.ndim = problem.ndim
......@@ -1532,7 +1603,6 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
code.arg("batch_size", "int")
code.arg("input_dims", f"tv::array<int, {self.ndim}>")
code.arg("ksize, dilation", f"tv::array<int, {self.ndim}>")
code.raw(f"""
tv::array<int, {self.ndim}> stride, padding;
for (int i = 0; i < {self.ndim}; ++i){{
......@@ -1544,7 +1614,12 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
TV_ASSERT_RT_ERR(input_dims.op<tv::arrayops::prod>() < std::numeric_limits<{self.dtype_indices}>::max(),
"kernel volume must smaller than max value of {self.dtype_indices}");
ConvProblem problem(batch_size, 1, 1, input_dims, input_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
bool use_int32 = problem.check_npq_not_overflow();
""")
for x in codeops.dispatch_ints(code, [0, 1], "int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
int indices_pair_size = indice_pairs.dim(2);
int indices_pair_size_mul_RS = indices_pair_size * kv;
auto indice_pairs_ptr = indice_pairs.data_ptr<{self.dtype_indices}>();
......@@ -1585,6 +1660,8 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
}}
++loc_iter;
}}
""")
code.raw(f"""
return indices.dim(0);
""")
return code.ret("int")
......@@ -1602,7 +1679,15 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
code.raw(f"""
int kv = ksize.op<tv::arrayops::prod>();
ConvProblem problem(batch_size, 1, 1, input_dims, output_dims, ksize, padding, stride, dilation);
ConvLocIter loc_iter(problem);
bool use_int32 = problem.check_npq_not_overflow();
int num_act = 0;
""")
for x in codeops.dispatch_ints(code, [0, 1], "int(use_int32)"):
loc_type = "ConvLocIter" if x == 1 else "ConvLocIter64"
code.raw(f"""
{loc_type} loc_iter(problem);
int indices_pair_size = indice_pairs.dim(2);
int indices_pair_size_mul_RS = indices_pair_size * kv;
auto indice_pairs_ptr = indice_pairs.data_ptr<{self.dtype_indices}>();
......@@ -1611,9 +1696,7 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
auto out_inds_ptr = out_inds.data_ptr<{self.dtype_indices}>();
TV_ASSERT_RT_ERR(input_dims.op<tv::arrayops::prod>() < std::numeric_limits<{self.dtype_indices}>::max(),
"kernel volume must smaller than max value of {self.dtype_indices}");
int indice_in_num = indices.dim(0);
int num_act = 0;
{self.dtype_indices} hashval;
for (int filter_offset = 0; filter_offset < kv; ++filter_offset){{
int filter_offset_mul_indices_pair_size = filter_offset * indices_pair_size;
......@@ -1647,6 +1730,8 @@ class SparseConvIndicesCPU(pccm.ParameterizedClass):
}}
++loc_iter;
}}
""")
code.raw(f"""
return num_act;
""")
return code.ret("int")
......@@ -185,7 +185,7 @@ def get_indice_pairs(indices: torch.Tensor,
)
assert algo == ConvAlgo.Native, "TODO"
# indices = indices.cpu()
spatial_volume = functools.reduce(lambda x, y: x * y, out_shape, 1)
spatial_volume = functools.reduce(lambda x, y: x * y, out_shape, 1) * batch_size
use_int64_hash_k = spatial_volume >= INT32_MAX or DEBUG_INT64_HASH_K
indice_dtype = torch.int64 if use_int64_hash_k else indices.dtype
pair = torch.full((2, kv, indices.shape[0]),
......@@ -457,7 +457,7 @@ def get_indice_pairs_implicit_gemm(
raise ValueError(
f"your out spatial shape {out_shape} reach zero!!! input shape: {spatial_shape}"
)
spatial_volume = functools.reduce(lambda x, y: x * y, spatial_shape, 1)
spatial_volume = functools.reduce(lambda x, y: x * y, spatial_shape, 1) * batch_size
use_int64_hash_k = spatial_volume >= INT32_MAX or DEBUG_INT64_HASH_K
indice_dtype = torch.int64 if use_int64_hash_k else indices.dtype
assert algo == ConvAlgo.MaskImplicitGemm or algo == ConvAlgo.MaskSplitImplicitGemm, "TODO"
......
......@@ -145,7 +145,8 @@ def generate_sparse_data(shape,
integer=False,
data_range=(-1, 1),
with_dense=True,
dtype=np.float32):
dtype=np.float32,
shape_scale = 1):
dense_shape = shape
ndim = len(dense_shape)
# num_points = np.random.randint(10, 100, size=[batch_size, ndim])
......@@ -153,9 +154,9 @@ def generate_sparse_data(shape,
# num_points = np.array([3, 2])
batch_size = len(num_points)
batch_indices = []
coors_total = np.stack(np.meshgrid(*[np.arange(0, s) for s in shape]),
coors_total = np.stack(np.meshgrid(*[np.arange(0, s // shape_scale) for s in shape]),
axis=-1)
coors_total = coors_total.reshape(-1, ndim)
coors_total = coors_total.reshape(-1, ndim) * shape_scale
for i in range(batch_size):
np.random.shuffle(coors_total)
inds_total = coors_total[:num_points[i]]
......
import spconv
import spconv.pytorch as spconv
from spconv.core import ConvAlgo
import spconv.pytorch as spconv
from spconv.test_utils import TestCase, generate_sparse_data, params_grid
from spconv.core_cc.cumm.common import CompileInfo
if __name__ == "__main__":
print(CompileInfo.arch_is_compatible_gemm((9, 0)), CompileInfo.arch_is_compiled_gemm((9, 0)))
print(CompileInfo.arch_is_compatible_gemm((8, 6)), CompileInfo.arch_is_compiled_gemm((8, 6)))
\ No newline at end of file
import torch
import numpy as np
class SparseMaxPool2dTestTorch(torch.nn.Module):
def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
dilation, algo):
super().__init__()
self.algo = algo
layers = [
spconv.SparseMaxPool2d(kernel_size, stride, padding, dilation, algo=algo)
]
for i in range(1, num_layers):
layers.append(
spconv.SparseMaxPool2d(kernel_size, stride, padding, dilation, algo=algo))
self.net = spconv.SparseSequential(*layers, )
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.net(x) # .dense()
shapes = [[65536, 65536]]
batchsizes = [32]
in_channels = [32]
out_channels = [32]
ksizes = [2]
strides = [2]
paddings = [0]
dilations = [1]
algos = [
# ConvAlgo.Native,
ConvAlgo.MaskImplicitGemm,
# ConvAlgo.MaskSplitImplicitGemm
]
devices = ["cuda:0"]
for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations, algos):
device = torch.device(dev)
num_points = [1000] * bs
print(1)
sparse_dict = generate_sparse_data(shape,
num_points,
IC,
with_dense=False,
data_range=[0.1, 1],
shape_scale = 64)
print(2)
net = SparseMaxPool2dTestTorch(1, 2, shape, k, s, p, d, al).to(device)
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [2, 0, 1]]).astype(np.int32)
print(indices.max(0))
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
out = net(features_t, indices_t, bs)
print(out.indices.min(0))
......@@ -916,8 +916,8 @@ def _test_native_conv_cuda(subm: bool):
def test_all_algo_unit():
# for i in range(5):
# _test_impgemm_conv_cuda(True)
# _test_impgemm_conv_cuda(False)
_test_impgemm_conv_cuda(True)
_test_impgemm_conv_cuda(False)
_test_native_conv_cuda(True)
_test_native_conv_cuda(False)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment