Commit f582ec34 authored by yan.yan's avatar yan.yan
Browse files

v2.3.4: global pool, generative model support

parent 004effbd
# Changelog # Changelog
## [2.3.4] - 2023-03-23
### Added
- Add SparseGlobalMaxPool and SparseGlobalAvgPool for training only. libspconv don't support it.
## [2.3.3] - 2023-02-02 ## [2.3.3] - 2023-02-02
### Fixed ### Fixed
- Fix int8 nvrtc error when use prebuilt - Fix int8 nvrtc error when use prebuilt
......
...@@ -31,9 +31,11 @@ from spconv.pytorch.hash import HashTable ...@@ -31,9 +31,11 @@ from spconv.pytorch.hash import HashTable
| ```spconv.SparseConv3d``` | Downsample | ```nn.Conv3d``` | Use ```indice_key``` to save data for inverse | | ```spconv.SparseConv3d``` | Downsample | ```nn.Conv3d``` | Use ```indice_key``` to save data for inverse |
| ```spconv.SubMConv3d``` | Convolution | N/A | Use ```indice_key``` to save data for reuse | | ```spconv.SubMConv3d``` | Convolution | N/A | Use ```indice_key``` to save data for reuse |
| ```spconv.SparseInverseConv3d``` | Upsample | N/A | Use pre-saved ```indice_key``` to upsample | | ```spconv.SparseInverseConv3d``` | Upsample | N/A | Use pre-saved ```indice_key``` to upsample |
| ```spconv.SparseConvTranspose3d``` | Upsample (don't use this)| ```nn.ConvTranspose3d``` | VERY SLOW and CAN'T RECOVER ORIGIN POINT CLOUD | | ```spconv.SparseConvTranspose3d``` | Upsample (for generative model)| ```nn.ConvTranspose3d``` | VERY SLOW and CAN'T RECOVER ORIGIN POINT CLOUD |
| ```spconv.SparseMaxPool3d``` | Downsample | ```nn.MaxPool3d``` | Use ```indice_key``` to save data for inverse | | ```spconv.SparseMaxPool3d``` | Downsample | ```nn.MaxPool3d``` | Use ```indice_key``` to save data for inverse |
| ```spconv.SparseSequential``` | Container | ```nn.Sequential``` | support layers above and ```nn.ReLU, nn.BatchNorm, ...```| | ```spconv.SparseSequential``` | Container | ```nn.Sequential``` | support layers above and ```nn.ReLU, nn.BatchNorm, ...```|
| ```spconv.SparseGlobalMaxPool``` | global pool | N/A | return dense tensor instead of SparseConvTensor|
| ```spconv.SparseGlobalAvgPool``` | global pool | N/A | return dense tensor instead of SparseConvTensor|
| Functional APIs | Usage | | Functional APIs | Usage |
...@@ -143,6 +145,16 @@ class ExampleNet(nn.Module): ...@@ -143,6 +145,16 @@ class ExampleNet(nn.Module):
return self.net(x) return self.net(x)
``` ```
### How To Use SparseConvTranspose
```SparseConvTranspose``` (standard upsampling) should only be used in generative model. You need to use a classifier to check if a output coordicates is empty, then set batch indices (or xyz) of that sparse tensor to a negative number:
```Python
spt.indices[empty_mask, 0] = -1
```
In next sparse convolution, invalid coordinates will be removed until you perform next ```spt.indices[empty_mask, 0] = -1```.
#### Common Mistake #### Common Mistake
* issue [#467](https://github.com/traveller59/spconv/issues/467) * issue [#467](https://github.com/traveller59/spconv/issues/467)
```Python ```Python
......
...@@ -296,6 +296,16 @@ class SpconvOps: ...@@ -296,6 +296,16 @@ class SpconvOps:
""" """
... ...
@staticmethod @staticmethod
def global_pool_rearrange(out_indices: Tensor, coords: Tensor, counts: Tensor, stream: int = 0) -> None:
"""
Args:
out_indices:
coords:
counts:
stream:
"""
...
@staticmethod
def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None: def maxpool_implicit_gemm_forward(out: Tensor, inp: Tensor, inds: Tensor, stream: int = 0) -> None:
""" """
Args: Args:
......
...@@ -782,6 +782,32 @@ class SpconvOps(pccm.Class): ...@@ -782,6 +782,32 @@ class SpconvOps(pccm.Class):
""") """)
return code return code
@pccm.pybind.mark
@_STATIC_FUNCTION
def global_pool_rearrange(self):
code = pccm.FunctionCode()
code.arg("out_indices, coords, counts", "tv::Tensor")
code.arg("stream", "std::uintptr_t", "0", pyanno="int")
code.add_dependency(IndiceMaxPoolCPU)
if not CUMM_CPU_ONLY_BUILD:
code.add_dependency(IndiceMaxPool)
code.raw(f"""
if (out_indices.is_cpu()){{
IndiceMaxPoolCPU::global_pool_rearrange(out_indices, coords, counts);
}}
""")
if not CUMM_CPU_ONLY_BUILD:
with code.else_():
code.raw(f"""
IndiceMaxPool::global_pool_rearrange(out_indices, coords, counts, stream);
""")
else:
code.raw(f"""
TV_THROW_RT_ERR("not implemented in cpu-only spconv!!! ")
""")
return code
@pccm.pybind.mark @pccm.pybind.mark
@_STATIC_FUNCTION @_STATIC_FUNCTION
def maxpool_implicit_gemm_forward(self): def maxpool_implicit_gemm_forward(self):
......
...@@ -195,7 +195,8 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -195,7 +195,8 @@ class ConvOutLocIter(pccm.ParameterizedClass):
stride_valid.append( stride_valid.append(
f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])") f"!(npq_no_stride[{i + 1}] % problem_.stride[{i}])")
code.raw(f""" code.raw(f"""
return npq_no_stride[0] < problem_.N && return (npq_no_stride[0] < problem_.N) &&
(npq_no_stride[0] >= 0) &&
{' && '.join(hw_valid)} && {' && '.join(hw_valid)} &&
{' && '.join(stride_valid)}; {' && '.join(stride_valid)};
""") """)
...@@ -218,7 +219,7 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -218,7 +219,7 @@ class ConvOutLocIter(pccm.ParameterizedClass):
(f"npq_offset[{i + 1}] >= 0 && " (f"npq_offset[{i + 1}] >= 0 && "
f"npq_offset[{i + 1}] < problem_.output_dims[{i}]")) f"npq_offset[{i + 1}] < problem_.output_dims[{i}]"))
code.raw(f""" code.raw(f"""
return npq_offset[0] < problem_.N && return (npq_offset[0] < problem_.N) && (npq_offset[0] >= 0) &&
{' && '.join(hw_valid)}; {' && '.join(hw_valid)};
""") """)
return code return code
...@@ -240,7 +241,7 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -240,7 +241,7 @@ class ConvOutLocIter(pccm.ParameterizedClass):
(f"nhw_offset[{i + 1}] >= 0 && " (f"nhw_offset[{i + 1}] >= 0 && "
f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]")) f"nhw_offset[{i + 1}] < problem_.input_dims[{i}]"))
code.raw(f""" code.raw(f"""
return nhw_offset[0] < problem_.N && return (nhw_offset[0] < problem_.N) && (nhw_offset[0] >= 0) &&
{' && '.join(hw_valid)}; {' && '.join(hw_valid)};
""") """)
return code return code
...@@ -262,7 +263,7 @@ class ConvOutLocIter(pccm.ParameterizedClass): ...@@ -262,7 +263,7 @@ class ConvOutLocIter(pccm.ParameterizedClass):
(f"nhw_offset[{i + 1}] >= 0 && " (f"nhw_offset[{i + 1}] >= 0 && "
f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]")) f"nhw_offset[{i + 1}] < problem_.output_dims[{i}]"))
code.raw(f""" code.raw(f"""
return nhw_offset[0] < problem_.N && return (nhw_offset[0] < problem_.N) && (nhw_offset[0] >= 0) &&
{' && '.join(hw_valid)}; {' && '.join(hw_valid)};
""") """)
return code return code
......
...@@ -298,6 +298,46 @@ class IndiceMaxPool(pccm.Class): ...@@ -298,6 +298,46 @@ class IndiceMaxPool(pccm.Class):
}} }}
""") """)
return code return code
@pccm.cuda.cuda_global_function
def global_pool_rearrange_kernel(self):
code = pccm.FunctionCode()
code.arg("out_indices", "int*")
code.arg("coords", "const int*")
code.arg("counts", "int*")
code.arg("num_indices", "int")
code.arg("indices_stride", "int")
code.raw(f"""
for (int i : tv::KernelLoopX<int>(num_indices)) {{
int batch_idx = coords[i * indices_stride];
if (batch_idx >= 0){{
auto old = atomicAdd(counts + batch_idx, 1);
out_indices[batch_idx * num_indices + old] = i;
}}
}}
""")
return code
@pccm.cuda.static_function
def global_pool_rearrange(self):
code = pccm.FunctionCode()
code.arg("out_indices", "tv::Tensor")
code.arg("coords", "tv::Tensor")
code.arg("counts", "tv::Tensor")
code.arg("stream", "std::uintptr_t", "0")
code.raw(f"""
auto nhot = coords.dim(0);
auto cudastream = reinterpret_cast<cudaStream_t>(stream);
tv::cuda::Launch launcher = tv::cuda::Launch(nhot, cudastream);
launcher(global_pool_rearrange_kernel, out_indices.data_ptr<int>(),
coords.data_ptr<const int>(), counts.data_ptr<int>(), nhot,
coords.stride(0));
TV_CHECK_CUDA_ERR_V2("global_pool_feature_rearrange failed!!!");
""")
return code
@pccm.cuda.static_function @pccm.cuda.static_function
def forward(self): def forward(self):
...@@ -555,6 +595,30 @@ class IndiceMaxPoolCPU(pccm.Class): ...@@ -555,6 +595,30 @@ class IndiceMaxPoolCPU(pccm.Class):
self.add_dependency(OMPLib) self.add_dependency(OMPLib)
self.add_include("tensorview/parallel/all.h") self.add_include("tensorview/parallel/all.h")
@pccm.static_function
def global_pool_rearrange(self):
code = pccm.FunctionCode()
code.arg("out_indices", "tv::Tensor")
code.arg("coords", "tv::Tensor")
code.arg("counts", "tv::Tensor")
code.raw(f"""
auto nhot = coords.dim(0);
auto out_ptr = out_indices.data_ptr<int>();
auto coord_ptr = coords.data_ptr<const int>();
auto count_ptr = counts.data_ptr<int>();
int indices_stride = coords.stride(0);
for (int i = 0; i < nhot; ++i){{
int batch_idx = coord_ptr[0];
if (batch_idx >= 0){{
out_ptr[batch_idx * nhot + (count_ptr[batch_idx]++)] = i;
}}
coord_ptr += indices_stride;
}}
""")
return code
@pccm.static_function @pccm.static_function
def forward(self): def forward(self):
code = pccm.FunctionCode() code = pccm.FunctionCode()
......
...@@ -21,7 +21,8 @@ from spconv.pytorch.ops import ConvAlgo ...@@ -21,7 +21,8 @@ from spconv.pytorch.ops import ConvAlgo
from spconv.pytorch.pool import (SparseMaxPool1d, SparseMaxPool2d, from spconv.pytorch.pool import (SparseMaxPool1d, SparseMaxPool2d,
SparseMaxPool3d, SparseMaxPool4d, SparseMaxPool3d, SparseMaxPool4d,
SparseAvgPool1d, SparseAvgPool2d, SparseAvgPool1d, SparseAvgPool2d,
SparseAvgPool3d) SparseAvgPool3d, SparseGlobalMaxPool,
SparseGlobalAvgPool)
from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable from spconv.pytorch.tables import AddTable, ConcatTable, JoinTable
......
...@@ -2103,3 +2103,17 @@ def maximum_value_int_(ten: torch.Tensor, value: int): ...@@ -2103,3 +2103,17 @@ def maximum_value_int_(ten: torch.Tensor, value: int):
else: else:
assert not ten.is_cuda assert not ten.is_cuda
SpconvOps.maximum_value_int(torch_tensor_to_tv(ten), value, stream) SpconvOps.maximum_value_int(torch_tensor_to_tv(ten), value, stream)
def global_pool_rearrange(coords: torch.Tensor, batch_size: int):
is_cpu = not coords.is_cuda
stream = 0
if not is_cpu:
stream = get_current_stream()
out_indices = torch.empty((batch_size, coords.shape[0]), dtype=torch.int32, device=coords.device)
counts = torch.zeros((batch_size, ), dtype=torch.int32, device=coords.device)
SpconvOps.global_pool_rearrange(torch_tensor_to_tv(out_indices), torch_tensor_to_tv(coords),
torch_tensor_to_tv(counts), stream )
return out_indices, counts
\ No newline at end of file
...@@ -248,6 +248,42 @@ class SparseMaxPool(SparseModule): ...@@ -248,6 +248,42 @@ class SparseMaxPool(SparseModule):
out_tensor.spatial_shape = out_spatial_shape out_tensor.spatial_shape = out_spatial_shape
return out_tensor return out_tensor
class SparseGlobalMaxOrAvgPool(SparseModule):
"""TODO: deploy not supported. this implementation support
backward natively. for deploy, we should use single kernel with
smem based reduce.
"""
def __init__(self, is_mean: bool, name=None):
super(SparseGlobalMaxOrAvgPool, self).__init__(name=name)
self.is_mean = is_mean
def forward(self, input: spconv.SparseConvTensor):
is_int8 = input.is_quantized
assert not is_int8, "not implemented"
assert isinstance(input, spconv.SparseConvTensor)
out_indices, counts = ops.global_pool_rearrange(input.indices, input.batch_size)
counts_cpu = counts.cpu()
counts_cpu_np = counts_cpu.numpy()
res_features_list: List[torch.Tensor] = []
for i in range(input.batch_size):
real_inds = out_indices[i, :counts_cpu_np[i]]
real_features = input.features[real_inds]
if self.is_mean:
real_features_reduced = torch.mean(real_features, dim=0)[0]
else:
real_features_reduced = torch.max(real_features, dim=0)[0]
res_features_list.append(real_features_reduced)
res = torch.stack(res_features_list)
return res
class SparseGlobalAvgPool(SparseGlobalMaxOrAvgPool):
def __init__(self, name=None):
super(SparseGlobalAvgPool, self).__init__(is_mean=True, name=name)
class SparseGlobalMaxPool(SparseGlobalMaxOrAvgPool):
def __init__(self, name=None):
super(SparseGlobalMaxPool, self).__init__(is_mean=False, name=name)
class SparseAvgPool(SparseModule): class SparseAvgPool(SparseModule):
def __init__(self, def __init__(self,
......
import cProfile
import re
import pstats
import io
pr = cProfile.Profile()
pr.enable()
import spconv
pr.disable()
s = io.StringIO()
ps = pstats.Stats(pr, stream=s).sort_stats('tottime')
ps.print_stats()
with open('test.txt', 'w') as f:
f.write(s.getvalue())
...@@ -190,6 +190,20 @@ class SparseMaxPoolTestTorch(nn.Module): ...@@ -190,6 +190,20 @@ class SparseMaxPoolTestTorch(nn.Module):
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size) x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.net(x) # .dense() return self.net(x) # .dense()
class SparseGlobalMaxPoolTestTorch(nn.Module):
def __init__(self, shape):
super().__init__()
layers = [
spconv.SparseGlobalMaxPool()
]
self.net = spconv.SparseSequential(*layers, )
self.shape = shape
def forward(self, features, coors, batch_size):
coors = coors.int()
x = spconv.SparseConvTensor(features, coors, self.shape, batch_size)
return self.net(x) # .dense()
class MaxPool3dTestTorch(nn.Module): class MaxPool3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding, def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
...@@ -526,6 +540,63 @@ def test_spmaxpool3d(): ...@@ -526,6 +540,63 @@ def test_spmaxpool3d():
test_case.assertAllClose(din_np, din_sparse_np, atol=1e-4) test_case.assertAllClose(din_np, din_sparse_np, atol=1e-4)
def test_spglobalmaxpool3d():
test_case = TestCase()
np.random.seed(485)
devices = ["cpu:0", "cuda:0"]
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
channels = [64]
# ksizes = [2]
# strides = [2]
# paddings = [0]
# dilations = [1]
for dev, shape, bs, C in params_grid(
devices, shapes, batchsizes, channels):
device = torch.device(dev)
num_points = [1000] * bs
# when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict = generate_sparse_data(shape,
num_points,
C,
data_range=[0.1, 0.4])
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True
net = SparseGlobalMaxPoolTestTorch(shape).to(device)
net_ref = MaxPool3dTestTorch(1, 3, shape, shape, shape, 0, 1).to(device)
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs)
out_dense = out
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
test_case.assertAllClose(out_np.reshape(-1), out_ref_np.reshape(-1), atol=1e-4)
dout = np.random.uniform(
-0.2, 0.2, out_dense.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device).view(bs, C, 1, 1, 1)
out.backward(dout_t.reshape(bs, C))
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy()
test_case.assertAllClose(din_np, din_sparse_np, atol=1e-4)
if __name__ == "__main__": if __name__ == "__main__":
test_spmaxpool3d() test_spglobalmaxpool3d()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment