merge master code

06a01f0f · yan.yan · d03b947a · 370334aa · 06a01f0f · 06a01f0f
Commit 06a01f0f authored Dec 06, 2021 by yan.yan
8 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog

+## [2.1.19] - 2021-12-03
+### Fixed
+- Fix the wrong arch assert in all kernels for old GPUs so that spconv works on sm_50 GPUs.
+
+## [2.1.18] - 2021-11-29
+### Fixed
+- Fix a small bug of spatial_shape.
+- Fix a bug in PointToVoxel: it must always return a clone instead of a view.
+
 ## [2.1.17] - 2021-11-29
 ### Fixed
 - Fix a bug in sparse add.
+- Fix a serious bug in conv weight init.
 ### Added
 - Add more wrong usage check
 - Add insert_exist_keys for hash table

--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@

 Check [spconv 2.x algorithm introduction](docs/spconv2_algo.pdf) to understand sparse convolution algorithm in spconv 2.x!

-**WARNING** spconv < 2.1.4 users need to upgrade your version to 2.1.4, it fix a serious bug in SparseInverseConvXd.
+**WARNING** users of spconv < 2.1.18 need to upgrade to 2.1.18: it fixes a bug in conv weight init that made the std of the initialized weights too large, and a bug in PointToVoxel.

 ## Breaking changes in Spconv 2.x


--- a/setup.py
+++ b/setup.py
@@ -38,9 +38,9 @@ if cuda_ver:
    cuda_ver = cuda_ver.replace(".", "") # 10.2 to 102

    RELEASE_NAME += "-cu{}".format(cuda_ver)
-    deps = ["cumm-cu{}>=0.2.3".format(cuda_ver)]
+    deps = ["cumm-cu{}>=0.2.6".format(cuda_ver)]
 else:
-    deps = ["cumm>=0.2.3"]
+    deps = ["cumm>=0.2.6"]




--- a/spconv/core.py
+++ b/spconv/core.py
@@ -196,7 +196,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvFwdAndBwdInput, (32, 128, 16), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f16,f16"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -208,7 +208,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvFwdAndBwdInput, (32, 256, 8), (32, 64, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f16,f16"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -220,7 +220,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvFwdAndBwdInput, (32, 64, 16), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f16,f16"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -232,7 +232,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvFwdAndBwdInput, (32, 32, 32), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f16,f16"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -244,7 +244,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvFwdAndBwdInput, (64, 256, 8), (32, 64, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f16,f16"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -256,7 +256,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvFwdAndBwdInput, (64, 128, 8), (32, 64, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f16,f16"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -268,7 +268,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvFwdAndBwdInput, (64, 64, 8), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f16,f16"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -280,7 +280,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvFwdAndBwdInput, (64, 32, 16), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f16,f16"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -292,7 +292,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (32, 128, 16), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -306,7 +306,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (32, 64, 16), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -318,7 +318,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (32, 32, 32), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -330,7 +330,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (64, 256, 8), (32, 64, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -342,7 +342,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (64, 128, 8), (32, 64, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -354,7 +354,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (64, 64, 8), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -366,7 +366,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (64, 32, 16), (32, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -378,7 +378,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (128, 128, 8), (32, 64, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,
@@ -390,7 +390,7 @@ IMPLGEMM_SIMT_PARAMS = [
    *gen_conv_params(ConvBwdWeight, (128, 64, 8), (64, 32, 8),
                     NDIM_DONT_CARE,
                     ConvIterAlgo.Optimized,
-                     2, ["f32,f32,f32,f32,f32"],
+                     2, ["f32,f32,f32,f32,f32", "f16,f16,f16,f32,f32"],
                     NHWC,
                     NHWC,
                     NHWC,

--- a/spconv/pytorch/conv.py
+++ b/spconv/pytorch/conv.py
@@ -34,6 +34,7 @@ from spconv.pytorch.core import IndiceData, SparseConvTensor, ImplicitGemmIndice
 from spconv.pytorch.modules import SparseModule
 from spconv.constants import SAVED_WEIGHT_LAYOUT, ALL_WEIGHT_IS_KRSC
 from spconv.utils import nullcontext
+from torch.nn.init import calculate_gain

 FILTER_HWIO = False

@@ -51,39 +52,6 @@ def expand_nd(val: Union[int, List[int], Tuple[int, ...]], ndim: int) -> List[in
    return val


-def _calculate_fan_in_and_fan_out_hwio(tensor, algo: ConvAlgo):
-    dimensions = tensor.ndimension()
-    if dimensions < 2:
-        raise ValueError(
-            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
-        )
-
-    if dimensions == 2:  # Linear
-        fan_in = tensor.size(-2)
-        fan_out = tensor.size(-1)
-    else:
-        if algo == ConvAlgo.Native:
-            if FILTER_HWIO:
-                num_input_fmaps = tensor.size(-2)
-                num_output_fmaps = tensor.size(-1)
-            else:
-                num_input_fmaps = tensor.size(-1)
-                num_output_fmaps = tensor.size(-2)
-
-            receptive_field_size = 1
-            if tensor.dim() > 2:
-                receptive_field_size = tensor[..., 0, 0].numel()
-        else:
-            num_input_fmaps = tensor.size(-1)
-            num_output_fmaps = tensor.size(0)
-            receptive_field_size = 1
-            if tensor.dim() > 2:
-                receptive_field_size = int(np.prod(tensor.shape[1:-1]))
-
-        fan_in = num_input_fmaps * receptive_field_size
-        fan_out = num_output_fmaps * receptive_field_size
-
-    return fan_in, fan_out


 class SparseConvolution(SparseModule):
@@ -115,15 +83,18 @@ class SparseConvolution(SparseModule):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = expand_nd(ndim, kernel_size)
-        kv = int(np.prod(kernel_size))
-        kv_stride = int(np.prod(stride))
+        self.stride = expand_nd(ndim, stride)
+        kv = int(np.prod(self.kernel_size))
+        kv_stride = int(np.prod(self.stride))
+        self.dilation = expand_nd(ndim, dilation)
+        self.padding = expand_nd(ndim, padding)
+
        self.conv1x1 = kv == 1
        # TODO we should deprecate support for ksize == 1 but stride != 1.
        if not subm:
            self.conv1x1 &= kv_stride == 1
-        self.stride = expand_nd(ndim, stride)
-        self.padding = expand_nd(ndim, padding)
-        self.dilation = expand_nd(ndim, dilation)
+            if self.conv1x1:
+                assert self.padding == [0] * ndim, "padding must be zero for 1x1 conv (k=1,s=1)"
        self.transposed = transposed
        self.inverse = inverse
        self.output_padding = expand_nd(ndim, output_padding)
@@ -212,20 +183,39 @@ class SparseConvolution(SparseModule):
            s += f', algo={self.algo}'
        return s.format(**self.__dict__)

+    def _calculate_fan_in_and_fan_out(self):
+        receptive_field_size = 1
+        # math.prod is not always available, accumulate the product manually
+        # we could use functools.reduce but that is not supported by TorchScript
+        for s in self.kernel_size:
+            receptive_field_size *= s
+        fan_in = self.in_channels * receptive_field_size
+        fan_out = self.out_channels * receptive_field_size
+        return fan_in, fan_out
+
+    def _calculate_correct_fan(self, mode):
+        mode = mode.lower()
+        valid_modes = ['fan_in', 'fan_out']
+        if mode not in valid_modes:
+            raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes))
+
+        fan_in, fan_out = self._calculate_fan_in_and_fan_out()
+        return fan_in if mode == 'fan_in' else fan_out
+
+    def _custom_kaiming_uniform_(self, tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
+        r"""same as torch.init.kaiming_uniform_, with KRSC layout support
+        """
+        fan = self._calculate_correct_fan(mode)
+        gain = calculate_gain(nonlinearity, a)
+        std = gain / math.sqrt(fan)
+        bound = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation
+        with torch.no_grad():
+            return tensor.uniform_(-bound, bound)
+
    def reset_parameters(self):
-        n = self.in_channels
-        # following commented code is used to make weight different layout have same value
-        # if self.algo != ConvAlgo.Native:
-        #     weight2 = self.weight.data.permute(1, 2, 3, 0,
-        #                                        4).contiguous().clone()
-        #     init.uniform_(weight2, 0, 0.001)
-        #     self.weight.data[:] = weight2.permute(3, 0, 1, 2, 4)
-        # else:
-        #     init.uniform_(self.weight, 0, 0.001)
-        init.kaiming_uniform_(self.weight, a=math.sqrt(0.005))
+        self._custom_kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
-            fan_in, _ = _calculate_fan_in_and_fan_out_hwio(
-                self.weight, self.algo)
+            fan_in, _ = self._calculate_fan_in_and_fan_out()
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

@@ -318,14 +308,14 @@ class SparseConvolution(SparseModule):
                    indice_pairs = datas.indice_pairs
                    indice_pair_num = datas.indice_pair_num
                    out_spatial_shape = datas.spatial_shape
-                    assert indice_pair_num.shape[0] == np.prod(
-                        self.kernel_size
-                    ), "inverse conv must have same kernel size as its couple conv"
+                    assert datas.ksize == self.kernel_size, "inverse conv must have same kernel size as its couple conv"
                else:
                    if self.indice_key is not None and datas is not None:
                        outids = datas.out_indices
                        indice_pairs = datas.indice_pairs
                        indice_pair_num = datas.indice_pair_num
+                        assert self.subm, "only support reuse subm indices"
+                        self._check_subm_reuse_valid(input, spatial_shape, datas)
                    else:
                        if input.benchmark:
                            torch.cuda.synchronize()
@@ -416,19 +406,8 @@ class SparseConvolution(SparseModule):
                        mask_argsort_fwd_splits = datas.mask_argsort_fwd_splits
                        mask_argsort_bwd_splits = datas.mask_argsort_bwd_splits
                        masks = datas.masks
-                        assert datas.is_subm, "only support reuse subm indices"
-                        if self.kernel_size != datas.ksize:
-                            raise ValueError(f"subm with same indice_key must have same kernel"
-                                f" size, expect {datas.ksize}, this layer {self.kernel_size}")
-                        if self.dilation != datas.dilation:
-                            raise ValueError(f"subm with same indice_key must have same dilation"
-                                f", expect {datas.dilation}, this layer {self.dilation}")
-                        if input.spatial_shape != datas.spatial_shape:
-                            raise ValueError(f"subm with same indice_key must have same spatial structure"
-                                f", expect {datas.spatial_shape}, input {spatial_shape}")
-                        if input.indices.shape[0] != datas.indices.shape[0]:
-                            raise ValueError(f"subm with same indice_key must have same num of indices"
-                                f", expect {datas.indices.shape[0]}, input {input.indices.shape[0]}")
+                        assert self.subm, "only support reuse subm indices"
+                        self._check_subm_reuse_valid(input, spatial_shape, datas)
                    else:

                        with input._timer.namespace("gen_pairs"):
@@ -518,6 +497,22 @@ class SparseConvolution(SparseModule):
        return out_tensor


+    def _check_subm_reuse_valid(self, inp: SparseConvTensor, spatial_shape: List[int], datas: Union[ImplicitGemmIndiceData, IndiceData]):
+        assert datas.is_subm, "only support reuse subm indices"
+        if self.kernel_size != datas.ksize:
+            raise ValueError(f"subm with same indice_key must have same kernel"
+                f" size, expect {datas.ksize}, this layer {self.kernel_size}")
+        if self.dilation != datas.dilation:
+            raise ValueError(f"subm with same indice_key must have same dilation"
+                f", expect {datas.dilation}, this layer {self.dilation}")
+        if inp.spatial_shape != datas.spatial_shape:
+            raise ValueError(f"subm with same indice_key must have same spatial structure"
+                f", expect {datas.spatial_shape}, input {spatial_shape}")
+        if inp.indices.shape[0] != datas.indices.shape[0]:
+            raise ValueError(f"subm with same indice_key must have same num of indices"
+                f", expect {datas.indices.shape[0]}, input {inp.indices.shape[0]}")
+
+
 class SparseConv1d(SparseConvolution):
    def __init__(self,
                 in_channels,

--- a/spconv/pytorch/core.py
+++ b/spconv/pytorch/core.py
@@ -124,7 +124,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
    def __init__(self,
                 features: torch.Tensor,
                 indices: torch.Tensor,
-                 spatial_shape: List[int],
+                 spatial_shape: Union[List[int], np.ndarray],
                 batch_size: int,
                 grid: Optional[torch.Tensor] = None,
                 voxel_num: Optional[torch.Tensor] = None,
@@ -154,7 +154,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
        assert batch_size > 0
        self._features = features
        self.indices = indices
-        self.spatial_shape = spatial_shape
+        self.spatial_shape = [int(v) for v in spatial_shape]
        self.batch_size = batch_size
        if indice_dict is None:
            indice_dict = {}
@@ -253,12 +253,14 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
        tensor.force_algo = self.force_algo
        return tensor

-def expand_nd(ndim: int, val: Union[int, List[int], Tuple[int, ...]]) -> List[int]:
+def expand_nd(ndim: int, val: Union[int, List[int], Tuple[int, ...], np.ndarray]) -> List[int]:
    if isinstance(val, int):
        res = [val] * ndim 
    elif isinstance(val, tuple):
        res = list(val)
+    elif isinstance(val, np.ndarray):
+        res = list(val)
    else:
        res = val
    assert len(res) == ndim
-    return res 
+    return [int(v) for v in res] 
--- a/spconv/pytorch/hash.py
+++ b/spconv/pytorch/hash.py
@@ -79,7 +79,7 @@ class HashTable:

    def query(self, keys: torch.Tensor, values: Optional[torch.Tensor] = None):
        """query value by keys, if values is not None, create a new one.
-        return values and a uint8 tensor that whether query success.
+        return values and a uint8 tensor that whether query fail.
        """
        keys_tv = torch_tensor_to_tv(keys)
        if values is None:
@@ -95,17 +95,17 @@ class HashTable:

    def insert_exist_keys(self, keys: torch.Tensor, values: torch.Tensor):
        """insert kv that k exists in table. return a uint8 tensor that
-        whether insert success.
+        whether insert fail.
        """
        keys_tv = torch_tensor_to_tv(keys)
        values_tv = torch_tensor_to_tv(values)
        stream = 0
        if not self.is_cpu:
            stream = get_current_stream()
-        is_success = torch.empty([keys.shape[0]], dtype=torch.uint8, device=keys.device)
-        is_success_tv = torch_tensor_to_tv(is_success)
-        self._table.insert_exist_keys(keys_tv, values_tv, is_success_tv, stream)
-        return is_success
+        is_empty = torch.empty([keys.shape[0]], dtype=torch.uint8, device=keys.device)
+        is_empty_tv = torch_tensor_to_tv(is_empty)
+        self._table.insert_exist_keys(keys_tv, values_tv, is_empty_tv, stream)
+        return is_empty

    def assign_arange_(self):
        """iterate table, assign values with "arange" value.

--- a/spconv/pytorch/utils.py
+++ b/spconv/pytorch/utils.py
@@ -153,8 +153,8 @@ class PointToVoxel(object):
                                                clear_voxels)
                num_voxels = res[0].shape[0]

-            return (self.voxels[:num_voxels], self.indices[:num_voxels],
-                    self.num_per_voxel[:num_voxels], pc_voxel_id)
+            return (self.voxels[:num_voxels].clone(), self.indices[:num_voxels].clone(),
+                    self.num_per_voxel[:num_voxels].clone(), pc_voxel_id)


 def gather_features_by_pc_voxel_id(seg_res_features: torch.Tensor, pc_voxel_id: torch.Tensor, invalid_value: Union[int, float] = 0):