Commit bf011c76 authored by yan.yan's avatar yan.yan
Browse files

temp commit

parent 4791f582
...@@ -35,6 +35,20 @@ from spconv.utils import nullcontext ...@@ -35,6 +35,20 @@ from spconv.utils import nullcontext
FILTER_HWIO = False FILTER_HWIO = False
def expand_nd(val: Union[int, List[int], Tuple[int, ...]], ndim: int) -> List[int]:
    """Expand a scalar or a per-axis sequence into a list of ``ndim`` ints.

    Args:
        val: a single int (repeated for every axis) or a list/tuple that
            already holds one value per axis.
        ndim: number of axes the result must have.

    Returns:
        A new list with ``ndim`` entries. The input sequence is never
        returned by reference, so callers may mutate the result freely.

    Raises:
        AssertionError: if a list/tuple is given whose length is not ``ndim``.
        NotImplementedError: if ``val`` is neither int, list nor tuple.
    """
    if isinstance(val, int):
        return [val] * ndim
    if isinstance(val, (list, tuple)):
        assert len(val) == ndim, (
            f"expected {ndim} values, but got {len(val)}: {val}")
        # Always copy: previously a list input was returned by alias while a
        # tuple input was copied, so the result's ownership depended on type.
        return list(val)
    raise NotImplementedError(
        f"unsupported type for expand_nd: {type(val).__name__}")
def _calculate_fan_in_and_fan_out_hwio(tensor, algo: ConvAlgo): def _calculate_fan_in_and_fan_out_hwio(tensor, algo: ConvAlgo):
dimensions = tensor.ndimension() dimensions = tensor.ndimension()
if dimensions < 2: if dimensions < 2:
...@@ -110,7 +124,9 @@ class SparseConvolution(SparseModule): ...@@ -110,7 +124,9 @@ class SparseConvolution(SparseModule):
self.out_channels = out_channels self.out_channels = out_channels
self.kernel_size = kernel_size self.kernel_size = kernel_size
kv = int(np.prod(kernel_size)) kv = int(np.prod(kernel_size))
self.conv1x1 = kv == 1 kv_stride = int(np.prod(kernel_size))
self.conv1x1 = kv == 1 and kv_stride == 1
self.stride = stride self.stride = stride
self.padding = padding self.padding = padding
self.dilation = dilation self.dilation = dilation
......
...@@ -104,7 +104,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta): ...@@ -104,7 +104,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
indice_dict: Optional[dict] = None, indice_dict: Optional[dict] = None,
benchmark: bool = False, benchmark: bool = False,
permanent_thrust_allocator: bool = False, permanent_thrust_allocator: bool = False,
enable_timer: bool = False): enable_timer: bool = False,
force_algo: Optional[ConvAlgo] = None):
""" """
Args: Args:
features: [num_points, num_features] feature tensor features: [num_points, num_features] feature tensor
...@@ -115,6 +116,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta): ...@@ -115,6 +116,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
is very large. is very large.
benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to benchmark: whether to enable benchmark. if enabled, all sparse operators will be record to
SparseConvTensor. SparseConvTensor.
enable_timer: if enabled, the run time of all spconv internal ops will be recorded in _timer.
force_algo: force conv/pool layers to use this algo; should only be used for debugging.
""" """
ndim = indices.shape[1] - 1 ndim = indices.shape[1] - 1
assert features.ndim == 2 assert features.ndim == 2
...@@ -139,6 +142,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta): ...@@ -139,6 +142,7 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
if permanent_thrust_allocator: if permanent_thrust_allocator:
self.thrust_allocator = ThrustSortAllocator(features.device) self.thrust_allocator = ThrustSortAllocator(features.device)
self._timer = CUDAKernelTimer(enable_timer) self._timer = CUDAKernelTimer(enable_timer)
self.force_algo = force_algo
def replace_feature(self, feature: torch.Tensor): def replace_feature(self, feature: torch.Tensor):
"""we need to replace x.features = F.relu(x.features) with x = x.replace_feature(F.relu(x.features)) """we need to replace x.features = F.relu(x.features) with x = x.replace_feature(F.relu(x.features))
...@@ -152,6 +156,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta): ...@@ -152,6 +156,8 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
new_spt.benchmark_record = self.benchmark_record new_spt.benchmark_record = self.benchmark_record
new_spt.thrust_allocator = self.thrust_allocator new_spt.thrust_allocator = self.thrust_allocator
new_spt._timer = self._timer new_spt._timer = self._timer
new_spt.force_algo = self.force_algo
return new_spt return new_spt
@property @property
...@@ -217,4 +223,5 @@ class SparseConvTensor(metaclass=SpConvTensorMeta): ...@@ -217,4 +223,5 @@ class SparseConvTensor(metaclass=SpConvTensorMeta):
tensor.benchmark_record = self.benchmark_record tensor.benchmark_record = self.benchmark_record
tensor.thrust_allocator = self.thrust_allocator tensor.thrust_allocator = self.thrust_allocator
tensor._timer = self._timer tensor._timer = self._timer
tensor.force_algo = self.force_algo
return tensor return tensor
...@@ -30,7 +30,8 @@ _TORCH_DTYPE_TO_TV = { ...@@ -30,7 +30,8 @@ _TORCH_DTYPE_TO_TV = {
def torch_tensor_to_tv(ten: torch.Tensor, def torch_tensor_to_tv(ten: torch.Tensor,
dtype: Optional[int] = None, dtype: Optional[int] = None,
shape: Optional[List[int]] = None): shape: Optional[List[int]] = None,
stride: Optional[List[int]] = None):
# assert ten.is_contiguous(), "must be contiguous tensor" # assert ten.is_contiguous(), "must be contiguous tensor"
ptr = ten.data_ptr() ptr = ten.data_ptr()
device = ten.device device = ten.device
...@@ -40,11 +41,20 @@ def torch_tensor_to_tv(ten: torch.Tensor, ...@@ -40,11 +41,20 @@ def torch_tensor_to_tv(ten: torch.Tensor,
tv_device = 0 tv_device = 0
else: else:
raise NotImplementedError raise NotImplementedError
if shape is None:
shape = list(ten.shape)
if dtype is None: if dtype is None:
dtype = _TORCH_DTYPE_TO_TV[ten.dtype] dtype = _TORCH_DTYPE_TO_TV[ten.dtype]
return tv.from_blob(ptr, shape, list(ten.stride()), dtype, tv_device) if stride is None:
stride = list(ten.stride())
if shape is None:
shape = list(ten.shape)
else:
if not ten.is_contiguous():
msg = "if you provide custom shape for non-contig tensor, stride must not None"
assert stride is not None, msg
else:
# custom shape, if tensor is contiguous, we use from_blob and calc strides
return tv.from_blob(ptr, shape, dtype, tv_device)
return tv.from_blob_strided(ptr, shape, stride, dtype, tv_device)
def get_current_stream(): def get_current_stream():
......
...@@ -137,6 +137,7 @@ class SparseSequential(SparseModule): ...@@ -137,6 +137,7 @@ class SparseSequential(SparseModule):
input = module(input) input = module(input)
else: else:
if isinstance(input, spconv.SparseConvTensor): if isinstance(input, spconv.SparseConvTensor):
print(input.features.shape)
if input.indices.shape[0] != 0: if input.indices.shape[0] != 0:
input = input.replace_feature(module(input.features)) input = input.replace_feature(module(input.features))
else: else:
......
...@@ -1066,7 +1066,7 @@ def indice_conv_backward(features: torch.Tensor, ...@@ -1066,7 +1066,7 @@ def indice_conv_backward(features: torch.Tensor,
alpha=1.0, alpha=1.0,
beta=beta) beta=beta)
if not FILTER_HWIO: if is_KC_not_CK:
a = out_bp_tv a = out_bp_tv
b = features_tv b = features_tv
a_inds = out_indices a_inds = out_indices
...@@ -1376,6 +1376,9 @@ def implicit_gemm_backward(features: torch.Tensor, ...@@ -1376,6 +1376,9 @@ def implicit_gemm_backward(features: torch.Tensor,
mask_width=-1, mask_width=-1,
beta=beta, beta=beta,
stream=stream) stream=stream)
# for backward weight, beta = 0 because each split
# handle different kernel locations.
# TODO remove D iterator in backward weight kernel
CONV.run_with_tuned_result( CONV.run_with_tuned_result(
wgrad_tune_res, wgrad_tune_res,
ConvOpType.kBackwardWeight, ConvOpType.kBackwardWeight,
...@@ -1389,7 +1392,7 @@ def implicit_gemm_backward(features: torch.Tensor, ...@@ -1389,7 +1392,7 @@ def implicit_gemm_backward(features: torch.Tensor,
reverse_mask=False, reverse_mask=False,
mask_filter=masks[j].item(), mask_filter=masks[j].item(),
mask_width=mask_width, mask_width=mask_width,
beta=beta, beta=0,
workspace=workspace_tv, workspace=workspace_tv,
stream=stream) stream=stream)
...@@ -1403,6 +1406,8 @@ def indice_maxpool(features: torch.Tensor, indice_pairs: torch.Tensor, ...@@ -1403,6 +1406,8 @@ def indice_maxpool(features: torch.Tensor, indice_pairs: torch.Tensor,
# stream = get_current_stream() # stream = get_current_stream()
# CONV.stream_synchronize(stream) # CONV.stream_synchronize(stream)
# t = time.time() # t = time.time()
if not features.is_contiguous():
features = features.contiguous()
out_channel = features.shape[-1] out_channel = features.shape[-1]
out_features = torch.zeros((num_activate_out, out_channel), out_features = torch.zeros((num_activate_out, out_channel),
...@@ -1474,6 +1479,8 @@ def indice_maxpool_implicit_gemm(features: torch.Tensor, ...@@ -1474,6 +1479,8 @@ def indice_maxpool_implicit_gemm(features: torch.Tensor,
stream = get_current_stream() stream = get_current_stream()
# CONV.stream_synchronize(stream) # CONV.stream_synchronize(stream)
# t = time.time() # t = time.time()
if not features.is_contiguous():
features = features.contiguous()
out_channel = features.shape[-1] out_channel = features.shape[-1]
out_features = torch.empty((num_activate_out, out_channel), out_features = torch.empty((num_activate_out, out_channel),
......
...@@ -71,36 +71,72 @@ class PointToVoxel(object): ...@@ -71,36 +71,72 @@ class PointToVoxel(object):
pc: torch.Tensor, pc: torch.Tensor,
clear_voxels: bool = True, clear_voxels: bool = True,
empty_mean: bool = False): empty_mean: bool = False):
"""generate voxels/indices/num_point_per_voxel/pc_voxel_ids from
point cloud.
This function doesn't return pc_voxel_id, for backward compatibility.
pc_voxel_id will be added in spconv 2.2.
Args:
pc: [N, 3+] point cloud.
clear_voxels: if True, call zero on voxels
empty_mean: if True, fill empty locations of voxels with the mean.
Returns:
voxels: voxels
indices: quantized coords
num_per_voxel: number of points in a voxel
"""
res = self.generate_voxel_with_id(pc, clear_voxels, empty_mean)
return res[0], res[1], res[2]
def generate_voxel_with_id(self,
pc: torch.Tensor,
clear_voxels: bool = True,
empty_mean: bool = False):
"""generate voxels/indices/num_point_per_voxel/pc_voxel_ids from
point cloud.
Args:
pc: [N, 3+] point cloud.
clear_voxels: if True, call zero on voxels
empty_mean: if True, fill empty locations of voxels with the mean.
Returns:
voxels: voxels
indices: quantized coords
num_per_voxel: number of points in a voxel
pc_voxel_id: voxel id for every point. if not exists, -1.
"""
assert pc.device.type == self.device.type, "your pc device is wrong" assert pc.device.type == self.device.type, "your pc device is wrong"
expected_hash_data_num = pc.shape[0] * 2 expected_hash_data_num = pc.shape[0] * 2
with torch.no_grad(): with torch.no_grad():
pc_voxel_id = torch.empty([pc.shape[0]],
dtype=torch.int64,
device=self.device)
pc_voxel_id_tv = torch_tensor_to_tv(pc_voxel_id)
if self.device.type != "cpu": if self.device.type != "cpu":
if self.hashdata.shape[0] < expected_hash_data_num: hashdata = torch.empty([expected_hash_data_num, 2],
self.hashdata = torch.empty([expected_hash_data_num, 2], dtype=torch.int64,
dtype=torch.int64, device=pc.device)
device=self.device)
point_indice_data = torch.empty([pc.shape[0]],
dtype=torch.int64,
device=pc.device)
if self.point_indice_data.shape[0] < pc.shape[0]:
self.point_indice_data = torch.empty([pc.shape[0]],
dtype=torch.int64,
device=self.device)
pc_tv = torch_tensor_to_tv(pc) pc_tv = torch_tensor_to_tv(pc)
stream = get_current_stream() stream = get_current_stream()
voxels_tv = torch_tensor_to_tv(self.voxels) voxels_tv = torch_tensor_to_tv(self.voxels)
indices_tv = torch_tensor_to_tv(self.indices) indices_tv = torch_tensor_to_tv(self.indices)
num_per_voxel_tv = torch_tensor_to_tv(self.num_per_voxel) num_per_voxel_tv = torch_tensor_to_tv(self.num_per_voxel)
hashdata_tv = torch_tensor_to_tv( hashdata_tv = torch_tensor_to_tv(
self.hashdata, hashdata,
dtype=tv.custom128, dtype=tv.custom128,
shape=[self.hashdata.shape[0]]) shape=[hashdata.shape[0]])
point_indice_data_tv = torch_tensor_to_tv( point_indice_data_tv = torch_tensor_to_tv(point_indice_data)
self.point_indice_data) with torch.cuda.device(pc.device):
res = SpconvOps.point2voxel_cuda(
res = SpconvOps.point2voxel_cuda( pc_tv, voxels_tv, indices_tv, num_per_voxel_tv,
pc_tv, voxels_tv, indices_tv, num_per_voxel_tv, hashdata_tv, point_indice_data_tv, pc_voxel_id_tv, self.vsize,
hashdata_tv, point_indice_data_tv, self.vsize, self.grid_size, self.grid_stride, self.coors_range,
self.grid_size, self.grid_stride, self.coors_range, empty_mean, clear_voxels, stream)
empty_mean, clear_voxels, stream)
num_voxels = res[0].shape[0] num_voxels = res[0].shape[0]
else: else:
pc_tv = torch_tensor_to_tv(pc) pc_tv = torch_tensor_to_tv(pc)
...@@ -111,6 +147,7 @@ class PointToVoxel(object): ...@@ -111,6 +147,7 @@ class PointToVoxel(object):
hashdata_tv = torch_tensor_to_tv(self.hashdata, dtype=tv.int32) hashdata_tv = torch_tensor_to_tv(self.hashdata, dtype=tv.int32)
res = SpconvOps.point2voxel_cpu(pc_tv, voxels_tv, indices_tv, res = SpconvOps.point2voxel_cpu(pc_tv, voxels_tv, indices_tv,
num_per_voxel_tv, hashdata_tv, num_per_voxel_tv, hashdata_tv,
pc_voxel_id_tv,
self.vsize, self.grid_size, self.vsize, self.grid_size,
self.grid_stride, self.grid_stride,
self.coors_range, empty_mean, self.coors_range, empty_mean,
...@@ -118,4 +155,4 @@ class PointToVoxel(object): ...@@ -118,4 +155,4 @@ class PointToVoxel(object):
num_voxels = res[0].shape[0] num_voxels = res[0].shape[0]
return (self.voxels[:num_voxels], self.indices[:num_voxels], return (self.voxels[:num_voxels], self.indices[:num_voxels],
self.num_per_voxel[:num_voxels]) self.num_per_voxel[:num_voxels], pc_voxel_id)
...@@ -24,7 +24,7 @@ from spconv.core import ConvAlgo ...@@ -24,7 +24,7 @@ from spconv.core import ConvAlgo
import spconv.pytorch as spconv import spconv.pytorch as spconv
from spconv.utils import Point2VoxelCPU3d from spconv.utils import Point2VoxelCPU3d
# torch.backends.cudnn.enabled = False
def waymo_data(batch_size=1): def waymo_data(batch_size=1):
gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3, gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
150000, 1) 150000, 1)
...@@ -289,7 +289,7 @@ def main(): ...@@ -289,7 +289,7 @@ def main():
voxels_th = torch.from_numpy(voxels).to(device).to(dtype) voxels_th = torch.from_numpy(voxels).to(device).to(dtype)
coors_th = torch.from_numpy(coors).to(device).int() coors_th = torch.from_numpy(coors).to(device).int()
voxels_th.requires_grad = True voxels_th.requires_grad = True
algo = spconv.ConvAlgo.Native algo = spconv.ConvAlgo.MaskImplicitGemm
# 3080 Laptop # 3080 Laptop
# MaskImpGemm: 11.2ms # MaskImpGemm: 11.2ms
# MaskSplitImpGemm: 12.2ms # MaskSplitImpGemm: 12.2ms
...@@ -324,26 +324,26 @@ def main(): ...@@ -324,26 +324,26 @@ def main():
print(out.spatial_shape, out.features.mean(), out.features.max(), print(out.spatial_shape, out.features.mean(), out.features.max(),
out.features.min()) out.features.min())
# times = [] times = []
# with torch.no_grad(): with torch.no_grad():
# for i in range(20): for i in range(20):
# print("------------") print("------------")
# torch.cuda.synchronize() torch.cuda.synchronize()
# t = time.time() t = time.time()
# out_nograd = net(voxels_th, coors_th, 1, False) out_nograd = net(voxels_th, coors_th, 1, False)
# timer = out_nograd._timer timer = out_nograd._timer
# # res = timer.collect_by_name("forward", timer.get_all_pair_time()) # res = timer.collect_by_name("forward", timer.get_all_pair_time())
# # res2 = timer.collect_by_name("forward0", timer.get_all_pair_time()) # res2 = timer.collect_by_name("forward0", timer.get_all_pair_time())
# # print(sum(res.values()) + sum(res2.values())) # print(sum(res.values()) + sum(res2.values()))
# # print(timer.get_all_pair_time()) # print(timer.get_all_pair_time())
# # print(sum(timer.get_all_pair_time().values())) # print(sum(timer.get_all_pair_time().values()))
# torch.cuda.synchronize() torch.cuda.synchronize()
# # sort_bench() # sort_bench()
# times.append(time.time() - t) times.append(time.time() - t)
# print("spconv time", np.mean(times[10:])) print("spconv time", np.mean(times[10:]))
# times = [] times = []
# for i in range(10): # for i in range(10):
# out = net(voxels_th, coors_th, 1) # out = net(voxels_th, coors_th, 1)
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test all gemm/conv kernels.
We can't test all kernels in network because auto-tuner will only find one best kernel.
"""
import sys
from pathlib import Path
from typing import Dict, List, Tuple
import pickle
import sys
import time
from pathlib import Path
from cumm.gemm.algospec.core import GemmAlgo, ShuffleStrideType
import numpy as np
import pccm
import torch
import torch.nn.functional as F
from spconv.test_utils import TestCase
from cumm import tensorview as tv
from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType
import os
from cumm.gemm.codeops import div_up
from spconv.core import AlgoHint, ConvAlgo
from spconv.pytorch.conv import expand_nd
from spconv.pytorch import ops
from spconv.algo import CONV, GEMM, BestAlgoByProfile, BestConvAlgoByProfile
from spconv.pytorch.cppcore import get_current_stream, torch_tensor_to_tv
from spconv.test_utils import generate_sparse_data, params_grid
import tqdm
from spconv.constants import ALL_WEIGHT_IS_KRSC
# Fail at import time unless the KRSC weight-layout flag is set; the tests
# below build weights as [K, *ksize, C].
assert ALL_WEIGHT_IS_KRSC is True, "we only support KRSC in spconv >= 2.2"
# TODO remove or release this when tf32 op is ready
# Until then TF32 is switched off for torch matmul and cudnn (presumably so
# torch-side results keep full fp32 precision - confirm).
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
# NumPy scalar type -> matching torch dtype; used by SparseConvTester to
# allocate torch operands with the same element type as the NumPy data.
NUMPY_DTYPE_TO_TORCH = {
    np.float32: torch.float32,
    np.float16: torch.float16,
    np.int8: torch.int8,
}
class SparseConvTester:
    """Random sparse-conv operands plus NumPy reference results.

    The constructor generates random sparse input, computes indice pairs
    (always with the native algorithm for the reference, and additionally
    with ``algo`` when it is an implicit-gemm variant), draws random
    input/weight/output arrays and precomputes reference forward output,
    input gradient and weight gradient with plain NumPy matmuls.

    Weights are stored as ``[K, *ksize, C]``; the reference code works on
    ``weight_ref`` of shape ``[KV, K, C]`` (``KV = prod(ksize)``).
    """

    def __init__(self, algo: ConvAlgo, subm: bool, shape: List[int], bs: int, dtype: np.dtype, N: int, K: int, C: int,
                 ksize: int, stride: int, padding: int, dilation: int) -> None:
        """Generate data, indice pairs and reference results.

        Args:
            algo: conv algorithm whose indice-pair layout is prepared.
            subm: passed to the indice-pair generators; also selects the
                submanifold handling in the reference computation.
            shape: spatial shape handed to the data/pair generators.
            bs: batch size.
            dtype: NumPy scalar type, must be a key of NUMPY_DTYPE_TO_TORCH.
            N: number of points generated per batch element.
            K: number of output channels.
            C: number of input channels.
            ksize, stride, padding, dilation: conv parameters; each int is
                expanded to three axes with ``expand_nd``.
        """
        ndim = 3
        self.shape = shape
        self.bs = bs
        self.dtype = dtype
        self.dtype_th = NUMPY_DTYPE_TO_TORCH[dtype]
        self.K = K
        self.C = C
        self.ksize = expand_nd(ksize, ndim)
        self.stride = expand_nd(stride, ndim)
        self.padding = expand_nd(padding, ndim)
        self.dilation = expand_nd(dilation, ndim)
        self.N = N
        self.device = torch.device("cuda:0")
        # all-zero per-axis list passed to the pair generators
        # (presumably the output padding - confirm against ops signature)
        op = expand_nd(0, ndim)
        # np.prod returns a NumPy scalar; convert so the ``int`` annotation holds
        self.kv: int = int(np.prod(self.ksize))
        self.num_split = 1 if algo == ConvAlgo.MaskImplicitGemm else 2
        # honour N instead of a hard-coded 1500 points per batch element
        sparse_dict = generate_sparse_data(shape, [N] * bs, C)

        voxels_np = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        # move column 3 to the front (presumably the batch index - confirm)
        indices_np = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        indices_th = torch.from_numpy(indices_np).to(self.device)

        # Native pairs are always computed: the NumPy reference uses them.
        # Use the real batch size, consistent with the implicit-gemm call below.
        out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
            indices_th, bs, shape, ConvAlgo.Native, self.ksize, self.stride, self.padding,
            self.dilation, op, subm)
        self.indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
        self.indice_pairs_np = pair_ref.cpu().numpy()
        self.pair_native = pair_ref
        self.indice_num_per_loc = indice_num_per_loc
        if algo == ConvAlgo.Native:
            self.out_inds: torch.Tensor = out_inds
            self.num_inds_per_loc: torch.Tensor = indice_num_per_loc
            # implicit-gemm specific members stay empty for the native algo
            self.pair_fwd: torch.Tensor = torch.Tensor()
            self.pair_bwd: torch.Tensor = torch.Tensor()
            self.pair_mask_fwd_splits: List[torch.Tensor] = []
            self.pair_mask_bwd_splits: List[torch.Tensor] = []
            self.mask_argsort_fwd_splits: List[torch.Tensor] = []
            self.mask_argsort_bwd_splits: List[torch.Tensor] = []
            self.masks = np.array([])
        else:
            res = ops.get_indice_pairs_implicit_gemm(indices_th, bs, shape,
                                                     algo, self.ksize, self.stride, self.padding,
                                                     self.dilation, op, subm=subm)
            self.out_inds = res[0]
            self.num_inds_per_loc = res[1]
            self.pair_fwd = res[2]
            self.pair_bwd = res[3]
            self.pair_mask_fwd_splits = res[4]
            self.pair_mask_bwd_splits = res[5]
            self.mask_argsort_fwd_splits = res[6]
            self.mask_argsort_bwd_splits = res[7]
            self.masks = res[8]
        self.voxels_np = voxels_np
        self.indices_np = indices_np

        self.subm = subm
        if dtype == np.int8:
            self.inp = np.random.randint(-2, 2, size=[voxels_np.shape[0],
                                                      C]).astype(np.int8)
            self.weight = np.random.randint(-2, 2, size=[K, *self.ksize,
                                                         C]).astype(np.int8)
            self.output = np.random.randint(-2, 2, size=[
                self.out_inds.shape[0], K
            ]).astype(dtype)
        else:
            self.inp = np.random.uniform(-1, 1, size=[
                voxels_np.shape[0], C
            ]).astype(dtype)
            self.weight = np.random.uniform(-1, 1, size=[K, *self.ksize, C]).astype(dtype)
            self.output = np.random.uniform(-1, 1, size=[
                self.out_inds.shape[0], K
            ]).astype(dtype)
        # [K, k0, k1, k2, C] -> [k0, k1, k2, K, C] -> [KV, K, C]
        self.weight_ref = self.weight.transpose(1, 2, 3, 0, 4)
        self.weight_ref = np.ascontiguousarray(self.weight_ref).reshape(-1, K, C)
        self.out_ref, self.din_ref, self.dw_ref = self._get_ref_output()
        # bring the weight gradient back to the [K, *ksize, C] layout
        self.dw_ref = np.ascontiguousarray(self.dw_ref.transpose(1, 0, 2).reshape(K, *self.ksize, C))

    def _get_ref_output(self):
        """Compute float32 reference results with per-offset NumPy matmuls.

        Returns:
            Tuple ``(output_ref, dinput_ref, dw_ref)``: forward output shaped
            like ``self.output``, input gradient shaped like ``self.inp`` and
            weight gradient shaped like ``self.weight_ref`` ([KV, K, C]).
        """
        output_ref = np.zeros_like(self.output, dtype=np.float32)
        dinput_ref = np.zeros_like(self.inp, dtype=np.float32)
        dw_ref = np.zeros_like(self.weight_ref,
                               dtype=np.float32)  # KV, K, C

        for filter_offset in range(self.kv):
            # For subm, offsets past the centre take the pair count of the
            # mirrored offset, and the centre offset uses every voxel.
            if self.subm and filter_offset > self.kv // 2:
                nhot = self.indice_num_per_loc_np[self.kv - 1 - filter_offset]
            elif self.subm and filter_offset == self.kv // 2:
                nhot = self.voxels_np.shape[0]
            else:
                nhot = self.indice_num_per_loc_np[filter_offset]

            i_inds = self.indice_pairs_np[0][filter_offset][:nhot]
            o_inds = self.indice_pairs_np[1][filter_offset][:nhot]
            # forward: NC @ CK
            a = self.inp[i_inds]
            cc = a.astype(
                np.float32) @ self.weight_ref[filter_offset].T.astype(
                    np.float32)
            output_ref[o_inds] += cc
            a = self.output[o_inds]
            # NK @ KC
            cc = a.astype(
                np.float32) @ self.weight_ref[filter_offset].astype(
                    np.float32)
            dinput_ref[i_inds] += cc
            out_gather = self.output[o_inds]  # [N, K]
            inp_gather = self.inp[i_inds]  # [N, C]
            # KN @ NC
            dw_res = out_gather.astype(
                np.float32).T @ inp_gather.astype(np.float32)
            dw_ref[filter_offset] = dw_res
        return output_ref, dinput_ref, dw_ref

    def get_operands(self, op_type: ConvOpType):
        """Return ``(inp, weight, output)`` as tensorview tensors on device 0.

        The operand that ``op_type`` writes (output for forward, input for
        backward-input, weight for backward-weight) is freshly allocated -
        zero-filled when not subm, uninitialised when subm; the other two
        are copies of the stored random arrays.
        """
        zeros_func = tv.zeros if not self.subm else tv.empty
        if op_type == ConvOpType.kBackwardInput:
            inp_tv = zeros_func(list(self.inp.shape), self.dtype, 0)
        else:
            inp_tv = tv.from_numpy(self.inp).cuda()
        if op_type == ConvOpType.kBackwardWeight:
            weight_tv = zeros_func(list(self.weight.shape), self.dtype, 0)
        else:
            weight_tv = tv.from_numpy(self.weight).cuda()
        if op_type == ConvOpType.kForward:
            output_tv = zeros_func(list(self.output.shape), self.dtype, 0)
        else:
            output_tv = tv.from_numpy(self.output).cuda()
        return inp_tv, weight_tv, output_tv

    def get_operands_torch(self, op_type: ConvOpType):
        """Same as ``get_operands`` but returns torch tensors on ``self.device``."""
        zeros_func = torch.zeros if not self.subm else torch.empty
        if op_type == ConvOpType.kBackwardInput:
            inp_tv = zeros_func(list(self.inp.shape), dtype=self.dtype_th, device=self.device)
        else:
            inp_tv = torch.from_numpy(self.inp).cuda()
        if op_type == ConvOpType.kBackwardWeight:
            weight_tv = zeros_func(list(self.weight.shape), dtype=self.dtype_th, device=self.device)
        else:
            weight_tv = torch.from_numpy(self.weight).cuda()
        if op_type == ConvOpType.kForward:
            output_tv = zeros_func(list(self.output.shape), dtype=self.dtype_th, device=self.device)
        else:
            output_tv = torch.from_numpy(self.output).cuda()
        return inp_tv, weight_tv, output_tv
def _test_impgemm_conv_cuda(subm: bool):
    """Run every available implicit-gemm conv descriptor against NumPy references.

    For each combination in the parameter grid a ``SparseConvTester`` is
    built. Forward and backward-input are run for every descriptor returned
    by ``CONV.get_all_available`` and compared with the tester's reference
    arrays; afterwards backward-weight is run for several ``spk`` values
    (1, 4, 16, 64) using the mask outputs collected in the first phase.

    float32 results are compared with ``assertAllClose``; float16 results
    are accepted when the L2 norm of the difference is below a fixed bound.

    Args:
        subm: forwarded to ``SparseConvTester`` and used to pick the
            parameter grid and the indice-pair/mask selection below.
    """
    ndim = 3
    # (atol, rtol) per dtype
    dtype_to_tol = {
        np.float32: (1e-4, 1e-4),
        np.float16: (1e-2, 1e-2),
        np.int8: (1e-4, 1e-4),
    }
    device = torch.device("cuda:0")
    shapes = [[19, 18, 17]]
    batchsizes = [1]
    dtypes = [np.float32, np.float16]
    test_case = TestCase()
    in_channels = [32, 47]
    out_channels = [32, 48, 62]
    if subm:
        ksizes = [3]
        strides = [1]
        paddings = [0]
        dilations = [1]
    else:
        ksizes = [2, 3]
        strides = [1, 2, 3]
        paddings = [0, 1]
        dilations = [1, 2]
    algos = [
        ConvAlgo.MaskSplitImplicitGemm,
        ConvAlgo.MaskImplicitGemm,
    ]
    arch = torch.cuda.get_device_capability()

    for shape, bs, C, K, k, s, p, d, algo, dtype in tqdm.tqdm(params_grid(
            shapes, batchsizes, in_channels, out_channels, ksizes,
            strides, paddings, dilations, algos, dtypes)):
        tester = SparseConvTester(algo, subm, shape, bs, dtype, 1500, K, C, k, s, p, d)
        atol, rtol = dtype_to_tol[dtype]
        # one mask-output tensor per mask width, reused across descriptors;
        # the fwd dict is consumed again by the backward-weight phase below
        mask_width_to_mask_out_fwd: Dict[int, torch.Tensor] = {}
        mask_width_to_mask_out_bwd: Dict[int, torch.Tensor] = {}

        # ---- phase 1: forward and backward-input ----
        op_types = [ConvOpType.kForward, ConvOpType.kBackwardInput]
        spk = 1
        for op_type in op_types:
            inp_tv, weight_tv, output_tv = tester.get_operands(op_type)
            avail_desps = CONV.get_all_available(inp_tv, weight_tv, output_tv, NHWC, NHWC, NHWC, arch, op_type, -1)
            for desp in avail_desps:
                # clear the written operand before each descriptor (not subm only)
                if not subm:
                    if op_type == ConvOpType.kForward:
                        output_tv.zero_()
                    else:
                        inp_tv.zero_()
                # this algo must success
                # mask width is the first entry of the descriptor's tile shape
                mask_width = desp.tile_shape[0]
                # if mask_width != 32:
                # continue
                if mask_width not in mask_width_to_mask_out_fwd:
                    mask_width_to_mask_out_fwd[mask_width] = torch.zeros([2, div_up(tester.out_inds.shape[0], mask_width)],
                                                                         dtype=torch.int32,
                                                                         device=tester.device)
                mask_output_fwd = mask_width_to_mask_out_fwd[mask_width]

                if subm:
                    if desp.op_type == ConvOpType.kForward.value:
                        indice_pairs = tester.pair_fwd
                    elif desp.op_type == ConvOpType.kBackwardInput.value:
                        indice_pairs = tester.pair_bwd
                    else:
                        indice_pairs = tester.pair_fwd
                    mask_output = mask_output_fwd
                    # print([bin(x.item()) for x in masks])
                    for j in range(tester.num_split):
                        # beta is 1 only for the second split
                        # (presumably so it accumulates onto the first - confirm)
                        beta = 1 if j == 1 else 0
                        mask_filter = tester.masks[j].item()

                        reverse_mask = False
                        if desp.op_type == ConvOpType.kBackwardWeight.value:
                            mask_op = mask_output[j]
                        else:
                            mask_op = tester.pair_mask_fwd_splits[j]
                        if desp.op_type == ConvOpType.kBackwardInput.value:
                            reverse_mask = True
                        mask_output_run = torch_tensor_to_tv(mask_output[j], dtype=tv.uint32)
                        if desp.op_type == ConvOpType.kBackwardWeight.value:
                            mask_output_run = tv.Tensor()
                        CONV.run_with_tuned_result(
                            BestConvAlgoByProfile(desp, spk),
                            desp.op_type,
                            inp_tv,
                            weight_tv,
                            output_tv,
                            torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                            torch_tensor_to_tv(tester.mask_argsort_fwd_splits[j]),
                            mask_output_run,
                            torch_tensor_to_tv(indice_pairs),
                            reverse_mask,
                            mask_filter=mask_filter,
                            mask_width=mask_width,
                            beta=beta,
                            verbose=False,
                        )
                else:
                    # the bwd mask output is sized by the number of input indices
                    if mask_width not in mask_width_to_mask_out_bwd:
                        mask_width_to_mask_out_bwd[mask_width] = torch.zeros([2, div_up(tester.indices_np.shape[0], mask_width)],
                                                                             dtype=torch.int32,
                                                                             device=tester.device)
                    mask_output_bwd = mask_width_to_mask_out_bwd[mask_width]

                    if desp.op_type == ConvOpType.kForward.value:
                        indice_pairs = tester.pair_fwd  # inp -> out
                        mask_ops = tester.pair_mask_fwd_splits
                        mask_argsorts = tester.mask_argsort_fwd_splits
                        mask_output = mask_output_fwd
                    elif desp.op_type == ConvOpType.kBackwardInput.value:
                        indice_pairs = tester.pair_bwd  # out -> inp
                        mask_ops = tester.pair_mask_bwd_splits
                        mask_argsorts = tester.mask_argsort_bwd_splits
                        mask_output = mask_output_bwd
                    else:
                        indice_pairs = tester.pair_fwd  # inp -> out
                        mask_ops = tester.pair_mask_fwd_splits
                        mask_argsorts = tester.mask_argsort_fwd_splits
                        mask_output = mask_output_fwd

                    for j in range(tester.num_split):
                        beta = 1 if j == 1 else 0
                        mask_filter = tester.masks[j].item()
                        reverse_mask = False
                        if desp.op_type == ConvOpType.kBackwardWeight.value:
                            mask_op = mask_output[j]
                        else:
                            mask_op = mask_ops[j]

                        CONV.run_with_tuned_result(
                            BestConvAlgoByProfile(desp, spk),
                            desp.op_type,
                            inp_tv,
                            weight_tv,
                            output_tv,
                            torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                            torch_tensor_to_tv(mask_argsorts[j]),
                            torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
                            torch_tensor_to_tv(indice_pairs),
                            reverse_mask,
                            mask_filter=mask_filter,
                            mask_width=mask_width,
                            beta=beta,
                            verbose=False,
                        )

                # compare this descriptor's result with the NumPy reference
                out_ref = tester.out_ref
                din_ref = tester.din_ref
                dw_ref = tester.dw_ref
                if op_type == ConvOpType.kForward:
                    out_my = output_tv.cpu().numpy()
                    if dtype != np.float16:
                        test_case.assertAllClose(out_ref, out_my, atol=atol, rtol=rtol)
                    else:
                        # float16: bound the L2 norm of the error instead
                        error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
                        assert error_norm < 5
                        # print(desp, )
                else:
                    din_my = inp_tv.cpu().numpy()
                    if dtype != np.float16:
                        test_case.assertAllClose(din_ref, din_my, atol=atol, rtol=rtol)
                    else:
                        error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1))
                        assert error_norm < 10, f"{desp}, {error_norm}, {k}, {s}, {p}, {d}"

        # ---- phase 2: backward-weight, using mask outputs from phase 1 ----
        inp_tv, weight_tv, output_tv = tester.get_operands(ConvOpType.kBackwardWeight)
        for spk in [1, 4, 16, 64]:
            for mask_width, mask_output in mask_width_to_mask_out_fwd.items():
                avail_desps = CONV.get_all_available(inp_tv, weight_tv, output_tv, NHWC, NHWC, NHWC, arch, ConvOpType.kBackwardWeight, mask_width)
                for desp in avail_desps:
                    weight_tv.zero_()
                    if subm:
                        indice_pairs = tester.pair_fwd
                        for j in range(tester.num_split):
                            # beta stays 0 for every split in backward-weight
                            beta = 0
                            mask_filter = tester.masks[j].item()
                            mask_op = mask_output[j]
                            mask_op_tv = torch_tensor_to_tv(mask_op, dtype=tv.uint32)
                            # mask_op_np = mask_op_tv.cpu().numpy()
                            # bit_ref = np.bitwise_or.reduce(mask_op_np, axis=0)
                            # bit_my = mask_filter
                            CONV.run_with_tuned_result(
                                BestConvAlgoByProfile(desp, spk),
                                desp.op_type,
                                inp_tv,
                                weight_tv,
                                output_tv,
                                mask_op_tv,
                                torch_tensor_to_tv(tester.mask_argsort_fwd_splits[j]),
                                tv.Tensor(),
                                torch_tensor_to_tv(indice_pairs),
                                reverse_mask=False,
                                mask_filter=mask_filter,
                                mask_width=mask_width,
                                beta=beta,
                                verbose=False,
                            )
                    else:
                        indice_pairs = tester.pair_fwd  # inp -> out
                        mask_ops = tester.pair_mask_fwd_splits
                        mask_argsorts = tester.mask_argsort_fwd_splits
                        for j in range(tester.num_split):
                            # beta = 1 if j == 1 else 0
                            beta = 0
                            mask_filter = tester.masks[j].item()
                            reverse_mask = False
                            mask_op = mask_output[j]

                            CONV.run_with_tuned_result(
                                BestConvAlgoByProfile(desp, spk),
                                desp.op_type,
                                inp_tv,
                                weight_tv,
                                output_tv,
                                torch_tensor_to_tv(mask_op, dtype=tv.uint32),
                                torch_tensor_to_tv(mask_argsorts[j]),
                                torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
                                torch_tensor_to_tv(indice_pairs),
                                reverse_mask,
                                mask_filter=mask_filter,
                                mask_width=mask_width,
                                beta=beta,
                                verbose=False,
                            )
                    dw_ref = tester.dw_ref
                    dw_my = weight_tv.cpu().numpy()
                    if dtype != np.float16:
                        # print(desp, spk, K, C, mask_width, algo)
                        test_case.assertAllClose(dw_ref, dw_my, atol=atol, rtol=rtol)
                    else:
                        error_norm = np.linalg.norm(dw_ref.reshape(-1) - dw_my.reshape(-1))
                        # print(desp, error_norm)
                        assert error_norm < 5
def _test_native_conv_cuda(subm: bool) -> None:
    """Check the indexed-GEMM ("native") conv path against the tester's references.

    For every combination produced by ``params_grid`` a ``SparseConvTester``
    is built with ``ConvAlgo.Native``. For each of the three op types
    (forward, backward-input, backward-weight) every GEMM descriptor returned
    by ``GEMM.get_all_available`` is run once per kernel offset, and the
    result is compared with ``tester.out_ref`` / ``tester.din_ref`` /
    ``tester.dw_ref``.

    Args:
        subm: Forwarded to ``SparseConvTester`` and used to pick the
            parameter grid. When true, the kernel-center offset is computed
            with a plain ``torch.mm`` and skipped in the per-offset loop
            (presumably the submanifold-convolution case; confirm against
            ``SparseConvTester``).

    Raises:
        AssertionError: If a result deviates from its reference beyond the
            tolerance / error-norm threshold used for the dtype.
    """
    ndim = 3  # not referenced again in this function
    # (atol, rtol) per dtype; only the entries for dtypes listed in `dtypes` are looked up.
    dtype_to_tol = {
        np.float32: (1e-4, 1e-4),
        np.float16: (1e-2, 1e-2),
        np.int8: (1e-4, 1e-4),
    }
    device = torch.device("cuda:0")  # not referenced again in this function
    shapes = [[19, 18, 17]]
    batchsizes = [1]
    dtypes = [np.float32, np.float16]
    test_case = TestCase()
    in_channels = [32, 47]
    out_channels = [32, 48, 62]
    if subm:
        ksizes = [3, 5]
        strides = [1]
        paddings = [0]
        dilations = [1]
    else:
        ksizes = [2, 3]
        strides = [1, 2, 3]
        paddings = [0, 1]
        dilations = [1, 2]
    arch = torch.cuda.get_device_capability()
    stream = get_current_stream()
    for shape, bs, C, K, k, s, p, d, dtype in tqdm.tqdm(params_grid(
            shapes, batchsizes, in_channels, out_channels, ksizes,
            strides, paddings, dilations, dtypes)):
        tester = SparseConvTester(ConvAlgo.Native, subm, shape, bs, dtype, 1500, K, C, k, s, p, d)
        atol, rtol = dtype_to_tol[dtype]
        kv_center = tester.kv // 2
        kv = tester.kv
        # pair_native is indexed as [0] for input-side and [1] for output-side indices.
        pair_in = torch_tensor_to_tv(tester.pair_native)[0]
        pair_out = torch_tensor_to_tv(tester.pair_native)[1]
        op_types = [ConvOpType.kForward, ConvOpType.kBackwardInput, ConvOpType.kBackwardWeight]
        indice_pair_num_cpu = tester.indice_num_per_loc_np
        spk = 1  # not referenced again in this function
        out_ref = tester.out_ref
        din_ref = tester.din_ref
        # Reshaped to (K, -1, C) to match the weight view used for every op type below.
        dw_ref = tester.dw_ref.reshape(K, -1, C)
        for op_type in op_types:
            inp_th, weight_th, output_th = tester.get_operands_torch(op_type)
            weight_th = weight_th.view(K, -1, C)
            inp_tv = torch_tensor_to_tv(inp_th)
            weight_tv = torch_tensor_to_tv(weight_th)
            output_tv = torch_tensor_to_tv(output_th)
            if op_type == ConvOpType.kForward:
                a = inp_tv
                c = output_tv
                # The center slice is only used to query the available descriptors;
                # `b` is re-selected per kernel offset inside the loop.
                b = weight_tv.select(1, tester.kv // 2)
                # NOTE(review): the three booleans look like trans_a / trans_b / trans_c - confirm.
                avail_desps = GEMM.get_all_available(a, b, c, False, True, False, arch, ShuffleStrideType.ShuffleAC)
                for desp in avail_desps:
                    if subm:
                        # Center offset: dense product written straight into the output buffer.
                        torch.mm(inp_th, weight_th[:, tester.kv // 2].T, out=output_th)
                    else:
                        output_tv.zero_()
                    # The first GEMM overwrites (beta=0) unless the output already holds
                    # the center contribution; later GEMMs accumulate (beta=1).
                    inited = subm
                    for i, nhot in enumerate(indice_pair_num_cpu):
                        if subm and i == kv_center:
                            continue
                        if subm and i > kv_center:
                            # Offsets past the center use the count stored at the mirrored index.
                            nhot = indice_pair_num_cpu[kv - i - 1]
                        if nhot <= 0:
                            continue
                        inp_indices = pair_in[i].slice_first_axis(0, nhot)
                        out_indices = pair_out[i].slice_first_axis(0, nhot)
                        b = weight_tv.select(1, i)
                        # inp @ filter.T, NC @ KC
                        beta = 1.0 if inited else 0.0
                        GEMM.run_with_tuned_result(
                            BestAlgoByProfile(desp, 1),
                            a,
                            b,
                            c,
                            False,
                            True,
                            False,
                            arch=arch,
                            stream=stream,
                            shuffle_type=ShuffleStrideType.ShuffleAC,
                            a_inds=inp_indices,
                            c_inds=out_indices,
                            hint=AlgoHint.Fowrard.value,
                            alpha=1.0,
                            beta=beta)
                        inited = True
                    out_my = output_tv.cpu().numpy()
                    if dtype != np.float16:
                        # error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
                        # assert error_norm < 1
                        # print(desp, K, C, k, error_norm)
                        test_case.assertAllClose(out_ref, out_my, atol=atol, rtol=rtol)
                    else:
                        # float16 results are only checked through the norm of the difference.
                        error_norm = np.linalg.norm(out_ref.reshape(-1) - out_my.reshape(-1))
                        assert error_norm < 10
            elif op_type == ConvOpType.kBackwardInput:
                a = output_tv
                b = weight_tv.select(1, tester.kv // 2)
                c = inp_tv
                avail_desps = GEMM.get_all_available(a, b, c, False, False, False, arch, ShuffleStrideType.ShuffleAC)
                for desp in avail_desps:
                    if subm:
                        # Center offset: dense product written straight into the input buffer.
                        torch.mm(output_th, weight_th[:, tester.kv // 2], out=inp_th)
                    else:
                        inp_tv.zero_()
                    inited = subm
                    for i, nhot in enumerate(indice_pair_num_cpu):
                        if subm and i == kv_center:
                            continue
                        if subm and i > kv_center:
                            nhot = indice_pair_num_cpu[kv - i - 1]
                        if nhot <= 0:
                            continue
                        inp_indices = pair_in[i].slice_first_axis(0, nhot)
                        out_indices = pair_out[i].slice_first_axis(0, nhot)
                        b = weight_tv.select(1, i)
                        # Here `a` is the output-side operand and the filter slice is passed
                        # with all transpose flags False.
                        beta = 1.0 if inited else 0.0
                        # Rows are gathered with the output indices and scattered with the
                        # input indices (the reverse of the forward case).
                        # NOTE(review): the forward hint is passed for the backward-input op -
                        # confirm whether a dedicated backward-input hint is intended.
                        GEMM.run_with_tuned_result(
                            BestAlgoByProfile(desp, 1),
                            a,
                            b,
                            c,
                            False,
                            False,
                            False,
                            arch=arch,
                            stream=stream,
                            shuffle_type=ShuffleStrideType.ShuffleAC,
                            a_inds=out_indices,
                            c_inds=inp_indices,
                            hint=AlgoHint.Fowrard.value,
                            alpha=1.0,
                            beta=beta)
                        inited = True
                    din_my = inp_tv.cpu().numpy()
                    if dtype != np.float16:
                        # error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1))
                        # print(desp, K, C, k, error_norm)
                        test_case.assertAllClose(din_ref, din_my, atol=atol, rtol=rtol)
                        # assert error_norm < 1
                    else:
                        error_norm = np.linalg.norm(din_ref.reshape(-1) - din_my.reshape(-1))
                        assert error_norm < 10
            else:
                # Remaining op type in `op_types`: ConvOpType.kBackwardWeight.
                a = output_tv
                b = inp_tv
                c = weight_tv.select(1, tester.kv // 2)
                avail_desps = GEMM.get_all_available(a, b, c, True, False, False, arch, ShuffleStrideType.ShuffleAB)
                for desp in avail_desps:
                    # `inited` is never flipped inside the loop below, so beta stays
                    # 1.0 when subm is true and 0.0 otherwise; every offset writes to its
                    # own slice of the freshly zeroed weight tensor.
                    inited = subm
                    weight_tv.zero_()
                    if subm:
                        # Center offset: dense product written into the center weight slice.
                        torch.mm(output_th.T, inp_th, out=weight_th[:, kv_center])
                    for i, nhot in enumerate(indice_pair_num_cpu):
                        if subm and i == kv_center:
                            continue
                        if subm and i > kv_center:
                            nhot = indice_pair_num_cpu[kv - i - 1]
                        if nhot <= 0:
                            continue
                        beta = 1.0 if inited else 0.0
                        inp_indices = pair_in[i].slice_first_axis(0, nhot)
                        out_indices = pair_out[i].slice_first_axis(0, nhot)
                        a_inds = out_indices
                        b_inds = inp_indices
                        # Both operands are gathered (ShuffleAB); the destination is the
                        # weight slice of the current kernel offset.
                        GEMM.run_with_tuned_result(BestAlgoByProfile(desp, 32),
                                                   a,
                                                   b,
                                                   weight_tv.select(1, i),
                                                   True,
                                                   False,
                                                   False,
                                                   arch=arch,
                                                   stream=stream,
                                                   shuffle_type=ShuffleStrideType.ShuffleAB,
                                                   a_inds=a_inds,
                                                   b_inds=b_inds,
                                                   hint=AlgoHint.BackwardWeight.value,
                                                   alpha=1.0,
                                                   beta=beta)
                    dw_my = weight_tv.cpu().numpy()
                    if dtype != np.float16:
                        # Weight gradients are checked through the error norm for every dtype;
                        # the element-wise comparison is left disabled below.
                        error_norm = np.linalg.norm(dw_ref.reshape(-1) - dw_my.reshape(-1))
                        assert error_norm < 1
                        # test_case.assertAllClose(dw_ref, dw_my, atol=atol, rtol=rtol)
                        # print(desp, error_norm)
                    else:
                        error_norm = np.linalg.norm(dw_ref.reshape(-1) - dw_my.reshape(-1))
                        # print(desp, error_norm)
                        assert error_norm < 10
def test_all_algo_unit():
    """Run the implicit-GEMM checks, then the native checks, each with subm True and then False."""
    for run_checks in (_test_impgemm_conv_cuda, _test_native_conv_cuda):
        for subm in (True, False):
            run_checks(subm)
# Allow running this module directly as a script, without a test runner.
if __name__ == "__main__":
    test_all_algo_unit()
\ No newline at end of file
...@@ -12,6 +12,12 @@ ...@@ -12,6 +12,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""Compare results between sparse and dense layers:
SparseConvXd
SparseConvTransposeXd
SparseMaxPoolXd
"""
import time import time
import unittest import unittest
from pathlib import Path from pathlib import Path
...@@ -24,13 +30,11 @@ from spconv.core import ConvAlgo ...@@ -24,13 +30,11 @@ from spconv.core import ConvAlgo
import spconv.pytorch as spconv import spconv.pytorch as spconv
from spconv.test_utils import TestCase, generate_sparse_data, params_grid from spconv.test_utils import TestCase, generate_sparse_data, params_grid
from spconv.constants import ALL_WEIGHT_IS_KRSC, FILTER_HWIO from spconv.constants import ALL_WEIGHT_IS_KRSC, FILTER_HWIO
# import sparseconvnet as scn
# we must disable tf32 to increase reference precision. # we must disable tf32 to increase reference precision.
torch.backends.cuda.matmul.allow_tf32 = False torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False torch.backends.cudnn.allow_tf32 = False
class SparseConv3dTestTorch(nn.Module): class SparseConv3dTestTorch(nn.Module):
def __init__(self, def __init__(self,
num_layers, num_layers,
...@@ -76,52 +80,6 @@ class SparseConv3dTestTorch(nn.Module): ...@@ -76,52 +80,6 @@ class SparseConv3dTestTorch(nn.Module):
self.grid) self.grid)
return self.net(x) # .dense() return self.net(x) # .dense()
class SubMConv3dTestTorch(nn.Module):
    """Test network made of bias-free ``spconv.SubMConv3d`` layers.

    The first layer maps ``in_channels`` to ``out_channels``; each of the
    ``num_layers - 1`` following layers maps ``out_channels`` to
    ``out_channels``. All layers share kernel size, stride, padding,
    dilation and ``algo``. ``ndim`` is accepted but not used.
    """

    def __init__(self,
                 num_layers,
                 ndim,
                 shape,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 dilation,
                 algo=spconv.ConvAlgo.Native):
        super().__init__()

        def build_conv(num_inputs):
            # Every layer differs only in its number of input channels.
            return spconv.SubMConv3d(num_inputs,
                                     out_channels,
                                     kernel_size,
                                     stride,
                                     padding=padding,
                                     dilation=dilation,
                                     bias=False,
                                     algo=algo)

        input_widths = [in_channels]
        input_widths.extend(out_channels for _ in range(1, num_layers))
        self.net = spconv.SparseSequential(
            *[build_conv(width) for width in input_widths])
        # No pre-allocated grid is handed to SparseConvTensor.
        self.grid = None
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack on it."""
        sparse_input = spconv.SparseConvTensor(features, coors.int(),
                                               self.shape, batch_size,
                                               self.grid)
        return self.net(sparse_input)
class Conv3dTestTorch(nn.Module): class Conv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride, padding, dilation): kernel_size, stride, padding, dilation):
...@@ -150,11 +108,11 @@ class Conv3dTestTorch(nn.Module): ...@@ -150,11 +108,11 @@ class Conv3dTestTorch(nn.Module):
def forward(self, x): def forward(self, x):
return self.net(x) # .dense() return self.net(x) # .dense()
class SparseDeConv3dTestTorch(nn.Module): class SparseDeConv3dTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, in_channels, out_channels, def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
kernel_size, stride, padding, dilation): kernel_size, stride, padding, dilation, algo):
super().__init__() super().__init__()
self.algo = algo
layers = [ layers = [
spconv.SparseConvTranspose3d(in_channels, spconv.SparseConvTranspose3d(in_channels,
out_channels, out_channels,
...@@ -162,7 +120,8 @@ class SparseDeConv3dTestTorch(nn.Module): ...@@ -162,7 +120,8 @@ class SparseDeConv3dTestTorch(nn.Module):
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False) bias=False,
algo=algo)
] ]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append( layers.append(
...@@ -172,7 +131,8 @@ class SparseDeConv3dTestTorch(nn.Module): ...@@ -172,7 +131,8 @@ class SparseDeConv3dTestTorch(nn.Module):
stride, stride,
padding=padding, padding=padding,
dilation=dilation, dilation=dilation,
bias=False)) bias=False,
algo=algo))
self.net = spconv.SparseSequential(*layers, ) self.net = spconv.SparseSequential(*layers, )
self.shape = shape self.shape = shape
...@@ -213,14 +173,15 @@ class DeConv3dTestTorch(nn.Module): ...@@ -213,14 +173,15 @@ class DeConv3dTestTorch(nn.Module):
class SparseMaxPoolTestTorch(nn.Module): class SparseMaxPoolTestTorch(nn.Module):
def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding, def __init__(self, num_layers, ndim, shape, kernel_size, stride, padding,
dilation): dilation, algo):
super().__init__() super().__init__()
self.algo = algo
layers = [ layers = [
spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation) spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation, algo=algo)
] ]
for i in range(1, num_layers): for i in range(1, num_layers):
layers.append( layers.append(
spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation)) spconv.SparseMaxPool3d(kernel_size, stride, padding, dilation, algo=algo))
self.net = spconv.SparseSequential(*layers, ) self.net = spconv.SparseSequential(*layers, )
self.shape = shape self.shape = shape
...@@ -243,86 +204,6 @@ class MaxPool3dTestTorch(nn.Module): ...@@ -243,86 +204,6 @@ class MaxPool3dTestTorch(nn.Module):
def forward(self, x): def forward(self, x):
return self.net(x) # .dense() return self.net(x) # .dense()
class SubmanifoldConvTestTorch(nn.Module):
    """Test network of bias-free ``spconv.SubMConv3d`` layers inside ``nn.Sequential``.

    Only the first layer is given ``indice_key="subm0"``; the remaining
    ``num_layers - 1`` layers are created without an indice key. ``ndim``
    and ``stride`` are accepted but not used.
    """

    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
                 kernel_size, stride):
        super().__init__()
        convs = [
            spconv.SubMConv3d(in_channels,
                              out_channels,
                              kernel_size,
                              bias=False,
                              indice_key="subm0")
        ]
        convs.extend(
            spconv.SubMConv3d(out_channels,
                              out_channels,
                              kernel_size,
                              bias=False)
            for _ in range(1, num_layers))
        self.net = nn.Sequential(*convs)
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Build a ``SparseConvTensor`` from the inputs and pass it through the layers."""
        sparse_input = spconv.SparseConvTensor(features, coors.int(),
                                               self.shape, batch_size)
        return self.net(sparse_input)
class SCNCoupleDeConvTest(nn.Module):
    """Convolution -> Deconvolution -> SparseToDense pipeline built from ``scn`` layers.

    Relies on a module-level name ``scn`` being in scope (presumably the
    ``sparseconvnet`` package - verify the import at the top of the file).
    ``num_layers`` is accepted but not used.
    """

    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
                 kernel_size, stride):
        super().__init__()
        self.scn_input = scn.InputLayer(ndim, shape, mode=0)
        conv = scn.Convolution(ndim,
                               in_channels,
                               out_channels,
                               kernel_size,
                               stride,
                               bias=False)
        # The deconvolution maps back from out_channels to in_channels.
        deconv = scn.Deconvolution(ndim,
                                   out_channels,
                                   in_channels,
                                   kernel_size,
                                   stride,
                                   bias=False)
        to_dense = scn.SparseToDense(ndim, in_channels)
        self.net = nn.Sequential(conv, deconv, to_dense)

    def forward(self, features, coors, batch_size):
        """Feed ``(coors, features)`` to the input layer, then run the pipeline.

        ``coors`` is converted to a long tensor on the CPU first;
        ``batch_size`` is not used.
        """
        scn_tensor = self.scn_input((coors.long().cpu(), features))
        return self.net(scn_tensor)
class SparseCoupleDeConvTest(nn.Module):
    """``SparseConv3d`` followed by ``SparseInverseConv3d``, both using indice key ``"cp0"``.

    The result of the two layers is converted with ``spconv.ToDense``.
    ``num_layers`` and ``ndim`` are accepted but not used.
    """

    def __init__(self, num_layers, ndim, shape, in_channels, out_channels,
                 kernel_size, stride):
        super().__init__()
        conv = spconv.SparseConv3d(in_channels,
                                   out_channels,
                                   kernel_size,
                                   stride,
                                   indice_key="cp0",
                                   bias=False)
        # The inverse convolution maps back from out_channels to in_channels.
        inverse_conv = spconv.SparseInverseConv3d(out_channels,
                                                  in_channels,
                                                  kernel_size,
                                                  indice_key="cp0",
                                                  bias=False)
        self.net = spconv.SparseSequential(conv, inverse_conv)
        self.todense = spconv.ToDense()
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Run the conv / inverse-conv pair on the sparse input and densify the result."""
        sparse_input = spconv.SparseConvTensor(features, coors.int(),
                                               self.shape, batch_size)
        sparse_output = self.net(sparse_input)
        return self.todense(sparse_output)
def gather_nd(params, indices): def gather_nd(params, indices):
# this function has a limit that MAX_ADVINDEX_CALC_DIMS=5 # this function has a limit that MAX_ADVINDEX_CALC_DIMS=5
ndim = indices.shape[-1] ndim = indices.shape[-1]
...@@ -349,367 +230,147 @@ def scatter_nd(indices, updates, shape): ...@@ -349,367 +230,147 @@ def scatter_nd(indices, updates, shape):
ret[slices] = updates.view(*output_shape) ret[slices] = updates.view(*output_shape)
return ret return ret
def test_spconv3d():
test_case = TestCase()
np.random.seed(484)
torch.manual_seed(48848)
devices = ["cuda:0"]
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
class TestSpConv(TestCase): in_channels = [32]
def testSpConv3d(self): out_channels = [32, 48, 64]
np.random.seed(484) ksizes = [2, 3]
torch.manual_seed(48848) strides = [1, 2, 3]
devices = ["cuda:0"] paddings = [0, 1, 2]
shapes = [[19, 18, 17]] dilations = [1, 2, 3]
batchsizes = [1, 2] algos = [
ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
in_channels = [32] ConvAlgo.MaskSplitImplicitGemm
out_channels = [32, 48, 64] ]
ksizes = [2, 3] # algos = [ConvAlgo.Native]
strides = [1, 2, 3]
paddings = [0, 1, 2] for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
dilations = [1, 2, 3] devices, shapes, batchsizes, in_channels, out_channels, ksizes,
algos = [ strides, paddings, dilations, algos):
ConvAlgo.Native, ConvAlgo.MaskImplicitGemm, if all([s > 1, d > 1]):
ConvAlgo.MaskSplitImplicitGemm continue # don't support this.
] # print(dev, shape, bs, IC, OC, k, s, p, d)
# algos = [ConvAlgo.Native] device = torch.device(dev)
num_points = [1500] * bs
for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid( dtype = torch.float32
devices, shapes, batchsizes, in_channels, out_channels, ksizes, net = SparseConv3dTestTorch(1,
strides, paddings, dilations, algos): 3,
if all([s > 1, d > 1]): shape,
continue # don't support this. IC,
# print(dev, shape, bs, IC, OC, k, s, p, d) OC,
device = torch.device(dev) k,
num_points = [1000] * bs s,
dtype = torch.float32 p,
net = SparseConv3dTestTorch(1, d,
3, algo=al).to(device).to(dtype)
shape, net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
IC, d).to(device).to(dtype)
OC,
k, sparse_dict = generate_sparse_data(shape, num_points, IC)
s,
p, features = np.ascontiguousarray(sparse_dict["features"]).astype(
d, np.float32)
algo=al).to(device).to(dtype) indices = np.ascontiguousarray(
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
d).to(device).to(dtype) features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
sparse_dict = generate_sparse_data(shape, num_points, IC) features_t = torch.from_numpy(features).to(device).to(dtype)
features_t.requires_grad = True
features = np.ascontiguousarray(sparse_dict["features"]).astype( features_dense_t = torch.from_numpy(features_dense).to(device).to(
np.float32) dtype)
indices = np.ascontiguousarray( features_dense_t.requires_grad = True
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device).to(dtype)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device).to(
dtype)
features_dense_t.requires_grad = True
if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
if FILTER_HWIO:
filters = np.random.uniform(-1, 1,
size=[k, k, k, IC,
OC]).astype(np.float32)
else:
filters = np.random.uniform(-1, 1,
size=[k, k, k, OC,
IC]).astype(np.float32)
filters_t = torch.from_numpy(filters).to(device).to(dtype)
if FILTER_HWIO:
net_ref.net[0].weight.data[:] = filters_t.permute(
4, 3, 0, 1, 2).contiguous()
else:
net_ref.net[0].weight.data[:] = filters_t.permute(
3, 4, 0, 1, 2).contiguous()
else:
filters = np.random.uniform(-1, 1,
size=[OC, k, k, k,
IC]).astype(np.float32)
filters_t = torch.from_numpy(filters).to(device).to(dtype)
net_ref.net[0].weight.data[:] = filters_t.permute(
0, 4, 1, 2, 3).contiguous()
net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t)
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy()
for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
if FILTER_HWIO:
dw = dw.transpose(4, 3, 0, 1, 2)
else:
dw = dw.transpose(3, 4, 0, 1, 2)
else:
# OHWI -> OIHW
dw = dw.transpose(0, 4, 1, 2, 3)
self.assertAllClose(dw, dw_ref, atol=1e-4)
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
def testSpDeConv3d(self):
np.random.seed(484)
devices = ["cuda:0"]
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
in_channels = [64]
out_channels = [32, 48, 64]
ksizes = [2, 3]
strides = [2, 3]
paddings = [0, 1, 2]
dilations = [1, 2, 3]
ksizes = [3]
strides = [1]
paddings = [0]
dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue # don't support this.
device = torch.device(dev)
num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
if FILTER_HWIO: if FILTER_HWIO:
filters = np.random.uniform(0, 1, size=[k, k, k, IC, filters = np.random.uniform(-1, 1,
OC]).astype(np.float32) size=[k, k, k, IC,
OC]).astype(np.float32)
else: else:
filters = np.random.uniform(0, 1, size=[k, k, k, OC, filters = np.random.uniform(-1, 1,
IC]).astype(np.float32) size=[k, k, k, OC,
IC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device) filters_t = torch.from_numpy(filters).to(device).to(dtype)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True
net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device)
filters_t = torch.from_numpy(filters).to(device)
print(net_ref.net[0].weight.shape)
if FILTER_HWIO: if FILTER_HWIO:
net_ref.net[0].weight.data[:] = filters_t.permute( net_ref.net[0].weight.data[:] = filters_t.permute(
3, 4, 0, 1, 2).contiguous() 4, 3, 0, 1, 2).contiguous()
else: else:
net_ref.net[0].weight.data[:] = filters_t.permute( net_ref.net[0].weight.data[:] = filters_t.permute(
4, 3, 0, 1, 2).contiguous() 3, 4, 0, 1, 2).contiguous()
net.net[0].weight.data[:] = filters_t else:
out_ref = net_ref(features_dense_t) filters = np.random.uniform(-1, 1,
out = net(features_t, indices_t, bs).dense() size=[OC, k, k, k,
out_np = out.detach().cpu().numpy() IC]).astype(np.float32)
out_ref_np = out_ref.detach().cpu().numpy() filters_t = torch.from_numpy(filters).to(device).to(dtype)
self.assertAllClose(out_np, out_ref_np, atol=1e-4) net_ref.net[0].weight.data[:] = filters_t.permute(
0, 4, 1, 2, 3).contiguous()
dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype) net.net[0].weight.data[:] = filters_t
dout_t = torch.from_numpy(dout).to(device) out_ref = net_ref(features_dense_t)
out.backward(dout_t) out = net(features_t, indices_t, bs).dense()
out_ref.backward(dout_t) out_np = out.detach().cpu().numpy()
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4, out_ref_np = out_ref.detach().cpu().numpy()
1).contiguous() test_case.assertAllClose(out_np, out_ref_np, atol=1e-4)
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach() dout = np.random.uniform(-0.2, 0.2,
din_np = din.cpu().numpy() out_ref.shape).astype(features.dtype)
din_sparse_np = din_sparse.cpu().numpy() dout_t = torch.from_numpy(dout).to(device)
self.assertAllClose(din_np, din_sparse_np, atol=1e-4) out.backward(dout_t)
for layer, layer_ref in zip(net.net, net_ref.net): out_ref.backward(dout_t)
dw = layer.weight.grad.detach().cpu().numpy() din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
dw_ref = layer_ref.weight.grad.detach().cpu().numpy() 1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy()
for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
if FILTER_HWIO: if FILTER_HWIO:
dw = dw.transpose(3, 4, 0, 1, 2)
else:
dw = dw.transpose(4, 3, 0, 1, 2) dw = dw.transpose(4, 3, 0, 1, 2)
self.assertAllClose(dw, dw_ref, atol=1e-4) else:
dw = dw.transpose(3, 4, 0, 1, 2)
def testSpCpConv3d(self): else:
np.random.seed(484) # OHWI -> OIHW
devices = ["cuda:0", "cpu:0"] dw = dw.transpose(0, 4, 1, 2, 3)
shapes = [[20, 20, 20]]
batchsizes = [1, 2] test_case.assertAllClose(dw, dw_ref, atol=1e-4)
test_case.assertAllClose(din_np, din_sparse_np, atol=1e-4)
in_channels = [64]
out_channels = [32, 48, 64]
ksizes = [2]
strides = [2]
paddings = [0, 1, 2]
dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides):
device = torch.device(dev)
num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC)
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, IC,
OC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
indices_scn_t = torch.from_numpy(
indices[:, [1, 2, 3, 0]]).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_ref_t = torch.from_numpy(features).to(device)
features_ref_t.requires_grad = True
net_ref = SCNCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
net = SparseCoupleDeConvTest(1, 3, shape, IC, OC, k, s).to(device)
net_ref.net[0].weight.data[:] = net.net[0].weight.data[:].view(
*net_ref.net[0].weight.shape)
net_ref.net[1].weight.data[:] = net.net[1].weight.data[:].view(
*net_ref.net[1].weight.shape)
out_ref = net_ref(features_ref_t, indices_scn_t, bs)
out = net(features_t, indices_t, bs)
dout = np.random.uniform(-0.2, 0.2,
out_ref.shape).astype(features.dtype)
dout_t = torch.from_numpy(dout).to(device)
out.backward(dout_t)
out_ref.backward(dout_t)
din = features_t.grad.detach()
din_ref = features_ref_t.grad.detach()
din_np = din.cpu().numpy()
din_ref_np = din_ref.cpu().numpy()
self.assertAllClose(din_ref_np, din_np, atol=1e-4)
for layer, layer_ref in zip(net.net, net_ref.net):
dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().view(
*dw.shape).numpy()
self.assertAllClose(dw, dw_ref, atol=1e-4)
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
def testSpMaxPool3d(self):
np.random.seed(485)
devices = ["cuda:0"]
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
in_channels = [64]
out_channels = [64]
ksizes = [2, 3]
strides = [1, 2, 3]
paddings = [0, 1]
dilations = [1, 2, 3]
# ksizes = [2]
# strides = [2]
# paddings = [0]
# dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations):
if all([s > 1, d > 1]):
continue # don't support this.
device = torch.device(dev)
num_points = [1000] * bs
# when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict = generate_sparse_data(shape,
num_points,
IC,
data_range=[0.1, 1])
features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32)
filters = np.random.uniform(0, 1, size=[k, k, k, OC,
IC]).astype(np.float32)
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True
net = SparseMaxPoolTestTorch(1, 3, shape, k, s, p, d).to(device)
net_ref = MaxPool3dTestTorch(1, 3, shape, k, s, p, d).to(device)
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs)
outids = out.indices
outfeatures = out.features
outids_dev = outids.float()
out_dense = out.dense(channels_first=False)
out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
out_np = out.detach().cpu().numpy()
out_ref_np = out_ref.detach().cpu().numpy()
self.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout_sparse = np.random.uniform(
-0.2, 0.2, outfeatures.shape).astype(features.dtype)
dout_sparse_t = torch.from_numpy(dout_sparse).to(device)
dout_t = scatter_nd(outids.long(), dout_sparse_t,
list(out_dense.shape))
dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous()
out.backward(dout_t)
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy()
self.assertAllClose(din_np, din_sparse_np, atol=1e-4)
def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
# function for develop.
np.random.seed(484)
# devices = ["cuda:0"]
devices = ["cuda:0"]
shapes = [[400, 400, 15]]
batchsizes = [2]
in_channels = [19] def test_spdeconv3d():
out_channels = [17] test_case = TestCase()
ksizes = [(3, 3, 3)]
strides = [1]
paddings = [0]
dilations = [1]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( np.random.seed(484)
devices = ["cuda:0"]
shapes = [[19, 18, 17]]
batchsizes = [1, 2]
in_channels = [64]
out_channels = [32, 48, 64]
ksizes = [2, 3]
strides = [2, 3]
paddings = [0, 1, 2]
dilations = [1, 2, 3]
algos = [
ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
ConvAlgo.MaskSplitImplicitGemm
]
for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations): strides, paddings, dilations, algos):
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue continue # don't support this.
device = torch.device(dev) device = torch.device(dev)
num_points = [30000] * bs num_points = [1000] * bs
dtype = torch.float32
sparse_dict = generate_sparse_data(shape, num_points, IC) sparse_dict = generate_sparse_data(shape, num_points, IC)
...@@ -718,115 +379,154 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32): ...@@ -718,115 +379,154 @@ def main(algo=spconv.ConvAlgo.Native, dtype=torch.float32):
indices = np.ascontiguousarray( indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices) net = SparseDeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, d, al).to(device)
OC]).astype(np.float32) net_ref = DeConv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
indices_t = torch.from_numpy(indices).int().to(device).to(dtype) d).to(device)
features_t = torch.from_numpy(features).to(device).to(dtype)
features_dense_t = torch.from_numpy(features_dense).to(device).to( if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
dtype) if FILTER_HWIO:
net = SparseConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d, filters = np.random.uniform(-1, 1,
algo=algo).to(device).to(dtype) size=[k, k, k, IC,
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p, OC]).astype(np.float32)
d).to(device).to(dtype) else:
filters_t = torch.from_numpy(filters).to(device).to(dtype) filters = np.random.uniform(-1, 1,
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1, size=[k, k, k, OC,
2).contiguous() IC]).astype(np.float32)
net.net[0].weight[:] = filters_t filters_t = torch.from_numpy(filters).to(device).to(dtype)
if FILTER_HWIO:
net_ref.net[0].weight.data[:] = filters_t.permute(
3, 4, 0, 1, 2).contiguous()
else:
net_ref.net[0].weight.data[:] = filters_t.permute(
4, 3, 0, 1, 2).contiguous()
else:
filters = np.random.uniform(-1, 1,
size=[OC, k, k, k,
IC]).astype(np.float32)
filters_t = torch.from_numpy(filters).to(device).to(dtype)
net_ref.net[0].weight.data[:] = filters_t.permute(
4, 0, 1, 2, 3).contiguous()
net.net[0].weight.data[:] = filters_t
indices_t = torch.from_numpy(indices).int().to(device)
features_t = torch.from_numpy(features).to(device)
features_t.requires_grad = True
features_dense_t = torch.from_numpy(features_dense).to(device)
features_dense_t.requires_grad = True
filters_t = torch.from_numpy(filters).to(device)
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
times = [] out = net(features_t, indices_t, bs).dense()
for i in range(10): out_np = out.detach().cpu().numpy()
t = time.time() out_ref_np = out_ref.detach().cpu().numpy()
out = net(features_t, indices_t, bs) test_case.assertAllClose(out_np, out_ref_np, atol=1e-4)
torch.cuda.synchronize()
times.append(time.time() - t) dout = np.random.uniform(-0.2, 0.2,
# print((net.grid == -1).float().sum(), net.grid.numel()) out_ref.shape).astype(features.dtype)
# print("spconv time", time.time() - t) dout_t = torch.from_numpy(dout).to(device)
print("spconv time", np.mean(times[2:])) out.backward(dout_t)
out = net(features_t, indices_t, bs) out_ref.backward(dout_t)
# print(out.indices) din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
out = out.dense() 1).contiguous()
out_numpy = out.detach().cpu().numpy() din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
print( din_np = din.cpu().numpy()
np.linalg.norm(out.detach().cpu().numpy() - din_sparse_np = din_sparse.cpu().numpy()
out_ref.detach().cpu().numpy())) test_case.assertAllClose(din_np, din_sparse_np, atol=1e-4)
print(out_numpy.min(), out_numpy.max(), out_numpy.mean(), for layer, layer_ref in zip(net.net, net_ref.net):
out_numpy.sum()) dw = layer.weight.grad.detach().cpu().numpy()
dw_ref = layer_ref.weight.grad.detach().cpu().numpy()
if net.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
if FILTER_HWIO:
dw = dw.transpose(3, 4, 0, 1, 2)
else:
dw = dw.transpose(4, 3, 0, 1, 2)
else:
# OHWI -> OIHW
dw = dw.transpose(4, 0, 1, 2, 3)
test_case.assertAllClose(dw, dw_ref, atol=1e-4)
def test_spmaxpool3d():
test_case = TestCase()
def main_subm(algo, dtype=torch.float32): np.random.seed(485)
# function for develop.
np.random.seed(484)
torch.manual_seed(50051)
# devices = ["cuda:0"]
devices = ["cuda:0"] devices = ["cuda:0"]
shapes = [[400, 400, 15]] shapes = [[19, 18, 17]]
batchsizes = [2] batchsizes = [1, 2]
in_channels = [32] in_channels = [64]
out_channels = [64] out_channels = [64]
ksizes = [(3, 3, 3)] ksizes = [2, 3]
strides = [1] strides = [1, 2, 3]
paddings = [1] paddings = [0, 1]
dilations = [1] dilations = [1, 2, 3]
for dev, shape, bs, IC, OC, k, s, p, d in params_grid( # ksizes = [2]
# strides = [2]
# paddings = [0]
# dilations = [1]
algos = [
ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
ConvAlgo.MaskSplitImplicitGemm
]
for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes, devices, shapes, batchsizes, in_channels, out_channels, ksizes,
strides, paddings, dilations): strides, paddings, dilations, algos):
if all([s > 1, d > 1]): if all([s > 1, d > 1]):
continue continue # don't support this.
device = torch.device(dev) device = torch.device(dev)
num_points = [120000] * bs num_points = [1000] * bs
sparse_dict = generate_sparse_data(shape, num_points, IC) # when data contains negative, sparse maxpool is not equal to dense maxpool.
sparse_dict = generate_sparse_data(shape,
num_points,
IC,
data_range=[0.1, 1])
features = np.ascontiguousarray(sparse_dict["features"]).astype( features = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32) np.float32)
indices = np.ascontiguousarray( indices = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32) sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
features_dense = sparse_dict["features_dense"].astype(np.float32) features_dense = sparse_dict["features_dense"].astype(np.float32)
indices_t = torch.from_numpy(indices) indices_t = torch.from_numpy(indices).int().to(device)
filters = np.random.uniform(0, 1, size=[k[0], 1, 1, IC, features_t = torch.from_numpy(features).to(device)
OC]).astype(np.float32) features_t.requires_grad = True
indices_t = torch.from_numpy(indices).int().to(device).to(dtype) features_dense_t = torch.from_numpy(features_dense).to(device)
features_t = torch.from_numpy(features).to(device).to(dtype) features_dense_t.requires_grad = True
net = SparseMaxPoolTestTorch(1, 3, shape, k, s, p, d, al).to(device)
net_ref = MaxPool3dTestTorch(1, 3, shape, k, s, p, d).to(device)
features_dense_t = torch.from_numpy(features_dense).to(device).to(
dtype)
net = SubMConv3dTestTorch(1, 3, shape, IC, OC, k, s, p, d,
algo=algo).to(device).to(dtype)
net_ref = Conv3dTestTorch(1, 3, shape, IC, OC, k, s, p,
d).to(device).to(dtype)
filters_t = torch.from_numpy(filters).to(device).to(dtype)
net_ref.net[0].weight[:] = filters_t.permute(4, 3, 0, 1,
2).contiguous()
net.net[0].weight[:] = filters_t
out_ref = net_ref(features_dense_t) out_ref = net_ref(features_dense_t)
times = []
for i in range(20):
t = time.time()
out = net(features_t, indices_t, bs)
torch.cuda.synchronize()
times.append(time.time() - t)
# print((net.grid == -1).float().sum(), net.grid.numel())
# print("spconv time", time.time() - t)
print("spconv time", np.mean(times[10:]))
out = net(features_t, indices_t, bs) out = net(features_t, indices_t, bs)
# print(out.indices)
out = out.dense() outids = out.indices
out_numpy = out.detach().cpu().numpy() outfeatures = out.features
# print( outids_dev = outids.float()
# np.linalg.norm(out.detach().cpu().numpy() - out_dense = out.dense(channels_first=False)
# out_ref.detach().cpu().numpy())) out = out_dense.permute(0, 4, 1, 2, 3).contiguous()
print(out_numpy.min(), out_numpy.max(), out_numpy.mean(), out_np = out.detach().cpu().numpy()
out_numpy.sum()) out_ref_np = out_ref.detach().cpu().numpy()
return out_numpy test_case.assertAllClose(out_np, out_ref_np, atol=1e-4)
dout_sparse = np.random.uniform(
if __name__ == '__main__': -0.2, 0.2, outfeatures.shape).astype(features.dtype)
# main_subm(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32) dout_sparse_t = torch.from_numpy(dout_sparse).to(device)
# main(algo=spconv.ConvAlgo.SparseConvNet, dtype=torch.float32) dout_t = scatter_nd(outids.long(), dout_sparse_t,
# TestCase().assertAllClose(out_my, out_ref) list(out_dense.shape))
# unittest.main() dout_t = dout_t.permute(0, 4, 1, 2, 3).contiguous()
TestSpConv().testSpConv3d() out.backward(dout_t)
out_ref.backward(dout_t)
din_dense = features_dense_t.grad.detach().permute(0, 2, 3, 4,
1).contiguous()
din_sparse = gather_nd(din_dense, indices_t.long())
din = features_t.grad.detach()
din_np = din.cpu().numpy()
din_sparse_np = din_sparse.cpu().numpy()
test_case.assertAllClose(din_np, din_sparse_np, atol=1e-4)
if __name__ == "__main__":
test_spmaxpool3d()
\ No newline at end of file
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from spconv.core_cc.csrc.sparse.all import SpconvOps
...@@ -12,9 +12,330 @@ ...@@ -12,9 +12,330 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compare results between different algos:
CPU: simple gather-mm-scatter
Native: Fused gather-mm-scatter
ImplicitGemm: implicit gemm
"""
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from cumm import tensorview as tv
from spconv.core import ConvAlgo
import spconv.pytorch as spconv
import pickle
from spconv.test_utils import generate_sparse_data, params_grid
class Net(nn.Module):
    """Sparse 3D network used to compare spconv algorithm implementations.

    The stack consists of pairs (or, for the first stage, a group of four) of
    kernel-size-3 ``SubMConv3d`` layers keyed "c0" .. "c5". Between the stages
    sit two stride-2 ``SparseConv3d`` layers ("m0", "m1") and three
    ``SparseMaxPool3d(2, 2)`` layers (the first without an indice key, then
    "m3" and "m4"). Two ``SparseInverseConv3d`` layers that reuse the "m4"
    and "m3" indice keys close the stack. No layer has a bias.

    Args:
        shape: spatial shape forwarded to ``spconv.SparseConvTensor``.
        algo: value passed as ``algo=`` to every layer, pooling included.
    """

    def __init__(self, shape, algo):
        super().__init__()

        def subm(c_in, c_out, key):
            # Bias-free submanifold conv with kernel size 3; layers created
            # with the same ``key`` get the same indice_key.
            return spconv.SubMConv3d(c_in,
                                     c_out,
                                     3,
                                     bias=False,
                                     indice_key=key,
                                     algo=algo)

        def pool(**extra):
            # Pooling layers use the same algo as the convolutions.
            return spconv.SparseMaxPool3d(2, 2, algo=algo, **extra)

        # Layers are created strictly in network order, so parameter
        # initialisation consumes the torch RNG in the same sequence as a
        # hand-written argument list would.
        layers = [
            # stage c0
            subm(3, 32, "c0"),
            subm(32, 32, "c0"),
            subm(32, 64, "c0"),
            subm(64, 64, "c0"),
            # downsample m0: kernel 3, stride 2, padding 1
            spconv.SparseConv3d(64, 64, 3, 2, 1, bias=False,
                                indice_key="m0", algo=algo),
            # stage c1
            subm(64, 96, "c1"),
            subm(96, 96, "c1"),
            # downsample m1: kernel 2, stride 2
            spconv.SparseConv3d(96, 96, 2, 2, bias=False,
                                indice_key="m1", algo=algo),
            # stage c2
            subm(96, 128, "c2"),
            subm(128, 128, "c2"),
            pool(),
            # stage c3
            subm(128, 160, "c3"),
            subm(160, 160, "c3"),
            pool(indice_key="m3"),
            # stage c4
            subm(160, 192, "c4"),
            subm(192, 192, "c4"),
            pool(indice_key="m4"),
            # stage c5
            subm(192, 224, "c5"),
            subm(224, 224, "c5"),
            # inverse convolutions reusing the "m4" and "m3" indice keys
            spconv.SparseInverseConv3d(224, 128, 2, indice_key="m4",
                                       bias=False, algo=algo),
            spconv.SparseInverseConv3d(128, 64, 2, indice_key="m3",
                                       bias=False, algo=algo),
        ]
        self.net = spconv.SparseSequential(*layers)
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack."""
        sparse_input = spconv.SparseConvTensor(features, coors, self.shape,
                                               batch_size)
        return self.net(sparse_input)
class NetLight(nn.Module):
    """Small sparse 3D network for comparing spconv algorithm implementations.

    Layer order: four kernel-size-3 ``SubMConv3d`` layers keyed "c0", a
    stride-2 ``SparseConv3d`` ("m0"), two ``SubMConv3d`` layers keyed "c1", a
    second stride-2 ``SparseConv3d`` ("m1"), then two ``SparseInverseConv3d``
    layers that reuse the "m1" and "m0" indice keys. No layer has a bias.

    Args:
        shape: spatial shape forwarded to ``spconv.SparseConvTensor``.
        algo: value passed as ``algo=`` to every layer.
    """

    def __init__(self, shape, algo):
        super().__init__()

        def subm(c_in, c_out, key):
            # Bias-free submanifold conv with kernel size 3; layers created
            # with the same ``key`` get the same indice_key.
            return spconv.SubMConv3d(c_in,
                                     c_out,
                                     3,
                                     bias=False,
                                     indice_key=key,
                                     algo=algo)

        # Layers are created strictly in network order, so parameter
        # initialisation consumes the torch RNG in the same sequence as a
        # hand-written argument list would.
        layers = [
            # stage c0
            subm(3, 32, "c0"),
            subm(32, 32, "c0"),
            subm(32, 64, "c0"),
            subm(64, 64, "c0"),
            # downsample m0: kernel 3, stride 2, padding 1
            spconv.SparseConv3d(64, 64, 3, 2, 1, bias=False,
                                indice_key="m0", algo=algo),
            # stage c1
            subm(64, 96, "c1"),
            subm(96, 96, "c1"),
            # downsample m1: kernel 2, stride 2
            spconv.SparseConv3d(96, 96, 2, 2, bias=False,
                                indice_key="m1", algo=algo),
            # inverse convolutions reusing the "m1" and "m0" indice keys
            spconv.SparseInverseConv3d(96, 64, 2, indice_key="m1",
                                       bias=False, algo=algo),
            spconv.SparseInverseConv3d(64, 32, 3, indice_key="m0",
                                       bias=False, algo=algo),
        ]
        self.net = spconv.SparseSequential(*layers)
        self.shape = shape

    def forward(self, features, coors, batch_size):
        """Wrap the inputs in a ``SparseConvTensor`` and run the layer stack."""
        sparse_input = spconv.SparseConvTensor(features, coors, self.shape,
                                               batch_size)
        return self.net(sparse_input)
def _test_multi_impl(dtype: torch.dtype):
    """Compare one network across CPU/Native/ImplicitGemm implementations.

    Four copies of the same network class are built (Native on CPU, Native,
    MaskImplicitGemm and MaskSplitImplicitGemm on "cuda:0"), each after
    re-seeding torch with the same seed. All four run a forward and backward
    pass on the same input with the same output gradient. The dense outputs
    of the three GPU networks are checked against the CPU output, and the
    weight gradients of the two implicit-gemm networks are checked against
    the GPU Native ones.

    Args:
        dtype: dtype the inputs and networks are cast to. For
            ``torch.float16`` a small generated input and ``NetLight`` are
            used; otherwise the pickled sample next to this file and ``Net``.

    Raises:
        AssertionError: if any compared error exceeds its threshold.
    """
    # TODO remove or release this when tf32 op is ready
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False
    np.random.seed(50051)

    if dtype == torch.float16:
        # CPU fp16 is very slow, so we use small data and a small network.
        spatial_shape = [19, 18, 17]
        sparse_dict = generate_sparse_data(spatial_shape, [1500], 3)
        voxels = np.ascontiguousarray(sparse_dict["features"]).astype(
            np.float32)
        # Reorder index columns [0, 1, 2, 3] -> [3, 0, 1, 2].
        coors = np.ascontiguousarray(
            sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
        net_cls = NetLight
    else:
        data_file = Path(__file__).parent / "data" / "test_spconv.pkl"
        with open(data_file, "rb") as f:
            voxels, coors, spatial_shape = pickle.load(f)
        net_cls = Net

    gpu = torch.device("cuda:0")
    cpu = torch.device("cpu:0")
    feats_cpu = torch.from_numpy(voxels).to(cpu).to(dtype)
    idx_cpu = torch.from_numpy(coors).to(cpu).int()
    feats_gpu = torch.from_numpy(voxels).to(gpu).to(dtype)
    idx_gpu = torch.from_numpy(coors).to(gpu).int()

    def build(algo, dev):
        # Same seed before every construction (presumably so all networks
        # draw identical initial weights).
        torch.manual_seed(50051)
        return net_cls(spatial_shape, algo).to(dev).to(dtype)

    net_cpu = build(ConvAlgo.Native, cpu)
    net_native = build(ConvAlgo.Native, gpu)
    net_imp = build(ConvAlgo.MaskImplicitGemm, gpu)
    net_simp = build(ConvAlgo.MaskSplitImplicitGemm, gpu)
    for net in (net_cpu, net_native, net_imp, net_simp):
        spconv.assign_name_for_sparse_modules(net)

    # A gradient-free forward pass, needed only for the dense output shape.
    with torch.no_grad():
        probe = net_cpu(feats_cpu, idx_cpu, 1).dense()
    dout = np.random.uniform(-0.2, 0.2, probe.shape).astype(np.float32)
    dout_cpu = torch.from_numpy(dout).to(cpu).to(dtype)
    dout_gpu = torch.from_numpy(dout).to(gpu).to(dtype)

    def run(net, feats, idx, grad):
        # Forward to a dense tensor, then backpropagate the shared gradient.
        dense = net(feats, idx, 1).dense()
        dense.backward(grad)
        return dense

    out_cpu = run(net_cpu, feats_cpu, idx_cpu, dout_cpu)
    out_native = run(net_native, feats_gpu, idx_gpu, dout_gpu)
    out_imp = run(net_imp, feats_gpu, idx_gpu, dout_gpu)
    out_simp = run(net_simp, feats_gpu, idx_gpu, dout_gpu)

    # Output check: every GPU result against the CPU result.
    with torch.no_grad():
        reference = out_cpu.cuda()
        error_native = torch.linalg.norm(reference - out_native).cpu().item()
        error_imp = torch.linalg.norm(reference - out_imp).cpu().item()
        error_simp = torch.linalg.norm(reference - out_simp).cpu().item()

    print("error_native", error_native)
    print("error_imp", error_imp)
    print("error_simp", error_simp)
    # float32 gets the tight threshold; any other dtype the loose one.
    out_tol = 0.01 if dtype == torch.float32 else 10
    assert error_native < out_tol
    assert error_imp < out_tol
    assert error_simp < out_tol

    # Weight-gradient check, parameter by parameter.
    cpu_params = dict(net_cpu.named_parameters())
    native_params = dict(net_native.named_parameters())
    imp_params = dict(net_imp.named_parameters())
    simp_params = dict(net_simp.named_parameters())
    for name, cpu_param in cpu_params.items():
        native_param = native_params[name]
        imp_param = imp_params[name]
        simp_param = simp_params[name]
        grad_cpu = cpu_param.grad.detach().cuda()
        grad_native = native_param.grad.detach()
        grad_imp = imp_param.grad.detach()
        grad_simp = simp_param.grad.detach()
        # Native-vs-CPU is printed only; the implicit-gemm gradients are
        # measured against the GPU Native gradient and asserted.
        error_native = torch.linalg.norm(grad_native - grad_cpu).cpu().item()
        error_imp = torch.linalg.norm(grad_native - grad_imp).cpu().item()
        error_simp = torch.linalg.norm(grad_native - grad_simp).cpu().item()
        print(name, error_native, error_imp, error_simp)
        assert error_imp < 1
        assert error_simp < 1
def test_multi_impl():
    """Run the cross-implementation comparison for float32, then float16."""
    for dtype in (torch.float32, torch.float16):
        _test_multi_impl(dtype)


if __name__ == "__main__":
    # Allow running the comparison directly, without a test runner.
    test_multi_impl()
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Developers must run this script before pushing or opening a pull request.
# This script contains three parts:
# 1. unit tests for all gemm/conv kernels
# 2. comparison test: compare network fwd/bwd results between CPU, Native and ImplicitGemm
# 3. f32/f16 train/eval test based on MNIST and some small datasets
# Run every stage even if an earlier one fails, but record any non-zero exit
# status so the script itself fails. Without this, the script's exit status
# would only reflect the last command, hiding unit-test failures.
status=0

echo "-------------UNIT TEST START--------------"
pytest ./test || status=$?
echo "-------------UNIT TEST END--------------"

# fp16 train/eval run on MNIST.
python ./example/mnist_sparse.py --fp16 || status=$?

exit "$status"
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment