Commit 52594038 authored by yan.yan's avatar yan.yan
Browse files

v2.1.21: add sm37, avoid fp16 nan

parent b0f52b8a
# Changelog
## [2.1.21] - 2021-12-9
### Added
- add sm_37
- add fp16 kernels witl fp32 accumulator (run slower, but can avoid nan if channel size is too large)
## [2.1.20] - 2021-12-6
### Added
- Add fp16 conv simt kernels for mixed-training in pascal or older GPUS. WARNING: not optimized for TESLA P100 which has 2x throughput in half.
......
......@@ -38,9 +38,9 @@ if cuda_ver:
cuda_ver = cuda_ver.replace(".", "") # 10.2 to 102
RELEASE_NAME += "-cu{}".format(cuda_ver)
deps = ["cumm-cu{}>=0.2.6".format(cuda_ver)]
deps = ["cumm-cu{}>=0.2.8".format(cuda_ver)]
else:
deps = ["cumm>=0.2.6"]
deps = ["cumm>=0.2.8"]
......@@ -158,6 +158,7 @@ if disable_jit is not None and disable_jit == "1":
from spconv.csrc.sparse.all import SpconvOps
from spconv.csrc.utils import BoxOps
from spconv.csrc.hash.core import HashTable
from cumm.common import CompileInfo
cu = GemmMainUnitTest(SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS + SHUFFLE_TURING_PARAMS)
convcu = ConvMainUnitTest(IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS + IMPLGEMM_TURING_PARAMS)
......@@ -171,9 +172,9 @@ if disable_jit is not None and disable_jit == "1":
std = "c++14"
else:
std = "c++17"
cus = [cu, convcu, SpconvOps(), BoxOps(), HashTable()]
cus = [cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo()]
if CUMM_CPU_ONLY_BUILD:
cus = [SpconvOps(), BoxOps(), HashTable()]
cus = [SpconvOps(), BoxOps(), HashTable(), CompileInfo()]
ext_modules: List[Extension] = [
PCCMExtension(cus,
"spconv/core_cc",
......
......@@ -514,10 +514,23 @@ class SimpleConv:
out: tv.Tensor, layout_i: ConvLayout,
layout_w: ConvLayout, layout_o: ConvLayout,
arch: Tuple[int, int], op_type: ConvOpType,
mask_width: int):
mask_width: int, fp32_accum: Optional[bool] = None):
avail_algos = get_available_algo_str_from_arch(arch)
finally_algos: List[ConvAlgoDesp] = []
is_fp16 = inp.dtype == tv.float16 and weight.dtype == tv.float16 and out.dtype == tv.float16
use_f32_as_accum = False
kv = int(np.prod(weight.shape[1:-1]))
# for 3d conv, if reduce axis is too large, may cause nan during
# forward.
if is_fp16:
if fp32_accum is None:
if op_type == ConvOpType.kForward:
use_f32_as_accum = weight.dim(-1) * kv > 128 * 27
elif op_type == ConvOpType.kBackwardInput:
use_f32_as_accum = weight.dim(0) * kv > 128 * 27
else:
use_f32_as_accum = fp32_accum
for algo in avail_algos:
static_key = (layout_i.layout_type.value,
layout_w.layout_type.value,
......@@ -531,6 +544,14 @@ class SimpleConv:
# skip volta tensor op since it is very slow in architectures except volta.
if arch >= (7, 5) and desp.algo == GemmAlgo.Volta.value:
continue
if arch >= (7, 0) and is_fp16:
# skip simt fp16 kernels if we have tensor core
if desp.algo == GemmAlgo.Simt:
continue
if use_f32_as_accum:
if desp.dacc == tv.float16:
continue
ldi = inp.dim(-1)
ldw = weight.dim(-1)
ldo = out.dim(-1)
......@@ -589,9 +610,11 @@ class SimpleConv:
mask_output: tv.Tensor = tv.Tensor(),
alpha: float = 1.0,
beta: float = 0.0,
stream: int = 0):
stream: int = 0,
fp32_accum: Optional[bool] = None):
avail = self.get_all_available(inp, weight, output, layout_i, layout_w,
layout_o, arch, op_type, mask_width)
layout_o, arch, op_type, mask_width,
fp32_accum)
inp = inp.clone()
weight = weight.clone()
output = output.clone()
......
......@@ -26,6 +26,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
from cumm.gemm.main import GemmMainUnitTest
from cumm.conv.main import ConvMainUnitTest
from cumm.common import CompileInfo
from spconv.csrc.sparse.all import SpconvOps
from spconv.csrc.utils import BoxOps
......@@ -41,7 +42,7 @@ if project_is_installed(PACKAGE_NAME) and project_is_editable(
if InWindows:
# windows have command line limit, so we use objects_folder to reduce command size.
objects_folder = "objects"
pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable()],
pccm.builder.build_pybind([cu, convcu, SpconvOps(), BoxOps(), HashTable(), CompileInfo()],
PACKAGE_ROOT / "core_cc",
namespace_root=PACKAGE_ROOT,
objects_folder=objects_folder,
......
......@@ -403,7 +403,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -415,7 +415,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -427,7 +427,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -439,7 +439,7 @@ IMPLGEMM_VOLTA_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 256, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -490,7 +490,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 64, 32), (32, 32, 16),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -502,7 +502,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 64, 32), (32, 32, 16),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -514,7 +514,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 256, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -526,7 +526,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 128, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -538,7 +538,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 128, 64), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -550,7 +550,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 128, 64), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -562,7 +562,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 128, 64), (32, 32, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -574,7 +574,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (32, 128, 64), (32, 64, 64),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -586,7 +586,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 32), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -598,7 +598,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 128, 64), (32, 64, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......@@ -610,7 +610,7 @@ IMPLGEMM_TURING_PARAMS = [
*gen_conv_params(ConvFwdAndBwdInput, (64, 64, 32), (32, 32, 32),
NDIM_DONT_CARE,
ConvIterAlgo.Optimized,
2, ["f16,f16,f16,f16,f16"],
2, ["f16,f16,f16,f16,f16", "f16,f16,f16,f32,f32"],
NHWC,
NHWC,
NHWC,
......
from typing import overload, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union
from pccm.stubs import EnumValue, EnumClassValue
class CompileInfo:
@staticmethod
def get_compiled_cuda_arch() -> List[Tuple[int, int]]: ...
......@@ -20,5 +20,7 @@ else:
CPU_ONLY_BUILD = True
from spconv.core_cc.csrc.utils.boxops import BoxOps
from spconv.core_cc.cumm.common import CompileInfo
HAS_BOOST = BoxOps.has_boost()
COMPILED_CUDA_ARCHS = set(CompileInfo.get_compiled_cuda_arch())
......@@ -36,8 +36,6 @@ from spconv.utils import nullcontext
from torch.nn.init import calculate_gain
class SparseConvolution(SparseModule):
__constants__ = [
'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
......@@ -60,6 +58,7 @@ class SparseConvolution(SparseModule):
inverse: bool = False,
indice_key: Optional[str] = None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseConvolution, self).__init__(name=name)
assert groups == 1, "don't support groups for now"
......@@ -78,7 +77,9 @@ class SparseConvolution(SparseModule):
if not subm:
self.conv1x1 &= kv_stride == 1
if self.conv1x1:
assert self.padding == [0] * ndim, "padding must be zero for 1x1 conv (k=1,s=1)"
assert self.padding == [
0
] * ndim, "padding must be zero for 1x1 conv (k=1,s=1)"
self.transposed = transposed
self.inverse = inverse
self.output_padding = expand_nd(ndim, output_padding)
......@@ -98,6 +99,7 @@ class SparseConvolution(SparseModule):
if CPU_ONLY_BUILD:
assert algo == ConvAlgo.Native, "cpu only build only support native algorithm"
self.algo = algo
self.fp32_accum = fp32_accum
# self.algo = ConvAlgo.Native
if self.algo == ConvAlgo.Native:
if FILTER_HWIO:
......@@ -150,18 +152,25 @@ class SparseConvolution(SparseModule):
mode = mode.lower()
valid_modes = ['fan_in', 'fan_out']
if mode not in valid_modes:
raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes))
raise ValueError(
"Mode {} not supported, please use one of {}".format(
mode, valid_modes))
fan_in, fan_out = self._calculate_fan_in_and_fan_out()
return fan_in if mode == 'fan_in' else fan_out
def _custom_kaiming_uniform_(self, tensor, a=0, mode='fan_in', nonlinearity='leaky_relu'):
def _custom_kaiming_uniform_(self,
tensor,
a=0,
mode='fan_in',
nonlinearity='leaky_relu'):
r"""same as torch.init.kaiming_uniform_, with KRSC layout support
"""
fan = self._calculate_correct_fan(mode)
gain = calculate_gain(nonlinearity, a)
std = gain / math.sqrt(fan)
bound = math.sqrt(3.0) * std # Calculate uniform bounds from standard deviation
bound = math.sqrt(
3.0) * std # Calculate uniform bounds from standard deviation
with torch.no_grad():
return tensor.uniform_(-bound, bound)
......@@ -268,7 +277,8 @@ class SparseConvolution(SparseModule):
indice_pairs = datas.indice_pairs
indice_pair_num = datas.indice_pair_num
assert self.subm, "only support reuse subm indices"
self._check_subm_reuse_valid(input, spatial_shape, datas)
self._check_subm_reuse_valid(input, spatial_shape,
datas)
else:
if input.benchmark:
torch.cuda.synchronize()
......@@ -360,7 +370,8 @@ class SparseConvolution(SparseModule):
mask_argsort_bwd_splits = datas.mask_argsort_bwd_splits
masks = datas.masks
assert self.subm, "only support reuse subm indices"
self._check_subm_reuse_valid(input, spatial_shape, datas)
self._check_subm_reuse_valid(input, spatial_shape,
datas)
else:
with input._timer.namespace("gen_pairs"):
......@@ -432,7 +443,7 @@ class SparseConvolution(SparseModule):
pair_mask_fwd_splits, pair_mask_bwd_splits,
mask_argsort_fwd_splits, mask_argsort_bwd_splits,
num_activate_out, masks, self.training, self.subm,
input._timer)
input._timer, self.fp32_accum)
if self.bias is not None:
out_features += self.bias
if input.benchmark:
......@@ -449,21 +460,28 @@ class SparseConvolution(SparseModule):
out_tensor.spatial_shape = out_spatial_shape
return out_tensor
def _check_subm_reuse_valid(self, inp: SparseConvTensor, spatial_shape: List[int], datas: Union[ImplicitGemmIndiceData, IndiceData]):
def _check_subm_reuse_valid(self, inp: SparseConvTensor,
spatial_shape: List[int],
datas: Union[ImplicitGemmIndiceData,
IndiceData]):
assert datas.is_subm, "only support reuse subm indices"
if self.kernel_size != datas.ksize:
raise ValueError(f"subm with same indice_key must have same kernel"
raise ValueError(
f"subm with same indice_key must have same kernel"
f" size, expect {datas.ksize}, this layer {self.kernel_size}")
if self.dilation != datas.dilation:
raise ValueError(f"subm with same indice_key must have same dilation"
raise ValueError(
f"subm with same indice_key must have same dilation"
f", expect {datas.dilation}, this layer {self.dilation}")
if inp.spatial_shape != datas.spatial_shape:
raise ValueError(f"subm with same indice_key must have same spatial structure"
raise ValueError(
f"subm with same indice_key must have same spatial structure"
f", expect {datas.spatial_shape}, input {spatial_shape}")
if inp.indices.shape[0] != datas.indices.shape[0]:
raise ValueError(f"subm with same indice_key must have same num of indices"
f", expect {datas.indices.shape[0]}, input {inp.indices.shape[0]}")
raise ValueError(
f"subm with same indice_key must have same num of indices"
f", expect {datas.indices.shape[0]}, input {inp.indices.shape[0]}"
)
class SparseConv1d(SparseConvolution):
......@@ -478,6 +496,7 @@ class SparseConv1d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseConv1d, self).__init__(1,
in_channels,
......@@ -490,6 +509,7 @@ class SparseConv1d(SparseConvolution):
bias,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -505,6 +525,7 @@ class SparseConv2d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseConv2d, self).__init__(2,
in_channels,
......@@ -517,6 +538,7 @@ class SparseConv2d(SparseConvolution):
bias,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -532,6 +554,7 @@ class SparseConv3d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseConv3d, self).__init__(3,
in_channels,
......@@ -544,6 +567,7 @@ class SparseConv3d(SparseConvolution):
bias,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -559,6 +583,7 @@ class SparseConv4d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseConv4d, self).__init__(4,
in_channels,
......@@ -571,6 +596,7 @@ class SparseConv4d(SparseConvolution):
bias,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -586,6 +612,7 @@ class SparseConvTranspose1d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseConvTranspose1d, self).__init__(1,
in_channels,
......@@ -599,6 +626,7 @@ class SparseConvTranspose1d(SparseConvolution):
transposed=True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -614,6 +642,7 @@ class SparseConvTranspose2d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseConvTranspose2d, self).__init__(2,
in_channels,
......@@ -627,6 +656,7 @@ class SparseConvTranspose2d(SparseConvolution):
transposed=True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -642,6 +672,7 @@ class SparseConvTranspose3d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseConvTranspose3d, self).__init__(3,
in_channels,
......@@ -655,6 +686,7 @@ class SparseConvTranspose3d(SparseConvolution):
transposed=True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -670,6 +702,7 @@ class SparseConvTranspose4d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseConvTranspose4d, self).__init__(4,
in_channels,
......@@ -683,6 +716,7 @@ class SparseConvTranspose4d(SparseConvolution):
transposed=True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -694,6 +728,7 @@ class SparseInverseConv1d(SparseConvolution):
indice_key,
bias=True,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseInverseConv1d, self).__init__(1,
in_channels,
......@@ -703,6 +738,7 @@ class SparseInverseConv1d(SparseConvolution):
inverse=True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -714,6 +750,7 @@ class SparseInverseConv2d(SparseConvolution):
indice_key,
bias=True,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseInverseConv2d, self).__init__(2,
in_channels,
......@@ -723,6 +760,7 @@ class SparseInverseConv2d(SparseConvolution):
inverse=True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -734,6 +772,7 @@ class SparseInverseConv3d(SparseConvolution):
indice_key,
bias=True,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseInverseConv3d, self).__init__(3,
in_channels,
......@@ -743,6 +782,7 @@ class SparseInverseConv3d(SparseConvolution):
inverse=True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -754,6 +794,7 @@ class SparseInverseConv4d(SparseConvolution):
indice_key,
bias=True,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SparseInverseConv4d, self).__init__(4,
in_channels,
......@@ -763,6 +804,7 @@ class SparseInverseConv4d(SparseConvolution):
inverse=True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -778,6 +820,7 @@ class SubMConv1d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SubMConv1d, self).__init__(1,
in_channels,
......@@ -791,6 +834,7 @@ class SubMConv1d(SparseConvolution):
True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -806,6 +850,7 @@ class SubMConv2d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SubMConv2d, self).__init__(2,
in_channels,
......@@ -819,6 +864,7 @@ class SubMConv2d(SparseConvolution):
True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -834,6 +880,7 @@ class SubMConv3d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SubMConv3d, self).__init__(3,
in_channels,
......@@ -847,6 +894,7 @@ class SubMConv3d(SparseConvolution):
True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -862,6 +910,7 @@ class SubMConv4d(SparseConvolution):
bias=True,
indice_key=None,
algo: Optional[ConvAlgo] = None,
fp32_accum: Optional[bool] = None,
name=None):
super(SubMConv4d, self).__init__(4,
in_channels,
......@@ -875,4 +924,5 @@ class SubMConv4d(SparseConvolution):
True,
indice_key=indice_key,
algo=algo,
fp32_accum=fp32_accum,
name=name)
......@@ -15,6 +15,8 @@
from cumm import tensorview as tv
import torch
from typing import Optional, List
from spconv.cppconstants import COMPILED_CUDA_ARCHS
import sys
_TORCH_DTYPE_TO_TV = {
torch.float32: tv.float32,
......@@ -53,6 +55,13 @@ def torch_tensors_to_tv(*tens: torch.Tensor):
def get_current_stream():
return torch.cuda.current_stream().cuda_stream
def get_arch():
arch = torch.cuda.get_device_capability()
if arch not in COMPILED_CUDA_ARCHS:
print(f"[WARNING]your gpu arch {arch} isn't compiled in prebuilt, "
f"may cause invalid device function. "
f"available: {COMPILED_CUDA_ARCHS}", file=sys.stderr)
return arch
if __name__ == "__main__":
a = torch.rand(2, 2)
......
......@@ -179,14 +179,16 @@ class SparseImplicitGemmFunction(Function):
masks: List[np.ndarray],
is_train: bool,
is_subm: bool,
timer: CUDAKernelTimer = CUDAKernelTimer(False)):
timer: CUDAKernelTimer = CUDAKernelTimer(False),
fp32_accum: Optional[bool] = None):
try:
out, mask_out, mask_width = ops.implicit_gemm(features, filters,
pair_fwd,
pair_mask_fwd_splits,
mask_argsort_fwd_splits,
num_activate_out, masks,
is_train, is_subm, timer)
is_train, is_subm, timer,
fp32_accum)
except Exception as e:
msg = "[Exception|implicit_gemm]"
msg += f"feat={features.shape},w={filters.shape},pair={pair_fwd.shape},"
......@@ -208,6 +210,7 @@ class SparseImplicitGemmFunction(Function):
# ctx.num_activate_out = num_activate_out
ctx.masks = masks
ctx.is_subm = is_subm
ctx.fp32_accum = fp32_accum
return out
@staticmethod
......@@ -225,6 +228,8 @@ class SparseImplicitGemmFunction(Function):
masks = ctx.masks
is_subm = ctx.is_subm
timer = ctx.timer
fp32_accum = ctx.fp32_accum
try:
input_bp, filters_bp = ops.implicit_gemm_backward(
features,
......@@ -240,7 +245,8 @@ class SparseImplicitGemmFunction(Function):
masks=masks,
mask_width=mask_width,
is_subm=is_subm,
timer=timer)
timer=timer,
fp32_accum=fp32_accum)
except Exception as e:
msg = "[Exception|implicit_gemm_backward]"
msg += f"feat={features.shape},w={filters.shape},pair={pair_fwd.shape},"
......@@ -251,7 +257,7 @@ class SparseImplicitGemmFunction(Function):
masks))
raise e
None_9 = [None] * 11
None_9 = [None] * 12
return (input_bp, filters_bp, *None_9)
......
......@@ -23,7 +23,7 @@ import spconv
from spconv.core import AlgoHint, ConvAlgo
from typing import List, Optional, Union
from spconv.pytorch.core import ThrustSortAllocator
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream, get_arch
from spconv.core_cc.csrc.sparse.all import SpconvOps
import spconv.core_cc as _ext
......@@ -666,7 +666,7 @@ def indice_conv(features: torch.Tensor,
profile_idx = i
assert nhot_profile > 0, "this shouldn't happen"
# print(nhot_profile, indice_pair_num_cpu)
arch = torch.cuda.get_device_capability()
arch = get_arch()
tuned_res = GEMM.get_tuned_algo(a.dtype,
filters_tv.dtype,
......@@ -809,7 +809,7 @@ def indice_conv_backward(features: torch.Tensor,
return (din, dfilters.reshape(filters_shape))
maxnhot = max(indice_pair_num_cpu)
arch = torch.cuda.get_device_capability()
arch = get_arch()
filters_tv = torch_tensor_to_tv(filters)
dfilters_tv = torch_tensor_to_tv(dfilters)
......@@ -1051,7 +1051,8 @@ def implicit_gemm(features: torch.Tensor,
masks: List[np.ndarray],
is_train: bool,
is_subm: bool,
timer: CUDAKernelTimer = CUDAKernelTimer(False)):
timer: CUDAKernelTimer = CUDAKernelTimer(False),
fp32_accum: Optional[bool] = None):
stream = get_current_stream()
# if DEBUG:
......@@ -1085,7 +1086,7 @@ def implicit_gemm(features: torch.Tensor,
features_tv = torch_tensor_to_tv(features)
filters_tv = torch_tensor_to_tv(filters)
out_features_tv = torch_tensor_to_tv(out_features)
arch = torch.cuda.get_device_capability()
arch = get_arch()
pair_mask_fwd_split_tvs = [
torch_tensor_to_tv(x, dtype=tv.uint32) for x in pair_mask_fwd_splits
]
......@@ -1113,7 +1114,8 @@ def implicit_gemm(features: torch.Tensor,
indices=pair_fwd_tv,
reverse_mask=False,
mask_filter=masks[0].item(),
stream=stream)
stream=stream,
fp32_accum=fp32_accum)
mask_width = tune_res.algo_desp.tile_shape[0]
if is_train:
mask_output_fwd = torch.empty(
......@@ -1180,7 +1182,8 @@ def implicit_gemm_backward(features: torch.Tensor,
masks: List[np.ndarray],
mask_width: int,
is_subm: bool,
timer: CUDAKernelTimer = CUDAKernelTimer(False)):
timer: CUDAKernelTimer = CUDAKernelTimer(False),
fp32_accum: Optional[bool] = None):
# print(out_bp.mean(), out_bp.max(), out_bp.min())
if features.dtype == torch.int8 or features.dtype == torch.qint8:
raise NotImplementedError("work in progress")
......@@ -1217,7 +1220,7 @@ def implicit_gemm_backward(features: torch.Tensor,
dout_tv = torch_tensor_to_tv(out_bp)
din_tv = torch_tensor_to_tv(din)
mask_output_fwd_tv = torch_tensor_to_tv(mask_output_fwd, dtype=tv.uint32)
arch = torch.cuda.get_device_capability()
arch = get_arch()
pair_mask_fwd_split_tvs = [
torch_tensor_to_tv(x, dtype=tv.uint32) for x in pair_mask_fwd_splits
]
......@@ -1263,7 +1266,8 @@ def implicit_gemm_backward(features: torch.Tensor,
indices=pair_bwd_tv,
reverse_mask=is_subm,
mask_filter=masks[0].item(),
stream=stream)
stream=stream,
fp32_accum=fp32_accum)
if wgrad_tune_res is None:
wgrad_tune_res, _ = CONV.tune_and_cache(
ConvOpType.kBackwardWeight,
......
......@@ -289,7 +289,7 @@ def main():
voxels_th = torch.from_numpy(voxels).to(device).to(dtype)
coors_th = torch.from_numpy(coors).to(device).int()
voxels_th.requires_grad = True
algo = spconv.ConvAlgo.Native
algo = spconv.ConvAlgo.MaskImplicitGemm
# 3080 Laptop
# MaskImpGemm: 11.2ms
# MaskSplitImpGemm: 12.2ms
......
2.1.20
\ No newline at end of file
2.1.21
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment