Commit bab09b63 authored by yan.yan's avatar yan.yan
Browse files

Merge branch 'develop'

parents 7af751dc 66529500
......@@ -24,4 +24,4 @@
* VoxelGenerator has been replaced by ```spconv.pytorch.utils.PointToVoxel``` (torch API) or Point2VoxelGPU[1-4]d/Point2VoxelCPU[1-4]d (tv.Tensor API).
* spconv < 2.1 don't support CPU. spconv 2.1+ support cpu for debug usage.
* test spconv 1.x model in spconv 2.x: Firstly set environment variable before run program, Then set all ```algo``` in conv/pool to ```ConvAlgo.Native```. Linux: ```export SPCONV_FILTER_HWIO="1"```, Windows powershell: ```$Env:SPCONV_FILTER_HWIO = "1"```. **WARNING** test spconv 1.x model don't support implicit gemm algorithm.
* test spconv 1.x model in spconv 2.x: Linux: ```export SPCONV_SAVED_WEIGHT_LAYOUT="RSCK"```, Windows powershell: ```$Env:SPCONV_SAVED_WEIGHT_LAYOUT = "RSCK"```.
\ No newline at end of file
import sys
from pathlib import Path
from typing import Dict, List, Tuple
import pickle
import sys
import time
from pathlib import Path
from cumm.gemm.algospec.core import GemmAlgo
import numpy as np
import pccm
import torch
import torch.nn.functional as F
from cumm import dtypes
from cumm import tensorview as tv
from cumm.constants import PACKAGE_ROOT
from cumm.conv.bases import NCHW, NHWC, ConvIterAlgo, ConvOpType
from cumm.conv.main import ConvMainUnitTest, gen_gemm_kernels
from cumm.conv.params import ConvProblem
from cumm.gemm import kernel
import os
from spconv.core_cc.csrc.sparse.all import SpconvOps
from cumm.gemm.codeops import div_up
from spconv.constants import PACKAGE_ROOT
from spconv.core import ConvAlgo
from spconv.pytorch import ops
from spconv.algo import CONV, BestConvAlgoByProfile
from spconv.pytorch.cppcore import torch_tensor_to_tv
def reduce_mask_count(mask: np.ndarray, width: int):
mask_length_32 = (div_up(mask.shape[0], width)) * width
if mask.shape[0] < mask_length_32:
mask_pad = np.zeros((mask_length_32, ), dtype=mask.dtype)
mask_pad[:mask.shape[0]] = mask
mask = mask_pad
mask = mask.reshape(-1, width)
maskr = np.bitwise_or.reduce(mask, axis=1)
maskr_tv = tv.from_numpy(maskr)
return SpconvOps.count_bits(maskr_tv).numpy().sum() * width
def reduce_mask_count_x(mask: np.ndarray, width: int):
mask_length_32 = (div_up(mask.shape[0], width)) * width
if mask.shape[0] < mask_length_32:
mask_pad = np.zeros((mask_length_32, ), dtype=mask.dtype)
mask_pad[:mask.shape[0]] = mask
mask = mask_pad
mask = mask.reshape(-1, width)
maskr = np.bitwise_or.reduce(mask, axis=1)
return maskr
def dev_subm_inds_v2(subm: bool = False, run_conv: bool = True):
limit_input_n = 16384
limit_input_n = None
np.random.seed(484)
with (PACKAGE_ROOT.parent / "test/data/test_spconv.pkl").open("rb") as f:
voxels_np, indices_np, spatial_shape = pickle.load(f)
from spconv.test_utils import generate_sparse_data
voxels_np = voxels_np[:limit_input_n]
indices_np = indices_np[:limit_input_n]
spatial_shape = [19, 18, 17]
sparse_dict = generate_sparse_data(spatial_shape, [1024], 128)
voxels_np = np.ascontiguousarray(sparse_dict["features"]).astype(
np.float32)
indices_np = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
voxels = tv.from_numpy(voxels_np).cuda()
indices = tv.from_numpy(indices_np).cuda()
indices_th = torch.from_numpy(indices_np).cuda()
print(spatial_shape, indices_np.shape)
ndim = 3
if subm:
ksize = [3, 3, 3]
kv = np.prod(ksize)
padding = [1] * ndim
stride = [1] * ndim
dilation = [1] * ndim
out_padding = [0] * ndim
else:
ksize = [2, 2, 2]
kv = np.prod(ksize)
padding = [0] * ndim
stride = [1] * ndim
dilation = [1] * ndim
out_padding = [0] * ndim
out_inds, pair_ref, indice_num_per_loc = ops.get_indice_pairs(
indices_th, 1, spatial_shape, ConvAlgo.Native, ksize, stride, padding,
dilation, out_padding, subm)
indice_num_per_loc_np = indice_num_per_loc.cpu().numpy()
indice_pairs_np = pair_ref.cpu().numpy()
algo = ConvAlgo.MaskSplitImplicitGemm
if algo == ConvAlgo.MaskImplicitGemm:
num_split = 1
else:
num_split = 2
for i in range(5):
res = ops.get_indice_pairs_implicit_gemm(indices_th, 1, spatial_shape,
algo, ksize, stride, padding,
dilation, out_padding, subm)
out_inds = res[0]
num_inds_per_loc = res[1]
pair_fwd = res[2]
pair_fwd_x = pair_fwd.cpu().numpy().reshape(-1)
pair_fwd_x[pair_fwd_x == -1] = 0
loc_num_np = (pair_fwd_x > 0).reshape(kv, -1).sum(1)
print(loc_num_np)
print(indice_num_per_loc_np)
pair_bwd = res[3]
pair_mask_fwd_splits = res[4]
pair_mask_bwd_splits = res[5]
mask_argsort_fwd_splits = res[6]
mask_argsort_bwd_splits = res[7]
masks = res[8]
pair_mask_fwd_splits_tv = [
ops.torch_tensor_to_tv(t, dtype=tv.uint32)
for t in pair_mask_fwd_splits
]
valid_location_bitcount = [
SpconvOps.count_bits(t) for t in pair_mask_fwd_splits_tv
]
valid_location_count = sum(
[t.cpu().numpy().sum() for t in valid_location_bitcount])
reduce_length = 32
split_mask_valid_count = sum([
reduce_mask_count(t.cpu().numpy(), reduce_length)
for t in pair_mask_fwd_splits_tv
])
if subm:
print("SUBM", valid_location_count, split_mask_valid_count,
pair_fwd.numel())
else:
print("REGULAR", valid_location_count, split_mask_valid_count,
pair_fwd.numel())
# return
if run_conv:
C = 64
K = 64
desps = CONV.desps
mask_output_fwd = torch.zeros([2, div_up(out_inds.shape[0], 32)],
dtype=torch.int32,
device=indices_th.device)
mask_output_bwd = torch.zeros([2, div_up(indices.dim(0), 32)],
dtype=torch.int32,
device=indices_th.device)
for desp in desps:
if desp.algo != GemmAlgo.Simt.value:
continue
# if desp.op_type == ConvOpType.kBackwardWeight.value:
# continue
# if desp.tile_shape !
if desp.dtype_a == dtypes.int8.tv_dtype:
inp = np.random.randint(-1, 1, size=[voxels_np.shape[0],
C]).astype(np.int8)
weight = np.random.randint(-1, 1, size=[K, *ksize,
C]).astype(np.int8)
output = np.random.randint(-1, 1, size=[
out_inds.shape[0], K
]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
else:
inp = np.random.uniform(-1, 1, size=[
voxels_np.shape[0], C
]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_input))
weight = np.random.uniform(-1, 1, size=[K, *ksize, C]).astype(
dtypes.get_npdtype_from_tvdtype(desp.dtype_weight))
output = np.random.uniform(-1, 1, size=[
out_inds.shape[0], K
]).astype(dtypes.get_npdtype_from_tvdtype(desp.dtype_output))
weight_ref = weight.transpose(1, 2, 3, 0, 4)
weight_ref = np.ascontiguousarray(weight_ref).reshape(-1, K, C)
if desp.op_type == ConvOpType.kBackwardInput.value:
inp_tv = tv.zeros(inp.shape, desp.dtype_input, 0)
else:
inp_tv = tv.from_numpy(inp).cuda()
if desp.op_type == ConvOpType.kBackwardWeight.value:
weight_tv = tv.zeros(weight.shape, desp.dtype_weight, 0)
else:
weight_tv = tv.from_numpy(weight).cuda()
# _ = tv.zeros([5000, 10], tv.float32, 0)
if desp.op_type == ConvOpType.kForward.value:
output_tv = tv.zeros(output.shape, desp.dtype_output, 0)
else:
output_tv = tv.from_numpy(output).cuda()
torch.cuda.synchronize()
t = time.time()
spk = 1
if desp.op_type == ConvOpType.kBackwardWeight.value:
# TODO support splitk parallel
spk = 32
if subm:
if desp.op_type == ConvOpType.kForward.value:
indice_pairs = pair_fwd
elif desp.op_type == ConvOpType.kBackwardInput.value:
indice_pairs = pair_bwd
else:
indice_pairs = pair_fwd
mask_output = mask_output_fwd
# print([bin(x.item()) for x in masks])
for j in range(num_split):
beta = 1 if j == 1 else 0
mask_filter = 0xffffffff
mask_filter = masks[j].item()
reverse_mask = False
if desp.op_type == ConvOpType.kBackwardWeight.value:
mask_op = mask_output[j]
else:
mask_op = pair_mask_fwd_splits[j]
if desp.op_type == ConvOpType.kBackwardInput.value:
reverse_mask = True
CONV.run_with_tuned_result(
BestConvAlgoByProfile(desp, spk),
desp.op_type,
inp_tv,
weight_tv,
output_tv,
torch_tensor_to_tv(mask_op, dtype=tv.uint32),
torch_tensor_to_tv(mask_argsort_fwd_splits[j]),
torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
torch_tensor_to_tv(indice_pairs),
reverse_mask,
mask_filter=mask_filter,
mask_width=32,
beta=beta,
verbose=True,
)
else:
if desp.op_type == ConvOpType.kForward.value:
indice_pairs = pair_fwd # inp -> out
mask_ops = pair_mask_fwd_splits
mask_argsorts = mask_argsort_fwd_splits
mask_output = mask_output_fwd
elif desp.op_type == ConvOpType.kBackwardInput.value:
indice_pairs = pair_bwd # out -> inp
mask_ops = pair_mask_bwd_splits
mask_argsorts = mask_argsort_bwd_splits
mask_output = mask_output_bwd
print([bin(x.item()) for x in masks])
else:
indice_pairs = pair_fwd # inp -> out
mask_ops = pair_mask_fwd_splits
mask_argsorts = mask_argsort_fwd_splits
mask_output = mask_output_fwd
for j in range(2):
beta = 1 if j == 1 else 0
mask_filter = masks[j].item()
reverse_mask = False
if desp.op_type == ConvOpType.kBackwardWeight.value:
mask_op = mask_output[j]
else:
mask_op = mask_ops[j]
CONV.run_with_tuned_result(
BestConvAlgoByProfile(desp, spk),
desp.op_type,
inp_tv,
weight_tv,
output_tv,
torch_tensor_to_tv(mask_op, dtype=tv.uint32),
torch_tensor_to_tv(mask_argsorts[j]),
torch_tensor_to_tv(mask_output[j], dtype=tv.uint32),
torch_tensor_to_tv(indice_pairs),
reverse_mask,
mask_filter=mask_filter,
mask_width=32,
beta=beta,
verbose=True,
)
torch.cuda.synchronize()
duration = time.time() - t
if desp.op_type == ConvOpType.kForward.value:
output_ref = np.zeros_like(output, dtype=np.float32)
# ref algorithm
for filter_offset in range(kv):
if subm and filter_offset > kv // 2:
nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
elif subm and filter_offset == kv // 2:
nhot = voxels.shape[0]
else:
nhot = indice_num_per_loc_np[filter_offset]
a_inds = indice_pairs_np[0][filter_offset][:nhot]
c_inds = indice_pairs_np[1][filter_offset][:nhot]
# print(a_inds_cpu[:10])
a = inp[a_inds]
cc = a.astype(
np.float32) @ weight_ref[filter_offset].T.astype(
np.float32)
output_ref[c_inds] += cc
output_cpu = output_tv.cpu().numpy().astype(np.float32)
duration = time.time() - t
my = output_cpu.reshape(-1)
print("ERROR", np.linalg.norm(output_ref.reshape(-1) - my))
elif desp.op_type == ConvOpType.kBackwardInput.value:
dinput_ref = np.zeros_like(inp, dtype=np.float32)
# ref algorithm
for filter_offset in range(kv):
if subm and filter_offset > kv // 2:
nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
elif subm and filter_offset == kv // 2:
nhot = voxels.shape[0]
else:
nhot = indice_num_per_loc_np[filter_offset]
a_inds = indice_pairs_np[1][filter_offset][:nhot]
c_inds = indice_pairs_np[0][filter_offset][:nhot]
# print(a_inds_cpu[:10])
a = output[a_inds]
# NK @ KC
cc = a.astype(
np.float32) @ weight_ref[filter_offset].astype(
np.float32)
dinput_ref[c_inds] += cc
din_cpu = inp_tv.cpu().numpy()
print(
"ERROR",
np.linalg.norm(
din_cpu.reshape(-1) - dinput_ref.reshape(-1)))
else:
dw_ref = np.zeros_like(weight_ref,
dtype=np.float32) # KV, K, C
for filter_offset in range(kv):
if subm and filter_offset > kv // 2:
nhot = indice_num_per_loc_np[kv - 1 - filter_offset]
elif subm and filter_offset == kv // 2:
nhot = voxels.shape[0]
else:
nhot = indice_num_per_loc_np[filter_offset]
o_inds = indice_pairs_np[1][filter_offset][:nhot]
i_inds = indice_pairs_np[0][filter_offset][:nhot]
# print(a_inds_cpu[:10])
out_gather = output[o_inds] # [N, K]
inp_gather = inp[i_inds] # [N, C]
# KN @ NC
dw_res = out_gather.astype(
np.float32).T @ inp_gather.astype(np.float32)
dw_ref[filter_offset] = dw_res
# print(indice_pairs_np_test[0])
dw_ref_kcrs = dw_ref.transpose(1, 0, 2)
dw_cpu = weight_tv.cpu().numpy().reshape(K, np.prod(ksize), C)
print(
"ERROR",
np.linalg.norm(
dw_cpu.reshape(-1) - dw_ref_kcrs.reshape(-1)))
if __name__ == "__main__":
dev_subm_inds_v2()
......@@ -17,3 +17,5 @@ from . import build as _build
from .core import ConvAlgo, AlgoHint
from . import constants
from .__version__ import __version__
SPCONV_VERSION_NUMBERS = list(map(int, __version__.split(".")))
\ No newline at end of file
......@@ -228,9 +228,9 @@ class SimpleGemm:
# skip volta tensor op since it is very slow in architectures except volta.
if arch >= (7, 5) and desp.algo == GemmAlgo.Volta.value:
continue
lda = a.dim(1)
ldb = b.dim(1)
ldc = c.dim(1)
lda = a.stride[0]
ldb = b.stride[0]
ldc = c.stride[0]
if desp.supported_ldx(lda, ldb, ldc):
if arch not in COMPILED_CUDA_ARCHS:
desp = desp.copy()
......@@ -422,15 +422,16 @@ class SimpleGemm:
c_inds.shape)
avail = self.get_all_available(a, b, c, trans_a, trans_b, trans_c,
arch, shuffle_type)
c_ = c.clone()
# c may be weight, may non-contiguous.
# cumm.tensorview.Tensor don't support non-contiguous clone
c_ = c.clone_whole_storage()
times: List[float] = []
best_gather_params = (-1, -1, -1, -1)
best_scatter_params = (-1, -1, -1, -1)
all_profile_res: List[BestAlgoByProfile] = []
for desp in avail:
c_.zero_()
c_.zero_whole_storage_()
split_k_slices = 1
# TODO better splitk selection
if desp.split_k_serial and hint & AlgoHint.BackwardWeight.value:
......
from .basic import bench_basic
import fire
def bench_me_basic(dtype_str: str):
from spconv.benchmark.me import bench_me_basic
return bench_me_basic(dtype_str)
def bench_torchsparse_basic(dtype_str: str):
from spconv.benchmark.thsp import bench_torchsparse_basic
return bench_torchsparse_basic(dtype_str)
if __name__ == "__main__":
fire.Fire()
from spconv.benchmark.core import get_voxel_data
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from cumm import tensorview as tv
from spconv.core import ConvAlgo
from cumm import dtypes
import spconv.pytorch as spconv
from spconv.test_utils import params_grid
class Net(nn.Module):
def __init__(self, shape, algo):
super().__init__()
pool_algo = algo
# pool_algo = ConvAlgo.Native
self.net = spconv.SparseSequential(
spconv.SubMConv3d(3, 64, 3, bias=False, indice_key="c0",
algo=algo),
spconv.SubMConv3d(64,
64,
3,
bias=False,
indice_key="c0",
algo=algo),
# nn.BatchNorm1d(32),
# nn.ReLU(),
# spconv.SparseConv3d(64, 64, 2, 2, bias=False, indice_key="m0"),
spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
spconv.SubMConv3d(64,
96,
3,
bias=False,
indice_key="c1",
algo=algo),
spconv.SubMConv3d(96,
96,
3,
bias=False,
indice_key="c1",
algo=algo),
# nn.BatchNorm1d(64),
# nn.ReLU(),
# spconv.SparseConv3d(96, 96, 2, 2, bias=False, indice_key="m1"),
spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
spconv.SubMConv3d(96,
128,
3,
bias=False,
indice_key="c2",
algo=algo),
spconv.SubMConv3d(128,
128,
3,
bias=False,
indice_key="c2",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# spconv.SparseConv3d(128, 128, 2, 2, bias=False, indice_key="m2"),
spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
spconv.SubMConv3d(128,
160,
3,
bias=False,
indice_key="c3",
algo=algo),
spconv.SubMConv3d(160,
160,
3,
bias=False,
indice_key="c3",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
# spconv.SparseConv3d(160, 160, 2, 2, bias=False, indice_key="m3"),
spconv.SparseMaxPool3d(2, 2, algo=pool_algo),
spconv.SubMConv3d(160,
192,
3,
bias=False,
indice_key="c4",
algo=algo),
spconv.SubMConv3d(192,
192,
3,
bias=False,
indice_key="c4",
algo=algo),
# nn.BatchNorm1d(128),
# nn.ReLU(),
spconv.SparseMaxPool3d(2, 2, indice_key="m4", algo=pool_algo),
# spconv.SparseConv3d(192, 192, 2, 2, bias=False, indice_key="m4"),
spconv.SubMConv3d(192,
224,
3,
bias=False,
indice_key="c5",
algo=algo),
spconv.SubMConv3d(224,
224,
3,
bias=False,
indice_key="c5",
algo=algo),
# nn.BatchNorm1d(224),
# nn.ReLU(),
# spconv.SparseConv3d(224, 224, 2, 2, bias=False, indice_key="m5"),
spconv.SparseMaxPool3d(2, 2, indice_key="m5", algo=pool_algo),
spconv.SubMConv3d(224,
256,
3,
bias=False,
indice_key="c6",
algo=algo),
spconv.SubMConv3d(256,
256,
3,
bias=False,
indice_key="c6",
algo=algo),
# nn.BatchNorm1d(256),
# nn.ReLU(),
# spconv.SparseInverseConv3d(256, 128, 2, indice_key="m5", bias=False, algo=algo),
# # # nn.BatchNorm1d(128),
# # # nn.ReLU(),
# spconv.SparseInverseConv3d(128, 64, 2, indice_key="m4", bias=False, algo=algo),
)
max_batch_size = 1
self.shape = shape
def forward(self, features, coors, batch_size, enable_timer: bool = False):
x = spconv.SparseConvTensor(features,
coors,
self.shape,
batch_size,
enable_timer=enable_timer)
return self.net(x)
_DTYPE_TO_TORCH_DTYPE = {
dtypes.float32: torch.float32,
dtypes.float16: torch.float16,
}
def bench_basic(dtype_str: str):
dtype = dtypes.get_dtype_by_shortcut(dtype_str)
if dtype not in _DTYPE_TO_TORCH_DTYPE:
raise NotImplementedError("only support bench f32 and f16 for now")
torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
algos = [spconv.ConvAlgo.Native, spconv.ConvAlgo.MaskImplicitGemm, spconv.ConvAlgo.MaskSplitImplicitGemm]
(voxels, coors, spatial_shape) = get_voxel_data()
device = torch.device("cuda:0")
for algo, in params_grid(algos):
voxels_th = torch.from_numpy(voxels).to(device).to(torch_dtype)
coors_th = torch.from_numpy(coors).to(device).int()
voxels_th.requires_grad = True
net = Net(spatial_shape, algo).to(device).train().to(torch_dtype)# .train()
spconv.assign_name_for_sparse_modules(net)
with torch.no_grad():
out: spconv.SparseConvTensor = net(voxels_th, coors_th, 1)
dout = np.random.uniform(-0.2, 0.2, out.features.shape).astype(np.float32)
dout_t = torch.from_numpy(dout).to(device).to(torch_dtype)
times = []
with torch.no_grad():
for i in range(100):
torch.cuda.synchronize()
t = time.time()
out_nograd = net(voxels_th, coors_th, 1, False)
timer = out_nograd._timer
torch.cuda.synchronize()
times.append(time.time() - t)
print(f"basic[{dtype_str}|{algo}|forward]", np.mean(times[50:]))
times = []
for i in range(50):
out = net(voxels_th, coors_th, 1)
torch.cuda.synchronize()
t = time.time()
out.features.backward(dout_t)
torch.cuda.synchronize()
times.append(time.time() - t)
print(f"basic[{dtype_str}|{algo}|backward]", np.mean(times[25:]))
if __name__ == "__main__":
bench_basic("f16")
\ No newline at end of file
import requests
import fire
import pickle
from io import BytesIO
import numpy as np
from spconv.constants import PACKAGE_ROOT
RAW_TEST_DATA_PATH = "https://raw.githubusercontent.com/traveller59/spconv/v2.1.10/test/data/test_spconv.pkl"
RAW_PC_PATH = "https://raw.githubusercontent.com/traveller59/spconv/v2.1.10/test/data/benchmark-pc.npz"
def get_voxel_data():
editable_test_data_path = PACKAGE_ROOT.parent / "test/data/test_spconv.pkl"
if editable_test_data_path.exists():
with editable_test_data_path.open("rb") as f:
return pickle.load(f)
ff = BytesIO()
with requests.get(RAW_TEST_DATA_PATH, stream=True) as req:
req.raise_for_status()
for chunk in req.iter_content(chunk_size=8192):
ff.write(chunk)
ff.seek(0)
(voxels, coors, spatial_shape) = pickle.load(ff)
return voxels, coors, spatial_shape
def get_pc_data():
editable_test_data_path = PACKAGE_ROOT.parent / "test/data/benchmark-pc.npz"
if editable_test_data_path.exists():
pc = np.load(str(editable_test_data_path))["pc"]
return pc
ff = BytesIO()
with requests.get(RAW_PC_PATH, stream=True) as req:
req.raise_for_status()
for chunk in req.iter_content(chunk_size=8192):
ff.write(chunk)
ff.seek(0)
pc = np.load(ff)["pc"]
return pc
if __name__ == "__main__":
pc = get_pc_data()
print(pc[:10])
\ No newline at end of file
"""Benchmark MinkowskiEngine
"""
from spconv.benchmark.core import get_voxel_data
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from spconv.core import ConvAlgo
from cumm import dtypes
from spconv.test_utils import params_grid
_DTYPE_TO_TORCH_DTYPE = {
dtypes.float32: torch.float32,
dtypes.float16: torch.float16,
}
def bench_me_basic(dtype_str: str):
dtype = dtypes.get_dtype_by_shortcut(dtype_str)
if dtype not in _DTYPE_TO_TORCH_DTYPE:
raise NotImplementedError("only support bench f32 and f16 for now")
torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
"""Benchmark torchsparse
"""
from spconv.benchmark.core import get_voxel_data
import time
from pathlib import Path
import numpy as np
import torch
from torch import nn
from spconv.core import ConvAlgo
from cumm import dtypes
from spconv.test_utils import params_grid
_DTYPE_TO_TORCH_DTYPE = {
dtypes.float32: torch.float32,
dtypes.float16: torch.float16,
}
def bench_torchsparse_basic(dtype_str: str):
dtype = dtypes.get_dtype_by_shortcut(dtype_str)
if dtype not in _DTYPE_TO_TORCH_DTYPE:
raise NotImplementedError("only support bench f32 and f16 for now")
torch_dtype = _DTYPE_TO_TORCH_DTYPE[dtype]
......@@ -25,10 +25,20 @@ PACKAGE_ROOT = Path(__file__).parent.resolve()
EDITABLE_INSTALLED = project_is_installed(
PACKAGE_NAME) and project_is_editable(PACKAGE_NAME)
_filter_hwio_env = os.getenv("SPCONV_FILTER_HWIO", "0")
FILTER_HWIO = _filter_hwio_env == "1"
_filter_hwio_env = os.getenv("SPCONV_FILTER_HWIO", None)
if _filter_hwio_env is not None:
raise NotImplementedError("SPCONV_FILTER_HWIO is deprecated. use SPCONV_SAVED_WEIGHT_LAYOUT instead.")
DISABLE_JIT = os.getenv("SPCONV_DISABLE_JIT", "0") == "1"
NDIM_DONT_CARE = 3
FILTER_HWIO = False
SAVED_WEIGHT_LAYOUT = os.getenv("SPCONV_SAVED_WEIGHT_LAYOUT", "")
if SAVED_WEIGHT_LAYOUT != "":
assert SAVED_WEIGHT_LAYOUT in ["KRSC", "RSKC", "RSCK"], "please set SAVED_WEIGHT_LAYOUT to KRSC, RSKC or RSCK"
ALL_WEIGHT_IS_KRSC = True
SPCONV_DEBUG_SAVE_PATH = os.getenv("SPCONV_DEBUG_SAVE_PATH", "")
......@@ -63,3 +73,4 @@ class SpconvAllocatorKeys:
MaskArgSortBwd = "MaskArgSortBwd"
OutFeatures = "OutFeatures"
SPCONV_DEBUG_WEIGHT = False
......@@ -83,6 +83,9 @@ SHUFFLE_SIMT_PARAMS: List[GemmAlgoParams] = [
"f32,f32,f32,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((32, 32, 32), (32, 32, 8), ["f32,f32,f32,f32,f32"],
"f32,f32,f32,f32,f32", 2, kernel.GemmAlgo.Simt, None),
*gen_shuffle_params((16, 32, 8), (16, 16, 8), ["f32,f32,f32,f32,f32"],
"f32,f32,f32,f32,f32", 2, kernel.GemmAlgo.Simt, None),
# fall back kernels if mat is misaligned for half
# TODO use access-per-vector kernel instead of simt kernel for fallback
*gen_shuffle_params((128, 128, 8), (32, 64, 8), ["f16,f16,f16,f32,f32"],
......
......@@ -5,6 +5,20 @@ class ThrustCustomAllocatorV2:
alloc_func: Callable[int, int]
class SpconvOps:
@staticmethod
def cumm_version() -> str:
"""
get cumm version when build spconv.
"""
...
@staticmethod
def pccm_version() -> str:
"""
get pccm version when build spconv.
"""
...
@staticmethod
def generate_conv_inds_stage1(indices: Tensor, indice_pairs: Tensor, indice_pairs_uniq: Tensor, indice_num_per_loc: Tensor, batch_size: int, output_dims: List[int], input_dims: List[int], ksize: List[int], stride: List[int], padding: List[int], dilation: List[int], transposed: bool = False, stream_int: int = 0) -> None:
"""
Args:
......@@ -300,6 +314,13 @@ class SpconvOps:
"""
...
@staticmethod
def reverse_bits(a: Tensor) -> Tensor:
"""
Args:
a:
"""
...
@staticmethod
def calc_point2voxel_meta_data(vsize_xyz: List[float], coors_range_xyz: List[float]) -> Tuple[List[float], List[int], List[int], List[float]]:
"""
Args:
......
......@@ -19,6 +19,10 @@ if hasattr(_ext, "cumm"):
else:
CPU_ONLY_BUILD = True
from spconv.core_cc.csrc.sparse.all import SpconvOps
BUILD_CUMM_VERSION = SpconvOps.cumm_version()
BUILD_PCCM_VERSION = SpconvOps.pccm_version()
from spconv.core_cc.csrc.utils.boxops import BoxOps
from spconv.core_cc.cumm.common import CompileInfo
HAS_BOOST = BoxOps.has_boost()
......
......@@ -13,11 +13,13 @@
# limitations under the License.
from cumm.common import TensorView, TensorViewCPU, TensorViewKernel, ThrustLib, GemmBasicHost
import cumm
from cumm.conv.bases import ConvOpType, NHWC
from cumm.conv.params import ConvProblem
from cumm import dtypes
from cumm.constants import CUMM_CPU_ONLY_BUILD
import pccm
from pccm.__version__ import __version__ as pccm_version
from ccimport import compat
from .pointops import Point2Voxel, Point2VoxelCPU
from .indices import SparseConvIndicesKernel, CudaCommonKernel, SparseConvIndicesCPU
......@@ -99,6 +101,28 @@ class SpconvOps(pccm.Class):
self.add_impl_only_param_class(cuda_funcs, f"ops{ndim}d",
indices,
f"SpconvIndices{ndim}D")
@pccm.pybind.mark
@pccm.static_function
def cumm_version(self):
"""get cumm version when build spconv.
"""
code = pccm.FunctionCode()
code.raw(f"""
return \"{cumm.__version__}\";
""")
return code.ret("std::string")
@pccm.pybind.mark
@pccm.static_function
def pccm_version(self):
"""get pccm version when build spconv.
"""
code = pccm.FunctionCode()
code.raw(f"""
return \"{pccm_version}\";
""")
return code.ret("std::string")
@pccm.pybind.mark
@pccm.cuda.static_function
......@@ -853,6 +877,66 @@ class SpconvOps(pccm.Class):
""")
return code.ret("tv::Tensor")
@pccm.pybind.mark
@pccm.cuda.static_function
def reverse_bits(self):
code = pccm.FunctionCode()
if CUMM_CPU_ONLY_BUILD:
return code.make_invalid()
code.add_dependency(TensorViewKernel)
code.arg("a", "tv::Tensor")
code.code_after_include = f"""
__global__ void reverse_bits_kernel_64(const uint64_t* data, uint64_t* out, int size){{
for (int i : tv::KernelLoopX<int>(size)){{
out[i] = __brevll(reinterpret_cast<const unsigned long long*>(data)[i]);
}}
}}
__global__ void reverse_bits_kernel(const uint32_t* data, uint32_t* out, int size){{
for (int i : tv::KernelLoopX<int>(size)){{
out[i] = __brev(data[i]);
}}
}}
uint32_t reverse(uint32_t x)
{{
x = ((x >> 1) & 0x55555555u) | ((x & 0x55555555u) << 1);
x = ((x >> 2) & 0x33333333u) | ((x & 0x33333333u) << 2);
x = ((x >> 4) & 0x0f0f0f0fu) | ((x & 0x0f0f0f0fu) << 4);
x = ((x >> 8) & 0x00ff00ffu) | ((x & 0x00ff00ffu) << 8);
x = ((x >> 16) & 0xffffu) | ((x & 0xffffu) << 16);
return x;
}}
int reverse(uint64_t i)
{{
return (reverse(uint32_t(i)) << 32) | reverse(uint32_t(i >> 32));
}}
"""
code.raw(f"""
tv::Tensor res(a.shape(), a.dtype(), a.device());
tv::dispatch<uint32_t, uint64_t>(a.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
auto res_ptr = res.data_ptr<T>();
auto a_ptr = a.data_ptr<const T>();
if (a.device() == -1){{
for (int i = 0; i < a.size(); ++i){{
res_ptr[i] = reverse(a_ptr[i]);
}}
}}else{{
tv::cuda::Launch launcher(a.size());
tv::if_constexpr<std::is_same<T, uint64_t>::value>([=](auto _)mutable{{
launcher(_(reverse_bits_kernel_64), a_ptr, res_ptr, int(a.size()));
}}, [=](auto _)mutable{{
launcher(_(reverse_bits_kernel), a_ptr, res_ptr, int(a.size()));
}});
}}
}});
return res;
""")
return code.ret("tv::Tensor")
@pccm.pybind.mark
@pccm.static_function
def calc_point2voxel_meta_data(self):
......
......@@ -16,7 +16,6 @@ import pccm
from ccimport import compat
from cumm.common import TensorView
class OMPLib(pccm.Class):
def __init__(self):
super().__init__()
......
......@@ -13,7 +13,7 @@
# limitations under the License.
import pccm
from cumm.common import TensorView
from cumm.common import TensorView, GemmDTypes
from cumm.constants import CUMM_CPU_ONLY_BUILD
from spconv.csrc.sparse.cpu_core import OMPLib
from typing import List
......@@ -24,7 +24,7 @@ class GatherCPU(pccm.Class):
super().__init__()
if CUMM_CPU_ONLY_BUILD:
self.add_dependency(OMPLib)
self.add_dependency(TensorView)
self.add_dependency(TensorView, GemmDTypes)
self.add_include("tensorview/parallel/all.h")
@pccm.static_function
......@@ -39,7 +39,7 @@ class GatherCPU(pccm.Class):
auto nhot = inds.dim(0);
int channel = in.dim(1);
tv::dispatch<float, double>(out.dtype(), [&](auto I){{
tv::dispatch<float, double, tv::bfloat16_t, tv::half_t>(out.dtype(), [&](auto I){{
auto indices_data = inds.data_ptr<const int>();
using T = TV_DECLTYPE(I);
T *buffer_data = out.data_ptr<T>();
......@@ -65,7 +65,7 @@ class GatherCPU(pccm.Class):
// tv::check_shape(inds, {{in.dim(0)}});
auto nhot = inds.dim(0);
int channel = in.dim(1);
tv::dispatch<float, double>(out.dtype(), [&](auto I){{
tv::dispatch<float, double, tv::bfloat16_t, tv::half_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
auto indices_data = inds.data_ptr<const int>();
const T *buffer_data = in.data_ptr<const T>();
......
......@@ -17,7 +17,7 @@ from cumm.gemm.core.metaarray import MetaArray, seq
from cumm import dtypes
import pccm
from cumm.gemm.layout import TensorGeneric, to_stride
from cumm.common import TensorView, TensorViewHashKernel, TensorViewKernel, ThrustLib, GemmBasic
from cumm.common import TensorView, GemmDTypes, TensorViewKernel, ThrustLib, GemmBasic
from cumm.gemm import codeops
from typing import List
from cumm.conv.params import ConvProblem
......@@ -352,7 +352,7 @@ class IndiceMaxPool(pccm.Class):
class IndiceMaxPoolCPU(pccm.Class):
def __init__(self):
super().__init__()
self.add_dependency(TensorView)
self.add_dependency(TensorView, GemmDTypes)
if CUMM_CPU_ONLY_BUILD:
self.add_dependency(OMPLib)
self.add_include("tensorview/parallel/all.h")
......@@ -369,7 +369,7 @@ class IndiceMaxPoolCPU(pccm.Class):
code.raw(f"""
int nhot = out_inds.dim(0);
int num_features = in.dim(1);
tv::dispatch<float, double>(out.dtype(), [&](auto I){{
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
auto out_features = out.data_ptr<T>();
auto in_features = in.data_ptr<const T>();
......@@ -409,7 +409,7 @@ class IndiceMaxPoolCPU(pccm.Class):
code.raw(f"""
int nhot = out_inds.dim(0);
int num_features = in.dim(1);
tv::dispatch<float, double>(out.dtype(), [&](auto I){{
tv::dispatch<float, double, tv::half_t, tv::bfloat16_t>(out.dtype(), [&](auto I){{
using T = TV_DECLTYPE(I);
auto out_features = out.data_ptr<const T>();
auto in_features = in.data_ptr<const T>();
......
......@@ -27,3 +27,15 @@ try:
except:
# for unknown errors, just set a version
PYTORCH_VERSION = [1, 8, 0]
if PYTORCH_VERSION >= [1, 6, 0]:
TORCH_HAS_AMP = True
else:
TORCH_HAS_AMP = False
def is_amp_enabled():
if TORCH_HAS_AMP:
return torch.is_autocast_enabled()
else:
return False
\ No newline at end of file
......@@ -24,6 +24,7 @@ from torch.nn import init
from torch.nn.parameter import Parameter
from spconv import pytorch as spconv
from spconv import SPCONV_VERSION_NUMBERS
from spconv.core import ConvAlgo
from spconv.debug_utils import spconv_save_debug_data
from spconv.pytorch import functional as Fsp
......@@ -31,10 +32,25 @@ from spconv.pytorch import ops
from spconv.cppconstants import CPU_ONLY_BUILD
from spconv.pytorch.core import IndiceData, SparseConvTensor, ImplicitGemmIndiceData, expand_nd
from spconv.pytorch.modules import SparseModule
from spconv.constants import FILTER_HWIO
from spconv.constants import SAVED_WEIGHT_LAYOUT, ALL_WEIGHT_IS_KRSC, SPCONV_DEBUG_WEIGHT
from spconv.utils import nullcontext
from torch.nn.init import calculate_gain
FILTER_HWIO = False
def expand_nd(val: Union[int, List[int], Tuple[int, ...]], ndim: int) -> List[int]:
if isinstance(val, int):
val = [val] * ndim
elif isinstance(val, list):
assert len(val) == ndim
elif isinstance(val, tuple):
assert len(val) == ndim
return [*val]
else:
raise NotImplementedError
return val
class SparseConvolution(SparseModule):
__constants__ = [
......@@ -101,7 +117,7 @@ class SparseConvolution(SparseModule):
self.algo = algo
self.fp32_accum = fp32_accum
# self.algo = ConvAlgo.Native
if self.algo == ConvAlgo.Native:
if self.algo == ConvAlgo.Native and not ALL_WEIGHT_IS_KRSC:
if FILTER_HWIO:
# RSCK
self.weight = Parameter(
......@@ -121,6 +137,37 @@ class SparseConvolution(SparseModule):
self.register_parameter('bias', None)
self.reset_parameters()
self._register_load_state_dict_pre_hook(self._load_weight_different_layout)
def _load_weight_different_layout(
self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
if not SAVED_WEIGHT_LAYOUT:
return
key = prefix + "weight"
assert key in state_dict
ndim = self.ndim
if SAVED_WEIGHT_LAYOUT == "RSKC":
state_dict[key] = state_dict[key].permute(ndim, *range(ndim), ndim + 1).contiguous()
elif SAVED_WEIGHT_LAYOUT == "RSCK":
state_dict[key] = state_dict[key].permute(ndim + 1, *range(ndim), ndim).contiguous()
if ALL_WEIGHT_IS_KRSC or self.algo != ConvAlgo.Native:
# in spconv 2.2, we only support KRSC layout.
if SAVED_WEIGHT_LAYOUT == "RSKC":
state_dict[key] = state_dict[key].permute(ndim, *range(ndim), ndim + 1).contiguous()
elif SAVED_WEIGHT_LAYOUT == "RSCK":
state_dict[key] = state_dict[key].permute(ndim + 1, *range(ndim), ndim).contiguous()
else:
if self.algo == ConvAlgo.Native:
# to RSCK
if SAVED_WEIGHT_LAYOUT == "RSKC":
state_dict[key] = state_dict[key].permute(*range(ndim), ndim + 1, ndim).contiguous()
elif SAVED_WEIGHT_LAYOUT == "KRSC":
state_dict[key] = state_dict[key].permute(*range(1, ndim + 1), 0, ndim + 1).contiguous()
def extra_repr(self):
s = ('{in_channels}, {out_channels}, kernel_size={kernel_size}'
', stride={stride}')
......@@ -175,7 +222,11 @@ class SparseConvolution(SparseModule):
return tensor.uniform_(-bound, bound)
def reset_parameters(self):
self._custom_kaiming_uniform_(self.weight, a=math.sqrt(5))
if SPCONV_DEBUG_WEIGHT:
self._custom_kaiming_uniform_(self.weight, a=math.sqrt(0.005))
else:
self._custom_kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = self._calculate_fan_in_and_fan_out()
bound = 1 / math.sqrt(fan_in)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment