Commit 899008fa authored by yan.yan's avatar yan.yan
Browse files

working on c++ only

parent f78575ea
......@@ -248,8 +248,7 @@ class ConvOutLocIter(pccm.ParameterizedClass):
class SparseConvIndicesKernel(pccm.ParameterizedClass):
def __init__(self, problem: ConvProblem, dtype_indices: dtypes.DType):
super().__init__()
self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel,
ThrustLib)
self.add_dependency(TensorView, TensorViewKernel, TensorViewHashKernel)
self.loc_iter = ConvOutLocIter(problem)
self.add_param_class("spinds", self.loc_iter, "ConvLocIter")
self.add_param_class("spinds", problem, "ConvProblem")
......@@ -271,7 +270,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("indice_pairs",
f"{self.dtype_indices}*") # [2, kernelProd, MaxSize]
code.arg("indice_pairs_for_uniq",
f"TIndiceUniq*") # [2, kernelProd, MaxSize]
f"TIndiceUniq*") # [kernelProd * MaxSize + 1]
code.arg("indice_num_per_loc", f"int*") # [kernelProd]
code.arg("num_indices_in", "int")
......@@ -340,7 +339,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("indice_pairs_out_part", f"int*") # [kernelProd, MaxSize]
code.arg("num_indices_in", "int")
code.arg("indices_pair_size", "int")
# TODO use block instead of filter_offset?
code.raw(f"""
int filter_offset = blockIdx.y;
auto indice_pairs_out_part_filter = indice_pairs_out_part + filter_offset * indices_pair_size;
......@@ -358,6 +356,46 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
""")
return code
@pccm.cuda.cuda_global_function
def calc_conv_indices_stage2_bounded(self):
    """Emit the CUDA kernel that re-compacts indice pairs for bounded outputs.

    When the number of output indices is bounded, some of the pairs
    produced earlier may refer to output locations that are absent from
    the hash table. The generated kernel therefore walks every candidate
    pair again, keeps only those whose output key is found in ``table``,
    and rewrites the in/out pair arrays densely, using an atomic
    per-filter counter (``indice_num_per_loc``) to pick the write slot.

    The caller is expected to pass a temporary copy of the input-side
    pairs as ``indice_pairs_in_part_temp`` (the host code reuses the
    ``indice_pairs_uniq`` buffer for it), because ``indice_pairs_in_part``
    is overwritten in place.

    Returns:
        The ``pccm.FunctionCode`` object describing the kernel.
    """
    code = pccm.FunctionCode()
    code.targ("TTable")
    # Hash table passed by value; the original note gave its shape as
    # [N, ndim + 1] -- TODO confirm, this looks copied from another arg.
    code.arg("table", f"TTable")
    code.arg("indice_pairs_uniq_before_sort",
             f"const typename TTable::key_type*")  # [kernelProd, MaxSize]
    code.arg("indice_pairs_in_part_temp",
             f"const int*")  # [kernelProd, MaxSize]
    code.arg("indice_pairs_in_part", f"int*")  # [kernelProd, MaxSize]
    code.arg("indice_pairs_out_part", f"int*")  # [kernelProd, MaxSize]
    code.arg("indice_num_per_loc", f"int*")  # [kernelProd]
    code.arg("num_indices_in", "int")
    code.arg("indices_pair_size", "int")
    # NOTE: the key read from indice_pairs_uniq_before_sort must be held in
    # TTable::key_type. Storing it in the (possibly narrower) index dtype
    # would truncate 64-bit keys, breaking both the "empty" sentinel test
    # against numeric_limits<key_type>::max() and the table lookup.
    code.raw(f"""
    int filter_offset = blockIdx.y;
    auto indice_pairs_in_part_filter = indice_pairs_in_part + filter_offset * indices_pair_size;
    auto indice_pairs_out_part_filter = indice_pairs_out_part + filter_offset * indices_pair_size;
    auto indice_pairs_in_part_temp_filter = indice_pairs_in_part_temp + filter_offset * indices_pair_size;
    auto indice_pairs_uniq_before_sort_filter = indice_pairs_uniq_before_sort + filter_offset * indices_pair_size;
    for (int i : tv::KernelLoopX<int>(num_indices_in)) {{
        typename TTable::key_type output_coord_offset = indice_pairs_uniq_before_sort_filter[i];
        if (output_coord_offset != std::numeric_limits<typename TTable::key_type>::max()){{
            auto table_offset = table.lookup_offset(output_coord_offset);
            if (table_offset != -1){{
                int old_num = tv::cuda::atomicAggInc(indice_num_per_loc + filter_offset);
                indice_pairs_in_part_filter[old_num] = indice_pairs_in_part_temp_filter[i];
                indice_pairs_out_part_filter[old_num] = table.value_ptr()[table_offset];
            }}
        }}
    }}
    """)
    return code
@pccm.cuda.cuda_global_function
def calc_conv_indices_stage1_mask(self):
code = pccm.FunctionCode()
......@@ -369,7 +407,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("indice_pairs_bwd",
f"{self.dtype_indices}*") # [kernelProd, MaxSize]
code.arg("indice_pairs_for_uniq",
f"TIndiceUniq*") # [2, kernelProd, MaxSize]
f"TIndiceUniq*") # [kernelProd * MaxSize + 1]
code.arg("indice_num_per_loc", f"int*") # [kernelProd]
code.arg("num_indices_in", "int")
......@@ -397,6 +435,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// indice_pairs[filter_offset_mul_indices_pair_size + old_num] = i;
// indice_pairs_bwd[filter_offset_mul_indices_pair_size + input_index] = output_coord_offset;
// indice_pairs_for_uniq[filter_offset_mul_indices_pair_size + old_num] = output_coord_offset;
indice_pairs_for_uniq[filter_offset_mul_indices_pair_size + input_index] = output_coord_offset;
// }}
}}
......@@ -420,7 +459,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("num_indices_in", "int")
code.arg("num_indices_out", "int")
# TODO use block instead of filter_offset?
code.raw(f"""
int filter_offset = blockIdx.y;
uint32_t filter_mask_fwd = (1u << (filter_offset));
......@@ -458,7 +496,6 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code.arg("num_indices_in", "int")
code.arg("kv", "int")
# TODO use block instead of filter_offset?
code.raw(f"""
for (int input_index : tv::KernelLoopX<int>(num_indices_in)) {{
uint32_t mask = 0;
......@@ -749,18 +786,13 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
indice_pairs_uniq.data_ptr<T>(), indice_num_per_loc.data_ptr<int>(), indices.dim(0),
indice_pairs.dim(2), kv, transposed);
}});
// thrust::device_ptr<{self.dtype_indices}> ptr_tr(indice_pairs_uniq.data_ptr<{self.dtype_indices}>());
// auto thrust_ctx = thrust::cuda::par.on(reinterpret_cast<cudaStream_t>(stream_int));
// thrust::sort(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto new_end = thrust::unique(thrust_ctx, ptr_tr, ptr_tr + uniq_size);
// auto num_out_act = new_end - ptr_tr - 1;
// return num_out_act;
""")
return code # .ret("int")
@pccm.cuda.static_function
def generate_conv_inds_stage1_5(self):
code = pccm.FunctionCode()
code.add_dependency(ThrustLib)
code.arg("indice_pairs_uniq", "tv::Tensor")
code.arg("uniq_size", "int64_t")
code.arg("stream_int", f"std::uintptr_t", "0")
......@@ -783,6 +815,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
code = pccm.FunctionCode()
code.arg("indices, hashdata_k, hashdata_v", "tv::Tensor")
code.arg("indice_pairs, indice_pairs_uniq, indice_pairs_uniq_before_sort, out_inds", "tv::Tensor")
code.arg("indice_num_per_loc", "tv::Tensor")
code.arg("num_out_act", "int")
code.arg("batch_size", "int")
code.arg("output_dims, input_dims", f"tv::array<int, {self.ndim}>")
......@@ -790,6 +824,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
code.arg("use_bound_algo", "bool", "false")
code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream_int);
// TODO stream
......@@ -798,6 +834,8 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
TV_ASSERT_RT_ERR(kv == indice_pairs.dim(1), "error");
TV_ASSERT_RT_ERR(hashdata_k.dtype() == indice_pairs_uniq.dtype(), "error");
TV_ASSERT_RT_ERR(hashdata_v.dtype() == tv::int32, "error");
auto ctx = tv::Context();
ctx.set_cuda_stream(custream);
// indice_pairs: [2, kv, indices.dim(0)]
// indice_pairs_uniq: [indice_pairs.size() / 2 + 1]
......@@ -805,6 +843,10 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
// auto timer = tv::CudaContextTimer<>();
int64_t uniq_size = indice_pairs.size() / 2 + 1;
TV_ASSERT_RT_ERR(indice_pairs_uniq.dim(0) >= num_out_act, "error");
// int num_out_act_bounded = num_out_act;
// if (num_out_act_bound > 0){{
// num_out_act_bounded = std::min(num_out_act_bounded, num_out_act);
// }}
TV_ASSERT_RT_ERR(out_inds.dim(0) >= num_out_act && out_inds.dim(1) == {self.ndim + 1}, "error");
tv::cuda::Launch launcher_num_act_in(indices.dim(0), custream);
launcher_num_act_in.blocks.y = kv;
......@@ -827,11 +869,29 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
lanucher_build_hash(build_conv_hash_table<table_t>, hash,
out_inds.data_ptr<int>(), indice_pairs_uniq.data_ptr<const K>(),
loc_iter.layout_npq, num_out_act);
if (!use_bound_algo){{
launcher_num_act_in(calc_conv_indices_stage2<table_t>, hash,
indice_pairs_uniq_before_sort.data_ptr<const K>(),
indice_pairs[1].data_ptr<int>(),
indices.dim(0),
indice_pairs.dim(2));
}}else{{
indice_num_per_loc.zero_(ctx);
// copy previous pair in to indice_pairs_uniq
// we need to ensure size of indice_pairs_uniq larger than pair in
TV_ASSERT_RT_ERR({pccm.literal(self.dtype_indices == dtypes.int32)}, "error");
tv::Tensor indice_pairs_in_temp = tv::from_blob(indice_pairs_uniq.raw_data(), {{indice_pairs.dim(1), indice_pairs.dim(2)}},
indice_pairs.dtype(), indice_pairs.device());
indice_pairs_in_temp.copy_(indice_pairs[0].view(-1), ctx);
launcher_num_act_in(calc_conv_indices_stage2_bounded<table_t>, hash,
indice_pairs_uniq_before_sort.data_ptr<const K>(),
indice_pairs_in_temp.data_ptr<const int>(),
indice_pairs[0].data_ptr<int>(),
indice_pairs[1].data_ptr<int>(),
indice_num_per_loc.data_ptr<int>(),
indices.dim(0),
indice_pairs.dim(2));
}}
}});
return num_out_act;
""")
......@@ -899,6 +959,7 @@ class SparseConvIndicesKernel(pccm.ParameterizedClass):
f"tv::array<int, {self.ndim}>")
code.arg("transposed", f"bool", "false")
code.arg("stream_int", f"std::uintptr_t", "0")
code.raw(f"""
auto custream = reinterpret_cast<cudaStream_t>(stream_int);
// TODO stream
......
import os
import fire
from cumm.common import CompileInfo
from cumm.conv.main import ConvMainUnitTest
from cumm.gemm.main import GemmMainUnitTest
from pccm.builder.pybind import gen_cmake
from spconv.core import (IMPLGEMM_SIMT_PARAMS, IMPLGEMM_TURING_PARAMS,
IMPLGEMM_VOLTA_PARAMS, SHUFFLE_SIMT_PARAMS,
SHUFFLE_TURING_PARAMS, SHUFFLE_VOLTA_PARAMS)
from spconv.csrc.hash.core import HashTable
from spconv.csrc.sparse.all import SpconvOps
from spconv.csrc.sparse.alloc import ExternalAllocator
from spconv.csrc.sparse.convops import (ConvGemmOps, ConvTunerSimple,
ExternalSpconvMatmul, GemmTunerSimple,
SimpleExternalSpconvMatmul)
from spconv.csrc.utils import BoxOps
def main(include: str,
         src: str,
         libname: str = "spconv",
         prefix: str = "spconvlib"):
    """Assemble every spconv compile unit and hand them to ``gen_cmake``.

    NVRTC-only kernel parameter sets are excluded from both the GEMM and
    the implicit-GEMM convolution units before the units are built.

    Args:
        include: forwarded to ``gen_cmake`` (presumably the header output
            directory -- confirm against pccm).
        src: forwarded to ``gen_cmake`` (presumably the source output
            directory -- confirm against pccm).
        libname: library name passed to ``gen_cmake``.
        prefix: value passed as ``namespace_prefix`` to ``gen_cmake``.
    """
    shuffle_candidates = (SHUFFLE_SIMT_PARAMS + SHUFFLE_VOLTA_PARAMS +
                          SHUFFLE_TURING_PARAMS)
    shuffle_params = [p for p in shuffle_candidates if not p.is_nvrtc]
    gemm_unit = GemmMainUnitTest(shuffle_params)
    gemm_unit.namespace = "cumm.gemm.main"

    implgemm_candidates = (IMPLGEMM_SIMT_PARAMS + IMPLGEMM_VOLTA_PARAMS +
                           IMPLGEMM_TURING_PARAMS)
    implgemm_params = [p for p in implgemm_candidates if not p.is_nvrtc]
    conv_unit = ConvMainUnitTest(implgemm_params)
    conv_unit.namespace = "cumm.conv.main"

    # The tuners wrap the main units; namespaces are assigned right after
    # each object is created, in the same order as before.
    gemm_tuner = GemmTunerSimple(gemm_unit)
    gemm_tuner.namespace = "csrc.sparse.convops.gemmops"
    conv_tuner = ConvTunerSimple(conv_unit)
    conv_tuner.namespace = "csrc.sparse.convops.convops"
    conv_gemm_ops = ConvGemmOps(gemm_tuner, conv_tuner)
    conv_gemm_ops.namespace = "csrc.sparse.convops.spops"

    compile_units = [
        gemm_unit,
        conv_unit,
        gemm_tuner,
        conv_tuner,
        conv_gemm_ops,
        SpconvOps(),
        BoxOps(),
        HashTable(),
        CompileInfo(),
        ExternalAllocator(),
        ExternalSpconvMatmul(),
        SimpleExternalSpconvMatmul(),
    ]
    gen_cmake(libname, compile_units, include, src, namespace_prefix=prefix)


if __name__ == "__main__":
    # Expose main() as a command-line entry point through python-fire.
    fire.Fire(main)
......@@ -38,20 +38,6 @@ from torch.nn.init import calculate_gain
FILTER_HWIO = False
def expand_nd(val: Union[int, List[int], Tuple[int, ...]], ndim: int) -> List[int]:
    """Expand a scalar or per-axis sequence into a list with ``ndim`` entries.

    Args:
        val: a single int (repeated for every axis) or a list/tuple that
            already holds one value per axis.
        ndim: number of axes the result must have.

    Returns:
        A newly created list of length ``ndim``. The result never aliases
        the caller's sequence, so mutating it later cannot change ``val``.

    Raises:
        AssertionError: if ``val`` is a list/tuple whose length is not ``ndim``.
        NotImplementedError: if ``val`` is not an int, list, or tuple.
    """
    if isinstance(val, int):
        return [val] * ndim
    if isinstance(val, (list, tuple)):
        assert len(val) == ndim
        # Copy lists as well as tuples: previously a list argument was
        # returned as-is, sharing storage with the caller.
        return list(val)
    raise NotImplementedError
class SparseConvolution(SparseModule):
__constants__ = [
'stride', 'padding', 'dilation', 'groups', 'bias', 'subm', 'inverse',
......@@ -82,6 +68,7 @@ class SparseConvolution(SparseModule):
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = expand_nd(ndim, kernel_size)
self.stride = expand_nd(ndim, stride)
kv = int(np.prod(self.kernel_size))
kv_stride = int(np.prod(self.stride))
......@@ -130,7 +117,6 @@ class SparseConvolution(SparseModule):
# KRSC
self.weight = Parameter(
torch.Tensor(out_channels, *self.kernel_size, in_channels))
if bias:
self.bias = Parameter(torch.Tensor(out_channels))
else:
......
This diff is collapsed.
This diff is collapsed.
"""This file can only be used by spconv developers for now.
The "tensorpc" package isn't an open-source project.
"""
import tensorpc
from tensorpc.apps.flow.flowapp import App
class TestApp(App):
    """Empty placeholder app: adds nothing on top of the ``App`` base class."""
\ No newline at end of file
......@@ -25,7 +25,7 @@ import spconv.pytorch as spconv
from spconv.utils import Point2VoxelCPU3d
# torch.backends.cudnn.enabled = False
def waymo_data(batch_size=1):
def waymo_data(batch_size=1, num_features=-1):
gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 3,
150000, 1)
# gen = VoxelGeneratorV2([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6], 1,
......@@ -35,11 +35,39 @@ def waymo_data(batch_size=1):
print(pc.shape)
voxels_tv, indices_tv, _ = gen.point_to_voxel(tv.from_numpy(pc))
voxels = voxels_tv.numpy().reshape(-1, 3)
if num_features > 0:
voxels = np.zeros((voxels.shape[0], num_features), dtype=voxels.dtype)
coors = indices_tv.numpy()
N = coors.shape[0]
coors = np.concatenate([np.full([N, 1], 0, coors.dtype), coors], axis=1)
return voxels, coors, gen.grid_size
def waymo_data_large(batch_size=1):
    """Build a large voxelized benchmark input from five shifted copies.

    Loads ``data/benchmark-pc.npz`` next to this file, stacks the point
    cloud with four copies whose column 1 is offset by 1, 2, 3 and 4,
    voxelizes the result and prepends an all-zero column to the voxel
    coordinates.

    Args:
        batch_size: accepted for signature parity with ``waymo_data`` but
            not used by this function.

    Returns:
        Tuple ``(voxels, coors, grid_size)`` where ``voxels`` is reshaped
        to ``(-1, 3)``, ``coors`` has the extra leading zero column, and
        ``grid_size`` comes from the voxel generator.
    """
    voxel_gen = Point2VoxelCPU3d([0.1, 0.1, 0.1], [-80, -80, -2, 80, 80, 6],
                                 3, 1200000, 1)
    archive = np.load(Path(__file__).parent / "data" / "benchmark-pc.npz")
    base_pc = np.ascontiguousarray(archive["pc"])

    # Original cloud first, then copies shifted in place along column 1.
    clouds = [base_pc]
    for shift in (1, 2, 3, 4):
        moved = base_pc.copy()
        moved[:, 1] += shift
        clouds.append(moved)
    pc = np.concatenate(clouds)
    print(pc.shape)

    voxels_tv, indices_tv, _ = voxel_gen.point_to_voxel(tv.from_numpy(pc))
    voxels = voxels_tv.numpy().reshape(-1, 3)
    coors = indices_tv.numpy()
    num_voxels = coors.shape[0]
    print("num voxels", num_voxels)

    # Leading column of zeros, same dtype as the coordinates.
    zero_col = np.full([num_voxels, 1], 0, coors.dtype)
    coors = np.concatenate([zero_col, coors], axis=1)
    return voxels, coors, voxel_gen.grid_size
class Net(nn.Module):
def __init__(self, shape, algo):
......@@ -61,6 +89,21 @@ class Net(nn.Module):
# # algo=algo),
# spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
# algo=algo),
# spconv.SubMConv3d(64, 64, 3, bias=False, indice_key="c0",
# algo=algo),
# spconv.SubMConv3d(32,
# 32,
# 3,
# bias=False,
# indice_key="c0",
# algo=algo),
# # nn.BatchNorm1d(32),
# # nn.ReLU(),
# # spconv.SparseConv3d(64, 64, 2, 2, bias=False,
# # algo=algo),
# spconv.SubMConv3d(32, 64, 3, bias=False, indice_key="c0",
# algo=algo),
spconv.SubMConv3d(64,
64,
3,
......@@ -275,7 +318,7 @@ def main():
import pickle
np.random.seed(50051)
torch.manual_seed(50051)
# voxels, coors, spatial_shape = waymo_data()
# voxels, coors, spatial_shape = waymo_data(num_features=128)
# with open("/home/yy/test_spconv.pkl", "wb") as f:
# pickle.dump((voxels, coors, spatial_shape), f)
with open(Path(__file__).parent / "data" / "test_spconv.pkl", "rb") as f:
......@@ -312,7 +355,7 @@ def main():
# MaskImpGemm: 51.0ms
# MaskSplitImpGemm: 41.1ms
# algo = None
net = Net(spatial_shape, algo).to(device).eval().to(dtype)# .train()
net = Net(spatial_shape, algo).to(device).eval().to(dtype).train()
# net.load_state_dict(net.state_dict())
spconv.assign_name_for_sparse_modules(net)
print(coors_th.shape)
......@@ -345,18 +388,18 @@ def main():
print("spconv time", np.mean(times[10:]))
times = []
# for i in range(10):
# out = net(voxels_th, coors_th, 1)
# print("------------")
# torch.cuda.synchronize()
# t = time.time()
# out.features.backward(dout_t)
# torch.cuda.synchronize()
# times.append(time.time() - t)
# # # print((net.grid == -1).float().sum(), net.grid.numel())
# # # print("spconv time", time.time() - t)
# print("spconv bw time", np.mean(times[5:]))
for i in range(10):
out = net(voxels_th, coors_th, 1)
print("------------")
torch.cuda.synchronize()
t = time.time()
out.features.backward(dout_t)
torch.cuda.synchronize()
times.append(time.time() - t)
# # print((net.grid == -1).float().sum(), net.grid.numel())
# # print("spconv time", time.time() - t)
print("spconv bw time", np.mean(times[5:]))
if __name__ == "__main__":
......
This diff is collapsed.
......@@ -248,7 +248,7 @@ def test_spconv3d():
ConvAlgo.Native, ConvAlgo.MaskImplicitGemm,
ConvAlgo.MaskSplitImplicitGemm
]
# algos = [ConvAlgo.Native]
algos = [ConvAlgo.Native]
for dev, shape, bs, IC, OC, k, s, p, d, al in params_grid(
devices, shapes, batchsizes, in_channels, out_channels, ksizes,
......@@ -308,7 +308,6 @@ def test_spconv3d():
filters_t = torch.from_numpy(filters).to(device).to(dtype)
net_ref.net[0].weight.data[:] = filters_t.permute(
0, 4, 1, 2, 3).contiguous()
net.net[0].weight.data[:] = filters_t
out_ref = net_ref(features_dense_t)
out = net(features_t, indices_t, bs).dense()
......@@ -529,4 +528,4 @@ def test_spmaxpool3d():
if __name__ == "__main__":
test_spmaxpool3d()
test_spconv3d()
......@@ -222,6 +222,7 @@ class NetLight(nn.Module):
def _test_multi_impl(dtype: torch.dtype):
# TODO: PyTorch 1.12 doesn't support half-precision matmul (mm) on CPU.
# TODO remove or release this when tf32 op is ready
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
......@@ -239,8 +240,6 @@ def _test_multi_impl(dtype: torch.dtype):
np.float32)
coors = np.ascontiguousarray(
sparse_dict["indices"][:, [3, 0, 1, 2]]).astype(np.int32)
device = torch.device("cuda:0")
device_cpu = torch.device("cpu:0")
......@@ -275,17 +274,21 @@ def _test_multi_impl(dtype: torch.dtype):
dout_t = torch.from_numpy(dout).to(device_cpu).to(dtype)
dout_t_cu = torch.from_numpy(dout).to(device).to(dtype)
t = time.time()
print(1, time.time() - t)
out_cpu = net_native_cpu(voxels_th, coors_th, 1).dense()
if dtype != torch.float16:
out_cpu.backward(dout_t)
out = net_native_gpu(voxels_th_cuda, coors_th_cuda, 1).dense()
print(2, time.time() - t)
out.backward(dout_t_cu)
out_imp = net_imp_gpu(voxels_th_cuda, coors_th_cuda, 1).dense()
print(3, time.time() - t)
out_imp.backward(dout_t_cu)
out_simp = net_simp_gpu(voxels_th_cuda, coors_th_cuda, 1).dense()
print(4, time.time() - t)
out_simp.backward(dout_t_cu)
with torch.no_grad():
......@@ -297,6 +300,7 @@ def _test_multi_impl(dtype: torch.dtype):
error_native = torch.linalg.norm(dense_cpu - dense_native).cpu().item()
error_imp = torch.linalg.norm(dense_cpu - dense_imp).cpu().item()
error_simp = torch.linalg.norm(dense_cpu - dense_simp).cpu().item()
print(5, time.time() - t)
print("error_native", error_native)
print("error_imp", error_imp)
......@@ -320,15 +324,15 @@ def _test_multi_impl(dtype: torch.dtype):
native_w = native_params[k]
imp_w = imp_params[k]
simp_w = simp_params[k]
cpu_w_grad = cpu_w.grad.detach().cuda()
native_w_grad = native_w.grad.detach()
imp_w_grad = imp_w.grad.detach()
simp_w_grad = simp_w.grad.detach()
if dtype != torch.float16:
cpu_w_grad = cpu_w.grad.detach().cuda()
error_native = torch.linalg.norm(native_w_grad - cpu_w_grad).cpu().item()
error_imp = torch.linalg.norm(native_w_grad - imp_w_grad).cpu().item()
error_simp = torch.linalg.norm(native_w_grad - simp_w_grad).cpu().item()
print(k, error_native, error_imp, error_simp)
print(k, error_imp, error_simp)
assert error_imp < 1
assert error_simp < 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment