Commit 01ed382c authored by yan.yan's avatar yan.yan
Browse files

working on tensor core test

parent 3517290c
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import numpy as np
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from cumm import tensorview as tv
import torch
from typing import Optional, List
# Lookup table from torch dtypes to the tensorview (tv) dtype constants.
# torch_tensor_to_tv indexes this table when no explicit dtype is given, so a
# torch dtype that is not listed here (e.g. torch.bool) raises KeyError there.
_TORCH_DTYPE_TO_TV = {
    torch.float32: tv.float32,
    torch.float64: tv.float64,
    torch.float16: tv.float16,
    torch.int32: tv.int32,
    torch.int64: tv.int64,
    torch.int8: tv.int8,
    torch.int16: tv.int16,
    torch.uint8: tv.uint8,
}
def torch_tensor_to_tv(ten: torch.Tensor, dtype: Optional[int] = None, shape: Optional[List[int]] = None):
    """Describe the memory of a contiguous torch tensor as a tensorview tensor.

    The result is built by ``tv.from_blob`` from the tensor's raw data pointer.
    NOTE(review): this presumably yields a non-owning view, in which case
    ``ten`` must outlive the returned object -- confirm against cumm.

    Args:
        ten: contiguous tensor whose device type is "cpu" or "cuda".
        dtype: tv dtype to report; when None it is looked up from
            ``ten.dtype`` in ``_TORCH_DTYPE_TO_TV``.
        shape: shape to report; when None ``list(ten.shape)`` is used.

    Returns:
        The object returned by ``tv.from_blob(ptr, shape, dtype, device)``.

    Raises:
        AssertionError: if ``ten`` is not contiguous.
        NotImplementedError: if the device type is neither "cpu" nor "cuda".
        KeyError: if ``dtype`` is None and ``ten.dtype`` is not in the table.
    """
    assert ten.is_contiguous(), "must be contiguous tensor"
    data_address = ten.data_ptr()
    device_kind = ten.device.type
    if device_kind not in ("cpu", "cuda"):
        raise NotImplementedError
    # -1 is handed to tv for host memory, 0 for CUDA memory.
    tv_device = -1 if device_kind == "cpu" else 0
    tv_shape = list(ten.shape) if shape is None else shape
    tv_dtype = _TORCH_DTYPE_TO_TV[ten.dtype] if dtype is None else dtype
    return tv.from_blob(data_address, tv_shape, tv_dtype, tv_device)
def get_current_stream():
    """Return the ``cuda_stream`` attribute of torch's current CUDA stream."""
    active_stream = torch.cuda.current_stream()
    return active_stream.cuda_stream
if __name__ == "__main__":
    # Manual smoke test: wrap a small random CPU tensor and print the numpy
    # view exposed by the resulting tensorview object.
    sample = torch.rand(2, 2)
    print(torch_tensor_to_tv(sample).numpy_view())
\ No newline at end of file
# Copyright 2019-2020 Yan Yan
#
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -16,7 +16,7 @@ import torch
from torch import nn
from torch.autograd import Function
import spconv.ops as ops
import spconv.pytorch.ops as ops
class SparseConvFunction(Function):
......
# Copyright 2019-2020 Yan Yan
#
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
from collections import OrderedDict
......@@ -19,7 +20,7 @@ from collections import OrderedDict
import torch
from torch import nn
import spconv
from spconv import pytorch as spconv
def is_spconv_module(module):
......@@ -28,7 +29,7 @@ def is_spconv_module(module):
def is_sparse_conv(module):
    """Return True when ``module`` is an instance of ``SparseConvolution``."""
    # The class is imported inside the function rather than at module level
    # (presumably to avoid a circular import -- confirm). Only the
    # ``spconv.pytorch`` location is used; the older ``spconv.conv`` import
    # that preceded it was superseded and would be attempted first.
    from spconv.pytorch.conv import SparseConvolution
    return isinstance(module, SparseConvolution)
......@@ -145,7 +146,7 @@ class SparseSequential(SparseModule):
def fused(self):
"""don't use this. no effect.
"""
from spconv.conv import SparseConvolution
from spconv.pytorch.conv import SparseConvolution
mods = [v for k, v in self._modules.items()]
fused_mods = []
idx = 0
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum
from cumm import tensorview as tv
from cumm.gemm.algospec.core import ShuffleStrideType
import torch
import numpy as np
import spconv
from spconv.algo import AlgoHint, ConvAlgo
from typing import List, Union
from spconv.pytorch.cppcore import torch_tensor_to_tv, get_current_stream
from spconv.core_cc.csrc.sparse.all import SpconvOps
from spconv.algo import GEMM# , GATHER, SCATTER
import time
from spconv.constants import FILTER_HWIO
def get_conv_output_size(input_size, kernel_size, stride, padding, dilation):
    """Compute the per-axis output extent of a strided, dilated convolution.

    Every argument is a sequence with one entry per spatial axis. An axis
    whose kernel size is -1 always yields an output extent of 1.

    Returns:
        A list with one output extent per entry of ``input_size``.
    """
    output_size = []
    for axis, extent in enumerate(input_size):
        # The regular extent is evaluated first, even for a -1 kernel, so any
        # indexing or division error surfaces the same way in both cases.
        regular = (extent + 2 * padding[axis] - dilation[axis] *
                   (kernel_size[axis] - 1) - 1) // stride[axis] + 1
        output_size.append(1 if kernel_size[axis] == -1 else regular)
    return output_size
def get_deconv_output_size(input_size, kernel_size, stride, padding, dilation,
                           output_padding):
    """Compute the per-axis output extent of a transposed convolution.

    Uses the standard transposed-convolution relation

        out = (in - 1) * stride - 2 * padding
              + dilation * (kernel - 1) + output_padding + 1

    For ``dilation == 1`` this equals the previous
    ``(in - 1) * stride - 2 * padding + kernel + output_padding``; the
    previous code ignored ``dilation`` and was wrong for any other value.

    Every argument is a sequence with one entry per spatial axis.

    Returns:
        A list with one output extent per entry of ``input_size``.

    Raises:
        ValueError: if any kernel size is -1.
    """
    ndim = len(input_size)
    output_size = []
    for i in range(ndim):
        if kernel_size[i] == -1:
            raise ValueError("deconv don't support kernel_size < 0")
        size = ((input_size[i] - 1) * stride[i] - 2 * padding[i] +
                dilation[i] * (kernel_size[i] - 1) + output_padding[i] + 1)
        output_size.append(size)
    return output_size
def get_indice_pairs(indices: torch.Tensor,
                     batch_size: int,
                     spatial_shape: List[int],
                     algo: ConvAlgo,
                     ksize: Union[int, List[int]],
                     stride: Union[int, List[int]],
                     padding: Union[int, List[int]],
                     dilation: Union[int, List[int]],
                     out_padding: Union[int, List[int]],
                     subm: bool = False,
                     transpose: bool = False):
    """Generate output indices and input/output index pairs for a sparse conv.

    Scalar ``ksize``/``stride``/``padding``/``dilation``/``out_padding`` are
    broadcast to ``ndim = indices.shape[1] - 1`` entries. The index generation
    itself is delegated to ``SpconvOps`` kernels running on torch's current
    CUDA stream.

    Args:
        indices: 2-D index tensor; presumably one batch column followed by
            ``ndim`` spatial columns -- TODO confirm column order.
        batch_size: forwarded to the ``SpconvOps`` kernels.
        spatial_shape: input spatial extents.
        algo: only ``ConvAlgo.Native`` is accepted.
        ksize, stride, padding, dilation, out_padding: int or per-axis list.
        subm: when True the output shares the input indices and spatial shape.
        transpose: not supported yet (asserted False).

    Returns:
        ``(out_inds, pair, indice_num_per_loc)`` where ``pair`` has shape
        ``(2, kv, indices.shape[0])`` initialised to -1 and
        ``indice_num_per_loc`` has shape ``(kv,)`` initialised to 0, with
        ``kv = prod(ksize)``. For ``subm`` ``out_inds`` is ``indices`` itself.

    Raises:
        AssertionError: if ``algo`` is not ``ConvAlgo.Native`` or
            ``transpose`` is True.
    """
    ndim = indices.shape[1] - 1
    if not isinstance(ksize, (list, tuple)):
        ksize = [ksize] * ndim
    if not isinstance(stride, (list, tuple)):
        stride = [stride] * ndim
    if not isinstance(padding, (list, tuple)):
        padding = [padding] * ndim
    if not isinstance(dilation, (list, tuple)):
        dilation = [dilation] * ndim
    if not isinstance(out_padding, (list, tuple)):
        out_padding = [out_padding] * ndim
    # number of kernel offsets
    kv: int = int(np.prod(ksize))
    if not subm:
        if transpose:
            out_shape = get_deconv_output_size(spatial_shape, ksize, stride,
                                               padding, dilation, out_padding)
        else:
            out_shape = get_conv_output_size(spatial_shape, ksize, stride,
                                             padding, dilation)
    else:
        out_shape = spatial_shape
    # The transposed branch above is computed but then rejected here.
    assert algo == ConvAlgo.Native and not transpose, "TODO"
    stream = get_current_stream()
    # pair[0] / pair[1] are later read as the input / output index tables
    # (see the indice_pairs[int(inverse)] indexing in indice_conv).
    pair = torch.full((2, kv, indices.shape[0]),
                      -1,
                      dtype=indices.dtype,
                      device=indices.device)
    indice_num_per_loc = torch.zeros((kv, ),
                                     dtype=indices.dtype,
                                     device=indices.device)
    inds_tv = torch_tensor_to_tv(indices)
    pair_tv = torch_tensor_to_tv(pair)
    indice_num_per_loc_tv = torch_tensor_to_tv(indice_num_per_loc)
    # torch.cuda.synchronize()
    # t = time.time()
    if subm:
        out_inds = indices
        # int64 scratch buffer with two slots per output index, handed to the
        # kernel as tv.custom64 (presumably hash-table storage -- confirm).
        hashdata = torch.empty((out_inds.shape[0] * 2, ),
                               dtype=torch.int64,
                               device=indices.device)
        out_inds_tv = torch_tensor_to_tv(out_inds)
        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
        SpconvOps.generate_subm_conv_inds(inds_tv,
                                          hashdata_tv,
                                          pair_tv,
                                          out_inds_tv,
                                          indice_num_per_loc_tv,
                                          batch_size=batch_size,
                                          input_dims=spatial_shape,
                                          ksize=ksize,
                                          dilation=dilation,
                                          stream_int=stream)
        # torch.cuda.synchronize()
        # print("SUBM INDICE GEN", time.time() - t)
    else:
        # One slot per (offset, input) pair plus one extra element.
        indice_pairs_uniq = torch.empty((pair.numel() // 2 + 1, ),
                                        dtype=indices.dtype,
                                        device=indices.device)
        indice_pairs_uniq_tv = torch_tensor_to_tv(indice_pairs_uniq)
        # Stage 1 returns the number of active outputs, which sizes the
        # buffers allocated for stage 2.
        num_act_out = SpconvOps.generate_conv_inds_stage1(
            inds_tv,
            pair_tv,
            indice_pairs_uniq_tv,
            indice_num_per_loc_tv,
            batch_size=batch_size,
            output_dims=out_shape,
            input_dims=spatial_shape,
            ksize=ksize,
            stride=stride,
            padding=padding,
            dilation=dilation,
            stream_int=stream)
        out_inds = torch.empty((num_act_out, indices.shape[1]),
                               dtype=indices.dtype,
                               device=indices.device)
        hashdata = torch.empty((out_inds.shape[0] * 2, ),
                               dtype=torch.int64,
                               device=indices.device)
        out_inds_tv = torch_tensor_to_tv(out_inds)
        hashdata_tv = torch_tensor_to_tv(hashdata, dtype=tv.custom64)
        SpconvOps.generate_conv_inds_stage2(inds_tv,
                                            hashdata_tv,
                                            pair_tv,
                                            indice_pairs_uniq_tv,
                                            out_inds_tv,
                                            num_out_act=num_act_out,
                                            batch_size=batch_size,
                                            output_dims=out_shape,
                                            input_dims=spatial_shape,
                                            ksize=ksize,
                                            stride=stride,
                                            padding=padding,
                                            dilation=dilation,
                                            stream_int=stream)
        # torch.cuda.synchronize()
        # print("INDICE GEN", time.time() - t)
    return out_inds, pair, indice_num_per_loc
def indice_conv(features: torch.Tensor,
                filters: torch.Tensor,
                indice_pairs: torch.Tensor,
                indice_pair_num: torch.Tensor,
                num_activate_out: int,
                inverse: bool = False,
                subm: bool = False,
                algo: ConvAlgo = ConvAlgo.Native):
    """Sparse convolution forward: one indexed GEMM per kernel offset.

    ``filters`` is flattened to ``(kv, *filters.shape[-2:])``. For ``subm``
    the centre offset is computed with a dense ``torch.mm`` and the remaining
    offsets accumulate on top of it; otherwise the output starts as zeros.
    A GEMM algorithm is looked up through ``GEMM.get_profiled_algo`` and, when
    none is cached, obtained from ``GEMM.profile_and_cache``.

    Args:
        features: input feature rows.
        filters: kernel weights; the output channel is the last dimension when
            ``FILTER_HWIO`` is set and the second-to-last otherwise.
        indice_pairs: index table; ``indice_pairs[int(inverse)]`` addresses
            ``features`` and ``indice_pairs[int(not inverse)]`` the output.
        indice_pair_num: number of valid pairs per kernel offset.
        num_activate_out: number of output rows (used when ``subm`` is False).
        inverse: swaps the roles of the two index tables.
        subm: submanifold mode.
        algo: not referenced in this function body.

    Returns:
        The output feature tensor.

    Raises:
        NotImplementedError: for int8 / qint8 features.
    """
    # filters: RSKC
    # torch.cuda.synchronize()
    # t = time.time()
    if features.dtype == torch.int8 or features.dtype == torch.qint8:
        raise NotImplementedError("work in progress")
    if FILTER_HWIO:
        out_channel = filters.shape[-1]
    else:
        out_channel = filters.shape[-2]
    filters = filters.reshape(-1, *filters.shape[-2:])
    kv = filters.shape[0]
    kv_center = kv // 2
    if subm:
        # centre offset handled with a dense matmul over all rows
        if FILTER_HWIO:
            out_features = torch.mm(features, filters[kv_center])
        else:
            out_features = torch.mm(features, filters[kv_center].T)
    else:
        out_features = torch.zeros((num_activate_out, out_channel),
                                   dtype=features.dtype,
                                   device=features.device)
    if kv == 1 and subm:
        # a single-offset submanifold conv is just the dense matmul above
        return out_features
    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    arch = torch.cuda.get_device_capability()
    # True once out_features holds data that later GEMMs must add to
    inited: bool = subm
    a = torch_tensor_to_tv(features)
    c = torch_tensor_to_tv(out_features)
    # offset whose pair count is used for the algorithm lookup / profiling;
    # the centre is skipped for subm, so its neighbour is used instead
    profile_idx = kv_center
    if subm:
        profile_idx = kv_center - 1
    # profile_idx = first_n
    nhot_profile = indice_pair_num_cpu[profile_idx]
    # print(nhot_profile, indice_pair_num_cpu)
    # NOTE: `Fowrard` is the member name this file reads from AlgoHint.
    profile_res = GEMM.get_profiled_algo(
        a.shape,
        filters.shape[-2:],
        c.shape,
        False,
        False if FILTER_HWIO else True,
        False,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAC,
        a_inds_shape=[nhot_profile],
        c_inds_shape=[nhot_profile],
        hint=AlgoHint.Fowrard.value)
    # The next three locals are not read again in this function.
    gather_data_tv = tv.Tensor()
    scatter_data_tv = tv.Tensor()
    maxnhot = max(indice_pair_num_cpu)
    if profile_res is None:
        # run profile on center
        inp_indices_th = indice_pairs[int(inverse)][profile_idx, :nhot_profile]
        out_indices_th = indice_pairs[int(not inverse)][
            profile_idx, :nhot_profile]
        inp_indices = torch_tensor_to_tv(inp_indices_th)
        out_indices = torch_tensor_to_tv(out_indices_th)
        filter_tv = torch_tensor_to_tv(filters)[profile_idx]
        profile_res, min_time = GEMM.profile_and_cache(
            a,
            filter_tv,
            c,
            False,
            False if FILTER_HWIO else True,
            False,
            arch=arch,
            shuffle_type=ShuffleStrideType.ShuffleAC,
            a_inds=inp_indices,
            c_inds=out_indices,
            alpha=1.0,
            beta=0.0,
            hint=AlgoHint.Fowrard.value,
            stream=stream)
    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    pair_in = indice_pairs_tv[int(inverse)]
    pair_out = indice_pairs_tv[int(not inverse)]
    filters_tv = torch_tensor_to_tv(filters)
    for i, nhot in enumerate(indice_pair_num_cpu):
        if subm and i == kv_center:
            # already computed with torch.mm above
            continue
        if subm and i > kv_center:
            # offsets past the centre reuse the count of the mirrored offset
            nhot = indice_pair_num_cpu[kv - i - 1]
        if nhot <= 0:
            continue
        inp_indices = pair_in[i].slice_first_axis(0, nhot)
        out_indices = pair_out[i].slice_first_axis(0, nhot)
        # inp_indices = torch_tensor_to_tv(inp_indices_th)
        # out_indices = torch_tensor_to_tv(out_indices_th)
        b = filters_tv[i]
        # inp @ filter.T, NC @ KC
        # beta is 0.0 only for the first GEMM into a fresh output and 1.0
        # afterwards (presumably BLAS-style scaling of existing C -- confirm)
        beta = 1.0 if inited else 0.0
        algo_desp = GEMM.run_profile(
            profile_res,
            a,
            b,
            c,
            False,
            False if FILTER_HWIO else True,
            False,
            arch=arch,
            stream=stream,
            shuffle_type=ShuffleStrideType.ShuffleAC,
            a_inds=inp_indices,
            c_inds=out_indices,
            hint=AlgoHint.Fowrard.value,
            alpha=1.0,
            beta=beta)
        # gather_times += gather_time
        inited = True
    # torch.cuda.synchronize()
    # print(stream, valid_count, maxnhot, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times, txt)
    # print(algo_desp, profile_res.external_gather, profile_res.splitk, features.shape[0], features.shape[1], out_channel, time.time() - t, total_times)
    # print(indice_pair_num_cpu)
    # print(time.time() - t)
    return out_features
def fused_indice_conv(features, filters, bias, indice_pairs, indice_pair_num,
                      num_activate_out, inverse, subm):
    """Placeholder for a fused sparse convolution; always raises.

    Raises:
        NotImplementedError: unconditionally.
    """
    raise NotImplementedError()
def indice_conv_backward(features: torch.Tensor,
                         filters: torch.Tensor,
                         out_bp: torch.Tensor,
                         indice_pairs: torch.Tensor,
                         indice_pair_num: torch.Tensor,
                         inverse: bool = False,
                         subm: bool = False,
                         algo: ConvAlgo = ConvAlgo.Native):
    """Sparse convolution backward: input gradient and filter gradient.

    ``filters`` is flattened to ``(kv, *filters.shape[-2:])``. For ``subm``
    the centre offset is handled with dense ``torch.mm`` calls; every other
    offset runs two indexed GEMMs, one for the input gradient (dgrad) and one
    for the weight gradient (wgrad).

    Args:
        features: forward input feature rows (contiguous).
        filters: kernel weights (contiguous after flattening).
        out_bp: gradient w.r.t. the forward output (contiguous).
        indice_pairs: index table; ``indice_pairs[int(inverse)]`` addresses
            ``features`` / ``din`` rows and ``indice_pairs[int(not inverse)]``
            addresses ``out_bp`` rows.
        indice_pair_num: number of valid pairs per kernel offset.
        inverse: swaps the roles of the two index tables.
        subm: submanifold mode.
        algo: not referenced in this function body.

    Returns:
        ``(din, dfilters)`` where ``dfilters`` has the original filter shape.

    Raises:
        AssertionError: if ``out_bp``, the flattened ``filters`` or
            ``features`` is not contiguous.
    """
    # workspace = torch.empty((10000), dtype=torch.uint8, device=features.device)
    # workspace_tv = torch_tensor_to_tv(workspace)
    # torch.cuda.synchronize()
    # t = time.time()
    filters_shape = filters.shape
    filters = filters.reshape(-1, *filters.shape[-2:])
    kv = filters.shape[0]
    kv_center = kv // 2
    assert out_bp.is_contiguous()
    assert filters.is_contiguous()
    assert features.is_contiguous()
    dfilters = torch.zeros_like(filters)
    if subm:
        # centre offset handled with dense matmuls over all rows
        if FILTER_HWIO:
            torch.mm(features.T, out_bp, out=dfilters[kv_center])
            # TODO can we use torch mm for f16 backward weight?
            din = torch.mm(out_bp, filters[kv_center].T)
        else:
            torch.mm(out_bp.T, features, out=dfilters[kv_center])
            # TODO can we use torch mm for f16 backward weight?
            din = torch.mm(out_bp, filters[kv_center])
    else:
        din = torch.zeros_like(features)
    if kv == 1 and subm:
        return (din, dfilters.reshape(filters_shape))

    indice_pairs_tv = torch_tensor_to_tv(indice_pairs)
    # torch slice (a_th[x]) is very slow, so we need to use tv.Tensor earlier.
    pair_in = indice_pairs_tv[int(inverse)]
    pair_out = indice_pairs_tv[int(not inverse)]
    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    arch = torch.cuda.get_device_capability()
    filters_tv = torch_tensor_to_tv(filters)
    dfilters_tv = torch_tensor_to_tv(dfilters)
    out_bp_tv = torch_tensor_to_tv(out_bp)
    features_tv = torch_tensor_to_tv(features)
    din_tv = torch_tensor_to_tv(din)

    # offset whose pair count is used for the algorithm lookup / profiling;
    # the centre is skipped for subm, so its neighbour is used instead
    profile_idx = kv_center
    if subm:
        profile_idx = kv_center - 1
    nhot_profile = indice_pair_num_cpu[profile_idx]
    # print(nhot_profile, indice_pair_num_cpu)

    # ---- dgrad: din[inp] += out_bp[out] @ filter (transposed for HWIO) ----
    dgrad_trans_b = True if FILTER_HWIO else False
    profile_res_dgrad = GEMM.get_profiled_algo(
        out_bp_tv.shape,
        filters.shape[-2:],
        din_tv.shape,
        False,
        dgrad_trans_b,
        False,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAC,
        a_inds_shape=[nhot_profile],
        c_inds_shape=[nhot_profile],
        hint=AlgoHint.BackwardInput.value)
    if profile_res_dgrad is None:
        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
        # A is out_bp and C is din, so A must be gathered with the output
        # indices and C scattered with the input indices -- exactly as in the
        # real dgrad call in the loop below. Profiling previously passed them
        # the other way round, indexing out_bp with input-row indices.
        profile_res_dgrad, _ = GEMM.profile_and_cache(
            out_bp_tv,
            filters_tv[profile_idx],
            din_tv,
            False,
            dgrad_trans_b,
            False,
            arch=arch,
            shuffle_type=ShuffleStrideType.ShuffleAC,
            a_inds=out_indices,
            c_inds=inp_indices,
            alpha=1.0,
            beta=0.0,
            # scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
            hint=AlgoHint.BackwardInput.value,
            stream=stream)

    # ---- wgrad: dfilter = A[a_inds].T @ B[b_inds]; operand order follows
    # the filter layout ----
    if not FILTER_HWIO:
        a_wgrad = out_bp_tv
        b_wgrad = features_tv
    else:
        a_wgrad = features_tv
        b_wgrad = out_bp_tv
    profile_res_wgrad = GEMM.get_profiled_algo(
        a_wgrad.shape,
        b_wgrad.shape,
        filters.shape[-2:],
        True,
        False,
        False,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAB,
        a_inds_shape=[nhot_profile],
        b_inds_shape=[nhot_profile],
        hint=AlgoHint.BackwardWeight.value)
    if profile_res_wgrad is None:
        inp_indices = pair_in[profile_idx].slice_first_axis(0, nhot_profile)
        out_indices = pair_out[profile_idx].slice_first_axis(0, nhot_profile)
        if not FILTER_HWIO:
            a_inds_wgrad = out_indices
            b_inds_wgrad = inp_indices
        else:
            a_inds_wgrad = inp_indices
            b_inds_wgrad = out_indices
        profile_res_wgrad, _ = GEMM.profile_and_cache(
            a_wgrad,
            b_wgrad,
            dfilters_tv[profile_idx],
            True,
            False,
            False,
            arch=arch,
            shuffle_type=ShuffleStrideType.ShuffleAB,
            a_inds=a_inds_wgrad,
            b_inds=b_inds_wgrad,
            alpha=1.0,
            beta=0.0,
            # scatter_data=scatter_data_tv.slice_first_axis(0, nhot_profile),
            hint=AlgoHint.BackwardWeight.value,
            stream=stream)
    # print(profile_res_wgrad.algo_desp, profile_res_wgrad.splitk, min_time)

    # get workspace size for wgrad, sized for the largest pair count
    maxnhot = max(indice_pair_num_cpu)
    if not FILTER_HWIO:
        a_shape = [maxnhot, out_bp_tv.dim(1)]
        b_shape = [maxnhot, features_tv.dim(1)]
    else:
        b_shape = [maxnhot, out_bp_tv.dim(1)]
        a_shape = [maxnhot, features_tv.dim(1)]
    m, n, k = GEMM.extract_mnk(
        a_shape, b_shape, profile_res_wgrad.algo_desp.trans_a,
        profile_res_wgrad.algo_desp.trans_b,
        profile_res_wgrad.algo_desp.trans_c,
        arch=arch,
        shuffle_type=ShuffleStrideType.ShuffleAB,
        a_inds_shape=[maxnhot],
        b_inds_shape=[maxnhot],
        hint=AlgoHint.BackwardWeight.value)
    workspace_size = profile_res_wgrad.algo_desp.query_workspace_size(
        m, n, k, profile_res_wgrad.splitk)
    # `workspace` stays bound until the function returns so the buffer behind
    # workspace_tv is not released while the GEMMs use it (workspace_tv is
    # presumably a non-owning view -- confirm).
    workspace = torch.Tensor()
    workspace_tv = tv.Tensor()
    if workspace_size > 0:
        workspace = torch.empty((workspace_size,), dtype=torch.int8, device=features.device)
        workspace_tv = torch_tensor_to_tv(workspace)
    # print(workspace_size, m, n, k, profile_res_wgrad.splitk)
    # torch.cuda.synchronize()
    # di_time = time.time() - t
    # t = time.time()

    # True once din / dfilters hold data that later GEMMs must add to
    inited = subm
    for i, nhot in enumerate(indice_pair_num_cpu):
        if subm and i == kv_center:
            # already computed with torch.mm above
            continue
        if subm and i > kv_center:
            # offsets past the centre reuse the count of the mirrored offset
            nhot = indice_pair_num_cpu[kv - i - 1]
        if nhot <= 0:
            continue
        beta = 1.0 if inited else 0.0
        inp_indices = pair_in[i].slice_first_axis(0, nhot)
        out_indices = pair_out[i].slice_first_axis(0, nhot)
        # out.T @ inp, NK @ NC
        # print(features_tv.shape, out_bp_tv.shape)
        GEMM.run_profile(profile_res_dgrad,
                         out_bp_tv,
                         filters_tv[i],
                         din_tv,
                         False,
                         dgrad_trans_b,
                         False,
                         arch=arch,
                         stream=stream,
                         shuffle_type=ShuffleStrideType.ShuffleAC,
                         a_inds=out_indices,
                         c_inds=inp_indices,
                         hint=AlgoHint.BackwardInput.value,
                         alpha=1.0,
                         beta=beta)
        if not FILTER_HWIO:
            a_inds = out_indices
            b_inds = inp_indices
        else:
            a_inds = inp_indices
            b_inds = out_indices
        GEMM.run_profile(profile_res_wgrad,
                         a_wgrad,
                         b_wgrad,
                         dfilters_tv[i],
                         True,
                         False,
                         False,
                         arch=arch,
                         stream=stream,
                         shuffle_type=ShuffleStrideType.ShuffleAB,
                         a_inds=a_inds,
                         b_inds=b_inds,
                         hint=AlgoHint.BackwardWeight.value,
                         alpha=1.0,
                         beta=beta,
                         workspace=workspace_tv)
        inited = True
    # torch.cuda.synchronize()
    # dw_time = time.time() - t
    # # print(dw_time + di_time, di_time, dw_time, profile_res_wgrad.splitk, profile_res_wgrad.algo_desp, dfilters.shape)
    # # print(dw_time + di_time)
    # print(time.time() - t)
    return (din, dfilters.reshape(filters_shape))
def indice_maxpool(features, indice_pairs, indice_pair_num, num_activate_out):
    """Sparse max-pool forward over every kernel offset with valid pairs.

    Allocates a zero-filled ``(num_activate_out, features.shape[-1])`` output
    and calls ``SpconvOps.maxpool_forward`` once per offset whose pair count
    is positive, on torch's current CUDA stream.

    Returns:
        The pooled output tensor.
    """
    pooled = torch.zeros((num_activate_out, features.shape[-1]),
                         dtype=features.dtype,
                         device=features.device)
    stream = get_current_stream()
    pair_counts = indice_pair_num.cpu().tolist()
    pooled_tv = torch_tensor_to_tv(pooled)
    features_tv = torch_tensor_to_tv(features)
    for offset, count in enumerate(pair_counts):
        if count > 0:
            # row `offset` of each table, trimmed to its valid prefix
            src_rows = torch_tensor_to_tv(indice_pairs[0][offset, :count])
            dst_rows = torch_tensor_to_tv(indice_pairs[1][offset, :count])
            SpconvOps.maxpool_forward(pooled_tv, features_tv, dst_rows,
                                      src_rows, stream)
    return pooled
def indice_maxpool_backward(features, out_features, out_bp, indice_pairs,
                            indice_pair_num):
    """Sparse max-pool backward: gradient w.r.t. ``features``.

    Allocates a zero tensor shaped like ``features`` and calls
    ``SpconvOps.maxpool_backward`` once per kernel offset whose pair count is
    positive, on torch's current CUDA stream.

    Args:
        features: forward input features.
        out_features: forward pooled output.
        out_bp: gradient w.r.t. ``out_features``.
        indice_pairs: index table; ``[0]`` holds input rows, ``[1]`` output rows.
        indice_pair_num: number of valid pairs per kernel offset.

    Returns:
        The input gradient tensor.
    """
    din = torch.zeros_like(features)
    stream = get_current_stream()
    indice_pair_num_cpu = indice_pair_num.cpu().tolist()
    out_features_tv = torch_tensor_to_tv(out_features)
    features_tv = torch_tensor_to_tv(features)
    out_bp_tv = torch_tensor_to_tv(out_bp)
    din_tv = torch_tensor_to_tv(din)
    for i, nhot in enumerate(indice_pair_num_cpu):
        if nhot <= 0:
            continue
        # row i of each table, trimmed to its valid prefix
        inp_indices = torch_tensor_to_tv(indice_pairs[0][i, :nhot])
        out_indices = torch_tensor_to_tv(indice_pairs[1][i, :nhot])
        SpconvOps.maxpool_backward(out_features_tv, features_tv, out_bp_tv,
                                   din_tv, out_indices, inp_indices, stream)
    return din
def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
    """Placeholder for non-maximum suppression; always raises.

    Raises:
        NotImplementedError: unconditionally.
    """
    raise NotImplementedError()
def pillar_scatter(features, coors, shape):
    """Placeholder for pillar scatter; always raises.

    Raises:
        NotImplementedError: unconditionally.
    """
    raise NotImplementedError()
# Copyright 2019-2020 Yan Yan
#
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -21,11 +21,12 @@ from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import spconv
import spconv.functional as Fsp
from spconv import ops
from spconv.core import IndiceData
from spconv.modules import SparseModule
from spconv import pytorch as spconv
from spconv.algo import ConvAlgo
import spconv.pytorch.functional as Fsp
from spconv.pytorch import ops
from spconv.pytorch.core import IndiceData
from spconv.pytorch.modules import SparseModule
class SparseMaxPool(SparseModule):
......@@ -100,13 +101,13 @@ class SparseMaxPool(SparseModule):
indices,
batch_size,
spatial_shape,
ConvAlgo.Native,
self.kernel_size,
self.stride,
self.padding,
self.dilation,
0,
self.subm,
grid=input.grid)
False)
if input.benchmark:
torch.cuda.synchronize()
interval = time.time() - t
......
# Copyright 2019-2020 Yan Yan
#
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -21,8 +21,8 @@ from torch import nn
from torch.nn import init
from torch.nn.parameter import Parameter
import spconv
from spconv.modules import SparseModule
from spconv import pytorch as spconv
from spconv.pytorch.modules import SparseModule
class RemoveDuplicate(SparseModule):
......
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch.autograd import Function
import spconv
import spconv.pytorch as spconv
#from torch.nn import Module
from spconv.modules import SparseModule
from spconv.pytorch.modules import SparseModule
class JoinTable(SparseModule): # Module):
......
# Copyright 2019-2020 Yan Yan
#
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
# Copyright 2019-2020 Yan Yan
#
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -13,372 +13,13 @@
# limitations under the License.
import numpy as np
import torch
from spconv import spconv_utils
from spconv.spconv_utils import (non_max_suppression_cpu,
points_to_voxel_3d_np,
points_to_voxel_3d_np_mean,
points_to_voxel_3d_with_filtering,
rbbox_intersection, rbbox_iou,
rotate_non_max_suppression_cpu)
try:
from spconv.spconv_utils import non_max_suppression
except ImportError:
pass
def points_to_voxel(points,
                    voxel_size,
                    coors_range,
                    coor_to_voxelidx,
                    max_points=35,
                    max_voxels=20000,
                    full_mean=False,
                    block_filtering=True,
                    block_factor=1,
                    block_size=8,
                    height_threshold=0.2,
                    height_high_threshold=3.0,
                    pad_output=False):
    """Convert 3d points (N, >=3) to voxels in a single pass.

    The heavy lifting is done by one of three ``spconv_utils`` routines,
    selected by ``full_mean`` and ``block_filtering``.

    Args:
        points: [N, ndim] float array. points[:, :3] contain xyz points and
            points[:, 3:] contain other information such as reflectivity.
        voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size
        coors_range: [6] list/tuple or array, float. indicate voxel range.
            format: xyzxyz, minmax
        coor_to_voxelidx: int array. used as a dense map.
        max_points: int. indicate maximum points contained in a voxel.
        max_voxels: int. indicate maximum voxels this function create.
            for voxelnet, 20000 is a good choice. you should shuffle points
            before call this function because max_voxels may drop some points.
        full_mean: bool. if true, all empty points in voxel will be filled with mean
            of exist points. Requires ``block_filtering`` to be False.
        block_filtering: filter voxels by height. used for lidar point cloud.
            use some visualization tool to see filtered result.
        block_factor, block_size, height_threshold, height_high_threshold:
            forwarded unchanged to the filtering routine; ``block_factor``
            also divides the block grid extents.
        pad_output: only read in the block-filtering branch. When True the
            kept voxels are written to the front of the ``max_voxels``-sized
            buffers and the remainder is zeroed; when False the buffers are
            replaced by the filtered arrays.

    Returns:
        A dict with keys:
        "voxels": [M, max_points, ndim] float array. only contain points.
        "coordinates": [M, 3] int32 array. zyx format.
        "num_points_per_voxel": [M] int32 array.
        "voxel_point_mask": array reshaped to (-1, max_points, 1).
        "voxel_num": number of voxels.
        Without block filtering (or with ``pad_output``) M is ``max_voxels``
        and only the first ``voxel_num`` entries are filled.

    Raises:
        AssertionError: if ``full_mean`` is set while ``block_filtering`` is
            not False.
    """
    if full_mean:
        assert block_filtering is False
    if not isinstance(voxel_size, np.ndarray):
        voxel_size = np.array(voxel_size, dtype=points.dtype)
    if not isinstance(coors_range, np.ndarray):
        coors_range = np.array(coors_range, dtype=points.dtype)
    voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size
    voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist())
    # reverse the xyz grid extents (coordinates are documented as zyx)
    voxelmap_shape = voxelmap_shape[::-1]
    # output buffers sized for the worst case of max_voxels voxels
    num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
    voxels = np.zeros(shape=(max_voxels, max_points, points.shape[-1]),
                      dtype=points.dtype)
    voxel_point_mask = np.zeros(shape=(max_voxels, max_points),
                                dtype=points.dtype)
    coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
    # res references the buffers above, so in-place writes by the native
    # routines are visible through it
    res = {
        "voxels": voxels,
        "coordinates": coors,
        "num_points_per_voxel": num_points_per_voxel,
        "voxel_point_mask": voxel_point_mask,
    }
    if full_mean:
        means = np.zeros(shape=(max_voxels, points.shape[-1]),
                         dtype=points.dtype)
        voxel_num = points_to_voxel_3d_np_mean(points, voxels,
                                               voxel_point_mask, means, coors,
                                               num_points_per_voxel,
                                               coor_to_voxelidx,
                                               voxel_size.tolist(),
                                               coors_range.tolist(),
                                               max_points, max_voxels)
    else:
        if block_filtering:
            # per-block min/max scratch arrays over the last two grid axes
            block_shape = [*voxelmap_shape[1:]]
            block_shape = [b // block_factor for b in block_shape]
            mins = np.full(block_shape, 99999999, dtype=points.dtype)
            maxs = np.full(block_shape, -99999999, dtype=points.dtype)
            voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
            voxel_num = points_to_voxel_3d_with_filtering(
                points, voxels, voxel_point_mask, voxel_mask, mins, maxs,
                coors, num_points_per_voxel, coor_to_voxelidx,
                voxel_size.tolist(), coors_range.tolist(), max_points,
                max_voxels, block_factor, block_size, height_threshold,
                height_high_threshold)
            voxel_mask = voxel_mask.astype(np.bool_)
            # boolean indexing copies, so the writes below do not alias
            coors_ = coors[voxel_mask]
            if pad_output:
                res["coordinates"][:voxel_num] = coors_
                res["voxels"][:voxel_num] = voxels[voxel_mask]
                res["voxel_point_mask"][:voxel_num] = voxel_point_mask[
                    voxel_mask]
                res["num_points_per_voxel"][:voxel_num] = num_points_per_voxel[
                    voxel_mask]
                res["coordinates"][voxel_num:] = 0
                res["voxels"][voxel_num:] = 0
                res["num_points_per_voxel"][voxel_num:] = 0
                res["voxel_point_mask"][voxel_num:] = 0
            else:
                res["coordinates"] = coors_
                res["voxels"] = voxels[voxel_mask]
                res["num_points_per_voxel"] = num_points_per_voxel[voxel_mask]
                res["voxel_point_mask"] = voxel_point_mask[voxel_mask]
            voxel_num = coors_.shape[0]
        else:
            voxel_num = points_to_voxel_3d_np(points, voxels, voxel_point_mask,
                                              coors, num_points_per_voxel,
                                              coor_to_voxelidx,
                                              voxel_size.tolist(),
                                              coors_range.tolist(), max_points,
                                              max_voxels)
    res["voxel_num"] = voxel_num
    res["voxel_point_mask"] = res["voxel_point_mask"].reshape(
        -1, max_points, 1)
    return res
class VoxelGenerator:
    """Voxelise point clouds on a fixed grid through ``points_to_voxel``.

    The grid extent is ``round((range_max - range_min) / voxel_size)`` per
    axis; the dense ``coor_to_voxelidx`` map is allocated once with the axes
    reversed and reused for every call.
    """

    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
                 max_voxels=20000,
                 full_mean=True):
        """Store the grid description and allocate the dense index map.

        Args:
            voxel_size: per-axis voxel size (xyz).
            point_cloud_range: xyzxyz min/max range.
            max_num_points: maximum number of points kept per voxel.
            max_voxels: default voxel budget for ``generate``.
            full_mean: forwarded to ``points_to_voxel``.
        """
        point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
        # [0, -40, -3, 70.4, 40, 1]
        voxel_size = np.array(voxel_size, dtype=np.float32)
        grid_size = (point_cloud_range[3:] -
                     point_cloud_range[:3]) / voxel_size
        grid_size = np.round(grid_size).astype(np.int64)
        voxelmap_shape = tuple(grid_size.astype(np.int32).tolist())
        voxelmap_shape = voxelmap_shape[::-1]

        self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
        self._voxel_size = voxel_size
        self._point_cloud_range = point_cloud_range
        self._max_num_points = max_num_points
        self._max_voxels = max_voxels
        self._grid_size = grid_size
        self._full_mean = full_mean

    def _voxelize(self, points, max_voxels):
        """Run ``points_to_voxel`` with this generator's configuration."""
        # points_to_voxel asserts that block_filtering is False whenever
        # full_mean is set, yet its block_filtering default is True. Leaving
        # the default in place made every call with full_mean=True (this
        # class's own default) fail that assertion, so filtering is switched
        # off explicitly in that case; with full_mean=False the previous
        # default is kept.
        return points_to_voxel(points,
                               self._voxel_size,
                               self._point_cloud_range,
                               self._coor_to_voxelidx,
                               self._max_num_points,
                               max_voxels or self._max_voxels,
                               self._full_mean,
                               block_filtering=not self._full_mean)

    def generate(self, points, max_voxels=None):
        """Voxelise ``points`` and trim the outputs to the voxels produced.

        Returns:
            ``(voxels, coordinates, num_points_per_voxel)``, each sliced to
            the first ``voxel_num`` entries.
        """
        res = self._voxelize(points, max_voxels)
        voxel_num = res["voxel_num"]
        return (res["voxels"][:voxel_num], res["coordinates"][:voxel_num],
                res["num_points_per_voxel"][:voxel_num])

    def generate_multi_gpu(self, points, max_voxels=None):
        """Voxelise ``points`` and return the buffers without trimming.

        Returns:
            ``(voxels, coordinates, num_points_per_voxel)`` exactly as
            produced by ``points_to_voxel``.
        """
        res = self._voxelize(points, max_voxels)
        return (res["voxels"], res["coordinates"],
                res["num_points_per_voxel"])

    @property
    def voxel_size(self):
        return self._voxel_size

    @property
    def max_num_points_per_voxel(self):
        return self._max_num_points

    @property
    def point_cloud_range(self):
        return self._point_cloud_range

    @property
    def grid_size(self):
        return self._grid_size
class VoxelGeneratorV2:
    """Voxel generator with optional height-based block filtering.

    All settings are captured at construction time and forwarded to
    ``points_to_voxel`` on every call.
    """

    def __init__(self,
                 voxel_size,
                 point_cloud_range,
                 max_num_points,
                 max_voxels=20000,
                 full_mean=False,
                 block_filtering=False,
                 block_factor=8,
                 block_size=3,
                 height_threshold=0.1,
                 height_high_threshold=2.0):
        """Validate the settings and allocate the dense index map.

        ``full_mean`` must be False. With ``block_filtering`` enabled,
        ``block_size`` must be positive and the first two grid extents must
        be divisible by ``block_factor``.
        """
        assert full_mean is False, "don't use this."
        pc_range = np.array(point_cloud_range, dtype=np.float32)
        # [0, -40, -3, 70.4, 40, 1]
        cell_size = np.array(voxel_size, dtype=np.float32)
        extent = pc_range[3:] - pc_range[:3]
        cells_per_axis = np.round(extent / cell_size).astype(np.int64)
        if block_filtering:
            assert block_size > 0
            assert cells_per_axis[0] % block_factor == 0
            assert cells_per_axis[1] % block_factor == 0
        # the dense map uses the grid extents in reversed axis order
        dense_shape = tuple(
            np.round(cells_per_axis).astype(np.int32).tolist())[::-1]

        self._coor_to_voxelidx = np.full(dense_shape, -1, dtype=np.int32)
        self._voxel_size = cell_size
        self._point_cloud_range = pc_range
        self._max_num_points = max_num_points
        self._max_voxels = max_voxels
        self._grid_size = cells_per_axis
        self._full_mean = full_mean
        self._block_filtering = block_filtering
        self._block_factor = block_factor
        self._height_threshold = height_threshold
        self._block_size = block_size
        self._height_high_threshold = height_high_threshold

    def _voxelize(self, points, max_voxels, pad_output=False):
        """Call ``points_to_voxel`` with the stored configuration."""
        return points_to_voxel(points,
                               self._voxel_size,
                               self._point_cloud_range,
                               self._coor_to_voxelidx,
                               self._max_num_points,
                               max_voxels or self._max_voxels,
                               self._full_mean,
                               self._block_filtering,
                               self._block_factor,
                               self._block_size,
                               self._height_threshold,
                               self._height_high_threshold,
                               pad_output=pad_output)

    def generate(self, points, max_voxels=None):
        """Voxelise ``points``; every array entry is cut to ``voxel_num`` rows.

        Returns:
            The result dict of ``points_to_voxel`` with each value except
            "voxel_num" sliced to its first ``voxel_num`` entries.
        """
        res = self._voxelize(points, max_voxels)
        voxel_num = res["voxel_num"]
        for key, value in res.items():
            if key != "voxel_num":
                res[key] = value[:voxel_num]
        return res

    def generate_multi_gpu(self, points, max_voxels=None):
        """Voxelise ``points`` with ``pad_output=True`` and return the dict."""
        return self._voxelize(points, max_voxels, pad_output=True)

    @property
    def voxel_size(self):
        return self._voxel_size

    @property
    def max_num_points_per_voxel(self):
        return self._max_num_points

    @property
    def point_cloud_range(self):
        return self._point_cloud_range

    @property
    def grid_size(self):
        return self._grid_size
class VoxelGeneratorV3:
    """Torch voxel generator backed by ``torch.ops.spconv.points_to_voxel``.

    All working buffers are allocated once in the constructor on ``device``
    and reused by every call.

    Args:
        voxel_size: tensor (or scalar) voxel size; it is divided into the
            range extent, so it must broadcast against a 1-D tensor.
        point_cloud_range: 1-D tensor; the first half is the min corner and
            the second half the max corner (the code slices ``[:3]`` and
            ``[3:]``).
        max_points: maximum number of points accepted per call.
        num_features: number of columns of the voxel/grid buffers.
        dtype: torch dtype of the feature buffers.
        device: torch device of all buffers.
    """

    def __init__(self, voxel_size, point_cloud_range, max_points, num_features,
                 dtype, device):
        self._max_points = max_points
        self._point_cloud_range = point_cloud_range
        self._voxel_size = voxel_size
        grid_size = torch.round(
            (point_cloud_range[3:] - point_cloud_range[:3]) /
            voxel_size).to(torch.int32)
        # Use a plain Python int for sizes and fill values instead of relying
        # on implicit conversion of a 0-dim tensor.
        grid_volume = int(grid_size.prod())
        self._grid_size = grid_size.cpu().numpy().tolist()
        self._ndim = len(self._grid_size)
        self._dtype = dtype
        self._device = device
        self._point_index = torch.full([max_points + 1],
                                       grid_volume,
                                       dtype=torch.int32,
                                       device=self._device)
        self._grids = torch.zeros([grid_volume, num_features],
                                  dtype=self._dtype,
                                  device=self._device)
        self._num_points_per_grid = torch.zeros([grid_volume],
                                                dtype=torch.int32,
                                                device=self._device)
        self._voxels = torch.zeros([max_points, num_features],
                                   dtype=self._dtype,
                                   device=self._device)
        self._coors = torch.zeros([max_points, self._ndim],
                                  dtype=torch.int32,
                                  device=self._device)

    def _prepare_points(self, points):
        """Validate the point count and move ``points`` to the buffers' dtype/device."""
        assert points.shape[0] <= self._max_points, (
            'please enlarge max_points to not smaller than ' +
            str(points.shape[0]))
        # Tensor.to() is not in-place: the converted tensor must be kept.
        # The original code discarded it, so no conversion ever happened.
        return points.to(device=self._device, dtype=self._dtype)

    def generate(self, points):
        """Voxelize ``points`` and return ``(voxels, coors)``.

        Raises:
            AssertionError: if ``points`` has more rows than ``max_points``.
        """
        return self.points_to_voxel(self._prepare_points(points))

    def generate_multi_gpu(self, points):
        """Same behaviour as :meth:`generate`."""
        return self.generate(points)

    @property
    def voxel_size(self):
        """Voxel size exactly as passed to the constructor."""
        return self._voxel_size

    @property
    def point_cloud_range(self):
        """Point cloud range exactly as passed to the constructor."""
        return self._point_cloud_range

    @property
    def grid_size(self):
        """Grid size as a list of Python ints."""
        return self._grid_size

    def points_to_voxel(self, points):
        """
        points: [N, ndim] float tensor. points[:, :3] contain xyz points and
            points[:, 3:] contain other information such as reflectivity.

        Returns ``(voxels, coors)`` where both are views into the reusable
        buffers limited to the voxel count reported by the spconv op, and
        ``coors`` has its three columns reordered from xyz to zyx.
        """
        # Keep the range and voxel size on the same device as the points so
        # the index computation works after the points have been moved.
        range_min = self._point_cloud_range[:3].to(points.device)
        voxel_size = self._voxel_size
        if isinstance(voxel_size, torch.Tensor):
            voxel_size = voxel_size.to(points.device)
        indexes = torch.floor(
            (points[:, :3] - range_min) / voxel_size).to(torch.int32)
        num_voxel = torch.ops.spconv.points_to_voxel(
            points, indexes, self._point_index, self._grids,
            self._num_points_per_grid, self._voxels, self._coors,
            self._grid_size, self._ndim)
        voxels = self._voxels[:num_voxel, :]
        coors = self._coors[:num_voxel, :]
        # xyz --> zyx
        x, y, z = (coors[:, 0].reshape([-1, 1]), coors[:, 1].reshape([-1, 1]),
                   coors[:, 2].reshape([-1, 1]))
        coors = torch.cat([z, y, x], dim=1)
        return voxels, coors
from cumm import tensorview as tv
from spconv.core_cc.csrc.sparse.all.ops1d import Point2Voxel as Point2VoxelGPU1d
from spconv.core_cc.csrc.sparse.all.ops2d import Point2Voxel as Point2VoxelGPU2d
from spconv.core_cc.csrc.sparse.all.ops3d import Point2Voxel as Point2VoxelGPU3d
from spconv.core_cc.csrc.sparse.all.ops4d import Point2Voxel as Point2VoxelGPU4d
from spconv.core_cc.csrc.sparse.all.ops_cpu1d import Point2VoxelCPU as Point2VoxelCPU1d
from spconv.core_cc.csrc.sparse.all.ops_cpu2d import Point2VoxelCPU as Point2VoxelCPU2d
from spconv.core_cc.csrc.sparse.all.ops_cpu3d import Point2VoxelCPU as Point2VoxelCPU3d
from spconv.core_cc.csrc.sparse.all.ops_cpu4d import Point2VoxelCPU as Point2VoxelCPU4d
\ No newline at end of file
# cuhash is built as a shared library on Windows and as a static library
# everywhere else; the source list is the same in both cases.
if(WIN32)
  add_library(cuhash SHARED
    hash_functions.cu hash_table.cpp hash_table.cu hash_functions.cpp)
else()
  add_library(cuhash STATIC
    hash_functions.cu hash_table.cpp hash_table.cu hash_functions.cpp)
endif()

target_include_directories(cuhash PRIVATE ${ALL_INCLUDE})
set_target_properties(cuhash PROPERTIES
  CUDA_STANDARD 14
  CXX_STANDARD 14
  CUDA_SEPARABLE_COMPILATION ON
  CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if(NOT WIN32)
  # Position-independent code is only requested for the non-Windows build.
  set_target_properties(cuhash PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(cuhash PRIVATE ${ALL_LIBS})
install(TARGETS cuhash DESTINATION lib)

# Optional test executable, enabled through SPCONV_BuildTests.
if(SPCONV_BuildTests)
  add_executable(cuhash_test main.cc)
  target_include_directories(cuhash_test PRIVATE ${ALL_INCLUDE})
  set_target_properties(cuhash_test PROPERTIES
    CUDA_STANDARD 14
    CXX_STANDARD 14
    CUDA_SEPARABLE_COMPILATION ON)
  target_link_libraries(cuhash_test PRIVATE ${ALL_LIBS} cuhash)
  install(TARGETS cuhash_test DESTINATION bin)
endif()
\ No newline at end of file
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* debugging.cpp
*
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <algorithm>
#include <cstring>
#include <cuhash/cuda_util.h>
namespace cuhash {
//! Prints a histogram of how many probes each query needed.
/*! Copies the per-query probe counts back from the device and prints one
 *  "(probes, queries)" line per possible probe count in [0, n_functions + 1].
 *
 *  @param n_queries           Number of entries in \p d_retrieval_probes.
 *  @param d_retrieval_probes  Device array holding one probe count per query.
 *  @param n_functions         Number of hash functions used by the table.
 */
void OutputRetrievalStatistics(const unsigned n_queries,
                               const unsigned *d_retrieval_probes,
                               const unsigned n_functions) {
  unsigned *retrieval_probes = new unsigned[n_queries];
  CUDA_SAFE_CALL(cudaMemcpy(retrieval_probes, d_retrieval_probes,
                            sizeof(unsigned) * n_queries,
                            cudaMemcpyDeviceToHost));

  // Create a histogram showing how many items needed how many probes to be
  // found.
  const unsigned possible_probes = n_functions + 2;
  unsigned *histogram = new unsigned[possible_probes]();  // zero-initialized
  unsigned out_of_range = 0;
  for (unsigned i = 0; i < n_queries; ++i) {
    const unsigned probes = retrieval_probes[i];
    // A probe count outside the expected range used to write past the end
    // of the histogram; count it separately instead.
    if (probes < possible_probes) {
      histogram[probes]++;
    } else {
      out_of_range++;
    }
  }

  // Dump it.
  char buffer[10000];
  snprintf(buffer, sizeof(buffer), "Probes for retrieval: ");
  PrintMessage(buffer);
  for (unsigned i = 0; i < possible_probes; ++i) {
    snprintf(buffer, sizeof(buffer), "\t(%u, %u)", i, histogram[i]);
    PrintMessage(buffer);
  }
  if (out_of_range != 0) {
    snprintf(buffer, sizeof(buffer), "\t(>%u, %u)", possible_probes - 1,
             out_of_range);
    PrintMessage(buffer);
  }

  delete[] retrieval_probes;
  delete[] histogram;
}
//! Appends "\t(value, count)<suffix>" to \p buffer starting at offset \p used.
/*! Never writes past \p capacity; the text is truncated (and stays
 *  NUL-terminated) once the buffer is full.
 *  @returns The new number of characters stored in \p buffer.
 */
static size_t AppendIterationBucket(char *buffer, const size_t capacity,
                                    const size_t used, const unsigned value,
                                    const unsigned count, const char *suffix) {
  if (used + 1 >= capacity) {
    return used;  // No room left for anything but the terminator.
  }
  const size_t room = capacity - used;
  const int written =
      snprintf(buffer + used, room, "\t(%u, %u)%s", value, count, suffix);
  if (written < 0) {
    return used;
  }
  return ((size_t)written >= room) ? capacity - 1 : used + (size_t)written;
}

//! Prints statistics about how many iterations each item needed during a build.
/*! Copies the per-item iteration counts back from the device, sorts them and
 *  prints a histogram followed by the total, average, median and maximum.
 *
 *  @param n                   Number of entries in \p d_iterations_taken.
 *  @param d_iterations_taken  Device array holding one count per item.
 */
void OutputBuildStatistics(const unsigned n,
                           const unsigned *d_iterations_taken) {
  char buffer[10000];

  // With no items there is nothing to sort, index or average; the old code
  // read iterations_taken[0] and divided by zero here.
  if (n == 0) {
    snprintf(buffer, sizeof(buffer), "Iterations taken: no items.");
    PrintMessage(buffer);
    return;
  }

  // Output how many iterations each thread took until it found an empty slot.
  unsigned *iterations_taken = new unsigned[n];
  CUDA_SAFE_CALL(cudaMemcpy(iterations_taken, d_iterations_taken,
                            sizeof(unsigned) * n, cudaMemcpyDeviceToHost));
  std::sort(iterations_taken, iterations_taken + n);

  unsigned total_iterations = 0;
  unsigned max_iterations_taken = 0;
  for (unsigned i = 0; i < n; ++i) {
    total_iterations += iterations_taken[i];
    max_iterations_taken = std::max(max_iterations_taken, iterations_taken[i]);
  }

  // Build the histogram text by appending at a tracked offset. Passing the
  // destination buffer as a "%s" argument to sprintf is undefined behaviour.
  unsigned current_value = iterations_taken[0];
  unsigned count = 1;
  const int header =
      snprintf(buffer, sizeof(buffer), "Iterations taken:\n");
  size_t used = (header > 0) ? (size_t)header : 0;
  for (unsigned i = 1; i < n; ++i) {
    if (iterations_taken[i] != current_value) {
      used = AppendIterationBucket(buffer, sizeof(buffer), used, current_value,
                                   count, "\n");
      current_value = iterations_taken[i];
      count = 1;
    } else {
      count++;
    }
  }
  AppendIterationBucket(buffer, sizeof(buffer), used, current_value, count,
                        "");
  PrintMessage(buffer);

  snprintf(buffer, sizeof(buffer), "Total iterations: %u", total_iterations);
  PrintMessage(buffer);

  snprintf(buffer, sizeof(buffer), "Avg/Med/Max iterations: (%f %u %u)",
           (float)total_iterations / n, iterations_taken[n / 2],
           iterations_taken[n - 1]);
  PrintMessage(buffer);
  delete[] iterations_taken;

  // Print the length of the longest eviction chain.
  snprintf(buffer, sizeof(buffer), "Max iterations: %u", max_iterations_taken);
  PrintMessage(buffer);
}
}; // namespace cuhash
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* debugging.cu
*
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <cuhash/hash_table.cuh>
#include <algorithm>
#include <cuhash/cuda_util.h>
namespace cuhash {
//! Debugging kernel: gathers statistics on how the hash functions spread keys.
/*! One thread handles one key. Each of the three output pointers is optional
 *  (may be NULL):
 *   - num_slots_available[i]: number of distinct slots key i hashes to.
 *   - num_hashing_in[s]:      number of (key, function) pairs landing in slot s.
 *   - failed:                 set to 1 if any key got fewer distinct slots
 *                             than there are hash functions.
 */
__global__ void take_hash_function_statistics_kernel(
    const unsigned *keys, const unsigned n_entries, const unsigned table_size,
    const uint2 *constants, const unsigned num_functions,
    unsigned *num_slots_available, unsigned *num_hashing_in, unsigned *failed) {
  // Flatten the (possibly 2D) grid into a single thread index.
  const unsigned block_linear = blockIdx.x + blockIdx.y * gridDim.x;
  const unsigned tid = threadIdx.x + block_linear * blockDim.x;
  if (tid >= n_entries) {
    return;
  }
  const unsigned key = keys[tid];

  // Evaluate every hash function for this key, tallying slot usage on the way.
  unsigned slots[kMaxHashFunctions];
  for (unsigned f = 0; f < num_functions; ++f) {
    slots[f] = hash_function_inner(constants[f], key) % table_size;
    if (num_hashing_in != NULL) {
      atomicAdd(num_hashing_in + slots[f], 1);
    }
  }

  // Count the distinct slots: a slot is new if no earlier function produced it.
  unsigned distinct = 1;
  for (unsigned f = 1; f < num_functions; ++f) {
    unsigned earlier = 0;
    while (earlier < f && slots[earlier] != slots[f]) {
      ++earlier;
    }
    if (earlier == f) {
      ++distinct;
    }
  }

  if (num_slots_available != NULL) {
    num_slots_available[tid] = distinct;
  }
  if (distinct != num_functions && failed != NULL) {
    *failed = 1;
  }
}
//! Prints the hash function constants and, optionally, distribution statistics.
/*! Always prints the constants. When COUNT_HOW_MANY_HASH_INTO_EACH_SLOT is
 *  defined it also prints a histogram of how many items hash into a slot, and
 *  when COUNT_HOW_MANY_HAVE_CYCLES is defined it prints how many distinct
 *  slots the keys were given.
 *
 *  @param num_keys           Number of keys in \p d_keys.
 *  @param d_keys             Device array of keys.
 *  @param table_size         Number of slots in the table.
 *  @param constants          Host array of \p kNumHashFunctions constants.
 *  @param kNumHashFunctions  Number of hash functions.
 */
void TakeHashFunctionStatistics(const unsigned num_keys, const unsigned *d_keys,
                                const unsigned table_size,
                                const uint2 *constants,
                                const unsigned kNumHashFunctions) {
  char buffer[16000];
  PrintMessage("Hash function constants: ");

  for (unsigned i = 0; i < kNumHashFunctions; ++i) {
    snprintf(buffer, sizeof(buffer), "\t%10u, %10u", constants[i].x,
             constants[i].y);
    PrintMessage(buffer);
  }

  unsigned *d_num_hashing_in = NULL;
#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_num_hashing_in, sizeof(unsigned) * table_size));
  CUDA_SAFE_CALL(
      cudaMemset(d_num_hashing_in, 0, sizeof(unsigned) * table_size));
#endif

  unsigned *d_num_slots_available = NULL;
#ifdef COUNT_HOW_MANY_HAVE_CYCLES
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_num_slots_available, sizeof(unsigned) * num_keys));
#endif
  uint2 *d_constants = NULL;
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_constants, sizeof(uint2) * kNumHashFunctions));
  CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants,
                            sizeof(uint2) * kNumHashFunctions,
                            cudaMemcpyHostToDevice));

  take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys),
                                         kBlockSize>>>(
      d_keys, num_keys, table_size, d_constants, kNumHashFunctions,
      d_num_slots_available, d_num_hashing_in, NULL);
  CUDA_SAFE_CALL(cudaFree(d_constants));

#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
  unsigned *num_hashing_in = new unsigned[table_size];
  CUDA_SAFE_CALL(cudaMemcpy(num_hashing_in, d_num_hashing_in,
                            sizeof(unsigned) * table_size,
                            cudaMemcpyDeviceToHost));

  /*
  // Print how many items hash into each slot.
  // Used to make sure items are spread evenly throughout the table.
  buffer[0] = '\0';
  PrintMessage("Num hashing into each: ", true);
  for (unsigned i = 0; i < table_size; ++i) {
  sprintf(buffer, "%s\t%2u", buffer, num_hashing_in[i]);
  if (i % 25 == 24) {
  PrintMessage(buffer, true);
  buffer[0] = '\0';
  }
  }
  PrintMessage(buffer,true);
  */

  // Print a histogram of how many items are hashed into each slot. Shows
  // if average number of items hashing into each slot is low.
  std::sort(num_hashing_in, num_hashing_in + table_size);
  unsigned count = 1;  // unsigned so that it matches the %u conversion below
  unsigned previous = num_hashing_in[0];
  snprintf(buffer, sizeof(buffer), "Num items hashing into a slot:\t");
  PrintMessage(buffer);
  for (unsigned i = 1; i < table_size; ++i) {
    if (num_hashing_in[i] != previous) {
      snprintf(buffer, sizeof(buffer), "\t(%u, %u)", previous, count);
      PrintMessage(buffer);
      previous = num_hashing_in[i];
      count = 1;
    } else {
      count++;
    }
  }
  snprintf(buffer, sizeof(buffer), "\t(%u, %u)", previous, count);
  PrintMessage(buffer);

  delete[] num_hashing_in;
  CUDA_SAFE_CALL(cudaFree(d_num_hashing_in));
#endif

#ifdef COUNT_HOW_MANY_HAVE_CYCLES
  unsigned *num_slots_available = new unsigned[num_keys];
  CUDA_SAFE_CALL(cudaMemcpy(num_slots_available, d_num_slots_available,
                            sizeof(unsigned) * num_keys,
                            cudaMemcpyDeviceToHost));

  // Not static: a static local would keep the size computed on the first
  // call even when a later call passes a different number of functions.
  const unsigned kHistogramSize = kNumHashFunctions + 1;
  unsigned *histogram = new unsigned[kHistogramSize]();  // zero-initialized
  for (unsigned i = 0; i < num_keys; ++i) {
    histogram[num_slots_available[i]]++;
  }

  // Append at a tracked offset; using the destination buffer as a "%s"
  // source in sprintf is undefined behaviour.
  int used = snprintf(buffer, sizeof(buffer), "Slots assigned to each key: ");
  for (unsigned i = 1; i < kHistogramSize; ++i) {
    if (used < 0 || (size_t)used >= sizeof(buffer)) {
      break;  // Buffer is full; the text so far stays NUL-terminated.
    }
    const int written = snprintf(buffer + used, sizeof(buffer) - used,
                                 "(%u, %u) ", i, histogram[i]);
    if (written < 0) {
      break;
    }
    used += written;
  }
  PrintMessage(buffer);

  delete[] histogram;
  delete[] num_slots_available;
  CUDA_SAFE_CALL(cudaFree(d_num_slots_available));
#endif
}
//! Checks whether any key fails to receive N distinct slots.
/*! Uploads the N hash function constants, runs the statistics kernel with
 *  only its failure flag enabled and reads the flag back.
 *
 *  @returns true if at least one key hashes to fewer than N distinct slots.
 */
bool CheckAssignedSameSlot(const unsigned N, const unsigned num_keys,
                           const unsigned *d_keys, const unsigned table_size,
                           uint2 *constants) {
  const size_t constants_bytes = sizeof(uint2) * N;
  unsigned *d_flag = NULL;
  uint2 *d_functions = NULL;

  CUDA_SAFE_CALL(cudaMalloc((void **)&d_flag, sizeof(unsigned)));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_functions, constants_bytes));
  CUDA_SAFE_CALL(cudaMemset(d_flag, 0, sizeof(unsigned)));
  CUDA_SAFE_CALL(cudaMemcpy(d_functions, constants, constants_bytes,
                            cudaMemcpyHostToDevice));

  // Only the failure flag is requested; both per-key/per-slot outputs are off.
  take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys),
                                         kBlockSize>>>(
      d_keys, num_keys, table_size, d_functions, N, NULL, NULL, d_flag);

  unsigned flag;
  CUDA_SAFE_CALL(
      cudaMemcpy(&flag, d_flag, sizeof(unsigned), cudaMemcpyDeviceToHost));

  CUDA_SAFE_CALL(cudaFree(d_flag));
  CUDA_SAFE_CALL(cudaFree(d_functions));

  return flag != 0;
}
//! Prints every occupied stash entry as "Stash[i]: key = value".
/*! Copies kStashSize entries from \p d_stash to the host and skips those
 *  whose key equals kKeyEmpty.
 */
void PrintStashContents(const Entry *d_stash) {
  const unsigned stash_slots = cuhash::kStashSize;
  Entry *host_stash = new Entry[stash_slots];
  CUDA_SAFE_CALL(cudaMemcpy(host_stash, d_stash, sizeof(Entry) * stash_slots,
                            cudaMemcpyDeviceToHost));

  for (unsigned slot = 0; slot < stash_slots; ++slot) {
    const Entry entry = host_stash[slot];
    if (get_key(entry) == kKeyEmpty) {
      continue;  // Empty slot: nothing to report.
    }
    char line[256];
    sprintf(line, "Stash[%u]: %u = %u", slot, get_key(entry),
            get_value(entry));
    PrintMessage(line, true);
  }

  delete[] host_stash;
}
}; // namespace cuhash
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// nvcc (cuda) 9.0 with gcc 5.5 don't support random, so compile it in host
#include <random>
namespace cuhash {
// Process-wide random state shared by every caller of
// generate_random_uint32(). These objects are namespace-scope globals with
// external linkage and are constructed during static initialization.
// NOTE(review): nothing here guards the engine against concurrent use —
// confirm that callers are single-threaded.
std::random_device random_dev;
// Mersenne Twister engine seeded once from random_dev.
std::mt19937 random_engine(random_dev());
// Default-constructed distribution over unsigned values.
std::uniform_int_distribution<unsigned> uint_distribution;
// Returns the next value drawn from uint_distribution using random_engine.
unsigned generate_random_uint32() { return uint_distribution(random_engine); }
} // namespace cuhash
\ No newline at end of file
#include <cassert>
#include <cuhash/debugging.h>
#include <cuhash/hash_functions.h>
#include <cuhash/hash_table.h>
namespace cuhash {
void GenerateFunctions(const unsigned N, const unsigned num_keys,
const unsigned *d_keys, const unsigned table_size,
uint2 *constants) {
bool regenerate = true;
while (regenerate) {
regenerate = false;
// Generate a set of hash function constants for this build attempt.
for (unsigned i = 0; i < N; ++i) {
// uint_distribution(random_engine) % kPrimeDivisor;
// genrand_int32() % kPrimeDivisor;
unsigned new_a = generate_random_uint32() % kPrimeDivisor;
constants[i].x = (1 > new_a ? 1 : new_a);
constants[i].y = generate_random_uint32() % kPrimeDivisor;
}
#ifdef FORCEFULLY_GENERATE_NO_CYCLES
// Ensure that every key gets N different slots.
regenerate =
CheckAssignedSameSlot(N, num_keys, d_keys, table_size, constants);
#endif
}
#ifdef TAKE_HASH_FUNCTION_STATISTICS
// Examine how well distributed the items are.
TakeHashFunctionStatistics(num_keys, d_keys, table_size, constants, N);
#endif
}
}; // namespace cuhash
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.cpp
*
* @brief Implements a basic hash table that stores one value per key.
*/
#include <cuhash/debugging.h>
#include <cuhash/hash_table.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cuda_runtime_api.h>
#include <cuhash/cuda_util.h>
#include <limits>
namespace cuhash {
// Scratch buffer for the status messages formatted by HashTable::Initialize
// and HashTable::Build in this file. It is a namespace-scope global, so it is
// shared by every HashTable instance.
char buffer[256];
//! @name Internal
/// @{
//! Computes the launch grid needed to give each of \p n items its own thread.
/*! The block count is rounded up so that no item is left out. If it exceeds
 *  kGridSize, x is capped at kGridSize and the remaining blocks are spread
 *  over y.
 */
dim3 ComputeGridDim(unsigned n) {
  const unsigned blocks_needed = (n + kBlockSize - 1) / kBlockSize;
  dim3 grid(blocks_needed);
  if (blocks_needed > kGridSize) {
    grid.x = kGridSize;
    grid.y = (blocks_needed + kGridSize - 1) / kGridSize;
  }
  return grid;
}
//! Returns the per-item iteration limit for building a table.
/*! With CONSTANT_ITERATIONS defined the limit is 7 * lg(n). Otherwise it comes
 *  from an empirical formula based on lg(n) and the natural log of the load
 *  factor n / table_size. \p num_functions is not used by either formula.
 *  NOTE(review): the empirical formula is not guarded against n == 0 or a
 *  load factor close to or above 1 - confirm callers never pass those.
 */
unsigned ComputeMaxIterations(const unsigned n, const unsigned table_size,
                              const unsigned num_functions) {
  const float lg_n = (float)(log((double)n) / log(2.0));

// #define CONSTANT_ITERATIONS
#ifdef CONSTANT_ITERATIONS
  // Set the maximum number of iterations to 7lg(N).
  const unsigned kIterationsPerLgN = 7;
  const unsigned limit = kIterationsPerLgN * lg_n;
#else
  // Use an empirical formula for determining what the maximum number of
  // iterations should be.  Works OK in most situations.
  const float load = float(n) / table_size;
  const float ln_load = (float)(log(load) / log(2.71828183));
  const double per_lg_n = -1.0 / (0.028255 + 1.1594772 * ln_load);
  const unsigned limit = (unsigned)(4.0 * ceil(per_lg_n * lg_n));
#endif
  return limit;
}
/// @}
//! Creates an empty table: no slots, no device memory, empty stash.
HashTable::HashTable()
    : table_size_(0),
      d_contents_(nullptr),
      stash_count_(0),
      d_failures_(nullptr) {
  // Reports any CUDA error that is pending when the table is constructed.
  CUDA_CHECK_ERROR("Failed in constructor.\n");
}
//! Releases any previous storage and allocates a table for a new build.
/*! @param max_table_entries  Largest number of items the table must hold.
 *  @param space_usage        Table slots per item; must not be below the
 *                            minimum for the chosen number of functions.
 *  @param num_functions      Number of hash functions, from 2 to 5.
 *  @returns false if a parameter is rejected or an allocation came back null.
 */
bool HashTable::Initialize(const unsigned max_table_entries,
                           const float space_usage,
                           const unsigned num_functions) {
  Release();

  // Only 2 to 5 hash functions are implemented.
  const bool supported = (num_functions >= 2 && num_functions <= 5);
  if (!supported) {
    char message[256] = "Number of hash functions must be from 2 to 5; "
                        "others are unimplemented.";
    PrintMessage(message, true);
    return false;
  }

  // Reject space usages below the minimum for this number of functions.
  const float minimum_space_usage = kMinimumSpaceUsages[num_functions];
  if (space_usage < minimum_space_usage) {
    sprintf(buffer, "Minimum possible space usage for %u functions is %f.",
            num_functions, minimum_space_usage);
    PrintMessage(buffer);
    return false;
  }

  num_hash_functions_ = num_functions;
  table_size_ = unsigned(ceil(max_table_entries * space_usage));

  // Allocate the table plus the stash, and the failure counter.
  const unsigned total_slots = table_size_ + kStashSize;
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_contents_, sizeof(Entry) * total_slots));
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_failures_, sizeof(unsigned)));

  const bool allocated = (d_contents_ && d_failures_);
  if (!allocated) {
    fprintf(stderr, "Failed to allocate %u slots.\n", total_slots);
    return false;
  }

  CUDA_CHECK_ERROR("Failed to initialize.\n");
  return true;
}
//! Frees the device storage and resets the table to its empty state.
void HashTable::Release() {
  table_size_ = 0;

  // Free the slot array, then the failure counter, clearing each pointer.
  CUDA_SAFE_CALL(cudaFree(d_contents_));
  CUDA_SAFE_CALL(cudaFree(d_failures_));
  d_contents_ = NULL;
  d_failures_ = NULL;

  CUDA_CHECK_ERROR("Failed during release.\n");
}
//! Builds the table from \p n key/value pairs stored on the device.
/*! Each attempt draws new hash function constants, clears the table and runs
 *  the cuckoo hashing kernel; attempts repeat while items fail to insert and
 *  the restart limit kMaxRestartAttempts has not been reached.
 *
 *  @param n         Number of key/value pairs.
 *  @param d_keys    Device array of keys.
 *  @param d_values  Device array of values.
 *  @returns true if every item was inserted.
 */
bool HashTable::Build(const unsigned n, const unsigned *d_keys,
                      const unsigned *d_values) {
  unsigned max_iterations =
      ComputeMaxIterations(n, table_size_, num_hash_functions_);
  unsigned num_failures = 1;
  unsigned num_attempts = 0;

  // Storage for statistics collection.
  unsigned *d_iterations_taken = NULL;
#ifdef TRACK_ITERATIONS
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_iterations_taken, sizeof(unsigned) * n));
#endif

  // Track how many items ended up in the stash.
  unsigned *d_stash_count = NULL;
  CUDA_SAFE_CALL(cudaMalloc((void **)&d_stash_count, sizeof(unsigned)));
  CUDA_CHECK_ERROR("Failed before main build loop.\n");

  // Main build loop.
  while (num_failures && ++num_attempts < kMaxRestartAttempts) {
    CUDA_SAFE_CALL(cudaMemset(d_stash_count, 0, sizeof(unsigned)));

    // Generate new hash functions.
    if (num_hash_functions_ == 2)
      constants_2_.Generate(n, d_keys, table_size_);
    else if (num_hash_functions_ == 3)
      constants_3_.Generate(n, d_keys, table_size_);
    else if (num_hash_functions_ == 4)
      constants_4_.Generate(n, d_keys, table_size_);
    else
      constants_5_.Generate(n, d_keys, table_size_);

    // Clamp AFTER the modulo, as GenerateFunctions does. Clamping first let
    // the multiplier become 0 whenever the random value was a multiple of
    // kPrimeDivisor.
    const unsigned stash_multiplier = generate_random_uint32() % kPrimeDivisor;
    stash_constants_.x = (stash_multiplier < 1 ? 1 : stash_multiplier);
    stash_constants_.y = generate_random_uint32() % kPrimeDivisor;
    stash_count_ = 0;

    // Initialize memory.
    unsigned slots_in_table = table_size_ + kStashSize;
    CUDAWrapper::ClearTable(slots_in_table, kEntryEmpty, d_contents_);

    num_failures = 0;

    CUDAWrapper::CallCuckooHash(
        n, num_hash_functions_, d_keys, d_values, table_size_, constants_2_,
        constants_3_, constants_4_, constants_5_, max_iterations, d_contents_,
        stash_constants_, d_stash_count, d_failures_, d_iterations_taken);

    // Check if successful.
    CUDA_SAFE_CALL(cudaMemcpy(&num_failures, d_failures_, sizeof(unsigned),
                              cudaMemcpyDeviceToHost));

#ifdef COUNT_UNINSERTED
    if (num_failures) {
      printf("Failed to insert %u items.\n", num_failures);
    }
#endif
  }

  // Copy out the stash size.
  CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, sizeof(unsigned),
                            cudaMemcpyDeviceToHost));
  if (stash_count_ && num_failures == 0) {
    // sprintf(buffer, "Stash size: %u", stash_count_);
    // PrintMessage(buffer, true);

#ifdef _DEBUG
    PrintStashContents(d_contents_ + table_size_);
#endif
  }
  CUDA_SAFE_CALL(cudaFree(d_stash_count));

#ifdef TRACK_ITERATIONS
  if (num_failures == 0) {
    OutputBuildStatistics(n, d_iterations_taken);
  }
  CUDA_SAFE_CALL(cudaFree(d_iterations_taken));
#endif

  // Dump some info if a restart was required.
  if (num_attempts >= kMaxRestartAttempts) {
    sprintf(buffer, "Completely failed to build");
    PrintMessage(buffer, true);
  } else if (num_attempts > 1) {
    sprintf(buffer, "Needed %u attempts to build, you can ignore this message.",
            num_attempts);
    PrintMessage(buffer, true);
  }

  CUDA_CHECK_ERROR("Error occurred during hash table build.\n");

  return num_failures == 0;
}
//! Looks up n_queries keys in the table.
/*! Forwards the query keys, the table's current state (size, contents, hash
 *  function constants, stash constants and stash count) and the output array
 *  to CUDAWrapper::CallHashRetrieve. Both d_keys and d_values are passed
 *  through unchanged.
 *  NOTE(review): d_keys and d_values are presumably device pointers with
 *  n_queries elements each - confirm against the header.
 */
void HashTable::Retrieve(const unsigned n_queries, const unsigned *d_keys,
unsigned *d_values) {
CUDAWrapper::CallHashRetrieve(n_queries, num_hash_functions_, d_keys,
table_size_, d_contents_, constants_2_,
constants_3_, constants_4_, constants_5_,
stash_constants_, stash_count_, d_values);
}
}; // namespace cuhash
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.cu
*
* @brief Hides all of the CUDA calls from the actual CPP file.
*/
#include <cuhash/cuda_util.h>
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <cuhash/hash_table.cuh>
#include <cuda.h>
namespace cuhash {
namespace CUDAWrapper {
//! Launches the clear_table kernel over slots_in_table entries of d_contents,
//! passing fill_value as the value to store, then checks for CUDA errors.
void ClearTable(const unsigned slots_in_table, const Entry fill_value,
Entry *d_contents) {
// Grid dimensions come from ComputeGridDim; each block has kBlockSize threads.
clear_table<Entry><<<ComputeGridDim(slots_in_table), kBlockSize>>>(
slots_in_table, fill_value, d_contents);
// NOTE(review): this is the only check in this file that uses
// TV_CHECK_CUDA_ERR_V2 instead of CUDA_CHECK_ERROR - confirm it is intended.
TV_CHECK_CUDA_ERR_V2("Error occurred during hash table clear.\n");
}
//! Resets the failure counter and launches the cuckoo hashing kernel.
/*! The kernel is launched with the constants object that matches
 *  \p num_hash_functions (2, 3 or 4; any other value uses the 5-function
 *  constants). All pointer arguments are forwarded to the kernel unchanged.
 */
void CallCuckooHash(const unsigned n, const unsigned num_hash_functions,
                    const unsigned *d_keys, const unsigned *d_values,
                    const unsigned table_size, const Functions<2> constants_2,
                    const Functions<3> constants_3,
                    const Functions<4> constants_4,
                    const Functions<5> constants_5,
                    const unsigned max_iterations, Entry *d_contents,
                    uint2 stash_constants, unsigned *d_stash_count,
                    unsigned *d_failures, unsigned *d_iterations_taken) {
  // Build the table. The reset of the failure counter is now checked like
  // every other CUDA runtime call in this file; a failed memset would leave
  // a stale failure count for the caller to read back.
  CUDA_SAFE_CALL(cudaMemset(d_failures, 0, sizeof(unsigned)));

  const dim3 grid = ComputeGridDim(n);
  switch (num_hash_functions) {
  case 2:
    CuckooHash<<<grid, kBlockSize>>>(
        n, d_keys, d_values, table_size, constants_2, max_iterations,
        d_contents, stash_constants, d_stash_count, d_failures,
        d_iterations_taken);
    break;
  case 3:
    CuckooHash<<<grid, kBlockSize>>>(
        n, d_keys, d_values, table_size, constants_3, max_iterations,
        d_contents, stash_constants, d_stash_count, d_failures,
        d_iterations_taken);
    break;
  case 4:
    CuckooHash<<<grid, kBlockSize>>>(
        n, d_keys, d_values, table_size, constants_4, max_iterations,
        d_contents, stash_constants, d_stash_count, d_failures,
        d_iterations_taken);
    break;
  default:
    CuckooHash<<<grid, kBlockSize>>>(
        n, d_keys, d_values, table_size, constants_5, max_iterations,
        d_contents, stash_constants, d_stash_count, d_failures,
        d_iterations_taken);
    break;
  }

  CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
}
//! Launches the retrieval kernel for \p n_queries keys.
/*! The kernel is launched with the constants object that matches
 *  \p num_hash_functions (2, 3 or 4; any other value uses the 5-function
 *  constants). With TRACK_ITERATIONS defined, per-query probe counts are
 *  collected on the device and printed afterwards.
 */
void CallHashRetrieve(const unsigned n_queries,
                      const unsigned num_hash_functions, const unsigned *d_keys,
                      const unsigned table_size, const Entry *d_contents,
                      const Functions<2> constants_2,
                      const Functions<3> constants_3,
                      const Functions<4> constants_4,
                      const Functions<5> constants_5,
                      const uint2 stash_constants, const unsigned stash_count,
                      unsigned *d_values) {
  // Stays NULL unless probe tracking is compiled in.
  unsigned *d_probe_counts = NULL;
#ifdef TRACK_ITERATIONS
  CUDA_SAFE_CALL(
      cudaMalloc((void **)&d_probe_counts, sizeof(unsigned) * n_queries));
#endif

  switch (num_hash_functions) {
  case 2:
    hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
        n_queries, d_keys, table_size, d_contents, constants_2,
        stash_constants, stash_count, d_values, d_probe_counts);
    break;
  case 3:
    hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
        n_queries, d_keys, table_size, d_contents, constants_3,
        stash_constants, stash_count, d_values, d_probe_counts);
    break;
  case 4:
    hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
        n_queries, d_keys, table_size, d_contents, constants_4,
        stash_constants, stash_count, d_values, d_probe_counts);
    break;
  default:
    hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
        n_queries, d_keys, table_size, d_contents, constants_5,
        stash_constants, stash_count, d_values, d_probe_counts);
    break;
  }

  CUDA_CHECK_ERROR("Retrieval failed.\n");

#ifdef TRACK_ITERATIONS
  OutputRetrievalStatistics(n_queries, d_probe_counts, num_hash_functions);
  CUDA_SAFE_CALL(cudaFree(d_probe_counts));
#endif
}
}; // namespace CUDAWrapper
}; // namespace cuhash
#include <cuda.h>
#include <cuhash/hash_table.h>
int main() {
auto table = cuhash::HashTable();
table.Initialize(10, 2.0);
const int N = 10;
// ハッシュテーブルに格納するデータ
int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
// デバイスメモリにコピー
int *d_keys, *d_vals;
cudaMalloc((void **)&d_keys, sizeof(int) * N);
cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_vals, sizeof(int) * N);
cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
// ハッシュテーブルにクエリするデータ
int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
int output[N];
// デバイスメモリにコピー
int *d_input, *d_output;
cudaMalloc((void **)&d_input, sizeof(int) * N);
cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_output, sizeof(int) * N);
cudaMemset(d_output, 0, sizeof(int) * N);
bool s = table.Build(N, (const unsigned int *)d_keys,
(const unsigned int *)d_vals);
std::cout << s << std::endl;
table.Retrieve(N, (const unsigned int *)d_input, (unsigned int *)d_output);
std::cout << s << std::endl;
cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; ++i) {
printf("%d\n", output[i]);
}
return 0;
}
\ No newline at end of file
# Host sources are always built; CUDA sources are added when SPCONV_BuildCUDA
# is enabled.
set(ALL_FILES
  all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc
  point2voxel_ops.cc)
if(SPCONV_BuildCUDA)
  list(APPEND ALL_FILES
    indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc
    point2voxel.cu fused_conv.cu)
endif()

add_library(spconv SHARED ${ALL_FILES})

# OpenMP is optional: link it only when the C++ component was found.
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
  target_link_libraries(spconv PUBLIC OpenMP::OpenMP_CXX)
endif()

target_include_directories(spconv PRIVATE ${ALL_INCLUDE} ${MP11_INCLUDE})
set_target_properties(spconv PROPERTIES
  CUDA_STANDARD 14
  CXX_STANDARD 14
  CUDA_SEPARABLE_COMPILATION ON)

# The CUDA build additionally links the cuhash and spgemm targets.
target_link_libraries(spconv PRIVATE ${ALL_LIBS})
if(SPCONV_BuildCUDA)
  target_link_libraries(spconv PRIVATE cuhash spgemm)
endif()

install(TARGETS spconv DESTINATION lib)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment