Commit 47527b00 authored by traveller59's avatar traveller59
Browse files

v1.1 alpha: add cuda hash implementation

parent 5df97387
......@@ -52,7 +52,8 @@ def get_indice_pairs(indices,
out_padding=0,
subm=False,
transpose=False,
grid=None):
grid=None,
use_hash=True):
ndim = indices.shape[1] - 1
if not isinstance(ksize, (list, tuple)):
ksize = [ksize] * ndim
......@@ -88,7 +89,7 @@ def get_indice_pairs(indices,
else:
raise NotImplementedError
return get_indice_pairs_func(indices, batch_size, out_shape, spatial_shape, ksize,
stride, padding, dilation, out_padding, int(subm), int(transpose))
stride, padding, dilation, out_padding, int(subm), int(transpose), int(use_hash))
else:
if ndim == 2:
get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d
......@@ -97,7 +98,7 @@ def get_indice_pairs(indices,
else:
raise NotImplementedError
return get_indice_pairs_func(indices, grid, batch_size, out_shape, spatial_shape, ksize,
stride, padding, dilation, out_padding, int(subm), int(transpose))
stride, padding, dilation, out_padding, int(subm), int(transpose), int(use_hash))
......@@ -177,3 +178,11 @@ def nms(boxes, scores, pre_max_size, post_max_size, thresh, eps):
res = torch.ops.spconv.nms(
boxes, scores, pre_max_size, post_max_size, thresh, eps)
return res
def pillar_scatter(features, coors, shape):
if features.dtype == torch.float32:
return torch.ops.spconv.pillar_scatter_float(features, coors, shape)
elif features.dtype == torch.half:
return torch.ops.spconv.pillar_scatter_half(features, coors, shape)
else:
raise NotImplementedError
......@@ -13,12 +13,12 @@
# limitations under the License.
import numpy as np
from spconv import spconv_utils
from spconv.spconv_utils import (
non_max_suppression, non_max_suppression_cpu, points_to_voxel_3d_np,
rbbox_iou, points_to_voxel_3d_np_mean, points_to_voxel_3d_np_height,
points_to_voxel_3d_with_filtering, rotate_non_max_suppression_cpu,
rbbox_intersection)
points_to_voxel_3d_np_mean, points_to_voxel_3d_with_filtering,
rbbox_intersection, rbbox_iou, rotate_non_max_suppression_cpu)
def points_to_voxel(points,
......@@ -28,11 +28,11 @@ def points_to_voxel(points,
max_points=35,
max_voxels=20000,
full_mean=False,
with_height=False,
block_filtering=True,
block_factor=1,
block_size=8,
height_threshold=0.2,
height_high_threshold=3.0,
pad_output=False):
"""convert 3d points(N, >=3) to voxels. This version calculate
everything in one loop. now it takes only 0.8ms(~6k voxels)
......@@ -51,7 +51,6 @@ def points_to_voxel(points,
before call this function because max_voxels may drop some points.
full_mean: bool. if true, all empty points in voxel will be filled with mean
of exist points.
with_height: bool. don't use this.
block_filtering: filter voxels by height. used for lidar point cloud.
use some visualization tool to see filtered result.
Returns:
......@@ -71,67 +70,63 @@ def points_to_voxel(points,
num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
voxels = np.zeros(
shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype)
voxel_point_mask = np.zeros(
shape=(max_voxels, max_points), dtype=points.dtype)
coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
res = {
"voxels": voxels,
"coordinates": coors,
"num_points_per_voxel": num_points_per_voxel,
"voxel_point_mask": voxel_point_mask,
}
if full_mean:
means = np.zeros(
shape=(max_voxels, points.shape[-1]), dtype=points.dtype)
voxel_num = points_to_voxel_3d_np_mean(
points, voxels, means, coors,
points, voxels, voxel_point_mask, means, coors,
num_points_per_voxel, coor_to_voxelidx, voxel_size.tolist(),
coors_range.tolist(), max_points, max_voxels)
else:
if with_height:
heights = np.zeros(
shape=(max_voxels, points.shape[-1]), dtype=points.dtype)
maxs = np.zeros(
shape=(max_voxels, points.shape[-1]), dtype=points.dtype)
res["heights"] = heights
voxel_num = points_to_voxel_3d_np_height(
points, voxels, heights, maxs, coors,
if block_filtering:
block_shape = [*voxelmap_shape[1:]]
block_shape = [b // block_factor for b in block_shape]
mins = np.full(block_shape, 99999999, dtype=points.dtype)
maxs = np.full(block_shape, -99999999, dtype=points.dtype)
voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
voxel_num = points_to_voxel_3d_with_filtering(
points, voxels, voxel_point_mask, voxel_mask, mins, maxs,
coors, num_points_per_voxel, coor_to_voxelidx,
voxel_size.tolist(), coors_range.tolist(), max_points,
max_voxels, block_factor, block_size, height_threshold,
height_high_threshold)
voxel_mask = voxel_mask.astype(np.bool_)
coors_ = coors[voxel_mask]
if pad_output:
res["coordinates"][:voxel_num] = coors_
res["voxels"][:voxel_num] = voxels[voxel_mask]
res["voxel_point_mask"][:voxel_num] = voxel_point_mask[
voxel_mask]
res["num_points_per_voxel"][:voxel_num] = num_points_per_voxel[
voxel_mask]
res["coordinates"][voxel_num:] = 0
res["voxels"][voxel_num:] = 0
res["num_points_per_voxel"][voxel_num:] = 0
res["voxel_point_mask"][voxel_num:] = 0
else:
res["coordinates"] = coors_
res["voxels"] = voxels[voxel_mask]
res["num_points_per_voxel"] = num_points_per_voxel[voxel_mask]
res["voxel_point_mask"] = voxel_point_mask[voxel_mask]
voxel_num = coors_.shape[0]
else:
voxel_num = points_to_voxel_3d_np(
points, voxels, voxel_point_mask, coors,
num_points_per_voxel, coor_to_voxelidx, voxel_size.tolist(),
coors_range.tolist(), max_points, max_voxels)
else:
if block_filtering:
block_shape = [*voxelmap_shape[1:]]
block_shape = [b // block_factor for b in block_shape]
mins = np.full(block_shape, 99999999, dtype=points.dtype)
maxs = np.full(block_shape, -99999999, dtype=points.dtype)
voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
voxel_num = points_to_voxel_3d_with_filtering(
points, voxels, voxel_mask, mins, maxs,
coors, num_points_per_voxel, coor_to_voxelidx,
voxel_size.tolist(), coors_range.tolist(), max_points,
max_voxels, block_factor, block_size, height_threshold)
voxel_mask = voxel_mask.astype(np.bool_)
coors_ = coors[voxel_mask]
if pad_output:
res["coordinates"][:voxel_num] = coors_
res["voxels"][:voxel_num] = voxels[voxel_mask]
res["num_points_per_voxel"][:
voxel_num] = num_points_per_voxel[
voxel_mask]
res["coordinates"][voxel_num:] = 0
res["voxels"][voxel_num:] = 0
res["num_points_per_voxel"][voxel_num:] = 0
else:
res["coordinates"] = coors_
res["voxels"] = voxels[voxel_mask]
res["num_points_per_voxel"] = num_points_per_voxel[
voxel_mask]
voxel_num = coors_.shape[0]
else:
voxel_num = points_to_voxel_3d_np(points, voxels, coors,
num_points_per_voxel,
coor_to_voxelidx,
voxel_size.tolist(),
coors_range.tolist(),
max_points, max_voxels)
res["voxel_num"] = voxel_num
res["voxel_point_mask"] = res["voxel_point_mask"].reshape(
-1, max_points, 1)
return res
......@@ -209,12 +204,11 @@ class VoxelGeneratorV2:
max_num_points,
max_voxels=20000,
full_mean=False,
with_height=False,
block_filtering=False,
block_factor=8,
block_size=3,
height_threshold=0.1):
assert with_height is False, "don't use this."
height_threshold=0.1,
height_high_threshold=2.0):
assert full_mean is False, "don't use this."
point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
# [0, -40, -3, 70.4, 40, 1]
......@@ -236,19 +230,19 @@ class VoxelGeneratorV2:
self._max_voxels = max_voxels
self._grid_size = grid_size
self._full_mean = full_mean
self._with_height = with_height
self._block_filtering = block_filtering
self._block_factor = block_factor
self._height_threshold = height_threshold
self._block_size = block_size
self._height_high_threshold = height_high_threshold
def generate(self, points, max_voxels=None):
res = points_to_voxel(
points, self._voxel_size, self._point_cloud_range,
self._coor_to_voxelidx, self._max_num_points, max_voxels
or self._max_voxels, self._full_mean, self._with_height,
self._block_filtering, self._block_factor, self._block_size,
self._height_threshold)
or self._max_voxels, self._full_mean, self._block_filtering,
self._block_factor, self._block_size, self._height_threshold,
self._height_high_threshold)
for k, v in res.items():
if k != "voxel_num":
res[k] = v[:res["voxel_num"]]
......@@ -263,11 +257,11 @@ class VoxelGeneratorV2:
self._max_num_points,
max_voxels or self._max_voxels,
self._full_mean,
self._with_height,
self._block_filtering,
self._block_factor,
self._block_size,
self._height_threshold,
self._height_high_threshold,
pad_output=True)
return res
......@@ -285,4 +279,4 @@ class VoxelGeneratorV2:
@property
def grid_size(self):
return self._grid_size
\ No newline at end of file
return self._grid_size
add_library(cudahash SHARED hash_functions.cu hash_table.cpp hash_table.cu
mt19937ar.cpp)
target_include_directories(cudahash PRIVATE ${ALL_INCLUDE} )
set_property(TARGET cudahash PROPERTY CUDA_STANDARD 14)
set_property(TARGET cudahash PROPERTY CXX_STANDARD 14)
set_target_properties(cudahash PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(cudahash PRIVATE ${ALL_LIBS})
install (TARGETS cudahash DESTINATION lib)
add_executable(cudahash_test main.cc)
target_include_directories(cudahash_test PRIVATE ${ALL_INCLUDE} )
set_property(TARGET cudahash_test PROPERTY CUDA_STANDARD 14)
set_property(TARGET cudahash_test PROPERTY CXX_STANDARD 14)
set_target_properties(cudahash_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(cudahash_test PRIVATE ${ALL_LIBS} cudahash)
install (TARGETS cudahash_test DESTINATION bin)
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* debugging.cpp
*
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <hash/debugging.h>
#include <hash/definitions.h>
#include <algorithm>
#include <cstring>
#include <hash/cuda_util.h>
namespace cudahash {
void OutputRetrievalStatistics(const unsigned n_queries,
const unsigned *d_retrieval_probes,
const unsigned n_functions)
{
unsigned *retrieval_probes = new unsigned[n_queries];
CUDA_SAFE_CALL(cudaMemcpy(retrieval_probes,
d_retrieval_probes,
sizeof(unsigned) * n_queries,
cudaMemcpyDeviceToHost));
// Create a histogram showing how many items needed how many probes to be found.
unsigned possible_probes = n_functions + 2;
unsigned *histogram = new unsigned[possible_probes];
memset(histogram, 0, sizeof(unsigned) * (possible_probes));
for (unsigned i = 0; i < n_queries; ++i) {
histogram[retrieval_probes[i]]++;
}
// Dump it.
char buffer[10000];
sprintf(buffer, "Probes for retrieval: ");
PrintMessage(buffer);
for (unsigned i = 0; i < possible_probes; ++i) {
sprintf(buffer, "\t(%u, %u)", i, histogram[i]);
PrintMessage(buffer);
}
delete [] retrieval_probes;
delete [] histogram;
}
void OutputBuildStatistics(const unsigned n,
const unsigned *d_iterations_taken) {
// Output how many iterations each thread took until it found an empty slot.
unsigned *iterations_taken = new unsigned[n];
CUDA_SAFE_CALL(cudaMemcpy(iterations_taken, d_iterations_taken, sizeof(unsigned) * n, cudaMemcpyDeviceToHost));
std::sort(iterations_taken, iterations_taken + n);
unsigned total_iterations = 0;
unsigned max_iterations_taken = 0;
for (unsigned i = 0; i < n; ++i) {
total_iterations += iterations_taken[i];
max_iterations_taken = std::max(max_iterations_taken, iterations_taken[i]);
}
unsigned current_value = iterations_taken[0];
unsigned count = 1;
char buffer[10000];
sprintf(buffer, "Iterations taken:\n");
for (unsigned i = 1; i < n; ++i) {
if (iterations_taken[i] != current_value) {
sprintf(buffer, "%s\t(%u, %u)\n", buffer, current_value, count);
current_value = iterations_taken[i];
count = 1;
} else {
count++;
}
}
sprintf(buffer, "%s\t(%u, %u)", buffer, current_value, count);
PrintMessage(buffer);
sprintf(buffer, "Total iterations: %u", total_iterations);
PrintMessage(buffer);
sprintf(buffer, "Avg/Med/Max iterations: (%f %u %u)", (float)total_iterations / n, iterations_taken[n/2], iterations_taken[n-1]);
PrintMessage(buffer);
delete [] iterations_taken;
// Print the length of the longest eviction chain.
sprintf(buffer, "Max iterations: %u", max_iterations_taken);
PrintMessage(buffer);
}
}; // namespace CuckooHashing
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* debugging.cu
*
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <hash/debugging.h>
#include <hash/definitions.h>
#include <hash/hash_table.cuh>
#include <algorithm>
#include <hash/cuda_util.h>
namespace cudahash {
//! Debugging function: Takes statistics on the hash functions' distribution.
/*! Determines:
* - How many unique slots each key has.
* - How many keys hash into each slot.
* - Whether any keys failed to get a full set of slots.
*/
__global__
void take_hash_function_statistics_kernel(const unsigned *keys,
const unsigned n_entries,
const unsigned table_size,
const uint2 *constants,
const unsigned num_functions,
unsigned *num_slots_available,
unsigned *num_hashing_in,
unsigned *failed) {
unsigned thread_index = threadIdx.x +
blockIdx.x * blockDim.x +
blockIdx.y * blockDim.x * gridDim.x;
if (thread_index >= n_entries)
return;
unsigned key = keys[thread_index];
// Determine all of the locations the key hashes into.
// Also count how many keys hash into each location.
unsigned locations[kMaxHashFunctions];
for (unsigned i = 0; i < num_functions; ++i) {
locations[i] = hash_function_inner(constants[i], key) % table_size;
if (num_hashing_in != NULL) {
atomicAdd(num_hashing_in + locations[i], 1);
}
}
// Determine whether all of the locations were different.
unsigned num_slots = 1;
for (unsigned i = 1; i < num_functions; ++i) {
bool matched = false;
for (unsigned j = 0; j < i; ++j) {
if (locations[i] == locations[j]) {
matched = true;
break;
}
}
if (!matched) {
num_slots++;
}
}
if (num_slots_available != NULL) {
num_slots_available[thread_index] = num_slots;
}
if (failed != NULL && num_slots != num_functions) {
*failed = 1;
}
}
void TakeHashFunctionStatistics(const unsigned num_keys,
const unsigned *d_keys,
const unsigned table_size,
const uint2 *constants,
const unsigned kNumHashFunctions) {
char buffer[16000];
PrintMessage("Hash function constants: ");
for (unsigned i = 0; i < kNumHashFunctions; ++i) {
sprintf(buffer, "\t%10u, %10u", constants[i].x, constants[i].y);
PrintMessage(buffer);
}
unsigned *d_num_hashing_in = NULL;
#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
CUDA_SAFE_CALL(cudaMalloc((void**)&d_num_hashing_in,
sizeof(unsigned) * table_size));
CUDA_SAFE_CALL(cudaMemset(d_num_hashing_in, 0, sizeof(unsigned) * table_size));
#endif
unsigned *d_num_slots_available = NULL;
#ifdef COUNT_HOW_MANY_HAVE_CYCLES
CUDA_SAFE_CALL(cudaMalloc((void**)&d_num_slots_available,
sizeof(unsigned) * num_keys));
#endif
uint2 *d_constants = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_constants, sizeof(uint2) * kNumHashFunctions));
CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants, sizeof(uint2) * kNumHashFunctions, cudaMemcpyHostToDevice));
take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys), kBlockSize>>>
(d_keys, num_keys,
table_size,
d_constants,
kNumHashFunctions,
d_num_slots_available,
d_num_hashing_in,
NULL);
CUDA_SAFE_CALL(cudaFree(d_constants));
#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
unsigned *num_hashing_in = new unsigned[table_size];
CUDA_SAFE_CALL(cudaMemcpy(num_hashing_in,
d_num_hashing_in,
sizeof(unsigned) * table_size,
cudaMemcpyDeviceToHost));
/*
// Print how many items hash into each slot.
// Used to make sure items are spread evenly throughout the table.
buffer[0] = '\0';
PrintMessage("Num hashing into each: ", true);
for (unsigned i = 0; i < table_size; ++i) {
sprintf(buffer, "%s\t%2u", buffer, num_hashing_in[i]);
if (i % 25 == 24) {
PrintMessage(buffer, true);
buffer[0] = '\0';
}
}
PrintMessage(buffer,true);
*/
// Print a histogram of how many items are hashed into each slot. Shows
// if average number of items hashing into each slot is low.
std::sort(num_hashing_in, num_hashing_in + table_size);
int count = 1;
unsigned previous = num_hashing_in[0];
sprintf(buffer, "Num items hashing into a slot:\t");
PrintMessage(buffer);
for (unsigned i = 1; i < table_size; ++i) {
if (num_hashing_in[i] != previous) {
sprintf(buffer, "\t(%u, %u)", previous, count);
PrintMessage(buffer);
previous = num_hashing_in[i];
count = 1;
} else {
count++;
}
}
sprintf(buffer, "\t(%u, %u)", previous, count);
PrintMessage(buffer);
delete [] num_hashing_in;
CUDA_SAFE_CALL(cudaFree(d_num_hashing_in));
#endif
#ifdef COUNT_HOW_MANY_HAVE_CYCLES
unsigned *num_slots_available = new unsigned[num_keys];
CUDA_SAFE_CALL(cudaMemcpy(num_slots_available,
d_num_slots_available,
sizeof(unsigned) * num_keys,
cudaMemcpyDeviceToHost));
static const unsigned kHistogramSize = kNumHashFunctions + 1;
unsigned *histogram = new unsigned[kHistogramSize];
memset(histogram, 0, sizeof(unsigned) * kHistogramSize);
for (unsigned i = 0; i < num_keys; ++i) {
histogram[num_slots_available[i]]++;
}
sprintf(buffer, "Slots assigned to each key: ");
for (unsigned i = 1; i < kHistogramSize; ++i) {
sprintf(buffer, "%s(%u, %u) ", buffer, i, histogram[i]);
}
PrintMessage(buffer);
delete [] histogram;
delete [] num_slots_available;
CUDA_SAFE_CALL(cudaFree(d_num_slots_available));
#endif
}
bool CheckAssignedSameSlot(const unsigned N,
const unsigned num_keys,
const unsigned *d_keys,
const unsigned table_size,
uint2 *constants) {
unsigned *d_cycle_exists = NULL;
uint2 *d_constants = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_cycle_exists, sizeof(unsigned)));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_constants, sizeof(uint2) * N));
CUDA_SAFE_CALL(cudaMemset(d_cycle_exists, 0, sizeof(unsigned)));
CUDA_SAFE_CALL(cudaMemcpy(d_constants,
constants,
sizeof(uint2) * N,
cudaMemcpyHostToDevice));
// Check if all keys were given a full set of N slots by the functions.
take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys), kBlockSize>>>
(d_keys, num_keys, table_size, d_constants, N,
NULL, NULL, d_cycle_exists);
unsigned cycle_exists;
CUDA_SAFE_CALL(cudaMemcpy(&cycle_exists,
d_cycle_exists,
sizeof(unsigned),
cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(d_cycle_exists));
CUDA_SAFE_CALL(cudaFree(d_constants));
return (cycle_exists != 0);
}
void PrintStashContents(const Entry *d_stash) {
Entry *stash = new Entry[cudahash::kStashSize];
CUDA_SAFE_CALL(cudaMemcpy(stash, d_stash, sizeof(Entry) * cudahash::kStashSize, cudaMemcpyDeviceToHost));
for (unsigned i = 0; i < cudahash::kStashSize; ++i) {
if (get_key(stash[i]) != kKeyEmpty) {
char buffer[256];
sprintf(buffer, "Stash[%u]: %u = %u", i, get_key(stash[i]), get_value(stash[i]));
PrintMessage(buffer, true);
}
}
delete [] stash;
}
}; // namespace CuckooHashing
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
#include <hash/hash_table.h>
#include <hash/debugging.h>
#include <hash/mt19937ar.h>
#include <cassert>
namespace cudahash {
void GenerateFunctions(const unsigned N,
const unsigned num_keys,
const unsigned *d_keys,
const unsigned table_size,
uint2 *constants) {
bool regenerate = true;
while (regenerate) {
regenerate = false;
// Generate a set of hash function constants for this build attempt.
for (unsigned i = 0 ; i < N; ++i) {
unsigned new_a = genrand_int32() % kPrimeDivisor;
constants[i].x = (1 > new_a ? 1 : new_a);
constants[i].y = genrand_int32() % kPrimeDivisor;
}
#ifdef FORCEFULLY_GENERATE_NO_CYCLES
// Ensure that every key gets N different slots.
regenerate = CheckAssignedSameSlot(N, num_keys, d_keys, table_size, constants);
#endif
}
#ifdef TAKE_HASH_FUNCTION_STATISTICS
// Examine how well distributed the items are.
TakeHashFunctionStatistics(num_keys, d_keys, table_size, constants, N);
#endif
}
}; // namespace CuckooHashing
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.cpp
*
* @brief Implements a basic hash table that stores one value per key.
*/
#include <hash/hash_table.h>
#include <hash/debugging.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <limits>
#include <hash/mt19937ar.h>
#include <cuda_runtime_api.h>
#include <hash/cuda_util.h>
namespace cudahash {
char buffer[256];
//! @name Internal
/// @{
dim3 ComputeGridDim(unsigned n) {
// Round up in order to make sure all items are hashed in.
dim3 grid( (n + kBlockSize-1) / kBlockSize );
if (grid.x > kGridSize) {
grid.y = (grid.x + kGridSize - 1) / kGridSize;
grid.x = kGridSize;
}
return grid;
}
unsigned ComputeMaxIterations(const unsigned n,
const unsigned table_size,
const unsigned num_functions) {
float lg_input_size = (float)(log((double)n) / log(2.0));
// #define CONSTANT_ITERATIONS
#ifdef CONSTANT_ITERATIONS
// Set the maximum number of iterations to 7lg(N).
const unsigned MAX_ITERATION_CONSTANT = 7;
unsigned max_iterations = MAX_ITERATION_CONSTANT * lg_input_size;
#else
// Use an empirical formula for determining what the maximum number of
// iterations should be. Works OK in most situations.
float load_factor = float(n) / table_size;
float ln_load_factor = (float)(log(load_factor) / log(2.71828183));
unsigned max_iterations = (unsigned)(4.0 * ceil(-1.0 / (0.028255 + 1.1594772 *
ln_load_factor)* lg_input_size));
#endif
return max_iterations;
}
/// @}
HashTable::HashTable() : table_size_(0),
d_contents_(NULL),
stash_count_(0),
d_failures_(NULL) {
CUDA_CHECK_ERROR("Failed in constructor.\n");
}
bool HashTable::Initialize(const unsigned max_table_entries,
const float space_usage,
const unsigned num_functions) {
Release();
// Determine the minimum amount of slots the table requires,
// and whether the space_usage is within range.
float minimum_space_usage;
if (num_functions < 2 || num_functions > 5) {
char message[256] = "Number of hash functions must be from 2 to 5; "
"others are unimplemented.";
PrintMessage(message, true);
return false;
} else {
minimum_space_usage = kMinimumSpaceUsages[num_functions];
}
if (space_usage < minimum_space_usage) {
sprintf(buffer, "Minimum possible space usage for %u functions is %f.",
num_functions, minimum_space_usage);
PrintMessage(buffer);
return false;
}
num_hash_functions_ = num_functions;
table_size_ = unsigned(ceil(max_table_entries * space_usage));
// Allocate memory.
const unsigned slots_to_allocate = table_size_ + kStashSize;
CUDA_SAFE_CALL(cudaMalloc( (void**)&d_contents_,
sizeof(Entry) * slots_to_allocate ));
CUDA_SAFE_CALL(cudaMalloc( (void**)&d_failures_, sizeof(unsigned) ));
if (!d_contents_ || !d_failures_) {
fprintf(stderr, "Failed to allocate %u slots.\n", slots_to_allocate);
return false;
}
CUDA_CHECK_ERROR("Failed to initialize.\n");
return true;
}
void HashTable::Release() {
table_size_ = 0;
CUDA_SAFE_CALL(cudaFree(d_contents_));
CUDA_SAFE_CALL(cudaFree(d_failures_));
d_contents_ = NULL;
d_failures_ = NULL;
CUDA_CHECK_ERROR("Failed during release.\n");
}
bool HashTable::Build(const unsigned n,
const unsigned *d_keys,
const unsigned *d_values) {
unsigned max_iterations = ComputeMaxIterations(n, table_size_,
num_hash_functions_);
unsigned num_failures = 1;
unsigned num_attempts = 0;
// Storage for statistics collection.
unsigned *d_iterations_taken = NULL;
#ifdef TRACK_ITERATIONS
CUDA_SAFE_CALL(cudaMalloc((void**)&d_iterations_taken, sizeof(unsigned) * n));
#endif
// Track how many items ended up in the stash.
unsigned *d_stash_count = NULL;
CUDA_SAFE_CALL(cudaMalloc((void**)&d_stash_count, sizeof(unsigned)));
CUDA_CHECK_ERROR("Failed before main build loop.\n");
// Main build loop.
while (num_failures && ++num_attempts < kMaxRestartAttempts) {
CUDA_SAFE_CALL(cudaMemset(d_stash_count, 0, sizeof(unsigned)));
// Generate new hash functions.
if (num_hash_functions_ == 2)
constants_2_.Generate(n, d_keys,table_size_);
else if (num_hash_functions_ == 3)
constants_3_.Generate(n, d_keys,table_size_);
else if (num_hash_functions_ == 4)
constants_4_.Generate(n, d_keys,table_size_);
else
constants_5_.Generate(n, d_keys,table_size_);
stash_constants_.x = std::max(1lu, genrand_int32()) % kPrimeDivisor;
stash_constants_.y = genrand_int32() % kPrimeDivisor;
stash_count_ = 0;
// Initialize memory.
unsigned slots_in_table = table_size_ + kStashSize;
CUDAWrapper::ClearTable(slots_in_table,
kEntryEmpty,
d_contents_);
num_failures = 0;
CUDAWrapper::CallCuckooHash(n,
num_hash_functions_,
d_keys,
d_values,
table_size_,
constants_2_,
constants_3_,
constants_4_,
constants_5_,
max_iterations,
d_contents_,
stash_constants_,
d_stash_count,
d_failures_,
d_iterations_taken);
// Check if successful.
CUDA_SAFE_CALL(cudaMemcpy( &num_failures, d_failures_, sizeof(unsigned), cudaMemcpyDeviceToHost ));
#ifdef COUNT_UNINSERTED
if (num_failures) {
printf("Failed to insert %u items.\n", num_failures);
}
#endif
}
// Copy out the stash size.
CUDA_SAFE_CALL(cudaMemcpy( &stash_count_, d_stash_count, sizeof(unsigned), cudaMemcpyDeviceToHost ));
if (stash_count_ && num_failures == 0) {
sprintf(buffer, "Stash size: %u", stash_count_);
PrintMessage(buffer, true);
#ifdef _DEBUG
PrintStashContents(d_contents_ + table_size_);
#endif
}
CUDA_SAFE_CALL(cudaFree(d_stash_count));
#ifdef TRACK_ITERATIONS
if (num_failures == 0) {
OutputBuildStatistics(n, d_iterations_taken);
}
CUDA_SAFE_CALL(cudaFree(d_iterations_taken));
#endif
// Dump some info if a restart was required.
if (num_attempts >= kMaxRestartAttempts) {
sprintf(buffer, "Completely failed to build");
PrintMessage(buffer, true);
} else if (num_attempts > 1) {
sprintf(buffer, "Needed %u attempts to build", num_attempts);
PrintMessage(buffer, true);
}
CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
return num_failures == 0;
}
void HashTable::Retrieve(const unsigned n_queries,
const unsigned *d_keys,
unsigned *d_values) {
CUDAWrapper::CallHashRetrieve(n_queries,
num_hash_functions_,
d_keys,
table_size_,
d_contents_,
constants_2_,
constants_3_,
constants_4_,
constants_5_,
stash_constants_,
stash_count_,
d_values);
}
}; // namesapce CuckooHashing
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.cu
*
* @brief Hides all of the CUDA calls from the actual CPP file.
*/
#include <hash/cuda_util.h>
#include <hash/debugging.h>
#include <hash/definitions.h>
#include <hash/hash_table.cuh>
#include <cuda.h>
namespace cudahash {
namespace CUDAWrapper {
void ClearTable(const unsigned slots_in_table,
const Entry fill_value,
Entry *d_contents) {
clear_table<Entry><<<ComputeGridDim(slots_in_table), kBlockSize>>>
(slots_in_table, fill_value, d_contents);
TV_CHECK_CUDA_ERR_V2("Error occurred during hash table clear.\n");
}
void CallCuckooHash(const unsigned n,
const unsigned num_hash_functions,
const unsigned *d_keys,
const unsigned *d_values,
const unsigned table_size,
const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const unsigned max_iterations,
Entry *d_contents,
uint2 stash_constants,
unsigned *d_stash_count,
unsigned *d_failures,
unsigned *d_iterations_taken) {
// Build the table.
cudaMemset(d_failures, 0, sizeof(unsigned));
if (num_hash_functions == 2) {
CuckooHash<<<ComputeGridDim(n), kBlockSize>>>
(n,
d_keys,
d_values,
table_size,
constants_2,
max_iterations,
d_contents,
stash_constants,
d_stash_count,
d_failures,
d_iterations_taken);
} else if (num_hash_functions == 3) {
CuckooHash<<<ComputeGridDim(n), kBlockSize>>>
(n,
d_keys,
d_values,
table_size,
constants_3,
max_iterations,
d_contents,
stash_constants,
d_stash_count,
d_failures,
d_iterations_taken);
} else if (num_hash_functions == 4) {
CuckooHash<<<ComputeGridDim(n), kBlockSize>>>
(n,
d_keys,
d_values,
table_size,
constants_4,
max_iterations,
d_contents,
stash_constants,
d_stash_count,
d_failures,
d_iterations_taken);
} else {
CuckooHash<<<ComputeGridDim(n), kBlockSize>>>
(n,
d_keys,
d_values,
table_size,
constants_5,
max_iterations,
d_contents,
stash_constants,
d_stash_count,
d_failures,
d_iterations_taken);
}
CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
}
void CallHashRetrieve(const unsigned n_queries,
const unsigned num_hash_functions,
const unsigned *d_keys,
const unsigned table_size,
const Entry *d_contents,
const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const uint2 stash_constants,
const unsigned stash_count,
unsigned *d_values) {
unsigned *d_retrieval_probes = NULL;
#ifdef TRACK_ITERATIONS
CUDA_SAFE_CALL(cudaMalloc((void**)&d_retrieval_probes, sizeof(unsigned) * n_queries));
#endif
if (num_hash_functions == 2) {
hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>
(n_queries,
d_keys,
table_size,
d_contents,
constants_2,
stash_constants,
stash_count,
d_values,
d_retrieval_probes);
} else if (num_hash_functions == 3) {
hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>
(n_queries,
d_keys,
table_size,
d_contents,
constants_3,
stash_constants,
stash_count,
d_values,
d_retrieval_probes);
} else if (num_hash_functions == 4) {
hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>
(n_queries,
d_keys,
table_size,
d_contents,
constants_4,
stash_constants,
stash_count,
d_values,
d_retrieval_probes);
} else {
hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>
(n_queries,
d_keys,
table_size,
d_contents,
constants_5,
stash_constants,
stash_count,
d_values,
d_retrieval_probes);
}
CUDA_CHECK_ERROR("Retrieval failed.\n");
#ifdef TRACK_ITERATIONS
OutputRetrievalStatistics(n_queries,
d_retrieval_probes,
num_hash_functions);
CUDA_SAFE_CALL(cudaFree(d_retrieval_probes));
#endif
}
}; // namespace CUDAWrapper
}; // namespace CuckooHashing
#include <hash/hash_table.h>
#include <cuda.h>
int main(){
auto table = cudahash::HashTable();
table.Initialize(10, 2.0);
const int N = 10;
// ハッシュテーブルに格納するデータ
int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
// デバイスメモリにコピー
int *d_keys, *d_vals;
cudaMalloc((void**)&d_keys, sizeof(int) * N);
cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_vals, sizeof(int) * N);
cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
// ハッシュテーブルにクエリするデータ
int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
int output[N];
// デバイスメモリにコピー
int *d_input, *d_output;
cudaMalloc((void**)&d_input, sizeof(int) * N);
cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void**)&d_output, sizeof(int) * N);
cudaMemset(d_output, 0, sizeof(int) * N);
bool s = table.Build(N, (const unsigned int *) d_keys,
(const unsigned int *) d_vals);
std::cout << s << std::endl;
table.Retrieve(N, (const unsigned int *) d_input,
(unsigned int *) d_output);
std::cout << s << std::endl;
cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; ++i) {
printf("%d\n", output[i]);
}
return 0;
}
\ No newline at end of file
/*
A C-program for MT19937, with initialization improved 2002/1/26.
Coded by Takuji Nishimura and Makoto Matsumoto.
Before using, initialize the state by using init_genrand(seed)
or init_by_array(init_key, key_length).
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Any feedback is very welcome.
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
*/
#include <stdio.h>
/* Period parameters */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0dfUL /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
static unsigned long mt[N]; /* the array for the state vector */
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
/* initializes mt[N] with a seed */
void init_genrand(unsigned long s)
{
mt[0]= s & 0xffffffffUL;
for (mti=1; mti<N; mti++) {
mt[mti] =
(1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
/* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
/* In the previous versions, MSBs of the seed affect */
/* only MSBs of the array mt[]. */
/* 2002/01/09 modified by Makoto Matsumoto */
mt[mti] &= 0xffffffffUL;
/* for >32 bit machines */
}
}
/* initialize by an array with array-length */
/* init_key is the array for initializing keys */
/* key_length is its length */
/* slight change for C++, 2004/2/26 */
void init_by_array(unsigned long init_key[], int key_length)
{
int i, j, k;
init_genrand(19650218UL);
i=1; j=0;
k = (N>key_length ? N : key_length);
for (; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+ init_key[j] + j; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
i++; j++;
if (i>=N) { mt[0] = mt[N-1]; i=1; }
if (j>=key_length) j=0;
}
for (k=N-1; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
- i; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
i++;
if (i>=N) { mt[0] = mt[N-1]; i=1; }
}
mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
}
/* generates a random number on [0,0xffffffff]-interval */
unsigned long genrand_int32(void)
{
unsigned long y;
static unsigned long mag01[2]={0x0UL, MATRIX_A};
/* mag01[x] = x * MATRIX_A for x=0,1 */
if (mti >= N) { /* generate N words at one time */
int kk;
if (mti == N+1) /* if init_genrand() has not been called, */
init_genrand(5489UL); /* a default initial seed is used */
for (kk=0;kk<N-M;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
}
for (;kk<N-1;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
}
y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
mti = 0;
}
y = mt[mti++];
/* Tempering */
y ^= (y >> 11);
y ^= (y << 7) & 0x9d2c5680UL;
y ^= (y << 15) & 0xefc60000UL;
y ^= (y >> 18);
return y;
}
/* generates a random number on [0,0x7fffffff]-interval */
long genrand_int31(void)
{
return (long)(genrand_int32()>>1);
}
/* generates a random number on [0,1]-real-interval */
double genrand_real1(void)
{
return genrand_int32()*(1.0/4294967295.0);
/* divided by 2^32-1 */
}
/* generates a random number on [0,1)-real-interval */
double genrand_real2(void)
{
return genrand_int32()*(1.0/4294967296.0);
/* divided by 2^32 */
}
/* generates a random number on (0,1)-real-interval */
double genrand_real3(void)
{
return (((double)genrand_int32()) + 0.5)*(1.0/4294967296.0);
/* divided by 2^32 */
}
/* generates a random number on [0,1) with 53-bit resolution*/
double genrand_res53(void)
{
unsigned long a=genrand_int32()>>5, b=genrand_int32()>>6;
return(a*67108864.0+b)*(1.0/9007199254740992.0);
}
/* These real versions are due to Isaku Wada, 2002/01/09 added */
add_library(spconv SHARED all.cc indice.cc indice.cu
reordering.cc reordering.cu maxpool.cc maxpool.cu nms.cc)
reordering.cc reordering.cu maxpool.cc maxpool.cu nms.cc
pillar_scatter.cu)
target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(spconv PRIVATE ${ALL_LIBS})
target_link_libraries(spconv PRIVATE ${ALL_LIBS} cudahash)
install (TARGETS spconv DESTINATION lib)
......@@ -15,6 +15,7 @@
#include <cuda_runtime_api.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h>
......@@ -34,7 +35,9 @@ static auto registry =
.op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
.op("spconv::indice_maxpool_backward_fp32",
&spconv::indiceMaxPoolBackward<float>)
.op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
.op("spconv::indice_maxpool_backward_half",
&spconv::indiceMaxPoolBackward<at::Half>)
.op("spconv::nms", &spconv::nonMaxSuppression<float>);
\ No newline at end of file
// .op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
// .op("spconv::indice_maxpool_backward_half",
// &spconv::indiceMaxPoolBackward<at::Half>)
.op("spconv::nms", &spconv::nonMaxSuppression<float>)
.op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
.op("spconv::pillar_scatter_half", &spconv::pointPillarScatter<at::Half>);
\ No newline at end of file
......@@ -32,7 +32,7 @@ struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
bool transpose, bool resetGrid, bool useHash) {
if (transpose)
return getIndicePairsDeConv<Index, IndexGrid, NDim>(
indicesIn, indicesOut,
......@@ -59,7 +59,7 @@ struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
bool transpose, bool resetGrid, bool useHash) {
return getIndicePairsSubM<Index, IndexGrid, NDim>(
indicesIn,
gridsOut, indicePairs, indiceNum,
......
......@@ -22,6 +22,7 @@
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
#include <hash/hash_table.h>
namespace spconv {
namespace functor {
......@@ -71,28 +72,61 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
bool transpose, bool resetGrid, bool useHash) {
Index batchSize = gridsOut.dim(0);
auto kernelVolume = indicePairs.dim(0);
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
Index numAct = indicePairUnique.dim(0) - 1;
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR();
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR();
if (resetGrid) {
if (useHash){
auto table = cudahash::HashTable();
table.Initialize(numAct, 2.0);
Index *d_values = nullptr;
cudaMalloc((void**)&d_values, sizeof(Index) * numAct);
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
arangeKernel<Index><<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(d_values, numAct);
bool res = table.Build(numAct, reinterpret_cast<unsigned*>(indicePairUnique.data()),
reinterpret_cast<unsigned*>(d_values));
TV_ASSERT_RT_ERR(res, "err");
assignIndiceOutKernel<Index, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numAct,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
cudaFree(d_values);
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
assignIndicePairsHashKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numActIn, indicePairs,
indicePairUnique,
tableSize, tableData, constants, stash_constants,
stash_count);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
}else{
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
indicePairUnique, outSpatialShape);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
}
if (resetGrid && (!useHash)) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicePairUnique.data(), gridsOut, numAct);
TV_CHECK_CUDA_ERR();
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
return numAct;
}
......@@ -109,22 +143,50 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid) {
bool transpose, bool resetGrid, bool useHash) {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
if (useHash){
auto table = cudahash::HashTable();
table.Initialize(numActIn, 2.0);
unsigned *d_keyvalues = nullptr;
cudaMalloc((void**)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
unsigned *d_values = d_keyvalues + numActIn;
prepareSubMHashKernel<Index, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, d_keyvalues, d_values, outSpatialShape);
TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
bool res = table.Build(numActIn, reinterpret_cast<unsigned*>(d_keyvalues),
reinterpret_cast<unsigned*>(d_values));
TV_ASSERT_RT_ERR(res, "err");
cudaFree(d_keyvalues);
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
getSubMIndicePairsHashKernel<Index, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, indicePairs, indiceNum,
kernelSize, stride, padding, dilation, outSpatialShape,
tableSize, tableData, constants, stash_constants,
stash_count);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
}else{
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, outSpatialShape);
TV_CHECK_CUDA_ERR();
getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation, outSpatialShape);
TV_CHECK_CUDA_ERR();
}
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid) {
if (resetGrid && (!useHash)) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn);
......
......@@ -464,7 +464,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
// DECLARE_GPU_SPECS(at::Half); // currently have problem
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
......
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h>
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
template <typename T, typename Index>
__global__ void pointPillarsScatterKernel(tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors) {
auto numFeatures = features.dim(0);
auto numPoints = features.dim(1);
for (int i : tv::KernelLoopX<int>(numPoints)) {
for (int ifeature : tv::KernelLoopY<int>(numFeatures)) {
canvas(int(coors(0, i)), ifeature, int(coors(2, i)), int(coors(3, i))) =
features(ifeature, i);
}
}
}
namespace functor {
template <typename T, typename Index>
struct PointPillarScatter<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors) {
auto grid = dim3(tv::launch::DivUp(features.dim(1), 32),
tv::launch::DivUp(features.dim(0), 32));
pointPillarsScatterKernel<T, Index>
<<<grid, dim3(32, 32), 0, d.getStream()>>>(canvas, features, coors);
TV_CHECK_CUDA_ERR();
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::PointPillarScatter<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/box_iou.h>
#include <spconv/nms.h>
#include <spconv/point2voxel.h>
#include <spconv/box_iou.h>
namespace py = pybind11;
using namespace pybind11::literals;
PYBIND11_MODULE(spconv_utils, m)
{
m.doc() = "util pybind11 functions for spconv";
m.def("non_max_suppression", &spconv::non_max_suppression<double>, py::return_value_policy::reference_internal, "bbox iou",
"boxes"_a = 1, "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
m.def("non_max_suppression", &spconv::non_max_suppression<float>, py::return_value_policy::reference_internal, "bbox iou",
"boxes"_a = 1, "keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>, py::return_value_policy::reference_internal, "bbox iou",
"boxes"_a = 1, "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>, py::return_value_policy::reference_internal, "bbox iou",
"boxes"_a = 1, "order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
m.def("rotate_non_max_suppression_cpu", &spconv::rotate_non_max_suppression_cpu<float>, py::return_value_policy::reference_internal, "bbox iou",
"box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3, "thresh"_a = 4);
m.def("rotate_non_max_suppression_cpu", &spconv::rotate_non_max_suppression_cpu<double>, py::return_value_policy::reference_internal, "bbox iou",
"box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3, "thresh"_a = 4);
m.def("rbbox_iou", &spconv::rbbox_iou<double>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("rbbox_iou", &spconv::rbbox_iou<float>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("rbbox_intersection", &spconv::rbbox_intersection<double>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("rbbox_intersection", &spconv::rbbox_intersection<float>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "coors"_a = 3,
"num_points_per_voxel"_a = 4, "coor_to_voxelidx"_a = 5,
"voxel_size"_a = 6, "coors_range"_a = 7, "max_points"_a = 8,
"max_voxels"_a = 9);
m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "coors"_a = 3,
"num_points_per_voxel"_a = 4, "coor_to_voxelidx"_a = 5,
"voxel_size"_a = 6, "coors_range"_a = 7, "max_points"_a = 8,
"max_voxels"_a = 9);
m.def("points_to_voxel_3d_np_mean", &spconv::points_to_voxel_3d_np_mean<float, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "means"_a = 3, "coors"_a = 4,
"num_points_per_voxel"_a = 5, "coor_to_voxelidx"_a = 6,
"voxel_size"_a = 7, "coors_range"_a = 8, "max_points"_a = 9,
"max_voxels"_a = 10);
m.def("points_to_voxel_3d_np_mean", &spconv::points_to_voxel_3d_np_mean<double, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "means"_a = 3, "coors"_a = 4,
"num_points_per_voxel"_a = 5, "coor_to_voxelidx"_a = 6,
"voxel_size"_a = 7, "coors_range"_a = 8, "max_points"_a = 9,
"max_voxels"_a = 10);
m.def("points_to_voxel_3d_np_height", &spconv::points_to_voxel_3d_np_height<double, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "heights"_a = 3,
"maxs"_a = 4, "coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
"voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
"max_voxels"_a = 11);
m.def("points_to_voxel_3d_with_filtering", &spconv::points_to_voxel_3d_with_filtering<float, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "voxel_mask"_a = 3, "mins"_a = 4,
"maxs"_a = 5, "coors"_a = 6, "num_points_per_voxel"_a = 7, "coor_to_voxelidx"_a = 8,
"voxel_size"_a = 9, "coors_range"_a = 10, "max_points"_a = 11,
"max_voxels"_a = 12, "block_factor"_a = 13, "block_size"_a = 14, "height_threshold"_a = 15);
m.def("points_to_voxel_3d_with_filtering", &spconv::points_to_voxel_3d_with_filtering<double, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2, "voxel_mask"_a = 3, "mins"_a = 4,
"maxs"_a = 5, "coors"_a = 6, "num_points_per_voxel"_a = 7, "coor_to_voxelidx"_a = 8,
"voxel_size"_a = 9, "coors_range"_a = 10, "max_points"_a = 11,
"max_voxels"_a = 12, "block_factor"_a = 13, "block_size"_a = 14, "height_threshold"_a = 15);
PYBIND11_MODULE(spconv_utils, m) {
m.doc() = "util pybind11 functions for spconv";
m.def("non_max_suppression", &spconv::non_max_suppression<double>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
m.def("non_max_suppression", &spconv::non_max_suppression<float>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"keep_out"_a = 2, "nms_overlap_thresh"_a = 3, "device_id"_a = 4);
m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<double>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
m.def("non_max_suppression_cpu", &spconv::non_max_suppression_cpu<float>,
py::return_value_policy::reference_internal, "bbox iou", "boxes"_a = 1,
"order"_a = 2, "nms_overlap_thresh"_a = 3, "eps"_a = 4);
m.def("rotate_non_max_suppression_cpu",
&spconv::rotate_non_max_suppression_cpu<float>,
py::return_value_policy::reference_internal, "bbox iou",
"box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
"thresh"_a = 4);
m.def("rotate_non_max_suppression_cpu",
&spconv::rotate_non_max_suppression_cpu<double>,
py::return_value_policy::reference_internal, "bbox iou",
"box_corners"_a = 1, "order"_a = 2, "standup_iou"_a = 3,
"thresh"_a = 4);
m.def("rbbox_iou", &spconv::rbbox_iou<double>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("rbbox_iou", &spconv::rbbox_iou<float>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("rbbox_intersection", &spconv::rbbox_intersection<double>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("rbbox_intersection", &spconv::rbbox_intersection<float>,
py::return_value_policy::reference_internal, "rbbox iou",
"box_corners"_a = 1, "qbox_corners"_a = 2, "standup_iou"_a = 3,
"standup_thresh"_a = 4);
m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<float, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
"voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
"coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
"max_points"_a = 9, "max_voxels"_a = 10);
m.def("points_to_voxel_3d_np", &spconv::points_to_voxel_3d_np<double, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
"voxel_point_mask"_a = 3, "coors"_a = 4, "num_points_per_voxel"_a = 5,
"coor_to_voxelidx"_a = 6, "voxel_size"_a = 7, "coors_range"_a = 8,
"max_points"_a = 9, "max_voxels"_a = 10);
m.def("points_to_voxel_3d_np_mean",
&spconv::points_to_voxel_3d_np_mean<float, 3>, "matrix tensor_square",
"points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
"coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
"voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
"max_voxels"_a = 11);
m.def("points_to_voxel_3d_np_mean",
&spconv::points_to_voxel_3d_np_mean<double, 3>, "matrix tensor_square",
"points"_a = 1, "voxels"_a = 2, "voxel_point_mask"_a = 3, "means"_a = 4,
"coors"_a = 5, "num_points_per_voxel"_a = 6, "coor_to_voxelidx"_a = 7,
"voxel_size"_a = 8, "coors_range"_a = 9, "max_points"_a = 10,
"max_voxels"_a = 11);
m.def("points_to_voxel_3d_with_filtering",
&spconv::points_to_voxel_3d_with_filtering<float, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
"voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
"maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
"coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
"max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
"block_size"_a = 15, "height_threshold"_a = 16,
"height_high_threshold"_a = 17);
m.def("points_to_voxel_3d_with_filtering",
&spconv::points_to_voxel_3d_with_filtering<float, 3>,
"matrix tensor_square", "points"_a = 1, "voxels"_a = 2,
"voxel_point_mask"_a = 3, "voxel_mask"_a = 4, "mins"_a = 5,
"maxs"_a = 6, "coors"_a = 7, "num_points_per_voxel"_a = 8,
"coor_to_voxelidx"_a = 9, "voxel_size"_a = 10, "coors_range"_a = 11,
"max_points"_a = 12, "max_voxels"_a = 13, "block_factor"_a = 14,
"block_size"_a = 15, "height_threshold"_a = 16,
"height_high_threshold"_a = 17);
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment