Commit a6abf55d authored by yan.yan's avatar yan.yan
Browse files

Merge branch 'develop'

parents fad30002 79a3eaf2
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from torch.autograd import Function
import spconv
import spconv.pytorch as spconv
#from torch.nn import Module
from spconv.modules import SparseModule
from spconv.pytorch.modules import SparseModule
class JoinTable(SparseModule): # Module):
......
# Copyright 2019 Yan Yan
#
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......
# Copyright 2019 Yan Yan
#
# Copyright 2021 Yan Yan
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -13,282 +13,13 @@
# limitations under the License.
import numpy as np
from spconv import spconv_utils
from spconv.spconv_utils import (non_max_suppression_cpu,
points_to_voxel_3d_np,
points_to_voxel_3d_np_mean,
points_to_voxel_3d_with_filtering,
rbbox_intersection, rbbox_iou,
rotate_non_max_suppression_cpu)
try:
from spconv.spconv_utils import non_max_suppression
except ImportError:
pass
def points_to_voxel(points,
voxel_size,
coors_range,
coor_to_voxelidx,
max_points=35,
max_voxels=20000,
full_mean=False,
block_filtering=True,
block_factor=1,
block_size=8,
height_threshold=0.2,
height_high_threshold=3.0,
pad_output=False):
"""convert 3d points(N, >=3) to voxels. This version calculate
everything in one loop. now it takes only 0.8ms(~6k voxels)
with c++ and 3.2ghz cpu.
Args:
points: [N, ndim] float tensor. points[:, :3] contain xyz points and
points[:, 3:] contain other information such as reflectivity.
voxel_size: [3] list/tuple or array, float. xyz, indicate voxel size
coors_range: [6] list/tuple or array, float. indicate voxel range.
format: xyzxyz, minmax
coor_to_voxelidx: int array. used as a dense map.
max_points: int. indicate maximum points contained in a voxel.
max_voxels: int. indicate maximum voxels this function create.
for voxelnet, 20000 is a good choice. you should shuffle points
before call this function because max_voxels may drop some points.
full_mean: bool. if true, all empty points in voxel will be filled with mean
of exist points.
block_filtering: filter voxels by height. used for lidar point cloud.
use some visualization tool to see filtered result.
Returns:
voxels: [M, max_points, ndim] float tensor. only contain points.
coordinates: [M, 3] int32 tensor. zyx format.
num_points_per_voxel: [M] int32 tensor.
"""
if full_mean:
assert block_filtering is False
if not isinstance(voxel_size, np.ndarray):
voxel_size = np.array(voxel_size, dtype=points.dtype)
if not isinstance(coors_range, np.ndarray):
coors_range = np.array(coors_range, dtype=points.dtype)
voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size
voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist())
voxelmap_shape = voxelmap_shape[::-1]
num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
voxels = np.zeros(shape=(max_voxels, max_points, points.shape[-1]),
dtype=points.dtype)
voxel_point_mask = np.zeros(shape=(max_voxels, max_points),
dtype=points.dtype)
coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
res = {
"voxels": voxels,
"coordinates": coors,
"num_points_per_voxel": num_points_per_voxel,
"voxel_point_mask": voxel_point_mask,
}
if full_mean:
means = np.zeros(shape=(max_voxels, points.shape[-1]),
dtype=points.dtype)
voxel_num = points_to_voxel_3d_np_mean(points, voxels,
voxel_point_mask, means, coors,
num_points_per_voxel,
coor_to_voxelidx,
voxel_size.tolist(),
coors_range.tolist(),
max_points, max_voxels)
else:
if block_filtering:
block_shape = [*voxelmap_shape[1:]]
block_shape = [b // block_factor for b in block_shape]
mins = np.full(block_shape, 99999999, dtype=points.dtype)
maxs = np.full(block_shape, -99999999, dtype=points.dtype)
voxel_mask = np.zeros((max_voxels, ), dtype=np.int32)
voxel_num = points_to_voxel_3d_with_filtering(
points, voxels, voxel_point_mask, voxel_mask, mins, maxs,
coors, num_points_per_voxel, coor_to_voxelidx,
voxel_size.tolist(), coors_range.tolist(), max_points,
max_voxels, block_factor, block_size, height_threshold,
height_high_threshold)
voxel_mask = voxel_mask.astype(np.bool_)
coors_ = coors[voxel_mask]
if pad_output:
res["coordinates"][:voxel_num] = coors_
res["voxels"][:voxel_num] = voxels[voxel_mask]
res["voxel_point_mask"][:voxel_num] = voxel_point_mask[
voxel_mask]
res["num_points_per_voxel"][:voxel_num] = num_points_per_voxel[
voxel_mask]
res["coordinates"][voxel_num:] = 0
res["voxels"][voxel_num:] = 0
res["num_points_per_voxel"][voxel_num:] = 0
res["voxel_point_mask"][voxel_num:] = 0
else:
res["coordinates"] = coors_
res["voxels"] = voxels[voxel_mask]
res["num_points_per_voxel"] = num_points_per_voxel[voxel_mask]
res["voxel_point_mask"] = voxel_point_mask[voxel_mask]
voxel_num = coors_.shape[0]
else:
voxel_num = points_to_voxel_3d_np(points, voxels, voxel_point_mask,
coors, num_points_per_voxel,
coor_to_voxelidx,
voxel_size.tolist(),
coors_range.tolist(), max_points,
max_voxels)
res["voxel_num"] = voxel_num
res["voxel_point_mask"] = res["voxel_point_mask"].reshape(
-1, max_points, 1)
return res
class VoxelGenerator:
def __init__(self,
voxel_size,
point_cloud_range,
max_num_points,
max_voxels=20000,
full_mean=True):
point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
# [0, -40, -3, 70.4, 40, 1]
voxel_size = np.array(voxel_size, dtype=np.float32)
grid_size = (point_cloud_range[3:] -
point_cloud_range[:3]) / voxel_size
grid_size = np.round(grid_size).astype(np.int64)
voxelmap_shape = tuple(np.round(grid_size).astype(np.int32).tolist())
voxelmap_shape = voxelmap_shape[::-1]
self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
self._voxel_size = voxel_size
self._point_cloud_range = point_cloud_range
self._max_num_points = max_num_points
self._max_voxels = max_voxels
self._grid_size = grid_size
self._full_mean = full_mean
def generate(self, points, max_voxels=None):
res = points_to_voxel(points, self._voxel_size,
self._point_cloud_range, self._coor_to_voxelidx,
self._max_num_points, max_voxels
or self._max_voxels, self._full_mean)
voxels = res["voxels"]
coors = res["coordinates"]
num_points_per_voxel = res["num_points_per_voxel"]
voxel_num = res["voxel_num"]
coors = coors[:voxel_num]
voxels = voxels[:voxel_num]
num_points_per_voxel = num_points_per_voxel[:voxel_num]
return (voxels, coors, num_points_per_voxel)
def generate_multi_gpu(self, points, max_voxels=None):
res = points_to_voxel(points, self._voxel_size,
self._point_cloud_range, self._coor_to_voxelidx,
self._max_num_points, max_voxels
or self._max_voxels, self._full_mean)
voxels = res["voxels"]
coors = res["coordinates"]
num_points_per_voxel = res["num_points_per_voxel"]
voxel_num = res["voxel_num"]
return (voxels, coors, num_points_per_voxel)
@property
def voxel_size(self):
return self._voxel_size
@property
def max_num_points_per_voxel(self):
return self._max_num_points
@property
def point_cloud_range(self):
return self._point_cloud_range
@property
def grid_size(self):
return self._grid_size
class VoxelGeneratorV2:
def __init__(self,
voxel_size,
point_cloud_range,
max_num_points,
max_voxels=20000,
full_mean=False,
block_filtering=False,
block_factor=8,
block_size=3,
height_threshold=0.1,
height_high_threshold=2.0):
assert full_mean is False, "don't use this."
point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
# [0, -40, -3, 70.4, 40, 1]
voxel_size = np.array(voxel_size, dtype=np.float32)
grid_size = (point_cloud_range[3:] -
point_cloud_range[:3]) / voxel_size
grid_size = np.round(grid_size).astype(np.int64)
if block_filtering:
assert block_size > 0
assert grid_size[0] % block_factor == 0
assert grid_size[1] % block_factor == 0
voxelmap_shape = tuple(np.round(grid_size).astype(np.int32).tolist())
voxelmap_shape = voxelmap_shape[::-1]
self._coor_to_voxelidx = np.full(voxelmap_shape, -1, dtype=np.int32)
self._voxel_size = voxel_size
self._point_cloud_range = point_cloud_range
self._max_num_points = max_num_points
self._max_voxels = max_voxels
self._grid_size = grid_size
self._full_mean = full_mean
self._block_filtering = block_filtering
self._block_factor = block_factor
self._height_threshold = height_threshold
self._block_size = block_size
self._height_high_threshold = height_high_threshold
def generate(self, points, max_voxels=None):
res = points_to_voxel(points, self._voxel_size,
self._point_cloud_range, self._coor_to_voxelidx,
self._max_num_points, max_voxels
or self._max_voxels, self._full_mean,
self._block_filtering, self._block_factor,
self._block_size, self._height_threshold,
self._height_high_threshold)
for k, v in res.items():
if k != "voxel_num":
res[k] = v[:res["voxel_num"]]
return res
def generate_multi_gpu(self, points, max_voxels=None):
res = points_to_voxel(points,
self._voxel_size,
self._point_cloud_range,
self._coor_to_voxelidx,
self._max_num_points,
max_voxels or self._max_voxels,
self._full_mean,
self._block_filtering,
self._block_factor,
self._block_size,
self._height_threshold,
self._height_high_threshold,
pad_output=True)
return res
@property
def voxel_size(self):
return self._voxel_size
@property
def max_num_points_per_voxel(self):
return self._max_num_points
@property
def point_cloud_range(self):
return self._point_cloud_range
@property
def grid_size(self):
return self._grid_size
from cumm import tensorview as tv
from spconv.core_cc.csrc.sparse.all.ops1d import Point2Voxel as Point2VoxelGPU1d
from spconv.core_cc.csrc.sparse.all.ops2d import Point2Voxel as Point2VoxelGPU2d
from spconv.core_cc.csrc.sparse.all.ops3d import Point2Voxel as Point2VoxelGPU3d
from spconv.core_cc.csrc.sparse.all.ops4d import Point2Voxel as Point2VoxelGPU4d
from spconv.core_cc.csrc.sparse.all.ops_cpu1d import Point2VoxelCPU as Point2VoxelCPU1d
from spconv.core_cc.csrc.sparse.all.ops_cpu2d import Point2VoxelCPU as Point2VoxelCPU2d
from spconv.core_cc.csrc.sparse.all.ops_cpu3d import Point2VoxelCPU as Point2VoxelCPU3d
from spconv.core_cc.csrc.sparse.all.ops_cpu4d import Point2VoxelCPU as Point2VoxelCPU4d
\ No newline at end of file
if(WIN32)
add_library(cuhash SHARED hash_functions.cu hash_table.cpp hash_table.cu hash_functions.cpp)
else()
add_library(cuhash SHARED hash_functions.cu hash_table.cpp hash_table.cu hash_functions.cpp)
endif()
target_include_directories(cuhash PRIVATE ${ALL_INCLUDE} )
set_property(TARGET cuhash PROPERTY CUDA_STANDARD 14)
set_property(TARGET cuhash PROPERTY CXX_STANDARD 14)
set_target_properties(cuhash PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(cuhash PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if(NOT WIN32)
set_property(TARGET cuhash PROPERTY POSITION_INDEPENDENT_CODE ON)
endif()
target_link_libraries(cuhash PRIVATE ${ALL_LIBS})
install (TARGETS cuhash DESTINATION lib)
if (SPCONV_BuildTests)
add_executable(cuhash_test main.cc)
target_include_directories(cuhash_test PRIVATE ${ALL_INCLUDE} )
set_property(TARGET cuhash_test PROPERTY CUDA_STANDARD 14)
set_property(TARGET cuhash_test PROPERTY CXX_STANDARD 14)
set_target_properties(cuhash_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(cuhash_test PRIVATE ${ALL_LIBS} cuhash)
install (TARGETS cuhash_test DESTINATION bin)
endif()
\ No newline at end of file
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* debugging.cpp
*
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <algorithm>
#include <cstring>
#include <cuhash/cuda_util.h>
namespace cuhash {
void OutputRetrievalStatistics(const unsigned n_queries,
const unsigned *d_retrieval_probes,
const unsigned n_functions) {
unsigned *retrieval_probes = new unsigned[n_queries];
CUDA_SAFE_CALL(cudaMemcpy(retrieval_probes, d_retrieval_probes,
sizeof(unsigned) * n_queries,
cudaMemcpyDeviceToHost));
// Create a histogram showing how many items needed how many probes to be
// found.
unsigned possible_probes = n_functions + 2;
unsigned *histogram = new unsigned[possible_probes];
memset(histogram, 0, sizeof(unsigned) * (possible_probes));
for (unsigned i = 0; i < n_queries; ++i) {
histogram[retrieval_probes[i]]++;
}
// Dump it.
char buffer[10000];
sprintf(buffer, "Probes for retrieval: ");
PrintMessage(buffer);
for (unsigned i = 0; i < possible_probes; ++i) {
sprintf(buffer, "\t(%u, %u)", i, histogram[i]);
PrintMessage(buffer);
}
delete[] retrieval_probes;
delete[] histogram;
}
void OutputBuildStatistics(const unsigned n,
const unsigned *d_iterations_taken) {
// Output how many iterations each thread took until it found an empty slot.
unsigned *iterations_taken = new unsigned[n];
CUDA_SAFE_CALL(cudaMemcpy(iterations_taken, d_iterations_taken,
sizeof(unsigned) * n, cudaMemcpyDeviceToHost));
std::sort(iterations_taken, iterations_taken + n);
unsigned total_iterations = 0;
unsigned max_iterations_taken = 0;
for (unsigned i = 0; i < n; ++i) {
total_iterations += iterations_taken[i];
max_iterations_taken = std::max(max_iterations_taken, iterations_taken[i]);
}
unsigned current_value = iterations_taken[0];
unsigned count = 1;
char buffer[10000];
sprintf(buffer, "Iterations taken:\n");
for (unsigned i = 1; i < n; ++i) {
if (iterations_taken[i] != current_value) {
sprintf(buffer, "%s\t(%u, %u)\n", buffer, current_value, count);
current_value = iterations_taken[i];
count = 1;
} else {
count++;
}
}
sprintf(buffer, "%s\t(%u, %u)", buffer, current_value, count);
PrintMessage(buffer);
sprintf(buffer, "Total iterations: %u", total_iterations);
PrintMessage(buffer);
sprintf(buffer, "Avg/Med/Max iterations: (%f %u %u)",
(float)total_iterations / n, iterations_taken[n / 2],
iterations_taken[n - 1]);
PrintMessage(buffer);
delete[] iterations_taken;
// Print the length of the longest eviction chain.
sprintf(buffer, "Max iterations: %u", max_iterations_taken);
PrintMessage(buffer);
}
}; // namespace cuhash
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* debugging.cu
*
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <cuhash/hash_table.cuh>
#include <algorithm>
#include <cuhash/cuda_util.h>
namespace cuhash {
//! Debugging function: Takes statistics on the hash functions' distribution.
/*! Determines:
* - How many unique slots each key has.
* - How many keys hash into each slot.
* - Whether any keys failed to get a full set of slots.
*/
__global__ void take_hash_function_statistics_kernel(
const unsigned *keys, const unsigned n_entries, const unsigned table_size,
const uint2 *constants, const unsigned num_functions,
unsigned *num_slots_available, unsigned *num_hashing_in, unsigned *failed) {
unsigned thread_index = threadIdx.x + blockIdx.x * blockDim.x +
blockIdx.y * blockDim.x * gridDim.x;
if (thread_index >= n_entries)
return;
unsigned key = keys[thread_index];
// Determine all of the locations the key hashes into.
// Also count how many keys hash into each location.
unsigned locations[kMaxHashFunctions];
for (unsigned i = 0; i < num_functions; ++i) {
locations[i] = hash_function_inner(constants[i], key) % table_size;
if (num_hashing_in != NULL) {
atomicAdd(num_hashing_in + locations[i], 1);
}
}
// Determine whether all of the locations were different.
unsigned num_slots = 1;
for (unsigned i = 1; i < num_functions; ++i) {
bool matched = false;
for (unsigned j = 0; j < i; ++j) {
if (locations[i] == locations[j]) {
matched = true;
break;
}
}
if (!matched) {
num_slots++;
}
}
if (num_slots_available != NULL) {
num_slots_available[thread_index] = num_slots;
}
if (failed != NULL && num_slots != num_functions) {
*failed = 1;
}
}
void TakeHashFunctionStatistics(const unsigned num_keys, const unsigned *d_keys,
const unsigned table_size,
const uint2 *constants,
const unsigned kNumHashFunctions) {
char buffer[16000];
PrintMessage("Hash function constants: ");
for (unsigned i = 0; i < kNumHashFunctions; ++i) {
sprintf(buffer, "\t%10u, %10u", constants[i].x, constants[i].y);
PrintMessage(buffer);
}
unsigned *d_num_hashing_in = NULL;
#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_num_hashing_in, sizeof(unsigned) * table_size));
CUDA_SAFE_CALL(
cudaMemset(d_num_hashing_in, 0, sizeof(unsigned) * table_size));
#endif
unsigned *d_num_slots_available = NULL;
#ifdef COUNT_HOW_MANY_HAVE_CYCLES
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_num_slots_available, sizeof(unsigned) * num_keys));
#endif
uint2 *d_constants = NULL;
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_constants, sizeof(uint2) * kNumHashFunctions));
CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants,
sizeof(uint2) * kNumHashFunctions,
cudaMemcpyHostToDevice));
take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys),
kBlockSize>>>(
d_keys, num_keys, table_size, d_constants, kNumHashFunctions,
d_num_slots_available, d_num_hashing_in, NULL);
CUDA_SAFE_CALL(cudaFree(d_constants));
#ifdef COUNT_HOW_MANY_HASH_INTO_EACH_SLOT
unsigned *num_hashing_in = new unsigned[table_size];
CUDA_SAFE_CALL(cudaMemcpy(num_hashing_in, d_num_hashing_in,
sizeof(unsigned) * table_size,
cudaMemcpyDeviceToHost));
/*
// Print how many items hash into each slot.
// Used to make sure items are spread evenly throughout the table.
buffer[0] = '\0';
PrintMessage("Num hashing into each: ", true);
for (unsigned i = 0; i < table_size; ++i) {
sprintf(buffer, "%s\t%2u", buffer, num_hashing_in[i]);
if (i % 25 == 24) {
PrintMessage(buffer, true);
buffer[0] = '\0';
}
}
PrintMessage(buffer,true);
*/
// Print a histogram of how many items are hashed into each slot. Shows
// if average number of items hashing into each slot is low.
std::sort(num_hashing_in, num_hashing_in + table_size);
int count = 1;
unsigned previous = num_hashing_in[0];
sprintf(buffer, "Num items hashing into a slot:\t");
PrintMessage(buffer);
for (unsigned i = 1; i < table_size; ++i) {
if (num_hashing_in[i] != previous) {
sprintf(buffer, "\t(%u, %u)", previous, count);
PrintMessage(buffer);
previous = num_hashing_in[i];
count = 1;
} else {
count++;
}
}
sprintf(buffer, "\t(%u, %u)", previous, count);
PrintMessage(buffer);
delete[] num_hashing_in;
CUDA_SAFE_CALL(cudaFree(d_num_hashing_in));
#endif
#ifdef COUNT_HOW_MANY_HAVE_CYCLES
unsigned *num_slots_available = new unsigned[num_keys];
CUDA_SAFE_CALL(cudaMemcpy(num_slots_available, d_num_slots_available,
sizeof(unsigned) * num_keys,
cudaMemcpyDeviceToHost));
static const unsigned kHistogramSize = kNumHashFunctions + 1;
unsigned *histogram = new unsigned[kHistogramSize];
memset(histogram, 0, sizeof(unsigned) * kHistogramSize);
for (unsigned i = 0; i < num_keys; ++i) {
histogram[num_slots_available[i]]++;
}
sprintf(buffer, "Slots assigned to each key: ");
for (unsigned i = 1; i < kHistogramSize; ++i) {
sprintf(buffer, "%s(%u, %u) ", buffer, i, histogram[i]);
}
PrintMessage(buffer);
delete[] histogram;
delete[] num_slots_available;
CUDA_SAFE_CALL(cudaFree(d_num_slots_available));
#endif
}
bool CheckAssignedSameSlot(const unsigned N, const unsigned num_keys,
const unsigned *d_keys, const unsigned table_size,
uint2 *constants) {
unsigned *d_cycle_exists = NULL;
uint2 *d_constants = NULL;
CUDA_SAFE_CALL(cudaMalloc((void **)&d_cycle_exists, sizeof(unsigned)));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_constants, sizeof(uint2) * N));
CUDA_SAFE_CALL(cudaMemset(d_cycle_exists, 0, sizeof(unsigned)));
CUDA_SAFE_CALL(cudaMemcpy(d_constants, constants, sizeof(uint2) * N,
cudaMemcpyHostToDevice));
// Check if all keys were given a full set of N slots by the functions.
take_hash_function_statistics_kernel<<<ComputeGridDim(num_keys),
kBlockSize>>>(
d_keys, num_keys, table_size, d_constants, N, NULL, NULL, d_cycle_exists);
unsigned cycle_exists;
CUDA_SAFE_CALL(cudaMemcpy(&cycle_exists, d_cycle_exists, sizeof(unsigned),
cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaFree(d_cycle_exists));
CUDA_SAFE_CALL(cudaFree(d_constants));
return (cycle_exists != 0);
}
void PrintStashContents(const Entry *d_stash) {
Entry *stash = new Entry[cuhash::kStashSize];
CUDA_SAFE_CALL(cudaMemcpy(stash, d_stash, sizeof(Entry) * cuhash::kStashSize,
cudaMemcpyDeviceToHost));
for (unsigned i = 0; i < cuhash::kStashSize; ++i) {
if (get_key(stash[i]) != kKeyEmpty) {
char buffer[256];
sprintf(buffer, "Stash[%u]: %u = %u", i, get_key(stash[i]),
get_value(stash[i]));
PrintMessage(buffer, true);
}
}
delete[] stash;
}
}; // namespace cuhash
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// nvcc (cuda) 9.0 with gcc 5.5 don't support random, so compile it in host
#include <random>
namespace cuhash {
std::random_device random_dev;
std::mt19937 random_engine(random_dev());
std::uniform_int_distribution<unsigned> uint_distribution;
unsigned generate_random_uint32() { return uint_distribution(random_engine); }
} // namespace cuhash
\ No newline at end of file
#include <cassert>
#include <cuhash/debugging.h>
#include <cuhash/hash_functions.h>
#include <cuhash/hash_table.h>
namespace cuhash {
void GenerateFunctions(const unsigned N, const unsigned num_keys,
const unsigned *d_keys, const unsigned table_size,
uint2 *constants) {
bool regenerate = true;
while (regenerate) {
regenerate = false;
// Generate a set of hash function constants for this build attempt.
for (unsigned i = 0; i < N; ++i) {
// uint_distribution(random_engine) % kPrimeDivisor;
// genrand_int32() % kPrimeDivisor;
unsigned new_a = generate_random_uint32() % kPrimeDivisor;
constants[i].x = (1 > new_a ? 1 : new_a);
constants[i].y = generate_random_uint32() % kPrimeDivisor;
}
#ifdef FORCEFULLY_GENERATE_NO_CYCLES
// Ensure that every key gets N different slots.
regenerate =
CheckAssignedSameSlot(N, num_keys, d_keys, table_size, constants);
#endif
}
#ifdef TAKE_HASH_FUNCTION_STATISTICS
// Examine how well distributed the items are.
TakeHashFunctionStatistics(num_keys, d_keys, table_size, constants, N);
#endif
}
}; // namespace cuhash
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.cpp
*
* @brief Implements a basic hash table that stores one value per key.
*/
#include <cuhash/debugging.h>
#include <cuhash/hash_table.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cuda_runtime_api.h>
#include <cuhash/cuda_util.h>
#include <limits>
namespace cuhash {
char buffer[256];
//! @name Internal
/// @{
dim3 ComputeGridDim(unsigned n) {
// Round up in order to make sure all items are hashed in.
dim3 grid((n + kBlockSize - 1) / kBlockSize);
if (grid.x > kGridSize) {
grid.y = (grid.x + kGridSize - 1) / kGridSize;
grid.x = kGridSize;
}
return grid;
}
unsigned ComputeMaxIterations(const unsigned n, const unsigned table_size,
const unsigned num_functions) {
float lg_input_size = (float)(log((double)n) / log(2.0));
// #define CONSTANT_ITERATIONS
#ifdef CONSTANT_ITERATIONS
// Set the maximum number of iterations to 7lg(N).
const unsigned MAX_ITERATION_CONSTANT = 7;
unsigned max_iterations = MAX_ITERATION_CONSTANT * lg_input_size;
#else
// Use an empirical formula for determining what the maximum number of
// iterations should be. Works OK in most situations.
float load_factor = float(n) / table_size;
float ln_load_factor = (float)(log(load_factor) / log(2.71828183));
unsigned max_iterations =
(unsigned)(4.0 * ceil(-1.0 / (0.028255 + 1.1594772 * ln_load_factor) *
lg_input_size));
#endif
return max_iterations;
}
/// @}
HashTable::HashTable()
: table_size_(0), d_contents_(NULL), stash_count_(0), d_failures_(NULL) {
CUDA_CHECK_ERROR("Failed in constructor.\n");
}
bool HashTable::Initialize(const unsigned max_table_entries,
const float space_usage,
const unsigned num_functions) {
Release();
// Determine the minimum amount of slots the table requires,
// and whether the space_usage is within range.
float minimum_space_usage;
if (num_functions < 2 || num_functions > 5) {
char message[256] = "Number of hash functions must be from 2 to 5; "
"others are unimplemented.";
PrintMessage(message, true);
return false;
} else {
minimum_space_usage = kMinimumSpaceUsages[num_functions];
}
if (space_usage < minimum_space_usage) {
sprintf(buffer, "Minimum possible space usage for %u functions is %f.",
num_functions, minimum_space_usage);
PrintMessage(buffer);
return false;
}
num_hash_functions_ = num_functions;
table_size_ = unsigned(ceil(max_table_entries * space_usage));
// Allocate memory.
const unsigned slots_to_allocate = table_size_ + kStashSize;
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_contents_, sizeof(Entry) * slots_to_allocate));
CUDA_SAFE_CALL(cudaMalloc((void **)&d_failures_, sizeof(unsigned)));
if (!d_contents_ || !d_failures_) {
fprintf(stderr, "Failed to allocate %u slots.\n", slots_to_allocate);
return false;
}
CUDA_CHECK_ERROR("Failed to initialize.\n");
return true;
}
void HashTable::Release() {
table_size_ = 0;
CUDA_SAFE_CALL(cudaFree(d_contents_));
CUDA_SAFE_CALL(cudaFree(d_failures_));
d_contents_ = NULL;
d_failures_ = NULL;
CUDA_CHECK_ERROR("Failed during release.\n");
}
bool HashTable::Build(const unsigned n, const unsigned *d_keys,
const unsigned *d_values) {
unsigned max_iterations =
ComputeMaxIterations(n, table_size_, num_hash_functions_);
unsigned num_failures = 1;
unsigned num_attempts = 0;
// Storage for statistics collection.
unsigned *d_iterations_taken = NULL;
#ifdef TRACK_ITERATIONS
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_iterations_taken, sizeof(unsigned) * n));
#endif
// Track how many items ended up in the stash.
unsigned *d_stash_count = NULL;
CUDA_SAFE_CALL(cudaMalloc((void **)&d_stash_count, sizeof(unsigned)));
CUDA_CHECK_ERROR("Failed before main build loop.\n");
// Main build loop.
while (num_failures && ++num_attempts < kMaxRestartAttempts) {
CUDA_SAFE_CALL(cudaMemset(d_stash_count, 0, sizeof(unsigned)));
// Generate new hash functions.
if (num_hash_functions_ == 2)
constants_2_.Generate(n, d_keys, table_size_);
else if (num_hash_functions_ == 3)
constants_3_.Generate(n, d_keys, table_size_);
else if (num_hash_functions_ == 4)
constants_4_.Generate(n, d_keys, table_size_);
else
constants_5_.Generate(n, d_keys, table_size_);
stash_constants_.x = std::max(1u, generate_random_uint32()) % kPrimeDivisor;
stash_constants_.y = generate_random_uint32() % kPrimeDivisor;
stash_count_ = 0;
// Initialize memory.
unsigned slots_in_table = table_size_ + kStashSize;
CUDAWrapper::ClearTable(slots_in_table, kEntryEmpty, d_contents_);
num_failures = 0;
CUDAWrapper::CallCuckooHash(
n, num_hash_functions_, d_keys, d_values, table_size_, constants_2_,
constants_3_, constants_4_, constants_5_, max_iterations, d_contents_,
stash_constants_, d_stash_count, d_failures_, d_iterations_taken);
// Check if successful.
CUDA_SAFE_CALL(cudaMemcpy(&num_failures, d_failures_, sizeof(unsigned),
cudaMemcpyDeviceToHost));
#ifdef COUNT_UNINSERTED
if (num_failures) {
printf("Failed to insert %u items.\n", num_failures);
}
#endif
}
// Copy out the stash size.
CUDA_SAFE_CALL(cudaMemcpy(&stash_count_, d_stash_count, sizeof(unsigned),
cudaMemcpyDeviceToHost));
if (stash_count_ && num_failures == 0) {
// sprintf(buffer, "Stash size: %u", stash_count_);
// PrintMessage(buffer, true);
#ifdef _DEBUG
PrintStashContents(d_contents_ + table_size_);
#endif
}
CUDA_SAFE_CALL(cudaFree(d_stash_count));
#ifdef TRACK_ITERATIONS
if (num_failures == 0) {
OutputBuildStatistics(n, d_iterations_taken);
}
CUDA_SAFE_CALL(cudaFree(d_iterations_taken));
#endif
// Dump some info if a restart was required.
if (num_attempts >= kMaxRestartAttempts) {
sprintf(buffer, "Completely failed to build");
PrintMessage(buffer, true);
} else if (num_attempts > 1) {
sprintf(buffer, "Needed %u attempts to build, you can ignore this message.",
num_attempts);
PrintMessage(buffer, true);
}
CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
return num_failures == 0;
}
void HashTable::Retrieve(const unsigned n_queries, const unsigned *d_keys,
unsigned *d_values) {
CUDAWrapper::CallHashRetrieve(n_queries, num_hash_functions_, d_keys,
table_size_, d_contents_, constants_2_,
constants_3_, constants_4_, constants_5_,
stash_constants_, stash_count_, d_values);
}
}; // namespace cuhash
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.cu
*
* @brief Hides all of the CUDA calls from the actual CPP file.
*/
#include <cuhash/cuda_util.h>
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <cuhash/hash_table.cuh>
#include <cuda.h>
namespace cuhash {
namespace CUDAWrapper {
void ClearTable(const unsigned slots_in_table, const Entry fill_value,
Entry *d_contents) {
clear_table<Entry><<<ComputeGridDim(slots_in_table), kBlockSize>>>(
slots_in_table, fill_value, d_contents);
TV_CHECK_CUDA_ERR_V2("Error occurred during hash table clear.\n");
}
void CallCuckooHash(const unsigned n, const unsigned num_hash_functions,
const unsigned *d_keys, const unsigned *d_values,
const unsigned table_size, const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const unsigned max_iterations, Entry *d_contents,
uint2 stash_constants, unsigned *d_stash_count,
unsigned *d_failures, unsigned *d_iterations_taken) {
// Build the table.
cudaMemset(d_failures, 0, sizeof(unsigned));
if (num_hash_functions == 2) {
CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
n, d_keys, d_values, table_size, constants_2, max_iterations,
d_contents, stash_constants, d_stash_count, d_failures,
d_iterations_taken);
} else if (num_hash_functions == 3) {
CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
n, d_keys, d_values, table_size, constants_3, max_iterations,
d_contents, stash_constants, d_stash_count, d_failures,
d_iterations_taken);
} else if (num_hash_functions == 4) {
CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
n, d_keys, d_values, table_size, constants_4, max_iterations,
d_contents, stash_constants, d_stash_count, d_failures,
d_iterations_taken);
} else {
CuckooHash<<<ComputeGridDim(n), kBlockSize>>>(
n, d_keys, d_values, table_size, constants_5, max_iterations,
d_contents, stash_constants, d_stash_count, d_failures,
d_iterations_taken);
}
CUDA_CHECK_ERROR("Error occurred during hash table build.\n");
}
void CallHashRetrieve(const unsigned n_queries,
const unsigned num_hash_functions, const unsigned *d_keys,
const unsigned table_size, const Entry *d_contents,
const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const uint2 stash_constants, const unsigned stash_count,
unsigned *d_values) {
unsigned *d_retrieval_probes = NULL;
#ifdef TRACK_ITERATIONS
CUDA_SAFE_CALL(
cudaMalloc((void **)&d_retrieval_probes, sizeof(unsigned) * n_queries));
#endif
if (num_hash_functions == 2) {
hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
n_queries, d_keys, table_size, d_contents, constants_2, stash_constants,
stash_count, d_values, d_retrieval_probes);
} else if (num_hash_functions == 3) {
hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
n_queries, d_keys, table_size, d_contents, constants_3, stash_constants,
stash_count, d_values, d_retrieval_probes);
} else if (num_hash_functions == 4) {
hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
n_queries, d_keys, table_size, d_contents, constants_4, stash_constants,
stash_count, d_values, d_retrieval_probes);
} else {
hash_retrieve<<<ComputeGridDim(n_queries), kBlockSize>>>(
n_queries, d_keys, table_size, d_contents, constants_5, stash_constants,
stash_count, d_values, d_retrieval_probes);
}
CUDA_CHECK_ERROR("Retrieval failed.\n");
#ifdef TRACK_ITERATIONS
OutputRetrievalStatistics(n_queries, d_retrieval_probes, num_hash_functions);
CUDA_SAFE_CALL(cudaFree(d_retrieval_probes));
#endif
}
}; // namespace CUDAWrapper
}; // namespace cuhash
#include <cuda.h>
#include <cuhash/hash_table.h>
int main() {
auto table = cuhash::HashTable();
table.Initialize(10, 2.0);
const int N = 10;
// ハッシュテーブルに格納するデータ
int keys[N] = {1, 6, 4, 9, 0, 3, 7, 2, 5, 8};
int vals[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
// デバイスメモリにコピー
int *d_keys, *d_vals;
cudaMalloc((void **)&d_keys, sizeof(int) * N);
cudaMemcpy(d_keys, keys, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_vals, sizeof(int) * N);
cudaMemcpy(d_vals, vals, sizeof(int) * N, cudaMemcpyHostToDevice);
// ハッシュテーブルにクエリするデータ
int input[N] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
int output[N];
// デバイスメモリにコピー
int *d_input, *d_output;
cudaMalloc((void **)&d_input, sizeof(int) * N);
cudaMemcpy(d_input, input, sizeof(int) * N, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_output, sizeof(int) * N);
cudaMemset(d_output, 0, sizeof(int) * N);
bool s = table.Build(N, (const unsigned int *)d_keys,
(const unsigned int *)d_vals);
std::cout << s << std::endl;
table.Retrieve(N, (const unsigned int *)d_input, (unsigned int *)d_output);
std::cout << s << std::endl;
cudaMemcpy(output, d_output, sizeof(int) * N, cudaMemcpyDeviceToHost);
for (int i = 0; i < N; ++i) {
printf("%d\n", output[i]);
}
return 0;
}
\ No newline at end of file
set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc spconv_ops.cc pool_ops.cc)
if (SPCONV_BuildCUDA)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu cublas_gemm.cc)
endif()
add_library(spconv SHARED ${ALL_FILES})
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
target_link_libraries(spconv PUBLIC OpenMP::OpenMP_CXX)
endif()
target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
if (SPCONV_BuildCUDA)
target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash)
else()
target_link_libraries(spconv PRIVATE ${ALL_LIBS})
endif()
install (TARGETS spconv DESTINATION lib)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/fused_spconv_ops.h>
#include <spconv/nms_ops.h>
#include <spconv/pillar_scatter_ops.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <torch/script.h>
static auto registry =
torch::RegisterOperators()
.op("spconv::get_indice_pairs", &spconv::getIndicePairs)
.op("spconv::indice_conv", &spconv::indiceConv)
.op("spconv::indice_conv_batch", &spconv::indiceConvBatch)
.op("spconv::indice_conv_backward", &spconv::indiceConvBackward)
.op("spconv::fused_indice_conv_bn", &spconv::fusedIndiceConvBatchNorm)
.op("spconv::indice_maxpool", &spconv::indiceMaxPool)
.op("spconv::indice_maxpool_backward", &spconv::indiceMaxPoolBackward)
.op("spconv::nms", &spconv::nonMaxSuppression<float>)
.op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
.op("spconv::pillar_scatter_half",
&spconv::pointPillarScatter<at::Half>);
#include <ATen/ATen.h>
#include <spconv/cublas_gemm.h>
namespace spconv {
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const float *alpha, const float *A, int lda,
const float *B, int ldb, const float *beta, float *C,
int ldc) {
return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const __half *alpha, const __half *A, int lda,
const __half *B, int ldb, const __half *beta,
__half *C, int ldc) {
return cublasHgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const at::Half *alpha, const at::Half *A, int lda,
const at::Half *B, int ldb, const at::Half *beta,
at::Half *C, int ldc) {
return cublasHgemm(handle, transa, transb, m, n, k,
reinterpret_cast<const __half *>(alpha),
reinterpret_cast<const __half *>(A), lda,
reinterpret_cast<const __half *>(B), ldb,
reinterpret_cast<const __half *>(beta),
reinterpret_cast<__half *>(C), ldc);
}
template <>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const double *alpha, const double *A, int lda,
const double *B, int ldb, const double *beta,
double *C, int ldc) {
return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb,
beta, C, ldc);
}
} // namespace spconv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/Parallel.h>
#include <spconv/geometry.h>
#include <spconv/indice.h>
#include <spconv/spconv_ops.h>
#include <tensorview/tensor.h>
#include <torch/script.h>
namespace spconv {
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
// indicesOut: num_active * kernelVolume * (NDim + 1)
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index hashval;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
auto iter = hash.find(index);
if (iter == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
hashval = numAct++;
hash[index] = hashval;
} else {
hashval = iter->second;
}
// indicePairs: [K, 2, L]
indicePairs(0, offset, indiceNum[offset]) = j;
indicePairs(1, offset, indiceNum[offset]++) = hashval;
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index hashval;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
auto iter = hash.find(index);
if (iter == hash.end()) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
hashval = numAct++;
hash[index] = hashval;
} else {
hashval = iter->second;
}
// indicePairs: [K, 2, L]
indicePairs(0, offset, indiceNum[offset]) = j;
indicePairs(1, offset, indiceNum[offset]++) = hashval;
}
}
return numAct;
}
#ifndef TV_WINDOWS
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
Index index = 0;
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
hash[index] = j;
}
at::parallel_for(0, numActIn, 0, [&](int64_t begin, int64_t end) {
Index index = 0;
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index oldOffset = 0;
for (int j = begin; j < end; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
auto iter = hash.find(index);
if (iter != hash.end()) {
#pragma omp atomic capture
oldOffset = indiceNum[offset]++;
indicePairs(0, offset, oldOffset) = j;
indicePairs(1, offset, oldOffset) = iter->second;
}
}
}
});
return numActIn;
}
#else
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
tsl::robin_map<Index, Index> hash;
for (int j = 0; j < numActIn; ++j) {
Index index = 0;
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
hash[index] = j;
}
Index index = 0;
for (int j = 0; j < numActIn; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
auto iter = hash.find(index);
if (iter != hash.end()) {
indicePairs(0, offset, indiceNum[offset]) = j;
indicePairs(1, offset, indiceNum[offset]++) = iter->second;
}
}
}
return numActIn;
}
#endif
int create_conv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
bool useHash) {
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
using Index = TV_DECLTYPE(V);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = TV_DECLTYPE(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
if (transpose)
numActIn = getIndicePairsDeConv<Index, IndexGrid, NDim>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
di.data(), ou.data());
else
numActIn = getIndicePairsConv<Index, IndexGrid, NDim>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks.data(), st.data(), pa.data(),
di.data(), ou.data());
});
});
return numActIn;
}
int create_submconv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t, int64_t>(indicesIn.scalar_type(), [&](auto V) {
using Index = TV_DECLTYPE(V);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = TV_DECLTYPE(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
numActIn = getIndicePairsSubM<Index, IndexGrid, NDim>(
tv::torch2tv<Index>(indicesIn), tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs), tv::torch2tv<Index>(indiceNum),
ks.data(), st.data(), pa.data(), di.data(), ou.data());
});
});
return numActIn;
}
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <cuhash/hash_table.h>
#include <limits>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/tensorview.h>
#include <tensorview/torch_utils.h>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <type_traits>
#include <utility/timer.h>
namespace spconv {
using max_kernel_vol_t = tv::mp_list_c<int, 9, 16, 27, 32, 128, 256, 4096>;
int create_conv_indice_pair_p1_cuda(
torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = kernelSize.size();
auto numActIn = indicesIn.size(0);
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = TV_DECLTYPE(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
tv::DispatchInt<max_kernel_vol_t>()(
kernelVolume, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
if (transpose) {
prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st,
pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareDeConvIndicePairsKernel failed");
} else {
prepareIndicePairsKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum),
tv::torch2tv<Index>(indicePairUnique), ks, st,
pa, di, ou);
TV_CHECK_CUDA_ERR_V2("prepareIndicePairsKernel failed");
}
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr,
prepareDeConvIndicePairsKernel<Index, NDim, MaxKernelVolume>));
tv::ssprint("prepareIndicePairsKernel<", tv::type_s<Index>, NDim,
MaxKernelVolume, ">", attr.numRegs);
#endif
});
});
});
return 1;
}
int create_conv_indice_pair_p2_cuda(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
int numAct = indicePairUnique.size(0) - 1;
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
bool failed = false;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = TV_DECLTYPE(I)::value;
using IndexGrid = int32_t;
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numAct, 2.0, 4);
unsigned *d_values = nullptr;
cudaMalloc((void **)&d_values, sizeof(unsigned) * numAct);
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
arangeKernel<unsigned>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(d_values, numAct);
TV_CHECK_CUDA_ERR_V2("arangeKernel failed");
bool res = table.Build(
numAct,
reinterpret_cast<unsigned *>(indicePairUnique.data_ptr<Index>()),
d_values);
cudaFree(d_values);
TV_CHECK_CUDA_ERR_V2("cudaFree failed");
if (!res) {
failed = true;
return;
}
assignIndiceOutKernel<Index, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut), numAct,
tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
TV_CHECK_CUDA_ERR_V2("assignIndiceOutKernel failed");
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
assignIndicePairsHashKernel<Index, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut), numActIn,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), tableSize,
tableData, constants, stash_constants, stash_count);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsHashKernel failed");
} else {
assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), numAct,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), ou, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
assignIndicePairsKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesOut),
tv::torch2tv<IndexGrid>(gridsOut), numActIn,
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indicePairUnique), ou);
TV_CHECK_CUDA_ERR_V2("assignIndicePairsKernel failed");
#ifdef TV_LOG_KERNEL_INFO
cudaFuncAttributes attr;
checkCudaErrors(cudaFuncGetAttributes(
&attr, assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>));
tv::ssprint("assignGridAndIndiceOutKernel<", tv::type_s<Index>, NDim,
">", attr.numRegs);
cudaFuncAttributes attr2;
checkCudaErrors(cudaFuncGetAttributes(
&attr2, assignIndicePairsKernel<Index, IndexGrid, NDim>));
tv::ssprint("assignIndicePairsKernel<", tv::type_s<Index>, NDim, ">",
attr2.numRegs);
#endif
}
if (resetGrid && (!useHash)) {
resetGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numAct), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(indicePairUnique.data_ptr<Index>(),
tv::torch2tv<IndexGrid>(gridsOut), numAct);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
});
});
if (failed){
return -1;
}
return numAct;
}
template <typename T> struct is_valid {
__device__ __forceinline__ bool operator()(const T x) { return x != -1; }
};
int create_submconv_indice_pair_cuda(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash) {
auto stream = at::cuda::getCurrentCUDAStream();
auto ndim = outSpatialShape.size();
auto numActIn = indicesIn.size(0);
int batchSize = gridsOut.size(0);
auto kernelVolume = indiceNum.size(0);
if (numActIn == 0)
return 0;
bool failed = false;
tv::dispatch_torch<int32_t>(indicesIn.scalar_type(), [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
using IndexGrid = int32_t;
tv::dispatch_int<2, 3, 4>(ndim, [&](auto I) {
constexpr int NDim = TV_DECLTYPE(I)::value;
tv::SimpleVector<Index, NDim> ks(kernelSize.begin(), kernelSize.end());
tv::SimpleVector<Index, NDim> st(stride.begin(), stride.end());
tv::SimpleVector<Index, NDim> pa(padding.begin(), padding.end());
tv::SimpleVector<Index, NDim> di(dilation.begin(), dilation.end());
tv::SimpleVector<Index, NDim> ou(outSpatialShape.begin(),
outSpatialShape.end());
Index spatialVolume = 1;
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
if (useHash) {
auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numActIn, 2.0, 4);
unsigned *d_keyvalues = nullptr;
cudaMalloc((void **)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
unsigned *d_values = d_keyvalues + numActIn;
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
prepareSubMHashKernel<Index, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn), d_keyvalues, d_values,
ou);
TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
bool res =
table.Build(numActIn, reinterpret_cast<unsigned *>(d_keyvalues),
reinterpret_cast<unsigned *>(d_values));
cudaFree(d_keyvalues);
TV_CHECK_CUDA_ERR_V2("cudaFree failed");
if (!res) {
failed = true;
return;
}
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
auto stash_constants = table.get_stash_constants();
auto stash_count = table.get_stash_count();
tv::DispatchInt<max_kernel_vol_t>()(
kernelVolume, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
getSubMIndicePairsHashKernel<Index, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS,
0, stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa,
di, ou, tableSize, tableData, constants,
stash_constants, stash_count);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsHashKernel failed");
});
} else {
// auto timer = spconv::CudaContextTimer<>();
prepareSubMGridKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut), ou, spatialVolume);
// tv::ssprint("prepareSubMGridKernel", timer.report() / 1000.0);
TV_CHECK_CUDA_ERR_V2("prepareSubMGridKernel failed");
// when dilation all one, we use a simple kernel to calc result
bool dilation_one = true;
for (int i = 0; i < NDim; ++i) {
dilation_one &= di[i] == 1;
}
auto found = false;
if (dilation_one && (NDim == 2 || NDim == 3)) {
auto indiceNumCpu = indiceNum.cpu();
if (NDim == 2) {
tv::SimpleVector<Index, 2> ou_(outSpatialShape.begin(),
outSpatialShape.end());
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
constexpr int K0 = TV_DECLTYPE(K0C)::value;
constexpr int K1 = TV_DECLTYPE(K1C)::value;
found = true;
getSubMIndicePairsKernel2<Index, IndexGrid, K0, K1>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_, spatialVolume);
});
});
} else if (NDim == 3) {
tv::SimpleVector<Index, 3> ou_(outSpatialShape.begin(),
outSpatialShape.end());
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[0], [&](auto K0C) {
tv::dispatch_int_noexcept<1, 3, 5>(kernelSize[1], [&](auto K1C) {
tv::dispatch_int_noexcept<1, 3, 5>(
kernelSize[2], [&](auto K2C) {
constexpr int K0 = TV_DECLTYPE(K0C)::value;
constexpr int K1 = TV_DECLTYPE(K1C)::value;
constexpr int K2 = TV_DECLTYPE(K2C)::value;
found = true;
getSubMIndicePairsKernel3<Index, IndexGrid, K0, K1, K2>
<<<tv::cuda::getBlocks(numActIn),
tv::cuda::CUDA_NUM_THREADS, 0, stream>>>(
tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ou_,
spatialVolume);
});
});
});
}
}
if (!found) {
tv::DispatchInt<
max_kernel_vol_t>()(ndim, std::less_equal<int>(), [&](auto I2) {
constexpr int MaxKernelVolume = TV_DECLTYPE(I2)::value;
getSubMIndicePairsKernel<Index, IndexGrid, NDim, MaxKernelVolume>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(tv::torch2tv<Index>(indicesIn),
tv::torch2tv<IndexGrid>(gridsOut),
tv::torch2tv<Index>(indicePairs),
tv::torch2tv<Index>(indiceNum), ks, st, pa, di,
ou);
TV_CHECK_CUDA_ERR_V2("getSubMIndicePairsKernel failed");
});
}
// tv::ssprint("getSubMIndicePairsKernel", timer.report() / 1000.0);
}
if (resetGrid && (!useHash)) {
resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::cuda::getBlocks(numActIn), tv::cuda::CUDA_NUM_THREADS, 0,
stream>>>(indicesIn.data_ptr<Index>(),
tv::torch2tv<IndexGrid>(gridsOut), ou, numActIn);
TV_CHECK_CUDA_ERR_V2("resetGridKernel failed");
}
});
});
if (failed){
return -1;
}
return numActIn;
}
} // namespace spconv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <spconv/maxpool.h>
#include <torch/script.h>
namespace spconv {
using float_types_t = tv::mp_list<float, double, at::Half>;
using int_types_t = tv::mp_list<int32_t, int64_t>;
void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size) {
if (size <= 0)
return;
int stride = inFeatures.size(1);
auto dtype = inFeatures.scalar_type();
auto int_dtype = indicesIn.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = TV_DECLTYPE(TValue);
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
auto outFeaturesData = outFeatures.data_ptr<T>();
auto inFeaturesData = inFeatures.data_ptr<T>();
auto indicesInData = indicesIn.data_ptr<Index>();
auto indicesOutData = indicesOut.data_ptr<Index>();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesInData[row] * stride;
idxo = indicesOutData[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] < inFeaturesData[idxi + plane])
outFeaturesData[idxo + plane] = inFeaturesData[idxi + plane];
}
});
});
}
void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size) {
if (size <= 0)
return;
int stride = inFeatures.size(1);
auto dtype = inFeatures.scalar_type();
auto int_dtype = indicesIn.scalar_type();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = TV_DECLTYPE(TValue);
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
auto outFeaturesData = outFeatures.data_ptr<T>();
auto inFeaturesData = inFeatures.data_ptr<T>();
auto doutData = dout.data_ptr<T>();
auto dinData = din.data_ptr<T>();
auto indicesInData = indicesIn.data_ptr<Index>();
auto indicesOutData = indicesOut.data_ptr<Index>();
Index idxi, idxo;
for (int row = 0; row < size; row++) {
idxi = indicesInData[row] * stride;
idxo = indicesOutData[row] * stride;
for (int plane = 0; plane < stride; ++plane)
if (outFeaturesData[idxo + plane] == inFeaturesData[idxi + plane])
dinData[idxi + plane] += doutData[idxo + plane];
}
});
});
}
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/maxpool.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
T in, out;
int ILPStrideY[NumILP];
Index idxo, idxi;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in > out) {
outFeatures[idxo] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolFwdGenericBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void maxPoolFwdVecBlockKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] > bufo[i]) {
bufo[i] = bufi[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idxo] =
reinterpret_cast<VecType *>(bufo)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
const Index *indicesIn,
const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in > out) {
outFeatures[RO[ilp] + iy] = in;
}
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures, const T *dout,
T *din, const Index *indicesIn, const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
T in, out;
Index idxo, idxi;
int ILPStrideY[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
dout += blockIdx.y * NumTLP;
din += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x; ix < numHot;
ix += blockDim.x * gridDim.x) {
{
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
in = inFeatures[idxi];
out = outFeatures[idxo];
if (in == out) {
din[idxi] += dout[idxo];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
const T *inFeatures, const T *dout,
T *din, const Index *indicesIn,
const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void
maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
T bufi[vecloadFactor];
T bufo[vecloadFactor];
T bufdi[vecloadFactor];
T bufdo[vecloadFactor];
Index idxi, idxo;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideY[ilp] = threadIdx.y + ilp * blockDim.y;
outFeatures += blockIdx.y * NumTLP;
inFeatures += blockIdx.y * NumTLP;
for (int ix = blockIdx.x * blockDim.x * vecloadFactor; ix < numHot;
ix += blockDim.x * gridDim.x * vecloadFactor) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idxi = indicesIn[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
idxo = indicesOut[ix + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
reinterpret_cast<VecType *>(bufo)[0] =
reinterpret_cast<const VecType *>(outFeatures)[idxo];
reinterpret_cast<VecType *>(bufi)[0] =
reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
if (bufi[i] == bufo[i]) {
bufdi[i] += bufdo[i];
}
}
reinterpret_cast<VecType *>(din)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures,
const T *dout, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP];
Index RI[NumILP];
Index RO[NumILP];
T in, out;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(numHot)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < numHot) {
RI[ilp] = indicesIn[ix + ILPStrideX[ilp]] * numPlanes;
RO[ilp] = indicesOut[ix + ILPStrideX[ilp]] * numPlanes;
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < numHot) {
in = inFeatures[RI[ilp] + iy];
out = outFeatures[RO[ilp] + iy];
if (in == out) {
din[RI[ilp] + iy] += dout[RO[ilp] + iy];
}
}
}
}
}
}
using float_types_t = tv::mp_list<float, double, at::Half>;
using int_types_t = tv::mp_list<int32_t, int64_t>;
void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size) {
if (size <= 0)
return;
int numPlanes = inFeatures.size(1);
auto dtype = inFeatures.scalar_type();
auto int_dtype = indicesIn.scalar_type();
auto stream = at::cuda::getCurrentCUDAStream();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = TV_DECLTYPE(TValue);
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &indicesIn,
&indicesOut, &notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
numHotBlock, numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
inFeatures.data_ptr<T>(),
indicesIn.data_ptr<Index>() + numHotBlock,
indicesOut.data_ptr<Index>() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolFwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolFwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
indicesIn.data_ptr<Index>() + numHotBlock,
indicesOut.data_ptr<Index>() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
});
});
}
void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size) {
if (size <= 0)
return;
int numPlanes = inFeatures.size(1);
auto dtype = inFeatures.scalar_type();
auto int_dtype = indicesIn.scalar_type();
auto stream = at::cuda::getCurrentCUDAStream();
tv::DispatchTorch<float_types_t>()(dtype, [&](auto TValue) {
using T = TV_DECLTYPE(TValue);
using vecload_type_t =
std::conditional_t<std::is_same<T, at::Half>::value, int2, int4>;
using kernel_block_t = tv::mp_list_c<int, 64, 32, 16>;
tv::DispatchTorch<int_types_t>()(int_dtype, [&](auto IndexValue) {
using Index = TV_DECLTYPE(IndexValue);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
tv::mp_for_each<kernel_block_t>([=, &outFeatures, &inFeatures, &dout,
&din, &indicesIn, &indicesOut,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
stream>>>(outFeatures.data_ptr<T>(),
inFeatures.data_ptr<T>(), dout.data_ptr<T>(),
din.data_ptr<T>(), indicesIn.data_ptr<Index>(),
indicesOut.data_ptr<Index>(), numHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, stream>>>(outFeatures.data_ptr<T>(),
inFeatures.data_ptr<T>(), dout.data_ptr<T>(),
din.data_ptr<T>(),
indicesIn.data_ptr<Index>() + numHotBlock,
indicesOut.data_ptr<Index>() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
int numHotBlock = (size / NumTLP) * NumTLP;
if (numHotBlock >= NumTLP) {
maxPoolBwdGenericBlockKernel<T, Index, NumTLP, NumILP>
<<<dim3(size / NumTLP, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
dout.data_ptr<T>(), din.data_ptr<T>(),
indicesIn.data_ptr<Index>(), indicesOut.data_ptr<Index>(),
numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
if (size > numHotBlock) {
maxPoolBwdGenericKernel<T, Index, NumTLP, NumILP>
<<<dim3(1, tv::cuda::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, stream>>>(
outFeatures.data_ptr<T>(), inFeatures.data_ptr<T>(),
dout.data_ptr<T>(), din.data_ptr<T>(),
indicesIn.data_ptr<Index>() + numHotBlock,
indicesOut.data_ptr<Index>() + numHotBlock,
size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
});
});
}
} // namespace spconv
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <boost/geometry.hpp>
#include <spconv/nms_functor.h>
#include <torch/script.h>
#include <vector>
namespace spconv {
namespace functor {
template <typename T, typename Index>
struct NonMaxSupressionFunctor<tv::CPU, T, Index> {
Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxes, T threshold, T eps) {
auto ndets = boxes.dim(0);
auto suppressed = std::vector<Index>(ndets);
auto area = std::vector<T>(ndets);
for (int i = 0; i < ndets; ++i) {
area[i] =
(boxes(i, 2) - boxes(i, 0) + eps) * (boxes(i, 3) - boxes(i, 1) + eps);
}
int i, j;
T xx1, xx2, w, h, inter, ovr;
int keepNum = 0;
for (int _i = 0; _i < ndets; ++_i) {
i = _i;
if (suppressed[i] == 1)
continue;
keep[keepNum] = i;
keepNum += 1;
for (int _j = _i + 1; _j < ndets; ++_j) {
j = _j;
if (suppressed[j] == 1)
continue;
xx2 = std::min(boxes(i, 2), boxes(j, 2));
xx1 = std::max(boxes(i, 0), boxes(j, 0));
w = xx2 - xx1 + eps;
if (w > 0) {
xx2 = std::min(boxes(i, 3), boxes(j, 3));
xx1 = std::max(boxes(i, 1), boxes(j, 1));
h = xx2 - xx1 + eps;
if (h > 0) {
inter = w * h;
ovr = inter / (area[i] + area[j] - inter);
if (ovr >= threshold)
suppressed[j] = 1;
}
}
}
}
return keepNum;
}
};
template <typename T, typename Index>
struct rotateNonMaxSupressionFunctor<tv::CPU, T, Index> {
Index operator()(const tv::CPU &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxCorners,
tv::TensorView<const T> standupIoU, T threshold) {
auto ndets = boxCorners.dim(0);
auto suppressed = std::vector<Index>(ndets);
int i, j;
namespace bg = boost::geometry;
typedef bg::model::point<T, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
T inter_area, union_area, overlap;
int keepNum = 0;
for (int _i = 0; _i < ndets; ++_i) {
i = _i;
if (suppressed[i] == 1)
continue;
keep[keepNum] = i;
keepNum += 1;
for (int _j = _i + 1; _j < ndets; ++_j) {
j = _j;
if (suppressed[j] == 1)
continue;
if (standupIoU(i, j) <= 0.0)
continue;
bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
bg::append(poly, point_t(boxCorners(i, 1, 0), boxCorners(i, 1, 1)));
bg::append(poly, point_t(boxCorners(i, 2, 0), boxCorners(i, 2, 1)));
bg::append(poly, point_t(boxCorners(i, 3, 0), boxCorners(i, 3, 1)));
bg::append(poly, point_t(boxCorners(i, 0, 0), boxCorners(i, 0, 1)));
bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
bg::append(qpoly, point_t(boxCorners(j, 1, 0), boxCorners(j, 1, 1)));
bg::append(qpoly, point_t(boxCorners(j, 2, 0), boxCorners(j, 2, 1)));
bg::append(qpoly, point_t(boxCorners(j, 3, 0), boxCorners(j, 3, 1)));
bg::append(qpoly, point_t(boxCorners(j, 0, 0), boxCorners(j, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
bg::union_(poly, qpoly, poly_union);
if (!poly_union.empty()) { // ignore invalid box
union_area = bg::area(poly_union.front());
overlap = inter_area / union_area;
if (overlap >= threshold)
suppressed[j] = 1;
poly_union.clear();
}
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return keepNum;
}
};
} // namespace functor
#define DECLARE_CPU_T_INDEX(T, Index) \
template struct functor::NonMaxSupressionFunctor<tv::CPU, T, Index>; \
template struct functor::rotateNonMaxSupressionFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_INDEX(Index) \
DECLARE_CPU_T_INDEX(float, Index); \
DECLARE_CPU_T_INDEX(double, Index);
DECLARE_CPU_INDEX(int);
DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_T_INDEX
} // namespace spconv
// ------------------------------------------------------------------
// Deformable Convolutional Networks
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License
// Modified from MATLAB Faster R-CNN
// (https://github.com/shaoqingren/faster_rcnn)
// ------------------------------------------------------------------
#include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/cuda_utils.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/mp_helper.h>
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType>
__device__ inline DType devIoU(DType const *const a, DType const *const b) {
DType left = max(a[0], b[0]), right = min(a[2], b[2]);
DType top = max(a[1], b[1]), bottom = min(a[3], b[3]);
DType width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
DType interS = width * height;
DType Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
DType Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
template <typename DType, int BLOCK_THREADS>
__global__ void nms_kernel(const int n_boxes, const DType nms_overlap_thresh,
const DType *dev_boxes,
unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size = min(n_boxes - row_start * BLOCK_THREADS, BLOCK_THREADS);
const int col_size = min(n_boxes - col_start * BLOCK_THREADS, BLOCK_THREADS);
__shared__ DType block_boxes[BLOCK_THREADS * 5];
if (threadIdx.x < col_size) {
#pragma unroll
for (int i = 0; i < 5; ++i) {
block_boxes[threadIdx.x * 5 + i] =
dev_boxes[(BLOCK_THREADS * col_start + threadIdx.x) * 5 + i];
}
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = BLOCK_THREADS * row_start + threadIdx.x;
const DType *cur_box = dev_boxes + cur_box_idx * 5;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (int i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = DIVUP(n_boxes, BLOCK_THREADS);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment