Commit a6ae8967 authored by traveller59's avatar traveller59
Browse files

spconv v1.1 release:

1. add cuda hash support for cuda indice generation.
2. use hash table instead of dense table in CPU code.
3. add CPU-only build support.
parent 0757c45b
/**
* MIT License
*
* Copyright (c) 2017 Tessil
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef TSL_ROBIN_GROWTH_POLICY_H
#define TSL_ROBIN_GROWTH_POLICY_H
#include <algorithm>
#include <array>
#include <climits>
#include <cmath>
#include <cstddef>
#include <iterator>
#include <limits>
#include <ratio>
#include <stdexcept>
/**
* Debug-only assertion. Expands to assert(expr) when TSL_DEBUG is defined and to a no-op otherwise.
*/
#ifdef TSL_DEBUG
// assert() lives in <cassert>; include it here so the debug build does not depend on the includer.
# include <cassert>
# define tsl_rh_assert(expr) assert(expr)
#else
# define tsl_rh_assert(expr) (static_cast<void>(0))
#endif
/**
* If exceptions are enabled, throw the exception passed in parameter, otherwise call std::terminate.
*
* ex is the exception type and msg the message given to its constructor. When exceptions are
* disabled and NDEBUG is not defined, msg is written to stderr before terminating.
*/
#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || (defined (_MSC_VER) && defined (_CPPUNWIND))) && !defined(TSL_NO_EXCEPTIONS)
# define TSL_RH_THROW_OR_TERMINATE(ex, msg) throw ex(msg)
#else
// std::terminate() is declared in <exception>.
# include <exception>
# ifdef NDEBUG
# define TSL_RH_THROW_OR_TERMINATE(ex, msg) std::terminate()
# else
# include <cstdio>
// msg is printed through "%s" so that a '%' inside the message is never read as a conversion.
# define TSL_RH_THROW_OR_TERMINATE(ex, msg) do { std::fprintf(stderr, "%s", msg); std::terminate(); } while(0)
# endif
#endif
/**
* Branch prediction hint: tells GCC/Clang that exp is expected to be true. Plain (exp) elsewhere.
*/
#if defined(__GNUC__) || defined(__clang__)
# define TSL_RH_LIKELY(exp) (__builtin_expect(!!(exp), true))
#else
# define TSL_RH_LIKELY(exp) (exp)
#endif
namespace tsl {
namespace rh {
/**
* Grow the hash table by a factor of GrowthFactor keeping the bucket count to a power of two. It allows
* the table to use a mask operation instead of a modulo operation to map a hash to a bucket.
*
* GrowthFactor must be a power of two >= 2.
*/
template<std::size_t GrowthFactor>
class power_of_two_growth_policy {
public:
    /**
     * Called on the hash table creation and on rehash. The number of buckets for the table is passed in parameter.
     * This number is a minimum, the policy may update this value with a higher value if needed (but not lower).
     *
     * If 0 is given, min_bucket_count_in_out must still be 0 after the policy creation and
     * bucket_for_hash must always return 0 in this case.
     *
     * A request above max_bucket_count() is reported as std::length_error through TSL_RH_THROW_OR_TERMINATE.
     */
    explicit power_of_two_growth_policy(std::size_t& min_bucket_count_in_out) {
        if(min_bucket_count_in_out > max_bucket_count()) {
            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maximum size.");
        }

        if(min_bucket_count_in_out > 0) {
            // The bucket count is a power of two, so (count - 1) has every lower bit set and can
            // be used directly as the mask in bucket_for_hash.
            min_bucket_count_in_out = round_up_to_power_of_two(min_bucket_count_in_out);
            m_mask = min_bucket_count_in_out - 1;
        }
        else {
            // A zero mask sends every hash to bucket 0, as required for an empty table.
            m_mask = 0;
        }
    }

    /**
     * Return the bucket [0, bucket_count()) to which the hash belongs.
     * If bucket_count() is 0, it must always return 0.
     */
    std::size_t bucket_for_hash(std::size_t hash) const noexcept {
        return hash & m_mask;
    }

    /**
     * Return the number of buckets that should be used on next growth, i.e. the current
     * value of (m_mask + 1) multiplied by GrowthFactor.
     *
     * Reports std::length_error through TSL_RH_THROW_OR_TERMINATE if that product would
     * exceed max_bucket_count().
     */
    std::size_t next_bucket_count() const {
        // Compare against max / GrowthFactor so the multiplication below can not overflow.
        if((m_mask + 1) > max_bucket_count() / GrowthFactor) {
            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maximum size.");
        }

        return (m_mask + 1) * GrowthFactor;
    }

    /**
     * Return the maximum number of buckets supported by the policy.
     */
    std::size_t max_bucket_count() const {
        // Largest power of two.
        return (std::numeric_limits<std::size_t>::max() / 2) + 1;
    }

    /**
     * Reset the growth policy as if it was created with a bucket count of 0.
     * After a clear, the policy must always return 0 when bucket_for_hash is called.
     */
    void clear() noexcept {
        m_mask = 0;
    }

private:
    /**
     * Return the smallest power of two that is >= value; 0 is mapped to 1.
     */
    static std::size_t round_up_to_power_of_two(std::size_t value) {
        if(is_power_of_two(value)) {
            return value;
        }

        if(value == 0) {
            return 1;
        }

        // Copy the highest set bit of (value - 1) into every lower position, then add one.
        --value;
        for(std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) {
            value |= value >> i;
        }

        return value + 1;
    }

    /**
     * True when value has exactly one bit set.
     */
    static constexpr bool is_power_of_two(std::size_t value) {
        return value != 0 && (value & (value - 1)) == 0;
    }

protected:
    static_assert(is_power_of_two(GrowthFactor) && GrowthFactor >= 2, "GrowthFactor must be a power of two >= 2.");

    // bucket_count - 1, or 0 when the table has no bucket.
    std::size_t m_mask;
};
/**
* Grow the hash table by GrowthFactor::num / GrowthFactor::den and use a modulo to map a hash
* to a bucket. Slower but it can be useful if you want a slower growth.
*/
template<class GrowthFactor = std::ratio<3, 2>>
class mod_growth_policy {
public:
    /**
     * Create the policy for a table of min_bucket_count_in_out buckets. The count is used as is
     * (it is never modified). With 0 buckets the modulo is set to 1 so that bucket_for_hash
     * always returns 0.
     *
     * A request above max_bucket_count() is reported as std::length_error through TSL_RH_THROW_OR_TERMINATE.
     */
    explicit mod_growth_policy(std::size_t& min_bucket_count_in_out) {
        if(min_bucket_count_in_out > max_bucket_count()) {
            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maximum size.");
        }

        if(min_bucket_count_in_out > 0) {
            m_mod = min_bucket_count_in_out;
        }
        else {
            m_mod = 1;
        }
    }

    /**
     * Return the bucket to which the hash belongs: hash modulo the current bucket count.
     */
    std::size_t bucket_for_hash(std::size_t hash) const noexcept {
        return hash % m_mod;
    }

    /**
     * Return the number of buckets that should be used on next growth: the current count times
     * the growth factor, rounded up and capped at max_bucket_count().
     *
     * Reports std::length_error through TSL_RH_THROW_OR_TERMINATE if the table is already at
     * max_bucket_count() or if the computed value is not a normal floating point number.
     */
    std::size_t next_bucket_count() const {
        if(m_mod == max_bucket_count()) {
            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maximum size.");
        }

        const double next_bucket_count = std::ceil(double(m_mod) * REHASH_SIZE_MULTIPLICATION_FACTOR);
        if(!std::isnormal(next_bucket_count)) {
            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maximum size.");
        }

        if(next_bucket_count > double(max_bucket_count())) {
            return max_bucket_count();
        }
        else {
            return std::size_t(next_bucket_count);
        }
    }

    /**
     * Return the maximum number of buckets supported by the policy.
     */
    std::size_t max_bucket_count() const {
        return MAX_BUCKET_COUNT;
    }

    /**
     * Reset the policy as if it was created with a bucket count of 0:
     * bucket_for_hash returns 0 for every hash afterwards.
     */
    void clear() noexcept {
        m_mod = 1;
    }

private:
    // Growth factor as a floating point value (GrowthFactor::num / GrowthFactor::den).
    static constexpr double REHASH_SIZE_MULTIPLICATION_FACTOR = 1.0 * GrowthFactor::num / GrowthFactor::den;
    // Largest std::size_t value divided by the growth factor.
    static const std::size_t MAX_BUCKET_COUNT =
        std::size_t(double(
            std::numeric_limits<std::size_t>::max() / REHASH_SIZE_MULTIPLICATION_FACTOR
        ));

    static_assert(REHASH_SIZE_MULTIPLICATION_FACTOR >= 1.1, "Growth factor should be >= 1.1.");

    // Current bucket count, or 1 when the table has no bucket.
    std::size_t m_mod;
};
namespace detail {

// Bucket counts available to prime_growth_policy, sorted in increasing order.
// The leading 1 is the entry used for an empty table: hash % 1 is always 0.
static constexpr const std::array<std::size_t, 40> PRIMES = {{
    1ul,          5ul,          17ul,         29ul,         37ul,
    53ul,         67ul,         79ul,         97ul,         131ul,
    193ul,        257ul,        389ul,        521ul,        769ul,
    1031ul,       1543ul,       2053ul,       3079ul,       6151ul,
    12289ul,      24593ul,      49157ul,      98317ul,      196613ul,
    393241ul,     786433ul,     1572869ul,    3145739ul,    6291469ul,
    12582917ul,   25165843ul,   50331653ul,   100663319ul,  201326611ul,
    402653189ul,  805306457ul,  1610612741ul, 3221225473ul, 4294967291ul
}};

// Reduce hash modulo the PrimeIndex-th entry of PRIMES. The divisor is selected by a
// template argument, so each instantiation divides by a compile-time constant.
template<unsigned int PrimeIndex>
static constexpr std::size_t mod(std::size_t hash) {
    return hash % PRIMES[PrimeIndex];
}

// MOD_PRIME[iprime](hash) returns hash % PRIMES[iprime]. This table allows for faster modulo as the
// compiler can optimize the modulo code better with a constant known at the compilation.
static constexpr const std::array<std::size_t(*)(std::size_t), 40> MOD_PRIME = {{
    &mod<0>,  &mod<1>,  &mod<2>,  &mod<3>,  &mod<4>,  &mod<5>,  &mod<6>,  &mod<7>,
    &mod<8>,  &mod<9>,  &mod<10>, &mod<11>, &mod<12>, &mod<13>, &mod<14>, &mod<15>,
    &mod<16>, &mod<17>, &mod<18>, &mod<19>, &mod<20>, &mod<21>, &mod<22>, &mod<23>,
    &mod<24>, &mod<25>, &mod<26>, &mod<27>, &mod<28>, &mod<29>, &mod<30>, &mod<31>,
    &mod<32>, &mod<33>, &mod<34>, &mod<35>, &mod<36>, &mod<37>, &mod<38>, &mod<39>
}};

}
/**
* Grow the hash table by using prime numbers as bucket count. Slower than tsl::rh::power_of_two_growth_policy in
* general but will probably distribute the values around better in the buckets with a poor hash function.
*
* To allow the compiler to optimize the modulo operation, a lookup table is used with constant prime numbers.
*
* With a switch the code would look like:
* \code
* switch(iprime) { // iprime is the index of the current prime of the hash table
* case 0: hash % 1ul;
* break;
* case 1: hash % 5ul;
* break;
* case 2: hash % 17ul;
* break;
* ...
* }
* \endcode
*
* Because the divisor of the modulo is a constant, the compiler is able to replace the operation
* by a series of multiplications, subtractions and shifts.
*
* The 'hash % 5' could become something like 'hash - ((hash * 0xCCCCCCCD) >> 34) * 5' in a 64-bit environment.
*/
class prime_growth_policy {
public:
    /**
     * Create the policy for a table of at least min_bucket_count_in_out buckets. The count is
     * raised to the first entry of detail::PRIMES that is not lower than the request.
     * If 0 is given, the count stays 0 and entry 0 of the table is selected.
     *
     * A request above the last entry of detail::PRIMES is reported as std::length_error
     * through TSL_RH_THROW_OR_TERMINATE.
     */
    explicit prime_growth_policy(std::size_t& min_bucket_count_in_out) {
        auto it_prime = std::lower_bound(detail::PRIMES.begin(),
                                         detail::PRIMES.end(), min_bucket_count_in_out);
        if(it_prime == detail::PRIMES.end()) {
            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maximum size.");
        }

        m_iprime = static_cast<unsigned int>(std::distance(detail::PRIMES.begin(), it_prime));
        if(min_bucket_count_in_out > 0) {
            min_bucket_count_in_out = *it_prime;
        }
        else {
            min_bucket_count_in_out = 0;
        }
    }

    /**
     * Return the bucket to which the hash belongs: hash modulo the currently selected
     * entry of detail::PRIMES, computed through the detail::MOD_PRIME function table.
     */
    std::size_t bucket_for_hash(std::size_t hash) const noexcept {
        return detail::MOD_PRIME[m_iprime](hash);
    }

    /**
     * Return the number of buckets that should be used on next growth: the entry of
     * detail::PRIMES that follows the current one.
     *
     * Reports std::length_error through TSL_RH_THROW_OR_TERMINATE when the current entry
     * is already the last one.
     */
    std::size_t next_bucket_count() const {
        if(m_iprime + 1 >= detail::PRIMES.size()) {
            TSL_RH_THROW_OR_TERMINATE(std::length_error, "The hash table exceeds its maximum size.");
        }

        return detail::PRIMES[m_iprime + 1];
    }

    /**
     * Return the maximum number of buckets supported by the policy (last entry of detail::PRIMES).
     */
    std::size_t max_bucket_count() const {
        return detail::PRIMES.back();
    }

    /**
     * Reset the policy as if it was created with a bucket count of 0 (entry 0 is selected).
     */
    void clear() noexcept {
        m_iprime = 0;
    }

private:
    // Index into detail::PRIMES / detail::MOD_PRIME of the current bucket count.
    unsigned int m_iprime;

    static_assert(std::numeric_limits<decltype(m_iprime)>::max() >= detail::PRIMES.size(),
                  "The type of m_iprime is not big enough.");
};
}
}
#endif
This diff is collapsed.
This diff is collapsed.
......@@ -14,11 +14,14 @@
#pragma once
#include <chrono>
#ifdef SPCONV_CUDA
#include <cuda_runtime_api.h>
#endif
#include <iostream>
namespace spconv {
#ifdef SPCONV_CUDA
template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
CudaContextTimer() {
cudaDeviceSynchronize();
......@@ -36,6 +39,7 @@ template <typename TimeT = std::chrono::microseconds> struct CudaContextTimer {
private:
std::chrono::time_point<std::chrono::steady_clock> mCurTime;
};
#endif
template <typename TimeT = std::chrono::microseconds> struct CPUTimer {
CPUTimer() { mCurTime = std::chrono::steady_clock::now(); }
......
......@@ -45,8 +45,16 @@ class CMakeBuild(build_ext):
'-DCMAKE_PREFIX_PATH={}'.format(LIBTORCH_ROOT),
'-DPYBIND11_PYTHON_VERSION={}'.format(PYTHON_VERSION),
'-DSPCONV_BuildTests=OFF',
'-DCMAKE_CUDA_FLAGS="--expt-relaxed-constexpr"'
] # -arch=sm_61
if not torch.cuda.is_available():
cmake_args += ['-DSPCONV_BuildCUDA=OFF']
else:
cuda_flags = ["\"--expt-relaxed-constexpr\""]
# must add following flags to use at::Half
# but will remove raw half operators.
cuda_flags += ["-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__"]
cuda_flags += ["-D__CUDA_NO_HALF2_OPERATORS__"]
cmake_args += ['-DCMAKE_CUDA_FLAGS=' + " ".join(cuda_flags)]
cfg = 'Debug' if self.debug else 'Release'
assert cfg == "Release", "pytorch ops don't support debug build."
build_args = ['--config', cfg]
......
......@@ -70,7 +70,7 @@ class SparseConvolution(SparseModule):
inverse=False,
indice_key=None,
fused_bn=False,
use_hash=False):
use_hash=True):
super(SparseConvolution, self).__init__()
assert groups == 1
if not isinstance(kernel_size, (list, tuple)):
......@@ -136,7 +136,6 @@ class SparseConvolution(SparseModule):
out_spatial_shape = ops.get_conv_output_size(
spatial_shape, self.kernel_size, self.stride, self.padding,
self.dilation)
else:
out_spatial_shape = spatial_shape
# input.update_grid(out_spatial_shape)
......@@ -222,7 +221,7 @@ class SparseConv2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False):
use_hash=True):
super(SparseConv2d, self).__init__(
2,
in_channels,
......@@ -248,7 +247,7 @@ class SparseConv3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False):
use_hash=True):
super(SparseConv3d, self).__init__(
3,
in_channels,
......@@ -274,7 +273,7 @@ class SparseConv4d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False):
use_hash=True):
super(SparseConv4d, self).__init__(
4,
in_channels,
......@@ -300,7 +299,7 @@ class SparseConvTranspose2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False):
use_hash=True):
super(SparseConvTranspose2d, self).__init__(
2,
in_channels,
......@@ -327,7 +326,7 @@ class SparseConvTranspose3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False):
use_hash=True):
super(SparseConvTranspose3d, self).__init__(
3,
in_channels,
......@@ -388,7 +387,7 @@ class SubMConv2d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False):
use_hash=True):
super(SubMConv2d, self).__init__(
2,
in_channels,
......@@ -415,7 +414,7 @@ class SubMConv3d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False):
use_hash=True):
super(SubMConv3d, self).__init__(
3,
in_channels,
......@@ -442,7 +441,7 @@ class SubMConv4d(SparseConvolution):
groups=1,
bias=True,
indice_key=None,
use_hash=False):
use_hash=True):
super(SubMConv4d, self).__init__(
4,
in_channels,
......
......@@ -88,8 +88,10 @@ def get_indice_pairs(indices,
get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_4d
else:
raise NotImplementedError
return get_indice_pairs_func(indices, batch_size, out_shape, spatial_shape, ksize,
res = get_indice_pairs_func(indices, batch_size, out_shape, spatial_shape, ksize,
stride, padding, dilation, out_padding, int(subm), int(transpose), int(use_hash))
return res
else:
if ndim == 2:
get_indice_pairs_func = torch.ops.spconv.get_indice_pairs_grid_2d
......
......@@ -15,10 +15,13 @@
import numpy as np
from spconv import spconv_utils
from spconv.spconv_utils import (
non_max_suppression, non_max_suppression_cpu, points_to_voxel_3d_np,
from spconv.spconv_utils import (non_max_suppression_cpu, points_to_voxel_3d_np,
points_to_voxel_3d_np_mean, points_to_voxel_3d_with_filtering,
rbbox_intersection, rbbox_iou, rotate_non_max_suppression_cpu)
try:
from spconv.spconv_utils import non_max_suppression
except ImportError:
pass
def points_to_voxel(points,
......
# Build the cuhash shared library from the hash function and hash table sources.
add_library(cuhash SHARED hash_functions.cu hash_table.cpp hash_table.cu)
# ALL_INCLUDE and ALL_LIBS are not set in this file; they are expected to come from the including scope.
target_include_directories(cuhash PRIVATE ${ALL_INCLUDE} )
# Compile both the CUDA and the C++ sources as C++14.
set_property(TARGET cuhash PROPERTY CUDA_STANDARD 14)
set_property(TARGET cuhash PROPERTY CXX_STANDARD 14)
# Turn on separable compilation for the CUDA sources of this target.
set_target_properties(cuhash PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(cuhash PRIVATE ${ALL_LIBS})
install (TARGETS cuhash DESTINATION lib)
# The test executable is only built when SPCONV_BuildTests is enabled.
if (SPCONV_BuildTests)
add_executable(cuhash_test main.cc)
target_include_directories(cuhash_test PRIVATE ${ALL_INCLUDE} )
set_property(TARGET cuhash_test PROPERTY CUDA_STANDARD 14)
set_property(TARGET cuhash_test PROPERTY CXX_STANDARD 14)
set_target_properties(cuhash_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# The test links against the cuhash library built above.
target_link_libraries(cuhash_test PRIVATE ${ALL_LIBS} cuhash)
install (TARGETS cuhash_test DESTINATION bin)
endif()
\ No newline at end of file
......@@ -15,14 +15,14 @@
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <hash/debugging.h>
#include <hash/definitions.h>
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <algorithm>
#include <cstring>
#include <hash/cuda_util.h>
#include <cuhash/cuda_util.h>
namespace cudahash {
namespace cuhash {
void OutputRetrievalStatistics(const unsigned n_queries,
......
......@@ -15,14 +15,14 @@
* @brief Debugging/statistics/performance utilities for hash tables.
*/
#include <hash/debugging.h>
#include <hash/definitions.h>
#include <hash/hash_table.cuh>
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <cuhash/hash_table.cuh>
#include <algorithm>
#include <hash/cuda_util.h>
#include <cuhash/cuda_util.h>
namespace cudahash {
namespace cuhash {
//! Debugging function: Takes statistics on the hash functions' distribution.
......@@ -231,9 +231,9 @@ bool CheckAssignedSameSlot(const unsigned N,
void PrintStashContents(const Entry *d_stash) {
Entry *stash = new Entry[cudahash::kStashSize];
CUDA_SAFE_CALL(cudaMemcpy(stash, d_stash, sizeof(Entry) * cudahash::kStashSize, cudaMemcpyDeviceToHost));
for (unsigned i = 0; i < cudahash::kStashSize; ++i) {
Entry *stash = new Entry[cuhash::kStashSize];
CUDA_SAFE_CALL(cudaMemcpy(stash, d_stash, sizeof(Entry) * cuhash::kStashSize, cudaMemcpyDeviceToHost));
for (unsigned i = 0; i < cuhash::kStashSize; ++i) {
if (get_key(stash[i]) != kKeyEmpty) {
char buffer[256];
sprintf(buffer, "Stash[%u]: %u = %u", i, get_key(stash[i]), get_value(stash[i]));
......
#include <hash/hash_table.h>
#include <hash/debugging.h>
#include <cuhash/hash_table.h>
#include <cuhash/debugging.h>
#include <cassert>
#include <random>
#include <hash/mt19937ar.h>
namespace cuhash {
std::random_device random_dev;
#include <cassert>
std::mt19937 random_engine(random_dev());
std::uniform_int_distribution<unsigned> uint_distribution;
namespace cudahash {
unsigned generate_random_uint32(){
return uint_distribution(random_engine);
}
void GenerateFunctions(const unsigned N,
const unsigned num_keys,
......@@ -19,9 +26,11 @@ void GenerateFunctions(const unsigned N,
// Generate a set of hash function constants for this build attempt.
for (unsigned i = 0 ; i < N; ++i) {
unsigned new_a = genrand_int32() % kPrimeDivisor;
// uint_distribution(random_engine) % kPrimeDivisor;
// genrand_int32() % kPrimeDivisor;
unsigned new_a = generate_random_uint32() % kPrimeDivisor;
constants[i].x = (1 > new_a ? 1 : new_a);
constants[i].y = genrand_int32() % kPrimeDivisor;
constants[i].y = generate_random_uint32() % kPrimeDivisor;
}
#ifdef FORCEFULLY_GENERATE_NO_CYCLES
......
......@@ -14,20 +14,18 @@
* @brief Implements a basic hash table that stores one value per key.
*/
#include <hash/hash_table.h>
#include <hash/debugging.h>
#include <cuhash/hash_table.h>
#include <cuhash/debugging.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <limits>
#include <hash/mt19937ar.h>
#include <cuda_runtime_api.h>
#include <hash/cuda_util.h>
#include <cuhash/cuda_util.h>
namespace cudahash {
namespace cuhash {
char buffer[256];
......@@ -164,8 +162,8 @@ bool HashTable::Build(const unsigned n,
else
constants_5_.Generate(n, d_keys,table_size_);
stash_constants_.x = std::max(1lu, genrand_int32()) % kPrimeDivisor;
stash_constants_.y = genrand_int32() % kPrimeDivisor;
stash_constants_.x = std::max(1u, generate_random_uint32()) % kPrimeDivisor;
stash_constants_.y = generate_random_uint32() % kPrimeDivisor;
stash_count_ = 0;
// Initialize memory.
......@@ -205,8 +203,8 @@ bool HashTable::Build(const unsigned n,
// Copy out the stash size.
CUDA_SAFE_CALL(cudaMemcpy( &stash_count_, d_stash_count, sizeof(unsigned), cudaMemcpyDeviceToHost ));
if (stash_count_ && num_failures == 0) {
sprintf(buffer, "Stash size: %u", stash_count_);
PrintMessage(buffer, true);
// sprintf(buffer, "Stash size: %u", stash_count_);
// PrintMessage(buffer, true);
#ifdef _DEBUG
PrintStashContents(d_contents_ + table_size_);
......@@ -226,7 +224,7 @@ bool HashTable::Build(const unsigned n,
sprintf(buffer, "Completely failed to build");
PrintMessage(buffer, true);
} else if (num_attempts > 1) {
sprintf(buffer, "Needed %u attempts to build", num_attempts);
sprintf(buffer, "Needed %u attempts to build, you can ignore this message.", num_attempts);
PrintMessage(buffer, true);
}
......
......@@ -14,14 +14,14 @@
* @brief Hides all of the CUDA calls from the actual CPP file.
*/
#include <hash/cuda_util.h>
#include <hash/debugging.h>
#include <hash/definitions.h>
#include <hash/hash_table.cuh>
#include <cuhash/cuda_util.h>
#include <cuhash/debugging.h>
#include <cuhash/definitions.h>
#include <cuhash/hash_table.cuh>
#include <cuda.h>
namespace cudahash {
namespace cuhash {
namespace CUDAWrapper {
void ClearTable(const unsigned slots_in_table,
......
#include <hash/hash_table.h>
#include <cuhash/hash_table.h>
#include <cuda.h>
int main(){
auto table = cudahash::HashTable();
auto table = cuhash::HashTable();
table.Initialize(10, 2.0);
const int N = 10;
......
# Build the cudahash shared library from the hash function, hash table and MT19937 sources.
add_library(cudahash SHARED hash_functions.cu hash_table.cpp hash_table.cu
mt19937ar.cpp)
# ALL_INCLUDE and ALL_LIBS are not set in this file; they are expected to come from the including scope.
target_include_directories(cudahash PRIVATE ${ALL_INCLUDE} )
# Compile both the CUDA and the C++ sources as C++14.
set_property(TARGET cudahash PROPERTY CUDA_STANDARD 14)
set_property(TARGET cudahash PROPERTY CXX_STANDARD 14)
# Turn on separable compilation for the CUDA sources of this target.
set_target_properties(cudahash PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(cudahash PRIVATE ${ALL_LIBS})
install (TARGETS cudahash DESTINATION lib)
# Test executable; it is always built and links against the cudahash library above.
add_executable(cudahash_test main.cc)
target_include_directories(cudahash_test PRIVATE ${ALL_INCLUDE} )
set_property(TARGET cudahash_test PROPERTY CUDA_STANDARD 14)
set_property(TARGET cudahash_test PROPERTY CXX_STANDARD 14)
set_target_properties(cudahash_test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(cudahash_test PRIVATE ${ALL_LIBS} cudahash)
install (TARGETS cudahash_test DESTINATION bin)
/*
A C-program for MT19937, with initialization improved 2002/1/26.
Coded by Takuji Nishimura and Makoto Matsumoto.
Before using, initialize the state by using init_genrand(seed)
or init_by_array(init_key, key_length).
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Any feedback is very welcome.
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
*/
#include <stdio.h>
/* Period parameters */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0dfUL /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
static unsigned long mt[N]; /* the array for the state vector */
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
/* Fill the whole state vector mt[] from the single seed s. */
/* Only the low 32 bits of s are used. On return mti equals N, so the */
/* next call to genrand_int32() regenerates the state before drawing. */
void init_genrand(unsigned long s)
{
    unsigned long prev = s & 0xffffffffUL;

    mt[0] = prev;
    for (mti = 1; mti < N; mti++) {
        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
        /* In the previous versions, MSBs of the seed affect */
        /* only MSBs of the array mt[]. */
        /* 2002/01/09 modified by Makoto Matsumoto */
        prev = 1812433253UL * (prev ^ (prev >> 30)) + mti;
        /* keep 32 bits: for >32 bit machines */
        prev &= 0xffffffffUL;
        mt[mti] = prev;
    }
}
/* initialize by an array with array-length */
/* init_key is the array for initializing keys */
/* key_length is its length */
/* slight change for C++, 2004/2/26 */
/* The state is first seeded with init_genrand(19650218UL) and then */
/* mixed with the key words in two passes over mt[]. */
/* NOTE(review): init_key[0] is read even when key_length is 0, so */
/* callers are presumably expected to pass key_length >= 1 - confirm. */
void init_by_array(unsigned long init_key[], int key_length)
{
int i, j, k;
init_genrand(19650218UL);
i=1; j=0;
/* first pass: max(N, key_length) steps, so every key word is consumed */
k = (N>key_length ? N : key_length);
for (; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+ init_key[j] + j; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
i++; j++;
/* wrap around: mt[0] takes the last word and i restarts at 1 */
if (i>=N) { mt[0] = mt[N-1]; i=1; }
/* the key is reused cyclically when it is shorter than the pass */
if (j>=key_length) j=0;
}
/* second pass: N-1 steps, continuing from the index reached above */
for (k=N-1; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
- i; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
i++;
if (i>=N) { mt[0] = mt[N-1]; i=1; }
}
mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
}
/* generates a random number on [0,0xffffffff]-interval */
/* Reads and updates the file-level state mt[] / mti. When all N words */
/* have been consumed (mti >= N) the whole state is regenerated first. */
unsigned long genrand_int32(void)
{
unsigned long y;
static unsigned long mag01[2]={0x0UL, MATRIX_A};
/* mag01[x] = x * MATRIX_A for x=0,1 */
if (mti >= N) { /* generate N words at one time */
int kk;
if (mti == N+1) /* if init_genrand() has not been called, */
init_genrand(5489UL); /* a default initial seed is used */
/* words 0 .. N-M-1: combine with the word M positions ahead */
for (kk=0;kk<N-M;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
}
/* words N-M .. N-2: kk+(M-N) indexes words already regenerated above */
for (;kk<N-1;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
}
/* last word: its lower bits come from mt[0] */
y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
mti = 0;
}
/* take the next state word and advance the read position */
y = mt[mti++];
/* Tempering */
y ^= (y >> 11);
y ^= (y << 7) & 0x9d2c5680UL;
y ^= (y << 15) & 0xefc60000UL;
y ^= (y >> 18);
return y;
}
/* Draw one value from genrand_int32() and drop its lowest bit, */
/* giving a random number on the [0,0x7fffffff]-interval. */
long genrand_int31(void)
{
    const unsigned long raw = genrand_int32();

    return (long)(raw >> 1);
}
/* Random number on the closed [0,1]-real-interval: */
/* one draw from genrand_int32() divided by 2^32-1. */
double genrand_real1(void)
{
    const double scale = 1.0/4294967295.0;

    return genrand_int32() * scale;
}
/* Random number on the half-open [0,1)-real-interval: */
/* one draw from genrand_int32() divided by 2^32. */
double genrand_real2(void)
{
    const double scale = 1.0/4294967296.0;

    return genrand_int32() * scale;
}
/* Random number on the open (0,1)-real-interval: */
/* one draw from genrand_int32(), shifted by 0.5, divided by 2^32. */
double genrand_real3(void)
{
    const double shifted = ((double)genrand_int32()) + 0.5;

    return shifted * (1.0/4294967296.0);
}
/* Random number on [0,1) with 53-bit resolution, built from two */
/* consecutive draws of genrand_int32(). */
double genrand_res53(void)
{
    /* the draw order matters: the first draw supplies the high part */
    const unsigned long hi = genrand_int32() >> 5;
    const unsigned long lo = genrand_int32() >> 6;
    const double combined = hi * 67108864.0 + lo;

    return combined * (1.0/9007199254740992.0);
}
/* These real versions are due to Isaku Wada, 2002/01/09 added */
add_library(spconv SHARED all.cc indice.cc indice.cu
reordering.cc reordering.cu maxpool.cc maxpool.cu nms.cc
pillar_scatter.cu)
set(ALL_FILES all.cc indice.cc reordering.cc maxpool.cc nms.cc)
if (SPCONV_BuildCUDA)
set(ALL_FILES ${ALL_FILES} indice.cu reordering.cu maxpool.cu pillar_scatter.cu)
endif()
add_library(spconv SHARED ${ALL_FILES})
target_include_directories(spconv PRIVATE ${ALL_INCLUDE} )
set_property(TARGET spconv PROPERTY CUDA_STANDARD 14)
set_property(TARGET spconv PROPERTY CXX_STANDARD 14)
set_target_properties(spconv PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(spconv PRIVATE ${ALL_LIBS} cudahash)
if (SPCONV_BuildCUDA)
target_link_libraries(spconv PRIVATE ${ALL_LIBS} cuhash)
else()
target_link_libraries(spconv PRIVATE ${ALL_LIBS})
endif()
install (TARGETS spconv DESTINATION lib)
......@@ -12,7 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_runtime_api.h>
#include <spconv/pool_ops.h>
#include <spconv/spconv_ops.h>
#include <spconv/pillar_scatter_ops.h>
......@@ -35,9 +34,9 @@ static auto registry =
.op("spconv::indice_maxpool_fp32", &spconv::indiceMaxPool<float>)
.op("spconv::indice_maxpool_backward_fp32",
&spconv::indiceMaxPoolBackward<float>)
// .op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
// .op("spconv::indice_maxpool_backward_half",
// &spconv::indiceMaxPoolBackward<at::Half>)
.op("spconv::indice_maxpool_half", &spconv::indiceMaxPool<at::Half>)
.op("spconv::indice_maxpool_backward_half",
&spconv::indiceMaxPoolBackward<at::Half>)
.op("spconv::nms", &spconv::nonMaxSuppression<float>)
.op("spconv::pillar_scatter_float", &spconv::pointPillarScatter<float>)
.op("spconv::pillar_scatter_half", &spconv::pointPillarScatter<at::Half>);
\ No newline at end of file
......@@ -22,7 +22,7 @@
#include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h>
#include <hash/hash_table.h>
#include <cuhash/hash_table.h>
namespace spconv {
namespace functor {
......@@ -78,24 +78,28 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
auto numActIn = indicesIn.dim(0);
if (numActIn == 0)
return 0;
Index numAct = indicePairUnique.dim(0) - 1;
// after unique, there is a std::numeric_limits<int>::max() in the end of indicePairUnique
Index numAct = indicePairUnique.dim(0) - 1;
if (useHash){
auto table = cudahash::HashTable();
table.Initialize(numAct, 2.0);
Index *d_values = nullptr;
cudaMalloc((void**)&d_values, sizeof(Index) * numAct);
auto table = cuhash::HashTable();
// std::cout << "create " << numAct << " size table..." << std::endl;
table.Initialize(numAct, 2.0, 4);
unsigned *d_values = nullptr;
cudaMalloc((void**)&d_values, sizeof(unsigned) * numAct);
TV_CHECK_CUDA_ERR_V2("cudaMalloc failed");
arangeKernel<Index><<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
arangeKernel<unsigned><<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(d_values, numAct);
bool res = table.Build(numAct, reinterpret_cast<unsigned*>(indicePairUnique.data()),
reinterpret_cast<unsigned*>(d_values));
TV_ASSERT_RT_ERR(res, "err");
d_values);
cudaFree(d_values);
if (!res){
return -1; //use -1 to tell outside use CPU implementation
}
assignIndiceOutKernel<Index, NDim>
<<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesOut, numAct,
indicePairUnique, outSpatialShape, batchSize);
TV_CHECK_CUDA_ERR_V2("assignGridAndIndiceOutKernel failed");
cudaFree(d_values);
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
......@@ -149,8 +153,9 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
return 0;
// auto timer = spconv::CudaContextTimer<>();
if (useHash){
auto table = cudahash::HashTable();
table.Initialize(numActIn, 2.0);
auto table = cuhash::HashTable();
// std::cout << "subm create " << numActIn << " size table..." << std::endl;
table.Initialize(numActIn, 2.0, 4);
unsigned *d_keyvalues = nullptr;
cudaMalloc((void**)&d_keyvalues, sizeof(unsigned) * numActIn * 2);
unsigned *d_values = d_keyvalues + numActIn;
......@@ -160,8 +165,10 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
TV_CHECK_CUDA_ERR_V2("prepareSubMHashKernel failed");
bool res = table.Build(numActIn, reinterpret_cast<unsigned*>(d_keyvalues),
reinterpret_cast<unsigned*>(d_values));
TV_ASSERT_RT_ERR(res, "err");
cudaFree(d_keyvalues);
if (!res){
return -1; //use -1 to tell outside use CPU implementation
}
auto tableSize = table.get_table_size();
auto tableData = table.data();
auto constants = table.get_constants_4();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment