Commit 01ed382c authored by yan.yan's avatar yan.yan
Browse files

working on tensor core test

parent 3517290c
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.h
*
* @brief Header for a basic hash table that stores one value per key.
*/
#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
#include "definitions.h"
#include "hash_functions.h"
#include <cstdio>
/** \addtogroup cudpp_app
* @{
*/
/** \addtogroup cudpp_hash_data_structures
* @{
*/
/* --------------------------------------------------------------------------
Doxygen definitions.
-------------------------------------------------------------------------- */
/*! @namespace CudaHT
* @brief Encapsulates the hash table library.
*/
/*! @namespace CuckooHashing
* @brief Encapsulates the cuckoo hash table that uses stashes.
*/
/* -------------------------------------------------------------------------
Hash table code.
------------------------------------------------------------------------- */
namespace cuhash {
//! Compute how many thread blocks are required for the given number of threads.
dim3 ComputeGridDim(unsigned threads);
//! Compute how long an eviction chain is allowed to become for a given input
//! size.
/*! \param[in] num_keys Number of keys in the input.
* \param[in] table_size Number of slots in the hash table.
* \param[in] num_functions Number of hash functions being used.
* \returns The number of iterations that should be allowed.
*
* The latter two parameters are only needed when using an empirical
* formula for computing the chain length.
*/
unsigned ComputeMaxIterations(const unsigned num_keys,
const unsigned table_size,
const unsigned num_functions);
//! Basic hash table that stores one value for each key.
/*! The input consists of two unsigned arrays of keys and values.
* None of the keys are expected to be repeated.
*
* @todo Templatize the interface without forcing the header file to
* have CUDA calls.
* @ingroup cudpp_app
*/
class HashTable {
public:
HashTable();
virtual ~HashTable() { Release(); }
//! Initialize the hash table's memory. Must be called before \ref
//! Build() and after the random number generator has been seeded.
/*! @param[in] max_input_size Largest expected number of items in the input.
* @param[in] space_usage Size of the hash table relative to the
* input. Bigger tables are faster to build
* and retrieve from.
* @param[in] num_functions Number of hash functions to use. May be
* 2-5. More hash functions make it easier
* to build the table, but increase
* retrieval times.
* @returns Whether the hash table was initialized successfully (true)
* or not (false).
*
* The minimum space usage is dependent on the number of functions
* being used; for two through five functions, the minimum space
* usage is 2.1, 1.1, 1.03, and 1.02 respectively.
*/
virtual bool Initialize(const unsigned max_input_size,
const float space_usage = 1.25,
const unsigned num_functions = 4);
//! Free all memory.
virtual void Release();
//! Build the hash table.
/*! @param[in] input_size Number of key-value pairs being inserted.
* @param[in] d_keys Device memory array containing all of the input
* keys.
* @param[in] d_vals Device memory array containing the keys' values.
* @returns Whether the hash table was built successfully (true) or
* not (false).
*
* Several attempts are allowed to build the hash table in case of failure.
* The input keys are expected to be completely unique.
* To reduce the chance of a failure, increase the space usage or number of
* functions.
* Keys are not allowed to be equal to cuhash::kKeyEmpty.
*/
virtual bool Build(const unsigned input_size, const unsigned *d_keys,
const unsigned *d_vals);
//! Query the hash table.
/*! @param[in] n_queries Number of keys in the query set.
* @param[in] d_query_keys Device memory array containing all of
* the query keys.
* @param[in] d_query_results Values for the query keys.
*
* kNotFound is returned for any query key that failed to be found
* in the table.
*/
virtual void Retrieve(const unsigned n_queries, const unsigned *d_query_keys,
unsigned *d_query_results);
//! @name Accessors
/// @brief Mainly needed to use the __device__ CudaHT::retrieve()
/// function directly.
/// @{
//! Returns how many slots the hash table has.
inline unsigned get_table_size() const { return table_size_; }
//! Returns how many items are stored in the stash.
inline unsigned get_stash_count() const { return stash_count_; }
//! Returns the constants used by the stash.
inline uint2 get_stash_constants() const { return stash_constants_; }
//! Returns the hash table contents.
inline const Entry *get_contents() const { return d_contents_; }
//! Returns the number of hash functions being used.
inline unsigned get_num_hash_functions() const { return num_hash_functions_; }
//! When using two hash functions, returns the constants.
inline Functions<2> get_constants_2() const { return constants_2_; }
//! When using three hash functions, returns the constants.
inline Functions<3> get_constants_3() const { return constants_3_; }
//! When using four hash functions, returns the constants.
inline Functions<4> get_constants_4() const { return constants_4_; }
//! When using five hash functions, returns the constants.
inline Functions<5> get_constants_5() const { return constants_5_; }
/// @}
inline Entry *data() { return d_contents_; }
inline const Entry *data() const { return d_contents_; }
protected:
unsigned table_size_; //!< Size of the hash table.
unsigned num_hash_functions_; //!< Number of hash functions being used.
Entry *d_contents_; //!< Device memory: The hash table contents. The stash is
//!< stored at the end.
unsigned stash_count_; //!< Number of key-value pairs currently stored.
uint2 stash_constants_; //!< Hash function constants for the stash.
Functions<2> constants_2_; //!< Constants for a set of two hash functions.
Functions<3> constants_3_; //!< Constants for a set of three hash functions.
Functions<4> constants_4_; //!< Constants for a set of four hash functions.
Functions<5> constants_5_; //!< Constants for a set of five hash functions.
unsigned *d_failures_; //!< Device memory: General use error flag.
};
/*! @name Internal
* @{
*/
namespace CUDAWrapper {
//! Fills a 64-bit array with a particular value.
void ClearTable(const unsigned slots_in_table, const Entry fill_value,
Entry *d_array);
//! Calls the Cuckoo Hash construction kernel.
void CallCuckooHash(const unsigned n_entries, const unsigned num_hash_functions,
const unsigned *d_keys, const unsigned *d_values,
const unsigned table_size, const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const unsigned max_iteration_attempts, Entry *d_contents,
uint2 stash_constants, unsigned *d_stash_count,
unsigned *d_failures, unsigned *d_iterations_taken);
//! Calls the kernel that performs retrievals.
void CallHashRetrieve(const unsigned n_queries,
const unsigned num_hash_functions,
const unsigned *keys_in, const unsigned table_size,
const Entry *table, const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const uint2 stash_constants, const unsigned stash_count,
unsigned *values_out);
}; // namespace CUDAWrapper
/// @}
}; // namespace cuhash
/** @} */ // end hash table data structures
/** @} */ // end cudpp_app
#endif
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is used for c++ unit test, but pytorch jit ops don't support c++
// debug build.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
template <class T, class... TArgs>
int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename T> int getSize(std::vector<T> arg) { return arg.size(); }
template <int Idx, class TT, class T>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class T, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BOX_IOU_H
#define BOX_IOU_H
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
namespace spconv {
// #include "voxelnet/core/cc/pybind11_helper.h"
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType, typename ShapeContainer>
inline py::array_t<DType> constant(ShapeContainer shape, DType value) {
// create ROWMAJOR array.
py::array_t<DType> array(shape);
std::fill(array.mutable_data(), array.mutable_data() + array.size(), value);
return array;
}
template <typename DType>
inline py::array_t<DType> zeros(std::vector<long int> shape) {
return constant<DType, std::vector<long int>>(shape, 0);
}
template <typename DType>
py::array_t<DType>
rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou, DType standup_thresh) {
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
DType inter_area, union_area;
auto box_corners_r = box_corners.template unchecked<3>();
auto qbox_corners_r = qbox_corners.template unchecked<3>();
auto standup_iou_r = standup_iou.template unchecked<2>();
auto N = box_corners_r.shape(0);
auto K = qbox_corners_r.shape(0);
py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
auto overlaps_rw = overlaps.template mutable_unchecked<2>();
if (N == 0 || K == 0) {
return overlaps;
}
for (int k = 0; k < K; ++k) {
for (int n = 0; n < N; ++n) {
if (standup_iou_r(n, k) <= standup_thresh)
continue;
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
bg::union_(poly, qpoly, poly_union);
if (!poly_union.empty()) {
union_area = bg::area(poly_union.front());
overlaps_rw(n, k) = inter_area / union_area;
}
poly_union.clear();
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return overlaps;
}
template <typename DType>
py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou,
DType standup_thresh) {
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
DType inter_area, union_area;
auto box_corners_r = box_corners.template unchecked<3>();
auto qbox_corners_r = qbox_corners.template unchecked<3>();
auto standup_iou_r = standup_iou.template unchecked<2>();
auto N = box_corners_r.shape(0);
auto K = qbox_corners_r.shape(0);
py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
auto overlaps_rw = overlaps.template mutable_unchecked<2>();
if (N == 0 || K == 0) {
return overlaps;
}
for (int k = 0; k < K; ++k) {
for (int n = 0; n < N; ++n) {
if (standup_iou_r(n, k) <= standup_thresh)
continue;
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
overlaps_rw(n, k) = inter_area;
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return overlaps;
}
} // namespace spconv
#endif
\ No newline at end of file
#pragma once
#include <cublas_v2.h>
#include <tensorview/tensorview.h>
namespace spconv {
template <class T>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const T *alpha, const T *A, int lda, const T *B,
int ldb, const T *beta, T *C, int ldc);
template <class T>
cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const T *alpha, const T *A, int lda, const T *B,
int ldb, const T *beta, T *C, int ldc) {
return cublasTgemm<T>(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
beta, C, ldc);
}
template <class T> inline T constant_scalar(float data) { return T(data); }
template <class T>
cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
const tv::TensorView<T> A, const tv::TensorView<T> B,
tv::TensorView<T> C) {
TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = transa ? A.dim(1) : A.dim(0);
int n = transb ? B.dim(0) : B.dim(1);
int ka = transa ? A.dim(0) : A.dim(1);
int kb = transb ? B.dim(1) : B.dim(0);
int lda = transa ? m : ka;
int ldb = transb ? ka : n;
int ldc = n;
TV_ASSERT_RT_ERR(ka == kb, "error");
T alpha = constant_scalar<T>(1);
T beta = constant_scalar<T>(0);
return cublasTgemmRow<T>(handle, transa_cublas, transb_cublas, m, n, ka,
&alpha, A.data(), lda, B.data(), ldb, &beta,
C.data(), ldc);
}
} // namespace spconv
/*
BSD License
For SparseConvNet software
Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name Facebook nor the names of its contributors may be used to
endorse or promote products derived from this software without specific
prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#define TACC double
template <typename T, int32_t K, int32_t V>
__global__ void
dConvolution_KMxKN_forwardA(T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride) {
// nHot must be a multiple of K!!
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
int32_t M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
int32_t n = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
int32_t R0[V];
int32_t R1[V];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t m = 0; m < M; m++) {
// Read w
#pragma unroll
for (int32_t v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int32_t v = 0; v < V; v++) {
R0[v] = rulesIn[s + ty[v]];
R1[v] = rulesOut[s + ty[v]];
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int32_t v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
template <typename T, int32_t K, int32_t V>
__global__ void
dConvolution_KMxKN_forwardB(T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x KM -> nHot x KN - parallel over N,nHot - loop over M
int32_t M = input_nPlanes / K;
// N = gridDim.y == output_nPlanes/K
int32_t n = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
int32_t R0[V];
int32_t R1[V];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t m = 0; m < M; m++) {
// Read w
#pragma unroll
for (int32_t v = 0; v < V; v++)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R0[v] = rulesIn[s + ty[v]];
R1[v] = rulesOut[s + ty[v]];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot)
O[v] += outFeatures[R1[v] * output_stride + tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot)
outFeatures[R1[v] * output_stride + tx] = O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
int32_t o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_forwardA<T, K, V> \
<<<dim3(std::min(o / K, (int32_t)512), output_nPlanes / K, \
nGroups), \
dim3(K, K / V), 0, s>>>( \
inFeatures, outFeatures, w, rulesIn, rulesOut, o, \
input_nPlanes, input_stride, output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_forwardB<T, K, V> \
<<<dim3(1, output_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
inFeatures, outFeatures, w, rulesIn + o, rulesOut + o, \
nHot - o, input_nPlanes, input_stride, output_nPlanes, \
output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_forward(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride,
int32_t nGroups) {
FOO(T, 64, 16)
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
template <>
void dConvolution_forward<double>(cudaStream_t s, double *inFeatures,
double *outFeatures, double *w,
int32_t *rulesIn, int32_t *rulesOut,
int32_t nHot, int32_t input_nPlanes,
int32_t input_stride, int32_t output_nPlanes,
int32_t output_stride, int32_t nGroups) {
FOO(double, 32, 8)
FOO(double, 16, 4)
FOO(double, 8, 2)
assert(false);
}
#undef FOO
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, int32_t K, int32_t V>
__global__ void dConvolution_KMxKN_backward_dW_A(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
// M = gridDim.y == input_nPlanes / K
int32_t N = output_nPlanes / K;
int32_t m = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
int32_t R0[V];
int32_t R1[V];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int32_t v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int32_t v = 0; v < V; v++) {
R0[v] = rulesIn[s + ty[v]];
R1[v] = rulesOut[s + ty[v]];
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int32_t v = 0; v < V; v++) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, int32_t K, int32_t V>
__global__ void dConvolution_KMxKN_backward_dW_B(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
// M = gridDim.y == input_nPlanes / K
int32_t N = output_nPlanes / K;
int32_t m = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
int32_t R0[V];
int32_t R1[V];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t n = 0; n < N; n++) {
// Read w, reset dW
#pragma unroll
for (int32_t v = 0; v < V; v++) {
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (s + ty[v] < nHot) {
R0[v] = rulesIn[s + ty[v]];
R1[v] = rulesOut[s + ty[v]];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot) {
I[ty[v]][tx] = inFeatures[R0[v] * input_stride + tx];
dO[ty[v]][tx] = dOutFeatures[R1[v] * output_stride + tx];
} else {
I[ty[v]][tx] = 0;
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++) {
dI[v] += dO[ty[v]][k] * W[tx][k];
dW[v] += I[k][ty[v]] * dO[k][tx];
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot)
dI[v] += dInFeatures[R0[v] * input_stride + tx];
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (s + ty[v] < nHot)
dInFeatures[R0[v] * input_stride + tx] = dI[v];
__syncthreads();
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
#define FOO(T, K, V) \
{ \
if (input_nPlanes % K == 0 and output_nPlanes % K == 0) { \
int32_t o = (nHot / K) * K; \
if (o >= K) \
dConvolution_KMxKN_backward_dW_A<T, K, V> \
<<<dim3(std::min(o / K, (int32_t)512), input_nPlanes / K, \
nGroups), \
dim3(K, K / V), 0, s>>>(inFeatures, dInFeatures, dOutFeatures, \
w, dw, rulesIn, rulesOut, o, \
input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
if (nHot > o) \
dConvolution_KMxKN_backward_dW_B<T, K, V> \
<<<dim3(1, input_nPlanes / K, nGroups), dim3(K, K / V), 0, s>>>( \
inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn + o, \
rulesOut + o, nHot - o, input_nPlanes, input_stride, \
output_nPlanes, output_stride); \
return; \
} \
}
template <typename T>
void dConvolution_backward_dW(cudaStream_t s, T *inFeatures, T *dInFeatures,
T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride,
int32_t nGroups) {
FOO(T, 32, 8)
FOO(T, 16, 4)
FOO(T, 8, 2)
assert(false);
}
#undef FOO
template <typename T, int32_t K, int32_t V>
__global__ void
dConvolution_KMxKN_forward2(T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride) {
// Input x Weight -> Output
// blockDim=(K,K/V,1), gridDim=(nBlocks,N,nGroups) Volkov-blocks
// K is a multiple of V,
// nHot x input_nplanes<=KM -> nHot x output_nPlanes<=KN
// - parallel over N,nHot - loop over M
int32_t M = (input_nPlanes + K - 1) / K;
// N = gridDim.y ~ output_nPlanes/K
int32_t n = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += g * input_nPlanes;
outFeatures += n * K + g * output_nPlanes;
w += n * K + g * input_nPlanes * output_nPlanes;
int32_t KO = min(K, output_nPlanes - K * n);
TACC O[V];
__shared__ T W[K][K];
__shared__ T I[K][K];
__shared__ int32_t R[K * 2];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t m = 0; m < M; m++) {
int32_t KI = min(K, input_nPlanes - K * m);
// Read w
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (ty[v] < 1) {
if (s + tx < nHot) {
R[2 * tx] = rulesIn[s + tx];
R[2 * tx + 1] = rulesOut[s + tx];
}
// R[q] = rules[2 * s + q];
}
}
__syncthreads();
// Read input, reset O[]
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
O[v] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < KI; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
O[v] += I[ty[v]][k] * W[k][tx];
__syncthreads();
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (tx < KO and s + ty[v] < nHot)
outFeatures[R[2 * ty[v] + 1] * output_stride + tx] += O[v];
__syncthreads();
}
w += K * output_nPlanes;
inFeatures += K;
}
}
// dOutput x W^T -> dInput and
// Input^T x dOutput -> dW
// blockDim=(K,K/V,1), gridDim=(nBlocks,M,nGroups)
template <typename T, int32_t K, int32_t V>
__global__ void dConvolution_KMxKN_backward_dW2(
T *inFeatures, T *dInFeatures, T *dOutFeatures, T *w, T *dw,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot, int32_t input_nPlanes,
int32_t input_stride, int32_t output_nPlanes, int32_t output_stride) {
// M = gridDim.y == input_nPlanes / K
int32_t N = (output_nPlanes + K - 1) / K;
int32_t m = blockIdx.y;
int32_t g = blockIdx.z;
inFeatures += m * K + g * input_nPlanes;
dInFeatures += m * K + g * input_nPlanes;
dOutFeatures += g * output_nPlanes;
w += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
dw += m * K * output_nPlanes + g * input_nPlanes * output_nPlanes;
int32_t KI = min(K, input_nPlanes - K * m);
TACC dI[V];
TACC dW[V];
__shared__ T I[K][K];
__shared__ T dO[K][K];
__shared__ T W[K][K];
__shared__ int32_t R[K * 2];
const int32_t tx = threadIdx.x;
int32_t ty[V];
#pragma unroll
for (int32_t v = 0; v < V; v++)
ty[v] = threadIdx.y + v * (K / V);
for (int32_t n = 0; n < N; n++) {
int32_t KO = min(K, output_nPlanes - K * n);
// Read w, reset dW
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (ty[v] < KI and tx < KO)
W[ty[v]][tx] = w[ty[v] * output_nPlanes + tx];
dW[v] = 0;
}
for (int32_t s = blockIdx.x * K; s < nHot; s += K * gridDim.x) {
// Read rules for K input/output pairs, reset dI[]
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (ty[v] < 1) {
if (s + tx < nHot) {
R[2 * tx] = rulesIn[s + tx];
R[2 * tx + 1] = rulesOut[s + tx];
}
// R[q] = rules[2 * s + q];
}
dI[v] = 0;
}
__syncthreads();
// Read input and dOutput
#pragma unroll
for (int32_t v = 0; v < V; v++) {
if (tx < KI and s + ty[v] < nHot)
I[ty[v]][tx] = inFeatures[R[2 * ty[v]] * input_stride + tx];
else
I[ty[v]][tx] = 0;
if (tx < KO and s + ty[v] < nHot)
dO[ty[v]][tx] = dOutFeatures[R[2 * ty[v] + 1] * output_stride + tx];
else
dO[ty[v]][tx] = 0;
}
__syncthreads();
#pragma unroll
for (int32_t k = 0; k < KO; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
dI[v] += dO[ty[v]][k] * W[tx][k];
#pragma unroll
for (int32_t k = 0; k < K; k++)
#pragma unroll
for (int32_t v = 0; v < V; v++)
dW[v] += I[k][ty[v]] * dO[k][tx];
__syncthreads();
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (tx < KI and s + ty[v] < nHot)
dInFeatures[R[2 * ty[v]] * input_stride + tx] += dI[v];
__syncthreads();
}
#pragma unroll
for (int32_t v = 0; v < V; v++)
if (ty[v] < KI and tx < KO)
atomicAdd(&dw[ty[v] * output_nPlanes + tx], dW[v]);
w += K;
dw += K;
dOutFeatures += K;
}
}
template <typename T>
void dConvolution_forward2(cudaStream_t s, T *inFeatures, T *outFeatures, T *w,
int32_t *rulesIn, int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride,
int32_t nGroups) {
int32_t c = input_nPlanes * output_nPlanes * nGroups;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int32_t K = 16;
const int32_t V = 4;
dConvolution_KMxKN_forward2<T, K, V>
<<<dim3(128, (output_nPlanes + K - 1) / K, nGroups), dim3(K, K / V), 0,
s>>>(inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
input_nPlanes, input_stride, output_nPlanes, output_stride);
} else {
dConvolution_forward(s, inFeatures, outFeatures, w, rulesIn, rulesOut, nHot,
input_nPlanes, input_stride, output_nPlanes,
output_stride, nGroups);
}
}
template <typename T>
void dConvolution_backward_dW2(cudaStream_t s, T *inFeatures, T *dInFeatures,
T *dOutFeatures, T *w, T *dw, int32_t *rulesIn,
int32_t *rulesOut, int32_t nHot,
int32_t input_nPlanes, int32_t input_stride,
int32_t output_nPlanes, int32_t output_stride,
int32_t nGroups) {
int32_t c = input_nPlanes * output_nPlanes * nGroups;
if (input_nPlanes % 8 != 0 or output_nPlanes % 8 != 0) {
const int32_t K = 16;
const int32_t V = 4;
dConvolution_KMxKN_backward_dW2<T, K, V>
<<<dim3(128, (input_nPlanes + K - 1) / K, nGroups), dim3(K, K / V), 0,
s>>>(inFeatures, dInFeatures, dOutFeatures, w, dw, rulesIn, rulesOut,
nHot, input_nPlanes, input_stride, output_nPlanes,
output_stride);
} else {
dConvolution_backward_dW(s, inFeatures, dInFeatures, dOutFeatures, w, dw,
rulesIn, rulesOut, nHot, input_nPlanes,
input_stride, output_nPlanes, output_stride,
nGroups);
}
}
#undef TACC
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda_runtime_api.h>
#include <tensorview/tensor.h>
#include <torch/script.h>
namespace spconv {
enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine };
using all_fused_conv_algos_t =
tv::mp_list_c<int, kFSparseConvNet, kFMinkowskiEngine>;
void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
torch::Tensor dout, torch::Tensor filters,
torch::Tensor dfilters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
void fused_conv_backward_cuda_minkowski(torch::Tensor features,
torch::Tensor din, torch::Tensor dout,
torch::Tensor filters,
torch::Tensor dfilters,
torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
template <int Algo> struct FusedConvDispatch;
template <> struct FusedConvDispatch<kFSparseConvNet> {
constexpr static auto *fwd = fused_conv_cuda;
constexpr static auto *bwd = fused_conv_backward_cuda;
};
template <> struct FusedConvDispatch<kFMinkowskiEngine> {
constexpr static auto *fwd = fused_conv_cuda_minkowski;
constexpr static auto *bwd = fused_conv_backward_cuda_minkowski;
};
} // namespace spconv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FUSED_SPARSE_CONV_OP_H_
#define FUSED_SPARSE_CONV_OP_H_
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
torch::Tensor
fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
torch::Tensor bias, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse],
nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse],
nHot);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <iostream>
#include <limits>
#include <tensorview/tensorview.h>
#include <tsl/robin_map.h>
#include <unordered_map>
namespace spconv {
namespace detail {
template <typename T> struct ToUnsigned;
template <> struct ToUnsigned<int> { using type = uint32_t; };
template <> struct ToUnsigned<long> { using type = uint64_t; };
template <typename T> struct FNVInternal;
template <> struct FNVInternal<uint32_t> {
constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
constexpr static uint32_t prime = 0x01000193;
};
template <> struct FNVInternal<uint64_t> {
constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
constexpr static uint64_t prime = 0x100000001b3;
};
} // namespace detail
template <typename T>
using to_unsigned_t = typename detail::ToUnsigned<std::remove_const_t<T>>::type;
template <typename T> struct FNV1a : detail::FNVInternal<T> {
std::size_t operator()(const T *data, std::size_t size) {
to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
for (std::size_t i = 0; i < size; ++i) {
hash *= detail::FNVInternal<T>::prime;
hash ^= static_cast<to_unsigned_t<T>>(data[i]);
}
return hash;
}
};
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
const Index *kernelSize,
const Index *stride, const Index *padding,
const Index *dilation,
const Index *outSpatialShape, Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
stride[i] + padding[i]) /
stride[i];
uppers[i] = (input_pos[i] + padding[i]) / stride[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid)
++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
const Index *input_pos, const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation, const Index *outSpatialShape,
Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = input_pos[i] * stride[i] - padding[i];
uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (val - lowers[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid)
++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <cuhash/hash_table.cuh>
#include <spconv/geometry.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/tensorview.h>
namespace spconv {
template <bool UseDeconv, typename Index, unsigned NDim>
struct ConvIndiceDispatch;
template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<true, Index, NDim> {
constexpr static auto *func = getValidOutPosTranspose<Index, NDim>;
};
template <typename Index, unsigned NDim>
struct ConvIndiceDispatch<false, Index, NDim> {
constexpr static auto *func = getValidOutPos<Index, NDim>;
};
template <typename Index, unsigned NDim, bool UseDeconv,
int KernelMaxVolume = 256, typename Index1D = int>
__global__ void prepareIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = ConvIndiceDispatch<UseDeconv, Index, NDim>::func(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(0, offset, oldNum) = ix;
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
indicePairs(1, offset, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
int numAct, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
Index index;
auto indicesOutPtr = indicesOut.data();
for (int ix : tv::KernelLoopX<int>(numAct)) {
index = indicePairUnique[ix];
gridsOut[index] = ix;
index = tv::rowArrayIdxInv<Index, NDim>(
index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
indicesOut[ix * (NDim + 1)] = index % batchSize;
}
}
template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
__global__ void
assignIndiceOutKernel(tv::TensorView<Index> indicesOut, int numAct,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
int batchSize) {
Index index;
auto indicesOutPtr = indicesOut.data();
for (unsigned ix : tv::KernelLoopX<unsigned>(numAct)) {
index = indicePairUnique[ix];
index = tv::rowArrayIdxInv<Index, NDim>(
index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
indicesOut[ix * (NDim + 1)] = index % batchSize;
}
}
template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
__global__ void
assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
unsigned table_size, const cuhash::Entry *table,
cuhash::Functions<kNumHashFunctions> constants,
uint2 stash_constants, unsigned stash_count) {
Index index;
int kernelVolume = indicePairs.dim(1);
auto indicePairsOut = indicePairs.subview(1);
for (int ix : tv::KernelLoopX<int>(numActIn)) {
for (int i = 0; i < kernelVolume; ++i) {
index = indicePairsOut(i, ix);
if (index > -1) {
auto val = cuhash::retrieve((unsigned)(index), table_size, table,
constants, stash_constants, stash_count);
assert(val != cuhash::kNotFound);
indicePairsOut(i, ix) = (unsigned)val;
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut, int numActIn,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
Index index;
int kernelVolume = indicePairs.dim(1);
auto indicePairsOut = indicePairs.subview(1);
for (int ix : tv::KernelLoopX<int>(numActIn)) {
for (int i = 0; i < kernelVolume; ++i) {
index = indicePairsOut(i, ix);
if (index > -1) {
indicePairsOut(i, ix) = gridsOut[index];
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsLimitedKernel(tv::TensorView<IndexGrid> gridsOut, int numActIn,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum) {
Index index, val;
int kernelVolume = indicePairs.dim(0);
for (int ix : tv::KernelLoopX<int>(numActIn)) {
for (int i = 0; i < kernelVolume; ++i) {
index = indicePairs(i, 1, ix);
if (index != -1) {
val = gridsOut[index];
if (val != -1) {
auto oldNum = atomicAdd(indiceNum.data() + i, Index(1));
indicePairs(i, 0, oldNum) = indicePairs(i, 0, ix);
indicePairs(i, 1, oldNum) = val;
}
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape, Index spatialVolume) {
auto numActIn = indicesIn.dim(0);
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
index =
tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
gridsOut[index] = ix;
}
}
template <typename Index, unsigned NDim>
__global__ void
prepareSubMHashKernel(tv::TensorView<const Index> indicesIn, unsigned *keys,
unsigned *values,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
outSpatialShape.data()) +
spatialVolume * indicesIn(ix, 0);
keys[ix] = index;
values[ix] = ix;
}
}
template <typename Index, typename IndexGrid, unsigned NDim,
int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (int i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
if (gridsOut[index] > -1) {
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
}
}
}
}
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
unsigned K2>
__global__ void getSubMIndicePairsUnrollKernel3(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
auto numActIn = indicesIn.dim(0);
Index point[3];
Index index = 0;
Index offset;
constexpr unsigned KV = K0 * K1 * K2;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (3 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
#pragma unroll
for (int k = 0; k < K2; ++k) {
offset = i * K1 * K2 + j * K2 + k;
if (offset > center) {
continue;
}
if (center == offset) {
// center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix;
} else {
point[2] = indice_data[3] - k + K2 / 2;
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
point[2] >= 0 && point[2] < outSpatialShape[2] &&
point[0] >= 0 && point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
if (gridsOut[index] != -1) {
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
// - 1]
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
}
}
}
}
}
}
}
}
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
__global__ void getSubMIndicePairsUnrollKernel2(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) {
auto numActIn = indicesIn.dim(0);
Index point[2];
Index index = 0;
Index offset;
constexpr unsigned KV = K0 * K1;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (2 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
offset = i * K1 + j;
if (offset > center) {
continue;
}
if (center == offset) {
// center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix;
} else {
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
if (gridsOut[index] > -1) {
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
}
}
}
}
}
}
}
template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, unsigned table_size,
const cuhash::Entry *table, cuhash::Functions<kNumHashFunctions> constants,
uint2 stash_constants, unsigned stash_count) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (int i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
auto val = cuhash::retrieve((unsigned)(index), table_size, table,
constants, stash_constants, stash_count);
if (val != cuhash::kNotFound) {
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(1, offset, oldNum) = val;
indicePairs(0, offset, oldNum) = ix;
}
}
}
}
template <typename Index, unsigned K0, unsigned K1, unsigned K2,
unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashUnrollKernel3(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume,
unsigned table_size, const cuhash::Entry *table,
cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
unsigned stash_count) {
auto numActIn = indicesIn.dim(0);
Index index = 0;
Index offset;
Index point[3];
constexpr unsigned KV = K0 * K1 * K2;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (3 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
#pragma unroll
for (int k = 0; k < K2; ++k) {
offset = i * K1 * K2 + j * K2 + k;
if (offset > center) {
continue;
}
if (center == offset) {
// center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix;
} else {
point[2] = indice_data[3] - k + K2 / 2;
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] &&
point[2] >= 0 && point[2] < outSpatialShape[2] &&
point[0] >= 0 && point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
auto val =
cuhash::retrieve((unsigned)(index), table_size, table,
constants, stash_constants, stash_count);
if (val != cuhash::kNotFound) {
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i
// - 1]
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = val;
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = val;
}
}
}
}
}
}
}
}
template <typename Index, unsigned K0, unsigned K1,
unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashUnrollKernel2(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume,
unsigned table_size, const cuhash::Entry *table,
cuhash::Functions<kNumHashFunctions> constants, uint2 stash_constants,
unsigned stash_count) {
auto numActIn = indicesIn.dim(0);
Index index = 0;
Index offset;
Index point[2];
constexpr unsigned KV = K0 * K1;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (2 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
offset = i * K1 + j;
if (offset > center) {
continue;
}
if (center == offset) {
// center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix;
} else {
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
auto val =
cuhash::retrieve((unsigned)(index), table_size, table,
constants, stash_constants, stash_count);
if (val != cuhash::kNotFound) {
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i -
// 1]
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = val;
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = val;
}
}
}
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
tv::TensorView<IndexGrid> gridsOut,
int numAct) {
for (int ix : tv::KernelLoopX<int>(numAct)) {
gridsOut[indicePairUnique[ix]] = -1;
}
}
template <typename T> __global__ void arangeKernel(T *data, int size) {
for (int ix : tv::KernelLoopX<int>(size)) {
data[ix] = ix;
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape,
int numAct, Index spatialVolume) {
auto indsPtr = indices;
Index index;
for (int ix : tv::KernelLoopX<int>(numAct)) {
indsPtr = indices + ix * (NDim + 1);
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
indsPtr + 1, outSpatialShape.data(), 0);
gridsOut[index + spatialVolume * indsPtr[0]] = -1;
}
}
} // namespace spconv
#undef atomicAdd
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
int create_conv_indice_pair_p1_cuda(
torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose);
int create_conv_indice_pair_p2_cuda(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
int create_submconv_indice_pair_cuda(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
int create_conv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
bool useHash);
int create_submconv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {
void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
} // namespace spconv
#endif
\ No newline at end of file
/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu).
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
* Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
* of the code.
*/
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void matmul(const Dtype *A, const int wA, const int hA,
const Dtype *B, const int wB, const int hB, Dtype *C,
const Itype *in_map, const Itype *out_map) {
// Use in_feat as A and kernel as B
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. x is for rows, y is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * in_row + s + tx] : 0;
Bs[ty][tx] = ((s + ty) < hB && x < wB) ? B[wB * (s + ty) + x] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < wB)
atomicAdd(&C[wB * out_row + x], Csub);
// C[wB * out_row + x] += Csub;
}
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void matmul2(const Dtype *A, const int wA, const int hA,
const Dtype *B, const int wB, const int hB,
const Dtype *D, const int wD, const int hD, Dtype *C,
Dtype *E, const Itype *in_map, const Itype *out_map) {
// Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. y is for rows, x is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
Dtype Esub = 0;
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype BTs[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Ds used to
// store the sub-matrix of D
__shared__ Dtype DTs[BLOCK_SIZE][BLOCK_SIZE];
// For Ds = D^T[...:..., ...:...], use the transposed grid dimension for A
DTs[ty][tx] = (x < wD && y < hD) ? D[wD * in_row + x] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * out_row + s + tx] : 0;
// Transposed kernel
BTs[ty][tx] = ((s + ty) < wB && x < hB) ? B[wB * x + s + ty] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * BTs[k][tx];
}
// For Esub, reset to 0
Esub = 0;
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Esub += DTs[k][ty] * As[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
// For the E matrix which requires accmulation of multiple blocks, use
// atomic addition. This can be replaced with a more sophisticaed reduction
// algorithm.
if ((bx * BLOCK_SIZE + ty) < wD && (s + tx) < wA)
atomicAdd(&E[wA * (bx * BLOCK_SIZE + ty) + (s + tx)], Esub);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < hB)
atomicAdd(&C[hB * in_row + x], Csub);
}
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_CPU_H
#define NMS_CPU_H
#include <pybind11/pybind11.h>
// must include pybind11/stl.h if using containers in STL in arguments.
#include "box_iou.h"
#include "nms_gpu.h"
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <vector>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType>
std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
py::array_t<int> order, DType thresh,
DType eps = 0) {
auto ndets = boxes.shape(0);
auto boxes_r = boxes.template unchecked<2>();
auto order_r = order.template unchecked<1>();
auto suppressed = zeros<int>({int(ndets)});
auto suppressed_rw = suppressed.template mutable_unchecked<1>();
auto area = zeros<DType>({int(ndets)});
auto area_rw = area.template mutable_unchecked<1>();
// get areas
for (int i = 0; i < ndets; ++i) {
area_rw(i) = (boxes_r(i, 2) - boxes_r(i, 0) + eps) *
(boxes_r(i, 3) - boxes_r(i, 1) + eps);
}
std::vector<int> keep;
int i, j;
DType xx1, xx2, w, h, inter, ovr;
for (int _i = 0; _i < ndets; ++_i) {
i = order_r(_i);
if (suppressed_rw(i) == 1)
continue;
keep.push_back(i);
for (int _j = _i + 1; _j < ndets; ++_j) {
j = order_r(_j);
if (suppressed_rw(j) == 1)
continue;
xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
w = xx2 - xx1 + eps;
if (w > 0) {
xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3));
xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1));
h = xx2 - xx1 + eps;
if (h > 0) {
inter = w * h;
ovr = inter / (area_rw(i) + area_rw(j) - inter);
if (ovr >= thresh)
suppressed_rw(j) = 1;
}
}
}
}
return keep;
}
template <typename DType>
std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
py::array_t<int> order,
py::array_t<DType> standup_iou,
DType thresh) {
auto ndets = box_corners.shape(0);
auto box_corners_r = box_corners.template unchecked<3>();
auto order_r = order.template unchecked<1>();
auto suppressed = zeros<int>({int(ndets)});
auto suppressed_rw = suppressed.template mutable_unchecked<1>();
auto standup_iou_r = standup_iou.template unchecked<2>();
std::vector<int> keep;
int i, j;
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
DType inter_area, union_area, overlap;
for (int _i = 0; _i < ndets; ++_i) {
i = order_r(_i);
if (suppressed_rw(i) == 1)
continue;
keep.push_back(i);
for (int _j = _i + 1; _j < ndets; ++_j) {
j = order_r(_j);
if (suppressed_rw(j) == 1)
continue;
if (standup_iou_r(i, j) <= 0.0)
continue;
// std::cout << "pre_poly" << std::endl;
try {
bg::append(poly,
point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
bg::append(poly,
point_t(box_corners_r(i, 1, 0), box_corners_r(i, 1, 1)));
bg::append(poly,
point_t(box_corners_r(i, 2, 0), box_corners_r(i, 2, 1)));
bg::append(poly,
point_t(box_corners_r(i, 3, 0), box_corners_r(i, 3, 1)));
bg::append(poly,
point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 1, 0), box_corners_r(j, 1, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 2, 0), box_corners_r(j, 2, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 3, 0), box_corners_r(j, 3, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
} catch (const std::exception &e) {
std::cout << "box i corners:" << std::endl;
for (int k = 0; k < 4; ++k) {
std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i, k, 1)
<< std::endl;
}
std::cout << "box j corners:" << std::endl;
for (int k = 0; k < 4; ++k) {
std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j, k, 1)
<< std::endl;
}
// throw e;
continue;
}
// std::cout << "post_poly" << std::endl;
// std::cout << "post_intsec" << std::endl;
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
// std::cout << "pre_union" << " " << inter_area << std::endl;
bg::union_(poly, qpoly, poly_union);
/*
if (poly_union.empty()){
std::cout << "intsec area:" << " " << inter_area << std::endl;
std::cout << "box i corners:" << std::endl;
for(int k = 0; k < 4; ++k){
std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i,
k, 1) << std::endl;
}
std::cout << "box j corners:" << std::endl;
for(int k = 0; k < 4; ++k){
std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j,
k, 1) << std::endl;
}
}*/
// std::cout << "post_union" << poly_union.empty() << std::endl;
if (!poly_union.empty()) { // ignore invalid box
union_area = bg::area(poly_union.front());
// std::cout << "post union area" << std::endl;
// std::cout << union_area << "debug" << std::endl;
overlap = inter_area / union_area;
if (overlap >= thresh)
suppressed_rw(j) = 1;
poly_union.clear();
}
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return keep;
}
#ifdef TV_CUDA
constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType>
int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
DType nms_overlap_thresh, int device_id) {
py::buffer_info info = boxes.request();
auto boxes_ptr = static_cast<DType *>(info.ptr);
py::buffer_info info_k = keep_out.request();
auto keep_out_ptr = static_cast<int *>(info_k.ptr);
return _nms_gpu<DType, threadsPerBlock>(keep_out_ptr, boxes_ptr,
boxes.shape(0), boxes.shape(1),
nms_overlap_thresh, device_id);
}
#endif
} // namespace spconv
#endif
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_FUNCTOR_H_
#define NMS_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct NonMaxSupressionFunctor {
Index operator()(const Device &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxes, T threshold, T eps);
};
template <typename Device, typename T, typename Index>
struct rotateNonMaxSupressionFunctor {
Index operator()(const Device &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxCorners,
tv::TensorView<const T> standupIoU, T threshold);
};
} // namespace functor
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
int boxes_dim, DType nms_overlap_thresh, int device_id);
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_TORCH_OP_H_
#define NMS_TORCH_OP_H_
#include <spconv/indice.h>
#include <spconv/nms_functor.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor nonMaxSuppression(torch::Tensor boxes, torch::Tensor scores,
int64_t preMaxSize, int64_t postMaxSize,
double thresh, double eps) {
// auto timer = spconv::CudaContextTimer<>();
tv::check_torch_dtype<T>(boxes);
auto resOptions =
torch::TensorOptions().dtype(torch::kInt64).device(boxes.device());
if (boxes.size(0) == 0) {
return torch::zeros({0}, resOptions);
}
torch::Tensor indices;
if (preMaxSize > 0) {
auto numKeepedScores = scores.size(0);
preMaxSize = std::min(numKeepedScores, preMaxSize);
auto res = torch::topk(scores, preMaxSize);
indices = std::get<1>(res);
boxes = torch::index_select(boxes, 0, indices);
} else {
indices = std::get<1>(torch::sort(scores));
boxes = torch::index_select(boxes, 0, indices);
}
if (boxes.size(0) == 0)
return torch::zeros({0}, resOptions);
auto keep = torch::zeros({boxes.size(0)}, resOptions);
int64_t keepNum = 0;
if (boxes.device().type() == torch::kCPU) {
auto nmsFunctor = functor::NonMaxSupressionFunctor<tv::CPU, T, int64_t>();
keepNum = nmsFunctor(tv::CPU(), tv::torch2tv<int64_t>(keep),
tv::torch2tv<const T>(boxes), T(thresh), T(eps));
} else {
TV_ASSERT_RT_ERR(false, "not implemented");
}
if (postMaxSize <= 0) {
postMaxSize = keepNum;
}
// std::cout << keep << std::endl;
keep = keep.slice(0, 0, std::min(keepNum, postMaxSize));
if (preMaxSize > 0) {
return torch::index_select(indices, 0, keep);
}
return keep;
}
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
#define POINTPILLARS_SCATTER_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct PointPillarScatter {
void operator()(const Device &d, tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors);
};
} // namespace functor
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PILLAR_SCATTER_OP_H_
#define PILLAR_SCATTER_OP_H_
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
torch::Tensor shape) {
TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA, "error");
TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1");
TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1");
TV_ASSERT_RT_ERR(features.size(2) == coors.size(2), "err");
tv::check_torch_dtype<int>(shape);
tv::check_torch_dtype<T>(coors);
auto shapeData = shape.data_ptr<int>();
torch::Tensor canvas =
torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
features.options());
TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
#ifdef TV_CUDA
functor::PointPillarScatter<tv::GPU, T, int> ftor;
ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas),
tv::torch2tv<const T>(features.squeeze()),
tv::torch2tv<const T>(coors.squeeze()));
#endif
return canvas;
}
} // namespace spconv
#endif
\ No newline at end of file
#pragma once
#include <tensorview/kernel_utils.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
template <typename Index, unsigned NDim>
__global__ void scatterPointToGridKernel(
tv::TensorView<const float> points, tv::TensorView<const Index> indexes,
tv::TensorView<float> grids, tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndex,
const tv::SimpleVector<Index, NDim> gridShape) {
Index index;
int numPoints = points.dim(0);
int numFeatures = points.dim(1);
for (int ix : tv::KernelLoopX<int>(numPoints)) {
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
indexes.data() + ix * NDim, gridShape.data(), 0);
pointIndex(ix) = index;
atomicAdd(numPointsPerGrid.data() + index, Index(1));
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
atomicAdd(grids.data() + index * numFeatures + k,
*(points.data() + ix * numFeatures + k));
}
}
}
template <typename Index, unsigned NDim>
__global__ void
gatherPointFromGridKernel(tv::TensorView<const float> grids,
tv::TensorView<const Index> numPointsPerGrid,
tv::TensorView<const Index> pointIndexUnique,
tv::TensorView<float> voxels,
tv::TensorView<Index> coors,
const tv::SimpleVector<Index, NDim> gridShape) {
Index index;
int numVoxels = voxels.dim(0);
int numFeatures = grids.dim(1);
for (int ix : tv::KernelLoopX<int>(numVoxels)) {
index = pointIndexUnique(ix);
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
voxels(ix, k) = grids(index, k) / numPointsPerGrid(index);
}
index = tv::rowArrayIdxInv<Index, NDim>(index, coors.data() + ix * NDim,
gridShape.data());
}
}
template <typename Index>
__global__ void resetGridKernel(tv::TensorView<float> grids,
tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndexUnique) {
Index index;
int numVoxels = pointIndexUnique.dim(0) - 1;
int numFeatures = grids.dim(1);
for (int ix : tv::KernelLoopX<int>(numVoxels)) {
index = pointIndexUnique(ix);
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
grids(index, k) = 0;
numPointsPerGrid(index) = 0;
}
}
}
template <typename Index>
__global__ void resetPointIndexKernel(tv::TensorView<Index> pointIndex,
const Index gridVolume) {
int num_max_points = pointIndex.dim(0) - 1;
for (int ix : tv::KernelLoopX<int>(num_max_points)) {
pointIndex(ix) = gridVolume;
}
}
} // namespace spconv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
// must include pybind11/stl.h if using containers in STL in arguments.
#include <algorithm>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
// #include <vector>
#include <iostream>
#include <math.h>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> voxel_point_mask,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(
py::array_t<DType> points, py::array_t<DType> voxel_point_mask,
py::array_t<DType> voxels, py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask,
py::array_t<DType> mins, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
DType height_threshold, DType height_high_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
DType max_value, min_value;
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int block_shape_H = grid_size[1] / block_factor;
int block_shape_W = grid_size[0] / block_factor;
int voxelidx, num;
int block_coor[2];
int startx, stopx, starty, stopy;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) =
std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor[1] = coors_rw(i, 1);
coor[2] = coors_rw(i, 2);
coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0] - block_size / 2);
stopx =
std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
}
voxel_mask_rw(i) = ((max_value - min_value) > height_threshold) &&
((max_value - min_value) < height_high_threshold);
}
return voxel_num;
}
} // namespace spconv
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment