Commit 01ed382c authored by yan.yan's avatar yan.yan
Browse files

working on tensor core test

parent 3517290c
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision:$
// $Date:$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file hash_table.h
*
* @brief Header for a basic hash table that stores one value per key.
*/
#ifndef CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
#define CUDAHT__CUCKOO__SRC__LIBRARY__HASH_TABLE__H
#include "definitions.h"
#include "hash_functions.h"
#include <cstdio>
/** \addtogroup cudpp_app
* @{
*/
/** \addtogroup cudpp_hash_data_structures
* @{
*/
/* --------------------------------------------------------------------------
Doxygen definitions.
-------------------------------------------------------------------------- */
/*! @namespace CudaHT
* @brief Encapsulates the hash table library.
*/
/*! @namespace CuckooHashing
* @brief Encapsulates the cuckoo hash table that uses stashes.
*/
/* -------------------------------------------------------------------------
Hash table code.
------------------------------------------------------------------------- */
namespace cuhash {
//! Compute how many thread blocks are required for the given number of threads.
dim3 ComputeGridDim(unsigned threads);
//! Compute how long an eviction chain is allowed to become for a given input
//! size.
/*! \param[in] num_keys Number of keys in the input.
* \param[in] table_size Number of slots in the hash table.
* \param[in] num_functions Number of hash functions being used.
* \returns The number of iterations that should be allowed.
*
* The latter two parameters are only needed when using an empirical
* formula for computing the chain length.
*/
unsigned ComputeMaxIterations(const unsigned num_keys,
const unsigned table_size,
const unsigned num_functions);
//! Basic hash table that stores one value for each key.
/*! The input consists of two unsigned arrays of keys and values.
* None of the keys are expected to be repeated.
*
* @todo Templatize the interface without forcing the header file to
* have CUDA calls.
* @ingroup cudpp_app
*/
class HashTable {
public:
HashTable();
virtual ~HashTable() { Release(); }
//! Initialize the hash table's memory. Must be called before \ref
//! Build() and after the random number generator has been seeded.
/*! @param[in] max_input_size Largest expected number of items in the input.
* @param[in] space_usage Size of the hash table relative to the
* input. Bigger tables are faster to build
* and retrieve from.
* @param[in] num_functions Number of hash functions to use. May be
* 2-5. More hash functions make it easier
* to build the table, but increase
* retrieval times.
* @returns Whether the hash table was initialized successfully (true)
* or not (false).
*
* The minimum space usage is dependent on the number of functions
* being used; for two through five functions, the minimum space
* usage is 2.1, 1.1, 1.03, and 1.02 respectively.
*/
virtual bool Initialize(const unsigned max_input_size,
const float space_usage = 1.25,
const unsigned num_functions = 4);
//! Free all memory.
virtual void Release();
//! Build the hash table.
/*! @param[in] input_size Number of key-value pairs being inserted.
* @param[in] d_keys Device memory array containing all of the input
* keys.
* @param[in] d_vals Device memory array containing the keys' values.
* @returns Whether the hash table was built successfully (true) or
* not (false).
*
* Several attempts are allowed to build the hash table in case of failure.
* The input keys are expected to be completely unique.
* To reduce the chance of a failure, increase the space usage or number of
* functions.
* Keys are not allowed to be equal to cuhash::kKeyEmpty.
*/
virtual bool Build(const unsigned input_size, const unsigned *d_keys,
const unsigned *d_vals);
//! Query the hash table.
/*! @param[in] n_queries Number of keys in the query set.
* @param[in] d_query_keys Device memory array containing all of
* the query keys.
* @param[in] d_query_results Values for the query keys.
*
* kNotFound is returned for any query key that failed to be found
* in the table.
*/
virtual void Retrieve(const unsigned n_queries, const unsigned *d_query_keys,
unsigned *d_query_results);
//! @name Accessors
/// @brief Mainly needed to use the __device__ CudaHT::retrieve()
/// function directly.
/// @{
//! Returns how many slots the hash table has.
inline unsigned get_table_size() const { return table_size_; }
//! Returns how many items are stored in the stash.
inline unsigned get_stash_count() const { return stash_count_; }
//! Returns the constants used by the stash.
inline uint2 get_stash_constants() const { return stash_constants_; }
//! Returns the hash table contents.
inline const Entry *get_contents() const { return d_contents_; }
//! Returns the number of hash functions being used.
inline unsigned get_num_hash_functions() const { return num_hash_functions_; }
//! When using two hash functions, returns the constants.
inline Functions<2> get_constants_2() const { return constants_2_; }
//! When using three hash functions, returns the constants.
inline Functions<3> get_constants_3() const { return constants_3_; }
//! When using four hash functions, returns the constants.
inline Functions<4> get_constants_4() const { return constants_4_; }
//! When using five hash functions, returns the constants.
inline Functions<5> get_constants_5() const { return constants_5_; }
/// @}
inline Entry *data() { return d_contents_; }
inline const Entry *data() const { return d_contents_; }
protected:
unsigned table_size_; //!< Size of the hash table.
unsigned num_hash_functions_; //!< Number of hash functions being used.
Entry *d_contents_; //!< Device memory: The hash table contents. The stash is
//!< stored at the end.
unsigned stash_count_; //!< Number of key-value pairs currently stored.
uint2 stash_constants_; //!< Hash function constants for the stash.
Functions<2> constants_2_; //!< Constants for a set of two hash functions.
Functions<3> constants_3_; //!< Constants for a set of three hash functions.
Functions<4> constants_4_; //!< Constants for a set of four hash functions.
Functions<5> constants_5_; //!< Constants for a set of five hash functions.
unsigned *d_failures_; //!< Device memory: General use error flag.
};
/*! @name Internal
* @{
*/
namespace CUDAWrapper {
//! Fills a 64-bit array with a particular value.
void ClearTable(const unsigned slots_in_table, const Entry fill_value,
Entry *d_array);
//! Calls the Cuckoo Hash construction kernel.
void CallCuckooHash(const unsigned n_entries, const unsigned num_hash_functions,
const unsigned *d_keys, const unsigned *d_values,
const unsigned table_size, const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const unsigned max_iteration_attempts, Entry *d_contents,
uint2 stash_constants, unsigned *d_stash_count,
unsigned *d_failures, unsigned *d_iterations_taken);
//! Calls the kernel that performs retrievals.
void CallHashRetrieve(const unsigned n_queries,
const unsigned num_hash_functions,
const unsigned *keys_in, const unsigned table_size,
const Entry *table, const Functions<2> constants_2,
const Functions<3> constants_3,
const Functions<4> constants_4,
const Functions<5> constants_5,
const uint2 stash_constants, const unsigned stash_count,
unsigned *values_out);
}; // namespace CUDAWrapper
/// @}
}; // namespace cuhash
/** @} */ // end hash table data structures
/** @} */ // end cudpp_app
#endif
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is used for c++ unit test, but pytorch jit ops don't support c++
// debug build.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
template <class T, class... TArgs>
int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename T> int getSize(std::vector<T> arg) { return arg.size(); }
template <int Idx, class TT, class T>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class T, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BOX_IOU_H
#define BOX_IOU_H
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
namespace spconv {
// #include "voxelnet/core/cc/pybind11_helper.h"
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType, typename ShapeContainer>
inline py::array_t<DType> constant(ShapeContainer shape, DType value) {
// create ROWMAJOR array.
py::array_t<DType> array(shape);
std::fill(array.mutable_data(), array.mutable_data() + array.size(), value);
return array;
}
template <typename DType>
inline py::array_t<DType> zeros(std::vector<long int> shape) {
return constant<DType, std::vector<long int>>(shape, 0);
}
template <typename DType>
py::array_t<DType>
rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou, DType standup_thresh) {
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
DType inter_area, union_area;
auto box_corners_r = box_corners.template unchecked<3>();
auto qbox_corners_r = qbox_corners.template unchecked<3>();
auto standup_iou_r = standup_iou.template unchecked<2>();
auto N = box_corners_r.shape(0);
auto K = qbox_corners_r.shape(0);
py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
auto overlaps_rw = overlaps.template mutable_unchecked<2>();
if (N == 0 || K == 0) {
return overlaps;
}
for (int k = 0; k < K; ++k) {
for (int n = 0; n < N; ++n) {
if (standup_iou_r(n, k) <= standup_thresh)
continue;
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
bg::union_(poly, qpoly, poly_union);
if (!poly_union.empty()) {
union_area = bg::area(poly_union.front());
overlaps_rw(n, k) = inter_area / union_area;
}
poly_union.clear();
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return overlaps;
}
template <typename DType>
py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou,
DType standup_thresh) {
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
DType inter_area, union_area;
auto box_corners_r = box_corners.template unchecked<3>();
auto qbox_corners_r = qbox_corners.template unchecked<3>();
auto standup_iou_r = standup_iou.template unchecked<2>();
auto N = box_corners_r.shape(0);
auto K = qbox_corners_r.shape(0);
py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
auto overlaps_rw = overlaps.template mutable_unchecked<2>();
if (N == 0 || K == 0) {
return overlaps;
}
for (int k = 0; k < K; ++k) {
for (int n = 0; n < N; ++n) {
if (standup_iou_r(n, k) <= standup_thresh)
continue;
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
overlaps_rw(n, k) = inter_area;
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return overlaps;
}
} // namespace spconv
#endif
\ No newline at end of file
#pragma once
#include <cublas_v2.h>
#include <tensorview/tensorview.h>
namespace spconv {
template <class T>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const T *alpha, const T *A, int lda, const T *B,
int ldb, const T *beta, T *C, int ldc);
template <class T>
cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const T *alpha, const T *A, int lda, const T *B,
int ldb, const T *beta, T *C, int ldc) {
return cublasTgemm<T>(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
beta, C, ldc);
}
template <class T> inline T constant_scalar(float data) { return T(data); }
template <class T>
cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
const tv::TensorView<T> A, const tv::TensorView<T> B,
tv::TensorView<T> C) {
TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = transa ? A.dim(1) : A.dim(0);
int n = transb ? B.dim(0) : B.dim(1);
int ka = transa ? A.dim(0) : A.dim(1);
int kb = transb ? B.dim(1) : B.dim(0);
int lda = transa ? m : ka;
int ldb = transb ? ka : n;
int ldc = n;
TV_ASSERT_RT_ERR(ka == kb, "error");
T alpha = constant_scalar<T>(1);
T beta = constant_scalar<T>(0);
return cublasTgemmRow<T>(handle, transa_cublas, transb_cublas, m, n, ka,
&alpha, A.data(), lda, B.data(), ldb, &beta,
C.data(), ldc);
}
} // namespace spconv
This diff is collapsed.
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda_runtime_api.h>
#include <tensorview/tensor.h>
#include <torch/script.h>
namespace spconv {
enum FusedConvAlgo { kFSparseConvNet, kFMinkowskiEngine };
using all_fused_conv_algos_t =
tv::mp_list_c<int, kFSparseConvNet, kFMinkowskiEngine>;
void fused_conv_cuda(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
void fused_conv_backward_cuda(torch::Tensor features, torch::Tensor din,
torch::Tensor dout, torch::Tensor filters,
torch::Tensor dfilters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
void fused_conv_cuda_minkowski(torch::Tensor output, torch::Tensor features,
torch::Tensor filters, torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
void fused_conv_backward_cuda_minkowski(torch::Tensor features,
torch::Tensor din, torch::Tensor dout,
torch::Tensor filters,
torch::Tensor dfilters,
torch::Tensor indicesIn,
torch::Tensor indicesOut, int nHot);
template <int Algo> struct FusedConvDispatch;
template <> struct FusedConvDispatch<kFSparseConvNet> {
constexpr static auto *fwd = fused_conv_cuda;
constexpr static auto *bwd = fused_conv_backward_cuda;
};
template <> struct FusedConvDispatch<kFMinkowskiEngine> {
constexpr static auto *fwd = fused_conv_cuda_minkowski;
constexpr static auto *bwd = fused_conv_backward_cuda_minkowski;
};
} // namespace spconv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FUSED_SPARSE_CONV_OP_H_
#define FUSED_SPARSE_CONV_OP_H_
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
torch::Tensor
fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
torch::Tensor bias, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse],
nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse],
nHot);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <iostream>
#include <limits>
#include <tensorview/tensorview.h>
#include <tsl/robin_map.h>
#include <unordered_map>
namespace spconv {
namespace detail {
template <typename T> struct ToUnsigned;
template <> struct ToUnsigned<int> { using type = uint32_t; };
template <> struct ToUnsigned<long> { using type = uint64_t; };
template <typename T> struct FNVInternal;
template <> struct FNVInternal<uint32_t> {
constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
constexpr static uint32_t prime = 0x01000193;
};
template <> struct FNVInternal<uint64_t> {
constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
constexpr static uint64_t prime = 0x100000001b3;
};
} // namespace detail
template <typename T>
using to_unsigned_t = typename detail::ToUnsigned<std::remove_const_t<T>>::type;
template <typename T> struct FNV1a : detail::FNVInternal<T> {
std::size_t operator()(const T *data, std::size_t size) {
to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
for (std::size_t i = 0; i < size; ++i) {
hash *= detail::FNVInternal<T>::prime;
hash ^= static_cast<to_unsigned_t<T>>(data[i]);
}
return hash;
}
};
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
const Index *kernelSize,
const Index *stride, const Index *padding,
const Index *dilation,
const Index *outSpatialShape, Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
stride[i] + padding[i]) /
stride[i];
uppers[i] = (input_pos[i] + padding[i]) / stride[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid)
++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
const Index *input_pos, const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation, const Index *outSpatialShape,
Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = input_pos[i] * stride[i] - padding[i];
uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (val - lowers[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid)
++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
} // namespace spconv
#endif
\ No newline at end of file
This diff is collapsed.
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
int create_conv_indice_pair_p1_cuda(
torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose);
int create_conv_indice_pair_p2_cuda(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
int create_submconv_indice_pair_cuda(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
int create_conv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
bool useHash);
int create_submconv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {
void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
} // namespace spconv
#endif
\ No newline at end of file
/* Copyright (c) Chris Choy (chrischoy@ai.stanford.edu).
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
* Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
* of the code.
*/
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void matmul(const Dtype *A, const int wA, const int hA,
const Dtype *B, const int wB, const int hB, Dtype *C,
const Itype *in_map, const Itype *out_map) {
// Use in_feat as A and kernel as B
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. x is for rows, y is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * in_row + s + tx] : 0;
Bs[ty][tx] = ((s + ty) < hB && x < wB) ? B[wB * (s + ty) + x] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < wB)
atomicAdd(&C[wB * out_row + x], Csub);
// C[wB * out_row + x] += Csub;
}
template <typename Dtype, typename Itype, int BLOCK_SIZE>
__global__ void matmul2(const Dtype *A, const int wA, const int hA,
const Dtype *B, const int wB, const int hB,
const Dtype *D, const int wD, const int hD, Dtype *C,
Dtype *E, const Itype *in_map, const Itype *out_map) {
// Use grad_out_feat as A, transposed kernel weight as B, and in_feat as D
// Block index
const int bx = blockIdx.x;
const int by = blockIdx.y;
// Thread index
const int tx = threadIdx.x;
const int ty = threadIdx.y;
// Coordinate. y is for rows, x is for columns.
const int x = BLOCK_SIZE * bx + tx;
const int y = BLOCK_SIZE * by + ty;
const Itype in_row = y < hA ? in_map[y] : 0;
const Itype out_row = y < hA ? out_map[y] : 0;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
Dtype Csub = 0;
Dtype Esub = 0;
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ Dtype As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ Dtype BTs[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Ds used to
// store the sub-matrix of D
__shared__ Dtype DTs[BLOCK_SIZE][BLOCK_SIZE];
// For Ds = D^T[...:..., ...:...], use the transposed grid dimension for A
DTs[ty][tx] = (x < wD && y < hD) ? D[wD * in_row + x] : 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int s = 0; s < wA; s += BLOCK_SIZE) {
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = ((s + tx) < wA && y < hA) ? A[wA * out_row + s + tx] : 0;
// Transposed kernel
BTs[ty][tx] = ((s + ty) < wB && x < hB) ? B[wB * x + s + ty] : 0;
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * BTs[k][tx];
}
// For Esub, reset to 0
Esub = 0;
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Esub += DTs[k][ty] * As[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
// For the E matrix which requires accmulation of multiple blocks, use
// atomic addition. This can be replaced with a more sophisticaed reduction
// algorithm.
if ((bx * BLOCK_SIZE + ty) < wD && (s + tx) < wA)
atomicAdd(&E[wA * (bx * BLOCK_SIZE + ty) + (s + tx)], Esub);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
if (y < hA && x < hB)
atomicAdd(&C[hB * in_row + x], Csub);
}
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_CPU_H
#define NMS_CPU_H
#include <pybind11/pybind11.h>
// must include pybind11/stl.h if using containers in STL in arguments.
#include "box_iou.h"
#include "nms_gpu.h"
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <vector>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType>
std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
py::array_t<int> order, DType thresh,
DType eps = 0) {
auto ndets = boxes.shape(0);
auto boxes_r = boxes.template unchecked<2>();
auto order_r = order.template unchecked<1>();
auto suppressed = zeros<int>({int(ndets)});
auto suppressed_rw = suppressed.template mutable_unchecked<1>();
auto area = zeros<DType>({int(ndets)});
auto area_rw = area.template mutable_unchecked<1>();
// get areas
for (int i = 0; i < ndets; ++i) {
area_rw(i) = (boxes_r(i, 2) - boxes_r(i, 0) + eps) *
(boxes_r(i, 3) - boxes_r(i, 1) + eps);
}
std::vector<int> keep;
int i, j;
DType xx1, xx2, w, h, inter, ovr;
for (int _i = 0; _i < ndets; ++_i) {
i = order_r(_i);
if (suppressed_rw(i) == 1)
continue;
keep.push_back(i);
for (int _j = _i + 1; _j < ndets; ++_j) {
j = order_r(_j);
if (suppressed_rw(j) == 1)
continue;
xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
w = xx2 - xx1 + eps;
if (w > 0) {
xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3));
xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1));
h = xx2 - xx1 + eps;
if (h > 0) {
inter = w * h;
ovr = inter / (area_rw(i) + area_rw(j) - inter);
if (ovr >= thresh)
suppressed_rw(j) = 1;
}
}
}
}
return keep;
}
template <typename DType>
std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
py::array_t<int> order,
py::array_t<DType> standup_iou,
DType thresh) {
auto ndets = box_corners.shape(0);
auto box_corners_r = box_corners.template unchecked<3>();
auto order_r = order.template unchecked<1>();
auto suppressed = zeros<int>({int(ndets)});
auto suppressed_rw = suppressed.template mutable_unchecked<1>();
auto standup_iou_r = standup_iou.template unchecked<2>();
std::vector<int> keep;
int i, j;
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
DType inter_area, union_area, overlap;
for (int _i = 0; _i < ndets; ++_i) {
i = order_r(_i);
if (suppressed_rw(i) == 1)
continue;
keep.push_back(i);
for (int _j = _i + 1; _j < ndets; ++_j) {
j = order_r(_j);
if (suppressed_rw(j) == 1)
continue;
if (standup_iou_r(i, j) <= 0.0)
continue;
// std::cout << "pre_poly" << std::endl;
try {
bg::append(poly,
point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
bg::append(poly,
point_t(box_corners_r(i, 1, 0), box_corners_r(i, 1, 1)));
bg::append(poly,
point_t(box_corners_r(i, 2, 0), box_corners_r(i, 2, 1)));
bg::append(poly,
point_t(box_corners_r(i, 3, 0), box_corners_r(i, 3, 1)));
bg::append(poly,
point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 1, 0), box_corners_r(j, 1, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 2, 0), box_corners_r(j, 2, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 3, 0), box_corners_r(j, 3, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
} catch (const std::exception &e) {
std::cout << "box i corners:" << std::endl;
for (int k = 0; k < 4; ++k) {
std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i, k, 1)
<< std::endl;
}
std::cout << "box j corners:" << std::endl;
for (int k = 0; k < 4; ++k) {
std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j, k, 1)
<< std::endl;
}
// throw e;
continue;
}
// std::cout << "post_poly" << std::endl;
// std::cout << "post_intsec" << std::endl;
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
// std::cout << "pre_union" << " " << inter_area << std::endl;
bg::union_(poly, qpoly, poly_union);
/*
if (poly_union.empty()){
std::cout << "intsec area:" << " " << inter_area << std::endl;
std::cout << "box i corners:" << std::endl;
for(int k = 0; k < 4; ++k){
std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i,
k, 1) << std::endl;
}
std::cout << "box j corners:" << std::endl;
for(int k = 0; k < 4; ++k){
std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j,
k, 1) << std::endl;
}
}*/
// std::cout << "post_union" << poly_union.empty() << std::endl;
if (!poly_union.empty()) { // ignore invalid box
union_area = bg::area(poly_union.front());
// std::cout << "post union area" << std::endl;
// std::cout << union_area << "debug" << std::endl;
overlap = inter_area / union_area;
if (overlap >= thresh)
suppressed_rw(j) = 1;
poly_union.clear();
}
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return keep;
}
#ifdef TV_CUDA
constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType>
int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
DType nms_overlap_thresh, int device_id) {
py::buffer_info info = boxes.request();
auto boxes_ptr = static_cast<DType *>(info.ptr);
py::buffer_info info_k = keep_out.request();
auto keep_out_ptr = static_cast<int *>(info_k.ptr);
return _nms_gpu<DType, threadsPerBlock>(keep_out_ptr, boxes_ptr,
boxes.shape(0), boxes.shape(1),
nms_overlap_thresh, device_id);
}
#endif
} // namespace spconv
#endif
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_FUNCTOR_H_
#define NMS_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct NonMaxSupressionFunctor {
Index operator()(const Device &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxes, T threshold, T eps);
};
template <typename Device, typename T, typename Index>
struct rotateNonMaxSupressionFunctor {
Index operator()(const Device &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxCorners,
tv::TensorView<const T> standupIoU, T threshold);
};
} // namespace functor
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
int boxes_dim, DType nms_overlap_thresh, int device_id);
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_TORCH_OP_H_
#define NMS_TORCH_OP_H_
#include <spconv/indice.h>
#include <spconv/nms_functor.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor nonMaxSuppression(torch::Tensor boxes, torch::Tensor scores,
int64_t preMaxSize, int64_t postMaxSize,
double thresh, double eps) {
// auto timer = spconv::CudaContextTimer<>();
tv::check_torch_dtype<T>(boxes);
auto resOptions =
torch::TensorOptions().dtype(torch::kInt64).device(boxes.device());
if (boxes.size(0) == 0) {
return torch::zeros({0}, resOptions);
}
torch::Tensor indices;
if (preMaxSize > 0) {
auto numKeepedScores = scores.size(0);
preMaxSize = std::min(numKeepedScores, preMaxSize);
auto res = torch::topk(scores, preMaxSize);
indices = std::get<1>(res);
boxes = torch::index_select(boxes, 0, indices);
} else {
indices = std::get<1>(torch::sort(scores));
boxes = torch::index_select(boxes, 0, indices);
}
if (boxes.size(0) == 0)
return torch::zeros({0}, resOptions);
auto keep = torch::zeros({boxes.size(0)}, resOptions);
int64_t keepNum = 0;
if (boxes.device().type() == torch::kCPU) {
auto nmsFunctor = functor::NonMaxSupressionFunctor<tv::CPU, T, int64_t>();
keepNum = nmsFunctor(tv::CPU(), tv::torch2tv<int64_t>(keep),
tv::torch2tv<const T>(boxes), T(thresh), T(eps));
} else {
TV_ASSERT_RT_ERR(false, "not implemented");
}
if (postMaxSize <= 0) {
postMaxSize = keepNum;
}
// std::cout << keep << std::endl;
keep = keep.slice(0, 0, std::min(keepNum, postMaxSize));
if (preMaxSize > 0) {
return torch::index_select(indices, 0, keep);
}
return keep;
}
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
#define POINTPILLARS_SCATTER_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct PointPillarScatter {
void operator()(const Device &d, tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors);
};
} // namespace functor
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PILLAR_SCATTER_OP_H_
#define PILLAR_SCATTER_OP_H_
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
torch::Tensor shape) {
TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA, "error");
TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1");
TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1");
TV_ASSERT_RT_ERR(features.size(2) == coors.size(2), "err");
tv::check_torch_dtype<int>(shape);
tv::check_torch_dtype<T>(coors);
auto shapeData = shape.data_ptr<int>();
torch::Tensor canvas =
torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
features.options());
TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
#ifdef TV_CUDA
functor::PointPillarScatter<tv::GPU, T, int> ftor;
ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas),
tv::torch2tv<const T>(features.squeeze()),
tv::torch2tv<const T>(coors.squeeze()));
#endif
return canvas;
}
} // namespace spconv
#endif
\ No newline at end of file
#pragma once
#include <tensorview/kernel_utils.h>
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
template <typename Index, unsigned NDim>
__global__ void scatterPointToGridKernel(
tv::TensorView<const float> points, tv::TensorView<const Index> indexes,
tv::TensorView<float> grids, tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndex,
const tv::SimpleVector<Index, NDim> gridShape) {
Index index;
int numPoints = points.dim(0);
int numFeatures = points.dim(1);
for (int ix : tv::KernelLoopX<int>(numPoints)) {
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
indexes.data() + ix * NDim, gridShape.data(), 0);
pointIndex(ix) = index;
atomicAdd(numPointsPerGrid.data() + index, Index(1));
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
atomicAdd(grids.data() + index * numFeatures + k,
*(points.data() + ix * numFeatures + k));
}
}
}
template <typename Index, unsigned NDim>
__global__ void
gatherPointFromGridKernel(tv::TensorView<const float> grids,
tv::TensorView<const Index> numPointsPerGrid,
tv::TensorView<const Index> pointIndexUnique,
tv::TensorView<float> voxels,
tv::TensorView<Index> coors,
const tv::SimpleVector<Index, NDim> gridShape) {
Index index;
int numVoxels = voxels.dim(0);
int numFeatures = grids.dim(1);
for (int ix : tv::KernelLoopX<int>(numVoxels)) {
index = pointIndexUnique(ix);
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
voxels(ix, k) = grids(index, k) / numPointsPerGrid(index);
}
index = tv::rowArrayIdxInv<Index, NDim>(index, coors.data() + ix * NDim,
gridShape.data());
}
}
template <typename Index>
__global__ void resetGridKernel(tv::TensorView<float> grids,
tv::TensorView<Index> numPointsPerGrid,
tv::TensorView<Index> pointIndexUnique) {
Index index;
int numVoxels = pointIndexUnique.dim(0) - 1;
int numFeatures = grids.dim(1);
for (int ix : tv::KernelLoopX<int>(numVoxels)) {
index = pointIndexUnique(ix);
#pragma unroll
for (int k = 0; k != numFeatures; ++k) {
grids(index, k) = 0;
numPointsPerGrid(index) = 0;
}
}
}
template <typename Index>
__global__ void resetPointIndexKernel(tv::TensorView<Index> pointIndex,
const Index gridVolume) {
int num_max_points = pointIndex.dim(0) - 1;
for (int ix : tv::KernelLoopX<int>(num_max_points)) {
pointIndex(ix) = gridVolume;
}
}
} // namespace spconv
// Copyright 2019-2020 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
// must include pybind11/stl.h if using containers in STL in arguments.
#include <algorithm>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
// #include <vector>
#include <iostream>
#include <math.h>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> voxel_point_mask,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(
py::array_t<DType> points, py::array_t<DType> voxel_point_mask,
py::array_t<DType> voxels, py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask,
py::array_t<DType> mins, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
DType height_threshold, DType height_high_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
DType max_value, min_value;
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int block_shape_H = grid_size[1] / block_factor;
int block_shape_W = grid_size[0] / block_factor;
int voxelidx, num;
int block_coor[2];
int startx, stopx, starty, stopy;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) =
std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor[1] = coors_rw(i, 1);
coor[2] = coors_rw(i, 2);
coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0] - block_size / 2);
stopx =
std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
}
voxel_mask_rw(i) = ((max_value - min_value) > height_threshold) &&
((max_value - min_value) < height_high_threshold);
}
return voxel_num;
}
} // namespace spconv
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment