Commit a6abf55d authored by yan.yan's avatar yan.yan
Browse files

Merge branch 'develop'

parents fad30002 79a3eaf2
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file is used for c++ unit test, but pytorch jit ops don't support c++
// debug build.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class T> int getTotalSize(std::vector<T> arg) { return arg.size(); }
template <class T, class... TArgs>
int getTotalSize(std::vector<T> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename T> int getSize(std::vector<T> arg) { return arg.size(); }
template <int Idx, class TT, class T>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class T, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<T> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef BOX_IOU_H
#define BOX_IOU_H
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
namespace spconv {
// #include "voxelnet/core/cc/pybind11_helper.h"
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType, typename ShapeContainer>
inline py::array_t<DType> constant(ShapeContainer shape, DType value) {
// create ROWMAJOR array.
py::array_t<DType> array(shape);
std::fill(array.mutable_data(), array.mutable_data() + array.size(), value);
return array;
}
template <typename DType>
inline py::array_t<DType> zeros(std::vector<long int> shape) {
return constant<DType, std::vector<long int>>(shape, 0);
}
template <typename DType>
py::array_t<DType>
rbbox_iou(py::array_t<DType> box_corners, py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou, DType standup_thresh) {
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
DType inter_area, union_area;
auto box_corners_r = box_corners.template unchecked<3>();
auto qbox_corners_r = qbox_corners.template unchecked<3>();
auto standup_iou_r = standup_iou.template unchecked<2>();
auto N = box_corners_r.shape(0);
auto K = qbox_corners_r.shape(0);
py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
auto overlaps_rw = overlaps.template mutable_unchecked<2>();
if (N == 0 || K == 0) {
return overlaps;
}
for (int k = 0; k < K; ++k) {
for (int n = 0; n < N; ++n) {
if (standup_iou_r(n, k) <= standup_thresh)
continue;
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
bg::union_(poly, qpoly, poly_union);
if (!poly_union.empty()) {
union_area = bg::area(poly_union.front());
overlaps_rw(n, k) = inter_area / union_area;
}
poly_union.clear();
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return overlaps;
}
template <typename DType>
py::array_t<DType> rbbox_intersection(py::array_t<DType> box_corners,
py::array_t<DType> qbox_corners,
py::array_t<DType> standup_iou,
DType standup_thresh) {
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
DType inter_area, union_area;
auto box_corners_r = box_corners.template unchecked<3>();
auto qbox_corners_r = qbox_corners.template unchecked<3>();
auto standup_iou_r = standup_iou.template unchecked<2>();
auto N = box_corners_r.shape(0);
auto K = qbox_corners_r.shape(0);
py::array_t<DType> overlaps = zeros<DType>({int(N), int(K)});
auto overlaps_rw = overlaps.template mutable_unchecked<2>();
if (N == 0 || K == 0) {
return overlaps;
}
for (int k = 0; k < K; ++k) {
for (int n = 0; n < N; ++n) {
if (standup_iou_r(n, k) <= standup_thresh)
continue;
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(poly, point_t(box_corners_r(n, 1, 0), box_corners_r(n, 1, 1)));
bg::append(poly, point_t(box_corners_r(n, 2, 0), box_corners_r(n, 2, 1)));
bg::append(poly, point_t(box_corners_r(n, 3, 0), box_corners_r(n, 3, 1)));
bg::append(poly, point_t(box_corners_r(n, 0, 0), box_corners_r(n, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 1, 0), qbox_corners_r(k, 1, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 2, 0), qbox_corners_r(k, 2, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 3, 0), qbox_corners_r(k, 3, 1)));
bg::append(qpoly,
point_t(qbox_corners_r(k, 0, 0), qbox_corners_r(k, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
overlaps_rw(n, k) = inter_area;
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return overlaps;
}
} // namespace spconv
#endif
\ No newline at end of file
#pragma once
#include <cublas_v2.h>
#include <tensorview/tensorview.h>
namespace spconv {
template <class T>
cublasStatus_t cublasTgemm(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const T *alpha, const T *A, int lda, const T *B,
int ldb, const T *beta, T *C, int ldc);
template <class T>
cublasStatus_t cublasTgemmRow(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
const T *alpha, const T *A, int lda, const T *B,
int ldb, const T *beta, T *C, int ldc) {
return cublasTgemm<T>(handle, transb, transa, n, m, k, alpha, B, ldb, A, lda,
beta, C, ldc);
}
template <class T> inline T constant_scalar(float data) { return T(data); }
template <class T>
cublasStatus_t gemm(cublasHandle_t handle, bool transa, bool transb,
const tv::TensorView<T> A, const tv::TensorView<T> B,
tv::TensorView<T> C) {
TV_ASSERT_RT_ERR(A.ndim() == 2, "error");
TV_ASSERT_RT_ERR(B.ndim() == 2, "error");
auto transa_cublas = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
auto transb_cublas = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = transa ? A.dim(1) : A.dim(0);
int n = transb ? B.dim(0) : B.dim(1);
int ka = transa ? A.dim(0) : A.dim(1);
int kb = transb ? B.dim(1) : B.dim(0);
int lda = transa ? m : ka;
int ldb = transb ? ka : n;
int ldc = n;
TV_ASSERT_RT_ERR(ka == kb, "error");
T alpha = constant_scalar<T>(1);
T beta = constant_scalar<T>(0);
return cublasTgemmRow<T>(handle, transa_cublas, transb_cublas, m, n, ka,
&alpha, A.data(), lda, B.data(), ldb, &beta,
C.data(), ldc);
}
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef FUSED_SPARSE_CONV_OP_H_
#define FUSED_SPARSE_CONV_OP_H_
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
torch::Tensor
fusedIndiceConvBatchNorm(torch::Tensor features, torch::Tensor filters,
torch::Tensor bias, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
/*if (_subM){
std::vector<int> indicePairNumVec(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
indicePairNumVec.erase(indicePairNumVec.begin() + indicePairMaxOffset);
auto indicePairVecMaxSizeIter = std::max_element(
indicePairNumVec.begin(), indicePairNumVec.end());
indicePairMaxSize = *indicePairVecMaxSizeIter;
}*/
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
// auto indicePairOptions =
// torch::TensorOptions().dtype(torch::kInt64).device(indicePairs.device());
torch::Tensor output =
torch::zeros({numActOut, numOutPlanes}, options).copy_(bias);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
if (subM) { // the center index of subm conv don't need gather and scatter
// add.
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
// auto timer = spconv::CudaContextTimer<>();
auto outputBufferBlob = torch::from_blob(outputBuffer.data_ptr(),
{nHot, numOutPlanes}, options);
auto inputBufferBlob =
torch::from_blob(inputBuffer.data_ptr(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
sparse_gather_cpu(inputBuffer, features, indicePairs[i][inverse], nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_gather_cuda(inputBuffer, features, indicePairs[i][inverse], nHot);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalGatherTime += timer.report() / 1000.0;
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
// totalGEMMTime += timer.report() / 1000.0;
if (device == torch::kCPU) {
sparse_scatter_add_cpu(outputBuffer, output, indicePairs[i][!inverse],
nHot);
}
#ifdef TV_CUDA
else if (device == torch::kCUDA) {
sparse_scatter_add_cuda(outputBuffer, output, indicePairs[i][!inverse],
nHot);
}
#endif
else {
TV_ASSERT_INVALID_ARG(false, "unknown device type");
}
// totalSAddTime += timer.report() / 1000.0;
}
// std::cout << "gather time " << totalGatherTime << std::endl;
// std::cout << "gemm time " << totalGEMMTime << std::endl;
// std::cout << "scatteradd time " << totalSAddTime << std::endl;
return output;
}
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <iostream>
#include <limits>
#include <tensorview/tensorview.h>
#include <tsl/robin_map.h>
#include <unordered_map>
namespace spconv {
namespace detail {
template <typename T> struct ToUnsigned;
template <> struct ToUnsigned<int> { using type = uint32_t; };
template <> struct ToUnsigned<long> { using type = uint64_t; };
template <typename T> struct FNVInternal;
template <> struct FNVInternal<uint32_t> {
constexpr static uint32_t defaultOffsetBasis = 0x811C9DC5;
constexpr static uint32_t prime = 0x01000193;
};
template <> struct FNVInternal<uint64_t> {
constexpr static uint64_t defaultOffsetBasis = 0xcbf29ce484222325;
constexpr static uint64_t prime = 0x100000001b3;
};
} // namespace detail
template <typename T>
using to_unsigned_t = typename detail::ToUnsigned<std::remove_const_t<T>>::type;
template <typename T> struct FNV1a : detail::FNVInternal<T> {
std::size_t operator()(const T *data, std::size_t size) {
to_unsigned_t<T> hash = detail::FNVInternal<T>::defaultOffsetBasis;
for (std::size_t i = 0; i < size; ++i) {
hash *= detail::FNVInternal<T>::prime;
hash ^= static_cast<to_unsigned_t<T>>(data[i]);
}
return hash;
}
};
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
const Index *kernelSize,
const Index *stride, const Index *padding,
const Index *dilation,
const Index *outSpatialShape, Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
stride[i] + padding[i]) /
stride[i];
uppers[i] = (input_pos[i] + padding[i]) / stride[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid)
++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
const Index *input_pos, const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation, const Index *outSpatialShape,
Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = input_pos[i] * stride[i] - padding[i];
uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (val - lowers[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid)
++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <cuhash/hash_table.cuh>
#include <spconv/geometry.h>
#include <tensorview/kernel_utils.h>
#include <tensorview/tensorview.h>
namespace spconv {
template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
typename Index1D = int>
__global__ void prepareIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index1D> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(0, offset, oldNum) = ix;
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
indicePairs(1, offset, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
}
}
}
template <typename Index, unsigned NDim, int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
auto indicePairsDim2 = indicePairs.dim(2);
Index index;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(0, offset, oldNum) = ix;
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
indicePairs(1, offset, oldNum) = index;
indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
int numAct, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
Index index;
auto indicesOutPtr = indicesOut.data();
for (int ix : tv::KernelLoopX<int>(numAct)) {
index = indicePairUnique[ix];
gridsOut[index] = ix;
index = tv::rowArrayIdxInv<Index, NDim>(
index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
indicesOut[ix * (NDim + 1)] = index % batchSize;
}
}
template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
__global__ void
assignIndiceOutKernel(tv::TensorView<Index> indicesOut, int numAct,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
int batchSize) {
Index index;
auto indicesOutPtr = indicesOut.data();
for (unsigned ix : tv::KernelLoopX<unsigned>(numAct)) {
index = indicePairUnique[ix];
index = tv::rowArrayIdxInv<Index, NDim>(
index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
indicesOut[ix * (NDim + 1)] = index % batchSize;
}
}
template <typename Index, unsigned NDim, unsigned kNumHashFunctions = 4>
__global__ void
assignIndicePairsHashKernel(tv::TensorView<Index> indicesOut, int numActIn,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
unsigned table_size, const cuhash::Entry *table,
cuhash::Functions<kNumHashFunctions> constants,
uint2 stash_constants, unsigned stash_count) {
Index index;
int kernelVolume = indicePairs.dim(1);
auto indicePairsOut = indicePairs.subview(1);
for (int ix : tv::KernelLoopX<int>(numActIn)) {
for (int i = 0; i < kernelVolume; ++i) {
index = indicePairsOut(i, ix);
if (index > -1) {
auto val = cuhash::retrieve((unsigned)(index), table_size, table,
constants, stash_constants, stash_count);
assert(val != cuhash::kNotFound);
indicePairsOut(i, ix) = (unsigned)val;
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
assignIndicePairsKernel(tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut, int numActIn,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
Index index;
int kernelVolume = indicePairs.dim(1);
auto indicePairsOut = indicePairs.subview(1);
for (int ix : tv::KernelLoopX<int>(numActIn)) {
for (int i = 0; i < kernelVolume; ++i) {
index = indicePairsOut(i, ix);
if (index > -1) {
indicePairsOut(i, ix) = gridsOut[index];
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape, Index spatialVolume) {
auto numActIn = indicesIn.dim(0);
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
index =
tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
gridsOut[index] = ix;
}
}
template <typename Index, unsigned NDim>
__global__ void
prepareSubMHashKernel(tv::TensorView<const Index> indicesIn, unsigned *keys,
unsigned *values,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
outSpatialShape.data()) +
spatialVolume * indicesIn(ix, 0);
keys[ix] = index;
values[ix] = ix;
}
}
template <typename Index, typename IndexGrid, unsigned NDim,
int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (int i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
if (gridsOut[index] > -1) {
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
}
}
}
}
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1,
unsigned K2>
__global__ void getSubMIndicePairsKernel3(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 3> outSpatialShape, Index spatialVolume) {
auto numActIn = indicesIn.dim(0);
Index point[3];
Index index = 0;
Index offset;
constexpr unsigned KV = K0 * K1 * K2;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (3 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
#pragma unroll
for (int k = 0; k < K2; ++k) {
offset = i * K1 * K2 + j * K2 + k;
if (offset > center){
continue;
}
if (center == offset){
// center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix;
}else{
point[2] = indice_data[3] - k + K2 / 2;
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[2] >= 0 &&
point[2] < outSpatialShape[2] && point[0] >= 0 &&
point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<3, 3>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
if (gridsOut[index] != -1) {
// for subm: indicePairs[0, i] = indicePairs[1, kernelVolume - i - 1]
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
}
}
}
}
}
}
}
}
template <typename Index, typename IndexGrid, unsigned K0, unsigned K1>
__global__ void getSubMIndicePairsKernel2(
tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, 2> outSpatialShape, Index spatialVolume) {
auto numActIn = indicesIn.dim(0);
Index point[2];
Index index = 0;
Index offset;
constexpr unsigned KV = K0 * K1;
constexpr unsigned center = KV / 2;
*(indiceNum.data() + center) = numActIn;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
const Index *indice_data = indicesIn.data() + ix * (2 + 1);
#pragma unroll
for (int i = 0; i < K0; ++i) {
#pragma unroll
for (int j = 0; j < K1; ++j) {
offset = i * K1 + j;
if (offset > center){
continue;
}
if (center == offset){
// center of subm indice pairs dont need atomicadd
indicePairs(1, offset, ix) = ix;
indicePairs(0, offset, ix) = ix;
}else{
point[1] = indice_data[2] - j + K1 / 2;
point[0] = indice_data[1] - i + K0 / 2;
if (point[1] >= 0 && point[1] < outSpatialShape[1] && point[0] >= 0 &&
point[0] < outSpatialShape[0]) {
index = tv::ArrayIndexRowMajor<2, 2>::runPtrs(
point, outSpatialShape.data(), 0) +
spatialVolume * indice_data[0];
if (gridsOut[index] > -1) {
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
atomicAdd(indiceNum.data() + KV - offset - 1, Index(1));
indicePairs(1, offset, oldNum) = gridsOut[index];
indicePairs(0, offset, oldNum) = ix;
indicePairs(1, KV - offset - 1, oldNum) = ix;
indicePairs(0, KV - offset - 1, oldNum) = gridsOut[index];
}
}
}
}
}
}
}
template <typename Index, unsigned NDim, int KernelMaxVolume = 256,
unsigned kNumHashFunctions = 4>
__global__ void getSubMIndicePairsHashKernel(
tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape, unsigned table_size,
const cuhash::Entry *table, cuhash::Functions<kNumHashFunctions> constants,
uint2 stash_constants, unsigned stash_count) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index numValidPoints = 0;
Index validPoints[KernelMaxVolume * (NDim + 1)];
Index *pointPtr = nullptr;
Index index = 0;
for (int ix : tv::KernelLoopX<int>(numActIn)) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
validPoints);
for (int i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(
pointPtr, outSpatialShape.data(), 0) +
spatialVolume * indicesIn(ix, 0);
auto val = cuhash::retrieve((unsigned)(index), table_size, table,
constants, stash_constants, stash_count);
if (val != cuhash::kNotFound) {
Index oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
indicePairs(1, offset, oldNum) = val;
indicePairs(0, offset, oldNum) = ix;
}
}
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
tv::TensorView<IndexGrid> gridsOut,
int numAct) {
for (int ix : tv::KernelLoopX<int>(numAct)) {
gridsOut[indicePairUnique[ix]] = -1;
}
}
template <typename T> __global__ void arangeKernel(T *data, int size) {
for (int ix : tv::KernelLoopX<int>(size)) {
data[ix] = ix;
}
}
template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void
resetGridSubMKernel(const Index *indices, tv::TensorView<IndexGrid> gridsOut,
const tv::SimpleVector<Index, NDim> outSpatialShape,
int numAct) {
Index outSpatialShapeReg[NDim];
for (int i = 0; i < NDim; ++i) {
outSpatialShapeReg[i] = outSpatialShape[i];
}
Index spatialVolume = 1;
auto indsPtr = indices;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index index;
for (int ix : tv::KernelLoopX<int>(numAct)) {
indsPtr = indices + ix * (NDim + 1);
index = tv::ArrayIndexRowMajor<NDim, NDim>::runPtrs(indsPtr + 1,
outSpatialShapeReg, 0);
gridsOut[index + spatialVolume * indsPtr[0]] = -1;
}
}
} // namespace spconv
#undef atomicAdd
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
int create_conv_indice_pair_p1_cuda(
torch::Tensor indicesIn, torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose);
int create_conv_indice_pair_p2_cuda(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
torch::Tensor indicePairUnique, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
int create_submconv_indice_pair_cuda(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
int create_conv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor indicesOut, torch::Tensor gridsOut,
torch::Tensor indicePairs, torch::Tensor indiceNum,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outSpatialShape, bool transpose, bool resetGrid,
bool useHash);
int create_submconv_indice_pair_cpu(
torch::Tensor indicesIn, torch::Tensor gridsOut, torch::Tensor indicePairs,
torch::Tensor indiceNum, std::vector<int64_t> kernelSize,
std::vector<int64_t> stride, std::vector<int64_t> padding,
std::vector<int64_t> dilation, std::vector<int64_t> outSpatialShape,
bool transpose, bool resetGrid, bool useHash);
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <tensorview/mp_helper.h>
#include <tensorview/tensor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
namespace spconv {
void maxpool_bwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
void maxpool_fwd_cpu(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
void maxpool_bwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor dout, torch::Tensor din,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
void maxpool_fwd_cuda(torch::Tensor outFeatures, torch::Tensor inFeatures,
torch::Tensor indicesIn, torch::Tensor indicesOut,
int size);
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_CPU_H
#define NMS_CPU_H
#include <pybind11/pybind11.h>
// must include pybind11/stl.h if using containers in STL in arguments.
#include "box_iou.h"
#include "nms_gpu.h"
#include <algorithm>
#include <boost/geometry.hpp>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <vector>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType>
std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
py::array_t<int> order, DType thresh,
DType eps = 0) {
auto ndets = boxes.shape(0);
auto boxes_r = boxes.template unchecked<2>();
auto order_r = order.template unchecked<1>();
auto suppressed = zeros<int>({int(ndets)});
auto suppressed_rw = suppressed.template mutable_unchecked<1>();
auto area = zeros<DType>({int(ndets)});
auto area_rw = area.template mutable_unchecked<1>();
// get areas
for (int i = 0; i < ndets; ++i) {
area_rw(i) = (boxes_r(i, 2) - boxes_r(i, 0) + eps) *
(boxes_r(i, 3) - boxes_r(i, 1) + eps);
}
std::vector<int> keep;
int i, j;
DType xx1, xx2, w, h, inter, ovr;
for (int _i = 0; _i < ndets; ++_i) {
i = order_r(_i);
if (suppressed_rw(i) == 1)
continue;
keep.push_back(i);
for (int _j = _i + 1; _j < ndets; ++_j) {
j = order_r(_j);
if (suppressed_rw(j) == 1)
continue;
xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
w = xx2 - xx1 + eps;
if (w > 0) {
xx2 = std::min(boxes_r(i, 3), boxes_r(j, 3));
xx1 = std::max(boxes_r(i, 1), boxes_r(j, 1));
h = xx2 - xx1 + eps;
if (h > 0) {
inter = w * h;
ovr = inter / (area_rw(i) + area_rw(j) - inter);
if (ovr >= thresh)
suppressed_rw(j) = 1;
}
}
}
}
return keep;
}
template <typename DType>
std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
py::array_t<int> order,
py::array_t<DType> standup_iou,
DType thresh) {
auto ndets = box_corners.shape(0);
auto box_corners_r = box_corners.template unchecked<3>();
auto order_r = order.template unchecked<1>();
auto suppressed = zeros<int>({int(ndets)});
auto suppressed_rw = suppressed.template mutable_unchecked<1>();
auto standup_iou_r = standup_iou.template unchecked<2>();
std::vector<int> keep;
int i, j;
namespace bg = boost::geometry;
typedef bg::model::point<DType, 2, bg::cs::cartesian> point_t;
typedef bg::model::polygon<point_t> polygon_t;
polygon_t poly, qpoly;
std::vector<polygon_t> poly_inter, poly_union;
DType inter_area, union_area, overlap;
for (int _i = 0; _i < ndets; ++_i) {
i = order_r(_i);
if (suppressed_rw(i) == 1)
continue;
keep.push_back(i);
for (int _j = _i + 1; _j < ndets; ++_j) {
j = order_r(_j);
if (suppressed_rw(j) == 1)
continue;
if (standup_iou_r(i, j) <= 0.0)
continue;
// std::cout << "pre_poly" << std::endl;
try {
bg::append(poly,
point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
bg::append(poly,
point_t(box_corners_r(i, 1, 0), box_corners_r(i, 1, 1)));
bg::append(poly,
point_t(box_corners_r(i, 2, 0), box_corners_r(i, 2, 1)));
bg::append(poly,
point_t(box_corners_r(i, 3, 0), box_corners_r(i, 3, 1)));
bg::append(poly,
point_t(box_corners_r(i, 0, 0), box_corners_r(i, 0, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 1, 0), box_corners_r(j, 1, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 2, 0), box_corners_r(j, 2, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 3, 0), box_corners_r(j, 3, 1)));
bg::append(qpoly,
point_t(box_corners_r(j, 0, 0), box_corners_r(j, 0, 1)));
bg::intersection(poly, qpoly, poly_inter);
} catch (const std::exception &e) {
std::cout << "box i corners:" << std::endl;
for (int k = 0; k < 4; ++k) {
std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i, k, 1)
<< std::endl;
}
std::cout << "box j corners:" << std::endl;
for (int k = 0; k < 4; ++k) {
std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j, k, 1)
<< std::endl;
}
// throw e;
continue;
}
// std::cout << "post_poly" << std::endl;
// std::cout << "post_intsec" << std::endl;
if (!poly_inter.empty()) {
inter_area = bg::area(poly_inter.front());
// std::cout << "pre_union" << " " << inter_area << std::endl;
bg::union_(poly, qpoly, poly_union);
/*
if (poly_union.empty()){
std::cout << "intsec area:" << " " << inter_area << std::endl;
std::cout << "box i corners:" << std::endl;
for(int k = 0; k < 4; ++k){
std::cout << box_corners_r(i, k, 0) << " " << box_corners_r(i,
k, 1) << std::endl;
}
std::cout << "box j corners:" << std::endl;
for(int k = 0; k < 4; ++k){
std::cout << box_corners_r(j, k, 0) << " " << box_corners_r(j,
k, 1) << std::endl;
}
}*/
// std::cout << "post_union" << poly_union.empty() << std::endl;
if (!poly_union.empty()) { // ignore invalid box
union_area = bg::area(poly_union.front());
// std::cout << "post union area" << std::endl;
// std::cout << union_area << "debug" << std::endl;
overlap = inter_area / union_area;
if (overlap >= thresh)
suppressed_rw(j) = 1;
poly_union.clear();
}
}
poly.clear();
qpoly.clear();
poly_inter.clear();
}
}
return keep;
}
#ifdef TV_CUDA
constexpr int const threadsPerBlock = sizeof(unsigned long long) * 8;
template <typename DType>
int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
DType nms_overlap_thresh, int device_id) {
py::buffer_info info = boxes.request();
auto boxes_ptr = static_cast<DType *>(info.ptr);
py::buffer_info info_k = keep_out.request();
auto keep_out_ptr = static_cast<int *>(info_k.ptr);
return _nms_gpu<DType, threadsPerBlock>(keep_out_ptr, boxes_ptr,
boxes.shape(0), boxes.shape(1),
nms_overlap_thresh, device_id);
}
#endif
} // namespace spconv
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_FUNCTOR_H_
#define NMS_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct NonMaxSupressionFunctor {
Index operator()(const Device &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxes, T threshold, T eps);
};
template <typename Device, typename T, typename Index>
struct rotateNonMaxSupressionFunctor {
Index operator()(const Device &d, tv::TensorView<Index> keep,
tv::TensorView<const T> boxCorners,
tv::TensorView<const T> standupIoU, T threshold);
};
} // namespace functor
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
template <typename DType, int BLOCK_THREADS>
int _nms_gpu(int *keep_out, const DType *boxes_host, int boxes_num,
int boxes_dim, DType nms_overlap_thresh, int device_id);
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef NMS_TORCH_OP_H_
#define NMS_TORCH_OP_H_
#include <spconv/indice.h>
#include <spconv/nms_functor.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor nonMaxSuppression(torch::Tensor boxes, torch::Tensor scores,
int64_t preMaxSize, int64_t postMaxSize,
double thresh, double eps) {
// auto timer = spconv::CudaContextTimer<>();
tv::check_torch_dtype<T>(boxes);
auto resOptions =
torch::TensorOptions().dtype(torch::kInt64).device(boxes.device());
if (boxes.size(0) == 0) {
return torch::zeros({0}, resOptions);
}
torch::Tensor indices;
if (preMaxSize > 0) {
auto numKeepedScores = scores.size(0);
preMaxSize = std::min(numKeepedScores, preMaxSize);
auto res = torch::topk(scores, preMaxSize);
indices = std::get<1>(res);
boxes = torch::index_select(boxes, 0, indices);
} else {
indices = std::get<1>(torch::sort(scores));
boxes = torch::index_select(boxes, 0, indices);
}
if (boxes.size(0) == 0)
return torch::zeros({0}, resOptions);
auto keep = torch::zeros({boxes.size(0)}, resOptions);
int64_t keepNum = 0;
if (boxes.device().type() == torch::kCPU) {
auto nmsFunctor = functor::NonMaxSupressionFunctor<tv::CPU, T, int64_t>();
keepNum = nmsFunctor(tv::CPU(), tv::torch2tv<int64_t>(keep),
tv::torch2tv<const T>(boxes), T(thresh), T(eps));
} else {
TV_ASSERT_RT_ERR(false, "not implemented");
}
if (postMaxSize <= 0) {
postMaxSize = keepNum;
}
// std::cout << keep << std::endl;
keep = keep.slice(0, 0, std::min(keepNum, postMaxSize));
if (preMaxSize > 0) {
return torch::index_select(indices, 0, keep);
}
return keep;
}
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef POINTPILLARS_SCATTER_FUNCTOR_H_
#define POINTPILLARS_SCATTER_FUNCTOR_H_
#include <tensorview/tensorview.h>
namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct PointPillarScatter {
void operator()(const Device &d, tv::TensorView<T> canvas,
tv::TensorView<const T> features,
tv::TensorView<const T> coors);
};
} // namespace functor
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PILLAR_SCATTER_OP_H_
#define PILLAR_SCATTER_OP_H_
#include <spconv/pillar_scatter_functor.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
// torch.jit's doc says only support int64, so we need to convert to int32.
template <typename T>
torch::Tensor pointPillarScatter(torch::Tensor features, torch::Tensor coors,
torch::Tensor shape) {
TV_ASSERT_RT_ERR(shape.device().type() == torch::kCPU, "error");
TV_ASSERT_RT_ERR(features.device().type() == torch::kCUDA, "error");
TV_ASSERT_RT_ERR(shape.dim() == 1, "error");
TV_ASSERT_RT_ERR(shape.size(0) == 4, "error");
TV_ASSERT_RT_ERR(features.dim() >= 3, "error");
TV_ASSERT_RT_ERR(features.size(0) == 1, "feature first dim must be 1");
TV_ASSERT_RT_ERR(coors.size(0) == 1, "coors first dim must be 1");
TV_ASSERT_RT_ERR(features.size(2) == coors.size(2), "err");
tv::check_torch_dtype<int>(shape);
tv::check_torch_dtype<T>(coors);
auto shapeData = shape.data_ptr<int>();
torch::Tensor canvas =
torch::zeros({shapeData[0], shapeData[1], shapeData[2], shapeData[3]},
features.options());
TV_ASSERT_RT_ERR(shapeData[1] == features.size(1), "error");
#ifdef TV_CUDA
functor::PointPillarScatter<tv::GPU, T, int> ftor;
ftor(tv::TorchGPU(), tv::torch2tv<T>(canvas),
tv::torch2tv<const T>(features.squeeze()),
tv::torch2tv<const T>(coors.squeeze()));
#endif
return canvas;
}
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
// must include pybind11/stl.h if using containers in STL in arguments.
#include <algorithm>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
// #include <vector>
#include <iostream>
#include <math.h>
namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> voxel_point_mask,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(
py::array_t<DType> points, py::array_t<DType> voxel_point_mask,
py::array_t<DType> voxels, py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> voxel_point_mask, py::array_t<int> voxel_mask,
py::array_t<DType> mins, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
DType height_threshold, DType height_high_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_point_mask_rw = voxel_point_mask.template mutable_unchecked<2>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
// auto ndim = points_rw.shape(1) - 1;
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
DType max_value, min_value;
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int block_shape_H = grid_size[1] / block_factor;
int block_shape_W = grid_size[0] / block_factor;
int voxelidx, num;
int block_coor[2];
int startx, stopx, starty, stopy;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed)
continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels)
continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
voxel_point_mask_rw(voxelidx, num) = DType(1);
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) =
std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor[1] = coors_rw(i, 1);
coor[2] = coors_rw(i, 2);
coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0] - block_size / 2);
stopx =
std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
}
voxel_mask_rw(i) = ((max_value - min_value) > height_threshold) &&
((max_value - min_value) < height_high_threshold);
}
return voxel_num;
}
} // namespace spconv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_POOL_OP_H_
#define SPARSE_POOL_OP_H_
#include <spconv/maxpool.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
torch::Tensor indiceMaxPool(torch::Tensor features, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numAct);
torch::Tensor indiceMaxPoolBackward(torch::Tensor features,
torch::Tensor outFeatures,
torch::Tensor outGrad,
torch::Tensor indicePairs,
torch::Tensor indiceNum);
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <THC/THCAtomics.cuh>
#include <THC/THCNumerics.cuh>
#include <cuda_fp16.h>
#include <tensorview/kernel_utils.h>
#if PYTORCH_VERSION < 10500
#define TH_ATOMIC_ADD atomicAdd
#else
#define TH_ATOMIC_ADD gpuAtomicAdd
#endif
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
namespace spconv {
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
Index inds[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size)
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
features[inds[ilp] + iy];
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void gatherVecKernel(T *buffer, const T *features,
const Index *indices, int size, int numPlanes) {
int ILPStrideX[NumILP];
Index inds[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size)
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP,
typename VecType = int4>
__global__ void gatherVecBlockKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
features += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
reinterpret_cast<const VecType *>(
features)[indices[ix + ILPStrideX[ilp]] * numPlanes +
threadIdx.y];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchGatherGenericKernel(T *buffer, const T *features,
const Index *indices, int size,
int numPlanes, int indice_batch_stride,
int feature_batch_stride) {
// size: max indice num * kernel volume
// inds: [volume, num_elems]
int ILPStrideX[NumILP];
Index inds[NumILP];
Index inds_elem;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size) {
inds_elem = ix + ILPStrideX[ilp];
inds[ilp] =
indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size) {
if (inds[ilp] != -1) {
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
features[inds[ilp] * numPlanes + iy];
} else {
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = T(0);
}
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void
batchGatherVecKernel(T *buffer, const T *features, const Index *indices,
int size, int feature_offset, int numPlanes,
int indice_batch_stride, int feature_batch_stride) {
int ILPStrideX[NumILP];
Index inds[NumILP];
Index zero[sizeof(VecType) / sizeof(T)];
#pragma unroll
for (int i = 0; i < sizeof(VecType) / sizeof(T); ++i) {
zero[i] = T(0);
}
Index inds_elem;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size) {
inds_elem = ix + ILPStrideX[ilp] + feature_offset;
inds[ilp] =
indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size) {
if (inds[ilp] != -1) {
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
reinterpret_cast<const VecType *>(
features)[inds[ilp] * numPlanes + iy];
} else {
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
reinterpret_cast<const VecType *>(&zero)[0];
}
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP,
typename VecType = int4>
__global__ void
batchGatherVecBlockKernel(T *buffer, const T *features, const Index *indices,
int size, int numPlanes, int indice_batch_stride,
int feature_batch_stride) {
int ILPStrideX[NumILP];
Index inds;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
features += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
Index inds_elem;
Index zero[sizeof(VecType) / sizeof(T)];
#pragma unroll
for (int i = 0; i < sizeof(VecType) / sizeof(T); ++i) {
zero[i] = T(0);
}
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
inds_elem = ix + ILPStrideX[ilp];
inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
if (inds != -1) {
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
reinterpret_cast<const VecType *>(
features)[inds * numPlanes + threadIdx.y];
} else {
reinterpret_cast<VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y] =
reinterpret_cast<const VecType *>(&zero)[0];
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
Index inds[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size)
inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size) {
outFeatures[inds[ilp] + iy] +=
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP,
typename VecType = int4>
__global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
constexpr int vecloadHalf2Factor = sizeof(VecType) / sizeof(__half2);
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
outFeatures += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
T buf[vecloadFactor];
T buf2[vecloadFactor];
Index idx;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
reinterpret_cast<VecType *>(buf)[0] =
reinterpret_cast<VecType *>(outFeatures)[idx];
reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
if (std::is_same<T, at::Half>::value) {
#if __CUDA_ARCH__ >= 530
#pragma unroll
for (int i = 0; i < vecloadHalf2Factor; i++) {
reinterpret_cast<__half2 *>(buf)[i] =
__hadd2(reinterpret_cast<__half2 *>(buf)[i],
reinterpret_cast<__half2 *>(buf2)[i]);
}
#else
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
buf[i] += buf2[i];
}
#endif
} else {
#pragma unroll
for (int i = 0; i < vecloadFactor; i++) {
buf[i] += buf2[i];
}
}
reinterpret_cast<VecType *>(outFeatures)[idx] =
reinterpret_cast<VecType *>(buf)[0];
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
outFeatures += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
outFeatures[indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y] +=
buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y];
}
}
}
#if __CUDA_ARCH__ >= 530
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddHalfBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int numPlanes) {
int ILPStrideX[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
outFeatures += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
Index idx;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
idx = indices[ix + ILPStrideX[ilp]] * numPlanes + threadIdx.y;
reinterpret_cast<__half2 *>(outFeatures)[idx] = __hadd2(
reinterpret_cast<__half2 *>(outFeatures)[idx],
reinterpret_cast<__half2 *>(
buffer)[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
}
}
}
#endif
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void batchScatterAddGenericKernel(T *outFeatures, const T *buffer,
const Index *indices, int size,
int feature_offset, int numPlanes,
int indice_batch_stride,
int feature_batch_stride) {
// batch scatter add is greatly slower than native scatter when the number of
// points is large. this may due to atomicAdd?
// batch scatter add is greatly faster than native when the number of points
// is small.
int ILPStrideX[NumILP];
Index inds[NumILP];
Index inds_elem;
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++) {
if (ix + ILPStrideX[ilp] < size) {
inds_elem = ix + ILPStrideX[ilp] + feature_offset;
inds[ilp] =
indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
}
}
for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
if (ix + ILPStrideX[ilp] < size && inds[ilp] != -1) {
TH_ATOMIC_ADD(outFeatures + inds[ilp] * numPlanes + iy,
buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy]);
}
}
}
}
}
template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void
batchScatterAddBlockKernel(T *outFeatures, const T *buffer,
const Index *indices, int size, int numPlanes,
int indice_batch_stride, int feature_batch_stride) {
int ILPStrideX[NumILP];
#pragma unroll
for (int ilp = 0; ilp < NumILP; ilp++)
ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
outFeatures += blockIdx.y * NumTLP;
buffer += blockIdx.y * NumTLP;
Index inds, inds_elem;
for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
for (int ilp = 0; ilp < NumILP; ++ilp) {
inds_elem = ix + ILPStrideX[ilp];
inds = indices[(inds_elem / feature_batch_stride) * indice_batch_stride +
inds_elem % feature_batch_stride];
if (inds != -1) {
TH_ATOMIC_ADD(outFeatures + inds * numPlanes + threadIdx.y,
buffer[(ix + ILPStrideX[ilp]) * numPlanes + threadIdx.y]);
}
}
}
}
} // namespace spconv
#undef TH_ATOMIC_ADD
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>
#include <torch/script.h>
namespace spconv {
void batch_sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void batch_sparse_scatter_add_cuda(torch::Tensor buffer,
torch::Tensor outFeatures,
torch::Tensor indices, int size);
void sparse_gather_cuda(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void sparse_scatter_add_cuda(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size);
void sparse_gather_cpu(torch::Tensor buffer, torch::Tensor features,
torch::Tensor indices, int size);
void sparse_scatter_add_cpu(torch::Tensor buffer, torch::Tensor outFeatures,
torch::Tensor indices, int size);
} // namespace spconv
#endif
\ No newline at end of file
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_OP_H_
#define SPARSE_CONV_OP_H_
#include <spconv/indice.h>
#include <spconv/reordering.h>
#include <tensorview/torch_utils.h>
#include <torch/script.h>
#include <utility/timer.h>
namespace spconv {
enum ConvAlgo { kNative = 0, kBatch = 1, kBatchGemmGather = 2 };
// torch.jit's doc says only support int64, so we need to convert to int32.
std::vector<torch::Tensor>
getIndicePairs(torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape,
std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM,
int64_t _transpose, int64_t _useHash);
torch::Tensor indiceConvBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM,
bool batchScatter);
torch::Tensor indiceConv(torch::Tensor features, torch::Tensor filters,
torch::Tensor indicePairs, torch::Tensor indiceNum,
int64_t numActOut, int64_t _inverse, int64_t _subM,
int64_t algo);
std::vector<torch::Tensor>
indiceConvBackward(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse, int64_t _subM,
int64_t algo);
std::vector<torch::Tensor>
indiceConvBackwardBatch(torch::Tensor features, torch::Tensor filters,
torch::Tensor outGrad, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM, bool batchScatter);
} // namespace spconv
#endif
\ No newline at end of file
#include "NvInfer.h"
#include <memory>
#include <tensorview/tensor.h>
#include <unordered_map>
#include <vector>
namespace trt {
template <typename T> tv::DType trt_dtype_to_tv(T trt_dtype) {
switch (trt_dtype) {
case nvinfer1::DataType::kFLOAT:
return tv::float32;
case nvinfer1::DataType::kHALF:
return tv::float16;
case nvinfer1::DataType::kINT32:
return tv::int32;
case nvinfer1::DataType::kINT8:
return tv::int8;
default:;
}
TV_THROW_INVALID_ARG("unknown trt dtype");
}
struct InferDeleter {
template <typename T> void operator()(T *obj) const {
if (obj) {
obj->destroy();
}
}
};
template <typename T> using trt_unique_ptr_t = std::unique_ptr<T, InferDeleter>;
class Logger : public nvinfer1::ILogger {
public:
Logger(Severity severity = Severity::kWARNING)
: reportableSeverity(severity) {}
void log(Severity severity, const char *msg) override {
// suppress messages with severity enum value greater than the reportable
if (severity > reportableSeverity)
return;
switch (severity) {
case Severity::kINTERNAL_ERROR:
std::cerr << "INTERNAL_ERROR: ";
break;
case Severity::kERROR:
std::cerr << "ERROR: ";
break;
case Severity::kWARNING:
std::cerr << "WARNING: ";
break;
case Severity::kINFO:
std::cerr << "INFO: ";
break;
default:
std::cerr << "UNKNOWN: ";
break;
}
std::cerr << msg << std::endl;
}
Severity reportableSeverity;
};
class InferenceContext {
public:
explicit InferenceContext(const std::string &engine_bin, int device)
: logger_(nvinfer1::ILogger::Severity::kINFO), device_(device) {
TV_ASSERT_INVALID_ARG(device >= 0, "invalid device id");
int deviceCount;
cudaGetDeviceCount(&deviceCount);
if (device >= deviceCount) {
TV_THROW_INVALID_ARG("you provide device ", device, " but you only have ",
deviceCount, " device.");
}
cudaSetDevice(device);
auto runtime = trt_unique_ptr_t<nvinfer1::IRuntime>(
nvinfer1::createInferRuntime(logger_));
engine_ =
trt_unique_ptr_t<nvinfer1::ICudaEngine>(runtime->deserializeCudaEngine(
engine_bin.c_str(), engine_bin.size(), nullptr));
ctx_ = trt_unique_ptr_t<nvinfer1::IExecutionContext>(
engine_->createExecutionContext());
max_batch_size_ = engine_->getMaxBatchSize();
for (int i = 0; i < engine_->getNbBindings(); ++i) {
auto dims = engine_->getBindingDimensions(i);
std::vector<int> shape_vec(dims.d, dims.d + dims.nbDims);
shape_vec.insert(shape_vec.begin(), {max_batch_size_});
tv::TensorShape shape(shape_vec);
std::string name = engine_->getBindingName(i);
auto trt_dtype = engine_->getBindingDataType(i);
auto tv_dtype = trt_dtype_to_tv(trt_dtype);
bool isInput = engine_->bindingIsInput(i);
name_to_idx_[name] = i;
idx_to_name_[i] = name;
name_to_host_mem_.insert({name, tv::Tensor(shape, tv_dtype, -1)});
name_to_dev_mem_.insert({name, tv::Tensor(shape, tv_dtype, 0)});
if (isInput)
inp_idxes_.push_back(i);
else
out_idxes_.push_back(i);
bindings_.push_back(name_to_dev_mem_[name].raw_data());
}
checkCudaErrors(cudaStreamCreate(&stream_));
}
std::unordered_map<std::string, tv::Tensor>
operator()(std::vector<tv::Tensor> inputs) {
TV_ASSERT_INVALID_ARG(inputs.size() == inp_idxes_.size(), "must provide",
inp_idxes_.size(), "inputs, but got", inputs.size());
// inference batch size
int bs = inputs[0].dim(0);
for (auto &inp : inputs) {
TV_ASSERT_INVALID_ARG(inp.dim(0) == bs,
"batch sizes of all input must same");
}
TV_ASSERT_INVALID_ARG(bs <= max_batch_size_, "your batchsize too large", bs,
max_batch_size_);
for (int i = 0; i < inputs.size(); ++i) {
auto &dev_mem = name_to_dev_mem_[idx_to_name_[i]];
auto shape_inp = inputs[i].shape().subshape(1);
auto shape_dev = dev_mem.shape().subshape(1);
TV_ASSERT_INVALID_ARG(shape_inp == shape_dev,
"shape except batch must same", shape_inp,
shape_dev);
dev_mem.slice_first_axis(0, bs).copy_(inputs[i].slice_first_axis(0, bs),
stream_);
}
ctx_->enqueue(bs, bindings_.data(), stream_, nullptr);
for (int i : out_idxes_) {
name_to_host_mem_[idx_to_name_[i]].slice_first_axis(0, bs).copy_(
name_to_dev_mem_[idx_to_name_[i]].slice_first_axis(0, bs), stream_);
}
checkCudaErrors(cudaStreamSynchronize(stream_));
std::unordered_map<std::string, tv::Tensor> output_map;
for (int i = 0; i < out_idxes_.size(); ++i) {
auto name = idx_to_name_[out_idxes_[i]];
output_map[name] = name_to_host_mem_[name].slice_first_axis(0, bs);
}
return output_map;
}
std::unordered_map<std::string, tv::Tensor>
operator()(std::unordered_map<std::string, tv::Tensor> inputs) {
std::vector<tv::Tensor> inputs_vec(inp_idxes_.size());
int count = 0;
for (auto &p : inputs) {
auto iter = name_to_idx_.find(p.first);
TV_ASSERT_INVALID_ARG(iter != name_to_idx_.end(), "cant find your name",
p.first);
inputs_vec[name_to_idx_[p.first]] = p.second;
}
TV_ASSERT_INVALID_ARG(count == inp_idxes_.size(), "your inp not enough");
return (*this)(inputs_vec);
}
tv::Tensor operator[](std::string name) {
auto iter = name_to_host_mem_.find(name);
if (iter == name_to_host_mem_.end()) {
TV_THROW_INVALID_ARG(name, "not found.");
}
return iter->second;
}
std::string repr() {
std::stringstream ss;
ss << "InferenceContext[gpu=" << device_ << "]";
ss << "\n Inputs:";
std::string name;
for (auto &i : inp_idxes_) {
name = idx_to_name_[i];
auto &mem = name_to_host_mem_[name];
ss << "\n " << name << "[" << tv::detail::typeString(mem.dtype())
<< "]: " << mem.shape();
}
ss << "\n Outputs:";
for (auto &i : out_idxes_) {
name = idx_to_name_[i];
auto &mem = name_to_host_mem_[name];
ss << "\n " << name << "[" << tv::detail::typeString(mem.dtype())
<< "]: " << mem.shape();
}
return ss.str();
}
private:
Logger logger_;
trt_unique_ptr_t<nvinfer1::ICudaEngine> engine_;
trt_unique_ptr_t<nvinfer1::IExecutionContext> ctx_;
std::unordered_map<std::string, tv::Tensor> name_to_dev_mem_;
std::unordered_map<std::string, tv::Tensor> name_to_host_mem_;
std::unordered_map<std::string, int> name_to_idx_;
std::unordered_map<int, std::string> idx_to_name_;
std::vector<int> inp_idxes_;
std::vector<int> out_idxes_;
std::vector<void *> bindings_;
cudaStream_t stream_;
int max_batch_size_;
int device_;
};
} // namespace trt
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment