Commit 3fff6789 authored by zhangwenwei

Merge branch 'clean_data-ptr' into 'master'

clean c files

See merge request open-mmlab/mmdet.3d!53
parents 16c3f6e1 d1b9ae40
@@ -16,29 +16,24 @@
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <tensorview/tensorview.h>

namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct SparseMaxPoolForwardFunctor {
  void operator()(const Device& d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size);
};

template <typename Device, typename T, typename Index>
struct SparseMaxPoolBackwardFunctor {
  void operator()(const Device& d, tv::TensorView<const T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const T> dout, tv::TensorView<T> din,
                  tv::TensorView<const Index> indices, int size);
};
}  // namespace functor
}  // namespace spconv

#endif
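For orientation, a minimal CPU sketch of what a specialization of
SparseMaxPoolForwardFunctor could look like; the (input row, output row)
layout of `indices` and the per-plane running-max loop are assumptions for
illustration, not the library's actual implementation:

template <typename T, typename Index>
struct SparseMaxPoolForwardFunctor<tv::CPU, T, Index> {
  void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
                  tv::TensorView<const T> inFeatures,
                  tv::TensorView<const Index> indices, int size) {
    int numPlanes = outFeatures.dim(1);
    for (int i = 0; i < size; ++i) {
      Index in = indices(0, i);   // assumed: row 0 holds input row ids
      Index out = indices(1, i);  // assumed: row 1 holds output row ids
      for (int p = 0; p < numPlanes; ++p) {
        T v = inFeatures(in, p);
        if (v > outFeatures(out, p)) outFeatures(out, p) = v;  // running max
      }
    }
  }
};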
@@ -4,7 +4,8 @@
#include <utility>

namespace spconv {
template <class... T>
struct mp_list {};

template <class T, T... I>
using mp_list_c = mp_list<std::integral_constant<T, I>...>;
@@ -16,15 +17,17 @@ constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
}

template <class F>
constexpr F mp_for_each_impl(mp_list<>, F &&f) {
  return std::forward<F>(f);
}
}  // namespace detail

namespace detail {
template <class A, template <class...> class B>
struct mp_rename_impl {
  // An error "no type named 'type'" here means that the first argument to
  // mp_rename is not a list
};
@@ -34,14 +37,15 @@ struct mp_rename_impl<A<T...>, B> {
  using type = B<T...>;
};
}  // namespace detail

template <class A, template <class...> class B>
using mp_rename = typename detail::mp_rename_impl<A, B>::type;

template <class L, class F>
constexpr F mp_for_each(F &&f) {
  return detail::mp_for_each_impl(mp_rename<L, mp_list>(), std::forward<F>(f));
}
}  // namespace spconv

#endif
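These helpers are a trimmed-down version of the Boost.MP11 idiom: mp_for_each
instantiates the callable once per element of a compile-time list. A small
usage sketch (the include path is an assumption based on how the .cu file
later in this diff includes it):

#include <spconv/mp_helper.h>

#include <cstdio>

int main() {
  // Compile-time list of candidate dimensions.
  using dims = spconv::mp_list_c<int, 1, 2, 3, 4>;
  spconv::mp_for_each<dims>([](auto I) {
    // I is a std::integral_constant; its value is usable at compile time,
    // e.g. to dispatch to an NDim-templated functor per entry.
    std::printf("NDim = %d\n", int(I));
  });
  return 0;
}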
@@ -16,11 +16,13 @@
#define NMS_CPU_H
#include <pybind11/pybind11.h>
// must include pybind11/stl.h if using containers in STL in arguments.
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

#include <algorithm>
#include <boost/geometry.hpp>
#include <vector>

#include "box_iou.h"
#include "nms_gpu.h"

namespace spconv {
@@ -48,13 +50,11 @@ std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
  DType xx1, xx2, w, h, inter, ovr;
  for (int _i = 0; _i < ndets; ++_i) {
    i = order_r(_i);
    if (suppressed_rw(i) == 1) continue;
    keep.push_back(i);
    for (int _j = _i + 1; _j < ndets; ++_j) {
      j = order_r(_j);
      if (suppressed_rw(j) == 1) continue;
      xx2 = std::min(boxes_r(i, 2), boxes_r(j, 2));
      xx1 = std::max(boxes_r(i, 0), boxes_r(j, 0));
      w = xx2 - xx1 + eps;
@@ -65,8 +65,7 @@ std::vector<int> non_max_suppression_cpu(py::array_t<DType> boxes,
      if (h > 0) {
        inter = w * h;
        ovr = inter / (area_rw(i) + area_rw(j) - inter);
        if (ovr >= thresh) suppressed_rw(j) = 1;
      }
    }
  }
@@ -97,15 +96,12 @@ std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
  for (int _i = 0; _i < ndets; ++_i) {
    i = order_r(_i);
    if (suppressed_rw(i) == 1) continue;
    keep.push_back(i);
    for (int _j = _i + 1; _j < ndets; ++_j) {
      j = order_r(_j);
      if (suppressed_rw(j) == 1) continue;
      if (standup_iou_r(i, j) <= 0.0) continue;
      // std::cout << "pre_poly" << std::endl;
      try {
        bg::append(poly,
@@ -164,13 +160,12 @@ std::vector<int> rotate_non_max_suppression_cpu(py::array_t<DType> box_corners,
      }
      }*/
      // std::cout << "post_union" << poly_union.empty() << std::endl;
      if (!poly_union.empty()) {  // ignore invalid box
        union_area = bg::area(poly_union.front());
        // std::cout << "post union area" << std::endl;
        // std::cout << union_area << "debug" << std::endl;
        overlap = inter_area / union_area;
        if (overlap >= thresh) suppressed_rw(j) = 1;
        poly_union.clear();
      }
    }
  }
@@ -197,5 +192,5 @@ int non_max_suppression(py::array_t<DType> boxes, py::array_t<int> keep_out,
                      nms_overlap_thresh, device_id);
}
}  // namespace spconv

#endif
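The suppression loops above implement standard greedy NMS: boxes are visited
in score order, and any later box whose overlap with a kept box reaches
`thresh` is suppressed. As a standalone restatement of the overlap test (the
+eps terms mirror non_max_suppression_cpu; whether the box areas also carry
+eps is an assumption here):

#include <algorithm>

// IoU of two axis-aligned boxes in (x1, y1, x2, y2) form.
template <typename DType>
DType axis_aligned_iou(const DType a[4], const DType b[4], DType eps) {
  DType w = std::min(a[2], b[2]) - std::max(a[0], b[0]) + eps;  // overlap width
  DType h = std::min(a[3], b[3]) - std::max(a[1], b[1]) + eps;  // overlap height
  if (w <= 0 || h <= 0) return DType(0);                        // disjoint
  DType inter = w * h;
  DType area_a = (a[2] - a[0] + eps) * (a[3] - a[1] + eps);
  DType area_b = (b[2] - b[0] + eps) * (b[3] - b[1] + eps);
  return inter / (area_a + area_b - inter);  // suppress when >= thresh
}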
@@ -16,27 +16,22 @@
#define NMS_FUNCTOR_H_
#include <tensorview/tensorview.h>

namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct NonMaxSupressionFunctor {
  Index operator()(const Device& d, tv::TensorView<Index> keep,
                   tv::TensorView<const T> boxes, T threshold, T eps);
};

template <typename Device, typename T, typename Index>
struct rotateNonMaxSupressionFunctor {
  Index operator()(const Device& d, tv::TensorView<Index> keep,
                   tv::TensorView<const T> boxCorners,
                   tv::TensorView<const T> standupIoU, T threshold);
};
}  // namespace functor
}  // namespace spconv

#endif
@@ -16,13 +16,15 @@
#include <pybind11/pybind11.h>
// must include pybind11/eigen.h if using eigen matrix as arguments.
// must include pybind11/stl.h if using containers in STL in arguments.
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

#include <algorithm>
// #include <vector>
#include <math.h>

#include <iostream>

namespace spconv {
namespace py = pybind11;
using namespace pybind11::literals;
@@ -64,13 +66,11 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -87,20 +87,19 @@ int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
  }
  return voxel_num;
}

template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
                               py::array_t<DType> voxels,
                               py::array_t<DType> means, py::array_t<int> coors,
                               py::array_t<int> num_points_per_voxel,
                               py::array_t<int> coor_to_voxelidx,
                               std::vector<DType> voxel_size,
                               std::vector<DType> coors_range, int max_points,
                               int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto means_rw = means.template mutable_unchecked<2>();
  auto voxels_rw = voxels.template mutable_unchecked<3>();
@@ -131,13 +130,11 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -151,14 +148,15 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
      }
      num_points_per_voxel_rw(voxelidx) += 1;
      for (int k = 0; k < num_features; ++k) {
        means_rw(voxelidx, k) +=
            (points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
      }
    }
  }
  for (int i = 0; i < voxel_num; ++i) {
    coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
    num = num_points_per_voxel_rw(i);
    for (int j = num; j < max_points; ++j) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(i, j, k) = means_rw(i, k);
      }
@@ -168,15 +166,12 @@ int points_to_voxel_3d_np_mean(py::array_t<DType> points, py::array_t<DType> vox
}

template <typename DType, int NDim>
int points_to_voxel_3d_np_height(
    py::array_t<DType> points, py::array_t<DType> voxels,
    py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto height_rw = height.template mutable_unchecked<2>();
  auto maxs_rw = maxs.template mutable_unchecked<2>();
@@ -208,13 +203,11 @@ int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> v
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -225,7 +218,8 @@ int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> v
    if (num < max_points) {
      for (int k = 0; k < num_features; ++k) {
        voxels_rw(voxelidx, num, k) = points_rw(i, k);
        height_rw(voxelidx, k) =
            std::min(points_rw(i, k), height_rw(voxelidx, k));
        maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
      }
      num_points_per_voxel_rw(voxelidx) += 1;
@@ -241,15 +235,11 @@ int points_to_voxel_3d_np_height(py::array_t<DType> points, py::array_t<DType> v
}

template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
                    py::array_t<DType> height, py::array_t<DType> maxs,
                    py::array_t<int> coor_to_voxelidx,
                    std::vector<DType> voxel_size,
                    std::vector<DType> coors_range, int max_voxels, DType eps) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto height_rw = height.template mutable_unchecked<1>();
  auto maxs_rw = maxs.template mutable_unchecked<1>();
@@ -278,8 +268,7 @@ int block_filtering(py::array_t<DType> points,
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
@@ -299,30 +288,23 @@ int block_filtering(py::array_t<DType> points,
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if ((maxs_rw(voxelidx) - height_rw(voxelidx, 2)) < eps) {
      mask(i) = 0;
    }
  }
}

template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
    py::array_t<DType> points, py::array_t<DType> voxels,
    py::array_t<int> voxel_mask, py::array_t<DType> mins,
    py::array_t<DType> maxs, py::array_t<int> coors,
    py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
    std::vector<DType> voxel_size, std::vector<DType> coors_range,
    int max_points, int max_voxels, int block_factor, int block_size,
    DType height_threshold) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto mins_rw = mins.template mutable_unchecked<2>();
  auto maxs_rw = maxs.template mutable_unchecked<2>();
@@ -361,13 +343,11 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
      }
      coor[ndim_minus_1 - j] = c;
    }
    if (failed) continue;
    voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if (voxelidx == -1) {
      voxelidx = voxel_num;
      if (voxel_num >= max_voxels) break;
      voxel_num += 1;
      coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
      for (int k = 0; k < NDim; ++k) {
@@ -381,8 +361,10 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
    }
    block_coor[0] = coor[1] / block_factor;
    block_coor[1] = coor[2] / block_factor;
    mins_rw(block_coor[0], block_coor[1]) =
        std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
    maxs_rw(block_coor[0], block_coor[1]) =
        std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
      num_points_per_voxel_rw(voxelidx) += 1;
    }
  }
@@ -394,13 +376,15 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
    block_coor[1] = coor[2] / block_factor;
    min_value = mins_rw(block_coor[0], block_coor[1]);
    max_value = maxs_rw(block_coor[0], block_coor[1]);
    startx = std::max(0, block_coor[0] - block_size / 2);
    stopx =
        std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
    starty = std::max(0, block_coor[1] - block_size / 2);
    stopy =
        std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
    for (int j = startx; j < stopx; ++j) {
      for (int k = starty; k < stopy; ++k) {
        min_value = std::min(min_value, mins_rw(j, k));
        max_value = std::max(max_value, maxs_rw(j, k));
      }
@@ -410,5 +394,4 @@ int points_to_voxel_3d_with_filtering(py::array_t<DType> points, py::array_t<DTy
  return voxel_num;
}
}  // namespace spconv
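The voxelization routines above all share the same core index computation:
shift a point by the lower bound of coors_range, divide by voxel_size, floor
to an integer cell, and reject points outside the grid. A standalone
restatement for one point (the grid extents and sample values here are made
up for illustration; the reversed (z, y, x) storage mirrors
coor[ndim_minus_1 - j] in the source):

#include <cmath>
#include <cstdio>

int main() {
  const float voxel_size[3] = {0.2f, 0.2f, 0.4f};     // x, y, z cell extents
  const float coors_range[3] = {-40.f, -40.f, -3.f};  // lower bounds only
  const int grid_size[3] = {400, 400, 15};            // assumed cells per axis
  const float point[3] = {12.3f, -7.9f, 0.5f};        // one sample point
  int coor[3];
  bool failed = false;
  for (int j = 0; j < 3; ++j) {
    int c = int(std::floor((point[j] - coors_range[j]) / voxel_size[j]));
    if (c < 0 || c >= grid_size[j]) failed = true;  // outside the grid
    coor[2 - j] = c;                                // store reversed: (z, y, x)
  }
  if (!failed)
    std::printf("voxel coor (z, y, x) = %d %d %d\n", coor[0], coor[1], coor[2]);
  return 0;
}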
@@ -156,6 +156,6 @@ __global__ void scatterAddVecBlockKernel(T *outFeatures, const T *buffer,
  }
}
}  // namespace spconv

#endif
@@ -16,25 +16,23 @@
#define SPARSE_REORDERING_FUNCTOR_H_
#include <tensorview/tensorview.h>

namespace spconv {
namespace functor {
template <typename Device, typename T, typename Index>
struct SparseGatherFunctor {
  void operator()(const Device& d, tv::TensorView<T> buffer,
                  tv::TensorView<const T> features,
                  tv::TensorView<const Index> indices, int size);
};

template <typename Device, typename T, typename Index>
struct SparseScatterAddFunctor {
  void operator()(const Device& d, tv::TensorView<T> out_features,
                  tv::TensorView<const T> buffer,
                  tv::TensorView<const Index> indices, int size,
                  bool stable = false);
};
}  // namespace functor
}  // namespace spconv

#endif
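SparseGatherFunctor and SparseScatterAddFunctor declare the gather / GEMM /
scatter-add decomposition used by sparse convolution: gather the active input
rows into a dense buffer, multiply, then scatter-add the results back. A
minimal CPU sketch of the two data movements (buffer layout and index
semantics are assumptions for illustration, not the library's implementation):

#include <vector>

// Copy the rows listed in `indices` from `features` into a dense `buffer`.
void gather(std::vector<float>& buffer, const std::vector<float>& features,
            const std::vector<int>& indices, int numPlanes) {
  for (size_t i = 0; i < indices.size(); ++i)
    for (int p = 0; p < numPlanes; ++p)
      buffer[i * numPlanes + p] = features[indices[i] * numPlanes + p];
}

// Accumulate buffer rows back into `out_features` at the indexed rows.
void scatter_add(std::vector<float>& out_features,
                 const std::vector<float>& buffer,
                 const std::vector<int>& indices, int numPlanes) {
  for (size_t i = 0; i < indices.size(); ++i)
    for (int p = 0; p < numPlanes; ++p)
      out_features[indices[i] * numPlanes + p] += buffer[i * numPlanes + p];
}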
#pragma once
// from tensorflow
namespace tv {
namespace detail {

template <typename T>
class KernelLoop {
  struct Iterator {
    __forceinline__ __device__ Iterator(T index, T delta)
        : index_(index), delta_(delta) {}
    __forceinline__ __device__ T operator*() const { return index_; }
    __forceinline__ __device__ Iterator &operator++() {
      index_ += delta_;
      return *this;
    }
    __forceinline__ __device__ bool operator!=(const Iterator &other) const {
      bool greater = index_ > other.index_;
      bool less = index_ < other.index_;
      // Anything past an end iterator (delta_ == 0) is equal.
      // In range-based for loops, this optimizes to 'return less'.
      if (!other.delta_) {
        return less;
      }
      if (!delta_) {
        return greater;
      }
      return less || greater;
    }

   private:
    T index_;
    const T delta_;
  };

 public:
  __forceinline__ __device__ KernelLoop(T begin, T delta, T end)
      : begin_(begin), delta_(delta), end_(end) {}

  __forceinline__ __device__ Iterator begin() const {
    return Iterator{begin_, delta_};
  }
  __forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }

 private:
  T begin_;
  T delta_;
  T end_;
};

}  // namespace detail

template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopX(T count) {
  return detail::KernelLoop<T>(blockIdx.x * blockDim.x + threadIdx.x,
                               gridDim.x * blockDim.x * NumILP, count);
}

// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopY(T count) {
  return detail::KernelLoop<T>(blockIdx.y * blockDim.y + threadIdx.y,
                               gridDim.y * blockDim.y * NumILP, count);
}

// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename T, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<T> KernelLoopZ(T count) {
  return detail::KernelLoop<T>(blockIdx.z * blockDim.z + threadIdx.z,
                               gridDim.z * blockDim.z * NumILP, count);
}

}  // namespace tv
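KernelLoopX/Y/Z wrap the standard grid-stride loop, so a kernel written
against them covers all `count` elements regardless of launch size. A minimal
sketch using KernelLoopX (the SAXPY body and launch configuration are
illustrative, not from this library):

template <typename T>
__global__ void saxpyKernel(T *y, const T *x, T a, int count) {
  // Each thread starts at its global index and strides by the grid size.
  for (int i : tv::KernelLoopX<int>(count)) {
    y[i] = a * x[i] + y[i];
  }
}

// Example launch: saxpyKernel<float><<<64, 256>>>(d_y, d_x, 2.0f, n);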
@@ -13,10 +13,11 @@
// limitations under the License.

#pragma once
#include <cuda_runtime_api.h>

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
// #include <prettyprint.h>
@@ -42,22 +43,22 @@ namespace tv {
#define TV_HOST_DEVICE
#endif

#define TV_REQUIRE(expr, ...) \
  {                           \
    if (!(expr)) {            \
      printf(__VA_ARGS__);    \
      assert(expr);           \
    }                         \
  }

#define TV_DEVICE_REQUIRE(expr, ...)                      \
  {                                                       \
    if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \
    assert(expr);                                         \
  }

template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
  ss << val;
}
@@ -67,37 +68,37 @@ void sstream_print(SStream &ss, T val, TArgs... args) {
  sstream_print(ss, args...);
}

#define TV_ASSERT_RT_ERR(expr, ...)                     \
  {                                                     \
    if (!(expr)) {                                      \
      std::stringstream __macro_s;                      \
      __macro_s << __FILE__ << " " << __LINE__ << "\n"; \
      __macro_s << #expr << " assert faild. ";          \
      tv::sstream_print(__macro_s, __VA_ARGS__);        \
      throw std::runtime_error(__macro_s.str());        \
    }                                                   \
  }

#define TV_ASSERT_INVALID_ARG(expr, ...)                \
  {                                                     \
    if (!(expr)) {                                      \
      std::stringstream __macro_s;                      \
      __macro_s << __FILE__ << " " << __LINE__ << "\n"; \
      __macro_s << #expr << " assert faild. ";          \
      tv::sstream_print(__macro_s, __VA_ARGS__);        \
      throw std::invalid_argument(__macro_s.str());     \
    }                                                   \
  }

#define TV_CHECK_CUDA_ERR()                                    \
  {                                                            \
    auto err = cudaGetLastError();                             \
    if (err != cudaSuccess) {                                  \
      std::stringstream __macro_s;                             \
      __macro_s << __FILE__ << " " << __LINE__ << "\n";        \
      __macro_s << "cuda execution failed with error " << err; \
      throw std::runtime_error(__macro_s.str());               \
    }                                                          \
  }
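The assertion macros above follow a common pattern: evaluate the condition,
and on failure build a file/line message via tv::sstream_print and throw. A
host-side usage sketch (myKernel is a hypothetical kernel, shown commented
out):

void launch_checked(const float *in, float *out, int n) {
  TV_ASSERT_RT_ERR(n > 0, "expected a positive element count, got ", n);
  // myKernel<<<tv::launch::getBlocks(n), 256>>>(in, out, n);
  TV_CHECK_CUDA_ERR();  // converts the last CUDA error into a runtime_error
}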
struct GPU {
@@ -130,7 +131,7 @@ constexpr size_t calc_align(size_t ndim)
 */
template <typename T, size_t MaxDim = TV_MAX_DIM>
struct /*alignas(calc_align<T>(MaxDim))*/ SimpleVector {
 public:
  TV_HOST_DEVICE_INLINE SimpleVector(){};
  TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<T> q) {
    TV_ASSERT(q.size() <= MaxDim);
@@ -187,7 +188,7 @@ public:
  typedef size_t size_type;

  class iterator {
   public:
    typedef iterator self_type;
    typedef T value_type;
    typedef T &reference;
@@ -213,12 +214,12 @@ public:
      return ptr_ != rhs.ptr_;
    }

   private:
    pointer ptr_;
  };

  class const_iterator {
   public:
    typedef const_iterator self_type;
    typedef T value_type;
    typedef const T &reference;
@@ -244,7 +245,7 @@ public:
      return ptr_ != rhs.ptr_;
    }

   private:
    pointer ptr_;
  };
@@ -267,7 +268,7 @@ public:
    return const_iterator(mArray + mSize);
  }

 protected:
  T mArray[MaxDim];
  size_t mSize = 0;
};
@@ -275,11 +276,9 @@ protected:
template <typename T, size_t MaxDim>
bool operator==(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  if (lfs.size() != rfs.size()) return false;
  for (size_t i = 0; i < lfs.size(); ++i) {
    if (lfs[i] != rfs[i]) return false;
  }
  return true;
}
@@ -287,12 +286,12 @@ bool operator==(const SimpleVector<T, MaxDim> &lfs,
template <typename T, size_t MaxDim>
bool operator!=(const SimpleVector<T, MaxDim> &lfs,
                const SimpleVector<T, MaxDim> &rfs) {
  return !(lfs == rfs);
}

struct Slice {
  template <class... Integers>
  TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must smaller than 3");
    SimpleVector<int, 3> slices{int(ints)...};
    mSlices[0] = -1;
@@ -333,7 +332,7 @@ struct Slice {
    return mSlices[idx];
  }

 protected:
  int mSlices[3];
};
@@ -372,8 +371,7 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
  }

  TV_HOST_DEVICE_INLINE size_t size() const {
    if (this->mSize == 0) return 0;
    size_t s = 1;
    for (int i = 0; i < int(this->mSize); ++i) {
      s *= this->mArray[i];
@@ -384,16 +382,14 @@ struct ShapeBase : public SimpleVector<int, MaxDim> {
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }
  TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
    ShapeBase<MaxDim> shape;
    for (int i = 0; i < this->mSize; ++i) {
      if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
    }
    return shape;
  }
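Behavior sketch for the two squeeze overloads above (the values follow
directly from the loops; Shape is assumed to be an alias of ShapeBase):

// tv::Shape s{1, 3, 1, 4};
// s.squeeze();   // -> {3, 4}       : drops every extent-1 dimension
// s.squeeze(2);  // -> {1, 3, 4}    : drops dimension 2 only, since it is 1
// s.squeeze(1);  // -> {1, 3, 1, 4} : dimension 1 has extent 3, so it stays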
@@ -479,7 +475,8 @@ TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
  return index;
}

template <int N>
struct ArrayIndexRowMajor {
  // mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
@@ -488,7 +485,8 @@ template <int N> struct ArrayIndexRowMajor {
  }
};

template <>
struct ArrayIndexRowMajor<0> {
  TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
                                            const Shape &indexes) {
    return 0;
@@ -496,24 +494,36 @@ template <> struct ArrayIndexRowMajor<0> {
};

namespace detail {
template <typename T>
constexpr const char *simpleTypeName(T val = T());
template <>
constexpr const char *simpleTypeName(float val) {
  return "float32";
}
template <>
constexpr const char *simpleTypeName(double val) {
  return "float64";
}
template <>
constexpr const char *simpleTypeName(int val) {
  return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
  return "uint32";
}
template <>
constexpr const char *simpleTypeName(long val) {
  return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
  return "uint64";
}
};  // namespace detail
template <typename T, int Rank = -1>
struct TensorView {
  TV_HOST_DEVICE_INLINE TensorView() {}
  explicit TV_HOST_DEVICE_INLINE TensorView(T *ptr, Shape shape)
      : mPtr(ptr), mShape(shape) {}
@@ -526,29 +536,28 @@ template <typename T, int Rank = -1> struct TensorView {
    mShape = {int(shapes)...};
  }

  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
      const TensorView<T, Rank> &tensor) {
    TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
               "\n");
    T *ptr = mPtr;
    const T *other_ptr = tensor.data();
    for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
    return *this;
  }

  template <typename T1>
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> &assign(
      std::initializer_list<T1> seq) {
    TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
               "\n");
    T *ptr = mPtr;
    for (const T1 &s : seq) *(ptr++) = T(s);
    return *this;
  }

  template <class... Inds>
  TV_HOST_DEVICE_INLINE T &operator()(Inds... inds) {
#ifdef TV_DEBUG
    int idxes[sizeof...(Inds)]{int(inds)...};
    TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
@@ -610,7 +619,8 @@ template <typename T, int Rank = -1> struct TensorView {
    return mPtr[0];
  }

  template <class T1>
  TV_HOST_DEVICE_INLINE T &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
@@ -711,7 +721,8 @@ template <typename T, int Rank = -1> struct TensorView {
    return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
  }

  template <class T1>
  TV_HOST_DEVICE_INLINE const T &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
    TV_DEVICE_REQUIRE(mShape.ndim() == 1,
@@ -843,12 +854,12 @@ template <typename T, int Rank = -1> struct TensorView {
#endif
    return mPtr[idx];
  }*/

  TV_HOST_DEVICE_INLINE TensorView<T, Rank> operator[](
      SimpleVector<Slice> slice_vec) {
    return _subview(slice_vec);
  }
  TV_HOST_DEVICE_INLINE const TensorView<T, Rank> operator[](
      SimpleVector<Slice> slice_vec) const {
    return _subview(slice_vec);
  }
  TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
@@ -917,7 +928,7 @@ template <typename T, int Rank = -1> struct TensorView {
        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
        TV_ASSERT(new_shape[i] >= 0);
      } else {
        new_shape[i] = 1;  // reduce dim
      }
    }
    auto offset = rowArrayIdx(mShape, start);
@@ -952,8 +963,7 @@ template <typename T, int Rank = -1> struct TensorView {
  std::string repr() const {
    std::ostringstream ss;
    if (empty()) return "";
    if (mShape.ndim() == 0) {
      ss << *mPtr;
      // ss << fmt::format("\nTensor: shape={}, dtype={}", mShape,
@@ -980,14 +990,12 @@ template <typename T, int Rank = -1> struct TensorView {
          print_comma = false;
        }
      }
      if (print_comma && i != this->size() - 1) ss << ", ";
      for (int j = 0; j < inc_count; ++j) {
        ss << "]";
      }
      if (i != this->size() - 1) {
        if (inc_count != 0) ss << "\n";
        for (int j = 0; j < inc_count; ++j) {
          ss << "[";
        }
@@ -1000,11 +1008,11 @@ template <typename T, int Rank = -1> struct TensorView {
    return ss.str();
  }

 protected:
  // TODO: make this function public.
  // currently this function is called unexpectedly when using subview({0, 0}).
  TV_HOST_DEVICE_INLINE TensorView<T, Rank> _subview(
      SimpleVector<Slice> slice_vec) {
    Shape new_shape;
    for (int i = 0; i < slice_vec.size(); ++i) {
      new_shape.push_back(slice_vec[i][0]);
@@ -1022,7 +1030,7 @@ protected:
        new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
        TV_ASSERT(new_shape[i] >= 0);
      } else {
        new_shape[i] = 1;  // reduce dim
      }
    }
    auto offset = rowArrayIdx(mShape, start);
@@ -1041,7 +1049,8 @@ protected:
    }
    return TensorView<T, Rank>(mPtr + offset, reduced_shape);
  }

  template <typename T1>
  TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
    return Slice{int(s), -1, -1};
  }
@@ -1064,26 +1073,38 @@ Os &operator<<(Os &os, const TensorView<const T, Rank> &dt) {
}

namespace detail {
template <typename T>
constexpr const char *printfTypeFormat(T val = T());
template <>
constexpr const char *printfTypeFormat(float val) {
  return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double val) {
  return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(int val) {
  return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned val) {
  return "%u";
}
template <>
constexpr const char *printfTypeFormat(long val) {
  return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long val) {
  return "%lu";
}
};  // namespace detail

template <typename T>
TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
                                    const char *format) {
  if (tensor.empty()) return;
  if (tensor.ndim() == 0) {
    printf(format, tensor());
    printf("\n");
@@ -1108,14 +1129,12 @@ TV_HOST_DEVICE void printTensorView(const TensorView<T> tensor,
        print_comma = false;
      }
    }
    if (print_comma && i != tensor.size() - 1) printf(", ");
    for (int j = 0; j < inc_count; ++j) {
      printf("]");
    }
    if (i != tensor.size() - 1) {
      if (inc_count != 0) printf("\n");
      for (int j = 0; j < inc_count; ++j) {
        printf("[");
      }
@@ -1141,4 +1160,4 @@ TV_HOST_DEVICE void printTensorView(const T *ptr, Shape shape,
  return printTensorView(TensorView<const T>(ptr, shape), format);
}
}  // namespace tv
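A minimal host-side sketch of how a tv::TensorView is meant to be used: it is
a non-owning view over an existing buffer plus a Shape, with row-major
operator() indexing (the snippet assumes the two-argument constructor shown
above is usable from host code):

void tensorview_example() {
  float data[2 * 3] = {0, 1, 2, 3, 4, 5};
  tv::TensorView<float> view(data, {2, 3});  // 2x3 view, no copy made
  float v = view(1, 2);                      // row-major: data[1 * 3 + 2] == 5
  view(0, 0) = 42.0f;                        // writes through to `data`
}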
@@ -23,61 +23,57 @@ namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<Index> indicesOut,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    if (transpose)
      return getIndicePairsDeConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
    else
      return getIndicePairsConv<Index, IndexGrid, NDim>(
          indicesIn, indicesOut, gridsOut, indicePairs, indiceNum,
          kernelSize.data(), stride.data(), padding.data(), dilation.data(),
          outSpatialShape.data());
  }
};

template <typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor<tv::CPU, Index, IndexGrid, NDim> {
  Index operator()(const tv::CPU& d, tv::TensorView<const Index> indicesIn,
                   tv::TensorView<IndexGrid> gridsOut,
                   tv::TensorView<Index> indicePairs,
                   tv::TensorView<Index> indiceNum,
                   const tv::SimpleVector<Index, NDim> kernelSize,
                   const tv::SimpleVector<Index, NDim> stride,
                   const tv::SimpleVector<Index, NDim> padding,
                   const tv::SimpleVector<Index, NDim> dilation,
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    return getIndicePairsSubM<Index, IndexGrid, NDim>(
        indicesIn, gridsOut, indicePairs, indiceNum, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data());
  }
};
}  // namespace functor

#define DECLARE_CPU_SPECS_INDEX_NDIM(Index, NDIM)                           \
  template struct functor::CreateConvIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;               \
  template struct functor::CreateSubMIndicePairFunctor<tv::CPU, Index, int, \
                                                       NDIM>;

#define DECLARE_CPU_INDEX(Index)          \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 1); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 2); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 3); \
  DECLARE_CPU_SPECS_INDEX_NDIM(Index, 4);

DECLARE_CPU_INDEX(int);
@@ -86,4 +82,4 @@ DECLARE_CPU_INDEX(long);
#undef DECLARE_CPU_INDEX
#undef DECLARE_CPU_SPECS_INDEX_NDIM
}  // namespace spconv
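The DECLARE_CPU_* macros rely on explicit template instantiation: the
"template struct Foo<...>;" form forces the compiler to emit the functor
bodies for each listed Index/NDIM combination in this translation unit, so
other files can link against them without seeing the definitions. A
stripped-down illustration of the mechanism:

template <typename T>
struct Doubler {
  T operator()(T v) const { return v + v; }
};

// Emit Doubler<int> and Doubler<long> here; other translation units may then
// use them with only a declaration visible.
template struct Doubler<int>;
template struct Doubler<long>;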
@@ -13,16 +13,17 @@
// limitations under the License.

#include <ATen/ATen.h>
#include <spconv/indice.cu.h>
#include <spconv/indice.h>
#include <spconv/mp_helper.h>
#include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h>
#include <utility/timer.h>

#include <chrono>
#include <limits>
#include <type_traits>

namespace spconv {
namespace functor {
template <typename Index, typename IndexGrid, unsigned NDim>
@@ -41,21 +42,20 @@ struct CreateConvIndicePairFunctorP1<tv::GPU, Index, IndexGrid, NDim> {
                   bool transpose) {
    Index batchSize = gridsOut.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    // auto timer = spconv::CudaContextTimer<>();
    if (transpose)
      prepareDeConvIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    else
      prepareIndicePairsKernel<Index, IndexGrid, NDim, 4096>
          <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
             d.getStream()>>>(indicesIn, indicesOut, gridsOut, indicePairs,
                              indiceNum, indicePairUnique, kernelSize, stride,
                              padding, dilation, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    // std::cout << "p1 gene time " << timer.report() / 1000.0 << std::endl;
    return 1;
@@ -75,18 +75,17 @@ struct CreateConvIndicePairFunctorP2<tv::GPU, Index, IndexGrid, NDim> {
    Index batchSize = gridsOut.dim(0);
    auto kernelVolume = indicePairs.dim(0);
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    Index numAct = indicePairUnique.dim(0) - 1;
    assignGridAndIndiceOutKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numAct), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numAct, indicePairs,
                            indicePairUnique, outSpatialShape, batchSize);
    TV_CHECK_CUDA_ERR();
    assignIndicePairsKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
           d.getStream()>>>(indicesOut, gridsOut, numActIn, indicePairs,
                            indicePairUnique, outSpatialShape);
    TV_CHECK_CUDA_ERR();
    if (resetGrid) {
      resetGridKernel<Index, IndexGrid, NDim>
@@ -111,8 +110,7 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
                   const tv::SimpleVector<Index, NDim> outSpatialShape,
                   bool transpose, bool resetGrid) {
    auto numActIn = indicesIn.dim(0);
    if (numActIn == 0) return 0;
    // auto timer = spconv::CudaContextTimer<>();
    prepareSubMGridKernel<Index, IndexGrid, NDim>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
@@ -121,38 +119,40 @@ struct CreateSubMIndicePairFunctor<tv::GPU, Index, IndexGrid, NDim> {
    getSubMIndicePairsKernel<Index, IndexGrid, NDim, 4096>
        <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum, d.getStream()>>>(indicesIn, gridsOut, indicePairs, indiceNum,
kernelSize, stride, padding, dilation, outSpatialShape); kernelSize, stride, padding, dilation,
outSpatialShape);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
// std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl; // std::cout << "subm gene time " << timer.report() / 1000.0 << std::endl;
if (resetGrid) { if (resetGrid) {
resetGridSubMKernel<Index, IndexGrid, NDim> resetGridSubMKernel<Index, IndexGrid, NDim>
<<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0, <<<tv::launch::getBlocks(numActIn), tv::launch::CUDA_NUM_THREADS, 0,
d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape, numActIn); d.getStream()>>>(indicesIn.data(), gridsOut, outSpatialShape,
numActIn);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
return numActIn; return numActIn;
} }
}; };
} // namespace functor } // namespace functor
#define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \ #define DECLARE_GPU_SPECS_INDEX_NDIM(Index, NDIM) \
template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int, \ template struct functor::CreateConvIndicePairFunctor<tv::GPU, Index, int, \
NDIM>; \ NDIM>; \
template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \ template struct functor::CreateConvIndicePairFunctorP1<tv::GPU, Index, int, \
NDIM>; \ NDIM>; \
template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \ template struct functor::CreateConvIndicePairFunctorP2<tv::GPU, Index, int, \
NDIM>; \ NDIM>; \
template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int, \ template struct functor::CreateSubMIndicePairFunctor<tv::GPU, Index, int, \
NDIM>; NDIM>;
#define DECLARE_GPU_INDEX(Index) \ #define DECLARE_GPU_INDEX(Index) \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \ DECLARE_GPU_SPECS_INDEX_NDIM(Index, 1); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \ DECLARE_GPU_SPECS_INDEX_NDIM(Index, 2); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \ DECLARE_GPU_SPECS_INDEX_NDIM(Index, 3); \
DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4); DECLARE_GPU_SPECS_INDEX_NDIM(Index, 4);
DECLARE_GPU_INDEX(int); DECLARE_GPU_INDEX(int);
#undef DECLARE_GPU_INDEX #undef DECLARE_GPU_INDEX
#undef DECLARE_GPU_SPECS_INDEX_NDIM #undef DECLARE_GPU_SPECS_INDEX_NDIM
} // namespace spconv } // namespace spconv
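For context: the GPU path above builds convolution index pairs in two phases. CreateConvIndicePairFunctorP1 writes, for each active input voxel and kernel offset, a flattened output-spatial index into indicePairUnique; the caller then deduplicates that buffer (not shown in this diff; presumably a sort-plus-unique pass, and numAct being read as indicePairUnique.dim(0) - 1 suggests the buffer carries one extra sentinel entry). CreateConvIndicePairFunctorP2 then scatters the compacted output ids into the dense grid and rewrites the pairs to the compacted numbering. A host-side sketch of that orchestration, with hypothetical names and the kernel launches elided:

// Editorial sketch of the two-phase flow (hypothetical function name; the
// real dedup step lives in caller code not shown in this diff).
#include <algorithm>
#include <vector>

int createConvIndicePairs(std::vector<int>& indicePairUnique) {
  // Phase 1 (P1): each active input voxel emits, per kernel offset, a
  // flattened output-spatial index into indicePairUnique.
  // ... kernel launch elided ...

  // Between P1 and P2: compact duplicates so each active output site
  // appears exactly once (on GPU this would be a sort + unique pass).
  std::sort(indicePairUnique.begin(), indicePairUnique.end());
  indicePairUnique.erase(
      std::unique(indicePairUnique.begin(), indicePairUnique.end()),
      indicePairUnique.end());

  // Phase 2 (P2): scatter compacted output ids into the dense grid and
  // rewrite (in, out) pairs to the compacted numbering.
  // ... kernel launches elided ...
  return static_cast<int>(indicePairUnique.size());
}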
...@@ -62,14 +62,14 @@ struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> { ...@@ -62,14 +62,14 @@ struct SparseMaxPoolBackwardFunctor<tv::CPU, T, Index> {
} }
} }
}; };
} // namespace functor } // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \ #define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \ template struct functor::SparseMaxPoolForwardFunctor<tv::CPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>; template struct functor::SparseMaxPoolBackwardFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \ #define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \ DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long); DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float); DECLARE_CPU_SPECS(float);
...@@ -79,4 +79,4 @@ DECLARE_CPU_SPECS(at::Half); ...@@ -79,4 +79,4 @@ DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS #undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX #undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv } // namespace spconv
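For reference, the rule these CPU functors implement: the forward pass takes a running elementwise max over all gathered input rows that map to the same output row, and the backward pass routes each output gradient to the input position that achieved the max, detected by comparing the stored input and output values. A simplified sketch on raw float pointers (the real code is templated on T and Index and iterates tv::TensorView):

// Editorial sketch of the max-pool rule above (simplified).
void maxPoolFwd(float* out, const float* in, const int* idxIn,
                const int* idxOut, int size, int numPlanes) {
  // Assumes out rows were pre-initialized by the caller (e.g. to -infinity).
  for (int i = 0; i < size; ++i) {
    const float* src = in + idxIn[i] * numPlanes;
    float* dst = out + idxOut[i] * numPlanes;
    for (int j = 0; j < numPlanes; ++j)
      if (src[j] > dst[j]) dst[j] = src[j];  // running elementwise max
  }
}

void maxPoolBwd(const float* out, const float* in, const float* dout,
                float* din, const int* idxIn, const int* idxOut,
                int size, int numPlanes) {
  for (int i = 0; i < size; ++i) {
    const int io = idxOut[i] * numPlanes, ii = idxIn[i] * numPlanes;
    for (int j = 0; j < numPlanes; ++j)
      if (in[ii + j] == out[io + j])  // gradient flows to the argmax input
        din[ii + j] += dout[io + j];
  }
}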
...@@ -13,13 +13,14 @@ ...@@ -13,13 +13,14 @@
// limitations under the License. // limitations under the License.
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/maxpool.h> #include <spconv/maxpool.h>
#include <spconv/mp_helper.h> #include <spconv/mp_helper.h>
#include <tensorview/helper_kernel.cu.h> #include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h> #include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <type_traits> #include <type_traits>
namespace spconv { namespace spconv {
...@@ -54,10 +55,11 @@ __global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures, ...@@ -54,10 +55,11 @@ __global__ void maxPoolFwdBlockKernel(T *outFeatures, const T *inFeatures,
} }
template <typename T, typename Index, int NumTLP, int NumILP> template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void __global__ void maxPoolFwdGenericBlockKernel(T *outFeatures,
maxPoolFwdGenericBlockKernel(T *outFeatures, const T *inFeatures, const T *inFeatures,
const Index *indicesIn, const Index *indicesOut, const Index *indicesIn,
int numHot, int numPlanes) { const Index *indicesOut,
int numHot, int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf. // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP]; int ILPStrideX[NumILP];
Index RI[NumILP]; Index RI[NumILP];
...@@ -160,10 +162,11 @@ __global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures, ...@@ -160,10 +162,11 @@ __global__ void maxPoolFwdGenericKernel(T *outFeatures, const T *inFeatures,
} }
template <typename T, typename Index, int NumTLP, int NumILP> template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void __global__ void maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures,
maxPoolBwdBlockKernel(const T *outFeatures, const T *inFeatures, const T *dout, const T *dout, T *din,
T *din, const Index *indicesIn, const Index *indicesOut, const Index *indicesIn,
int numHot, int numPlanes) { const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf. // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
T in, out; T in, out;
Index idxo, idxi; Index idxo, idxi;
...@@ -226,10 +229,11 @@ __global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures, ...@@ -226,10 +229,11 @@ __global__ void maxPoolBwdGenericBlockKernel(const T *outFeatures,
} }
template <typename T, typename Index, int NumTLP, int NumILP, typename VecType> template <typename T, typename Index, int NumTLP, int NumILP, typename VecType>
__global__ void __global__ void maxPoolBwdVecBlockKernel(const T *outFeatures,
maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures, const T *inFeatures, const T *dout,
const T *dout, T *din, const Index *indicesIn, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) { const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf. // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideY[NumILP]; int ILPStrideY[NumILP];
constexpr int vecloadFactor = sizeof(VecType) / sizeof(T); constexpr int vecloadFactor = sizeof(VecType) / sizeof(T);
...@@ -255,7 +259,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures, ...@@ -255,7 +259,8 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
reinterpret_cast<const VecType *>(inFeatures)[idxi]; reinterpret_cast<const VecType *>(inFeatures)[idxi];
reinterpret_cast<VecType *>(bufdo)[0] = reinterpret_cast<VecType *>(bufdo)[0] =
reinterpret_cast<const VecType *>(dout)[idxo]; reinterpret_cast<const VecType *>(dout)[idxo];
reinterpret_cast<VecType *>(bufdi)[0] = reinterpret_cast<VecType *>(din)[idxi]; reinterpret_cast<VecType *>(bufdi)[0] =
reinterpret_cast<VecType *>(din)[idxi];
#pragma unroll #pragma unroll
for (int i = 0; i < vecloadFactor; i++) { for (int i = 0; i < vecloadFactor; i++) {
...@@ -263,16 +268,18 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures, ...@@ -263,16 +268,18 @@ maxPoolBwdVecBlockKernel(const T *outFeatures, const T *inFeatures,
bufdi[i] += bufdo[i]; bufdi[i] += bufdo[i];
} }
} }
reinterpret_cast<VecType *>(din)[idxi] = reinterpret_cast<VecType *>(bufdi)[0]; reinterpret_cast<VecType *>(din)[idxi] =
reinterpret_cast<VecType *>(bufdi)[0];
} }
} }
} }
template <typename T, typename Index, int NumTLP, int NumILP> template <typename T, typename Index, int NumTLP, int NumILP>
__global__ void __global__ void maxPoolBwdGenericKernel(const T *outFeatures,
maxPoolBwdGenericKernel(const T *outFeatures, const T *inFeatures, const T *inFeatures, const T *dout,
const T *dout, T *din, const Index *indicesIn, T *din, const Index *indicesIn,
const Index *indicesOut, int numHot, int numPlanes) { const Index *indicesOut, int numHot,
int numPlanes) {
// see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf. // see http://www.nvidia.com/content/GTC-2010/pdfs/2238_GTC2010.pdf.
int ILPStrideX[NumILP]; int ILPStrideX[NumILP];
Index RI[NumILP]; Index RI[NumILP];
...@@ -313,8 +320,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { ...@@ -313,8 +320,7 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures, void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> inFeatures, tv::TensorView<const T> inFeatures,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
if (size <= 0) if (size <= 0) return;
return;
int numPlanes = inFeatures.dim(1); int numPlanes = inFeatures.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
...@@ -326,13 +332,14 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { ...@@ -326,13 +332,14 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
if (notFound) { if (notFound) {
if (numPlanes % NumTLP == 0) { if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) { if (numHotBlock >= NumTLP) {
maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> maxPoolFwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP), <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(), d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data(), indices.subview(0).data(),
indices.subview(1).data(), numHotBlock, indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
...@@ -340,9 +347,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> { ...@@ -340,9 +347,9 @@ struct SparseMaxPoolForwardFunctor<tv::GPU, T, Index> {
maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP> maxPoolFwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP), <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(), 0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
indices.subview(0).data() + numHotBlock, indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes); size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -387,8 +394,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -387,8 +394,7 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
tv::TensorView<const T> inFeatures, tv::TensorView<const T> inFeatures,
tv::TensorView<const T> dout, tv::TensorView<T> din, tv::TensorView<const T> dout, tv::TensorView<T> din,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
if (size <= 0) if (size <= 0) return;
return;
int numPlanes = inFeatures.dim(1); int numPlanes = inFeatures.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
...@@ -400,14 +406,15 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -400,14 +406,15 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
if (notFound) { if (notFound) {
if (numPlanes % NumTLP == 0) { if (numPlanes % NumTLP == 0) {
if (numHotBlock >= NumTLP) { if (numHotBlock >= NumTLP) {
maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> maxPoolBwdVecBlockKernel<T, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP), <<<dim3(std::min(size / NumTLP, 512), numPlanes / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), inFeatures.data(), d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(), dout.data(), din.data(),
indices.subview(0).data(), indices.subview(0).data(),
indices.subview(1).data(), numHotBlock, indices.subview(1).data(), numHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
...@@ -415,10 +422,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -415,10 +422,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP> maxPoolBwdGenericKernel<T, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP), <<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(outFeatures.data(), inFeatures.data(), 0, d.getStream()>>>(outFeatures.data(), inFeatures.data(),
dout.data(), din.data(), dout.data(), din.data(),
indices.subview(0).data() + numHotBlock, indices.subview(0).data() + numHotBlock,
indices.subview(1).data() + numHotBlock, indices.subview(1).data() + numHotBlock,
size - numHotBlock, numPlanes); size - numHotBlock, numPlanes);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -454,10 +461,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> { ...@@ -454,10 +461,10 @@ struct SparseMaxPoolBackwardFunctor<tv::GPU, T, Index> {
} }
}; };
} // namespace functor } // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \ #define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \ template struct functor::SparseMaxPoolForwardFunctor<tv::GPU, T, Index>; \
template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>; template struct functor::SparseMaxPoolBackwardFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int); #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
...@@ -468,4 +475,4 @@ DECLARE_GPU_SPECS(at::Half); ...@@ -468,4 +475,4 @@ DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS #undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX #undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv } // namespace spconv
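Two performance patterns recur in the GPU functors above. First, kernel_block_t is a compile-time list of candidate tile widths (NumTLP); mp_for_each from mp_helper.h walks it, and the first width that divides numPlanes launches a kernel, with the notFound flag suppressing the remaining candidates. Second, vecload_type_t widens each memory transaction: vecloadFactor = sizeof(vecload_type_t) / sizeof(T) scalars move per load, which is why numPlanes is divided by vecloadFactor at the launch sites (this matters most for half precision). A generic C++17 stand-in for the tile dispatch, with illustrative names:

// Editorial sketch of the compile-time tile dispatch (for_each_tile and
// Tiles play the roles of mp_for_each and kernel_block_t).
#include <cstdio>
#include <utility>

template <int... Ns, class F>
void for_each_tile(std::integer_sequence<int, Ns...>, F&& f) {
  (f(std::integral_constant<int, Ns>{}), ...);  // C++17 fold over tiles
}

using Tiles = std::integer_sequence<int, 64, 32, 16>;

void dispatch(int numPlanes) {
  bool notFound = true;
  for_each_tile(Tiles{}, [&](auto NumTLP) {
    if (notFound && numPlanes % NumTLP == 0) {
      std::printf("launch kernel with NumTLP=%d\n", int(NumTLP));
      notFound = false;  // first matching tile size wins
    }
  });
}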
...@@ -19,7 +19,8 @@ namespace spconv { ...@@ -19,7 +19,8 @@ namespace spconv {
namespace functor { namespace functor {
template <typename T, typename Index> template <typename T, typename Index>
struct SparseGatherFunctor<tv::CPU, T, Index> { struct SparseGatherFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> buffer, tv::TensorView<const T> features, void operator()(const tv::CPU& d, tv::TensorView<T> buffer,
tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
int numPlanes = features.dim(1); int numPlanes = features.dim(1);
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
...@@ -33,30 +34,29 @@ struct SparseGatherFunctor<tv::CPU, T, Index> { ...@@ -33,30 +34,29 @@ struct SparseGatherFunctor<tv::CPU, T, Index> {
template <typename T, typename Index> template <typename T, typename Index>
struct SparseScatterAddFunctor<tv::CPU, T, Index> { struct SparseScatterAddFunctor<tv::CPU, T, Index> {
void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures, void operator()(const tv::CPU& d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer, tv::TensorView<const Index> indices, tv::TensorView<const T> buffer,
int size, bool stable) { tv::TensorView<const Index> indices, int size, bool stable) {
int numPlanes = outFeatures.dim(1); int numPlanes = outFeatures.dim(1);
const T* buf = buffer.data(); const T* buf = buffer.data();
T* out = outFeatures.data(); T* out = outFeatures.data();
for (int i = 0; i < size; ++i) { for (int i = 0; i < size; ++i) {
buf = buffer.data() + i * numPlanes; buf = buffer.data() + i * numPlanes;
out = outFeatures.data() + indices[i] * numPlanes; out = outFeatures.data() + indices[i] * numPlanes;
for (int j = 0; j < numPlanes; ++j){ for (int j = 0; j < numPlanes; ++j) {
out[j] += buf[j]; out[j] += buf[j];
} }
} }
} }
}; };
} // namespace functor } // namespace functor
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \
#define DECLARE_CPU_SPECS_T_INDEX(T, Index) \ template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseGatherFunctor<tv::CPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>; template struct functor::SparseScatterAddFunctor<tv::CPU, T, Index>;
#define DECLARE_CPU_SPECS(T) \ #define DECLARE_CPU_SPECS(T) \
DECLARE_CPU_SPECS_T_INDEX(T, int); \ DECLARE_CPU_SPECS_T_INDEX(T, int); \
DECLARE_CPU_SPECS_T_INDEX(T, long); DECLARE_CPU_SPECS_T_INDEX(T, long);
DECLARE_CPU_SPECS(float); DECLARE_CPU_SPECS(float);
...@@ -66,4 +66,4 @@ DECLARE_CPU_SPECS(at::Half); ...@@ -66,4 +66,4 @@ DECLARE_CPU_SPECS(at::Half);
#undef DECLARE_CPU_SPECS #undef DECLARE_CPU_SPECS
#undef DECLARE_CPU_SPECS_T_INDEX #undef DECLARE_CPU_SPECS_T_INDEX
} // namespace spconv } // namespace spconv
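These two CPU functors are the glue of the gather-GEMM-scatter formulation of sparse convolution: for each kernel offset, SparseGatherFunctor packs the scattered input rows selected by the indice pairs into a dense buffer, a dense GEMM multiplies that buffer with the offset's weight slice, and SparseScatterAddFunctor accumulates the result rows back into the output tensor. A compact sketch of one such step (the helper name is illustrative, the GEMM is elided, and equal in/out channel counts are assumed for brevity):

// One per-kernel-offset step of gather -> GEMM -> scatter-add (sketch).
#include <vector>

void sparseConvOffsetStep(const float* features, float* outFeatures,
                          const int* inIdx, const int* outIdx,
                          int nPairs, int numPlanes,
                          std::vector<float>& buffer) {
  // buffer must hold at least nPairs * numPlanes floats.
  for (int i = 0; i < nPairs; ++i)        // gather rows into dense buffer
    for (int j = 0; j < numPlanes; ++j)
      buffer[i * numPlanes + j] = features[inIdx[i] * numPlanes + j];

  // ... dense GEMM: buffer (nPairs x numPlanes) times weight slice ...

  for (int i = 0; i < nPairs; ++i)        // scatter-add results to output
    for (int j = 0; j < numPlanes; ++j)
      outFeatures[outIdx[i] * numPlanes + j] += buffer[i * numPlanes + j];
}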
...@@ -13,17 +13,18 @@ ...@@ -13,17 +13,18 @@
// limitations under the License. // limitations under the License.
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <chrono>
#include <limits>
#include <spconv/mp_helper.h> #include <spconv/mp_helper.h>
#include <spconv/reordering.h>
#include <spconv/reordering.cu.h> #include <spconv/reordering.cu.h>
#include <spconv/reordering.h>
#include <tensorview/helper_kernel.cu.h> #include <tensorview/helper_kernel.cu.h>
#include <tensorview/helper_launch.h> #include <tensorview/helper_launch.h>
#include <tensorview/tensorview.h> #include <tensorview/tensorview.h>
#include <type_traits>
#include <utility/timer.h> #include <utility/timer.h>
#include <chrono>
#include <limits>
#include <type_traits>
namespace spconv { namespace spconv {
namespace functor { namespace functor {
template <typename T, typename Index> template <typename T, typename Index>
...@@ -34,8 +35,7 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -34,8 +35,7 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> buffer, void operator()(const tv::GPU &d, tv::TensorView<T> buffer,
tv::TensorView<const T> features, tv::TensorView<const T> features,
tv::TensorView<const Index> indices, int size) { tv::TensorView<const Index> indices, int size) {
if (size <= 0) if (size <= 0) return;
return;
int numPlanes = features.dim(1); int numPlanes = features.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T); constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(T);
...@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -50,8 +50,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t> gatherVecBlockKernel<T, Index, int(NumTLP), NumILP, vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP), <<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(), indices.data(), d.getStream()>>>(buffer.data(), features.data(),
nHotBlock, numPlanes / vecloadFactor); indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
...@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> { ...@@ -60,8 +61,9 @@ struct SparseGatherFunctor<tv::GPU, T, Index> {
<<<dim3(1, numPlanes / NumTLP), <<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0, dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes, d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock, features.data(), indices.data() + nHotBlock,
size - nHotBlock, numPlanes / vecloadFactor); size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
notFound = false; notFound = false;
...@@ -89,12 +91,11 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> { ...@@ -89,12 +91,11 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures, void operator()(const tv::GPU &d, tv::TensorView<T> outFeatures,
tv::TensorView<const T> buffer, tv::TensorView<const T> buffer,
tv::TensorView<const Index> indices, int size, bool stable) { tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0) if (size <= 0) return;
return;
int numPlanes = outFeatures.dim(1); int numPlanes = outFeatures.dim(1);
bool notFound = true; bool notFound = true;
constexpr int vecloadFactor = constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(T); // important for half. sizeof(vecload_type_t) / sizeof(T); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices, mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) { &notFound](auto NumTLP) {
// constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor)); // constexpr int NumILP = NumTLP / (64 / (NumTLP / vecloadFactor));
...@@ -108,8 +109,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> { ...@@ -108,8 +109,8 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
<<<dim3(numPlanes / NumTLP, size / NumTLP), <<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0, dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(), d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock, indices.data(), nHotBlock,
numPlanes / vecloadFactor); numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR(); TV_CHECK_CUDA_ERR();
} }
if (size - nHotBlock > 0) { if (size - nHotBlock > 0) {
...@@ -137,11 +138,10 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> { ...@@ -137,11 +138,10 @@ struct SparseScatterAddFunctor<tv::GPU, T, Index> {
} }
} }
}; };
} // namespace functor } // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(T, Index) \ #define DECLARE_GPU_SPECS_T_INDEX(T, Index) \
template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \ template struct functor::SparseGatherFunctor<tv::GPU, T, Index>; \
template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>; template struct functor::SparseScatterAddFunctor<tv::GPU, T, Index>;
#define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int); #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPECS_T_INDEX(T, int);
...@@ -152,4 +152,4 @@ DECLARE_GPU_SPECS(at::Half); ...@@ -152,4 +152,4 @@ DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS #undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX #undef DECLARE_GPU_SPECS_T_INDEX
} // namespace spconv } // namespace spconv
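The GPU gather/scatter functors split their work the same way as the max-pool ones: nHotBlock = (size / NumTLP) * NumTLP rows go to the vectorized block kernel, and the remainder rows [nHotBlock, size) go to a generic kernel. The vectorized kernels reinterpret row pointers as a wider POD type so one instruction moves several scalars; a CPU stand-in of that trick (float4x is an assumed stand-in for CUDA's float4):

// Editorial sketch of the vectorized-load trick used by the *VecBlockKernel
// launches above.
struct float4x { float x, y, z, w; };  // stand-in for CUDA's float4

void gatherRowVectorized(float* dst, const float* src, int numPlanes) {
  constexpr int vecloadFactor = sizeof(float4x) / sizeof(float);  // 4
  // One wide load/store moves vecloadFactor scalars, so the launch passes
  // numPlanes / vecloadFactor instead of numPlanes to the kernel.
  for (int j = 0; j < numPlanes / vecloadFactor; ++j)
    reinterpret_cast<float4x*>(dst)[j] =
        reinterpret_cast<const float4x*>(src)[j];
}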