Commit 6f3c5f1c authored by limm
support v1.4.0
parent 6f674c7e
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef PYTORCH_MLU_HELPER_HPP_
#define PYTORCH_MLU_HELPER_HPP_
#ifdef MMCV_WITH_MLU
#include "aten.h"
#define NFU_ALIGN_SIZE 128
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y))
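// Worked examples (illustrative, not part of the original header):
//   PAD_UP(100, NFU_ALIGN_SIZE)   == 128  // round 100 up to a multiple of 128
//   PAD_DOWN(300, NFU_ALIGN_SIZE) == 256  // round 300 down to a multiple of 128
//   CEIL_DIV(10, 3)               == 4    // ceil(10 / 3)
//   CEIL_ALIGN(10, 8)             == 16   // smallest multiple of 8 that is >= 10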
inline int32_t getJobLimitCapability() {
CNcontext drv_ctx;
TORCH_CHECK(CN_SUCCESS == cnCtxGetCurrent(&drv_ctx), "cnCtxGetCurrent fails");
CNctxConfigParam ctx_conf_param;
TORCH_CHECK(
CN_SUCCESS == cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT,
&ctx_conf_param),
"cnGetCtxConfigParam fails.");
return (int32_t)ctx_conf_param.unionLimit;
}
inline int32_t getCoreNumOfJobLimitCapability() {
switch (getJobLimitCapability()) {
default:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) *
getJobLimitCapability();
case CN_KERNEL_CLASS_BLOCK:
return 1;
case CN_KERNEL_CLASS_UNION:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
case CN_KERNEL_CLASS_UNION2:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2;
case CN_KERNEL_CLASS_UNION4:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4;
case CN_KERNEL_CLASS_UNION8:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8;
case CN_KERNEL_CLASS_UNION16:
return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16;
}
}
#endif // MMCV_WITH_MLU
#endif // PYTORCH_MLU_HELPER_HPP_
/******************************************************************************
* Copyright (c) 2022 Huawei Technologies Co., Ltd
* All rights reserved.
*
* Licensed under the BSD 3-Clause License (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/BSD-3-Clause
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef PYTORCH_NPU_HELPER_HPP_
#define PYTORCH_NPU_HELPER_HPP_
#include <torch_npu/csrc/aten/NPUNativeFunctions.h>
#include <torch_npu/csrc/framework/utils/CalcuOpUtil.h>
#include <torch_npu/csrc/framework/utils/OpAdapter.h>
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
#define NPU_NAME_SPACE at_npu::native
#define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value)
#define CHECK_NPU(x) \
TORCH_CHECK(x.device().type() == at::kXLA, #x " must be a NPU tensor")
#endif // PYTORCH_NPU_HELPER_HPP_
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class scalar_t>
int getTotalSize(std::vector<scalar_t> arg) {
return arg.size();
}
template <class scalar_t, class... TArgs>
int getTotalSize(std::vector<scalar_t> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename scalar_t>
int getSize(std::vector<scalar_t> arg) {
return arg.size();
}
template <int Idx, class TT, class scalar_t>
void assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class scalar_t, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
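// Usage sketch (illustrative, not part of the original header): paramsGrid
// returns the Cartesian product of its argument vectors as a vector of tuples,
// with the last argument varying fastest.
//   auto grid = paramsGrid(std::vector<int>{1, 2}, std::vector<float>{0.5f, 1.0f});
//   // grid.size() == 4; grid[0] == std::make_tuple(1, 0.5f), ...,
//   // grid[3] == std::make_tuple(2, 1.0f)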
#endif
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
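//
// Example (illustrative):
//   std::vector<int> v{1, 2, 3};
//   std::cout << v << std::endl;          // prints [1, 2, 3]
//   std::set<int> s{1, 2, 3};
//   std::cout << s << std::endl;          // prints {1, 2, 3}
//   std::cout << std::make_pair(1, 2.5);  // prints (1, 2.5)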
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace pretty_print {
namespace detail {
// SFINAE type trait to detect whether T::const_iterator exists.
struct sfinae_base {
using yes = char;
using no = yes[2];
};
template <typename T>
struct has_const_iterator : private sfinae_base {
private:
template <typename C>
static yes &test(typename C::const_iterator *);
template <typename C>
static no &test(...);
public:
static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
using type = T;
};
template <typename T>
struct has_begin_end : private sfinae_base {
private:
template <typename C>
static yes &
f(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
const>(&C::begin)),
typename C::const_iterator (C::*)() const>::value>::type *);
template <typename C>
static no &f(...);
template <typename C>
static yes &g(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (
C::*)() const>(&C::end)),
typename C::const_iterator (C::*)() const>::value,
void>::type *);
template <typename C>
static no &g(...);
public:
static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};
} // namespace detail
// Holds the delimiter values for a specific character type
template <typename TChar>
struct delimiters_values {
using char_type = TChar;
const char_type *prefix;
const char_type *delimiter;
const char_type *postfix;
};
// Defines the delimiter values for a specific container and character type
template <typename T, typename TChar>
struct delimiters {
using type = delimiters_values<TChar>;
static const type values;
};
// Functor to print containers. You can use this directly if you want
// to specify a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
template <typename T, typename TChar = char,
typename TCharTraits = ::std::char_traits<TChar>,
typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
using delimiters_type = TDelimiters;
using ostream_type = std::basic_ostream<TChar, TCharTraits>;
template <typename U>
struct printer {
static void print_body(const U &c, ostream_type &stream) {
using std::begin;
using std::end;
auto it = begin(c);
const auto the_end = end(c);
if (it != the_end) {
for (;;) {
stream << *it;
if (++it == the_end) break;
if (delimiters_type::values.delimiter != NULL)
stream << delimiters_type::values.delimiter;
}
}
}
};
print_container_helper(const T &container) : container_(container) {}
inline void operator()(ostream_type &stream) const {
if (delimiters_type::values.prefix != NULL)
stream << delimiters_type::values.prefix;
printer<T>::print_body(container_, stream);
if (delimiters_type::values.postfix != NULL)
stream << delimiters_type::values.postfix;
}
private:
const T &container_;
};
// Specialization for pairs
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::pair<T1, T2>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
stream << c.first;
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << c.second;
}
};
// Specialization for tuples
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::tuple<Args...>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
using element_type = std::tuple<Args...>;
template <std::size_t I>
struct Int {};
static void print_body(const element_type &c, ostream_type &stream) {
tuple_print(c, stream, Int<0>());
}
static void tuple_print(const element_type &, ostream_type &,
Int<sizeof...(Args)>) {}
static void tuple_print(
const element_type &c, ostream_type &stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>,
std::nullptr_t>::type) {
stream << std::get<0>(c);
tuple_print(c, stream, Int<1>());
}
template <std::size_t N>
static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << std::get<N>(c);
tuple_print(c, stream, Int<N + 1>());
}
};
// Prints a print_container_helper to the specified stream.
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &stream,
const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
helper(stream);
return stream;
}
// Basic is_container template; specialize to derive from std::true_type for all
// desired container types
template <typename T>
struct is_container
: public std::integral_constant<bool,
detail::has_const_iterator<T>::value &&
detail::has_begin_end<T>::beg_value &&
detail::has_begin_end<T>::end_value> {};
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};
template <std::size_t N>
struct is_container<char[N]> : std::false_type {};
template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};
template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};
template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters
template <typename T>
struct delimiters<T, char> {
static const delimiters_values<char> values;
};
template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
template <typename T>
struct delimiters<T, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
L"]"};
// Delimiters for (multi)set and unordered_(multi)set
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple
template <typename T1, typename T2>
struct delimiters<std::pair<T1, T2>, char> {
static const delimiters_values<char> values;
};
template <typename T1, typename T2>
const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
"(", ", ", ")"};
template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T1, typename T2>
const delimiters_values<wchar_t>
delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
template <typename... Args>
struct delimiters<std::tuple<Args...>, char> {
static const delimiters_values<char> values;
};
template <typename... Args>
const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
"(", ", ", ")"};
template <typename... Args>
struct delimiters<::std::tuple<Args...>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename... Args>
const delimiters_values<wchar_t>
delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
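// Example (illustrative sketch): a user-supplied delimiter set that prints a
// container as "<a; b; c>". MyDelims only needs a static `values` member of
// type delimiters_values<char>.
//   struct MyDelims { static const pretty_print::delimiters_values<char> values; };
//   const pretty_print::delimiters_values<char> MyDelims::values = {"<", "; ", ">"};
//   std::vector<int> v{1, 2, 3};
//   std::cout << pretty_print::custom_delims<MyDelims>(v) << std::endl;  // <1; 2; 3>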
struct custom_delims_base {
virtual ~custom_delims_base() {}
virtual std::ostream &stream(::std::ostream &) = 0;
virtual std::wostream &stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
custom_delims_wrapper(const T &t_) : t(t_) {}
std::ostream &stream(std::ostream &s) {
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
t);
}
std::wostream &stream(std::wostream &s) {
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
Delims>(t);
}
private:
const T &t;
};
template <typename Delims>
struct custom_delims {
template <typename Container>
custom_delims(const Container &c)
: base(new custom_delims_wrapper<Container, Delims>(c)) {}
std::unique_ptr<custom_delims_base> base;
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
return p.base->stream(s);
}
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template <typename T>
struct array_wrapper_n {
typedef const T *const_iterator;
typedef T value_type;
array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
inline const_iterator begin() const { return _array; }
inline const_iterator end() const { return _array + _n; }
private:
const T *const _array;
size_t _n;
};
// A wrapper for hash-table based containers that offer local iterators to
// each bucket. Usage: std::cout << bucket_print(m, 4) << std::endl;
// (Prints the contents of bucket 4 of container m.)
template <typename T>
struct bucket_print_wrapper {
typedef typename T::const_local_iterator const_iterator;
typedef typename T::size_type size_type;
const_iterator begin() const { return m_map.cbegin(n); }
const_iterator end() const { return m_map.cend(n); }
bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
private:
const T &m_map;
const size_type n;
};
} // namespace pretty_print
// Global accessor functions for the convenience wrappers
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
size_t n) {
return pretty_print::array_wrapper_n<T>(a, n);
}
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
typename T::size_type n) {
return pretty_print::bucket_print_wrapper<T>(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std {
// Prints a container to the stream using default delimiters
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
return stream
<< ::pretty_print::print_container_helper<T, TChar, TCharTraits>(
container);
}
} // namespace std
#endif // H_PRETTY_PRINT
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/embed.h>
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <spconv/tensorview/tensorview.h>
#include <algorithm>
#include <iostream>
namespace py = pybind11;
template <typename scalar_t, typename TPyObject>
std::vector<scalar_t> array2Vector(TPyObject arr) {
py::array arr_np = arr;
size_t size = arr.attr("size").template cast<size_t>();
py::array_t<scalar_t> arr_cc = arr_np;
std::vector<scalar_t> data(arr_cc.data(), arr_cc.data() + size);
return data;
}
template <typename scalar_t>
std::vector<scalar_t> arrayT2Vector(py::array_t<scalar_t> arr) {
std::vector<scalar_t> data(arr.data(), arr.data() + arr.size());
return data;
}
template <typename scalar_t, typename TPyObject>
tv::TensorView<scalar_t> array2TensorView(TPyObject arr) {
py::array arr_np = arr;
py::array_t<scalar_t> arr_cc = arr_np;
tv::Shape shape;
for (int i = 0; i < arr_cc.ndim(); ++i) {
shape.push_back(arr_cc.shape(i));
}
return tv::TensorView<scalar_t>(arr_cc.mutable_data(), shape);
}
template <typename scalar_t>
tv::TensorView<scalar_t> arrayT2TensorView(py::array_t<scalar_t> arr) {
tv::Shape shape;
for (int i = 0; i < arr.ndim(); ++i) {
shape.push_back(arr.shape(i));
}
return tv::TensorView<scalar_t>(arr.mutable_data(), shape);
}
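// Usage sketch (illustrative): converting a NumPy array passed from Python
// into the std::vector / tv::TensorView forms consumed by the sparse-conv
// kernels. The function name below is hypothetical.
//   void my_op(py::array_t<int> indices, py::array_t<float> features) {
//     auto indices_vec  = arrayT2Vector<int>(indices);        // std::vector<int>
//     auto feature_view = arrayT2TensorView<float>(features); // tv::TensorView<float>
//     // ... pass indices_vec / feature_view on to the kernel ...
//   }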
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <utils/spconv/tensorview/tensorview.h>
#include <iostream>
#include <limits>
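// getValidOutPos enumerates, for one input coordinate, every output position a
// sparse convolution kernel element can map it to (getValidOutPosTranspose
// below handles the transposed case). Per dimension the candidate outputs o
// lie in
//   lower = ceil((x + p - (k - 1) * d) / s)  <=  o  <=  upper = floor((x + p) / s),
// where x is the input coordinate, k the kernel size, s the stride, p the
// padding and d the dilation. Worked 1-D example (k = 3, s = 2, p = 1, d = 1,
// x = 5, outSpatialShape = 8): lower = 2, upper = 3, so the candidates o = 2
// and o = 3 are written to `out` (each row holds NDim coordinates plus the
// flattened kernel offset) and the function returns 2.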
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
const Index *kernelSize,
const Index *stride, const Index *padding,
const Index *dilation,
const Index *outSpatialShape, Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
stride[i] + padding[i]) /
stride[i];
uppers[i] = (input_pos[i] + padding[i]) / stride[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
const Index *input_pos, const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation, const Index *outSpatialShape,
Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
lowers[i] = input_pos[i] * stride[i] - padding[i];
uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
}
offset += m * (val - lowers[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
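// getIndicePairsConv / getIndicePairsDeConv build the input->output index
// pairs consumed by the sparse convolution gather/scatter kernels. gridsOut is
// a dense (batch * spatialVolume) lookup table that must be pre-filled with -1;
// it maps an output coordinate to its row in indicesOut and deduplicates
// outputs reached from different inputs. indicePairs has layout
// [kernelVolume, 2, numAct] and indiceNum[k] counts the pairs collected for
// kernel offset k. The return value is the number of active output sites.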
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
// indicesOut: num_active * kernelVolume * (NDim + 1)
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation,
const Index *const outSpatialShape) {
auto numActIn = indicesIn.dim(0);
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index index = 0;
for (int j = 0; j < numActIn; ++j) {
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
gridsOut[index] = j;
}
for (int j = 0; j < numActIn; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
if (gridsOut[index] > -1) {
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
}
return numActIn;
}
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
} // namespace functor
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor {
void operator()(const Device& d, tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size);
};
} // namespace functor
#endif
#ifndef MP_HELPER_H_
#define MP_HELPER_H_
#include <type_traits>
#include <utility>
template <class... T>
struct mp_list {};
template <class T, T... I>
using mp_list_c = mp_list<std::integral_constant<T, I>...>;
namespace detail {
template <class... T, class F>
constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
}
template <class F>
constexpr F mp_for_each_impl(mp_list<>, F &&f) {
return std::forward<F>(f);
}
} // namespace detail
namespace detail {
template <class A, template <class...> class B>
struct mp_rename_impl {
// An error "no type named 'type'" here means that the first argument to
// mp_rename is not a list
};
template <template <class...> class A, class... T, template <class...> class B>
struct mp_rename_impl<A<T...>, B> {
using type = B<T...>;
};
} // namespace detail
template <class A, template <class...> class B>
using mp_rename = typename ::detail::mp_rename_impl<A, B>::type;
template <class L, class F>
constexpr F mp_for_each(F &&f) {
return ::detail::mp_for_each_impl(mp_rename<L, mp_list>(),
std::forward<F>(f));
}
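// Usage sketch (illustrative): iterate over a compile-time list of integral
// constants.
//   mp_for_each<mp_list_c<int, 1, 2, 4>>([](auto I) {
//     constexpr int v = decltype(I)::value;  // 1, then 2, then 4
//     // ... use v as a compile-time constant ...
//   });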
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <math.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <algorithm>
#include <iostream>
namespace py = pybind11;
using namespace pybind11::literals;
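// The points_to_voxel_3d_* helpers below scatter an [N, num_features] point
// cloud into fixed-size voxels. Conventions shared by all of them (summarized
// from the code for readability):
//   * coors_range holds [x_min, y_min, z_min, x_max, y_max, z_max]; grid_size
//     is its extent divided by voxel_size, rounded to the nearest integer.
//   * voxel coordinates are stored reversed (coor = [z, y, x]) via
//     coor[ndim_minus_1 - j].
//   * coor_to_voxelidx must be pre-filled with -1 and is reset to -1 for every
//     touched voxel before returning, so the buffer can be reused.
//   * points beyond max_points per voxel and voxels beyond max_voxels are
//     silently dropped; the return value is the number of voxels produced.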
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
py::array_t<DType> voxels,
py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
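// Incremental (running) mean per voxel:
//   new_mean = old_mean + (x - old_mean) / n,
// where n is the point count after this insertion.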
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_height(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto height_rw = height.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
height_rw(voxelidx, k) =
std::min(points_rw(i, k), height_rw(voxelidx, k));
maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
for (int k = 0; k < num_features; ++k) {
height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
}
}
return voxel_num;
}
template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
py::array_t<DType> height, py::array_t<DType> maxs,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_voxels, DType eps) {
auto points_rw = points.template mutable_unchecked<2>();
auto mask_rw = mask.mutable_unchecked<1>();
auto height_rw = height.template mutable_unchecked<1>();
auto maxs_rw = maxs.template mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
}
height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
}
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
// height_rw / maxs_rw are 1-D per-voxel min / max of the z-coordinate.
if ((maxs_rw(voxelidx) - height_rw(voxelidx)) < eps) {
mask_rw(i) = 0;
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> voxel_mask, py::array_t<DType> mins,
py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
DType height_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
DType max_value, min_value;
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int block_shape_H = grid_size[1] / block_factor;
int block_shape_W = grid_size[0] / block_factor;
int voxelidx, num;
int block_coor[2];
int startx, stopx, starty, stopy;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) =
std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor[1] = coors_rw(i, 1);
coor[2] = coors_rw(i, 2);
coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0] - block_size / 2);
stopx =
std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
}
voxel_mask_rw(i) = (max_value - min_value) > height_threshold;
}
return voxel_num;
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename scalar_t, typename Index>
struct SparseGatherFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename scalar_t, typename Index>
struct SparseScatterAddFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> out_features,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size,
bool stable = false);
};
} // namespace functor
#endif
#pragma once
namespace tv {
namespace detail {
template <typename scalar_t>
class KernelLoop {
struct Iterator {
__forceinline__ __device__ Iterator(scalar_t index, scalar_t delta)
: index_(index), delta_(delta) {}
__forceinline__ __device__ scalar_t operator*() const { return index_; }
__forceinline__ __device__ Iterator &operator++() {
index_ += delta_;
return *this;
}
__forceinline__ __device__ bool operator!=(const Iterator &other) const {
bool greater = index_ > other.index_;
bool less = index_ < other.index_;
if (!other.delta_) {
return less;
}
if (!delta_) {
return greater;
}
return less || greater;
}
private:
scalar_t index_;
const scalar_t delta_;
};
public:
__forceinline__ __device__ KernelLoop(scalar_t begin, scalar_t delta,
scalar_t end)
: begin_(begin), delta_(delta), end_(end) {}
__forceinline__ __device__ Iterator begin() const {
return Iterator{begin_, delta_};
}
__forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
private:
scalar_t begin_;
scalar_t delta_;
scalar_t end_;
};
} // namespace detail
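// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }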
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopX(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.x * blockDim.x + threadIdx.x,
gridDim.x * blockDim.x * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopY(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.y * blockDim.y + threadIdx.y,
gridDim.y * blockDim.y * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopZ(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.z * blockDim.z + threadIdx.z,
gridDim.z * blockDim.z * NumILP, count);
}
} // namespace tv
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv {
namespace launch {
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) {
return (a + b - 1) / b;
}
constexpr int CUDA_NUM_THREADS = 1024;
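// Typical launch pattern (illustrative sketch, not part of the original
// header):
//   kernel<<<launch::getBlocks(N), launch::CUDA_NUM_THREADS, 0, stream>>>(...);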
inline int getBlocks(const int N) {
TV_ASSERT_RT_ERR(N > 0,
"CUDA kernel launch blocks must be positive, but got N=", N);
return DivUp(N, CUDA_NUM_THREADS);
}
} // namespace launch
} // namespace tv
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <sstream>
#include <type_traits>
#include <vector>
#include "pytorch_cpp_helper.hpp"
namespace tv {
#if defined(__NVCC__) || defined(__HIP__)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \
assert(expr); \
}
template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
ss << val;
}
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
ss << val << " ";
sstream_print(ss, args...);
}
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
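// Example (illustrative):
//   TV_ASSERT_RT_ERR(ndim <= TV_MAX_DIM,
//                    "expected ndim <=", TV_MAX_DIM, "but got", ndim);
// throws std::runtime_error carrying the file/line, the failed expression and
// the space-separated message arguments.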
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
struct CPU {};
#define TV_MAX_DIM 6
template <typename scalar_t, size_t MaxDim = TV_MAX_DIM>
struct SimpleVector {
public:
TV_HOST_DEVICE_INLINE SimpleVector(){};
TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<scalar_t> q) {
TV_ASSERT(q.size() <= MaxDim);
mSize = 0;
for (scalar_t s : q) {
mArray[mSize++] = s;
}
mSize = q.size();
}
SimpleVector(const std::vector<scalar_t> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE SimpleVector(
const SimpleVector<scalar_t, MaxDim> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE const scalar_t &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE void push_back(scalar_t s) {
#ifdef TV_DEBUG
TV_ASSERT(mSize < MaxDim);
#endif
mArray[mSize] = s;
mSize++;
}
TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
TV_ASSERT(mSize > 0);
#endif
mSize--;
}
TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mArray; }
TV_HOST_DEVICE_INLINE size_t empty() const { return mSize == 0; }
typedef size_t size_type;
class iterator {
public:
typedef iterator self_type;
typedef scalar_t value_type;
typedef scalar_t &reference;
typedef scalar_t *pointer;
typedef std::forward_iterator_tag iterator_category;
typedef std::ptrdiff_t difference_type;
TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
class const_iterator {
public:
typedef const_iterator self_type;
typedef scalar_t value_type;
typedef const scalar_t &reference;
typedef const scalar_t *pointer;
typedef std::ptrdiff_t difference_type;
typedef std::forward_iterator_tag iterator_category;
TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
TV_HOST_DEVICE_INLINE const_iterator begin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator end() const {
return const_iterator(mArray + mSize);
}
TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator cend() const {
return const_iterator(mArray + mSize);
}
protected:
scalar_t mArray[MaxDim];
size_t mSize = 0;
};
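// Usage sketch (illustrative only, not part of the original sources):
// SimpleVector is a fixed-capacity, host/device-compatible stand-in for
// std::vector.
//   tv::SimpleVector<int, 4> v{1, 2, 3};
//   v.push_back(4);                  // holds {1, 2, 3, 4}, size() == 4
//   for (int x : v) { /* iterates over 1, 2, 3, 4 */ }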
template <typename scalar_t, size_t MaxDim>
bool operator==(const SimpleVector<scalar_t, MaxDim> &lfs,
const SimpleVector<scalar_t, MaxDim> &rfs) {
if (lfs.size() != rfs.size()) return false;
for (size_t i = 0; i < lfs.size(); ++i) {
if (lfs[i] != rfs[i]) return false;
}
return true;
}
template <typename scalar_t, size_t MaxDim>
bool operator!=(const SimpleVector<scalar_t, MaxDim> &lfs,
const SimpleVector<scalar_t, MaxDim> &rfs) {
return !(lfs == rfs);
}
struct Slice {
template <class... Integers>
TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must contain at most 3 values");
SimpleVector<int, 3> slices{int(ints)...};
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
for (size_t i = 0; i < slices.size(); ++i) {
mSlices[i] = slices[i];
}
}
TV_HOST_DEVICE_INLINE Slice() {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
}
template <typename scalar_t>
TV_HOST_DEVICE_INLINE Slice(std::initializer_list<scalar_t> slice) {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
TV_ASSERT(slice.size() <= 3);
int idx = 0;
for (scalar_t s : slice) {
mSlices[idx] = int(s);
++idx;
}
}
TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
protected:
int mSlices[3];
};
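// Note (added for clarity, not in the original sources): a Slice holds up to
// three integers, with -1 meaning "unset". TensorView::subview() below reads
// only the first two as (start, end); an unset end selects a single element
// and drops that dimension. The third slot is not used in this file.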
template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
: SimpleVector<int, MaxDim>(shape) {}
template <typename scalar_t, template <class...> class Container>
ShapeBase(Container<scalar_t> shape) : SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
: SimpleVector<int, MaxDim>(shape) {}
ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && end < this->mSize && end > start);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < end; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < this->mSize; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE size_t size() const {
if (this->mSize == 0) return 0;
size_t s = 1;
for (int i = 0; i < int(this->mSize); ++i) {
s *= this->mArray[i];
}
return s;
}
TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
};
using Shape = ShapeBase<TV_MAX_DIM>;
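// Note (added for clarity, not in the original sources): unlike SimpleVector,
// Shape::size() returns the product of the dimensions (the element count),
// while ndim() returns the rank, e.g. Shape{2, 3}.size() == 6, ndim() == 2.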
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
std::vector<int> &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = shape.size() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
const Shape &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = indexes_vec.ndim() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
const Index *shape) {
unsigned offset = 0;
unsigned m = 1;
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
offset += m * indexes[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
const Index *shape) {
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
output[i] = index % shape[i];
index -= output[i];
index /= shape[i];
}
return index;
}
template <int N>
struct ArrayIndexRowMajor {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return indexes[N - 1] +
shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
}
};
template <>
struct ArrayIndexRowMajor<0> {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return 0;
}
};
namespace detail {
template <typename scalar_t>
constexpr const char *simpleTypeName(scalar_t val = scalar_t());
template <>
constexpr const char *simpleTypeName(float val) {
return "float32";
}
template <>
constexpr const char *simpleTypeName(double val) {
return "float64";
}
template <>
constexpr const char *simpleTypeName(int val) {
return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
return "uint32";
}
template <>
constexpr const char *simpleTypeName(long val) {
return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
return "uint64";
}
}; // namespace detail
template <typename scalar_t, int Rank = -1>
struct TensorView {
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Shape shape)
: mPtr(ptr), mShape(shape) {}
template <class... Integers>
explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Integers... shapes)
: mPtr(ptr) {
mShape = {int(shapes)...};
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
const TensorView<scalar_t, Rank> &tensor) {
TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
"\n");
scalar_t *ptr = mPtr;
const scalar_t *other_ptr = tensor.data();
for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
return *this;
}
template <typename T1>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
std::initializer_list<T1> seq) {
TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
"\n");
scalar_t *ptr = mPtr;
for (const T1 &s : seq) *(ptr++) = scalar_t(s);
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE scalar_t &operator()(Inds... inds) {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
template <class... Inds>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(Inds... inds) const {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
TV_HOST_DEVICE_INLINE scalar_t &operator()() {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
TV_HOST_DEVICE_INLINE const scalar_t &operator()() const {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
template <class T1>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
template <class T1>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3,
T4 i4) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> operator[](
SimpleVector<Slice> slice_vec) {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE const TensorView<scalar_t, Rank> operator[](
SimpleVector<Slice> slice_vec) const {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
TV_HOST_DEVICE_INLINE scalar_t *data() { return mPtr; }
TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mPtr; }
TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Inds... newShapes) {
Shape shapes{int(newShapes)...};
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Shape shapes) {
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(
Inds... newShapes) const {
Shape shapes{int(newShapes)...};
for (size_t i = 0; i < shapes.ndim(); ++i) {
if (shapes[i] == -1) {
shapes[i] = 1;
shapes[i] = size() / shapes.size();
break;
}
}
TV_ASSERT(shapes.size() == size());
return TensorView<scalar_t, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(Shape shapes) const {
TV_ASSERT(shapes.size() == size());
return TensorView<scalar_t, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze() const {
return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze());
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze(int dim) const {
return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze(dim));
}
TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }
template <class... Slices>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
Slice slice, Slices... slices) const {
return subview<float, Slice, Slices...>(slice, slices...);
}
template <class T2 = float, class... Slices>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
Slices... slices) const {
Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
Shape new_shape{to_slice(slices)[0]...};
Shape start{to_slice(slices)[0]...};
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1;
}
}
auto offset = rowArrayIdx(mShape, start);
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
}
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(int id,
Integers... ints) {
Shape start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<scalar_t, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(sizeof...(ints) + 1));
}
std::string repr() const {
std::ostringstream ss;
if (empty()) return "";
if (mShape.ndim() == 0) {
ss << *mPtr;
ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
return ss.str();
}
Shape counter = mShape;
auto tensor_flat = this->view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
ss << "[";
}
for (size_t i = 0; i < this->size(); ++i) {
ss << tensor_flat(rowArrayIdx(mShape, counter));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == this->dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != this->size() - 1) ss << ", ";
for (int j = 0; j < inc_count; ++j) {
ss << "]";
}
if (i != this->size() - 1) {
if (inc_count != 0) ss << "\n";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
}
}
}
ss << "]";
ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
return ss.str();
}
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
  TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> _subview(
      SimpleVector<Slice> slice_vec) const {
Shape new_shape;
for (int i = 0; i < slice_vec.size(); ++i) {
new_shape.push_back(slice_vec[i][0]);
}
Shape start = new_shape;
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
}
template <typename T1>
TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
return Slice{int(s), -1, -1};
}
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }
scalar_t *mPtr = nullptr;
Shape mShape;
};
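// Usage sketch (illustrative only, not part of the original sources):
// TensorView is a non-owning, shape-annotated view over a raw pointer with
// row-major indexing.
//   float buf[6] = {0, 1, 2, 3, 4, 5};
//   tv::TensorView<float> view(buf, 2, 3);
//   float v = view(1, 2);        // buf[1 * 3 + 2] == 5
//   auto row = view.subview(1);  // 1-D view over buf + 3 with shape (3)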
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<scalar_t, Rank> &dt) {
os << dt.repr();
return os;
}
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<const scalar_t, Rank> &dt) {
os << dt.repr();
return os;
}
namespace detail {
template <typename scalar_t>
constexpr const char *printfTypeFormat(scalar_t val = scalar_t());
template <>
constexpr const char *printfTypeFormat(float val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(int val) {
return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned val) {
return "%u";
}
template <>
constexpr const char *printfTypeFormat(long val) {
return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long val) {
return "%lu";
}
}; // namespace detail
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const TensorView<scalar_t> tensor,
const char *format) {
if (tensor.empty()) return;
if (tensor.ndim() == 0) {
printf(format, tensor());
printf("\n");
return;
}
Shape counter = tensor.shape();
auto tensor_flat = tensor.view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
printf("[");
}
for (size_t i = 0; i < tensor.size(); ++i) {
printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == tensor.dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != tensor.size() - 1) printf(", ");
for (int j = 0; j < inc_count; ++j) {
printf("]");
}
if (i != tensor.size() - 1) {
if (inc_count != 0) printf("\n");
for (int j = 0; j < inc_count; ++j) {
printf("[");
}
}
}
printf("]\n");
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(TensorView<scalar_t> tensor) {
using Traw = typename std::remove_const<scalar_t>::type;
return printTensorView(tensor, detail::printfTypeFormat<Traw>());
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape) {
using Traw = typename std::remove_const<scalar_t>::type;
return printTensorView(TensorView<const scalar_t>(ptr, shape),
detail::printfTypeFormat<Traw>());
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape,
const char *format) {
return printTensorView(TensorView<const scalar_t>(ptr, shape), format);
}
} // namespace tv
// Copyright (c) OpenMMLab. All rights reserved
#ifndef ONNXRUNTIME_CORNER_POOL_H
#define ONNXRUNTIME_CORNER_POOL_H
#include <assert.h>
#include <onnxruntime_cxx_api.h>
struct MMCVCornerPoolKernel {
public:
MMCVCornerPoolKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "mode");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
int64_t mode_;
};
struct MMCVCornerPoolCustomOp
: Ort::CustomOpBase<MMCVCornerPoolCustomOp, MMCVCornerPoolKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVCornerPoolKernel(api, info);
}
const char* GetName() const { return "MMCVCornerPool"; }
size_t GetInputTypeCount() const { return 1; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_CORNER_POOL_H
// Copyright (c) OpenMMLab. All rights reserved
#include "corner_pool.h"
#include "../ort_mmcv_utils.h"
void TopPoolForwardCPU(const float *input, float *output, const int batch_size,
const int channels, const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int w = 0; w < width; w++) {
        // directly copy the bottommost value from input to output
output[index_n_c + (height - 1) * width + w] =
input[index_n_c + (height - 1) * width + w];
// do top_pool
for (int h = height - 2; h >= 0; h--) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + (h + 1) * width + w],
input[index_n_c + h * width + w]);
} // for h
} // for w
} // for c
} // for n
}
void BottomPoolForwardCPU(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int w = 0; w < width; w++) {
        // directly copy the topmost value from input to output
        output[index_n_c + w] = input[index_n_c + w];
        // do bottom_pool
for (int h = 1; h < height; h++) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + (h - 1) * width + w],
input[index_n_c + h * width + w]);
} // for h
} // for w
} // for c
} // for n
}
void LeftPoolForwardCPU(const float *input, float *output, const int batch_size,
const int channels, const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int h = 0; h < height; h++) {
        // directly copy the rightmost value from input to output
output[index_n_c + h * width + width - 1] =
input[index_n_c + h * width + width - 1];
// do left_pool
for (int w = width - 2; w >= 0; w--) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + h * width + w + 1],
input[index_n_c + h * width + w]);
} // for w
} // for h
} // for c
} // for n
}
void RightPoolForwardCPU(const float *input, float *output,
const int batch_size, const int channels,
const int height, const int width) {
for (int n = 0; n < batch_size; n++) {
int index_n = n * channels * width * height;
for (int c = 0; c < channels; c++) {
int index_n_c = index_n + c * width * height;
for (int h = 0; h < height; h++) {
        // directly copy the leftmost value from input to output
output[index_n_c + h * width] = input[index_n_c + h * width];
// do right_pool
for (int w = 1; w < width; w++) {
output[index_n_c + h * width + w] =
std::max(output[index_n_c + h * width + w - 1],
input[index_n_c + h * width + w]);
} // for w
} // for h
} // for c
} // for n
}
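// Note (added for clarity, not in the original sources): each of the four
// pools is a directional cumulative maximum along one axis. For TopPool on the
// column [1, 3, 2, 4] (top to bottom) the bottom-up scan yields [4, 4, 4, 4];
// for [5, 3, 2, 4] it yields [5, 4, 4, 4].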
void MMCVCornerPoolKernel::Compute(OrtKernelContext *context) {
const int mode = int(mode_);
typedef float T;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const T *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<T>(input));
// get output memory
OrtTensorDimensions out_dimensions(ort_, input);
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
T *output_data = ort_.GetTensorMutableData<T>(output);
// 'top': 0, 'bottom': 1, 'left': 2, 'right':3
assert(mode == 0 || mode == 1 || mode == 2 || mode == 3);
// do corner_pool
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
if (mode == 0)
TopPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else if (mode == 1)
BottomPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else if (mode == 2)
LeftPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
else
RightPoolForwardCPU(input_data, output_data, batch_size, input_channels,
input_height, input_width);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "deform_conv.h"
#include <cmath>
#include <vector>
#include "../ort_mmcv_utils.h"
void gemm_ref_fp32_deform(const float *A, const float *B, const float *V,
const float *H, const int32_t trans_A,
const int32_t trans_B, const int32_t M,
const int32_t N, const int32_t K, const float alpha,
const float beta, float *Y) {
if (!trans_A && !trans_B) { // MK, KN; NN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && !trans_B) { // KM, KN; TN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && trans_B) { // KM, NK; TT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (!trans_A && trans_B) { // MK, NK; NT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
}
float bilinear_interpolate(const float *src, const int64_t src_h,
const int64_t src_w, const float h, const float w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}
int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;
float lh = h - h_low;
float lw = w - w_low;
float hh = 1 - lh;
float hw = 1 - lw;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
float v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
float v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
float v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1)
v4 = src[h_high * src_w + w_high];
float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
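// Worked example (added for clarity, not in the original sources): for
// h = 1.25, w = 2.5 the four neighbours are rows 1/2 and columns 2/3 with
// lh = 0.25, lw = 0.5, so the weights are w1 = 0.375, w2 = 0.375, w3 = 0.125,
// w4 = 0.125 and the result is 0.375 * (v1 + v2) + 0.125 * (v3 + v4).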
void deformable_im2col(const float *input, const float *offset,
const int64_t src_h, const int64_t src_w,
const int64_t kernel_h, const int64_t kernel_w,
const int64_t pad_h, const int64_t pad_w,
const int64_t stride_h, const int64_t stride_w,
const int64_t dilation_h, const int64_t dilation_w,
const int64_t channels, const int64_t offset_groups,
const int64_t dst_h, const int64_t dst_w,
float *columns) {
const int64_t indices = channels * dst_h * dst_w;
for (int64_t index = 0; index != indices; ++index) {
const int64_t w_col = index % dst_w;
const int64_t h_col = (index / dst_w) % dst_h;
const int64_t c_im = index / (dst_w * dst_h);
const int64_t c_col = c_im * kernel_h * kernel_w;
int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = c_im / c_per_offset_grp;
auto columns_ptr =
columns + (c_col * (dst_h * dst_w) + h_col * dst_w + w_col);
auto input_ptr = input + c_im * (src_h * src_w);
auto offset_ptr =
offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int data_offset_h_ptr =
((2 * (kh * kernel_w + kw)) * dst_h + h_col) * dst_w + w_col;
const int data_offset_w_ptr =
((2 * (kh * kernel_w + kw) + 1) * dst_h + h_col) * dst_w + w_col;
const float offset_h = offset_ptr[data_offset_h_ptr];
const float offset_w = offset_ptr[data_offset_w_ptr];
const float ih =
(h_col * stride_h - pad_h) + kh * dilation_h + offset_h;
const float iw =
(w_col * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = bilinear_interpolate(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}
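// Note (added for clarity, not in the original sources): columns is laid out
// as a (channels * kernel_h * kernel_w) x (dst_h * dst_w) matrix, so the GEMM
// in deformable_conv_forward below multiplies the
// (oc_per_gp) x (ic_per_gp * kernel_h * kernel_w) filter block by columns to
// produce one group's (oc_per_gp) x (dst_h * dst_w) output.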
void deformable_conv_forward(
const float *src, const float *offset, const float *filter,
const int64_t batch, const int64_t src_c, const int64_t src_h,
const int64_t src_w, const int64_t dst_c, const int64_t dst_h,
const int64_t dst_w, const int64_t group, const int64_t offset_group,
const int64_t channels, const int64_t num_output, const int64_t kernel_h,
const int64_t kernel_w, const int64_t stride_h, const int64_t stride_w,
const int64_t pad_h, const int64_t pad_w, const int64_t dilation_h,
const int64_t dilation_w, float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
src_h, src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, ic_per_gp, offset_group, dst_h, dst_w,
columns);
float *dst_ptr =
dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
      memset(dst_ptr, 0, sizeof(float) * oc_per_gp * dst_h * dst_w);
gemm_ref_fp32_deform(
filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w, columns,
nullptr, dst_ptr, 0, 0, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr);
}
}
}
MMCVDeformConvKernel::MMCVDeformConvKernel(OrtApi api,
const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
std::vector<int64_t> stride =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 2);
const float *filter_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch_size = input_dims[0];
int64_t in_channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t out_channels = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
int64_t out_height = floor((in_height + 2 * padding_height -
dilation_height * (kernel_height - 1) - 1) /
stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) /
stride_width +
1);
std::vector<int64_t> output_dims = {batch_size, out_channels, out_height,
out_width};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
int64_t column_len = (in_channels / group) * kernel_height * kernel_width *
out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv_forward(
input_data, offset_data, filter_data, batch_size, in_channels, in_height,
in_width, out_channels, out_height, out_width, group, deformable_group,
in_channels, out_channels, kernel_height, kernel_width, stride_height,
stride_width, padding_height, padding_width, dilation_height,
dilation_width, columns, out_ptr);
}
// Copyright (c) OpenMMLab. All rights reserved
#include <cmath>
#include "../ort_mmcv_utils.h"
#include "grid_sample.h"
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) < (b)) ? (b) : (a))
#define CLIP_COORDINATES(in, out, clip_limit) \
out = MIN((clip_limit - 1), MAX(in, 0))
// modified from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp
GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
interpolation_mode_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
allocator_ = Ort::AllocatorWithDefaultOptions();
}
enum GridSamplerInterpolation { Bilinear = 0, Nearest = 1, Bicubic = 2 };
enum GridSamplerPadding { Zeros = 0, Border = 1, Reflection = 2 };
template <typename scalar_t>
static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size,
bool align_corners) {
if (align_corners) {
return ((coord + 1) / 2) * (size - 1);
} else {
return ((coord + 1) * size - 1) / 2;
}
}
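// Worked example (added for clarity, not in the original sources): for size = 5
// a normalized coordinate of 1 maps to ((1 + 1) / 2) * 4 = 4 with
// align_corners = true, but to ((1 + 1) * 5 - 1) / 2 = 4.5 with
// align_corners = false; a coordinate of 0 maps to 2 in both cases.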
// Clips coordinates to between 0 and clip_limit - 1
template <typename scalar_t>
static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
return std::min(static_cast<scalar_t>(clip_limit - 1),
std::max(in, static_cast<scalar_t>(0)));
}
// Reflects coordinates until they fall between low and high (inclusive).
// The bounds are passed as twice their value so that half-integer values
// can be represented as ints.
template <typename scalar_t>
static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
int64_t twice_high) {
if (twice_low == twice_high) {
return static_cast<scalar_t>(0);
}
scalar_t min = static_cast<scalar_t>(twice_low) / 2;
scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
in = std::fabs(in - min);
// `fmod` returns same sign as `in`, which is positive after the `fabs` above.
scalar_t extra = std::fmod(in, span);
int flips = static_cast<int>(std::floor(in / span));
if (flips % 2 == 0) {
return extra + min;
} else {
return span - extra + min;
}
}
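// Worked example (added for clarity, not in the original sources): with
// twice_low = 0 and twice_high = 6 (valid range [0, 3]), an input of -1
// reflects to 1 and an input of 4.5 reflects to 1.5, i.e. coordinates are
// mirrored back into the range about its endpoints.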
template <typename scalar_t>
static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
int64_t padding_mode,
bool align_corners) {
if (padding_mode == GridSamplerPadding::Border) {
coord = clip_coordinates(coord, size);
} else if (padding_mode == GridSamplerPadding::Reflection) {
if (align_corners) {
coord = reflect_coordinates(coord, 0, 2 * (size - 1));
} else {
coord = reflect_coordinates(coord, -1, 2 * size - 1);
}
coord = clip_coordinates(coord, size);
}
return coord;
}
// Computes the pixel source index value for a grid coordinate
template <typename scalar_t>
static inline scalar_t grid_sampler_compute_source_index(scalar_t coord,
int64_t size,
int64_t padding_mode,
bool align_corners) {
coord = grid_sampler_unnormalize(coord, size, align_corners);
coord = compute_coordinates(coord, size, padding_mode, align_corners);
return coord;
}
static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H,
int64_t W) {
return h >= 0 && h < H && w >= 0 && w < W;
}
template <typename scalar_t>
static inline scalar_t get_value_bounded(const scalar_t *data, scalar_t x,
scalar_t y, int64_t W, int64_t H,
int64_t sW, int64_t sH,
int64_t padding_mode,
bool align_corners) {
x = compute_coordinates(x, W, padding_mode, align_corners);
y = compute_coordinates(y, H, padding_mode, align_corners);
int64_t ix = static_cast<int64_t>(x);
int64_t iy = static_cast<int64_t>(y);
if (within_bounds_2d(iy, ix, H, W)) {
return data[iy * sH + ix * sW];
}
return static_cast<scalar_t>(0);
}
template <typename scalar_t>
static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
return ((A + 2) * x - (A + 3)) * x * x + 1;
}
template <typename scalar_t>
static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}
template <typename scalar_t>
static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4],
scalar_t t) {
scalar_t A = -0.75;
scalar_t x1 = t;
coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
// opposite coefficients
scalar_t x2 = 1.0 - t;
coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
}
template <typename scalar_t>
static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
scalar_t x3, scalar_t t) {
scalar_t coeffs[4];
get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
}
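// Note (added for clarity, not in the original sources): these are the Keys
// cubic-convolution weights with A = -0.75. At t = 0 the coefficients are
// (0, 1, 0, 0), so cubic_interp1d returns x1 exactly; at t = 1 they are
// (0, 0, 1, 0), returning x2.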
void GridSampleKernel::Compute(OrtKernelContext *context) {
const bool align_corners = align_corners_;
const int64_t padding_mode = padding_mode_;
const int64_t interpolation_mode = interpolation_mode_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *grid = ort_.KernelContext_GetInput(context, 1);
const float *grid_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(grid));
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions grid_dims(ort_, grid);
int64_t N = input_dims[0];
int64_t C = input_dims[1];
int64_t inp_H = input_dims[2];
int64_t inp_W = input_dims[3];
int64_t out_H = grid_dims[1];
int64_t out_W = grid_dims[2];
std::vector<int64_t> output_dims = {N, C, out_H, out_W};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
int64_t inp_sC = input_dims[2] * input_dims[3];
int64_t inp_sH = input_dims[3];
int64_t inp_sW = 1;
int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
int64_t grid_sH = grid_dims[2] * grid_dims[3];
int64_t grid_sW = grid_dims[3];
int64_t grid_sCoor = 1;
int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
int64_t out_sC = output_dims[2] * output_dims[3];
int64_t out_sH = output_dims[3];
int64_t out_sW = 1;
// loop over each output pixel
for (int64_t n = 0; n < N; ++n) {
const float *grid_ptr_N = grid_data + n * grid_sN;
const float *inp_ptr_N = input_data + n * inp_sN;
for (int64_t h = 0; h < out_H; ++h) {
for (int64_t w = 0; w < out_W; ++w) {
const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
float x = *grid_ptr_NHW;
float y = grid_ptr_NHW[grid_sCoor];
float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode,
align_corners);
float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode,
align_corners);
if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
// get corner pixel values from (x, y)
// for 4d, we use north-east-south-west
int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
int64_t ix_ne = ix_nw + 1;
int64_t iy_ne = iy_nw;
int64_t ix_sw = ix_nw;
int64_t iy_sw = iy_nw + 1;
int64_t ix_se = ix_nw + 1;
int64_t iy_se = iy_nw + 1;
// get surfaces to each neighbor:
float nw = (ix_se - ix) * (iy_se - iy);
float ne = (ix - ix_sw) * (iy_sw - iy);
float sw = (ix_ne - ix) * (iy - iy_ne);
float se = (ix - ix_nw) * (iy - iy_nw);
// calculate bilinear weighted pixel value and set output pixel
const float *inp_ptr_NC = inp_ptr_N;
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
auto res = static_cast<float>(0);
if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
}
if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
}
if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
}
if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
}
*out_ptr_NCHW = res;
}
} else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
// assign nearest neighbor pixel value to output pixel
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
const float *inp_ptr_NC = inp_ptr_N;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
*out_ptr_NCHW =
inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
} else {
*out_ptr_NCHW = static_cast<float>(0);
}
}
} else if (interpolation_mode == GridSamplerInterpolation::Bicubic) {
          // grid_sampler_compute_source_index would clip the coordinate
          // depending on the padding mode, which breaks the bicubic case:
          // e.g. x = -0.1 gives ix = 0 with zero padding, while bicubic needs
          // ix = floor(x) = -1. Reflection padding is even trickier, since the
          // -1/+1 direction is not fixed at the boundary. So only unnormalize
          // here; compute_coordinates is applied per tap in get_value_bounded.
ix = grid_sampler_unnormalize(x, inp_W, align_corners);
iy = grid_sampler_unnormalize(y, inp_H, align_corners);
float ix_nw = std::floor(ix);
float iy_nw = std::floor(iy);
const float tx = ix - ix_nw;
const float ty = iy - iy_nw;
const float *inp_ptr_NC = inp_ptr_N;
float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
for (int64_t c = 0; c < C;
++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
float coefficients[4];
// Interpolate 4 values in the x direction
for (int64_t i = 0; i < 4; ++i) {
coefficients[i] = cubic_interp1d<float>(
get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i,
inp_W, inp_H, inp_sW, inp_sH,
padding_mode, align_corners),
tx);
}
// Interpolate in the y direction
*out_ptr_NCHW =
cubic_interp1d<float>(coefficients[0], coefficients[1],
coefficients[2], coefficients[3], ty);
}
}
}
}
}
}
// Copyright (c) OpenMMLab. All rights reserved
#include "modulated_deform_conv.h"
#include <cmath>
#include <vector>
#include "../ort_mmcv_utils.h"
float bilinear_interpolate_2d(const float *src, const int64_t src_h,
const int64_t src_w, const float h,
const float w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}
int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;
float lh = h - h_low;
float lw = w - w_low;
float hh = 1 - lh;
float hw = 1 - lw;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
float v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
float v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
float v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1)
v4 = src[h_high * src_w + w_high];
float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
// output: (channels * kernel_h * kernel_w, dst_h * dst_w)
void deformable_im2col_2d(const float *input, const float *offset,
const float *mask, const int64_t src_h,
const int64_t src_w, const int64_t kernel_h,
const int64_t kernel_w, const int64_t pad_h,
const int64_t pad_w, const int64_t stride_h,
const int64_t stride_w, const int64_t dilation_h,
const int64_t dilation_w, const int64_t channels,
const int64_t offset_groups, const int64_t dst_h,
const int64_t dst_w, const bool use_mask,
float *columns) {
const int64_t workload = channels * dst_h * dst_w;
for (int64_t index = 0; index != workload; ++index) {
const int64_t ow = index % dst_w;
const int64_t oh = (index / dst_w) % dst_h;
const int64_t ic = index / (dst_w * dst_h);
const int64_t oc = ic * kernel_h * kernel_w;
int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = ic / c_per_offset_grp;
auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow);
auto input_ptr = input + ic * (src_h * src_w);
auto offset_ptr =
offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
auto mask_ptr = mask;
if (use_mask) {
mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w;
}
for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int64_t mask_idx = kh * kernel_w + kw;
const int64_t offset_idx = 2 * mask_idx;
float mask_value = 1;
if (use_mask) {
mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow];
}
const float offset_h =
offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow];
const float offset_w =
offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow];
const float ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h;
const float iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = mask_value *
bilinear_interpolate_2d(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}
void gemm_ref_fp32(const float *A, const float *B, const float *V,
const float *H, const int32_t trans_A, const int32_t trans_B,
const int32_t M, const int32_t N, const int32_t K,
const float alpha, const float beta, float *Y) {
if (!trans_A && !trans_B) { // MK, KN; NN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && !trans_B) { // KM, KN; TN
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[k * N + n];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (trans_A && trans_B) { // KM, NK; TT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[k * M + m] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
if (!trans_A && trans_B) { // MK, NK; NT
for (int64_t m = 0; m < M; ++m) {
for (int64_t n = 0; n < N; ++n) {
float y = 0.0f;
for (int64_t k = 0; k < K; ++k) {
y += A[m * K + k] * B[n * K + k];
}
y *= alpha;
if (V) y += beta * V[n];
if (H) y += beta * H[m * N + n];
Y[m * N + n] = y;
}
}
}
}
void deformable_conv2d_ref_fp32(
const float *src, const float *offset, const float *mask,
const float *filter, const float *bias, const int64_t batch,
const int64_t src_c, const int64_t src_h, const int64_t src_w,
const int64_t dst_c, const int64_t dst_h, const int64_t dst_w,
const int64_t group, const int64_t offset_group, const int64_t channels,
const int64_t num_output, const int64_t kernel_h, const int64_t kernel_w,
const int64_t stride_h, const int64_t stride_w, const int64_t pad_h,
const int64_t pad_w, const int64_t dilation_h, const int64_t dilation_w,
float *columns, float *dst) {
const int64_t ic_per_gp = channels / group;
const int64_t oc_per_gp = num_output / group;
for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col_2d(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h,
src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
dilation_h, dilation_w, ic_per_gp, offset_group, dst_h, dst_w,
mask != nullptr, columns);
float *dst_ptr =
dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
if (bias != nullptr) {
const float *bias_ptr = bias + g * oc_per_gp;
for (int64_t oc = 0; oc < oc_per_gp; ++oc) {
for (int64_t hw = 0; hw < dst_h * dst_w; ++hw) {
dst_ptr[oc * dst_h * dst_w + hw] = bias_ptr[oc];
}
}
} else {
        memset(dst_ptr, 0, sizeof(float) * oc_per_gp * dst_h * dst_w);
}
gemm_ref_fp32(filter + g * oc_per_gp * ic_per_gp * kernel_h * kernel_w,
columns, nullptr, dst_ptr, 0, 0, oc_per_gp, dst_h * dst_w,
ic_per_gp * kernel_h * kernel_w, 1.0f, 1.0f, dst_ptr);
}
}
}
MMCVModulatedDeformConvKernel::MMCVModulatedDeformConvKernel(
OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
std::vector<int64_t> stride =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "stride");
stride_height_ = stride[0];
stride_width_ = stride[1];
std::vector<int64_t> padding =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "padding");
padding_height_ = padding[0];
padding_width_ = padding[1];
std::vector<int64_t> dilation =
ort_.KernelInfoGetAttribute<std::vector<int64_t>>(info, "dilation");
dilation_height_ = dilation[0];
dilation_width_ = dilation[1];
deformable_group_ =
ort_.KernelInfoGetAttribute<int64_t>(info, "deform_groups");
group_ = ort_.KernelInfoGetAttribute<int64_t>(info, "groups");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void MMCVModulatedDeformConvKernel::Compute(OrtKernelContext *context) {
const int64_t stride_height = stride_height_;
const int64_t stride_width = stride_width_;
const int64_t padding_height = padding_height_;
const int64_t padding_width = padding_width_;
const int64_t dilation_height = dilation_height_;
const int64_t dilation_width = dilation_width_;
const int64_t deformable_group = deformable_group_;
const int64_t group = group_;
const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
const float *input_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
const OrtValue *offset = ort_.KernelContext_GetInput(context, 1);
const float *offset_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(offset));
const OrtValue *mask = ort_.KernelContext_GetInput(context, 2);
const float *mask_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(mask));
const OrtValue *filter = ort_.KernelContext_GetInput(context, 3);
const float *filter_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(filter));
const OrtValue *bias = ort_.KernelContext_GetInput(context, 4);
const float *bias_data =
(bias != nullptr)
? reinterpret_cast<const float *>(ort_.GetTensorData<float>(bias))
: nullptr;
OrtTensorDimensions input_dims(ort_, input);
OrtTensorDimensions filter_dims(ort_, filter);
int64_t batch = input_dims[0];
int64_t channels = input_dims[1];
int64_t in_height = input_dims[2];
int64_t in_width = input_dims[3];
int64_t num_output = filter_dims[0];
int64_t kernel_height = filter_dims[2];
int64_t kernel_width = filter_dims[3];
// get output memory
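// Standard convolution output size:
//   out = (in + 2 * pad - dilation * (kernel - 1) - 1) / stride + 1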
int64_t out_height = floor((in_height + 2 * padding_height -
dilation_height * (kernel_height - 1) - 1) /
stride_height +
1);
int64_t out_width = floor(
(in_width + 2 * padding_width - dilation_width * (kernel_width - 1) - 1) /
stride_width +
1);
std::vector<int64_t> output_dims = {batch, num_output, out_height, out_width};
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, output_dims.data(), output_dims.size());
float *out_ptr = ort_.GetTensorMutableData<float>(output);
// allocate tmp memory
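// `columns` holds the im2col matrix for a single (batch, group) slice:
// (channels / group) * kernel_height * kernel_width rows by
// out_height * out_width columns, reused across the loop iterations inside
// deformable_conv2d_ref_fp32.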
int64_t column_len = (channels / group) * kernel_height * kernel_width *
out_height * out_width;
float *columns = (float *)allocator_.Alloc(sizeof(float) * column_len);
deformable_conv2d_ref_fp32(
input_data, offset_data, mask_data, filter_data, bias_data, batch,
channels, in_height, in_width, num_output, out_height, out_width, group,
deformable_group, channels, num_output, kernel_height, kernel_width,
stride_height, stride_width, padding_height, padding_width,
dilation_height, dilation_width, columns, out_ptr);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "nms.h"
#include <assert.h>
#include <algorithm>
#include <cmath>
#include <iostream>
#include <iterator>
#include <numeric> // std::iota
#include <vector>
#include "../ort_mmcv_utils.h"
NmsKernel::NmsKernel(OrtApi api, const OrtKernelInfo *info)
: api_(api), ort_(api_), info_(info) {
iou_threshold_ = ort_.KernelInfoGetAttribute<float>(info, "iou_threshold");
offset_ = ort_.KernelInfoGetAttribute<int64_t>(info, "offset");
// create allocator
allocator_ = Ort::AllocatorWithDefaultOptions();
}
void NmsKernel::Compute(OrtKernelContext *context) {
const float iou_threshold = iou_threshold_;
const int64_t offset = offset_;
const OrtValue *boxes = ort_.KernelContext_GetInput(context, 0);
const float *boxes_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(boxes));
const OrtValue *scores = ort_.KernelContext_GetInput(context, 1);
const float *scores_data =
reinterpret_cast<const float *>(ort_.GetTensorData<float>(scores));
OrtTensorDimensions boxes_dim(ort_, boxes);
OrtTensorDimensions scores_dim(ort_, scores);
int64_t nboxes = boxes_dim[0];
assert(boxes_dim[1] == 4);
// allocate tmp memory
float *tmp_boxes = (float *)allocator_.Alloc(sizeof(float) * nboxes * 4);
float *sc = (float *)allocator_.Alloc(sizeof(float) * nboxes);
float *areas = (float *)allocator_.Alloc(sizeof(float) * nboxes);
bool *select = (bool *)allocator_.Alloc(sizeof(bool) * nboxes);
for (int64_t i = 0; i < nboxes; i++) {
select[i] = true;
}
memcpy(tmp_boxes, boxes_data, sizeof(float) * nboxes * 4);
memcpy(sc, scores_data, sizeof(float) * nboxes);
// sort scores
std::vector<float> tmp_sc;
for (int i = 0; i < nboxes; i++) {
tmp_sc.push_back(sc[i]);
}
std::vector<int64_t> order(tmp_sc.size());
std::iota(order.begin(), order.end(), 0);
std::sort(order.begin(), order.end(), [&tmp_sc](int64_t id1, int64_t id2) {
return tmp_sc[id1] > tmp_sc[id2];
});
// area = (x2 - x1 + offset) * (y2 - y1 + offset)
for (int64_t i = 0; i < nboxes; i++) {
areas[i] = (tmp_boxes[i * 4 + 2] - tmp_boxes[i * 4 + 0] + offset) *
(tmp_boxes[i * 4 + 3] - tmp_boxes[i * 4 + 1] + offset);
}
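// Greedy suppression in descending score order: keep the current box and
// discard every lower-scoring box whose IoU with it exceeds iou_threshold.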
for (int64_t _i = 0; _i < nboxes; _i++) {
if (select[_i] == false) continue;
auto i = order[_i];
auto ix1 = tmp_boxes[i * 4 + 0];
auto iy1 = tmp_boxes[i * 4 + 1];
auto ix2 = tmp_boxes[i * 4 + 2];
auto iy2 = tmp_boxes[i * 4 + 3];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < nboxes; _j++) {
if (select[_j] == false) continue;
auto j = order[_j];
auto xx1 = std::max(ix1, tmp_boxes[j * 4 + 0]);
auto yy1 = std::max(iy1, tmp_boxes[j * 4 + 1]);
auto xx2 = std::min(ix2, tmp_boxes[j * 4 + 2]);
auto yy2 = std::min(iy2, tmp_boxes[j * 4 + 3]);
auto w = std::max(0.f, xx2 - xx1 + offset);
auto h = std::max(0.f, yy2 - yy1 + offset);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr > iou_threshold) select[_j] = false;
}
}
std::vector<int64_t> res_order;
for (int i = 0; i < nboxes; i++) {
if (select[i]) {
res_order.push_back(order[i]);
}
}
std::vector<int64_t> inds_dims({static_cast<int64_t>(res_order.size())});
OrtValue *res = ort_.KernelContext_GetOutput(context, 0, inds_dims.data(),
inds_dims.size());
int64_t *res_data = ort_.GetTensorMutableData<int64_t>(res);
memcpy(res_data, res_order.data(), sizeof(int64_t) * res_order.size());
}
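// Worked example (illustrative, not part of the original source): boxes
// (0, 0, 10, 10) and (1, 1, 11, 11) with offset = 0 each have area 100; the
// intersection is 9 * 9 = 81, so IoU = 81 / (100 + 100 - 81) ~= 0.68. With
// iou_threshold = 0.5 the lower-scoring of the two is suppressed and only the
// higher-scoring index is written to the output tensor.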