Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
#include "MPSLibrary.h"
#include <c10/util/CallOnce.h>
#include "MPSDevice.h"
static std::unique_ptr<MPSLibraryManager> mps_library_manager;
static c10::once_flag mpsdev_init;
MPSLibraryManager* MPSLibraryManager::getInstance() {
c10::call_once(mpsdev_init, [] {
mps_library_manager = std::unique_ptr<MPSLibraryManager>(new MPSLibraryManager());
});
return mps_library_manager.get();
}
MPSLibraryManager::~MPSLibraryManager() {}
MPSLibraryManager::MPSLibraryManager() {}
bool MPSLibraryManager::hasLibrary(const std::string& name) {
return _library_map.find(name) != _library_map.end();
}
MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) {
if (_library_map.find(library_url) != _library_map.end()) {
return _library_map[library_url].get();
}
_library_map.emplace(std::make_pair(
library_url, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromUrl(library_url))));
return _library_map[library_url].get();
}
MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name,
const std::string& source) {
  NSString* ns_name = [NSString stringWithUTF8String:name.c_str()];
if (_library_map.find(name) != _library_map.end()) {
NSLog(@"Library %@ already exist.", ns_name);
return nullptr;
}
_library_map.emplace(
std::make_pair(name, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromSource(source))));
return _library_map[name].get();
}
MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) {
MPSLibrary* library = new MPSLibrary();
@autoreleasepool {
NSError* error = nil;
// load library and func
    NSString* url_str = [NSString stringWithUTF8String:library_url.c_str()];
    NSURL* metal_url = [NSURL fileURLWithPath:url_str];
library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url
error:&error];
if (library->_library == nil) {
NSLog(@"Failed to find library, error %@.", error);
exit(1);
}
}
return library;
}
MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) {
MPSLibrary* library = new MPSLibrary();
@autoreleasepool {
NSError* error = nil;
// load library and func
    NSString* code_str = [NSString stringWithUTF8String:sources.c_str()];
library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str
options:nil
error:&error];
if (library->_library == nil) {
NSLog(@"Failed to find library, error %@.", error);
exit(1);
}
}
return library;
}
MPSLibrary::~MPSLibrary() {
[_library release];
_library = nil;
}
MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) {
if (_pso_map.find(function_name) != _pso_map.end()) {
return _pso_map[function_name];
}
MTLComputePipelineState_t pso;
@autoreleasepool {
NSError* error = nil;
// create function
    NSString* function_name_str = [NSString stringWithUTF8String:function_name.c_str()];
    id<MTLFunction> func = [_library newFunctionWithName:function_name_str];
    if (func == nil) {
      NSLog(@"Failed to create function %@.", function_name_str);
      exit(1);
    }
// create pipeline
    pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func
                                                                                      error:&error];
    if (pso == nil) {
      NSLog(@"Failed to create pipeline state object, error %@.", error);
      exit(1);
    }
    _pso_map.emplace(std::make_pair(function_name, pso));
}
return _pso_map[function_name];
}
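// Usage sketch (illustrative only; the library path and kernel name below are
// placeholders, not part of this commit):
//   MPSLibrary* lib =
//       MPSLibraryManager::getInstance()->getLibrary("/path/to/kernels.metallib");
//   MTLComputePipelineState_t pso = lib->getComputePipelineState("my_kernel");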
// Copyright © 2022 Apple Inc.
// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h
#pragma once
#include <cstdint>
#include <utility>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/util/Exception.h>
#include "MPSDevice.h"
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
typedef id<MTLCommandQueue> MTLCommandQueue_t;
typedef id<MTLCommandBuffer> MTLCommandBuffer_t;
typedef id<MTLSharedEvent> MTLSharedEvent_t;
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLCommandQueue_t;
typedef void* MTLCommandQueue;
typedef void* MTLCommandBuffer_t;
typedef void* MTLCommandBuffer;
typedef void* MTLSharedEvent_t;
typedef void* dispatch_queue_t;
typedef void* MTLDevice_t;
#define nil NULL
#endif
namespace at {
namespace mps {
//-----------------------------------------------------------------
// MPSStream
//-----------------------------------------------------------------
class TORCH_API MPSStream {
public:
enum Unchecked { UNCHECKED };
/// Construct a MPSStream from a Stream. This construction is checked,
/// and will raise an error if the Stream is not, in fact, a MPS stream.
explicit MPSStream(Stream stream);
~MPSStream();
MTLCommandQueue_t commandQueue() const { return _commandQueue; };
dispatch_queue_t queue() const { return _serialQueue; }
MTLCommandBuffer_t commandBuffer();
void commit(bool flush);
void commitAndWait();
void synchronize();
void flush();
/// Get the MPS device index that this stream is associated with.
c10::DeviceIndex device_index() const { return _stream.device_index(); }
MTLCommandQueue_t stream() const { return _commandQueue; };
MTLDevice_t device() const { return [_commandQueue device]; }
/// Explicit conversion to Stream.
Stream unwrap() const { return _stream; }
private:
Stream _stream;
MTLCommandQueue_t _commandQueue = nil;
MTLCommandBuffer_t _commandBuffer = nil;
void _flush(bool commitAndWait) const;
dispatch_queue_t _serialQueue = nullptr;
};
/**
* Get the current MPS stream
*/
TORCH_API MPSStream* getCurrentMPSStream();
/**
* Get the default MPS stream
*/
TORCH_API MPSStream* getDefaultMPSStream();
//-----------------------------------------------------------------
// MPSStreamImpl
//-----------------------------------------------------------------
class TORCH_API MPSStreamImpl {
public:
/**
* Gets single instance of the MPSStream.
*/
static MPSStream* getInstance();
private:
static MPSStream* _stream;
MPSStreamImpl();
};
//-----------------------------------------------------------------
// MPSEvent
//-----------------------------------------------------------------
struct TORCH_API MPSEvent {
MPSEvent();
// MPSEvent(id<MTLDevice> device);
~MPSEvent();
MTLSharedEvent_t event() const { return _event; }
void recordEvent(MPSStream* stream);
void waitForEvent(MPSStream* queue); // waits on the cpu
bool queryEvent();
uint64_t getCurrentValue() { return _currentValue; }
void setCurrentValue(uint64_t currValue) { _currentValue = currValue; }
private:
bool _isRecorded = false;
uint64_t _currentValue = 0;
MTLSharedEvent_t _event;
};
typedef MPSEvent* mpsEvent_t;
} // namespace mps
} // namespace at
#ifndef _MPS_UTILS_H_
#define _MPS_UTILS_H_
#include <torch/extension.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLBuffer> MTLBuffer_t;
typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
#else
typedef void* MTLBuffer;
typedef void* MTLBuffer_t;
typedef void* MTLComputeCommandEncoder;
typedef void* MTLComputeCommandEncoder_t;
#endif
// utils
static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) {
return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data());
}
template <typename T,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t);
template <typename T,
std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
[encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index];
}
template <typename T, std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
[encoder setBytes:&t length:sizeof(t) atIndex:index];
}
inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {}
template <typename T, typename... Args>
void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) {
setMTLArg(encoder, index, std::forward<T>(t));
setMTLArgsImpl(encoder, index + 1, std::forward<Args>(args)...);
}
template <typename... Args>
void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) {
[encoder setComputePipelineState:pso];
setMTLArgsImpl(encoder, 0, std::forward<Args>(args)...);
}
#endif
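// Usage sketch (illustrative only; `encoder`, `pso`, `input`, `output` and `n`
// are placeholders). setMTLArgs binds the pipeline state and then each
// argument in order: at::Tensor arguments go through setBuffer (via
// getMTLBufferStorage), trivially copyable values go through setBytes.
//   setMTLArgs(encoder, pso, input, output, n);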
#ifndef PYTORCH_CPP_HELPER
#define PYTORCH_CPP_HELPER
#include <torch/extension.h>
#include <torch/types.h>
#include <vector>
using namespace at;
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_MLU(x) \
TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor")
#define CHECK_CPU(x) \
  TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_CUDA_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
#define CHECK_MLU_INPUT(x) \
CHECK_MLU(x); \
CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) \
CHECK_CPU(x); \
CHECK_CONTIGUOUS(x)
......
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef PYTORCH_MLU_HELPER_HPP_
#define PYTORCH_MLU_HELPER_HPP_
#ifdef MMCV_WITH_MLU
#include "aten.h"
#define NFU_ALIGN_SIZE 128
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y))
#endif
#endif // PYTORCH_MLU_HELPER_HPP_
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PARAMS_GRID_H_
#define PARAMS_GRID_H_
#include <tuple>
#include <vector>
namespace detail {
template <class scalar_t>
int getTotalSize(std::vector<scalar_t> arg) {
return arg.size();
}
template <class scalar_t, class... TArgs>
int getTotalSize(std::vector<scalar_t> arg, std::vector<TArgs>... args) {
return arg.size() * getTotalSize(args...);
}
template <typename scalar_t>
int getSize(std::vector<scalar_t> arg) {
return arg.size();
}
template <int Idx, class TT, class scalar_t>
void assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg) {
std::get<Idx>(src) = arg[counter[Idx]];
}
template <int Idx, class TT, class scalar_t, class... TArgs>
void assigner(TT &src, std::vector<int> counter, std::vector<scalar_t> &arg,
std::vector<TArgs> &... args) {
std::get<Idx>(src) = arg[counter[Idx]];
assigner<Idx + 1>(src, counter, args...);
}
} // namespace detail
template <class... TArgs>
std::vector<std::tuple<TArgs...>> paramsGrid(std::vector<TArgs>... args) {
int length = detail::getTotalSize(args...);
std::vector<int> sizes = {detail::getSize(args)...};
int size = sizes.size();
std::vector<std::tuple<TArgs...>> params(length);
std::vector<int> counter(size);
for (int i = 0; i < length; ++i) {
detail::assigner<0>(params[i], counter, args...);
counter[size - 1] += 1;
for (int c = size - 1; c >= 0; --c) {
if (counter[c] == sizes[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return params;
}
#endif
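// Usage sketch (not part of the original header): paramsGrid builds the
// Cartesian product of its argument vectors, with the first argument varying
// slowest. The values below are illustrative only.
inline void params_grid_example() {
  std::vector<int> channels = {16, 32};
  std::vector<float> scales = {0.5f, 1.0f};
  // grid holds 4 tuples: (16, 0.5), (16, 1.0), (32, 0.5), (32, 1.0).
  std::vector<std::tuple<int, float>> grid = paramsGrid(channels, scales);
  (void)grid;
}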
// Copyright Louis Delacroix 2010 - 2014.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
// A pretty printing library for C++
//
// Usage:
// Include this header, and operator<< will "just work".
#ifndef H_PRETTY_PRINT
#define H_PRETTY_PRINT
#include <cstddef>
#include <iterator>
#include <memory>
#include <ostream>
#include <set>
#include <tuple>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <valarray>
namespace pretty_print {
namespace detail {
// SFINAE type trait to detect whether T::const_iterator exists.
struct sfinae_base {
using yes = char;
using no = yes[2];
};
template <typename T>
struct has_const_iterator : private sfinae_base {
private:
template <typename C>
static yes &test(typename C::const_iterator *);
template <typename C>
static no &test(...);
public:
static const bool value = sizeof(test<T>(nullptr)) == sizeof(yes);
using type = T;
};
template <typename T>
struct has_begin_end : private sfinae_base {
private:
template <typename C>
static yes &
f(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (C::*)()
const>(&C::begin)),
typename C::const_iterator (C::*)() const>::value>::type *);
template <typename C>
static no &f(...);
template <typename C>
static yes &g(typename std::enable_if<
std::is_same<decltype(static_cast<typename C::const_iterator (
C::*)() const>(&C::end)),
typename C::const_iterator (C::*)() const>::value,
void>::type *);
template <typename C>
static no &g(...);
public:
static bool const beg_value = sizeof(f<T>(nullptr)) == sizeof(yes);
static bool const end_value = sizeof(g<T>(nullptr)) == sizeof(yes);
};
} // namespace detail
// Holds the delimiter values for a specific character type
template <typename TChar>
struct delimiters_values {
using char_type = TChar;
const char_type *prefix;
const char_type *delimiter;
const char_type *postfix;
};
// Defines the delimiter values for a specific container and character type
template <typename T, typename TChar>
struct delimiters {
using type = delimiters_values<TChar>;
static const type values;
};
// Functor to print containers. You can use this directly if you want
// to specify a non-default delimiters type. The printing logic can
// be customized by specializing the nested template.
template <typename T, typename TChar = char,
typename TCharTraits = ::std::char_traits<TChar>,
typename TDelimiters = delimiters<T, TChar>>
struct print_container_helper {
using delimiters_type = TDelimiters;
using ostream_type = std::basic_ostream<TChar, TCharTraits>;
template <typename U>
struct printer {
static void print_body(const U &c, ostream_type &stream) {
using std::begin;
using std::end;
auto it = begin(c);
const auto the_end = end(c);
if (it != the_end) {
for (;;) {
stream << *it;
if (++it == the_end) break;
if (delimiters_type::values.delimiter != NULL)
stream << delimiters_type::values.delimiter;
}
}
}
};
print_container_helper(const T &container) : container_(container) {}
inline void operator()(ostream_type &stream) const {
if (delimiters_type::values.prefix != NULL)
stream << delimiters_type::values.prefix;
printer<T>::print_body(container_, stream);
if (delimiters_type::values.postfix != NULL)
stream << delimiters_type::values.postfix;
}
private:
const T &container_;
};
// Specialization for pairs
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename T1, typename T2>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::pair<T1, T2>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
static void print_body(const std::pair<T1, T2> &c, ostream_type &stream) {
stream << c.first;
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << c.second;
}
};
// Specialization for tuples
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
template <typename... Args>
struct print_container_helper<T, TChar, TCharTraits,
TDelimiters>::printer<std::tuple<Args...>> {
using ostream_type =
typename print_container_helper<T, TChar, TCharTraits,
TDelimiters>::ostream_type;
using element_type = std::tuple<Args...>;
template <std::size_t I>
struct Int {};
static void print_body(const element_type &c, ostream_type &stream) {
tuple_print(c, stream, Int<0>());
}
static void tuple_print(const element_type &, ostream_type &,
Int<sizeof...(Args)>) {}
static void tuple_print(
const element_type &c, ostream_type &stream,
typename std::conditional<sizeof...(Args) != 0, Int<0>,
std::nullptr_t>::type) {
stream << std::get<0>(c);
tuple_print(c, stream, Int<1>());
}
template <std::size_t N>
static void tuple_print(const element_type &c, ostream_type &stream, Int<N>) {
if (print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter != NULL)
stream << print_container_helper<T, TChar, TCharTraits,
TDelimiters>::delimiters_type::values
.delimiter;
stream << std::get<N>(c);
tuple_print(c, stream, Int<N + 1>());
}
};
// Prints a print_container_helper to the specified stream.
template <typename T, typename TChar, typename TCharTraits,
typename TDelimiters>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &stream,
const print_container_helper<T, TChar, TCharTraits, TDelimiters> &helper) {
helper(stream);
return stream;
}
// Basic is_container template; specialize to derive from std::true_type for all
// desired container types
template <typename T>
struct is_container
: public std::integral_constant<bool,
detail::has_const_iterator<T>::value &&
detail::has_begin_end<T>::beg_value &&
detail::has_begin_end<T>::end_value> {};
template <typename T, std::size_t N>
struct is_container<T[N]> : std::true_type {};
template <std::size_t N>
struct is_container<char[N]> : std::false_type {};
template <typename T>
struct is_container<std::valarray<T>> : std::true_type {};
template <typename T1, typename T2>
struct is_container<std::pair<T1, T2>> : std::true_type {};
template <typename... Args>
struct is_container<std::tuple<Args...>> : std::true_type {};
// Default delimiters
template <typename T>
struct delimiters<T, char> {
static const delimiters_values<char> values;
};
template <typename T>
const delimiters_values<char> delimiters<T, char>::values = {"[", ", ", "]"};
template <typename T>
struct delimiters<T, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T>
const delimiters_values<wchar_t> delimiters<T, wchar_t>::values = {L"[", L", ",
L"]"};
// Delimiters for (multi)set and unordered_(multi)set
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::set<T, TComp, TAllocator>, char>::values = {"{", ", ",
"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::set<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::set<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<char>
delimiters<::std::multiset<T, TComp, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename TComp, typename TAllocator>
struct delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename TComp, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::multiset<T, TComp, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t> delimiters<
::std::unordered_set<T, THash, TEqual, TAllocator>, wchar_t>::values = {
L"{", L", ", L"}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
char> {
static const delimiters_values<char> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<char> delimiters<
::std::unordered_multiset<T, THash, TEqual, TAllocator>, char>::values = {
"{", ", ", "}"};
template <typename T, typename THash, typename TEqual, typename TAllocator>
struct delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T, typename THash, typename TEqual, typename TAllocator>
const delimiters_values<wchar_t>
delimiters<::std::unordered_multiset<T, THash, TEqual, TAllocator>,
wchar_t>::values = {L"{", L", ", L"}"};
// Delimiters for pair and tuple
template <typename T1, typename T2>
struct delimiters<std::pair<T1, T2>, char> {
static const delimiters_values<char> values;
};
template <typename T1, typename T2>
const delimiters_values<char> delimiters<std::pair<T1, T2>, char>::values = {
"(", ", ", ")"};
template <typename T1, typename T2>
struct delimiters<::std::pair<T1, T2>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename T1, typename T2>
const delimiters_values<wchar_t>
delimiters<::std::pair<T1, T2>, wchar_t>::values = {L"(", L", ", L")"};
template <typename... Args>
struct delimiters<std::tuple<Args...>, char> {
static const delimiters_values<char> values;
};
template <typename... Args>
const delimiters_values<char> delimiters<std::tuple<Args...>, char>::values = {
"(", ", ", ")"};
template <typename... Args>
struct delimiters<::std::tuple<Args...>, wchar_t> {
static const delimiters_values<wchar_t> values;
};
template <typename... Args>
const delimiters_values<wchar_t>
delimiters<::std::tuple<Args...>, wchar_t>::values = {L"(", L", ", L")"};
// Type-erasing helper class for easy use of custom delimiters.
// Requires TCharTraits = std::char_traits<TChar> and TChar = char or wchar_t,
// and MyDelims needs to be defined for TChar. Usage: "cout <<
// pretty_print::custom_delims<MyDelims>(x)".
struct custom_delims_base {
virtual ~custom_delims_base() {}
virtual std::ostream &stream(::std::ostream &) = 0;
virtual std::wostream &stream(::std::wostream &) = 0;
};
template <typename T, typename Delims>
struct custom_delims_wrapper : custom_delims_base {
custom_delims_wrapper(const T &t_) : t(t_) {}
std::ostream &stream(std::ostream &s) {
return s << print_container_helper<T, char, std::char_traits<char>, Delims>(
t);
}
std::wostream &stream(std::wostream &s) {
return s << print_container_helper<T, wchar_t, std::char_traits<wchar_t>,
Delims>(t);
}
private:
const T &t;
};
template <typename Delims>
struct custom_delims {
template <typename Container>
custom_delims(const Container &c)
: base(new custom_delims_wrapper<Container, Delims>(c)) {}
std::unique_ptr<custom_delims_base> base;
};
template <typename TChar, typename TCharTraits, typename Delims>
inline std::basic_ostream<TChar, TCharTraits> &operator<<(
std::basic_ostream<TChar, TCharTraits> &s, const custom_delims<Delims> &p) {
return p.base->stream(s);
}
// A wrapper for a C-style array given as pointer-plus-size.
// Usage: std::cout << pretty_print_array(arr, n) << std::endl;
template <typename T>
struct array_wrapper_n {
typedef const T *const_iterator;
typedef T value_type;
array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {}
inline const_iterator begin() const { return _array; }
inline const_iterator end() const { return _array + _n; }
private:
const T *const _array;
size_t _n;
};
// A wrapper for hash-table based containers that offer local iterators to each
// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket
// 5 of container m.)
template <typename T>
struct bucket_print_wrapper {
typedef typename T::const_local_iterator const_iterator;
typedef typename T::size_type size_type;
const_iterator begin() const { return m_map.cbegin(n); }
const_iterator end() const { return m_map.cend(n); }
bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {}
private:
const T &m_map;
const size_type n;
};
} // namespace pretty_print
// Global accessor functions for the convenience wrappers
template <typename T>
inline pretty_print::array_wrapper_n<T> pretty_print_array(const T *const a,
size_t n) {
return pretty_print::array_wrapper_n<T>(a, n);
}
template <typename T>
pretty_print::bucket_print_wrapper<T> bucket_print(const T &m,
typename T::size_type n) {
return pretty_print::bucket_print_wrapper<T>(m, n);
}
// Main magic entry point: An overload snuck into namespace std.
// Can we do better?
namespace std {
// Prints a container to the stream using default delimiters
template <typename T, typename TChar, typename TCharTraits>
inline typename enable_if<::pretty_print::is_container<T>::value,
basic_ostream<TChar, TCharTraits> &>::type
operator<<(basic_ostream<TChar, TCharTraits> &stream, const T &container) {
return stream
<< ::pretty_print::print_container_helper<T, TChar, TCharTraits>(
container);
}
} // namespace std
#endif // H_PRETTY_PRINT
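// Usage sketch (not part of the original header): once this header is
// included, operator<< works for standard containers and tuples, e.g.
//   std::set<int> s = {1, 2, 3};
//   std::cout << s << std::endl;                 // prints {1, 2, 3}
//   std::pair<int, const char*> p = {1, "one"};
//   std::cout << p << std::endl;                 // prints (1, one)
// Default delimiters are "[", ", ", "]" for generic containers, braces for
// sets, and parentheses for pairs and tuples.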
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/embed.h>
#include <pybind11/functional.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <spconv/tensorview/tensorview.h>
#include <algorithm>
#include <iostream>
namespace py = pybind11;
template <typename scalar_t, typename TPyObject>
std::vector<scalar_t> array2Vector(TPyObject arr) {
py::array arr_np = arr;
size_t size = arr.attr("size").template cast<size_t>();
py::array_t<scalar_t> arr_cc = arr_np;
std::vector<scalar_t> data(arr_cc.data(), arr_cc.data() + size);
return data;
}
template <typename scalar_t>
std::vector<scalar_t> arrayT2Vector(py::array_t<scalar_t> arr) {
std::vector<scalar_t> data(arr.data(), arr.data() + arr.size());
return data;
}
template <typename scalar_t, typename TPyObject>
tv::TensorView<scalar_t> array2TensorView(TPyObject arr) {
py::array arr_np = arr;
py::array_t<scalar_t> arr_cc = arr_np;
tv::Shape shape;
for (int i = 0; i < arr_cc.ndim(); ++i) {
shape.push_back(arr_cc.shape(i));
}
return tv::TensorView<scalar_t>(arr_cc.mutable_data(), shape);
}
template <typename scalar_t>
tv::TensorView<scalar_t> arrayT2TensorView(py::array_t<scalar_t> arr) {
tv::Shape shape;
for (int i = 0; i < arr.ndim(); ++i) {
shape.push_back(arr.shape(i));
}
return tv::TensorView<scalar_t>(arr.mutable_data(), shape);
}
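// Usage sketch (illustrative only; `arr` stands for any py::array_t<float>):
//   std::vector<float> v = arrayT2Vector<float>(arr);            // copies the data
//   tv::TensorView<float> view = arrayT2TensorView<float>(arr);  // wraps it, no copy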
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPCONV_GEOMETRY_H_
#define SPCONV_GEOMETRY_H_
#include <utils/spconv/tensorview/tensorview.h>
#include <iostream>
#include <limits>
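// Enumerates the output positions that a single input position can contribute
// to under a strided, padded, dilated sparse convolution. For each valid
// position it writes NDim spatial coordinates plus the flattened kernel offset
// into `out` (NDim + 1 values per point) and returns the number of points.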
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos,
const Index *kernelSize,
const Index *stride, const Index *padding,
const Index *dilation,
const Index *outSpatialShape, Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 +
stride[i] + padding[i]) /
stride[i];
uppers[i] = (input_pos[i] + padding[i]) / stride[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
// break;
}
offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE Index getValidOutPosTranspose(
const Index *input_pos, const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation, const Index *outSpatialShape,
Index *out) {
Index lowers[NDim];
Index uppers[NDim];
Index counter[NDim];
Index counterSize[NDim];
Index pointCounter = 0;
Index val;
Index numPoints = 1;
Index m, offset;
bool valid = false;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
lowers[i] = input_pos[i] * stride[i] - padding[i];
uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i];
}
#pragma unroll
for (unsigned i = 0; i < NDim; ++i) {
counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1);
numPoints *= counterSize[i];
}
#pragma unroll
for (int i = 0; i < NDim; ++i) {
counter[i] = 0;
}
for (int i = 0; i < numPoints; ++i) {
valid = true;
m = 1;
offset = 0;
#pragma unroll
for (int j = NDim - 1; j >= 0; --j) {
val = uppers[j] - counter[j] * dilation[j];
out[pointCounter * (NDim + 1) + j] = val;
if (val < 0 || (val > outSpatialShape[j] - 1)) {
valid = false;
}
offset += m * (val - lowers[j]) / dilation[j];
m *= kernelSize[j];
}
out[pointCounter * (NDim + 1) + NDim] = offset;
if (valid) ++pointCounter;
counter[NDim - 1] += 1;
#pragma unroll
for (int c = NDim - 1; c >= 0; --c) {
if (counter[c] == counterSize[c] && c > 0) {
counter[c - 1] += 1;
counter[c] = 0;
}
}
}
return pointCounter;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
// indicesOut: num_active * kernelVolume * (NDim + 1)
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *kernelSize, const Index *stride,
const Index *padding, const Index *dilation,
const Index *outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
for (int j = 0; j < numActIn; ++j) {
batchIdx = indicesIn(j, 0);
numValidPoints = getValidOutPosTranspose<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * batchIdx;
if (gridsOut[index] == -1) {
for (unsigned k = 1; k < NDim + 1; ++k) {
indicesOut(numAct, k) = pointPtr[k - 1];
}
indicesOut(numAct, 0) = batchIdx;
gridsOut[index] = numAct++;
}
// indicePairs: [K, 2, L]
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
return numAct;
}
template <typename Index, typename IndexGrid, unsigned NDim>
Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const Index *const kernelSize,
const Index *const stride, const Index *const padding,
const Index *dilation,
const Index *const outSpatialShape) {
Index numAct = 0;
auto numActIn = indicesIn.dim(0);
Index batchIdx = 0;
Index spatialVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
spatialVolume *= outSpatialShape[i];
}
Index kernelVolume = 1;
#pragma unroll
for (int i = 0; i < NDim; ++i) {
kernelVolume *= kernelSize[i];
}
Index numValidPoints = 0;
// Index validPoints[kernelVolume * (NDim + 1)];
std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
Index *validPoints = validPoints_.data();
Index *pointPtr = nullptr;
Index index = 0;
for (int j = 0; j < numActIn; ++j) {
index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
outSpatialShape) +
spatialVolume * indicesIn(j, 0);
gridsOut[index] = j;
}
for (int j = 0; j < numActIn; ++j) {
numValidPoints = getValidOutPos<Index, NDim>(
indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
dilation, outSpatialShape, validPoints);
for (Index i = 0; i < numValidPoints; ++i) {
pointPtr = validPoints + i * (NDim + 1);
auto offset = pointPtr[NDim];
index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
spatialVolume * indicesIn(j, 0);
if (gridsOut[index] > -1) {
indicePairs(offset, 0, indiceNum[offset]) = j;
indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
}
}
}
return numActIn;
}
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
#define SPARSE_CONV_INDICE_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP1 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctorP2 {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
tv::TensorView<Index> indicePairUnique,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateConvIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<Index> indicesOut,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
struct CreateSubMIndicePairFunctor {
Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
tv::TensorView<IndexGrid> gridsOut,
tv::TensorView<Index> indicePairs,
tv::TensorView<Index> indiceNum,
const tv::SimpleVector<Index, NDim> kernelSize,
const tv::SimpleVector<Index, NDim> stride,
const tv::SimpleVector<Index, NDim> padding,
const tv::SimpleVector<Index, NDim> dilation,
const tv::SimpleVector<Index, NDim> outSpatialShape,
bool transpose, bool resetGrid = false);
};
} // namespace functor
#endif
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
#define SPARSE_MAXPOOL_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolForwardFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename scalar_t, typename Index>
struct SparseMaxPoolBackwardFunctor {
void operator()(const Device& d, tv::TensorView<const scalar_t> outFeatures,
tv::TensorView<const scalar_t> inFeatures,
tv::TensorView<const scalar_t> fout,
tv::TensorView<scalar_t> fin,
tv::TensorView<const Index> indices, int size);
};
} // namespace functor
#endif
#ifndef MP_HELPER_H_
#define MP_HELPER_H_
#include <type_traits>
#include <utility>
template <class... T>
struct mp_list {};
template <class T, T... I>
using mp_list_c = mp_list<std::integral_constant<T, I>...>;
namespace detail {
template <class... T, class F>
constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
}
template <class F>
constexpr F mp_for_each_impl(mp_list<>, F &&f) {
return std::forward<F>(f);
}
} // namespace detail
namespace detail {
template <class A, template <class...> class B>
struct mp_rename_impl {
// An error "no type named 'type'" here means that the first argument to
// mp_rename is not a list
};
template <template <class...> class A, class... T, template <class...> class B>
struct mp_rename_impl<A<T...>, B> {
using type = B<T...>;
};
} // namespace detail
template <class A, template <class...> class B>
using mp_rename = typename ::detail::mp_rename_impl<A, B>::type;
template <class L, class F>
constexpr F mp_for_each(F &&f) {
return ::detail::mp_for_each_impl(mp_rename<L, mp_list>(),
std::forward<F>(f));
}
#endif
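// Usage sketch (not part of the original header): mp_for_each invokes a
// callable once per type in an mp_list, which is handy for compile-time
// dispatch over a fixed set of index types.
inline void mp_for_each_example() {
  mp_for_each<mp_list<int, long>>([](auto t) {
    using T = decltype(t);
    (void)sizeof(T);  // T is int on the first call, long on the second
  });
}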
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <math.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <algorithm>
#include <iostream>
namespace py = pybind11;
using namespace pybind11::literals;
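// Scatters points into a dense voxel grid defined by voxel_size and
// coors_range. At most max_points points are kept per voxel and at most
// max_voxels voxels are created; coor_to_voxelidx acts as a scratch lookup
// table and is reset before returning. Returns the number of voxels produced.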
template <typename DType, int NDim>
int points_to_voxel_3d_np(py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_mean(py::array_t<DType> points,
py::array_t<DType> voxels,
py::array_t<DType> means, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_points,
int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto means_rw = means.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
num_points_per_voxel_rw(voxelidx) += 1;
for (int k = 0; k < num_features; ++k) {
means_rw(voxelidx, k) +=
(points_rw(i, k) - means_rw(voxelidx, k)) / DType(num + 1);
}
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
num = num_points_per_voxel_rw(i);
for (int j = num; j < max_points; ++j) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(i, j, k) = means_rw(i, k);
}
}
}
return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_np_height(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<DType> height, py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels) {
auto points_rw = points.template mutable_unchecked<2>();
auto height_rw = height.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
height_rw(voxelidx, k) =
std::min(points_rw(i, k), height_rw(voxelidx, k));
maxs_rw(voxelidx, k) = std::max(points_rw(i, k), maxs_rw(voxelidx, k));
}
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor_to_voxelidx_rw(coors_rw(i, 0), coors_rw(i, 1), coors_rw(i, 2)) = -1;
for (int k = 0; k < num_features; ++k) {
height_rw(i, k) = maxs_rw(i, k) - height_rw(i, k);
}
}
return voxel_num;
}
template <typename DType, int NDim>
int block_filtering(py::array_t<DType> points, py::array_t<int> mask,
py::array_t<DType> height, py::array_t<DType> maxs,
py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size,
std::vector<DType> coors_range, int max_voxels, DType eps) {
  auto points_rw = points.template mutable_unchecked<2>();
  auto mask_rw = mask.mutable_unchecked<1>();
  auto height_rw = height.template mutable_unchecked<1>();
auto maxs_rw = maxs.template mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int voxelidx, num;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
}
height_rw(voxelidx) = std::min(points_rw(i, 2), height_rw(voxelidx));
maxs_rw(voxelidx) = std::max(points_rw(i, 2), maxs_rw(voxelidx));
}
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
    if ((maxs_rw(voxelidx) - height_rw(voxelidx)) < eps) {
      mask_rw(i) = 0;
    }
  }
  return voxel_num;
}
template <typename DType, int NDim>
int points_to_voxel_3d_with_filtering(
py::array_t<DType> points, py::array_t<DType> voxels,
py::array_t<int> voxel_mask, py::array_t<DType> mins,
py::array_t<DType> maxs, py::array_t<int> coors,
py::array_t<int> num_points_per_voxel, py::array_t<int> coor_to_voxelidx,
std::vector<DType> voxel_size, std::vector<DType> coors_range,
int max_points, int max_voxels, int block_factor, int block_size,
DType height_threshold) {
auto points_rw = points.template mutable_unchecked<2>();
auto mins_rw = mins.template mutable_unchecked<2>();
auto maxs_rw = maxs.template mutable_unchecked<2>();
auto voxels_rw = voxels.template mutable_unchecked<3>();
auto voxel_mask_rw = voxel_mask.template mutable_unchecked<1>();
auto coors_rw = coors.mutable_unchecked<2>();
auto num_points_per_voxel_rw = num_points_per_voxel.mutable_unchecked<1>();
auto coor_to_voxelidx_rw = coor_to_voxelidx.mutable_unchecked<NDim>();
auto N = points_rw.shape(0);
auto num_features = points_rw.shape(1);
constexpr int ndim_minus_1 = NDim - 1;
int voxel_num = 0;
bool failed = false;
int coor[NDim];
int c;
int grid_size[NDim];
DType max_value, min_value;
for (int i = 0; i < NDim; ++i) {
grid_size[i] =
round((coors_range[NDim + i] - coors_range[i]) / voxel_size[i]);
}
int block_shape_H = grid_size[1] / block_factor;
int block_shape_W = grid_size[0] / block_factor;
int voxelidx, num;
int block_coor[2];
int startx, stopx, starty, stopy;
for (int i = 0; i < N; ++i) {
failed = false;
for (int j = 0; j < NDim; ++j) {
c = floor((points_rw(i, j) - coors_range[j]) / voxel_size[j]);
if ((c < 0 || c >= grid_size[j])) {
failed = true;
break;
}
coor[ndim_minus_1 - j] = c;
}
if (failed) continue;
voxelidx = coor_to_voxelidx_rw(coor[0], coor[1], coor[2]);
if (voxelidx == -1) {
voxelidx = voxel_num;
if (voxel_num >= max_voxels) continue;
voxel_num += 1;
coor_to_voxelidx_rw(coor[0], coor[1], coor[2]) = voxelidx;
for (int k = 0; k < NDim; ++k) {
coors_rw(voxelidx, k) = coor[k];
}
}
num = num_points_per_voxel_rw(voxelidx);
if (num < max_points) {
for (int k = 0; k < num_features; ++k) {
voxels_rw(voxelidx, num, k) = points_rw(i, k);
}
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
mins_rw(block_coor[0], block_coor[1]) =
std::min(points_rw(i, 2), mins_rw(block_coor[0], block_coor[1]));
maxs_rw(block_coor[0], block_coor[1]) =
std::max(points_rw(i, 2), maxs_rw(block_coor[0], block_coor[1]));
num_points_per_voxel_rw(voxelidx) += 1;
}
}
for (int i = 0; i < voxel_num; ++i) {
coor[1] = coors_rw(i, 1);
coor[2] = coors_rw(i, 2);
coor_to_voxelidx_rw(coors_rw(i, 0), coor[1], coor[2]) = -1;
block_coor[0] = coor[1] / block_factor;
block_coor[1] = coor[2] / block_factor;
min_value = mins_rw(block_coor[0], block_coor[1]);
max_value = maxs_rw(block_coor[0], block_coor[1]);
startx = std::max(0, block_coor[0] - block_size / 2);
stopx =
std::min(block_shape_H, block_coor[0] + block_size - block_size / 2);
starty = std::max(0, block_coor[1] - block_size / 2);
stopy =
std::min(block_shape_W, block_coor[1] + block_size - block_size / 2);
for (int j = startx; j < stopx; ++j) {
for (int k = starty; k < stopy; ++k) {
min_value = std::min(min_value, mins_rw(j, k));
max_value = std::max(max_value, maxs_rw(j, k));
}
}
voxel_mask_rw(i) = (max_value - min_value) > height_threshold;
}
return voxel_num;
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef SPARSE_REORDERING_FUNCTOR_H_
#define SPARSE_REORDERING_FUNCTOR_H_
#include <utils/spconv/tensorview/tensorview.h>
namespace functor {
template <typename Device, typename scalar_t, typename Index>
struct SparseGatherFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size);
};
template <typename Device, typename scalar_t, typename Index>
struct SparseScatterAddFunctor {
void operator()(const Device& d, tv::TensorView<scalar_t> out_features,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size,
bool stable = false);
};
} // namespace functor
#endif
#pragma once
namespace tv {
namespace detail {
template <typename scalar_t>
class KernelLoop {
struct Iterator {
__forceinline__ __device__ Iterator(scalar_t index, scalar_t delta)
: index_(index), delta_(delta) {}
__forceinline__ __device__ scalar_t operator*() const { return index_; }
__forceinline__ __device__ Iterator &operator++() {
index_ += delta_;
return *this;
}
__forceinline__ __device__ bool operator!=(const Iterator &other) const {
bool greater = index_ > other.index_;
bool less = index_ < other.index_;
if (!other.delta_) {
return less;
}
if (!delta_) {
return greater;
}
return less || greater;
}
private:
scalar_t index_;
const scalar_t delta_;
};
public:
__forceinline__ __device__ KernelLoop(scalar_t begin, scalar_t delta,
scalar_t end)
: begin_(begin), delta_(delta), end_(end) {}
__forceinline__ __device__ Iterator begin() const {
return Iterator{begin_, delta_};
}
__forceinline__ __device__ Iterator end() const { return Iterator{end_, 0}; }
private:
scalar_t begin_;
scalar_t delta_;
scalar_t end_;
};
} // namespace detail
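// Helper to visit indices in the range 0 <= i < count using the x-coordinate.
// Usage: for(int i : KernelLoopX(count)) { visit(i); }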
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopX(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.x * blockDim.x + threadIdx.x,
gridDim.x * blockDim.x * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the y-coordinate.
// Usage: for(int i : KernelLoopY(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopY(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.y * blockDim.y + threadIdx.y,
gridDim.y * blockDim.y * NumILP, count);
}
// Helper to visit indices in the range 0 <= i < count using the z-coordinate.
// Usage: for(int i : KernelLoopZ(count)) { visit(i); }
template <typename scalar_t, int NumILP = 1>
__forceinline__ __device__ detail::KernelLoop<scalar_t> KernelLoopZ(
scalar_t count) {
return detail::KernelLoop<scalar_t>(blockIdx.z * blockDim.z + threadIdx.z,
gridDim.z * blockDim.z * NumILP, count);
}
} // namespace tv
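// --- Illustrative kernel sketch (editor's addition, not part of the original
// sources). Shows the intended use of the grid-stride helpers above; the
// kernel name fill_kernel and its parameters are hypothetical.
template <typename scalar_t>
__global__ void fill_kernel(scalar_t *data, scalar_t value, int count) {
  // Each thread starts at its global x index and advances by the total number
  // of threads in the grid until count is reached.
  for (int i : tv::KernelLoopX<int>(count)) {
    data[i] = value;
  }
}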
#pragma once
// from pytorch.aten
#include "tensorview.h"
namespace tv {
namespace launch {
template <typename T1, typename T2>
inline int DivUp(const T1 a, const T2 b) {
return (a + b - 1) / b;
}
constexpr int CUDA_NUM_THREADS = 1024;
inline int getBlocks(const int N) {
TV_ASSERT_RT_ERR(N > 0,
"CUDA kernel launch blocks must be positive, but got N=", N);
return DivUp(N, CUDA_NUM_THREADS);
}
} // namespace launch
} // namespace tv
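// --- Illustrative launch sketch (editor's addition, not part of the original
// sources). Assuming a grid-stride kernel such as the fill_kernel sketched
// above, getBlocks() picks enough blocks of CUDA_NUM_THREADS threads to cover
// n elements (it asserts n > 0).
template <typename scalar_t>
void fill_device_buffer(scalar_t *d_ptr, scalar_t value, int n) {
  fill_kernel<scalar_t><<<tv::launch::getBlocks(n),
                          tv::launch::CUDA_NUM_THREADS>>>(d_ptr, value, n);
  TV_CHECK_CUDA_ERR();  // throws std::runtime_error if the launch failed
}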
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <sstream>
#include <type_traits>
#include <vector>
#include "pytorch_cpp_helper.hpp"
namespace tv {
#ifdef __NVCC__
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__ __host__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#define TV_ASSERT(expr) assert(expr)
#elif defined(__CUDACC_RTC__)
#define TV_ASSERT(expr) assert(expr)
#define TV_HOST_DEVICE_INLINE __forceinline__ __device__
#define TV_DEVICE_INLINE __forceinline__ __device__
#define TV_HOST_DEVICE __device__ __host__
#else
#define TV_ASSERT(x) assert(x)
#define TV_HOST_DEVICE_INLINE inline
#define TV_HOST_DEVICE
#endif
#define TV_REQUIRE(expr, ...) \
{ \
if (!(expr)) { \
printf(__VA_ARGS__); \
assert(expr); \
} \
}
#define TV_DEVICE_REQUIRE(expr, ...) \
{ \
if (!(expr) && threadIdx.x == 0) printf(__VA_ARGS__); \
assert(expr); \
}
template <class SStream, class T>
void sstream_print(SStream &ss, T val) {
ss << val;
}
template <class SStream, class T, class... TArgs>
void sstream_print(SStream &ss, T val, TArgs... args) {
ss << val << " ";
sstream_print(ss, args...);
}
#define TV_ASSERT_RT_ERR(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::runtime_error(__macro_s.str()); \
} \
}
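// Usage sketch (editor's note): TV_ASSERT_RT_ERR(n > 0, "n must be positive, got", n)
// throws std::runtime_error carrying file/line, the stringified expression, and
// the space-separated message assembled by tv::sstream_print.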
#define TV_ASSERT_INVALID_ARG(expr, ...) \
{ \
if (!(expr)) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << #expr << " assert failed. "; \
tv::sstream_print(__macro_s, __VA_ARGS__); \
throw std::invalid_argument(__macro_s.str()); \
} \
}
#define TV_CHECK_CUDA_ERR() \
{ \
auto err = cudaGetLastError(); \
if (err != cudaSuccess) { \
std::stringstream __macro_s; \
__macro_s << __FILE__ << " " << __LINE__ << "\n"; \
__macro_s << "cuda execution failed with error " << err; \
throw std::runtime_error(__macro_s.str()); \
} \
}
struct CPU {};
#define TV_MAX_DIM 6
template <typename scalar_t, size_t MaxDim = TV_MAX_DIM>
struct SimpleVector {
public:
TV_HOST_DEVICE_INLINE SimpleVector(){};
TV_HOST_DEVICE_INLINE SimpleVector(std::initializer_list<scalar_t> q) {
TV_ASSERT(q.size() <= MaxDim);
mSize = 0;
for (scalar_t s : q) {
mArray[mSize++] = s;
}
mSize = q.size();
}
SimpleVector(const std::vector<scalar_t> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE SimpleVector(
const SimpleVector<scalar_t, MaxDim> &arr) {
TV_ASSERT(arr.size() <= MaxDim);
for (size_t i = 0; i < arr.size(); ++i) {
mArray[i] = arr[i];
}
mSize = arr.size();
}
TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE const scalar_t &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < mSize);
#endif
return mArray[idx];
}
TV_HOST_DEVICE_INLINE void push_back(scalar_t s) {
#ifdef TV_DEBUG
TV_ASSERT(mSize < MaxDim);
#endif
mArray[mSize] = s;
mSize++;
}
TV_HOST_DEVICE_INLINE void pop_back() {
#ifdef TV_DEBUG
TV_ASSERT(mSize > 0);
#endif
mSize--;
}
TV_HOST_DEVICE_INLINE size_t size() const { return mSize; }
TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mArray; }
  TV_HOST_DEVICE_INLINE bool empty() const { return mSize == 0; }
typedef size_t size_type;
class iterator {
public:
typedef iterator self_type;
typedef scalar_t value_type;
typedef scalar_t &reference;
typedef scalar_t *pointer;
typedef std::forward_iterator_tag iterator_category;
typedef std::ptrdiff_t difference_type;
TV_HOST_DEVICE_INLINE iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
class const_iterator {
public:
typedef const_iterator self_type;
typedef scalar_t value_type;
typedef const scalar_t &reference;
typedef const scalar_t *pointer;
typedef std::ptrdiff_t difference_type;
typedef std::forward_iterator_tag iterator_category;
TV_HOST_DEVICE_INLINE const_iterator(pointer ptr) : ptr_(ptr) {}
TV_HOST_DEVICE_INLINE self_type operator++(int junk) {
self_type i = *this;
ptr_++;
return i;
}
TV_HOST_DEVICE_INLINE self_type operator++() {
ptr_++;
return *this;
}
TV_HOST_DEVICE_INLINE reference operator*() { return *ptr_; }
TV_HOST_DEVICE_INLINE pointer operator->() { return ptr_; }
TV_HOST_DEVICE_INLINE bool operator==(const self_type &rhs) {
return ptr_ == rhs.ptr_;
}
TV_HOST_DEVICE_INLINE bool operator!=(const self_type &rhs) {
return ptr_ != rhs.ptr_;
}
private:
pointer ptr_;
};
TV_HOST_DEVICE_INLINE iterator begin() { return iterator(mArray); }
TV_HOST_DEVICE_INLINE iterator end() { return iterator(mArray + mSize); }
TV_HOST_DEVICE_INLINE const_iterator begin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator end() const {
return const_iterator(mArray + mSize);
}
TV_HOST_DEVICE_INLINE const_iterator cbegin() const {
return const_iterator(mArray);
}
TV_HOST_DEVICE_INLINE const_iterator cend() const {
return const_iterator(mArray + mSize);
}
protected:
scalar_t mArray[MaxDim];
size_t mSize = 0;
};
template <typename scalar_t, size_t MaxDim>
bool operator==(const SimpleVector<scalar_t, MaxDim> &lfs,
const SimpleVector<scalar_t, MaxDim> &rfs) {
if (lfs.size() != rfs.size()) return false;
for (size_t i = 0; i < lfs.size(); ++i) {
if (lfs[i] != rfs[i]) return false;
}
return true;
}
template <typename scalar_t, size_t MaxDim>
bool operator!=(const SimpleVector<scalar_t, MaxDim> &lfs,
const SimpleVector<scalar_t, MaxDim> &rfs) {
return !(lfs == rfs);
}
struct Slice {
template <class... Integers>
TV_HOST_DEVICE_INLINE Slice(Integers... ints) {
    static_assert(sizeof...(ints) <= 3, "slice init must have at most 3 elements");
SimpleVector<int, 3> slices{int(ints)...};
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
for (size_t i = 0; i < slices.size(); ++i) {
mSlices[i] = slices[i];
}
}
TV_HOST_DEVICE_INLINE Slice() {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
}
template <typename scalar_t>
TV_HOST_DEVICE_INLINE Slice(std::initializer_list<scalar_t> slice) {
mSlices[0] = -1;
mSlices[1] = -1;
mSlices[2] = -1;
TV_ASSERT(slice.size() <= 3);
int idx = 0;
for (scalar_t s : slice) {
mSlices[idx] = int(s);
++idx;
}
}
TV_HOST_DEVICE_INLINE int &operator[](int idx) {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
TV_HOST_DEVICE_INLINE const int &operator[](int idx) const {
#ifdef TV_DEBUG
TV_ASSERT(idx >= 0 && idx < 3);
#endif
return mSlices[idx];
}
protected:
int mSlices[3];
};
template <size_t MaxDim = TV_MAX_DIM>
struct ShapeBase : public SimpleVector<int, MaxDim> {
TV_HOST_DEVICE_INLINE ShapeBase() : SimpleVector<int, MaxDim>(){};
TV_HOST_DEVICE_INLINE ShapeBase(std::initializer_list<int> shape)
: SimpleVector<int, MaxDim>(shape) {}
template <typename scalar_t, template <class...> class Container>
ShapeBase(Container<scalar_t> shape) : SimpleVector<int, MaxDim>(shape) {}
TV_HOST_DEVICE_INLINE ShapeBase(const ShapeBase<MaxDim> &shape)
: SimpleVector<int, MaxDim>(shape) {}
ShapeBase(const std::vector<int> &arr) : SimpleVector<int, MaxDim>(arr) {}
ShapeBase<MaxDim> &operator=(const ShapeBase<MaxDim> &shape) = default;
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start, int end) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && end < this->mSize && end > start);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < end; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> subshape(int start) const {
#ifdef TV_DEBUG
TV_ASSERT(start >= 0 && start <= this->mSize);
#endif
ShapeBase<MaxDim> shape;
for (int i = start; i < this->mSize; ++i) {
shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE size_t size() const {
if (this->mSize == 0) return 0;
size_t s = 1;
for (int i = 0; i < int(this->mSize); ++i) {
s *= this->mArray[i];
}
return s;
}
TV_HOST_DEVICE_INLINE size_t ndim() const { return this->mSize; }
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze() const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
TV_HOST_DEVICE_INLINE ShapeBase<MaxDim> squeeze(int dim) const {
ShapeBase<MaxDim> shape;
for (int i = 0; i < this->mSize; ++i) {
if (i != dim || this->mArray[i] != 1) shape.push_back(this->mArray[i]);
}
return shape;
}
};
using Shape = ShapeBase<TV_MAX_DIM>;
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#ifdef TV_DEBUG
TV_ASSERT(sizeof...(indexes) == shape.size());
#endif
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(std::vector<int> &shape,
std::vector<int> &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = shape.size() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
Inds... indexes) {
unsigned offset = 0;
unsigned m = 1;
int indexes_vec[sizeof...(indexes)] = {indexes...};
#pragma unroll
for (int i = sizeof...(indexes) - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Shape &shape,
const Shape &indexes_vec) {
unsigned offset = 0;
unsigned m = 1;
for (int i = indexes_vec.ndim() - 1; i >= 0; --i) {
offset += m * indexes_vec[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE unsigned rowArrayIdx(const Index *indexes,
const Index *shape) {
unsigned offset = 0;
unsigned m = 1;
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
offset += m * indexes[i];
m *= shape[i];
}
return offset;
}
template <typename Index, unsigned NDim>
TV_HOST_DEVICE_INLINE Index rowArrayIdxInv(Index index, Index *output,
const Index *shape) {
#pragma unroll
for (int i = NDim - 1; i >= 0; --i) {
output[i] = index % shape[i];
index -= output[i];
index /= shape[i];
}
return index;
}
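// Worked example (editor's note, not part of the original sources): for the
// row-major shape {2, 3, 4}, rowArrayIdx maps the index tuple (1, 2, 3) to the
// flat offset 1*3*4 + 2*4 + 3 = 23, and rowArrayIdxInv(23, out, shape)
// recovers out = {1, 2, 3}.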
template <int N>
struct ArrayIndexRowMajor {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return indexes[N - 1] +
shape[N - 1] * ArrayIndexRowMajor<N - 1>::run(shape, indexes);
}
};
template <>
struct ArrayIndexRowMajor<0> {
TV_HOST_DEVICE_INLINE static unsigned run(const Shape &shape,
const Shape &indexes) {
return 0;
}
};
namespace detail {
template <typename scalar_t>
constexpr const char *simpleTypeName(scalar_t val = scalar_t());
template <>
constexpr const char *simpleTypeName(float val) {
return "float32";
}
template <>
constexpr const char *simpleTypeName(double val) {
return "float64";
}
template <>
constexpr const char *simpleTypeName(int val) {
return "int32";
}
template <>
constexpr const char *simpleTypeName(unsigned val) {
return "uint32";
}
template <>
constexpr const char *simpleTypeName(long val) {
return "int64";
}
template <>
constexpr const char *simpleTypeName(unsigned long val) {
return "uint64";
}
}; // namespace detail
template <typename scalar_t, int Rank = -1>
struct TensorView {
TV_HOST_DEVICE_INLINE TensorView() {}
explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Shape shape)
: mPtr(ptr), mShape(shape) {}
template <class... Integers>
explicit TV_HOST_DEVICE_INLINE TensorView(scalar_t *ptr, Integers... shapes)
: mPtr(ptr) {
mShape = {int(shapes)...};
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
const TensorView<scalar_t, Rank> &tensor) {
TV_REQUIRE(tensor.shape() == shape(), "you must provide same input size%s",
"\n");
scalar_t *ptr = mPtr;
const scalar_t *other_ptr = tensor.data();
for (size_t i = 0; i < size(); ++i) *(ptr++) = *(other_ptr++);
return *this;
}
template <typename T1>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &assign(
std::initializer_list<T1> seq) {
TV_REQUIRE(seq.size() == size(), "you must provide same input size%s",
"\n");
scalar_t *ptr = mPtr;
for (const T1 &s : seq) *(ptr++) = scalar_t(s);
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE scalar_t &operator()(Inds... inds) {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
template <class... Inds>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(Inds... inds) const {
#ifdef TV_DEBUG
int idxes[sizeof...(Inds)]{int(inds)...};
TV_REQUIRE(sizeof...(inds) == mShape.ndim(),
"you provide %d indexes, but dim is %d\n", sizeof...(inds),
mShape.ndim());
for (int i = 0; i < sizeof...(inds); ++i) {
TV_REQUIRE(idxes[i] >= 0 && idxes[i] < mShape[i],
"index-%d(%d) out-of-range: [0, %d)\n", i, idxes[i],
mShape[i]);
}
#endif
return mPtr[rowArrayIdx(mShape, int(inds)...)];
}
TV_HOST_DEVICE_INLINE scalar_t &operator()() {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
TV_HOST_DEVICE_INLINE const scalar_t &operator()() const {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mPtr != nullptr,
"you want get value but the view is empty.%s", "\n");
TV_DEVICE_REQUIRE(mShape.ndim() == 0,
"you provide 0 indexes, but dim is %ld\n", mShape.ndim());
#else
TV_REQUIRE(mPtr != nullptr, "you want get value but the view is empty.%s",
"\n");
TV_REQUIRE(mShape.ndim() == 0, "you provide 0 indexes, but dim is %ld\n",
mShape.ndim());
#endif
#endif
return mPtr[0];
}
template <class T1>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1) {
#if defined TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, i1, mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE scalar_t &operator()(T1 i1, T2 i2, T3 i3, T4 i4) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
template <class T1>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 1,
"you provide 1 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
#else
TV_REQUIRE(mShape.ndim() == 1, "you provide 1 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
#endif
#endif
return mPtr[i1];
}
template <class T1, class T2>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 2,
"you provide 2 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
#else
TV_REQUIRE(mShape.ndim() == 2, "you provide 2 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
#endif
#endif
return mPtr[i1 * mShape[1] + i2];
}
template <class T1, class T2, class T3>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 3,
"you provide 3 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
#else
TV_REQUIRE(mShape.ndim() == 3, "you provide 3 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
#endif
#endif
return mPtr[(i1 * mShape[1] + i2) * mShape[2] + i3];
}
template <class T1, class T2, class T3, class T4>
TV_HOST_DEVICE_INLINE const scalar_t &operator()(T1 i1, T2 i2, T3 i3,
T4 i4) const {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(mShape.ndim() == 4,
"you provide 4 indexes, but dim is %ld\n", mShape.ndim());
TV_DEVICE_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1),
mShape[0]);
TV_DEVICE_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2),
mShape[1]);
TV_DEVICE_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3),
mShape[2]);
TV_DEVICE_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4),
mShape[3]);
#else
TV_REQUIRE(mShape.ndim() == 4, "you provide 4 indexes, but dim is %ld\n",
mShape.ndim());
TV_REQUIRE(i1 >= 0 && i1 < mShape[0],
"index-%d(%d) out-of-range: [0, %d)\n", 0, int(i1), mShape[0]);
TV_REQUIRE(i2 >= 0 && i2 < mShape[1],
"index-%d(%d) out-of-range: [0, %d)\n", 1, int(i2), mShape[1]);
TV_REQUIRE(i3 >= 0 && i3 < mShape[2],
"index-%d(%d) out-of-range: [0, %d)\n", 2, int(i3), mShape[2]);
TV_REQUIRE(i4 >= 0 && i4 < mShape[3],
"index-%d(%d) out-of-range: [0, %d)\n", 3, int(i4), mShape[3]);
#endif
#endif
return mPtr[((i1 * mShape[1] + i2) * mShape[2] + i3) * mShape[3] + i4];
}
TV_HOST_DEVICE_INLINE scalar_t &operator[](int idx) {
#ifdef TV_DEBUG
#if defined(__CUDA_ARCH__)
TV_DEVICE_REQUIRE(idx >= 0 && idx < size(),
"index(%d) out-of-range: [0, %ld)\n", int(idx), size());
#else
TV_REQUIRE(idx >= 0 && idx < size(), "index(%d) out-of-range: [0, %ld)\n",
int(idx), size());
#endif
#endif
return mPtr[idx];
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> operator[](
SimpleVector<Slice> slice_vec) {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE const TensorView<scalar_t, Rank> operator[](
SimpleVector<Slice> slice_vec) const {
return _subview(slice_vec);
}
TV_HOST_DEVICE_INLINE bool empty() const { return mPtr == nullptr; }
TV_HOST_DEVICE_INLINE scalar_t *data() { return mPtr; }
TV_HOST_DEVICE_INLINE const scalar_t *data() const { return mPtr; }
TV_HOST_DEVICE_INLINE const Shape &shape() const { return mShape; }
TV_HOST_DEVICE_INLINE int dim(int idx) const { return mShape[idx]; }
TV_HOST_DEVICE_INLINE int ndim() const { return mShape.ndim(); }
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Inds... newShapes) {
Shape shapes{int(newShapes)...};
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> &reshape(Shape shapes) {
TV_ASSERT(shapes.size() == size());
mShape = shapes;
return *this;
}
template <class... Inds>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(
Inds... newShapes) const {
Shape shapes{int(newShapes)...};
for (size_t i = 0; i < shapes.ndim(); ++i) {
if (shapes[i] == -1) {
shapes[i] = 1;
shapes[i] = size() / shapes.size();
break;
}
}
TV_ASSERT(shapes.size() == size());
return TensorView<scalar_t, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> view(Shape shapes) const {
TV_ASSERT(shapes.size() == size());
return TensorView<scalar_t, Rank>(mPtr, shapes);
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze() const {
return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze());
}
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> squeeze(int dim) const {
return TensorView<scalar_t, Rank>(mPtr, mShape.squeeze(dim));
}
TV_HOST_DEVICE_INLINE size_t size() const { return mShape.size(); }
template <class... Slices>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
Slice slice, Slices... slices) const {
return subview<float, Slice, Slices...>(slice, slices...);
}
template <class T2 = float, class... Slices>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(
Slices... slices) const {
Slice slice_vec[sizeof...(Slices)] = {to_slice(slices)...};
Shape new_shape{to_slice(slices)[0]...};
Shape start{to_slice(slices)[0]...};
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1;
}
}
auto offset = rowArrayIdx(mShape, start);
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
#pragma unroll
for (size_t i = 0; i < sizeof...(Slices); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
#pragma unroll
for (size_t i = sizeof...(Slices); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
}
template <class... Integers>
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> subview(int id,
Integers... ints) {
Shape start = {id, ints...};
for (int i = 1 + sizeof...(ints); i < ndim(); ++i) {
start.push_back(0);
}
return TensorView<scalar_t, Rank>(mPtr + rowArrayIdx(mShape, start),
mShape.subshape(sizeof...(ints) + 1));
}
std::string repr() const {
std::ostringstream ss;
if (empty()) return "";
if (mShape.ndim() == 0) {
ss << *mPtr;
ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
return ss.str();
}
Shape counter = mShape;
auto tensor_flat = this->view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
ss << "[";
}
for (size_t i = 0; i < this->size(); ++i) {
ss << tensor_flat(rowArrayIdx(mShape, counter));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == this->dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != this->size() - 1) ss << ", ";
for (int j = 0; j < inc_count; ++j) {
ss << "]";
}
if (i != this->size() - 1) {
if (inc_count != 0) ss << "\n";
for (int j = 0; j < inc_count; ++j) {
ss << "[";
}
}
}
ss << "]";
ss << "Tensor: dtype=" << detail::simpleTypeName<scalar_t>();
return ss.str();
}
protected:
// TODO: make this function public.
// currently this function is called unexpectedly when using subview({0, 0}).
TV_HOST_DEVICE_INLINE TensorView<scalar_t, Rank> _subview(
SimpleVector<Slice> slice_vec) {
Shape new_shape;
for (int i = 0; i < slice_vec.size(); ++i) {
new_shape.push_back(slice_vec[i][0]);
}
Shape start = new_shape;
TV_ASSERT(new_shape.ndim() <= mShape.ndim());
TV_ASSERT(new_shape.ndim() != 0);
size_t idxsize = new_shape.ndim();
for (size_t i = idxsize; i < mShape.ndim(); ++i) {
new_shape.push_back(0);
start.push_back(0);
}
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
new_shape[i] = slice_vec[i][1] - slice_vec[i][0];
TV_ASSERT(new_shape[i] >= 0);
} else {
new_shape[i] = 1; // reduce dim
}
}
auto offset = rowArrayIdx(mShape, start);
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
new_shape[i] = mShape[i];
TV_ASSERT(new_shape[i] >= 0);
}
Shape reduced_shape;
for (size_t i = 0; i < slice_vec.size(); ++i) {
if (slice_vec[i][1] != -1) {
reduced_shape.push_back(new_shape[i]);
}
}
for (size_t i = slice_vec.size(); i < mShape.ndim(); ++i) {
reduced_shape.push_back(new_shape[i]);
}
return TensorView<scalar_t, Rank>(mPtr + offset, reduced_shape);
}
template <typename T1>
TV_HOST_DEVICE_INLINE Slice to_slice(T1 s) const {
return Slice{int(s), -1, -1};
}
TV_HOST_DEVICE_INLINE Slice to_slice(Slice s) const { return Slice(s); }
scalar_t *mPtr = nullptr;
Shape mShape;
};
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<scalar_t, Rank> &dt) {
os << dt.repr();
return os;
}
template <typename Os, typename scalar_t, int Rank>
Os &operator<<(Os &os, const TensorView<const scalar_t, Rank> &dt) {
os << dt.repr();
return os;
}
namespace detail {
template <typename scalar_t>
constexpr const char *printfTypeFormat(scalar_t val = scalar_t());
template <>
constexpr const char *printfTypeFormat(float val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(double val) {
return "%.2f";
}
template <>
constexpr const char *printfTypeFormat(int val) {
return "%d";
}
template <>
constexpr const char *printfTypeFormat(unsigned val) {
return "%u";
}
template <>
constexpr const char *printfTypeFormat(long val) {
return "%ld";
}
template <>
constexpr const char *printfTypeFormat(unsigned long val) {
return "%lu";
}
}; // namespace detail
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const TensorView<scalar_t> tensor,
const char *format) {
if (tensor.empty()) return;
if (tensor.ndim() == 0) {
printf(format, tensor());
printf("\n");
return;
}
Shape counter = tensor.shape();
auto tensor_flat = tensor.view(-1);
for (int i = 0; i < counter.ndim(); ++i) {
counter[i] = 0;
printf("[");
}
for (size_t i = 0; i < tensor.size(); ++i) {
printf(format, tensor_flat(rowArrayIdx(tensor.shape(), counter)));
counter[counter.ndim() - 1] += 1;
int inc_count = 0;
bool print_comma = true;
for (int c = counter.ndim() - 1; c >= 0; --c) {
if (counter[c] == tensor.dim(c) && c > 0) {
++inc_count;
counter[c - 1] += 1;
counter[c] = 0;
print_comma = false;
}
}
if (print_comma && i != tensor.size() - 1) printf(", ");
for (int j = 0; j < inc_count; ++j) {
printf("]");
}
if (i != tensor.size() - 1) {
if (inc_count != 0) printf("\n");
for (int j = 0; j < inc_count; ++j) {
printf("[");
}
}
}
printf("]\n");
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(TensorView<scalar_t> tensor) {
using Traw = typename std::remove_const<scalar_t>::type;
return printTensorView(tensor, detail::printfTypeFormat<Traw>());
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape) {
using Traw = typename std::remove_const<scalar_t>::type;
return printTensorView(TensorView<const scalar_t>(ptr, shape),
detail::printfTypeFormat<Traw>());
}
template <typename scalar_t>
TV_HOST_DEVICE void printTensorView(const scalar_t *ptr, Shape shape,
const char *format) {
return printTensorView(TensorView<const scalar_t>(ptr, shape), format);
}
} // namespace tv
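// --- Illustrative TensorView sketch (editor's addition, not part of the
// original sources). Wraps a plain host array in a 2x3 row-major view,
// writes through operator(), and prints the contents with printTensorView.
inline void tensorview_demo() {
  float data[6] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
  tv::TensorView<float> view(data, 2, 3);  // no copy, just pointer + shape
  view(1, 2) = 42.f;                       // writes data[1 * 3 + 2]
  tv::printTensorView(view);               // prints the 2x3 contents with %.2f
}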
@@ -10,6 +10,7 @@
#include "reduce_ops.h"
#include "roi_align.h"
#include "roi_align_rotated.h"
#include "rotated_feature_align.h"
#include "soft_nms.h"
const char *c_MMCVOpDomain = "mmcv";
@@ -17,6 +18,7 @@ SoftNmsOp c_SoftNmsOp;
NmsOp c_NmsOp;
MMCVRoiAlignCustomOp c_MMCVRoiAlignCustomOp;
MMCVRoIAlignRotatedCustomOp c_MMCVRoIAlignRotatedCustomOp;
MMCVRotatedFeatureAlignCustomOp c_MMCVRotatedFeatureAlignCustomOp;
GridSampleOp c_GridSampleOp;
MMCVCumMaxCustomOp c_MMCVCumMaxCustomOp;
MMCVCumMinCustomOp c_MMCVCumMinCustomOp;
@@ -77,5 +79,10 @@ OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
return status;
}
if (auto status = ortApi->CustomOpDomain_Add(
domain, &c_MMCVRotatedFeatureAlignCustomOp)) {
return status;
}
return ortApi->AddCustomOpDomain(options, domain);
}
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#include "rotated_feature_align.h"
#include "../ort_mmcv_utils.h"
template <typename T>
T bilinear_interpolate(const T *input, const int height, const int width, T y,
T x, const int index /* index for debug only*/) {
  // deal with cases where the sampled location falls outside the feature map
  // boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = input[int(fma(y_low, width, x_low))];
T v2 = input[int(fma(y_low, width, x_high))];
T v3 = input[int(fma(y_high, width, x_low))];
T v4 = input[int(fma(y_high, width, x_high))];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
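// Worked example (editor's note, not part of the original sources): for a
// sample at (y, x) = (1.25, 2.5), the four neighbours are rows 1/2 and
// columns 2/3, with ly = 0.25, lx = 0.5, hy = 0.75, hx = 0.5, giving weights
// w1 = 0.375, w2 = 0.375, w3 = 0.125, w4 = 0.125 (they always sum to 1).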
template <typename scalar_t>
void rotated_feature_align_forward_cpu_kernel(
const int nthreads, const int points, const scalar_t *bottom_data,
const scalar_t *best_bboxes, const scalar_t spatial_scale,
const int channels, const int height, const int width, scalar_t *top_data) {
for (int index = 0; index < nthreads; index++) {
int w = index % width;
int h = (index / width) % height;
int c = (index / width / height) % channels;
int n = index / width / height / channels;
const scalar_t *bbox_offset =
best_bboxes + ((n * height + h) * width + w) * 5;
scalar_t roi_y = bbox_offset[0] * spatial_scale;
scalar_t roi_x = bbox_offset[1] * spatial_scale;
scalar_t px[5] = {roi_x, 0, 0, 0, 0};
scalar_t py[5] = {roi_y, 0, 0, 0, 0};
if (points > 1) {
scalar_t roi_w = bbox_offset[2] * spatial_scale;
scalar_t roi_h = bbox_offset[3] * spatial_scale;
scalar_t roi_a = bbox_offset[4];
scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
scalar_t wx = cosa * w_2, wy = sina * w_2;
scalar_t hx = -sina * h_2, hy = cosa * h_2;
px[1] = roi_x + wx + hx;
py[1] = roi_y + wy + hy;
px[2] = roi_x - wx + hx;
py[2] = roi_y - wy + hy;
px[3] = roi_x - wx - hx;
py[3] = roi_y - wy - hy;
px[4] = roi_x + wx - hx;
py[4] = roi_y + wy - hy;
}
const scalar_t *offset_bottom_data =
bottom_data + (n * channels + c) * height * width;
scalar_t output_val = bottom_data[index];
for (int i = 0; i < points; i++) {
output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
width, py[i], px[i], i);
}
top_data[index] = output_val;
}
}
void MMCVRotatedFeatureAlignKernel::Compute(OrtKernelContext *context) {
// Setup inputs
const OrtValue *input_features = ort_.KernelContext_GetInput(context, 0);
const float *features_data = reinterpret_cast<const float *>(
ort_.GetTensorData<float>(input_features));
const OrtValue *input_best_rbboxes = ort_.KernelContext_GetInput(context, 1);
const float *best_rbboxes = reinterpret_cast<const float *>(
      ort_.GetTensorData<float>(input_best_rbboxes));
// Setup output
OrtTensorDimensions out_dimensions(ort_, input_features);
int batch_size = out_dimensions.data()[0];
int input_channels = out_dimensions.data()[1];
int input_height = out_dimensions.data()[2];
int input_width = out_dimensions.data()[3];
OrtValue *output = ort_.KernelContext_GetOutput(
context, 0, out_dimensions.data(), out_dimensions.size());
float *out = ort_.GetTensorMutableData<float>(output);
OrtTensorTypeAndShapeInfo *output_info = ort_.GetTensorTypeAndShape(output);
ort_.ReleaseTensorTypeAndShapeInfo(output_info);
// TODO: forward here
int output_size = out_dimensions.data()[0];
  for (size_t i = 1; i < out_dimensions.size(); ++i) {
output_size *= out_dimensions.data()[i];
}
rotated_feature_align_forward_cpu_kernel<float>(
output_size, points_, features_data, best_rbboxes, spatial_scale_,
input_channels, input_height, input_width, out);
}
#ifndef ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
#define ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H
#include <onnxruntime_cxx_api.h>
#include <cmath>
struct MMCVRotatedFeatureAlignKernel {
public:
MMCVRotatedFeatureAlignKernel(Ort::CustomOpApi ort, const OrtKernelInfo* info)
: ort_(ort) {
spatial_scale_ = ort_.KernelInfoGetAttribute<float>(info, "spatial_scale");
points_ = ort_.KernelInfoGetAttribute<int64_t>(info, "points");
}
void Compute(OrtKernelContext* context);
private:
Ort::CustomOpApi ort_;
float spatial_scale_;
int points_;
};
struct MMCVRotatedFeatureAlignCustomOp
: Ort::CustomOpBase<MMCVRotatedFeatureAlignCustomOp,
MMCVRotatedFeatureAlignKernel> {
void* CreateKernel(Ort::CustomOpApi api, const OrtKernelInfo* info) const {
return new MMCVRotatedFeatureAlignKernel(api, info);
}
const char* GetName() const { return "MMCVRotatedFeatureAlign"; }
size_t GetInputTypeCount() const { return 2; }
ONNXTensorElementDataType GetInputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
// force cpu
const char* GetExecutionProviderType() const {
return "CPUExecutionProvider";
}
};
#endif // ONNXRUNTIME_ROTATED_FEATURE_ALIGN_H