"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "0d5aa0f1a06b70276a5d351280623fb2535f4f3c"
Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "max_pool_with_index.h"
#include <cfloat>
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/shared_inc/fast_divmod.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
__global__ void MaxPoolWithIndexKernel(
int64_t batch,
int64_t channels,
int64_t height,
int64_t width,
int64_t depth,
int64_t pooled_height,
int64_t pooled_width,
int64_t pooled_depth,
int64_t kernel_h,
int64_t kernel_w,
int64_t kernel_d,
int64_t stride_h,
int64_t stride_w,
int64_t stride_d,
int64_t pad_h,
int64_t pad_w,
int64_t pad_d,
int64_t dilation_h,
int64_t dilation_w,
int64_t dilation_d,
fast_divmod fdm_c,
fast_divmod fdm_h,
fast_divmod fdm_w,
fast_divmod fdm_d,
int64_t storage_order,
const T* p_input,
int64_t output_size,
T* p_output,
int64_t* p_indices) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id >= output_size) return;
int d_index, w_index, h_index, c_index, n_index, id_tmp;
fdm_d.divmod(id, id_tmp, d_index);
fdm_w.divmod(id_tmp, id_tmp, w_index);
fdm_h.divmod(id_tmp, id_tmp, h_index);
fdm_c.divmod(id_tmp, n_index, c_index);
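// The flat output index is decomposed innermost-first: fdm_d peels off the pooled
// depth coordinate, fdm_w the width, fdm_h the height, and fdm_c splits the rest
// into batch (n_index) and channel (c_index).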
int64_t d_start = d_index * stride_d - pad_d;
int64_t w_start = w_index * stride_w - pad_w;
int64_t h_start = h_index * stride_h - pad_h;
int64_t d_end = _Min<int64_t>(d_start + (kernel_d - 1) * dilation_d + 1, depth);
int64_t w_end = _Min<int64_t>(w_start + (kernel_w - 1) * dilation_w + 1, width);
int64_t h_end = _Min<int64_t>(h_start + (kernel_h - 1) * dilation_h + 1, height);
d_start = _Max<int64_t>(d_start, 0);
w_start = _Max<int64_t>(w_start, 0);
h_start = _Max<int64_t>(h_start, 0);
int64_t d_index_max = -1;
int64_t w_index_max = -1;
int64_t h_index_max = -1;
int64_t offset = (n_index * channels + c_index) * height * width * depth;
const T* p_slice = p_input + offset;
T maxval = p_slice[h_start * width * depth + w_start * depth + d_start] - (T)1;
for (int64_t d = d_start; d < d_end; d += dilation_d) {
for (int64_t w = w_start; w < w_end; w += dilation_w) {
for (int64_t h = h_start; h < h_end; h += dilation_h) {
if (p_slice[h * width * depth + w * depth + d] > maxval) {
h_index_max = h;
w_index_max = w;
d_index_max = d;
maxval = static_cast<float>(p_slice[h * width * depth + w * depth + d]);
}
}
}
}
p_output[id] = p_input[offset + h_index_max * width * depth + w_index_max * depth + d_index_max];
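// storage_order follows the ONNX MaxPool attribute: 0 returns a row-major
// flattening of the selected (h, w, d) position, 1 a column-major one.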
if (p_indices) {
p_indices[id] = storage_order == 0 ? offset + h_index_max * width * depth + w_index_max * depth + d_index_max
: offset + h_index_max + w_index_max * height + d_index_max * width * height;
}
}
template <typename T>
void MaxPoolWithIndex(
hipStream_t stream,
const TensorShape& input_shape,
const TensorShape& output_shape,
const gsl::span<const int64_t>& kernel_shape,
const gsl::span<const int64_t>& stride_shape,
const gsl::span<const int64_t>& pads,
const gsl::span<const int64_t>& dilations,
int64_t storage_order,
const T* p_input,
T* p_output,
int64_t* p_indices) {
int64_t batchs = input_shape[0];
int64_t channels = input_shape[1];
int64_t height = input_shape[2];
int64_t width = kernel_shape.size() > 1 ? input_shape[3] : 1;
int64_t depth = kernel_shape.size() > 2 ? input_shape[4] : 1;
int64_t pooled_height = output_shape[2];
int64_t pooled_width = kernel_shape.size() > 1 ? output_shape[3] : 1;
int64_t pooled_depth = kernel_shape.size() > 2 ? output_shape[4] : 1;
int64_t kernel_h = kernel_shape[0];
int64_t kernel_w = kernel_shape.size() > 1 ? kernel_shape[1] : 1;
int64_t kernel_d = kernel_shape.size() > 2 ? kernel_shape[2] : 1;
int64_t stride_h = stride_shape[0];
int64_t stride_w = stride_shape.size() > 1 ? stride_shape[1] : 1;
int64_t stride_d = stride_shape.size() > 2 ? stride_shape[2] : 1;
// pads is in the format [x1_begin, x2_begin, ..., x1_end, x2_end, ...],
// where xi_begin is the number of pixels added at the beginning of axis i
// and xi_end is the number of pixels added at the end of axis i.
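// Example: with 3-D pads = {1, 2, 3, 1, 2, 3}, pad_h = 1, pad_w = 2, pad_d = 3;
// only the begin pads are read below, since output_shape already accounts for the end pads.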
int64_t pad_h = pads[0];
int64_t pad_w = pads.size() >= 4 ? pads[1] : 0;
int64_t pad_d = pads.size() == 6 ? pads[2] : 0;
int64_t dilation_h = dilations[0];
int64_t dilation_w = dilations.size() >= 2 ? dilations[1] : 1;
int64_t dilation_d = dilations.size() == 3 ? dilations[2] : 1;
int64_t output_size = output_shape.Size();
fast_divmod fdm_c(static_cast<int>(channels));
fast_divmod fdm_h(static_cast<int>(pooled_height));
fast_divmod fdm_w(static_cast<int>(pooled_width));
fast_divmod fdm_d(static_cast<int>(pooled_depth));
int blocksPerGrid = (int)((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
hipLaunchKernelGGL(MaxPoolWithIndexKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
batchs,
channels,
height,
width,
depth,
pooled_height,
pooled_width,
pooled_depth,
kernel_h,
kernel_w,
kernel_d,
stride_h,
stride_w,
stride_d,
pad_h,
pad_w,
pad_d,
dilation_h,
dilation_w,
dilation_d,
fdm_c,
fdm_h,
fdm_w,
fdm_d,
storage_order,
p_input,
output_size,
p_output,
p_indices);
}
#define INSTANTIATEMAXPOOLWITHINDEX(T) \
template void MaxPoolWithIndex<T>( \
hipStream_t stream, \
const TensorShape& input_shape, \
const TensorShape& output_shape, \
const gsl::span<const int64_t>& kernel_shape, \
const gsl::span<const int64_t>& stride_shape, \
const gsl::span<const int64_t>& pads, \
const gsl::span<const int64_t>& dilations, \
int64_t storage_order, \
const T* p_input, \
T* p_output, \
int64_t* p_indices);
INSTANTIATEMAXPOOLWITHINDEX(float)
INSTANTIATEMAXPOOLWITHINDEX(double)
INSTANTIATEMAXPOOLWITHINDEX(half)
INSTANTIATEMAXPOOLWITHINDEX(int8_t)
INSTANTIATEMAXPOOLWITHINDEX(uint8_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <vector>
#include "core/framework/tensor_shape.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void MaxPoolWithIndex(
hipStream_t stream,
const TensorShape& input_shape,
const TensorShape& output_shape,
const gsl::span<const int64_t>& kernel_shape,
const gsl::span<const int64_t>& stride_shape,
const gsl::span<const int64_t>& pads,
const gsl::span<const int64_t>& dilations,
int64_t storage_order,
const T* p_input,
T* p_output,
int64_t* p_indices);
} //namespace rocm
} //namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/nn/pool.h"
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/rocm/nn/max_pool_with_index.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
#define POOLING_KERNEL(op_name, data_type, pool_type, since_version) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()), \
Pool<data_type, pool_type>);
#define POOLING_KERNEL_VERSIONED(op_name, data_type, pool_type, since_version, end_version) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
end_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()), \
Pool<data_type, pool_type>);
#define POOLING_KERNEL_WITH_INDICES(op_name, data_type, pool_type, since_version) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()) \
.TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \
Pool<data_type, pool_type>);
#define POOLING_KERNEL_VERSIONED_WITH_INDICES(op_name, data_type, pool_type, since_version, end_version) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
op_name, \
kOnnxDomain, \
since_version, \
end_version, \
data_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()) \
.TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \
Pool<data_type, pool_type>);
POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9)
POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10)
POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10)
POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10)
// AveragePool and MaxPool opset 11 only updated the spec documentation on the default values for dilations and strides.
POOLING_KERNEL(AveragePool, float, AveragePool, 11)
POOLING_KERNEL(AveragePool, double, AveragePool, 11)
POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11)
POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1)
POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1)
POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1)
POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 8, 9)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 10, 10)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, double, MaxPool<8>, 11, 11)
POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11)
POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, double, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, int8_t, MaxPool<8>, 12)
POOLING_KERNEL_WITH_INDICES(MaxPool, uint8_t, MaxPool<8>, 12)
POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1)
POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1)
POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1)
class MiopenPoolingDescriptor final {
public:
MiopenPoolingDescriptor() : desc_(nullptr) {
}
~MiopenPoolingDescriptor() {
if (desc_ != nullptr) {
miopenDestroyPoolingDescriptor(desc_);
desc_ = nullptr;
}
}
MiopenPoolingDescriptor(const MiopenPoolingDescriptor&) = delete;
MiopenPoolingDescriptor& operator=(const MiopenPoolingDescriptor&) = delete;
Status Set(miopenPoolingMode_t mode,
const gsl::span<const int64_t>& kernel_shape,
const gsl::span<const int64_t>& pads,
const gsl::span<const int64_t>& strides) {
if (!desc_)
MIOPEN_RETURN_IF_ERROR(miopenCreatePoolingDescriptor(&desc_));
int rank = gsl::narrow_cast<int>(kernel_shape.size());
InlinedVector<int> window(rank);
InlinedVector<int> padding(rank);
InlinedVector<int> stride(rank);
for (int i = 0; i < rank; i++) {
window[i] = gsl::narrow_cast<int>(kernel_shape[i]);
}
for (int i = 0; i < rank; i++) {
padding[i] = gsl::narrow_cast<int>(pads[i]);
}
for (int i = 0; i < rank; i++) {
stride[i] = gsl::narrow_cast<int>(strides[i]);
}
MIOPEN_RETURN_IF_ERROR(SetPoolingNdDescriptorHelper(
desc_,
mode,
MIOPEN_PROPAGATE_NAN,
rank,
window.data(),
padding.data(),
stride.data()));
return Status::OK();
}
operator miopenPoolingDescriptor_t() const { return desc_; }
private:
miopenPoolingDescriptor_t desc_;
};
template <typename T, typename PoolType>
Status Pool<T, PoolType>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = context->Input<Tensor>(0);
const TensorShape& x_shape = X->Shape();
const auto x_dims = x_shape.GetDims();
if (x_shape.NumDimensions() < 3) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input dimension cannot be less than 3.");
}
auto kernel_shape = pool_attrs_.kernel_shape;
auto pads = pool_attrs_.pads;
auto strides = pool_attrs_.strides;
if (pool_attrs_.global_pooling) {
kernel_shape.assign(x_dims.begin() + 2, x_dims.end());
pads.assign(kernel_shape.size(), 0);
strides.assign(kernel_shape.size(), 1);
}
auto y_dims = pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
TensorShape y_shape(y_dims);
Tensor* Y = context->Output(0, y_shape);
// special case when there is a dim value of 0 in the shape.
if (y_shape.Size() == 0)
return Status::OK();
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
TensorShapeVector x_dims_miopen(x_dims.begin(), x_dims.end());
TensorShapeVector y_dims_miopen(y_dims);
if (kernel_shape.size() < 2) {
// miopen only takes 4D or 5D input, so pad dimensions if needed
x_dims_miopen.push_back(1);
y_dims_miopen.push_back(1);
pads.insert(pads.begin() + kernel_shape.size(), 0);
pads.insert(pads.end(), 0);
kernel_shape.push_back(1);
strides.push_back(1);
}
miopenPoolingMode_t mode = miopenPoolingMax;
if constexpr (PoolType::type == onnxruntime::PoolType::kAveragePool) {
mode = pool_attrs_.count_include_pad ? miopenPoolingAverageInclusive
: miopenPoolingAverage;
}
MiopenPoolingDescriptor pooling_desc;
ORT_RETURN_IF_ERROR(pooling_desc.Set(mode, kernel_shape, pads, strides));
if constexpr (std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value) {
// Cast to float back and forth using temp buffer
const auto alpha = Consts<float>::One;
const auto beta = Consts<float>::Zero;
MiopenTensor x_tensor;
MiopenTensor y_tensor;
ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_miopen, MiopenTensor::GetDataType<float>()));
ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_miopen, MiopenTensor::GetDataType<float>()));
const auto input_count = x_shape.Size();
const auto output_count = y_shape.Size();
IAllocatorUniquePtr<float> temp_X = GetScratchBuffer<float>(input_count);
auto temp_Y = GetScratchBuffer<float>(output_count);
Impl_Cast<HipT, float>(Stream(), reinterpret_cast<const HipT*>(x_data), temp_X.get(), input_count);
MIOPEN_RETURN_IF_ERROR(PoolingForwardHelper(MiopenHandle(), pooling_desc, &alpha,
x_tensor, temp_X.get(), &beta, y_tensor, temp_Y.get()));
Impl_Cast<float, HipT>(Stream(), temp_Y.get(), y_data, output_count);
} else {
const auto alpha = Consts<HipT>::One;
const auto beta = Consts<HipT>::Zero;
MiopenTensor x_tensor;
MiopenTensor y_tensor;
ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_miopen, MiopenTensor::GetDataType<HipT>()));
ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_miopen, MiopenTensor::GetDataType<HipT>()));
MIOPEN_RETURN_IF_ERROR(PoolingForwardHelper(MiopenHandle(), pooling_desc, &alpha,
x_tensor, x_data, &beta, y_tensor, y_data));
}
return Status::OK();
}
template <typename T>
Status Pool<T, MaxPool<8>>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = context->Input<Tensor>(0);
const TensorShape& x_shape = X->Shape();
const auto& x_dims = x_shape.GetDims();
if (x_shape.NumDimensions() < 3) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input dimension cannot be less than 3.");
}
auto kernel_shape = this->pool_attrs_.kernel_shape;
auto pads = this->pool_attrs_.pads;
auto strides = this->pool_attrs_.strides;
if (this->pool_attrs_.global_pooling) {
kernel_shape.assign(x_dims.begin() + 2, x_dims.end());
pads.assign(kernel_shape.size(), 0);
strides.assign(kernel_shape.size(), 1);
}
auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, x_shape[1], &pads);
Tensor* Y = context->Output(0, TensorShape(y_dims));
// special case when there is a dim value of 0 in the shape.
if (Y->Shape().Size() == 0)
return Status::OK();
auto x_data = reinterpret_cast<const HipT*>(X->Data<T>());
auto y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
Tensor* I = context->Output(1, TensorShape(y_dims));
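// Use the custom MaxPoolWithIndex kernel when the optional Indices output is requested
// or when dilations are non-default; otherwise fall back to the MIOpen pooling path.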
if (nullptr != I || !this->pool_attrs_.default_dilations) {
auto i_data = nullptr == I ? nullptr : I->MutableData<int64_t>();
MaxPoolWithIndex<HipT>(
this->Stream(),
x_shape,
TensorShape(y_dims),
kernel_shape,
strides,
pads,
this->pool_attrs_.dilations,
this->pool_attrs_.storage_order,
x_data,
y_data,
i_data);
} else {
ORT_RETURN_IF_ERROR((Pool<T, MaxPool<1>>::ComputeInternal(context)));
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/miopen_common.h"
#include "core/providers/cpu/nn/pool_base.h"
namespace onnxruntime {
namespace rocm {
template <typename T, typename PoolType>
class Pool : public RocmKernel, public PoolBase {
public:
Pool(const OpKernelInfo& info) : RocmKernel(info), PoolBase(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
template <typename T>
class Pool<T, MaxPool<8>> final : public Pool<T, MaxPool<1>> {
public:
Pool(const OpKernelInfo& info) : Pool<T, MaxPool<1>>(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "shrink.h"
#include "shrink_impl.h"
#include "core/providers/common.h"
using namespace std;
namespace onnxruntime {
namespace rocm {
#define SHRINK_REGISTER_KERNEL(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Shrink, \
kOnnxDomain, \
9, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.MayInplace(0, 0) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Shrink<T>);
template <typename T>
Status Shrink<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) const {
typedef typename ToHipType<T>::MappedType HipT;
const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
const auto* x_data = reinterpret_cast<const HipT*>(X->Data<T>());
const TensorShape& x_shape = X->Shape();
const size_t x_size = static_cast<size_t>(x_shape.Size());
Tensor* Y = p_op_kernel_context->Output(0, x_shape);
auto* y_data = reinterpret_cast<HipT*>(Y->MutableData<T>());
ShrinkImpl<HipT>(Stream(), x_data, bias_, lambd_, y_data, x_size);
return Status::OK();
}
SHRINK_REGISTER_KERNEL(float)
SHRINK_REGISTER_KERNEL(double)
SHRINK_REGISTER_KERNEL(MLFloat16)
SHRINK_REGISTER_KERNEL(uint8_t)
SHRINK_REGISTER_KERNEL(int8_t)
SHRINK_REGISTER_KERNEL(uint16_t)
SHRINK_REGISTER_KERNEL(int16_t)
SHRINK_REGISTER_KERNEL(uint32_t)
SHRINK_REGISTER_KERNEL(int32_t)
SHRINK_REGISTER_KERNEL(uint64_t)
SHRINK_REGISTER_KERNEL(int64_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class Shrink final : public RocmKernel {
public:
Shrink(const OpKernelInfo& info) : RocmKernel(info) {
float bias_temp;
// if the attribute exists, use the value
if (info.GetAttr<float>("bias", &bias_temp).IsOK())
bias_ = bias_temp;
float lambd_temp;
// if the attribute exists, use the value
if (info.GetAttr<float>("lambd", &lambd_temp).IsOK())
lambd_ = lambd_temp;
}
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const;
private:
float bias_ = 0.0f; // default as per spec
float lambd_ = 0.5f; // default as per spec
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "shrink_impl.h"
namespace onnxruntime {
namespace rocm {
// Generic implementation of Shrink
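// y = x + bias if x < -lambd; y = x - bias if x > lambd; y = 0 otherwise.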
template <typename T>
__global__ void _ShrinkKernel(
const T* input_data,
const float bias,
const float lambda,
T* output_data,
const HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
T x = input_data[id];
if (x < -lambda) {
output_data[id] = (T)(x + bias);
} else if (x > lambda) {
output_data[id] = (T)(x - bias);
} else {
output_data[id] = (T)0;
}
}
// Specialized implementation for the 'half' type:
// convert the 'half' data to 'float' first,
// do the operation, and convert the result back to 'half'.
template <>
__global__ void _ShrinkKernel(
const half* input_data,
const float bias,
const float lambda,
half* output_data,
const HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
half x = input_data[id];
if ((float)x < -lambda) {
output_data[id] = half((float)x + bias);
} else if ((float)x > lambda) {
output_data[id] = half((float)x - bias);
} else {
output_data[id] = (half)0;
}
}
template <typename T>
void ShrinkImpl(
hipStream_t stream,
const T* input_data,
const float bias,
const float lambda,
T* output_data,
size_t N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ShrinkKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_data, bias, lambda, output_data, (HIP_LONG)N);
}
#define SPECIALIZED_IMPL(T) \
template void ShrinkImpl<T>(hipStream_t stream, const T* input_data, const float bias, const float lambda, T* output_data, size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(uint8_t)
SPECIALIZED_IMPL(int8_t)
SPECIALIZED_IMPL(uint16_t)
SPECIALIZED_IMPL(int16_t)
SPECIALIZED_IMPL(uint32_t)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(uint64_t)
SPECIALIZED_IMPL(int64_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
namespace rocm {
template <typename T>
void ShrinkImpl(
hipStream_t stream,
const T* input_data,
const float bias,
const float lambda,
T* output_data,
size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef ENABLE_NVTX_PROFILE
#include "nvtx_profile.h"
#include "core/common/common.h"
#include <nvToolsExt.h>
#include <nvToolsExtCuda.h>
namespace onnxruntime {
namespace profile {
void NvtxRangeCreator::BeginImpl() {
// enable only for debug builds because this function is for profiling only.
nvtxEventAttributes_t eventAttrib;
eventAttrib.version = NVTX_VERSION;
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
eventAttrib.colorType = NVTX_COLOR_ARGB;
eventAttrib.color = static_cast<uint32_t>(color_);
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
eventAttrib.message.ascii = message_.c_str();
range_id_ = nvtxRangeStartEx(&eventAttrib);
}
void NvtxRangeCreator::EndImpl() {
// enable only for debug builds because this function is for profiling only.
nvtxRangeEnd(range_id_);
}
void NvtxNestedRangeCreator::BeginImpl() {
// enable only for debug builds because this function is for profiling only.
nvtxEventAttributes_t eventAttrib;
eventAttrib.version = NVTX_VERSION;
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
eventAttrib.colorType = NVTX_COLOR_ARGB;
eventAttrib.color = static_cast<uint32_t>(color_);
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
eventAttrib.message.ascii = message_.c_str();
nvtxRangePushEx(&eventAttrib);
}
void NvtxNestedRangeCreator::EndImpl() {
// enable only for debug builds because this function is for profiling only.
nvtxRangePop();
}
void NvtxMarkerCreator::Mark() {
// enable only for debug builds because this function is for profiling only.
nvtxEventAttributes_t eventAttrib;
eventAttrib.version = NVTX_VERSION;
eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
eventAttrib.colorType = NVTX_COLOR_ARGB;
eventAttrib.color = static_cast<uint32_t>(color_);
eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
eventAttrib.message.ascii = message_.c_str();
nvtxMarkEx(&eventAttrib);
}
} // namespace profile
} // namespace onnxruntime
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// The enclosed classes are wrappers over NVIDIA's visual profiler (NVTX) APIs.
// They can be used to plot the time intervals of forward and backward passes.
// They can also be used to plot the time span of a specific operator.
// At the time of writing, NVIDIA only supports this tool on Linux.
#ifdef ENABLE_NVTX_PROFILE
#pragma once
#include <cinttypes>
#include <cstdlib>
#include <iostream>
#include <stdexcept>
#include <string>
#include "core/common/common.h"
namespace onnxruntime {
namespace profile {
// Color in ARGB space.
// A: first 8 bits.
// R: next 8 bits.
// G: next 8 bits.
// B: last 8 bits.
// All color channels have range [0, 255].
enum class Color : uint32_t {
Black = 0x00000000,
Red = 0x00ff0000,
DarkGreen = 0x00009900,
Green = 0x0000ff00,
LightGreen = 0x00ccffcc,
Blue = 0x000000ff,
Amber = 0x00ffbf00,
LightAmber = 0x00fff2cc,
White = 0x00ffffff,
Cyan = 0x0000ffff,
Magenta = 0x00ff00ff,
Yellow = 0x00ffff00,
};
class RangeCreatorBase {
public:
RangeCreatorBase(const std::string message, const Color color)
: message_(message), color_(color),
is_begin_called_(false), is_end_called_(false) {};
// Check that Begin and End were both called.
// A range is pointless if either of them is missing.
~RangeCreatorBase() {
if (!is_begin_called_) {
std::cerr << "Begin must be called once." << std::endl;
}
if (!is_end_called_) {
std::cerr << "End must be called once." << std::endl;
}
}
// Mark the beginning of a range.
void Begin() {
ORT_ENFORCE(!is_begin_called_, "Begin cannot be called more than once.");
ORT_ENFORCE(!is_end_called_, "Begin cannot be called after calling End.");
BeginImpl();
is_begin_called_ = true;
}
// Mark the end of a range.
void End() {
ORT_ENFORCE(is_begin_called_, "End must be called after calling Begin.");
ORT_ENFORCE(!is_end_called_, "End cannot be called more than once.");
EndImpl();
is_end_called_ = true;
}
bool IsBeginCalled() const {
return is_begin_called_;
}
bool IsEndCalled() const {
return is_end_called_;
}
virtual void BeginImpl() = 0;
virtual void EndImpl() = 0;
protected:
// Text on this event.
const std::string message_;
// Color of event in ARGB space.
const Color color_;
bool is_begin_called_;
bool is_end_called_;
};
class NvtxRangeCreator final : public RangeCreatorBase {
public:
NvtxRangeCreator(const std::string message, const Color color)
: RangeCreatorBase(message, color) {};
void BeginImpl() override;
void EndImpl() override;
private:
// It records the event ID created by BeginImpl.
// EndImpl needs this value to end the right event.
uint64_t range_id_;
};
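// Minimal usage sketch, assuming ENABLE_NVTX_PROFILE is defined and this header is included:
//
//   profile::NvtxRangeCreator range("forward pass", profile::Color::Green);
//   range.Begin();
//   // ... work to be profiled ...
//   range.End();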
class NvtxNestedRangeCreator final : public RangeCreatorBase {
public:
NvtxNestedRangeCreator(const std::string message, const Color color)
: RangeCreatorBase(message, color) {};
void BeginImpl() override;
void EndImpl() override;
};
class NvtxMarkerCreator final {
public:
NvtxMarkerCreator(const std::string message, const Color color)
: message_(message), color_(color) {};
void Mark();
private:
// Text on this marker.
const std::string message_;
// See nvtxRangeCreator.color_.
const Color color_;
};
} // namespace profile
} // namespace onnxruntime
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <thread>
#include <string>
#include <unordered_map>
#include "core/platform/ort_mutex.h"
#ifdef ENABLE_NVTX_PROFILE
namespace onnxruntime {
namespace profile {
// Singleton class managing global NVTX profiling information.
class Context {
public:
static Context& GetInstance() {
static Context instance_;
return instance_;
}
// Return tag for the specified thread.
// If the thread's tag doesn't exist, this function returns an empty string.
std::string GetThreadTagOrDefault(const std::thread::id& thread_id) {
const std::lock_guard<OrtMutex> lock(mtx_);
return thread_tag_[thread_id];
}
// Set tag for the specified thread.
void SetThreadTag(
const std::thread::id& thread_id, const std::string& tag) {
const std::lock_guard<OrtMutex> lock(mtx_);
thread_tag_[thread_id] = tag;
}
private:
Context() = default;
~Context() = default;
Context(const Context&) = delete;
Context& operator=(const Context&) = delete;
// map from thread's id to its human-readable tag.
std::unordered_map<std::thread::id, std::string> thread_tag_;
OrtMutex mtx_;
};
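// Usage sketch, assuming this header is included: tag the current thread so that
// NVTX ranges created on it can be grouped under a human-readable name
// ("worker-0" below is just an illustrative tag):
//
//   profile::Context::GetInstance().SetThreadTag(std::this_thread::get_id(), "worker-0");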
} // namespace profile
} // namespace onnxruntime
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "non_max_suppression.h"
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
#include "non_max_suppression_impl.h"
#include "core/providers/rocm/tensor/concat_impl.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
NonMaxSuppression,
kOnnxDomain,
10, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 2)
.InputMemoryType(OrtMemTypeCPUInput, 3)
.InputMemoryType(OrtMemTypeCPUInput, 4),
NonMaxSuppression);
ONNX_OPERATOR_KERNEL_EX(
NonMaxSuppression,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 2)
.InputMemoryType(OrtMemTypeCPUInput, 3)
.InputMemoryType(OrtMemTypeCPUInput, 4),
NonMaxSuppression);
Status NonMaxSuppression::ComputeInternal(OpKernelContext* ctx) const {
PrepareContext pc;
ORT_RETURN_IF_ERROR(PrepareCompute(ctx, pc));
int64_t max_output_boxes_per_class = 0;
float iou_threshold = .0f;
float score_threshold = .0f;
ORT_RETURN_IF_ERROR(GetThresholdsFromInputs(pc, max_output_boxes_per_class, iou_threshold, score_threshold));
if (0 == pc.num_boxes_ || 0 == max_output_boxes_per_class) {
ctx->Output(0, {0, 3});
return Status::OK();
}
// TODO: use hipcub::DeviceSegmentedRadixSort::SortPairsDescending instead of hipcub::DeviceRadixSort::SortPairsDescending
// to handle multiple batches/classes in parallel
std::vector<std::tuple<IAllocatorUniquePtr<void>, int>> all_selected_indices;
int total_num_saved_outputs = 0;
// safe downcast max_output_boxes_per_class to int as hipcub::DeviceSelect::Flagged() does not support int64_t
int int_max_output_boxes_per_class = max_output_boxes_per_class > std::numeric_limits<int>::max()
? std::numeric_limits<int>::max()
: static_cast<int>(max_output_boxes_per_class);
for (int64_t batch_index = 0; batch_index < pc.num_batches_; ++batch_index) {
for (int64_t class_index = 0; class_index < pc.num_classes_; ++class_index) {
IAllocatorUniquePtr<void> d_selected_indices{};
IAllocatorUniquePtr<void> h_number_selected_ptr{AllocateBufferOnCPUPinned<void>(sizeof(int))};
auto* h_number_selected = static_cast<int*>(h_number_selected_ptr.get());
ORT_RETURN_IF_ERROR(NonMaxSuppressionImpl(
Stream(),
[this](size_t bytes) { return GetScratchBuffer<void>(bytes); },
pc,
GetCenterPointBox(),
batch_index,
class_index,
int_max_output_boxes_per_class,
iou_threshold,
score_threshold,
d_selected_indices,
h_number_selected));
int num_saved_outputs = *h_number_selected;
if (num_saved_outputs > 0) {
all_selected_indices.emplace_back(std::move(d_selected_indices), num_saved_outputs);
total_num_saved_outputs += num_saved_outputs;
}
}
}
if (total_num_saved_outputs == 0) {
ctx->Output(0, {0, 3});
} else {
// concatenate outputs
constexpr int last_dim = 3;
const int num_elements = last_dim * total_num_saved_outputs;
Tensor* output = ctx->Output(0, {static_cast<int64_t>(total_num_saved_outputs), last_dim});
ORT_ENFORCE(output != nullptr);
int64_t* dst = output->MutableData<int64_t>();
size_t count = all_selected_indices.size();
RocmAsyncBuffer<const void*> input_ptr(this, count);
RocmAsyncBuffer<int64_t> concat_sizes_gpu(this, count);
RocmAsyncBuffer<int64_t> concat_sizes_range_gpu(this, count);
RocmAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, total_num_saved_outputs);
int index = 0;
for (size_t i = 0; i < count; i++) {
auto& it = all_selected_indices[i];
auto src = std::get<0>(it).get();
auto size = std::get<1>(it);
input_ptr.CpuPtr()[i] = src;
concat_sizes_gpu.CpuPtr()[i] = size;
concat_sizes_range_gpu.CpuPtr()[i] = (i == 0) ? size : size + concat_sizes_range_gpu.CpuPtr()[i - 1];
for (int j = 0; j < size; j++) {
axis_dimension_input_output_mapping_gpu.CpuPtr()[index++] = i;
}
}
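// At this point input_ptr and concat_sizes_gpu describe each (batch, class) segment,
// concat_sizes_range_gpu holds the inclusive prefix sums of the selected counts, and
// axis_dimension_input_output_mapping_gpu maps every output row to its source segment.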
ORT_RETURN_IF_ERROR(concat_sizes_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(axis_dimension_input_output_mapping_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(concat_sizes_range_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(input_ptr.CopyToGpu());
ORT_RETURN_IF_ERROR(ConcatImpl(Stream(),
sizeof(int64_t),
num_elements,
last_dim,
concat_sizes_gpu.GpuPtr(),
concat_sizes_range_gpu.GpuPtr(),
axis_dimension_input_output_mapping_gpu.GpuPtr(),
dst,
input_ptr.GpuPtr(),
static_cast<size_t>(num_elements)));
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/object_detection/non_max_suppression.h"
namespace onnxruntime {
namespace rocm {
struct NonMaxSuppression final : public RocmKernel, public NonMaxSuppressionBase {
explicit NonMaxSuppression(const OpKernelInfo& info) : RocmKernel(info), NonMaxSuppressionBase(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(NonMaxSuppression);
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
/* Modifications Copyright (c) Microsoft. */
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include "non_max_suppression_impl.h"
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
#include <hipcub/hipcub.hpp>
//TODO:fix the warnings
#ifdef _MSC_VER
#pragma warning(disable : 4244)
#endif
namespace onnxruntime {
namespace rocm {
using namespace nms_helpers;
namespace {
struct __align__(16) Box {
float x1, y1, x2, y2;
};
// This is the width of the bitmask for masking boxes for each thread.
// This needs to be a power of 2 (a POD bit-width, usually) so that division and
// modulo can be implemented as bit operations during host selection.
constexpr int kNmsBoxesPerThread = 8 * sizeof(int);
// Helper to calculate the modulo mask and shift bits.
// For kNmsBoxesPerThread=32 the modulo mask will be 31, i.e. 0x1F, so
// i % 32 == i & 31. Similarly the shift will be 5 so that
// i / 32 == i >> 5. Using these bit operations should reduce stalls on the host
// thread.
__device__ constexpr int NumBits(int n) { return (n == 0) ? 0 : NumBits(n >> 1) + 1; }
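// For example, NumBits(31) == 5 and NumBits(32) == 6, so for a 32-bit mask word
// CheckBit below uses kShiftLen = 5 and kRemainderMask = 31:
// bit / 32 == bit >> 5 and bit % 32 == bit & 31.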
constexpr int kNmsBlockDim = 16;
constexpr int kNmsBlockDimMax = 128;
// Check whether two boxes have an IoU greater than threshold.
template <typename T>
__device__ inline bool OverThreshold(const Box* a, const Box* b,
const float a_area,
const T iou_threshold) {
const float b_area = (b->x2 - b->x1) * (b->y2 - b->y1);
if (a_area == 0.0f || b_area == 0.0f) return false;
const float xx1 = fmaxf(a->x1, b->x1);
const float yy1 = fmaxf(a->y1, b->y1);
const float xx2 = fminf(a->x2, b->x2);
const float yy2 = fminf(a->y2, b->y2);
// fdimf computes the positive difference between xx2 and xx1.
const float w = fdimf(xx2, xx1);
const float h = fdimf(yy2, yy1);
const float intersection = w * h;
// Testing aa/bb > t is equivalent to aa > bb*t (bb != 0),
// avoiding a division.
const float aa = intersection;
const float bb = a_area + b_area - intersection;
const float bt = bb * iou_threshold;
return aa >= bt;
}
template <typename T>
__device__ inline bool CheckBit(T* bit_mask, int bit) {
constexpr int kShiftLen = NumBits(8 * sizeof(T)) - 1;
constexpr int kRemainderMask = 8 * sizeof(T) - 1;
int bin = bit >> kShiftLen;
return (bit_mask[bin] >> (bit & kRemainderMask)) & 1;
}
// Produce a global bitmask (result_mask) of selected boxes from the bitmask
// generated by NMSKernel. Abort early if max_boxes boxes are selected. The bitmask
// is num_boxes*bit_mask_len bits indicating whether to keep or remove a box.
__global__ void NMSReduce(const int* bitmask, const int bit_mask_len,
const int num_boxes, const int max_boxes,
char* result_mask) {
extern __shared__ int local[];
// set global mask to accept all boxes
for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < bit_mask_len; box += blockDim.x * gridDim.x) {
local[box] = 0xFFFFFFFF;
}
__syncthreads();
int accepted_boxes = 0;
for (int box = 0; box < num_boxes - 1; ++box) {
// if current box is masked by an earlier box, skip it.
if (!CheckBit(local, box)) {
continue;
}
accepted_boxes += 1;
int offset = box * bit_mask_len;
// update global mask with current box's mask
for (int b = blockIdx.x * blockDim.x + threadIdx.x; b < bit_mask_len; b += blockDim.x * gridDim.x) {
local[b] &= ~bitmask[offset + b];
}
__syncthreads();
if (accepted_boxes > max_boxes) break;
}
// copy the global mask to the result_mask char array. A char array is needed for
// hipcub::DeviceSelect later.
for (int box = blockIdx.x * blockDim.x + threadIdx.x; box < num_boxes; box += blockDim.x * gridDim.x) {
result_mask[box] = CheckBit(local, box);
}
}
// For each box, compute a bitmask of boxes which have an overlap with the given box
// above the threshold.
//
// Starting from the highest-scoring box, mark any box which has IoU>threshold with
// the given box. Each thread processes kNmsBoxesPerThread boxes per stride, and
// each box has a bitmask of overlaps of length bit_mask_len.
//
__launch_bounds__(kNmsBlockDim* kNmsBlockDim, 4) __global__
void NMSKernel(
const int64_t center_point_box,
const Box* d_desc_sorted_boxes,
const int num_boxes,
const float iou_threshold,
const int bit_mask_len,
int* d_delete_mask) {
for (int i_block_offset = blockIdx.x * blockDim.x; i_block_offset < num_boxes;
i_block_offset += blockDim.x * gridDim.x) {
const int i = i_block_offset + threadIdx.x;
if (i < num_boxes) {
for (int j_thread_offset =
kNmsBoxesPerThread * (blockIdx.y * blockDim.y + threadIdx.y);
j_thread_offset < num_boxes;
j_thread_offset += kNmsBoxesPerThread * blockDim.y * gridDim.y) {
// Note : We can do everything using multiplication,
// and use fp16 - we are comparing against a low precision
// threshold.
int above_threshold = 0;
// Make sure that threads are within valid domain.
bool valid = false;
// Loop over the next kNmsBoxesPerThread boxes and set corresponding bit
// if it is overlapping with current box
for (int ib = 0; ib < kNmsBoxesPerThread; ++ib) {
// This thread will compare Box i and Box j.
const int j = j_thread_offset + ib;
if (i >= j || i >= num_boxes || j >= num_boxes) continue;
valid = true;
if (SuppressByIOU(reinterpret_cast<const float*>(d_desc_sorted_boxes),
i, j, center_point_box, iou_threshold)) {
// we have score[j] <= score[i].
above_threshold |= (1U << ib);
}
}
if (valid) {
d_delete_mask[i * bit_mask_len + j_thread_offset / kNmsBoxesPerThread] =
above_threshold;
}
}
}
}
}
// Variadic template helpers for index-selecting from multiple arrays at the
// same time.
template <typename Index>
__device__ inline void SelectHelper(const Index /*i_selected */,
const Index /* i_original */) {}
template <typename Index, typename T, typename... Args>
__device__ inline void SelectHelper(const Index i_selected,
const Index i_original,
const T* original, T* selected,
Args... args) {
selected[i_selected] = original[i_original];
SelectHelper(i_selected, i_original, args...);
}
// Helper template to select elements from the original arrays using the index
// mapping and store them into the selected arrays. Each array sharing the same
// mapping needs to be passed as a pair of pointers to the original and selected
// arrays. For selecting 2 arrays the call would be
// IndexMultiSelect(num_elements, indices, original1, selected1, original2,
// selected2).
template <typename Index, typename T, typename... Args>
__global__ void IndexMultiSelect(const int num_elements, const Index* indices,
const T* original, T* selected, Args... args) {
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
SelectHelper(idx, indices[idx], original, selected, args...);
}
}
template <typename T>
__global__ void SetZero(const int count, T* __restrict__ ptr) {
// Check that the grid is one dimensional and index doesn't overflow.
assert(blockDim.y == 1);
assert(blockDim.z == 1);
assert(blockDim.x * gridDim.x / blockDim.x == gridDim.x);
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) {
ptr[i] = T(0);
}
}
template <typename T>
__global__ void Iota(const int num_elements, const T offset, T* to_fill) {
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
to_fill[idx] = static_cast<T>(idx) + offset;
}
}
__global__ void NormalizeOutput(const int num_elements, const int* original, int64_t* to_normalize, int64_t batch_index, int64_t class_index) {
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num_elements; idx += blockDim.x * gridDim.x) {
to_normalize[idx * 3] = batch_index;
to_normalize[idx * 3 + 1] = class_index;
to_normalize[idx * 3 + 2] = static_cast<int64_t>(original[idx]);
}
}
Status NmsGpu(hipStream_t stream,
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
const int64_t center_point_box,
const float* d_sorted_boxes_float_ptr,
const int num_boxes,
const float iou_threshold,
int* d_selected_indices,
int* h_nkeep,
const int max_boxes) {
// Make sure we respect the __align__(16)
// we promised to the compiler.
auto iptr = reinterpret_cast<std::uintptr_t>(d_sorted_boxes_float_ptr);
ORT_ENFORCE((iptr & 15) == 0);
const int bit_mask_len =
(num_boxes + kNmsBoxesPerThread - 1) / kNmsBoxesPerThread;
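// e.g., num_boxes = 100 with kNmsBoxesPerThread = 32 gives bit_mask_len = 4,
// i.e. four ints (128 bits) of overlap mask per box.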
int max_nms_mask_size = num_boxes * bit_mask_len;
IAllocatorUniquePtr<void> d_nms_mask_ptr{allocator(max_nms_mask_size * sizeof(int))};
auto* d_nms_mask = static_cast<int*>(d_nms_mask_ptr.get());
int blocksPerGrid = (int)(ceil(static_cast<float>(max_nms_mask_size) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(SetZero<int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, max_nms_mask_size, d_nms_mask);
int* d_delete_mask = d_nms_mask;
int* h_selected_count = h_nkeep;
const Box* d_sorted_boxes =
reinterpret_cast<const Box*>(d_sorted_boxes_float_ptr);
dim3 block_dim, thread_block;
int num_blocks = (num_boxes + kNmsBlockDim - 1) / kNmsBlockDim;
num_blocks = std::max(std::min(num_blocks, kNmsBlockDimMax), 1);
block_dim.x = num_blocks;
block_dim.y = num_blocks;
block_dim.z = 1;
thread_block.x = kNmsBlockDim;
thread_block.y = kNmsBlockDim;
thread_block.z = 1;
hipLaunchKernelGGL(NMSKernel, block_dim, thread_block, 0, stream, center_point_box,
d_sorted_boxes,
num_boxes,
iou_threshold,
bit_mask_len,
d_delete_mask);
IAllocatorUniquePtr<void> d_selected_boxes_ptr{allocator(num_boxes * sizeof(char))};
auto* d_selected_boxes = static_cast<char*>(d_selected_boxes_ptr.get());
IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_indices = static_cast<int*>(d_indices_ptr.get());
blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(Iota<int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_boxes, 0, d_indices);
NMSReduce<<<1, 1024, bit_mask_len * sizeof(int), stream>>>(d_delete_mask, bit_mask_len, num_boxes, max_boxes, d_selected_boxes);
size_t flagged_buffer_size = 0;
HIP_RETURN_IF_ERROR(hipcub::DeviceSelect::Flagged(static_cast<void*>(nullptr), // temp_storage
flagged_buffer_size,
static_cast<int*>(nullptr), // input
static_cast<char*>(nullptr), // selection flag
static_cast<int*>(nullptr), // selected items
static_cast<int*>(nullptr), // num_selected
num_boxes,
stream));
IAllocatorUniquePtr<void> d_cub_scratch_buffer_ptr{allocator(flagged_buffer_size)};
auto* d_cub_scratch_buffer = static_cast<uint8_t*>(d_cub_scratch_buffer_ptr.get());
IAllocatorUniquePtr<void> d_num_selected_ptr{allocator(sizeof(int))};
auto* d_num_selected = static_cast<int*>(d_num_selected_ptr.get());
HIP_RETURN_IF_ERROR(hipcub::DeviceSelect::Flagged(
d_cub_scratch_buffer, // temp_storage
flagged_buffer_size,
d_indices, // input
d_selected_boxes, // selection flag
d_selected_indices, // selected items
d_num_selected, num_boxes, stream));
HIP_RETURN_IF_ERROR(hipMemcpyAsync(h_selected_count, d_num_selected, sizeof(int), hipMemcpyDeviceToHost, stream));
// hipStreamSynchronize is needed since the value of h_selected_count will be used by host after this function.
HIP_RETURN_IF_ERROR(hipStreamSynchronize(stream));
return Status::OK();
}
struct DeviceGreaterThan {
float threshold_;
__host__ __device__ __forceinline__ DeviceGreaterThan(float threshold)
: threshold_(threshold) {}
__host__ __device__ __forceinline__ bool operator()(const float& val) const {
return (val > threshold_);
}
};
} // namespace
Status NonMaxSuppressionImpl(
hipStream_t stream,
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
const PrepareContext& pc,
const int64_t center_point_box,
int64_t batch_index,
int64_t class_index,
int max_output_boxes_per_class,
float iou_threshold,
float score_threshold,
IAllocatorUniquePtr<void>& selected_indices,
int* h_number_selected) {
// STEP 1. Prepare data
int num_boxes = pc.num_boxes_;
const float* boxes_data = pc.boxes_data_ + batch_index * num_boxes * 4;
const float* scores_data = pc.scores_data_ + (batch_index * pc.num_classes_ + class_index) * num_boxes;
// prepare temporary memory for sorting the scores
// calculate the temporary storage size needed for sorting
size_t cub_sort_temp_storage_bytes = 0;
HIP_RETURN_IF_ERROR(hipcub::DeviceRadixSort::SortPairsDescending(
nullptr, cub_sort_temp_storage_bytes,
static_cast<float*>(nullptr), // scores
static_cast<float*>(nullptr), // sorted scores
static_cast<int*>(nullptr), // input indices
static_cast<int*>(nullptr), // sorted indices
num_boxes, // num items
0, 8 * sizeof(float), // sort all bits
stream));
// allocate temporary memory
IAllocatorUniquePtr<void> d_cub_sort_buffer_ptr{allocator(cub_sort_temp_storage_bytes)};
auto* d_cub_sort_buffer = static_cast<uint8_t*>(d_cub_sort_buffer_ptr.get());
IAllocatorUniquePtr<void> d_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_indices = static_cast<int*>(d_indices_ptr.get());
IAllocatorUniquePtr<void> d_sorted_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_sorted_indices = static_cast<int*>(d_sorted_indices_ptr.get());
IAllocatorUniquePtr<void> d_selected_indices_ptr{allocator(num_boxes * sizeof(int))};
auto* d_selected_indices = static_cast<int*>(d_selected_indices_ptr.get());
IAllocatorUniquePtr<void> d_sorted_scores_ptr{allocator(num_boxes * sizeof(float))};
auto* d_sorted_scores = static_cast<float*>(d_sorted_scores_ptr.get());
IAllocatorUniquePtr<void> d_sorted_boxes_ptr{allocator(num_boxes * 4 * sizeof(float))};
auto* d_sorted_boxes = static_cast<float*>(d_sorted_boxes_ptr.get());
// create a sequence of indices
int blocksPerGrid = (int)(ceil(static_cast<float>(num_boxes) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(Iota<int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_boxes, 0, d_indices);
HIP_RETURN_IF_ERROR(hipGetLastError());
// sort scores
HIP_RETURN_IF_ERROR(hipcub::DeviceRadixSort::SortPairsDescending(
d_cub_sort_buffer,
cub_sort_temp_storage_bytes,
scores_data,
d_sorted_scores,
d_indices,
d_sorted_indices,
num_boxes,
0,
8 * sizeof(float), // sort all bits
stream));
// gather the boxes in sorted-score order
const Box* original_boxes = reinterpret_cast<const Box*>(boxes_data);
Box* sorted_boxes = reinterpret_cast<Box*>(d_sorted_boxes);
hipLaunchKernelGGL(HIP_KERNEL_NAME(IndexMultiSelect<int, Box>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_boxes, d_sorted_indices, original_boxes, sorted_boxes);
HIP_RETURN_IF_ERROR(hipGetLastError());
// STEP 2. filter boxes by scores
int limited_num_boxes = num_boxes;
if (pc.score_threshold_ != nullptr) {
thrust::device_ptr<float> sorted_scores_device_ptr(d_sorted_scores);
limited_num_boxes = thrust::count_if(
thrust::hip::par.on(stream),
sorted_scores_device_ptr,
sorted_scores_device_ptr + num_boxes,
DeviceGreaterThan(score_threshold));
HIP_RETURN_IF_ERROR(hipGetLastError());
if (limited_num_boxes == 0) {
*h_number_selected = 0;
return Status::OK();
}
}
// STEP 3. launch NMS kernels
ORT_RETURN_IF_ERROR(NmsGpu(stream,
allocator,
center_point_box,
d_sorted_boxes,
limited_num_boxes,
iou_threshold,
d_selected_indices,
h_number_selected,
max_output_boxes_per_class));
HIP_RETURN_IF_ERROR(hipGetLastError());
// STEP 4. map back to sorted indices
*h_number_selected = std::min(*h_number_selected, max_output_boxes_per_class);
int num_to_keep = *h_number_selected;
if (num_to_keep > 0) {
IAllocatorUniquePtr<void> d_output_indices_ptr{allocator(num_to_keep * sizeof(int))};
auto* d_output_indices = static_cast<int*>(d_output_indices_ptr.get());
IAllocatorUniquePtr<void> d_normalized_output_indices_ptr{allocator(num_to_keep * 3 * sizeof(int64_t))};
auto* d_normalized_output_indices = static_cast<int64_t*>(d_normalized_output_indices_ptr.get());
blocksPerGrid = (int)(ceil(static_cast<float>(num_to_keep) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(IndexMultiSelect<int, int>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_to_keep, d_selected_indices, d_sorted_indices, d_output_indices);
hipLaunchKernelGGL(NormalizeOutput, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, num_to_keep, d_output_indices, d_normalized_output_indices, batch_index, class_index);
HIP_RETURN_IF_ERROR(hipGetLastError());
selected_indices = std::move(d_normalized_output_indices_ptr);
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include <functional>
#include "core/providers/cpu/object_detection/non_max_suppression_helper.h"
namespace onnxruntime {
namespace rocm {
Status NonMaxSuppressionImpl(
hipStream_t stream,
std::function<IAllocatorUniquePtr<void>(size_t)> allocator,
const PrepareContext& pc,
const int64_t center_point_box,
int64_t batch_index,
int64_t class_index,
int max_output_boxes_per_class,
float iou_threshold,
float score_threshold,
IAllocatorUniquePtr<void>& selected_indices,
int* h_number_selected);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "roialign.h"
#include "roialign_impl.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
RoiAlign, \
kOnnxDomain, \
10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<int64_t>()), \
RoiAlign<T>);
template <typename T>
Status RoiAlign<T>::ComputeInternal(OpKernelContext* context) const {
// X
const auto* X_ptr = context->Input<Tensor>(0);
// rois
const auto* rois_ptr = context->Input<Tensor>(1);
// batch indices
const auto* batch_indices_ptr = context->Input<Tensor>(2);
const auto& x_dims = X_ptr->Shape();
const auto& rois_dims = rois_ptr->Shape();
const auto& batch_indices_dims = batch_indices_ptr->Shape();
auto num_rois = batch_indices_dims[0];
auto num_roi_cols = rois_dims[1];
auto status = CheckROIAlignValidInput(X_ptr, rois_ptr, batch_indices_ptr);
if (status != Status::OK()) {
return status;
}
Tensor& Y = *context->Output(0, {num_rois, x_dims[1], this->output_height_, this->output_width_});
int64_t output_size = Y.Shape().Size();
if (output_size > 0) {
RoiAlignImpl(
Stream(),
output_size, // num threads
reinterpret_cast<const typename ToHipType<T>::MappedType*>(X_ptr->Data<T>()),
ToHipType<T>::FromFloat(this->spatial_scale_),
x_dims[1], // num channels
x_dims[2], // height
x_dims[3], // width
this->output_height_,
this->output_width_,
this->sampling_ratio_,
reinterpret_cast<const typename ToHipType<T>::MappedType*>(rois_ptr->Data<T>()),
num_roi_cols,
reinterpret_cast<typename ToHipType<T>::MappedType*>(Y.MutableData<T>()),
this->mode_ == RoiAlignMode::avg,
this->half_pixel_,
batch_indices_ptr->Data<int64_t>());
}
return Status::OK();
}
#define SPECIALIZED_COMPUTE(T) \
REGISTER_KERNEL_TYPED(T) \
template Status RoiAlign<T>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
//SPECIALIZED_COMPUTE(MLFloat16)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/object_detection/roialign.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
struct RoiAlign final : RocmKernel, RoiAlignBase {
RoiAlign(const OpKernelInfo& info) : RocmKernel(info), RoiAlignBase(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(RoiAlign);
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/* Modifications Copyright (c) Microsoft. */
#include "roialign_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {
template <typename T>
__device__ T bilinear_interpolate(
const T* bottom_data,
const int height,
const int width,
T y,
T x,
const bool is_mode_avg,
const int index /* index for debug only*/) {
// deal with cases where the sampling point falls outside the feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
return 0;
}
if (y <= 0) {
y = 0;
}
if (x <= 0) {
x = 0;
}
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = bottom_data[y_low * width + x_low];
T v2 = bottom_data[y_low * width + x_high];
T v3 = bottom_data[y_high * width + x_low];
T v4 = bottom_data[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = is_mode_avg
? (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) // mode Avg
: max(max(max(w1 * v1, w2 * v2), w3 * v3), w4 * v4); // mode Max
return val;
}
template <typename T>
__global__ void RoIAlignForward(
const int64_t nthreads,
const T* bottom_data,
const T spatial_scale,
const int64_t channels,
const int64_t height,
const int64_t width,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t sampling_ratio,
const T* bottom_rois,
int64_t roi_cols,
T* top_data,
const bool is_mode_avg,
const bool half_pixel,
const int64_t* batch_indices_ptr) {
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
// RoI could have 4 or 5 columns
const T* offset_bottom_rois = bottom_rois + n * roi_cols;
const auto roi_batch_ind = batch_indices_ptr[n];
// Do not use rounding; this implementation detail is critical
T roi_offset = half_pixel ? T(0.5) : T(0);
T roi_start_w = offset_bottom_rois[0] * spatial_scale - roi_offset;
T roi_start_h = offset_bottom_rois[1] * spatial_scale - roi_offset;
T roi_end_w = offset_bottom_rois[2] * spatial_scale - roi_offset;
T roi_end_h = offset_bottom_rois[3] * spatial_scale - roi_offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!half_pixel) { // backward compatibility
// Force malformed ROIs to be 1x1
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_bottom_data =
bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) * height * width);
// We use roi_bin_grid to sample the grid and mimic integral pooling
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: _Ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : _Ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
T output_val = 0.;
bool max_flag = false;
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = bilinear_interpolate(
offset_bottom_data, height, width, y, x, is_mode_avg, index);
if (is_mode_avg) {
output_val += val;
} else {
if (!max_flag) {
output_val = val;
max_flag = true;
} else {
output_val = max(output_val, val);
}
}
}
}
if (is_mode_avg) {
output_val /= count;
}
top_data[index] = output_val;
}
}
template <typename T>
void RoiAlignImpl(
hipStream_t stream,
const int64_t nthreads,
const T* bottom_data,
const T spatial_scale,
const int64_t channels,
const int64_t height,
const int64_t width,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t sampling_ratio,
const T* bottom_rois,
int64_t roi_cols,
T* top_data,
const bool is_mode_avg,
const bool half_pixel,
const int64_t* batch_indices_ptr) {
int blocksPerGrid = (int)(ceil(static_cast<float>(nthreads) / GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(RoIAlignForward<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
nthreads,
bottom_data,
spatial_scale,
channels,
height,
width,
pooled_height,
pooled_width,
sampling_ratio,
bottom_rois,
roi_cols,
top_data,
is_mode_avg,
half_pixel,
batch_indices_ptr);
}
#define SPECIALIZED_IMPL(T) \
template void RoiAlignImpl<T>( \
hipStream_t stream, \
const int64_t nthreads, \
const T* bottom_data, \
const T spatial_scale, \
const int64_t channels, \
const int64_t height, \
const int64_t width, \
const int64_t pooled_height, \
const int64_t pooled_width, \
const int64_t sampling_ratio, \
const T* bottom_rois, \
int64_t roi_cols, \
T* top_data, \
const bool is_mode_avg, \
const bool half_pixel, \
const int64_t* batch_indices_ptr);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
} // namespace rocm
} // namespace onnxruntime
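For reference, here is a minimal CPU sketch of the sampling-grid arithmetic that RoIAlignForward performs per output element, restricted to average mode with the half_pixel coordinate transform. The helper names cpu_bilinear and cpu_roi_align_avg are illustrative only and not part of this provider, and the guard on the sample count is an addition in this sketch.

// ---- illustrative CPU reference sketch (not part of this provider) ---------
#include <algorithm>
#include <cmath>

// Mirrors the boundary handling of the device bilinear_interpolate above.
static float cpu_bilinear(const float* data, int height, int width, float y, float x) {
  if (y < -1.0f || y > height || x < -1.0f || x > width) return 0.f;
  if (y <= 0.f) y = 0.f;
  if (x <= 0.f) x = 0.f;
  int y_low = static_cast<int>(y);
  int x_low = static_cast<int>(x);
  int y_high, x_high;
  if (y_low >= height - 1) {
    y_high = y_low = height - 1;
    y = static_cast<float>(y_low);
  } else {
    y_high = y_low + 1;
  }
  if (x_low >= width - 1) {
    x_high = x_low = width - 1;
    x = static_cast<float>(x_low);
  } else {
    x_high = x_low + 1;
  }
  const float ly = y - y_low, lx = x - x_low;
  const float hy = 1.f - ly, hx = 1.f - lx;
  return hy * hx * data[y_low * width + x_low] + hy * lx * data[y_low * width + x_high] +
         ly * hx * data[y_high * width + x_low] + ly * lx * data[y_high * width + x_high];
}

// Average-mode RoIAlign for one channel of one ROI, following the same
// half_pixel sampling grid as RoIAlignForward. `roi` holds [x1, y1, x2, y2].
static void cpu_roi_align_avg(const float* channel, int height, int width,
                              float spatial_scale, const float roi[4],
                              int pooled_h, int pooled_w, int sampling_ratio,
                              float* out) {
  const float start_w = roi[0] * spatial_scale - 0.5f;
  const float start_h = roi[1] * spatial_scale - 0.5f;
  const float roi_w = roi[2] * spatial_scale - 0.5f - start_w;
  const float roi_h = roi[3] * spatial_scale - 0.5f - start_h;
  const float bin_h = roi_h / pooled_h;
  const float bin_w = roi_w / pooled_w;
  const int grid_h = sampling_ratio > 0 ? sampling_ratio : static_cast<int>(std::ceil(roi_h / pooled_h));
  const int grid_w = sampling_ratio > 0 ? sampling_ratio : static_cast<int>(std::ceil(roi_w / pooled_w));
  const float count = std::max(grid_h * grid_w, 1);  // guard degenerate ROIs in this sketch
  for (int ph = 0; ph < pooled_h; ++ph) {
    for (int pw = 0; pw < pooled_w; ++pw) {
      float acc = 0.f;
      for (int iy = 0; iy < grid_h; ++iy) {
        const float y = start_h + ph * bin_h + (iy + 0.5f) * bin_h / grid_h;
        for (int ix = 0; ix < grid_w; ++ix) {
          const float x = start_w + pw * bin_w + (ix + 0.5f) * bin_w / grid_w;
          acc += cpu_bilinear(channel, height, width, y, x);
        }
      }
      out[ph * pooled_w + pw] = acc / count;
    }
  }
}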
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void RoiAlignImpl(
hipStream_t stream,
const int64_t nthreads,
const T* bottom_data,
const T spatial_scale,
const int64_t channels,
const int64_t height,
const int64_t width,
const int64_t pooled_height,
const int64_t pooled_width,
const int64_t sampling_ratio,
const T* bottom_rois,
int64_t roi_cols,
T* top_data,
const bool is_mode_avg,
const bool half_pixel,
const int64_t* batch_indices_ptr);
} // namespace rocm
} // namespace onnxruntime
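As a usage illustration only, a host-side caller could drive the declaration above roughly as follows. The buffer names, sizes, and the assumption that this header and the HIP runtime are on the include path are hypothetical, and error checking is omitted; within the provider this entry point is normally reached through RoiAlign<T>::ComputeInternal rather than called directly.

// ---- hypothetical host-side usage sketch (error handling omitted) ----------
#include <hip/hip_runtime.h>
#include <cstdint>
#include "roialign_impl.h"  // assumed to be on the include path

void run_roi_align_example(hipStream_t stream) {
  const int64_t num_rois = 2, channels = 3, height = 16, width = 16;
  const int64_t pooled_h = 4, pooled_w = 4, sampling_ratio = 2, roi_cols = 4;
  const int64_t output_size = num_rois * channels * pooled_h * pooled_w;

  float* d_feat = nullptr;     // single input image, NCHW with N == 1
  float* d_rois = nullptr;     // num_rois x roi_cols, [x1, y1, x2, y2] per row
  float* d_out = nullptr;      // num_rois x channels x pooled_h x pooled_w
  int64_t* d_batch = nullptr;  // batch index per ROI (all 0 here)
  hipMalloc(&d_feat, sizeof(float) * channels * height * width);
  hipMalloc(&d_rois, sizeof(float) * num_rois * roi_cols);
  hipMalloc(&d_out, sizeof(float) * output_size);
  hipMalloc(&d_batch, sizeof(int64_t) * num_rois);
  // ... copy the feature map, ROI corners (input-image coordinates) and batch indices to the device ...

  onnxruntime::rocm::RoiAlignImpl<float>(
      stream, output_size, d_feat, /*spatial_scale=*/1.0f, channels, height, width,
      pooled_h, pooled_w, sampling_ratio, d_rois, roi_cols, d_out,
      /*is_mode_avg=*/true, /*half_pixel=*/true, d_batch);
  hipStreamSynchronize(stream);

  // ... copy d_out back to the host and consume it ...
  hipFree(d_feat); hipFree(d_rois); hipFree(d_out); hipFree(d_batch);
}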
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/reduction/reduction_functions.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>
#include "core/common/optional.h"
#include "core/framework/tensor_shape.h"
namespace onnxruntime {
namespace rocm {
namespace {
// gets the min and max of a single contiguous range of axes, if available
optional<std::pair<int64_t, int64_t>> GetMinAndMaxContiguousAxes(
int64_t rank,
const std::vector<int64_t>& dims,
const std::vector<int64_t>& original_axes) {
assert(rank == static_cast<int64_t>(dims.size()));
// empty axes means reduce all dimensions
if (original_axes.empty()) {
return std::make_pair(int64_t{0}, rank - 1);
}
// normalize axis values and sort
const std::vector<int64_t> axes = [&original_axes, rank]() {
std::vector<int64_t> result(original_axes);
std::for_each(
result.begin(), result.end(),
[rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); });
std::sort(result.begin(), result.end());
return result;
}();
assert(!axes.empty());
const auto is_dim_one = [](int64_t dim) { return dim == 1; };
for (auto a = axes.begin(), b = axes.begin() + 1;
b != axes.end();
++a, ++b) {
ORT_ENFORCE(*a != *b, "axes must not contain duplicate values");
// if axis values are adjacent, the axes are contiguous
if (*a + 1 == *b) {
continue;
}
// if all dimension values between adjacent axes are 1,
// treat the axes as contiguous
if (std::all_of(dims.begin() + *a + 1, dims.begin() + *b, is_dim_one)) {
continue;
}
// otherwise, not contiguous
return nullopt;
}
// expand axes over surrounding dimensions with value of 1
const int64_t min_axis = [&dims, &axes, &is_dim_one]() -> int64_t {
const auto& min_given_axis = axes.front();
// note that std::reverse_iterator(it) refers to the element at (it-1)
// it -> reverse it: element offset of -1
const auto before_min_given_axis_rit =
std::make_reverse_iterator(dims.begin() + min_given_axis);
const auto before_min_axis_rit =
std::find_if_not(before_min_given_axis_rit, dims.rend(), is_dim_one);
// reverse it -> it: element offset of +1
return std::distance(dims.begin(), before_min_axis_rit.base());
}();
const int64_t max_axis = [&dims, &axes, &is_dim_one]() {
const auto& max_given_axis = axes.back();
const auto after_max_given_axis_it = dims.begin() + max_given_axis + 1;
const auto after_max_axis_it =
std::find_if_not(after_max_given_axis_it, dims.end(), is_dim_one);
return std::distance(dims.begin(), after_max_axis_it - 1);
}();
return std::make_pair(min_axis, max_axis);
}
} // namespace
ApplicableMatrixReduction get_applicable_matrix_reduction(
const miopenReduceTensorOp_t miopen_reduce_op,
gsl::span<const int64_t> dims, gsl::span<const int64_t> original_axes,
int& m_out, int& n_out) {
if (miopen_reduce_op != MIOPEN_REDUCE_TENSOR_ADD && miopen_reduce_op != MIOPEN_REDUCE_TENSOR_AVG) {
return ApplicableMatrixReduction::None;
}
// Remove all dims with value 1. This helps optimize cases like:
// dims=[2,3,1,4,1,5] and axes=[0,2,4], which is the same as dims=[2,3,4,5] and axes=[0].
std::vector<int64_t> new_dims;
std::vector<int64_t> new_axes;
const auto original_rank = gsl::narrow<int64_t>(dims.size());
std::set<int64_t> original_axes_set;
for (const auto axis : original_axes) {
original_axes_set.insert(HandleNegativeAxis(axis, original_rank));
}
int64_t new_axis = 0;
for (size_t i = 0; i < dims.size(); i++) {
if (dims[i] != 1) {
new_dims.emplace_back(dims[i]);
if (original_axes_set.find(gsl::narrow<int64_t>(i)) != original_axes_set.end()) {
new_axes.emplace_back(new_axis);
}
new_axis++;
}
}
// Empty axes means reduce all dimensions, which has a different meaning,
// so add a new dim to the end if all original axes fall on dims with value 1.
if (!original_axes.empty() && new_axes.empty()) {
new_dims.emplace_back(1);
new_axes.emplace_back(new_axis);
}
// If all dims have value 1, make sure new_dims is not empty by adding a new dim.
if (!dims.empty() && new_dims.empty()) {
new_dims.emplace_back(1);
}
const auto rank = gsl::narrow<int64_t>(new_dims.size());
const auto min_and_max_axes = GetMinAndMaxContiguousAxes(rank, new_dims, new_axes);
if (!min_and_max_axes.has_value()) {
return ApplicableMatrixReduction::None;
}
const auto& min_axis = min_and_max_axes->first;
const auto& max_axis = min_and_max_axes->second;
// Axes anchored at the beginning mean a row reduction; axes anchored at the end mean a column reduction.
// If the axes span from beginning to end, either works, and we do a row reduction.
const bool axes_from_beginning = min_axis == 0;
const bool axes_to_end = max_axis == rank - 1;
// only axes anchored to the beginning or the end are handled
if (!axes_from_beginning && !axes_to_end) {
return ApplicableMatrixReduction::None;
}
// the axis index right after the last axis flattened into the matrix rows
const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis;
const auto shape = TensorShape::FromExistingBuffer(new_dims);
const auto m = shape.SizeToDimension(m_end_axis);
const auto n = shape.SizeFromDimension(m_end_axis);
ORT_ENFORCE(m > 0 && n > 0, "shape must not have negative dimensions: ", shape);
if (m > std::numeric_limits<int>::max() ||
n > std::numeric_limits<int>::max()) {
return ApplicableMatrixReduction::None;
}
m_out = gsl::narrow_cast<int>(m);
n_out = gsl::narrow_cast<int>(n);
return axes_from_beginning
? ApplicableMatrixReduction::Rows
: ApplicableMatrixReduction::Columns;
}
} // namespace rocm
} // namespace onnxruntime
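To make the row/column mapping above concrete, the following standalone sketch reproduces the m/n computation for axes that are already non-negative, sorted, unique, and contiguous; the normalization and dim-1 squeezing performed above are assumed to have happened. MatrixReductionKind and classify_reduction are illustrative names, not part of this provider.

// ---- illustrative standalone sketch (not part of this provider) ------------
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

enum class MatrixReductionKind { None, Rows, Columns };

// Assumes `axes` is non-empty, normalized, sorted, unique, and contiguous.
static MatrixReductionKind classify_reduction(const std::vector<int64_t>& dims,
                                              const std::vector<int64_t>& axes,
                                              int64_t& m, int64_t& n) {
  const int64_t rank = static_cast<int64_t>(dims.size());
  const bool from_beginning = axes.front() == 0;
  const bool to_end = axes.back() == rank - 1;
  if (!from_beginning && !to_end) return MatrixReductionKind::None;
  // The axis index right after the block flattened into matrix rows.
  const int64_t m_end_axis = from_beginning ? axes.back() + 1 : axes.front();
  m = std::accumulate(dims.begin(), dims.begin() + m_end_axis, int64_t{1}, std::multiplies<int64_t>());
  n = std::accumulate(dims.begin() + m_end_axis, dims.end(), int64_t{1}, std::multiplies<int64_t>());
  return from_beginning ? MatrixReductionKind::Rows : MatrixReductionKind::Columns;
}

// Example: dims = {2, 3, 4, 5}
//   axes = {0, 1} -> Rows reduction with m = 6, n = 20
//   axes = {2, 3} -> Columns reduction with m = 6, n = 20
//   axes = {1, 2} -> None (not anchored to either end)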