Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
Pipeline #492 failed in 0 seconds
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "quantize_linear.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <class T, class U>
Status CudaQuantizeLinear(hipStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element);
template <class T, class U>
Status CudaDequantizeLinear(hipStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element);
} // namespace rocm
} // namespace onnxruntime
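// Note: a minimal reference for what these entry points compute, per the ONNX spec
// (stated here for orientation, not asserted by this header):
//   QuantizeLinear:   y = saturate(round(x / scale) + zero_point)
//   DequantizeLinear: x = (y - zero_point) * scale
// where T is the quantized type (e.g. int8_t/uint8_t), U is the real type (e.g. float),
// and saturation is to the representable range of T.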
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <class T, class U = float>
class QuantizeLinear final : public RocmKernel {
public:
QuantizeLinear(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
};
template <class T, class U = float>
class DequantizeLinear final : public RocmKernel {
public:
DequantizeLinear(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "reshape.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_KERNEL_EX(
Reshape,
kOnnxDomain,
14,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
.Alias(0, 0)
.InputMemoryType(OrtMemTypeCPUInput, 1),
Reshape);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Reshape,
kOnnxDomain,
13, 13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
.Alias(0, 0)
.InputMemoryType(OrtMemTypeCPUInput, 1),
Reshape);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Reshape,
kOnnxDomain,
5, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
.Alias(0, 0)
.InputMemoryType(OrtMemTypeCPUInput, 1),
Reshape);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Reshape,
kOnnxDomain,
1,
4,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Reshape_1);
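// Registration notes (summarizing the kernel defs above): .Alias(0, 0) allows the output to
// reuse the input buffer, so Reshape is normally a metadata-only operation, and
// .InputMemoryType(OrtMemTypeCPUInput, 1) keeps the 'shape' input in CPU memory so the kernel
// can read it directly without a device-to-host copy.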
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/reshape_helper.h"
namespace onnxruntime {
namespace rocm {
class Reshape final : public RocmKernel {
public:
Reshape(const OpKernelInfo& info) : RocmKernel(info),
allow_zero_(info.GetAttrOrDefault("allowzero", static_cast<int64_t>(0)) == 1) {
}
Status ComputeInternal(OpKernelContext* context) const override {
// Copy the second input tensor into the shape vector
const Tensor* shapeTensor = context->Input<Tensor>(1);
if (shapeTensor == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
if (shapeTensor->Shape().NumDimensions() != 1) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "A shape tensor must be a vector tensor, got ", shapeTensor->Shape().NumDimensions(), " dimensions");
auto data_span = shapeTensor->template DataAsSpan<int64_t>();
TensorShapeVector shape(data_span.begin(), data_span.end());
const Tensor* X = context->Input<Tensor>(0);
if (X == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
const TensorShape& X_shape = X->Shape();
ReshapeHelper helper(X_shape, shape, allow_zero_);
Tensor* Y = context->Output(0, TensorShape(shape));
const void* source = X->DataRaw();
void* target = Y->MutableDataRaw();
// If source and target pointers are not equal (non-inplace operation), we need to copy the data.
if (target != source) {
ORT_RETURN_IF_ERROR(CopyTensor(*X, *Y));
}
return Status::OK();
}
private:
bool allow_zero_;
};
class Reshape_1 final : public RocmKernel {
public:
Reshape_1(const OpKernelInfo& info) : RocmKernel(info) {
Status status = info.GetAttrs("shape", shape_);
ORT_ENFORCE(status.IsOK(), "Attribute shape is not set.");
}
Status ComputeInternal(OpKernelContext* context) const override {
TensorShapeVector shape = shape_;
const Tensor* X = context->Input<Tensor>(0);
const TensorShape& X_shape = X->Shape();
ReshapeHelper helper(X_shape, shape);
Tensor* Y = context->Output(0, TensorShape(shape));
const void* source = X->DataRaw();
void* target = Y->MutableDataRaw();
// If source and target pointers are not equal (non-inplace operation), we need to copy the data.
if (target != source) {
ORT_RETURN_IF_ERROR(CopyTensor(*X, *Y));
}
return Status::OK();
}
private:
TensorShapeVector shape_;
};
} // namespace rocm
} // namespace onnxruntime
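// Illustrative example of the shape semantics resolved by ReshapeHelper above:
//   data shape {2, 3, 4}, requested shape {0, -1}
//     -> 0 copies the corresponding input dimension (2) and -1 infers the remainder (12),
//        giving an output shape of {2, 12}.
//   With allowzero = 1 (opset 14+), a literal 0 instead denotes a zero-sized dimension.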
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "resize.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Resize, \
kOnnxDomain, \
10, 10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Resize<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Resize, \
kOnnxDomain, \
11, 12, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.InputMemoryType(OrtMemTypeCPUInput, 3) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()), \
Resize<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Resize, \
kOnnxDomain, \
13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.InputMemoryType(OrtMemTypeCPUInput, 3) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()), \
Resize<T>);
REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)
REGISTER_KERNEL_TYPED(int32_t)
REGISTER_KERNEL_TYPED(uint8_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/tensor/upsample.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class Resize : public Upsample<T> {
public:
Resize(const OpKernelInfo& info) : Upsample<T>(info) {
}
Status ComputeInternal(OpKernelContext* context) const override {
return Upsample<T>::ComputeInternal(context);
}
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/tensor/resize_impl.h"
namespace onnxruntime {
namespace rocm {
using onnxruntime::ResizeCoordinateTransformationMode;
using onnxruntime::ResizeNearestMode;
using onnxruntime::UpsampleMode;
struct NearestPixel_SIMPLE {
__device__ __forceinline__ int operator() (float x_original, bool is_down_sampling) const {
if (is_down_sampling) {
return static_cast<int>(_Ceil(x_original));
}
return static_cast<int>(x_original);
}
};
struct NearestPixel_ROUND_PREFER_FLOOR {
__device__ __forceinline__ int operator() (float x_original, bool) const {
if (x_original == static_cast<int>(x_original) + 0.5f) {
return static_cast<int>(_Floor(x_original));
}
return static_cast<int>(roundf(x_original));
}
};
struct NearestPixel_ROUND_PREFER_CEIL {
__device__ __forceinline__ int operator() (float x_original, bool) const {
return static_cast<int>(roundf(x_original));
}
};
struct NearestPixel_FLOOR {
__device__ __forceinline__ int operator() (float x_original, bool) const {
return static_cast<int>(_Floor(x_original));
}
};
struct NearestPixel_CEIL {
__device__ __forceinline__ int operator() (float x_original, bool) const {
return static_cast<int>(_Ceil(x_original));
}
};
struct TransformCoordinate_ASYMMETRIC {
__device__ __forceinline__ float operator() (float x_resized, float x_scale, float, float, float, float) const {
return x_resized / x_scale;
}
};
struct TransformCoordinate_HALF_PIXEL {
__device__ __forceinline__ float operator() (float x_resized, float x_scale, float, float, float, float) const {
return ((x_resized + 0.5f) / x_scale) - 0.5f;
}
};
struct TransformCoordinate_PYTORCH_HALF_PIXEL {
__device__ __forceinline__ float operator() (float x_resized, float x_scale, float length_resized, float, float, float) const {
return length_resized > 1 ? (x_resized + 0.5f) / x_scale - 0.5f : 0.0f;
}
};
struct TransformCoordinate_TF_HALF_PIXEL_FOR_NN {
__device__ __forceinline__ float operator() (float x_resized, float x_scale, float, float, float, float) const {
return (x_resized + 0.5f) / x_scale;
}
};
struct TransformCoordinate_ALIGN_CORNERS {
__device__ __forceinline__ float operator() (float x_resized, float, float length_resized, float length_original, float, float) const {
return length_resized == 1 ? 0 : x_resized * (length_original - 1) / (length_resized - 1);
}
};
struct TransformCoordinate_TF_CROP_AND_RESIZE {
__device__ __forceinline__ float operator() (float x_resized, float, float length_resized, float length_original, float roi_start, float roi_end) const {
auto orig = length_resized > 1
? roi_start * (length_original - 1) + (x_resized * (roi_end - roi_start) * (length_original - 1)) / (length_resized - 1)
: 0.5 * (roi_start + roi_end) * (length_original - 1);
return static_cast<float>(orig);
}
};
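// Worked example for the coordinate transforms above (downscaling a width of 4 to 2, x_scale = 0.5):
//   ASYMMETRIC:  x_orig = x_resized / 0.5               -> output x = 0, 1 map to input 0, 2
//   HALF_PIXEL:  x_orig = (x_resized + 0.5) / 0.5 - 0.5 -> output x = 0, 1 map to input 0.5, 2.5
// The nearest/linear/cubic kernels below consume x_orig and clamp the result to [0, length_original - 1].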
#define CASE_TYPE_USING_HINT(enum_type, type, HINT, ...) \
case enum_type: { \
using HINT = type; \
return __VA_ARGS__(); \
}
#define CASE_TYPE_COORD(enum_type, type, ...) \
CASE_TYPE_USING_HINT(enum_type, type, coord_t, __VA_ARGS__)
#define DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
/* don't use TYPE again in case it is an expensive or side-effect op */ \
switch (the_type) { \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::HALF_PIXEL, TransformCoordinate_HALF_PIXEL, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ASYMMETRIC, TransformCoordinate_ASYMMETRIC, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL, TransformCoordinate_PYTORCH_HALF_PIXEL, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ALIGN_CORNERS, TransformCoordinate_ALIGN_CORNERS, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN, TransformCoordinate_TF_HALF_PIXEL_FOR_NN, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE, TransformCoordinate_TF_CROP_AND_RESIZE, __VA_ARGS__) \
default: \
ORT_THROW("unknown ResizeCoordinateTransformationMode"); \
} \
}()
#define CASE_TYPE_NEAREST(enum_type, type, ...) \
CASE_TYPE_USING_HINT(enum_type, type, nearest_t, __VA_ARGS__)
#define DISPATCH_RESIZE_NEAREST_MODE(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
/* don't use TYPE again in case it is an expensive or side-effect op */ \
switch (the_type) { \
CASE_TYPE_NEAREST(ResizeNearestMode::SIMPLE, NearestPixel_SIMPLE, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_FLOOR, NearestPixel_ROUND_PREFER_FLOOR, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_CEIL, NearestPixel_ROUND_PREFER_CEIL, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::FLOOR, NearestPixel_FLOOR, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::CEIL, NearestPixel_CEIL, __VA_ARGS__) \
default: \
ORT_THROW("unknown ResizeNearestMode"); \
} \
}()
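// The two DISPATCH_* macros above convert a runtime enum into a compile-time functor type: each
// case introduces a local alias (coord_t / nearest_t) and invokes the caller's lambda, which
// instantiates the kernel with that functor. Sketch of the call pattern used later in this file:
//   DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(transform_mode, [&]() {
//     DISPATCH_RESIZE_NEAREST_MODE(nearest_mode, [&]() {
//       /* launch the kernel, passing coord_t() and nearest_t() */
//     });
//   });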
struct NearestMappingInfo {
int origin_;
int extrapolate_;
};
template <typename T, typename CudaFunctionOriginalCoordinate, typename CudaFunctionNearestPixel>
__global__ void _ResizeNearestMappingKernel2D(
const int input_height, const int input_width,
const int output_height, const int output_width,
const float scales_height, const float scales_width,
const float roi_start_height, const float roi_end_height,
const float roi_start_width, const float roi_end_width,
const bool extrapolation_enabled,
const CudaFunctionOriginalCoordinate& transform_coordinate,
const CudaFunctionNearestPixel& calc_nearest_pixel,
NearestMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, output_height + output_width);
if (id >= 0 && id < output_height) { // for Height
int dim = id;
// only apply co-ordinate transformation if scale != 1.0
if (scales_height == 1.0f) {
dims_mapping[id].extrapolate_ = 0;
} else {
float orig_coord = transform_coordinate(static_cast<float>(dim), scales_height, static_cast<float>(output_height),
static_cast<float>(input_height), roi_start_height, roi_end_height);
dims_mapping[id].extrapolate_ = static_cast<int>(
extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_height - 1)));
dim = calc_nearest_pixel(orig_coord, scales_height < 1);
if (dim >= input_height) dim = input_height - 1;
if (dim < 0) dim = 0;
}
dims_mapping[id].origin_ = dim;
} else {
int dim = id - output_height;
// only apply co-ordinate transformation if scale != 1.0
if (scales_width == 1.0f) {
dims_mapping[id].extrapolate_ = 0;
} else {
float orig_coord = transform_coordinate(static_cast<float>(dim), scales_width, static_cast<float>(output_width),
static_cast<float>(input_width), roi_start_width, roi_end_width);
dims_mapping[id].extrapolate_ = static_cast<int>(
extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_width - 1)));
dim = calc_nearest_pixel(orig_coord, scales_width < 1);
if (dim >= input_width) dim = input_width - 1;
if (dim < 0) dim = 0;
}
dims_mapping[id].origin_ = dim;
return;
}
}
template <typename T, typename CudaFunctionOriginalCoordinate, typename CudaFunctionNearestPixel>
__global__ void _ResizeNearestMappingKernel(
const size_t rank,
const TArray<int64_t> input_shape,
const TArray<int64_t> output_shape,
const TArray<float> scales,
const TArray<float, 10> roi,
const size_t total_dim_sum,
bool extrapolation_enabled,
const CudaFunctionOriginalCoordinate& transform_coordinate,
const CudaFunctionNearestPixel& calc_nearest_pixel,
int64_t* prefix_dim_sum,
NearestMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, total_dim_sum);
int64_t dim_sum = 0;
for (int axis = 0; axis < rank; ++axis) {
if (id == dim_sum) {
prefix_dim_sum[axis] = dim_sum;
}
if (id >= dim_sum && id < dim_sum + output_shape[axis]) {
int dim = id - dim_sum;
// only apply co-ordinate transformation if scale != 1.0
if (scales[axis] == 1.0f) {
dims_mapping[id].extrapolate_ = 0;
} else {
float orig_coord = transform_coordinate(static_cast<float>(dim), scales[axis], static_cast<float>(output_shape[axis]),
static_cast<float>(input_shape[axis]), roi[axis], roi[axis + rank]);
dims_mapping[id].extrapolate_ = static_cast<int>(extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_shape[axis] - 1)));
dim = calc_nearest_pixel(orig_coord, scales[axis] < 1);
if (dim >= input_shape[axis]) dim = input_shape[axis] - 1;
if (dim < 0) dim = 0;
}
dims_mapping[id].origin_ = dim;
return;
}
dim_sum += output_shape[axis];
}
}
template <typename T, bool UseExtrapolation>
__global__ void _ResizeNearestKernel2D(
const int64_t output_height, const int64_t output_width,
const int64_t input_stride_image, const int input_stride_row,
const fast_divmod output_stride_image, const fast_divmod output_stride_row,
const T* input_data, T* output_data, const size_t N,
const T extrapolation_value, const NearestMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int imageid, h, w, output_index;
output_stride_image.divmod(static_cast<int>(id), imageid, output_index);
output_stride_row.divmod(output_index, h, w);
if (UseExtrapolation) {
if (dims_mapping[h].extrapolate_ + dims_mapping[output_height + w].extrapolate_) {
output_data[id] = extrapolation_value;
return;
}
}
int input_index = input_stride_image * imageid +
input_stride_row * dims_mapping[h].origin_ +
dims_mapping[output_height + w].origin_;
output_data[id] = input_data[input_index];
}
template <typename T>
__global__ void _ResizeNearestKernel(
const int rank,
const TArray<int64_t> input_strides,
const TArray<fast_divmod> output_div_pitches,
const T* input_data,
T* output_data,
const size_t N,
const T extrapolation_value,
const int64_t* prefix_dim_sum,
const NearestMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int output_index = static_cast<int>(id);
int input_index = 0;
int extrapolation_occured = 0;
for (int axis = 0; axis < rank; ++axis) {
int dim = 0;
output_div_pitches[axis].divmod(output_index, dim, output_index);
const NearestMappingInfo& mi = dims_mapping[prefix_dim_sum[axis] + dim];
extrapolation_occured += mi.extrapolate_;
input_index += input_strides[axis] * mi.origin_;
}
output_data[id] = extrapolation_occured ? extrapolation_value : input_data[input_index];
}
struct LinearMappingInfo {
int origin_;
float weight_;
int extrapolate_;
};
template <typename T, typename CudaFunctionOriginalCoordinate>
__global__ void _ResizeBilinearCoordinateMapping(
int64_t input_height, int64_t input_width,
int64_t output_height, int64_t output_width,
float scale_height, float scale_width,
float roi_height_start, float roi_height_end,
float roi_width_start, float roi_width_end,
const size_t SumHW, bool extrapolation_enabled,
const CudaFunctionOriginalCoordinate& transform_coordinate,
LinearMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW);
if (id < output_height) { // y = id
float input_y = scale_height == 1 ? static_cast<float>(id) :
transform_coordinate(static_cast<float>(id), scale_height,
static_cast<float>(output_height), static_cast<float>(input_height),
roi_height_start, roi_height_end);
dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast<float>(input_height - 1)));
input_y = max(0.0f, min(input_y, static_cast<float>(input_height - 1)));
int y_int = static_cast<int>(input_y);
dims_mapping[id].origin_ = y_int;
dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int;
} else { //x = id - output_height
float input_x = scale_width == 1 ? static_cast<float>(id - output_height) :
transform_coordinate(static_cast<float>(id - output_height), scale_width,
static_cast<float>(output_width), static_cast<float>(input_width),
roi_width_start, roi_width_end);
dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast<float>(input_width - 1)));
input_x = max(0.0f, min(input_x, static_cast<float>(input_width - 1)));
int x_int = static_cast<int>(input_x);
dims_mapping[id].origin_ = x_int;
dims_mapping[id].weight_ = (x_int >= input_width - 1) ? 0.5f : input_x - x_int;
}
}
// The following method supports a 2-D or 4-D input in 'Linear' mode. The last two dimensions are [H, W];
// the scale values for all outer dimensions are expected to be 1.
template <typename T>
__global__ void _ResizeBilinearKernel(
int64_t input_height, int64_t input_width,
int64_t output_height, int64_t output_width,
fast_divmod div_output_width, fast_divmod div_output_image,
const T* input_data, T* output_data, const size_t N,
const T extrapolation_value,
LinearMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int bxc, output_image_index;
div_output_image.divmod(id, bxc, output_image_index);
HIP_LONG input_index = bxc * input_height * input_width;
int output_y, output_x;
div_output_width.divmod(output_image_index, output_y, output_x);
if (dims_mapping[output_y].extrapolate_ || dims_mapping[output_x + output_height].extrapolate_) {
output_data[id] = extrapolation_value;
return;
}
float y_offset_0 = dims_mapping[output_y].weight_;
int y_int = dims_mapping[output_y].origin_;
float x_offset_0 = dims_mapping[output_x + output_height].weight_;
int x_int = dims_mapping[output_x + output_height].origin_;
input_index += y_int * input_width + x_int;
T x00 = input_data[input_index];
bool end_of_h = (y_int >= input_height - 1);
bool end_of_w = (x_int >= input_width - 1);
T x10 = end_of_w ? x00 : input_data[input_index + 1];
T x01 = end_of_h ? x00 : input_data[input_index + input_width];
T x11 = end_of_w ? x01 : (end_of_h ? x10 : input_data[input_index + input_width + 1]);
float y_offset_1 = 1.0f - y_offset_0;
float x_offset_1 = 1.0f - x_offset_0;
output_data[id] =
x00 * static_cast<T>(y_offset_1 * x_offset_1) +
x01 * static_cast<T>(y_offset_0 * x_offset_1) +
x10 * static_cast<T>(y_offset_1 * x_offset_0) +
x11 * static_cast<T>(y_offset_0 * x_offset_0);
}
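// Bilinear weighting used above, written out: with dy/dx the fractional offsets stored in
// dims_mapping[...].weight_ for the row and column mappings,
//   out = x00*(1-dy)*(1-dx) + x01*dy*(1-dx) + x10*(1-dy)*dx + x11*dy*dx
// where x00 is the source pixel at (y_int, x_int), x10 its right neighbour, x01 the pixel
// below it, and x11 the diagonal neighbour (all clamped at the image border).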
template <typename T, typename CudaFunctionOriginalCoordinate>
__global__ void _ResizeTrilinearCoordinateMapping(
int64_t input_depth, int64_t input_height, int64_t input_width,
int64_t output_depth, int64_t output_height, int64_t output_width,
float scale_depth, float scale_height, float scale_width,
float roi_depth_start, float roi_depth_end,
float roi_height_start, float roi_height_end,
float roi_width_start, float roi_width_end,
const size_t SumDHW, bool extrapolation_enabled,
const CudaFunctionOriginalCoordinate& transform_coordinate,
LinearMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumDHW);
if (id < output_depth) { // z = id
float input_z = scale_depth == 1 ? static_cast<float>(id) :
transform_coordinate(static_cast<float>(id), scale_depth,
static_cast<float>(output_depth), static_cast<float>(input_depth),
roi_depth_start, roi_depth_end);
dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_z < 0 || input_z > static_cast<float>(input_depth - 1)));
input_z = max(0.0f, min(input_z, static_cast<float>(input_depth - 1)));
int z_int = static_cast<int>(input_z);
dims_mapping[id].origin_ = z_int;
dims_mapping[id].weight_ = (z_int >= input_depth - 1) ? 0.5f : input_z - z_int;
} else if (id >= output_depth && id < (output_depth + output_height)) { // y = id - output_depth
float input_y = scale_height == 1 ? static_cast<float>(id - output_depth) :
transform_coordinate(static_cast<float>(id - output_depth), scale_height,
static_cast<float>(output_height), static_cast<float>(input_height),
roi_height_start, roi_height_end);
dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast<float>(input_height - 1)));
input_y = max(0.0f, min(input_y, static_cast<float>(input_height - 1)));
int y_int = static_cast<int>(input_y);
dims_mapping[id].origin_ = y_int;
dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int;
} else { //x = id - output_depth - output_height
float input_x = scale_width == 1 ? static_cast<float>(id - output_depth - output_height) :
transform_coordinate(static_cast<float>(id - output_depth - output_height), scale_width,
static_cast<float>(output_width), static_cast<float>(input_width),
roi_width_start, roi_width_end);
dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast<float>(input_width - 1)));
input_x = max(0.0f, min(input_x, static_cast<float>(input_width - 1)));
int x_int = static_cast<int>(input_x);
dims_mapping[id].origin_ = x_int;
dims_mapping[id].weight_ = (x_int >= input_width - 1) ? 0.5f : input_x - x_int;
}
}
// The following method supports a 3-D or 5-D input in 'Linear' mode. The last three dimensions are [D, H, W];
// the scale values for all outer dimensions are expected to be 1.
template <typename T>
__global__ void _ResizeTrilinearKernel(
int64_t input_depth, int64_t input_height, int64_t input_width,
int64_t output_depth, int64_t output_height, int64_t output_width,
fast_divmod div_output_height, fast_divmod div_output_width, fast_divmod div_output_image,
const T* input_data, T* output_data, const size_t N,
const T extrapolation_value,
LinearMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int bxc, output_image_index;
div_output_image.divmod(id, bxc, output_image_index);
HIP_LONG input_index = bxc * input_depth * input_height * input_width;
int output_z, output_y, output_x, temp;
div_output_height.divmod(output_image_index, output_z, temp);
div_output_width.divmod(temp, output_y, output_x);
if (dims_mapping[output_z].extrapolate_ ||
dims_mapping[output_y + output_depth].extrapolate_ ||
dims_mapping[output_x + output_depth + output_height].extrapolate_) {
output_data[id] = extrapolation_value;
return;
}
float z_offset_0 = dims_mapping[output_z].weight_;
int z_int = dims_mapping[output_z].origin_;
float y_offset_0 = dims_mapping[output_y + output_depth].weight_;
int y_int = dims_mapping[output_y + output_depth].origin_;
float x_offset_0 = dims_mapping[output_x + output_depth + output_height].weight_;
int x_int = dims_mapping[output_x + output_depth + output_height].origin_;
input_index += z_int * input_height * input_width + y_int * input_width + x_int;
T x000 = input_data[input_index];
bool end_of_h = (y_int >= input_height - 1);
bool end_of_w = (x_int >= input_width - 1);
T x100 = end_of_w ? x000 : input_data[input_index + 1];
T x010 = end_of_h ? x000 : input_data[input_index + input_width];
T x110 = end_of_w ? x010 : (end_of_h ? x100 : input_data[input_index + input_width + 1]);
bool end_of_d = (z_int >= input_depth - 1);
if (!end_of_d) {
input_index = input_index + input_height * input_width;
}
T x001 = end_of_d ? x000 : input_data[input_index];
T x101 = end_of_w ? x001 : input_data[input_index + 1];
T x011 = end_of_h ? x001 : input_data[input_index + input_width];
T x111 = end_of_w ? x011 : (end_of_h ? x101 : input_data[input_index + input_width + 1]);
float z_offset_1 = 1.0f - z_offset_0;
float y_offset_1 = 1.0f - y_offset_0;
float x_offset_1 = 1.0f - x_offset_0;
output_data[id] =
x000 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_1) +
x010 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_1) +
x100 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_0) +
x110 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_0) +
x001 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_1) +
x011 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_1) +
x101 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_0) +
x111 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_0);
}
template <typename T>
__device__ __forceinline__ float CubicInterpolationRowwise(
const T* image, int x, int y, int input_height, int input_width,
float coeff0, float coeff1, float coeff2, float coeff3) {
int row_index = max(0, min(y, input_height - 1)) * input_width;
return coeff0 * static_cast<float>(image[row_index + max(0, min(x - 1, input_width - 1))]) +
coeff1 * static_cast<float>(image[row_index + max(0, min(x, input_width - 1))]) +
coeff2 * static_cast<float>(image[row_index + max(0, min(x + 1, input_width - 1))]) +
coeff3 * static_cast<float>(image[row_index + max(0, min(x + 2, input_width - 1))]);
}
struct CubicMappingInfo {
int origin_;
int extrapolate_;
float coeff0_;
float coeff1_;
float coeff2_;
float coeff3_;
};
template <typename T, typename CudaFunctionOriginalCoordinate>
__global__ void _ResizeCubicCoordinateMapping(
int64_t input_height, int64_t input_width,
int64_t output_height, int64_t output_width,
float scale_height, float scale_width,
float roi_height_start, float roi_height_end,
float roi_width_start, float roi_width_end,
const size_t SumHW, bool extrapolation_enabled,
float cubic_coeff_a, bool exclude_outside,
const CudaFunctionOriginalCoordinate& transform_coordinate,
CubicMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW);
auto& dm = dims_mapping[id];
bool is_y_axis = (id < output_height);
int max_input_coord = static_cast<int>(is_y_axis ? input_height : input_width);
float scale = is_y_axis ? scale_height : scale_width;
float input_coordinat = scale == 1 ? (is_y_axis ? id : id - output_height) :
transform_coordinate(
static_cast<float>(is_y_axis ? id : id - output_height),
scale,
static_cast<float>(is_y_axis ? output_height : output_width),
static_cast<float>(max_input_coord),
(is_y_axis ? roi_height_start : roi_width_start),
(is_y_axis ? roi_height_end : roi_width_end));
int coord_int = static_cast<int>(_Floor(input_coordinat));
float s_coord = abs(input_coordinat - coord_int);
float coeff_sum = 1.0f;
float coeff_0 = static_cast<float>(((cubic_coeff_a * (s_coord + 1) - 5 * cubic_coeff_a) * (s_coord + 1) + 8 * cubic_coeff_a) * (s_coord + 1) - 4 * cubic_coeff_a);
float coeff_1 = static_cast<float>(((cubic_coeff_a + 2) * s_coord - (cubic_coeff_a + 3)) * s_coord * s_coord + 1);
float coeff_2 = static_cast<float>(((cubic_coeff_a + 2) * (1 - s_coord) - (cubic_coeff_a + 3)) * (1 - s_coord) * (1 - s_coord) + 1);
float coeff_3 = static_cast<float>(((cubic_coeff_a * (2 - s_coord) - 5 * cubic_coeff_a) * (2 - s_coord) + 8 * cubic_coeff_a) * (2 - s_coord) - 4 * cubic_coeff_a);
if (exclude_outside) {
coeff_0 = (coord_int - 1 < 0 || coord_int - 1 >= max_input_coord) ? 0.0 : coeff_0;
coeff_1 = (coord_int + 0 < 0 || coord_int + 0 >= max_input_coord) ? 0.0 : coeff_1;
coeff_2 = (coord_int + 1 < 0 || coord_int + 1 >= max_input_coord) ? 0.0 : coeff_2;
coeff_3 = (coord_int + 2 < 0 || coord_int + 2 >= max_input_coord) ? 0.0 : coeff_3;
coeff_sum = coeff_0 + coeff_1 + coeff_2 + coeff_3;
}
dm.origin_ = coord_int;
dm.coeff0_ = coeff_0 / coeff_sum;
dm.coeff1_ = coeff_1 / coeff_sum;
dm.coeff2_ = coeff_2 / coeff_sum;
dm.coeff3_ = coeff_3 / coeff_sum;
dm.extrapolate_ = (int)(extrapolation_enabled && (input_coordinat < 0 || input_coordinat > static_cast<float>(max_input_coord - 1)));
}
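// coeff_0..coeff_3 above are the cubic-convolution (Keys) weights for the four taps at
// coord_int - 1 .. coord_int + 2, parameterized by s = |input_coordinat - coord_int| and
// a = cubic_coeff_a (the ONNX default is -0.75). With exclude_outside set, taps falling
// outside the input are zeroed and the remaining weights are renormalized to sum to 1.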
template <typename T>
__global__ void _ResizeBiCubicKernel(
int64_t input_height, int64_t input_width, int64_t output_height, int64_t output_width,
fast_divmod div_output_width, fast_divmod div_output_image,
const T* input_data, T* output_data, const size_t N, const T extrapolation_value,
CubicMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int bxc, output_image_index, output_x, output_y;
div_output_image.divmod(id, bxc, output_image_index);
HIP_LONG input_index = bxc * input_height * input_width;
div_output_width.divmod(output_image_index, output_y, output_x);
CubicMappingInfo& y_info = dims_mapping[output_y];
CubicMappingInfo& x_info = dims_mapping[output_x + output_height];
if (y_info.extrapolate_ || x_info.extrapolate_) {
output_data[id] = extrapolation_value;
return;
}
float w0 = x_info.coeff0_;
float w1 = x_info.coeff1_;
float w2 = x_info.coeff2_;
float w3 = x_info.coeff3_;
int x_int = x_info.origin_;
int y_int = y_info.origin_;
const T* image = input_data + input_index;
output_data[id] = y_info.coeff0_ * CubicInterpolationRowwise(image, x_int, y_int - 1, input_height, input_width, w0, w1, w2, w3) +
y_info.coeff1_ * CubicInterpolationRowwise(image, x_int, y_int, input_height, input_width, w0, w1, w2, w3) +
y_info.coeff2_ * CubicInterpolationRowwise(image, x_int, y_int + 1, input_height, input_width, w0, w1, w2, w3) +
y_info.coeff3_ * CubicInterpolationRowwise(image, x_int, y_int + 2, input_height, input_width, w0, w1, w2, w3);
}
size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode,
const gsl::span<const int64_t>& output_dims) {
switch (upsample_mode) {
case UpsampleMode::NN:
return sizeof(int64_t) * output_dims.size() + sizeof(NearestMappingInfo) * static_cast<size_t>(std::accumulate(output_dims.begin(), output_dims.end(), (int64_t)0));
case UpsampleMode::LINEAR:
return sizeof(LinearMappingInfo) * static_cast<size_t>(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0));
case UpsampleMode::CUBIC:
return sizeof(CubicMappingInfo) * static_cast<size_t>(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0));
}
return 0;
}
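// Layout of the scratch buffer sized above, as consumed by the kernels in this file:
//   NN:           rank int64_t prefix sums, followed by one NearestMappingInfo per output index of every axis
//   LINEAR/CUBIC: one LinearMappingInfo / CubicMappingInfo per output index of the last two axes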
template <typename T>
void ResizeNearestImpl(
hipStream_t stream,
const int rank,
TArray<int64_t>& input_shape,
TArray<int64_t>& output_shape,
TArray<int64_t>& input_strides,
TArray<fast_divmod>& output_div_pitches,
TArray<float>& scales_vals,
TArray<float, 10>& roi_vals,
const T* input_data,
T* output_data,
const size_t N,
bool extrapolation_enabled,
const T extrapolation_value,
float cubic_coeff_a,
ResizeCoordinateTransformationMode transform_coordinate,
ResizeNearestMode calc_nearest_pixel,
int64_t* /* prefix_dim_sum */,
NearestMappingInfo* dims_mapping) {
unsigned int blocksPerGrid = static_cast<unsigned int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
bool could2d = rank >= 2 &&
transform_coordinate != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE &&
std::all_of(scales_vals.Data(), scales_vals.Data() + (rank - 2), [](float v) { return v == 1.0; });
if (could2d) {
int64_t output_height = output_shape[rank - 2];
int64_t output_width = output_shape[rank - 1];
fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(static_cast<int>(output_height * output_width));
int blocksPerDimsMappingGrid = static_cast<int>(ceil((output_height + output_width) / 32.0));
DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(transform_coordinate, [&]() {
DISPATCH_RESIZE_NEAREST_MODE(calc_nearest_pixel, [&]() {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestMappingKernel2D<T>), blocksPerDimsMappingGrid, 32, 0, stream,
static_cast<int>(input_shape[rank - 2]), static_cast<int>(input_shape[rank - 1]),
static_cast<int>(output_height), static_cast<int>(output_width),
scales_vals[rank - 2], scales_vals[rank - 1],
roi_vals[rank - 2], roi_vals[rank - 2 + rank],
roi_vals[rank - 1], roi_vals[rank - 1 + rank],
extrapolation_enabled, coord_t(), nearest_t(),
dims_mapping);
});
});
if (extrapolation_enabled) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestKernel2D<T, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
output_height, output_width,
input_shape[rank - 2] * input_shape[rank - 1], static_cast<int>(input_shape[rank - 1]),
div_output_image, output_div_pitches[rank - 2],
input_data, output_data, N,
extrapolation_value,
dims_mapping);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestKernel2D<T, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
output_height, output_width,
input_shape[rank - 2] * input_shape[rank - 1], static_cast<int>(input_shape[rank - 1]),
div_output_image, output_div_pitches[rank - 2],
input_data, output_data, N,
extrapolation_value,
dims_mapping);
}
return;
}
int64_t total_dim_sum = std::accumulate(output_shape.Data(), output_shape.Data() + rank, (int64_t)0);
int blocksPerDimsMappingGrid = (int)(ceil(static_cast<double>(total_dim_sum) / 32));
DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(transform_coordinate, [&]() {
DISPATCH_RESIZE_NEAREST_MODE(calc_nearest_pixel, [&]() {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestMappingKernel<T>), blocksPerDimsMappingGrid, 32, 0, stream,
rank, input_shape, output_shape,
scales_vals, roi_vals,
total_dim_sum, extrapolation_enabled,
coord_t(), nearest_t(),
reinterpret_cast<int64_t*>(dims_mapping),
reinterpret_cast<NearestMappingInfo*>(reinterpret_cast<int64_t*>(dims_mapping) + rank));
});
});
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
rank, input_strides, output_div_pitches,
input_data, output_data, N,
extrapolation_value,
reinterpret_cast<const int64_t*>(dims_mapping),
reinterpret_cast<const NearestMappingInfo*>(reinterpret_cast<int64_t*>(dims_mapping) + rank));
return;
}
template <typename T>
void ResizeImpl(
hipStream_t stream,
const UpsampleMode upsample_mode,
const int rank,
TArray<int64_t>& input_shape,
TArray<int64_t>& output_shape,
TArray<int64_t>& input_strides,
TArray<fast_divmod>& output_div_pitches,
TArray<float>& scales_vals,
TArray<float, 10>& roi_vals,
const T* input_data,
T* output_data,
const size_t N,
bool extrapolation_enabled,
const T extrapolation_value,
float cubic_coeff_a,
bool exclude_outside,
ResizeCoordinateTransformationMode coordinate_transform_mode,
ResizeNearestMode nearest_mode,
void* dims_mapping) {
bool isSame = std::all_of(scales_vals.Data(), scales_vals.Data() + rank, [](float v) { return v == 1.0f; }) &&
(coordinate_transform_mode != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE);
if (isSame) {
HIP_CALL_THROW(hipMemcpyAsync(output_data, input_data, N * sizeof(T), hipMemcpyDeviceToDevice, stream));
return;
}
if (upsample_mode == UpsampleMode::NN) {
ResizeNearestImpl(
stream, rank, input_shape, output_shape, input_strides, output_div_pitches,
scales_vals, roi_vals, input_data, output_data, N,
extrapolation_enabled, extrapolation_value, cubic_coeff_a,
coordinate_transform_mode, nearest_mode,
reinterpret_cast<int64_t*>(dims_mapping),
reinterpret_cast<NearestMappingInfo*>(reinterpret_cast<int64_t*>(dims_mapping) + rank));
return;
}
// We support a special case of bilinear or bicubic if the input data is 4D with the outer 2 scales being 1.0
// We would have validated the outer scale values by the time execution reaches this
bool is_2D = (rank == 2 || rank == 4);
// We support a special case of trilinear or tricubic if the input data is 5D with the outer 2 scales being 1.0
// We would have validated the outer scale values by the time execution reaches this
bool is_3D = (rank == 3 || rank == 5);
// Should not hit this as we have already validated input rank/scales and we provide verbose error messages
// to the user.
ORT_ENFORCE(is_2D || is_3D, "Only bilinear/trilinear and bicubic modes are supported in Resize");
int blocksPerGrid = static_cast<int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
fast_divmod div_output_image;
if (is_2D) {
div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(gsl::narrow_cast<int>(N));
} else if (is_3D) {
div_output_image = (rank > 3) ? output_div_pitches[rank - 4] : fast_divmod(gsl::narrow_cast<int>(N));
}
int64_t output_depth = is_3D ? output_shape[rank - 3] : 0;
int64_t output_height = output_shape[rank - 2];
int64_t output_width = output_shape[rank - 1];
int blocksPerDimsMappingGrid =
static_cast<int>(ceil((output_depth + output_height + output_width) / 32.0));
switch (upsample_mode) {
case UpsampleMode::LINEAR:
if (is_2D) {
DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeBilinearCoordinateMapping<T>), blocksPerDimsMappingGrid, 32, 0, stream,
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
scales_vals[rank - 2], scales_vals[rank - 1],
roi_vals[rank - 2], roi_vals[rank - 2 + rank],
roi_vals[rank - 1], roi_vals[rank - 1 + rank],
output_height + output_width, extrapolation_enabled, coord_t(),
reinterpret_cast<LinearMappingInfo*>(dims_mapping));
});
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeBilinearKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
output_div_pitches[rank - 2], div_output_image,
input_data, output_data, N, extrapolation_value,
reinterpret_cast<LinearMappingInfo*>(dims_mapping));
return;
} else if (is_3D) {
DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeTrilinearCoordinateMapping<T>), blocksPerDimsMappingGrid, 32, 0, stream,
input_shape[rank - 3] , input_shape[rank - 2], input_shape[rank - 1],
output_depth, output_height, output_width,
scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1],
roi_vals[rank - 3], roi_vals[rank - 3 + rank],
roi_vals[rank - 2], roi_vals[rank - 2 + rank],
roi_vals[rank - 1], roi_vals[rank - 1 + rank],
output_depth + output_height + output_width, extrapolation_enabled, coord_t(),
reinterpret_cast<LinearMappingInfo*>(dims_mapping));
});
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeTrilinearKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1],
output_depth, output_height, output_width,
output_div_pitches[rank - 3], output_div_pitches[rank - 2], div_output_image,
input_data, output_data, N, extrapolation_value,
reinterpret_cast<LinearMappingInfo*>(dims_mapping));
return;
}
ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize");
break;
case UpsampleMode::CUBIC:
if (is_2D) {
DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeCubicCoordinateMapping<T>), blocksPerDimsMappingGrid, 32, 0, stream,
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
scales_vals[rank - 2], scales_vals[rank - 1],
roi_vals[rank - 2], roi_vals[rank - 2 + rank],
roi_vals[rank - 1], roi_vals[rank - 1 + rank],
output_height + output_width, extrapolation_enabled,
cubic_coeff_a, exclude_outside, coord_t(),
reinterpret_cast<CubicMappingInfo*>(dims_mapping));
});
hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeBiCubicKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
output_div_pitches[rank - 2], div_output_image,
input_data, output_data, N, extrapolation_value,
reinterpret_cast<CubicMappingInfo*>(dims_mapping));
return;
}
ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize");
case UpsampleMode::NN:
ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize");
}
}
#define SPECIALIZED_IMPL(T) \
template void ResizeImpl<T>( \
hipStream_t stream, \
const UpsampleMode upsample_mode, \
const int rank, \
TArray<int64_t>& input_shape, \
TArray<int64_t>& output_shape, \
TArray<int64_t>& input_strides, \
TArray<fast_divmod>& output_div_pitches, \
TArray<float>& scales_vals, \
TArray<float, 10>& roi_vals, \
const T* input_data, \
T* output_data, \
const size_t N, \
bool extrapolation_enabled, \
const T extrapolation_value, \
float cubic_coeff_a, \
bool exclude_outside, \
ResizeCoordinateTransformationMode coordinate_transform_mode, \
ResizeNearestMode nearest_mode, \
void* dims_mapping);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(uint8_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
#include "core/providers/cpu/tensor/upsamplebase.h"
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
namespace rocm {
size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode,
const gsl::span<const int64_t>& output_dims);
template <typename T>
void ResizeImpl(
hipStream_t stream,
const onnxruntime::UpsampleMode upsample_mode,
const int rank,
TArray<int64_t>& input_shape,
TArray<int64_t>& output_shape,
TArray<int64_t>& input_strides,
TArray<fast_divmod>& output_div_pitches,
TArray<float>& scales_vals,
TArray<float, 10>& roi,
const T* input_data,
T* output_data,
const size_t N,
bool extrapolation_enabled,
const T extrapolation_value,
float cubic_coeff_a,
bool exclude_outside,
onnxruntime::ResizeCoordinateTransformationMode coordinate_transform_mode,
onnxruntime::ResizeNearestMode nearest_mode,
void* dims_mapping);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "reverse_sequence.h"
#include "reverse_sequence_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_KERNEL_EX(
ReverseSequence,
kOnnxDomain,
10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
ReverseSequenceOp);
#define ReverseSequenceCallCudaImplTypeAs(T, TEqual) \
if (X.IsDataType<T>()) { \
HIP_RETURN_IF_ERROR(ReverseSequenceCudaImpl( \
Stream(), \
reinterpret_cast<const typename ToHipType<TEqual>::MappedType*>(X.Data<T>()), \
seq_lengths.Data<int64_t>(), \
reinterpret_cast<typename ToHipType<TEqual>::MappedType*>(Y.MutableData<T>()), \
gsl::narrow<int>(batch_size), gsl::narrow<int>(max_seq_len), gsl::narrow<int>(element_size), \
time_major_)); \
return Status::OK(); \
}
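// The macro above dispatches on element bit width rather than on the logical type: float and
// uint32_t, for example, both reinterpret to the int32_t instantiation, so only one kernel per
// element size (8/16/32/64-bit) is compiled. This is safe because ReverseSequence only moves
// elements and never interprets their values.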
Status ReverseSequenceOp::ComputeInternal(OpKernelContext* context) const {
const auto& X = *context->Input<Tensor>(0);
const auto& dims = X.Shape();
const auto batch_size = time_major_ ? dims[1] : dims[0];
const auto max_seq_len = time_major_ ? dims[0] : dims[1];
const auto element_size = dims.SizeFromDimension(2);
const auto& seq_lengths = *context->Input<Tensor>(1);
const auto& seq_len_shape = seq_lengths.Shape();
if (seq_len_shape.NumDimensions() != 1 || seq_len_shape[0] != batch_size) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "sequence_lens shape must be {batch_size}. Got:",
seq_len_shape, ". batch_size=", batch_size);
}
auto& Y = *context->Output(0, dims);
ReverseSequenceCallCudaImplTypeAs(float, int32_t);
ReverseSequenceCallCudaImplTypeAs(int32_t, int32_t);
ReverseSequenceCallCudaImplTypeAs(uint32_t, int32_t);
ReverseSequenceCallCudaImplTypeAs(MLFloat16, int16_t);
ReverseSequenceCallCudaImplTypeAs(int16_t, int16_t);
ReverseSequenceCallCudaImplTypeAs(uint16_t, int16_t);
ReverseSequenceCallCudaImplTypeAs(int8_t, int8_t);
ReverseSequenceCallCudaImplTypeAs(uint8_t, int8_t);
ReverseSequenceCallCudaImplTypeAs(bool, int8_t);
ReverseSequenceCallCudaImplTypeAs(int64_t, int64_t);
ReverseSequenceCallCudaImplTypeAs(double, int64_t);
ReverseSequenceCallCudaImplTypeAs(uint64_t, int64_t);
return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
"Type for ", X.DataType(), " is not supported yet in ReverseSequence.");
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
class ReverseSequenceOp final : public RocmKernel {
public:
ReverseSequenceOp(const OpKernelInfo& info) : RocmKernel(info) {
int64_t batch_axis;
int64_t time_axis;
ORT_ENFORCE(info.GetAttr<int64_t>("batch_axis", &batch_axis).IsOK());
ORT_ENFORCE(info.GetAttr<int64_t>("time_axis", &time_axis).IsOK());
ORT_ENFORCE(batch_axis < 2, "Invalid batch_axis of ", batch_axis, ". Must be 0 or 1");
ORT_ENFORCE(time_axis < 2, "Invalid time_axis of ", time_axis, ". Must be 0 or 1");
ORT_ENFORCE(batch_axis != time_axis,
"time_axis and batch_axis must have different values but both are ", time_axis);
time_major_ = time_axis == 0;
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
bool time_major_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "reverse_sequence_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {
static const int kReverseSequenceElementsPerThread = 4;
template <typename T, bool time_major>
__global__ void ReverseSequenceImplKernel(
const T* x_data,
const int64_t* seq_len_data,
T* y_data,
const int batch_size,
const int max_seq_len,
const int element_size,
const int group_count,
const fast_divmod fdm_grouped_stride_0,
const fast_divmod fdm_grouped_stride_1) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(grouped_index, group_count);
int batch_id, seq_id, gid = grouped_index;
if (time_major) {
fdm_grouped_stride_0.divmod(gid, seq_id, gid);
fdm_grouped_stride_1.divmod(gid, batch_id, gid);
} else {
fdm_grouped_stride_0.divmod(gid, batch_id, gid);
fdm_grouped_stride_1.divmod(gid, seq_id, gid);
}
int eid = gid * kReverseSequenceElementsPerThread;
int target_seq_id = (seq_id < (int)seq_len_data[batch_id]) ? ((int)seq_len_data[batch_id] - 1 - seq_id) : seq_id;
int flat_src_idx, flat_target_idx;
if (time_major) {
flat_src_idx = seq_id * batch_size * element_size + batch_id * element_size + eid;
flat_target_idx = target_seq_id * batch_size * element_size + batch_id * element_size + eid;
} else {
flat_src_idx = batch_id * max_seq_len * element_size + seq_id * element_size + eid;
flat_target_idx = batch_id * max_seq_len * element_size + target_seq_id * element_size + eid;
}
y_data[flat_target_idx] = x_data[flat_src_idx];
#pragma unroll
for (int i = 1; i < kReverseSequenceElementsPerThread; ++i) {
if (eid + i < element_size) {
y_data[flat_target_idx + i] = x_data[flat_src_idx + i];
}
}
}
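// Semantics implemented above (per the ONNX ReverseSequence spec): for batch entry b, the first
// seq_lengths[b] steps along the time axis are reversed and the remaining steps are copied
// through unchanged. Illustrative example with max_seq_len = 4 and seq_lengths[b] = 3:
//   input  t = [x0, x1, x2, x3]  ->  output t = [x2, x1, x0, x3]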
template <typename T>
hipError_t ReverseSequenceCudaImpl(
hipStream_t stream,
const T* x_data,
const int64_t* seq_len_data,
T* y_data,
const int batch_size,
const int max_seq_len,
const int element_size,
const bool time_major) {
int element_group_size = CeilDiv(element_size, kReverseSequenceElementsPerThread);
fast_divmod fdm_grouped_stride_1(element_group_size);
fast_divmod fdm_grouped_stride_0(element_group_size * ((time_major) ? batch_size : max_seq_len));
int group_count = batch_size * max_seq_len * element_group_size;
int blocksPerGrid = CeilDiv(group_count, GridDim::maxThreadsPerBlock);
if (time_major) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReverseSequenceImplKernel<T, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
x_data, seq_len_data, y_data, batch_size, max_seq_len, element_size,
group_count, fdm_grouped_stride_0, fdm_grouped_stride_1);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(ReverseSequenceImplKernel<T, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
x_data, seq_len_data, y_data, batch_size, max_seq_len, element_size,
group_count, fdm_grouped_stride_0, fdm_grouped_stride_1);
}
return hipSuccess;
}
#define InstantiateReverseSequenceImpl(T) \
template hipError_t ReverseSequenceCudaImpl( \
hipStream_t stream, \
const T* x_data, \
const int64_t* seq_len_data, \
T* y_data, \
const int batch_size, \
const int max_seq_len, \
const int element_size, \
const bool time_major)
InstantiateReverseSequenceImpl(int64_t);
InstantiateReverseSequenceImpl(int32_t);
InstantiateReverseSequenceImpl(int16_t);
InstantiateReverseSequenceImpl(int8_t);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
hipError_t ReverseSequenceCudaImpl(
hipStream_t stream,
const T* x_data,
const int64_t* seq_len_data,
T* y_data,
const int batch_size,
const int max_seq_len,
const int element_size,
const bool time_major);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/scatter_elements.h"
#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/rocm/tensor/gather_elements.h"
#include "core/providers/rocm/tensor/gather_elements_impl.h"
#include "core/providers/rocm/tensor/scatter_elements_impl.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Scatter, kOnnxDomain, 9, 10, kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("Tind",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>()}),
ScatterElements);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(ScatterElements, kOnnxDomain, 11, 12, kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("Tind",
std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>()}),
ScatterElements);
ONNX_OPERATOR_KERNEL_EX(ScatterElements, kOnnxDomain, 13, kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("Tind", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>()}),
ScatterElements);
#define CASE_SCATTER_ELEMENTS_IMPL(type) \
case sizeof(type): { \
const type* indices_data = reinterpret_cast<const type*>(indices_data_raw); \
ORT_RETURN_IF_ERROR(ScatterElementsImpl(stream, input_data, indices_data, updates_data, output_data, args)); \
} break
template <typename T>
struct ScatterElements::ComputeImpl {
Status operator()(hipStream_t stream, const void* input_data_raw, const void* updates_data_raw,
const void* indices_data_raw, void* output_data_raw, const size_t index_element_size,
const GatherScatterElementsArgs& args) const {
typedef typename ToHipType<T>::MappedType HipT;
const HipT* input_data = reinterpret_cast<const HipT*>(input_data_raw);
const HipT* updates_data = reinterpret_cast<const HipT*>(updates_data_raw);
HipT* output_data = reinterpret_cast<HipT*>(output_data_raw);
switch (index_element_size) {
CASE_SCATTER_ELEMENTS_IMPL(int32_t);
CASE_SCATTER_ELEMENTS_IMPL(int64_t);
// should not reach here, as we validate that all relevant types are supported in the Compute method
default:
ORT_THROW("Unsupported indices element size by the ScatterElements ROCM kernel");
}
return Status::OK();
}
};
#undef CASE_SCATTER_ELEMENTS_IMPL
Status ScatterElements::ComputeInternal(OpKernelContext* context) const {
const auto* input_tensor = context->Input<Tensor>(0);
const auto& input_shape = input_tensor->Shape();
const int64_t input_size = input_shape.Size();
const int64_t input_rank = static_cast<int64_t>(input_shape.NumDimensions());
const int64_t axis = HandleNegativeAxis(axis_, input_rank);
const auto* indices_tensor = context->Input<Tensor>(1);
const auto* updates_tensor = context->Input<Tensor>(2);
if (input_tensor->DataType() != updates_tensor->DataType()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "data type is different from updates type");
}
const auto& indices_shape = indices_tensor->Shape();
auto indices_dims = indices_shape.GetDims();
auto updates_dims = updates_tensor->Shape().GetDims();
if (indices_dims.size() != updates_dims.size()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Indices and updates must have the same rank");
}
for (size_t i = 0; i < indices_dims.size(); ++i) {
if (indices_dims[i] != updates_dims[i]) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Indices vs updates dimensions differs at position=", i,
" ", indices_dims[i], " vs ", updates_dims[i]);
}
}
// Validate input shapes and ranks (invoke the static method in the CPU GatherElements kernel that hosts the shared
// checks)
ORT_RETURN_IF_ERROR(onnxruntime::GatherElements::ValidateInputShapes(input_shape, indices_shape, axis));
auto* output_tensor = context->Output(0, input_shape);
if (input_size == 0) return Status::OK();
GatherScatterElementsArgs args;
args.input_size = input_size;
args.indices_size = indices_shape.Size();
TensorShapeVector input_shape_vec = input_shape.AsShapeVector();
TensorShapeVector indices_shape_vec = indices_shape.AsShapeVector();
CoalesceDimensions(input_shape_vec, indices_shape_vec, nullptr, axis, args);
// Use the element size instead of concrete types so we can specialize fewer template functions and reduce binary size.
int dtype = GetElementType(input_tensor->DataType()->Size());
if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
ORT_THROW("Unsupported element size by the ScatterElements ROCM kernel");
}
utils::MLTypeCallDispatcher<int8_t, MLFloat16, float, double> t_disp(dtype);
return t_disp.InvokeRet<Status, ComputeImpl>(Stream(), input_tensor->DataRaw(), updates_tensor->DataRaw(),
indices_tensor->DataRaw(), output_tensor->MutableDataRaw(),
indices_tensor->DataType()->Size(), args);
}
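// Dispatch note for ComputeInternal above: GetElementType maps the tensor's byte width to a
// stand-in element type, and MLTypeCallDispatcher<int8_t, MLFloat16, float, double> covers the
// 1-, 2-, 4- and 8-byte cases with one instantiation each, since ScatterElements only copies
// elements and never inspects their values.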
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
class ScatterElements final : public RocmKernel {
public:
ScatterElements(const OpKernelInfo& info) : RocmKernel(info) {
ORT_ENFORCE(info.GetAttr<int64_t>("axis", &axis_).IsOK(),
"Missing/Invalid 'axis' attribute value");
}
~ScatterElements() = default;
Status ComputeInternal(OpKernelContext* context) const override;
private:
template <typename T>
struct ComputeImpl;
int64_t axis_;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
struct GatherScatterElementsArgs;
template <typename T, typename TIndex>
Status ScatterElementsImpl(hipStream_t stream, const T* input_data, const TIndex* indices_data, const T* updates_data,
T* output_data, const GatherScatterElementsArgs& args);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/scatter_nd.h"
#include "core/providers/rocm/tensor/scatter_nd_impl.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(ScatterND,
kOnnxDomain,
11, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.MayInplace(0, 0),
ScatterND);
ONNX_OPERATOR_KERNEL_EX(ScatterND,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.MayInplace(0, 0),
ScatterND);
Status ScatterND::ComputeInternal(OpKernelContext* context) const {
const auto* input_tensor = context->Input<Tensor>(0);
const auto* indices_tensor = context->Input<Tensor>(1);
const auto* updates_tensor = context->Input<Tensor>(2);
const auto& input_shape = input_tensor->Shape();
const auto& indices_shape = indices_tensor->Shape();
const auto& updates_shape = updates_tensor->Shape();
// Validate input shapes
ORT_RETURN_IF_ERROR(onnxruntime::ScatterND::ValidateShapes(input_shape, indices_shape, updates_shape));
auto* output_tensor = context->Output(0, input_shape);
const void* input_data = input_tensor->DataRaw();
void* output_data = output_tensor->MutableDataRaw();
size_t element_size = input_tensor->DataType()->Size();
if (input_data != output_data) {
// TODO: Run benchmarks to determine whether a dedicated copy kernel would be faster than invoking hipMemcpy.
HIP_RETURN_IF_ERROR(
hipMemcpyAsync(output_data, input_data, input_tensor->SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
}
// Bail out early
if (indices_shape.Size() == 0) {
return Status::OK();
}
auto last_index_dimension = indices_shape[indices_shape.NumDimensions() - 1];
// We need the element count (stride) and the input dim value for each dimension
// in the range [0, last_index_dimension).
// To avoid multiple GPU data transfers, we pack both into one array and copy it to the device in a single transfer.
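// Illustrative example: for input shape {2, 3, 4} and last_index_dimension == 2, the array is
// {12, 4, 2, 3} -- the strides 12 and 4 for dims 0..1 followed by the dim values 2 and 3.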
TensorPitches input_strides(input_shape);
std::vector<int64_t> element_counts_and_input_dims(last_index_dimension * 2, 0LL);
for (int64_t i = 0; i < last_index_dimension; ++i) {
element_counts_and_input_dims[i] = input_strides[i];
element_counts_and_input_dims[i + last_index_dimension] = input_shape[i];
}
RocmAsyncBuffer<int64_t> element_counts_and_input_dims_gpu(this, element_counts_and_input_dims);
ORT_RETURN_IF_ERROR(element_counts_and_input_dims_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(ScatterNDImpl(
Stream(),
output_data,
element_size,
indices_shape.Size() / static_cast<size_t>(last_index_dimension),
indices_tensor->Data<int64_t>(), // only int64_t is supported for indices as per the onnx spec
last_index_dimension,
element_counts_and_input_dims_gpu.GpuPtr(),
updates_tensor->DataRaw(),
input_shape.SizeFromDimension(last_index_dimension)));
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/scatter_nd.h"
namespace onnxruntime {
namespace rocm {
class ScatterND final : public RocmKernel {
public:
explicit ScatterND(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/scatter_nd_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/atomic/common.cuh"
namespace onnxruntime {
namespace rocm {
template <typename T>
__global__ void _ScatterNDKernel(
T* output_data,
const size_t num_indices,
const int64_t* indices_data,
const int64_t last_index_dimension,
const int64_t* element_counts_and_input_dims,
const T* updates_data,
const size_t num_updates_elements) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, num_indices);
// Compute the base offset into the output data
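// Illustrative example: with last_index_dimension == 2, strides {12, 4} and dims {2, 3},
// the index tuple {1, 2} resolves to data_offset = 1 * 12 + 2 * 4 = 20.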
int64_t data_offset = 0;
size_t indices_start = last_index_dimension * id;
size_t indices_end = indices_start + last_index_dimension;
for (size_t i = indices_start; i < indices_end; ++i) {
int64_t index = indices_data[i];
int64_t element_count_dim = element_counts_and_input_dims[i - indices_start];
int64_t dim_value = element_counts_and_input_dims[i - indices_start + last_index_dimension];
// Clamp the index if out of range
// This would have been an error in the CPU kernel, but throwing in the ROCM EP
// is hard. This is the approach taken by other frameworks for out-of-bounds indices
// in their corresponding GPU backends as well.
// index >= -dim_value && index < dim_value
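// E.g., with dim_value == 3: index 5 clamps to 2, index -1 wraps to 2, and index -7 clamps to 0.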
if (index >= 0) {
if (index >= dim_value) {
index = dim_value - 1;
}
} else {
if (index < -dim_value) {
index = 0;
} else {
index += dim_value;
}
}
data_offset += (index * element_count_dim);
}
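// Copy the contiguous block of updates for this index tuple into the output starting at data_offset.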
const T* updates_data_base = updates_data + num_updates_elements * id;
T* output_data_base = output_data + data_offset;
for (size_t i = 0; i < num_updates_elements; ++i) {
output_data_base[i] = updates_data_base[i];
}
}
Status ScatterNDImpl(
hipStream_t stream,
void* output_data,
const size_t element_size,
const size_t num_indices,
const int64_t* indices_data,
const int64_t last_index_dimension,
const int64_t* element_counts_and_input_dims,
const void* updates_data,
const size_t num_updates_elements) {
if (num_indices == 0)
return Status::OK();
// Parallelize on number of indices
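// blocksPerGrid is ceil(num_indices / maxThreadsPerBlock); e.g., assuming 256 threads per block,
// 1000 indices launch 4 blocks.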
int blocksPerGrid = static_cast<int>(ceil(static_cast<float>(num_indices) / GridDim::maxThreadsPerBlock));
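// Dispatch on element byte width rather than concrete type, so e.g. float, int32_t and uint32_t
// all share the 4-byte kernel instantiation below.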
switch (element_size) {
case sizeof(int8_t):
hipLaunchKernelGGL(_ScatterNDKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
reinterpret_cast<int8_t*>(output_data),
num_indices,
indices_data,
last_index_dimension,
element_counts_and_input_dims,
reinterpret_cast<const int8_t*>(updates_data),
num_updates_elements);
break;
case sizeof(int16_t):
hipLaunchKernelGGL(_ScatterNDKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
reinterpret_cast<int16_t*>(output_data),
num_indices,
indices_data,
last_index_dimension,
element_counts_and_input_dims,
reinterpret_cast<const int16_t*>(updates_data),
num_updates_elements);
break;
case sizeof(int32_t):
hipLaunchKernelGGL(_ScatterNDKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
reinterpret_cast<int32_t*>(output_data),
num_indices,
indices_data,
last_index_dimension,
element_counts_and_input_dims,
reinterpret_cast<const int32_t*>(updates_data),
num_updates_elements);
break;
case sizeof(int64_t):
hipLaunchKernelGGL(_ScatterNDKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
reinterpret_cast<int64_t*>(output_data),
num_indices,
indices_data,
last_index_dimension,
element_counts_and_input_dims,
reinterpret_cast<const int64_t*>(updates_data),
num_updates_elements);
break;
default:
// Shouldn't hit this
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for ScatterND operator");
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
Status ScatterNDImpl(
hipStream_t stream,
void* output_data,
const size_t element_size,
const size_t num_indices,
const int64_t* indices_data,
const int64_t last_index_dimension,
const int64_t* element_counts_and_input_dims,
const void* updates_data,
const size_t num_updates_elements);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "sequence_op.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_KERNEL_EX(
SequenceAt,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("I", std::vector<MLDataType>{
DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>()}),
SequenceAt);
ONNX_OPERATOR_KERNEL_EX(
SequenceConstruct,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes()),
SequenceConstruct);
ONNX_OPERATOR_KERNEL_EX(
SequenceEmpty,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes()),
SequenceEmpty);
ONNX_OPERATOR_KERNEL_EX(
SequenceLength,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.OutputMemoryType(OrtMemTypeCPUInput, 0)
.TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
.TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),
SequenceLength);
ONNX_OPERATOR_KERNEL_EX(
ConcatFromSequence,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes()),
ConcatFromSequence);
ONNX_OPERATOR_KERNEL_EX(
SequenceErase,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
.TypeConstraint("I", std::vector<MLDataType>{
DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>()}),
SequenceErase);
ONNX_OPERATOR_KERNEL_EX(
SequenceInsert,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 2)
.TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
.TypeConstraint("I", std::vector<MLDataType>{
DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>()}),
SequenceInsert);
} // namespace rocm
} // namespace onnxruntime