Commit 1a91fcc2 authored by gaoqiong

add the files required by dtk

parent a144865d
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/tensor/concat.h"
#include "core/providers/rocm/tensor/concat_impl.h"
namespace onnxruntime {
namespace rocm {
class SequenceAt final : public RocmKernel {
public:
SequenceAt(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override {
const TensorSeq* X = context->Input<TensorSeq>(0);
const Tensor* I = context->Input<Tensor>(1);
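// The position input may be int32 or int64; per the ONNX spec a negative index counts back from the
// end of the sequence (e.g. -1 selects the last tensor).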
int64_t idx = -1;
if (I->IsDataType<int32_t>()) {
idx = static_cast<int64_t>(I->Data<int32_t>()[0]);
} else {
idx = I->Data<int64_t>()[0];
}
int64_t sequence_size = static_cast<int64_t>(X->Size());
if (idx < 0) {
idx = sequence_size + idx;
}
ORT_ENFORCE(idx >= 0 && idx < sequence_size, "SequenceAt GPU: Invalid sequence index.");
const Tensor& source_tensor = X->Get(idx);
auto source_type = source_tensor.DataType();
const void* source_addr = source_tensor.DataRaw(source_type);
Tensor* target_tensor = context->Output(0, source_tensor.Shape());
void* target_addr = target_tensor->MutableDataRaw(source_type);
if (source_addr != target_addr) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_addr,
source_addr,
source_tensor.SizeInBytes(),
hipMemcpyDeviceToDevice, Stream()));
}
return Status::OK();
}
}; // SequenceAt
class SequenceConstruct final : public RocmKernel {
public:
SequenceConstruct(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override {
auto num_inputs = Node().InputArgCount().front();
ORT_ENFORCE(num_inputs >= 1, "Must have 1 or more inputs");
MLDataType first_dtype = context->Input<Tensor>(0)->DataType();
AllocatorPtr alloc;
ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
"SequenceConstruct GPU: Unable to get an allocator.");
TensorSeq* Y = context->Output<TensorSeq>(0);
Y->SetType(first_dtype);
Y->Reserve(num_inputs);
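// Copy every input tensor into freshly allocated device memory and append it to the output sequence;
// the copies are issued asynchronously on this kernel's stream.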
for (int input_idx = 0; input_idx < num_inputs; ++input_idx) {
const auto* source_tensor = context->Input<Tensor>(input_idx);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor->DataType(),
source_tensor->Shape(), alloc);
HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor->DataRaw(),
source_tensor->SizeInBytes(),
hipMemcpyDeviceToDevice, Stream()));
Y->Add(std::move(*target_tensor)); // Add will check for type consistency
}
return Status::OK();
}
}; // SequenceConstruct
class SequenceEmpty final : public RocmKernel {
public:
SequenceEmpty(const OpKernelInfo& info) : RocmKernel(info) {
if (!info.GetAttr("dtype", &dtype_).IsOK()) {
dtype_ = ONNX_NAMESPACE::TensorProto_DataType_FLOAT;
}
}
Status ComputeInternal(OpKernelContext* context) const override {
TensorSeq* Y = context->Output<TensorSeq>(0);
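// Map the ONNX dtype attribute to an element type; the shared-provider build goes through a
// different DataTypeImpl helper than the statically linked build.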
#ifdef SHARED_PROVIDER
Y->SetType(DataTypeImpl::GetTypeFromOnnxType(static_cast<int>(dtype_)));
#else
Y->SetType(DataTypeImpl::TensorTypeFromONNXEnum(static_cast<int>(dtype_))->GetElementType());
#endif
return Status::OK();
}
private:
int64_t dtype_{};
}; // SequenceEmpty
class SequenceLength final : public RocmKernel {
public:
SequenceLength(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override {
const TensorSeq* X = context->Input<TensorSeq>(0);
Tensor* Y = context->Output(0, {});
Y->MutableData<int64_t>()[0] = static_cast<int64_t>(X->Size());
return Status::OK();
}
}; // SequenceLength
class ConcatFromSequence final : public RocmKernel, public ConcatBase {
public:
ConcatFromSequence(const OpKernelInfo& info) : RocmKernel(info), ConcatBase(info, true) {}
Status ComputeInternal(OpKernelContext* context) const override {
const TensorSeq* X = context->Input<TensorSeq>(0);
int64_t input_count = static_cast<int64_t>(X->Size());
InlinedTensorsVector input_tensors;
for (int64_t i = 0; i < input_count; ++i) {
input_tensors.push_back(&X->Get(i));
}
Prepare p;
ORT_RETURN_IF_ERROR(PrepareForCompute(context, input_tensors, p));
if (0 == p.output_num_elements) {
return Status::OK();
}
int64_t initial_output_offset = 0;
auto element_bytes = p.output_tensor->DataType()->Size();
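// Copy each input in axis-pitch sized chunks so that chunks from different inputs interleave
// correctly along the concatenation axis of the output.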
for (int input_index = 0; input_index < input_count; input_index++) {
const auto& prep = p.inputs[input_index];
if (prep.num_elements == 0) {
continue;
}
auto input_axis_pitch = prep.axis_pitch;
const uint8_t* input = static_cast<const uint8_t*>(prep.tensor->DataRaw());
auto input_size = prep.num_elements;
uint8_t* output = static_cast<uint8_t*>(p.output_tensor->MutableDataRaw());
int64_t cur_out_offset = 0;
int64_t cur_in_offset = 0;
for (size_t idx_copy = 0, end = input_size / input_axis_pitch; idx_copy < end; ++idx_copy) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(
output + (initial_output_offset + cur_out_offset) * element_bytes,
input + cur_in_offset * element_bytes, input_axis_pitch * element_bytes,
hipMemcpyDeviceToDevice, Stream()));
cur_out_offset += p.output_axis_pitch;
cur_in_offset += input_axis_pitch;
}
initial_output_offset += input_axis_pitch;
}
return Status::OK();
}
}; // ConcatFromSequence
class SequenceErase final : public RocmKernel {
public:
SequenceErase(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override {
const TensorSeq* X = context->Input<TensorSeq>(0);
int64_t X_size = static_cast<int64_t>(X->Size());
int64_t idx = X_size - 1;
const Tensor* I = context->Input<Tensor>(1);
if (I != nullptr) {
if (I->IsDataType<int32_t>()) {
idx = static_cast<int64_t>(I->Data<int32_t>()[0]);
} else {
idx = I->Data<int64_t>()[0];
}
if (idx < 0) {
idx = X_size + idx;
}
ORT_ENFORCE(idx >= 0 && idx < X_size, "SequenceErase GPU: Invalid sequence index.");
}
AllocatorPtr alloc;
ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
"SequenceErase GPU: Unable to get an allocator.");
TensorSeq* Y = context->Output<TensorSeq>(0);
Y->SetType(X->DataType());
Y->Reserve(X_size - 1);
for (int64_t i = 0; i < X_size; ++i) {
if (i == idx) {
continue;
}
const Tensor& source_tensor = X->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
source_tensor.Shape(), alloc);
HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor.DataRaw(),
source_tensor.SizeInBytes(),
hipMemcpyDeviceToDevice, Stream()));
Y->Add(std::move(*target_tensor)); // Add will check for type consistency
}
return Status::OK();
}
}; // SequenceErase
class SequenceInsert final : public RocmKernel {
public:
SequenceInsert(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override {
const TensorSeq* S = context->Input<TensorSeq>(0);
int64_t S_size = static_cast<int64_t>(S->Size());
int64_t idx = S_size;
const Tensor* I = context->Input<Tensor>(2);
if (I != nullptr) {
if (I->IsDataType<int32_t>()) {
idx = static_cast<int64_t>(I->Data<int32_t>()[0]);
} else {
idx = I->Data<int64_t>()[0];
}
if (idx < 0) {
idx = S_size + idx;
}
ORT_ENFORCE(idx >= 0 && idx <= S_size, "SequenceInsert GPU: Invalid sequence index.");
}
const Tensor* X = context->Input<Tensor>(1);
AllocatorPtr alloc;
ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
"SequenceInsert GPU: Unable to get an allocator.");
std::unique_ptr<Tensor> tensor_to_be_inserted = Tensor::Create(X->DataType(),
X->Shape(), alloc);
HIP_RETURN_IF_ERROR(hipMemcpyAsync(tensor_to_be_inserted->MutableDataRaw(),
X->DataRaw(), X->SizeInBytes(),
hipMemcpyDeviceToDevice, Stream()));
TensorSeq* Y = context->Output<TensorSeq>(0);
Y->SetType(S->DataType());
Y->Reserve(S_size + 1);
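// Emit the new tensor at position idx while copying the original sequence; when idx equals the
// original length the branch inside the loop never fires and the tensor is appended after the loop.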
for (int64_t i = 0; i < S_size; ++i) {
if (i == idx) {
Y->Add(std::move(*tensor_to_be_inserted)); // Add will check for type consistency
}
const Tensor& source_tensor = S->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
source_tensor.Shape(), alloc);
HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor.DataRaw(),
source_tensor.SizeInBytes(),
hipMemcpyDeviceToDevice, Stream()));
Y->Add(std::move(*target_tensor)); // Add will check for type consistency
}
if (idx == S_size) {
Y->Add(std::move(*tensor_to_be_inserted)); // Add will check for type consistency
}
return Status::OK();
}
}; // SequenceInsert
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/cpu/tensor/shape_op.h"
#include "core/providers/rocm/rocm_fwd.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Shape,
kOnnxDomain,
1, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
// properly force CPU/GPU synch inside the kernel
.OutputMemoryType(OrtMemTypeCPUInput, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
Shape);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Shape,
kOnnxDomain,
13, 14,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
// properly force CPU/GPU synch inside the kernel
.OutputMemoryType(OrtMemTypeCPUInput, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
Shape);
ONNX_OPERATOR_KERNEL_EX(
Shape,
kOnnxDomain,
15,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
// properly force CPU/GPU synch inside the kernel
.OutputMemoryType(OrtMemTypeCPUInput, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
Shape);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/cpu/tensor/size.h"
#include "core/providers/rocm/rocm_fwd.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Size,
kOnnxDomain,
1, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.OutputMemoryType(OrtMemTypeCPUInput, 0)
.TypeConstraint("T", DataTypeImpl::AllTensorTypes())
.TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
Size);
ONNX_OPERATOR_KERNEL_EX(
Size,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
// properly force CPU/GPU synch inside the kernel
.OutputMemoryType(OrtMemTypeCPUInput, 0)
.TypeConstraint("T", DataTypeImpl::AllTensorTypes())
.TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
Size);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/slice.h"
#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/rocm/tensor/slice_impl.h"
namespace onnxruntime {
namespace rocm {
// this really doesn't need to be a typed registration as the indices come from attributes and can only be int64.
// leaving as-is to maintain the original (incorrect) registration setup (pre 02/2022).
#define REGISTER_VERSIONED_TYPED_SLICE(TIND) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Slice, \
kOnnxDomain, \
1, 9, \
TIND, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), \
Slice<false>);
REGISTER_VERSIONED_TYPED_SLICE(int64_t)
#define REGISTER_V10_TYPED_SLICE(TIND) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Slice, \
kOnnxDomain, \
10, 10, \
TIND, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.InputMemoryType(OrtMemTypeCPUInput, 3) \
.InputMemoryType(OrtMemTypeCPUInput, 4) \
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) \
.TypeConstraint("Tind", DataTypeImpl::GetTensorType<TIND>()), \
Slice<true>);
REGISTER_V10_TYPED_SLICE(int32_t)
REGISTER_V10_TYPED_SLICE(int64_t)
#define REGISTER_V12_TYPED_SLICE(TIND) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Slice, \
kOnnxDomain, \
11, 12, \
TIND, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.InputMemoryType(OrtMemTypeCPUInput, 3) \
.InputMemoryType(OrtMemTypeCPUInput, 4) \
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) \
.TypeConstraint("Tind", DataTypeImpl::GetTensorType<TIND>()), \
Slice<true>);
REGISTER_V12_TYPED_SLICE(int32_t)
REGISTER_V12_TYPED_SLICE(int64_t)
#define REGISTER_V13_TYPED_SLICE(TIND) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Slice, \
kOnnxDomain, \
13, \
TIND, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.InputMemoryType(OrtMemTypeCPUInput, 3) \
.InputMemoryType(OrtMemTypeCPUInput, 4) \
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) \
.TypeConstraint("Tind", DataTypeImpl::GetTensorType<TIND>()), \
Slice<true>);
REGISTER_V13_TYPED_SLICE(int32_t)
REGISTER_V13_TYPED_SLICE(int64_t)
static Status SliceImpCore(hipStream_t stream,
const void* input_data, void* output_data,
size_t element_size, size_t dimension_count,
const TArray<int64_t>& starts_buffer, const TArray<int64_t>& steps_buffer,
const TArray<int64_t>& input_strides, const TArray<fast_divmod>& output_strides,
const TensorShape& output_shape) {
if (output_shape.Size() == 0) {
return Status::OK();
}
return SliceImpl(stream,
element_size,
gsl::narrow_cast<int32_t>(dimension_count),
starts_buffer,
steps_buffer,
input_strides,
output_strides,
input_data,
output_data,
output_shape.Size());
}
namespace SliceRocm {
static Status ComputeSliceStrides(const TensorShape& input_shape, TArray<int64_t>& input_strides,
TArray<fast_divmod>& output_strides,
SliceOp::PrepareForComputeMetadata& compute_metadata) {
// If we were able to coalesce the input and output shapes, use the new shapes to compute the strides.
const auto input_dimensions = input_shape.GetDims();
size_t rank = compute_metadata.p_flattened_input_dims_ ? compute_metadata.p_flattened_input_dims_->size()
: input_dimensions.size();
input_strides.SetSize(gsl::narrow_cast<int32_t>(rank));
const gsl::span<int64_t> input_strides_span = gsl::make_span(input_strides.Data(), input_strides.Size());
if (compute_metadata.p_flattened_input_dims_) {
ORT_ENFORCE(TensorPitches::Calculate(input_strides_span, compute_metadata.flattened_input_dims_));
} else {
ORT_ENFORCE(TensorPitches::Calculate(input_strides_span, input_dimensions));
}
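// Output strides are stored as fast_divmod so the GPU kernel can recover per-dimension coordinates
// with cheap integer division.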
const auto output_dims =
gsl::make_span(compute_metadata.p_flattened_output_dims_ != nullptr ? compute_metadata.flattened_output_dims_
: compute_metadata.output_dims_);
TensorPitches original_output_strides(output_dims);
output_strides.SetSize(gsl::narrow_cast<int32_t>(original_output_strides.size()));
for (int32_t i = 0, limit = static_cast<int32_t>(original_output_strides.size()); i < limit; ++i) {
output_strides[i] = fast_divmod(gsl::narrow_cast<int>(original_output_strides[i]));
}
return Status::OK();
}
Status Impl(hipStream_t stream,
const void* input_data,
const TensorShape& input_shape,
void* output_data,
SliceOp::PrepareForComputeMetadata& compute_metadata,
size_t element_size) {
const auto input_dimensions = input_shape.GetDims();
size_t dimension_count = input_dimensions.size();
TArray<int64_t> starts_buffer(compute_metadata.starts_);
TArray<int64_t> steps_buffer(compute_metadata.steps_);
TArray<int64_t> input_strides;
TArray<fast_divmod> output_strides;
ORT_RETURN_IF_ERROR(ComputeSliceStrides(input_shape, input_strides, output_strides, compute_metadata));
TensorShape output_shape(compute_metadata.output_dims_);
ORT_RETURN_IF_ERROR(SliceImpCore(stream,
input_data,
output_data,
element_size,
gsl::narrow_cast<int32_t>(dimension_count),
starts_buffer,
steps_buffer,
input_strides,
output_strides,
output_shape));
return Status::OK();
}
} // namespace SliceRocm
template <bool dynamic>
Status Slice<dynamic>::ComputeInternal(OpKernelContext* ctx) const {
const Tensor* input_tensor = GetSlicedOrUnslicedTensor(ctx);
ORT_ENFORCE(nullptr != input_tensor);
const auto& input_shape = input_tensor->Shape();
const auto input_dimensions = input_shape.GetDims();
if (input_dimensions.empty()) return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot slice scalars");
SliceOp::PrepareForComputeMetadata compute_metadata(input_dimensions);
if (dynamic) {
TensorShapeVector input_starts, input_ends, input_axes, input_steps;
ORT_RETURN_IF_ERROR(FillInputVectors(ctx, input_starts, input_ends, input_axes, input_steps));
ORT_RETURN_IF_ERROR(PrepareForCompute(input_starts, input_ends, input_axes, input_steps, compute_metadata));
} else {
ORT_RETURN_IF_ERROR(PrepareForCompute(StartsAttribute(), EndsAttribute(), AxesAttribute(), compute_metadata));
}
TensorShape output_shape(compute_metadata.output_dims_);
TArray<int64_t> starts_buffer(compute_metadata.starts_);
TArray<int64_t> steps_buffer(compute_metadata.steps_);
TArray<int64_t> input_strides;
TArray<fast_divmod> output_strides;
ORT_RETURN_IF_ERROR(SliceRocm::ComputeSliceStrides(input_shape, input_strides, output_strides, compute_metadata));
// It may seem that we could use `SliceImpCore()` directly, but we need to go through `CallSliceImp()` because
// `ComputeInternal()` is shared between the inferencing and training kernels and the training kernel overrides
// `CallSliceImp()`.
ORT_RETURN_IF_ERROR(CallSliceImp(input_tensor->DataType()->Size(), input_dimensions.size(), starts_buffer,
steps_buffer, input_strides,
output_strides, ctx,
output_shape));
return Status::OK();
}
template <bool dynamic>
const Tensor* Slice<dynamic>::GetSlicedOrUnslicedTensor(OpKernelContext* ctx) const {
return ctx->Input<Tensor>(0);
}
template <bool dynamic>
Status Slice<dynamic>::FillInputVectors(OpKernelContext* ctx, TensorShapeVector& input_starts,
TensorShapeVector& input_ends, TensorShapeVector& input_axes,
TensorShapeVector& input_steps) const {
return FillVectorsFromInput(*ctx->Input<Tensor>(1), *ctx->Input<Tensor>(2), ctx->Input<Tensor>(3),
ctx->Input<Tensor>(4), input_starts, input_ends, input_axes, input_steps);
}
template <bool dynamic>
Status Slice<dynamic>::CallSliceImp(size_t element_size, size_t dimension_count, const TArray<int64_t>& starts_buffer,
const TArray<int64_t>& steps_buffer, const TArray<int64_t>& input_strides,
const TArray<fast_divmod>& output_strides, OpKernelContext* ctx,
const TensorShape& output_shape) const {
const auto* input_tensor = ctx->Input<Tensor>(0);
auto* output_tensor = ctx->Output(0, output_shape);
return SliceImpCore(Stream(),
input_tensor->DataRaw(),
output_tensor->MutableDataRaw(),
element_size,
gsl::narrow_cast<int32_t>(dimension_count),
starts_buffer,
steps_buffer,
input_strides,
output_strides,
output_shape);
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/slice.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
namespace SliceRocm {
Status Impl(hipStream_t stream,
const void* input_data,
const TensorShape& input_shape,
void* output_data,
SliceOp::PrepareForComputeMetadata& prepare_metadata,
size_t element_size);
} // namespace SliceRocm
template <bool dynamic>
class Slice : public RocmKernel, public SliceBase {
public:
Slice(const OpKernelInfo& info) : RocmKernel(info), SliceBase(info, dynamic) {}
Status ComputeInternal(OpKernelContext* ctx) const override;
private:
virtual const Tensor* GetSlicedOrUnslicedTensor(OpKernelContext* ctx) const;
virtual Status FillInputVectors(OpKernelContext* ctx, TensorShapeVector& input_starts,
TensorShapeVector& input_ends, TensorShapeVector& input_axes,
TensorShapeVector& input_steps) const;
virtual Status CallSliceImp(size_t element_size, size_t dimension_count, const TArray<int64_t>& starts_buffer,
const TArray<int64_t>& steps_buffer, const TArray<int64_t>& input_strides,
const TArray<fast_divmod>& output_strides, OpKernelContext* ctx,
const TensorShape& output_shape) const;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/tensor/slice_impl.h"
namespace onnxruntime {
namespace rocm {
namespace {
#ifdef USE_ROCM
constexpr int kNumElementsPerThread = 2;
constexpr int kNumThreadsPerBlock = 512;
#else
constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
#endif
} // namespace
template <bool is_grad, int DIMS, typename T>
__global__ void _SliceKernel(const TArray<int64_t> starts, const TArray<int64_t> steps,
const TArray<int64_t> input_strides, const TArray<fast_divmod> output_strides,
const T* input_data, T* output_data, const HIP_LONG N) {
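// In the forward case the kernel gathers the strided input elements into a dense output; when is_grad
// is true the mapping is reversed and the incoming gradient is scattered back into the sliced
// positions of the (larger) output.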
HIP_LONG start = kNumElementsPerThread * kNumThreadsPerBlock * blockIdx.x + threadIdx.x;
T values[kNumElementsPerThread];
HIP_LONG id;
if (is_grad) {
id = start;
#pragma unroll
for (int i = 0; i < kNumElementsPerThread; ++i) {
if (id < N) {
values[i] = input_data[id];
id += kNumThreadsPerBlock;
}
}
}
id = start;
#pragma unroll
for (int i = 0; i < kNumElementsPerThread; ++i) {
if (id < N) {
HIP_LONG input_index = 0;
int div;
int mod = id;
int dim = 0;
#pragma unroll
for (; dim < DIMS - 1; ++dim) {
output_strides[dim].divmod(mod, div, mod);
input_index += (starts[dim] + div * steps[dim]) * input_strides[dim];
}
input_index += starts[dim] + mod * steps[dim];
if (is_grad) {
output_data[input_index] = values[i];
} else {
values[i] = input_data[input_index];
}
id += kNumThreadsPerBlock;
}
}
if (!is_grad) {
id = start;
#pragma unroll
for (int i = 0; i < kNumElementsPerThread; ++i) {
if (id < N) {
output_data[id] = values[i];
id += kNumThreadsPerBlock;
}
}
}
}
template <bool is_grad>
Status SliceImplEx(hipStream_t stream, const size_t element_size, const int32_t dimension_count,
const TArray<int64_t>& starts, const TArray<int64_t>& steps, const TArray<int64_t>& input_strides,
const TArray<fast_divmod>& output_strides, const void* input_data, void* output_data,
const size_t N) {
int blocksPerGrid = static_cast<int>(CeilDiv(N, kNumThreadsPerBlock * kNumElementsPerThread));
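// Dispatch on element size and rank; each (size, rank) pair instantiates a dedicated kernel so the
// per-dimension loop can be fully unrolled.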
switch (element_size) {
#define HANDLE_DIMS(ELEMENT_TYPE, DIMS) \
case DIMS: { \
hipLaunchKernelGGL(HIP_KERNEL_NAME(_SliceKernel<is_grad, DIMS, ELEMENT_TYPE>), blocksPerGrid, kNumThreadsPerBlock, 0, stream, \
starts, steps, input_strides, output_strides, \
reinterpret_cast<const ToHipType<ELEMENT_TYPE>::MappedType*>(input_data), \
reinterpret_cast<ToHipType<ELEMENT_TYPE>::MappedType*>(output_data), (HIP_LONG)N); \
} break
#define HANDLE_ELEMENT_TYPE(ELEMENT_TYPE) \
case sizeof(ELEMENT_TYPE): { \
switch (dimension_count) { \
HANDLE_DIMS(ELEMENT_TYPE, 1); \
HANDLE_DIMS(ELEMENT_TYPE, 2); \
HANDLE_DIMS(ELEMENT_TYPE, 3); \
HANDLE_DIMS(ELEMENT_TYPE, 4); \
HANDLE_DIMS(ELEMENT_TYPE, 5); \
HANDLE_DIMS(ELEMENT_TYPE, 6); \
HANDLE_DIMS(ELEMENT_TYPE, 7); \
HANDLE_DIMS(ELEMENT_TYPE, 8); \
} \
} break
HANDLE_ELEMENT_TYPE(int8_t);
HANDLE_ELEMENT_TYPE(int16_t);
HANDLE_ELEMENT_TYPE(int32_t);
HANDLE_ELEMENT_TYPE(int64_t);
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Slice operator");
#undef HANDLE_ELEMENT_TYPE
#undef HANDLE_DIMS
}
return Status::OK();
}
Status SliceImpl(hipStream_t stream, const size_t element_size, const int32_t dimension_count,
const TArray<int64_t>& starts, const TArray<int64_t>& steps, const TArray<int64_t>& input_strides,
const TArray<fast_divmod>& output_strides, const void* input_data, void* output_data, const size_t N) {
return SliceImplEx<false>(stream, element_size, dimension_count, starts, steps, input_strides, output_strides,
input_data, output_data, N);
}
#ifdef ENABLE_TRAINING
Status SliceImplGrad(hipStream_t stream, const size_t element_size, const int32_t dimension_count,
const TArray<int64_t>& starts, const TArray<int64_t>& steps, const TArray<int64_t>& input_strides,
const TArray<fast_divmod>& output_strides, const void* input_data, void* output_data,
const size_t N) {
return SliceImplEx<true>(stream, element_size, dimension_count, starts, steps, input_strides, output_strides,
input_data, output_data, N);
}
#endif // ENABLE_TRAINING
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
Status SliceImpl(hipStream_t stream,
const size_t element_size,
const int32_t dimension_count,
const TArray<int64_t>& starts,
const TArray<int64_t>& steps,
const TArray<int64_t>& input_strides,
const TArray<fast_divmod>& output_strides,
const void* input_data,
void* output_data,
const size_t N);
#ifdef ENABLE_TRAINING
Status SliceImplGrad(hipStream_t stream,
const size_t element_size,
const int32_t dimension_count,
const TArray<int64_t>& starts,
const TArray<int64_t>& steps,
const TArray<int64_t>& input_strides,
const TArray<fast_divmod>& output_strides,
const void* input_data,
void* output_data,
const size_t N);
#endif // ENABLE_TRAINING
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <vector>
#include "space_depth_ops.h"
#include "core/providers/rocm/tensor/transpose.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
SpaceToDepth,
kOnnxDomain,
1,
12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T",
{DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<MLFloat16>()}),
SpaceToDepth);
ONNX_OPERATOR_KERNEL_EX(
SpaceToDepth,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T",
{DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<MLFloat16>()}),
SpaceToDepth);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
DepthToSpace,
kOnnxDomain,
1,
10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T",
{DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<MLFloat16>()}),
DepthToSpace);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
DepthToSpace,
kOnnxDomain,
11,
12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T",
{DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<MLFloat16>()}),
DepthToSpace);
ONNX_OPERATOR_KERNEL_EX(
DepthToSpace,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T",
{DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<MLFloat16>()}),
DepthToSpace);
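// SpaceToDepth and DepthToSpace are both implemented as a 6-D transpose over a virtual reshape of
// the 4-D input.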
static Status SpaceDepthOpCudaImpl(const hipDeviceProp_t& prop,
hipStream_t stream,
const rocblas_handle rocblas_handle,
const Tensor& input, Tensor& output,
const std::vector<size_t>& permutation,
const int64_t batch_size,
const int64_t in_dim1, const int64_t in_dim2, const int64_t in_dim3,
const int64_t in_dim4, const int64_t in_dim5,
const TensorShape& virtual_output_shape) {
TensorShape virtual_input_shape{batch_size, in_dim1, in_dim2, in_dim3, in_dim4, in_dim5};
return Transpose::DoTranspose(prop, stream, rocblas_handle, permutation, input, output,
&virtual_input_shape, &virtual_output_shape);
}
Status SpaceToDepth::ComputeInternal(OpKernelContext* context) const {
const auto* tensor_pointer = context->Input<Tensor>(0);
if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
const Tensor& input = *tensor_pointer;
int64_t batch = -1;
int64_t input_depth = -1;
int64_t input_height = -1;
int64_t input_width = -1;
int64_t output_depth = -1;
int64_t output_height = -1;
int64_t output_width = -1;
ORT_RETURN_IF_ERROR(InputValidationsAndOutputDimsCalc(input,
batch,
input_depth, input_height, input_width,
output_depth, output_height, output_width,
true));
// We use the "actual" output shape to construct the output tensor
Tensor& output = *context->Output(0, {batch, output_depth, output_height, output_width});
// We will pass in the "virtual" output shape to be used by DoTranspose() in SpaceDepthOpCudaImpl(...)
TensorShape virtual_output_shape{batch, blocksize_, blocksize_, input_depth,
input_height / blocksize_, input_width / blocksize_};
std::vector<size_t> permutation = {0, 3, 5, 1, 2, 4};
ORT_RETURN_IF_ERROR(SpaceDepthOpCudaImpl(GetDeviceProp(), Stream(), RocblasHandle(), input, output, permutation, batch,
input_depth, input_height / blocksize_, blocksize_, input_width / blocksize_, blocksize_,
virtual_output_shape));
return Status::OK();
}
Status DepthToSpace::ComputeInternal(OpKernelContext* context) const {
const auto* tensor_pointer = context->Input<Tensor>(0);
if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
const Tensor& input = *tensor_pointer;
int64_t batch = -1;
int64_t input_depth = -1;
int64_t input_height = -1;
int64_t input_width = -1;
int64_t output_depth = -1;
int64_t output_height = -1;
int64_t output_width = -1;
ORT_RETURN_IF_ERROR(InputValidationsAndOutputDimsCalc(input,
batch,
input_depth, input_height, input_width,
output_depth, output_height, output_width,
false));
// We use the "actual" output shape to construct the output tensor
Tensor& output = *context->Output(0, {batch, output_depth, output_height, output_width});
// We will pass in the "virtual" output shape to be used by DoTranspose() in SpaceDepthOpCudaImpl(...)
TensorShape virtual_output_shape{batch, input_depth / blocksize_ / blocksize_,
input_height, blocksize_, input_width, blocksize_};
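// DCR and CRD modes differ only in the transpose permutation and in which virtual dimensions carry
// the block size.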
std::vector<size_t> permutation;
permutation.reserve(6);
permutation.push_back(0);
if (is_dcr_) {
permutation.push_back(3);
permutation.push_back(4);
permutation.push_back(1);
permutation.push_back(5);
permutation.push_back(2);
} else {
permutation.push_back(1);
permutation.push_back(4);
permutation.push_back(2);
permutation.push_back(5);
permutation.push_back(3);
}
int64_t dim1 = is_dcr_ ? blocksize_ : input_depth / blocksize_ / blocksize_;
int64_t dim3 = is_dcr_ ? input_depth / blocksize_ / blocksize_ : blocksize_;
ORT_RETURN_IF_ERROR(SpaceDepthOpCudaImpl(GetDeviceProp(), Stream(), RocblasHandle(), input, output,
permutation,
batch,
dim1, blocksize_, dim3, input_height, input_width,
virtual_output_shape));
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/space_depth_ops.h"
namespace onnxruntime {
namespace rocm {
class SpaceToDepth final : public RocmKernel, SpaceDepthBase {
public:
explicit SpaceToDepth(const OpKernelInfo& info) : RocmKernel(info), SpaceDepthBase(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;
};
class DepthToSpace final : public RocmKernel, SpaceDepthBase {
public:
explicit DepthToSpace(const OpKernelInfo& info) : RocmKernel(info), SpaceDepthBase(info) {
std::string mode;
// if the 'mode' attribute doesn't exist, it is either the default "DCR" mode
// or an opset < 11 model, for which "DCR" is the only supported mode
if (info.GetAttr("mode", &mode).IsOK()) {
if (mode == "CRD")
is_dcr_ = false;
else if (mode != "DCR")
ORT_THROW("DepthToSpace op: only 'DCR' and 'CRD' modes are supported");
}
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
bool is_dcr_ = true;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/split.h"
#include "core/providers/rocm/tensor/split_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Split,
kOnnxDomain,
2, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Split);
// explicitly supports negative axis
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Split,
kOnnxDomain,
11, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Split);
// explicitly supports 'split' as optional input
ONNX_OPERATOR_KERNEL_EX(Split,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Split);
Status Split::ComputeInternal(OpKernelContext* ctx) const {
const Tensor* input_tensor = ctx->Input<Tensor>(0);
ORT_ENFORCE(input_tensor);
auto& input_shape = input_tensor->Shape();
auto num_outputs = ctx->OutputCount();
int64_t axis = HandleNegativeAxis(axis_, input_shape.NumDimensions());
int before_dims = 0;
int block_size_including_axis_dim = 0;
int block_size_inside_axis_dim = 0;
std::vector<int64_t> split_sizes(num_outputs);
const Tensor* split_tensor = ctx->Input<Tensor>(1);
if (split_tensor) {
ORT_ENFORCE(split_tensor->Shape().NumDimensions() == 1, "A split tensor must be a vector tensor.");
auto nDims = static_cast<size_t>(split_tensor->Shape()[0]);
const int64_t* data = split_tensor->Data<int64_t>();
split_sizes.assign(data, data + nDims);
} else {
split_sizes.assign(split_sizes_.begin(), split_sizes_.end());
}
ORT_RETURN_IF_ERROR(PrepareForCompute(input_shape,
num_outputs,
axis,
before_dims,
block_size_including_axis_dim,
block_size_inside_axis_dim,
split_sizes));
auto input_data = input_tensor->DataRaw();
auto input_dims = input_shape.GetDims();
auto output_dimensions{input_shape.AsShapeVector()};
RocmAsyncBuffer<void*> output_ptr(this, num_outputs);
gsl::span<void*> output_ptr_span = output_ptr.CpuSpan();
TensorShapeVector axis_dimension_input_output_mapping(input_dims[axis]);
int index = 0;
for (int i = 0; i < num_outputs; ++i) {
// update size of dimension for axis we're splitting on
auto split_size = gsl::narrow<int>(split_sizes[i]);
output_dimensions[axis] = split_size;
Tensor* output = ctx->Output(i, TensorShape{output_dimensions});
auto output_data = output->MutableDataRaw();
output_ptr_span[i] = output_data;
for (int j = 0; j < split_size; ++j) {
axis_dimension_input_output_mapping.at(index++) = i;
}
}
if (input_tensor->Shape().Size() <= 0) return Status::OK();
size_t element_size = input_tensor->DataType()->Size();
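// Fast path: when every split has the same size the output index is a simple divmod of the axis
// block; otherwise fall back to the general kernel that uses prefix sums of the split sizes.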
if (std::all_of(split_sizes.begin(), split_sizes.end(), [&](int64_t size) { return size == split_sizes[0]; })) {
if (num_outputs <= 32) {
TArray<void*, 32> output_ptr_array(num_outputs);
for (int i = 0; i < num_outputs; ++i) output_ptr_array[i] = output_ptr_span[i];
ORT_RETURN_IF_ERROR(SplitSameSplitDimImpl(Stream(), element_size, block_size_including_axis_dim,
block_size_inside_axis_dim, split_sizes[0], num_outputs, input_data,
output_ptr_array, static_cast<size_t>(input_shape.Size())));
} else {
ORT_RETURN_IF_ERROR(output_ptr.CopyToGpu());
ORT_RETURN_IF_ERROR(SplitSameSplitDimImpl(Stream(), element_size, block_size_including_axis_dim,
block_size_inside_axis_dim, split_sizes[0], num_outputs, input_data,
output_ptr.GpuPtr(), static_cast<size_t>(input_shape.Size())));
}
} else {
ORT_RETURN_IF_ERROR(output_ptr.CopyToGpu());
RocmAsyncBuffer<int64_t> split_sizes_gpu(this, split_sizes);
ORT_RETURN_IF_ERROR(split_sizes_gpu.CopyToGpu());
std::vector<int64_t> split_sizes_range(split_sizes);
for (size_t i = 1; i < split_sizes_range.size(); ++i) {
split_sizes_range[i] += split_sizes_range[i - 1];
}
RocmAsyncBuffer<int64_t> split_sizes_range_gpu(this, split_sizes_range);
ORT_RETURN_IF_ERROR(split_sizes_range_gpu.CopyToGpu());
RocmAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, axis_dimension_input_output_mapping);
ORT_RETURN_IF_ERROR(axis_dimension_input_output_mapping_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(SplitImpl(Stream(), element_size, block_size_including_axis_dim, block_size_inside_axis_dim,
split_sizes_gpu.GpuPtr(), split_sizes_range_gpu.GpuPtr(),
axis_dimension_input_output_mapping_gpu.GpuPtr(), num_outputs, input_data,
output_ptr.GpuPtr(), static_cast<size_t>(input_shape.Size())));
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/split.h"
namespace onnxruntime {
namespace rocm {
class Split final : public RocmKernel, public SplitBase {
public:
Split(const OpKernelInfo& info) : RocmKernel(info), SplitBase(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/split_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
namespace rocm {
namespace {
#ifdef USE_ROCM
constexpr int kNumElementsPerThread = 2;
constexpr int kNumThreadsPerBlock = 512;
#else
constexpr int kNumElementsPerThread = GridDim::maxElementsPerThread;
constexpr int kNumThreadsPerBlock = GridDim::maxThreadsPerBlock;
#endif
} // namespace
template <typename T, typename OutputDataArray>
__global__ void _SplitKernelSameSplitDim(const fast_divmod block_size_including_axis_dim_div,
const fast_divmod block_size_inside_axis_dim_div,
const fast_divmod split_dim_size, const int num_outputs, const T* input_data,
OutputDataArray output_data, const HIP_LONG N) {
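// Decompose the flat input index into (outer block, axis block, inner offset) and route the element
// to the matching output tensor. For example, with block_size_including_axis_dim = 12 and
// block_size_inside_axis_dim = 3, flat index 17 maps to outer block 1, axis block 1, inner offset 2.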
HIP_LONG start = kNumElementsPerThread * kNumThreadsPerBlock * blockIdx.x + threadIdx.x;
T value[kNumElementsPerThread];
HIP_LONG id = start;
#pragma unroll
for (int i = 0; i < kNumElementsPerThread; ++i) {
if (id < N) {
value[i] = input_data[id];
id += kNumThreadsPerBlock;
}
}
id = start;
#pragma unroll
for (int i = 0; i < kNumElementsPerThread; ++i) {
if (id < N) {
int outer_block_index, block_index, offset, output_index, block_offset;
block_size_including_axis_dim_div.divmod(id, outer_block_index, offset);
block_size_inside_axis_dim_div.divmod(offset, block_index, offset);
split_dim_size.divmod(block_index, output_index, block_offset);
HIP_LONG output_pos =
(outer_block_index * split_dim_size.d_ + block_offset) * block_size_inside_axis_dim_div.d_ + offset;
reinterpret_cast<T*>(output_data[output_index])[output_pos] = value[i];
id += kNumThreadsPerBlock;
}
}
}
template <typename OutputDataArray>
Status SplitSameSplitDimImpl(hipStream_t stream, const size_t element_size, const int block_size_including_axis_dim,
const int block_size_inside_axis_dim, const int64_t split_size, const int num_outputs,
const void* input_data, OutputDataArray output_data, const size_t input_size) {
HIP_LONG N = static_cast<HIP_LONG>(input_size);
int blocksPerGrid = CeilDiv(N, kNumElementsPerThread * kNumThreadsPerBlock);
fast_divmod block_size_including_axis_dim_div = fast_divmod(block_size_including_axis_dim);
fast_divmod block_size_inside_axis_dim_div = fast_divmod(block_size_inside_axis_dim);
fast_divmod split_size_div = fast_divmod(static_cast<int>(split_size));
switch (element_size) {
#define CASE_ELEMENT_TYPE(type) \
case sizeof(type): { \
hipLaunchKernelGGL(_SplitKernelSameSplitDim, blocksPerGrid, kNumThreadsPerBlock, 0, stream, \
block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_size_div, num_outputs, \
reinterpret_cast<const ToHipType<type>::MappedType*>(input_data), output_data, N); \
} break
CASE_ELEMENT_TYPE(int8_t);
CASE_ELEMENT_TYPE(int16_t);
CASE_ELEMENT_TYPE(int32_t);
CASE_ELEMENT_TYPE(int64_t);
#undef CASE_ELEMENT_TYPE
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Split operator");
}
return Status::OK();
}
template Status SplitSameSplitDimImpl<void**>(hipStream_t stream, const size_t element_size,
const int block_size_including_axis_dim,
const int block_size_inside_axis_dim, const int64_t split_size,
const int num_outputs, const void* input_data, void** output_data,
const size_t input_size);
template Status SplitSameSplitDimImpl<TArray<void*, 32>>(hipStream_t stream, const size_t element_size,
const int block_size_including_axis_dim,
const int block_size_inside_axis_dim, const int64_t split_size,
const int num_outputs, const void* input_data,
TArray<void*, 32> output_data, const size_t input_size);
template <typename T>
__global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div,
const fast_divmod block_size_inside_axis_dim_div, const int64_t* split_sizes,
const int64_t* split_sizes_range, const int64_t* axis_dimension_input_output_mapping,
const int num_outputs, const T* input_data, void** output_data, const HIP_LONG N) {
HIP_LONG start = kNumElementsPerThread * kNumThreadsPerBlock * blockIdx.x + threadIdx.x;
T value[kNumElementsPerThread];
HIP_LONG id = start;
#pragma unroll
for (int i = 0; i < kNumElementsPerThread; ++i) {
if (id < N) {
value[i] = input_data[id];
id += kNumThreadsPerBlock;
}
}
id = start;
#pragma unroll
for (int i = 0; i < kNumElementsPerThread; ++i) {
if (id < N) {
int outer_block_index, block_index, offset;
block_size_including_axis_dim_div.divmod(id, outer_block_index, offset);
block_size_inside_axis_dim_div.divmod(offset, block_index, offset);
int output_index = axis_dimension_input_output_mapping[block_index];
int64_t range_left = (output_index == 0) ? 0 : split_sizes_range[output_index - 1];
int block_offset = block_index - static_cast<int>(range_left);
HIP_LONG output_pos =
(outer_block_index * split_sizes[output_index] + block_offset) * block_size_inside_axis_dim_div.d_ + offset;
reinterpret_cast<T*>(output_data[output_index])[output_pos] = value[i];
id += kNumThreadsPerBlock;
}
}
}
Status SplitImpl(hipStream_t stream, const size_t element_size, const int block_size_including_axis_dim,
const int block_size_inside_axis_dim, const int64_t* split_sizes, const int64_t* split_sizes_range,
const int64_t* axis_dimension_input_output_mapping, const int num_outputs, const void* input_data,
void** output_data, const size_t input_size) {
HIP_LONG N = static_cast<HIP_LONG>(input_size);
int blocksPerGrid = CeilDiv(N, kNumElementsPerThread * kNumThreadsPerBlock);
fast_divmod block_size_including_axis_dim_div = fast_divmod(block_size_including_axis_dim);
fast_divmod block_size_inside_axis_dim_div = fast_divmod(block_size_inside_axis_dim);
switch (element_size) {
#define CASE_ELEMENT_TYPE(type) \
case sizeof(type): { \
hipLaunchKernelGGL(_SplitKernel, blocksPerGrid, kNumThreadsPerBlock, 0, stream, \
block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, \
axis_dimension_input_output_mapping, num_outputs, \
reinterpret_cast<const ToHipType<type>::MappedType*>(input_data), output_data, N); \
} break
CASE_ELEMENT_TYPE(int8_t);
CASE_ELEMENT_TYPE(int16_t);
CASE_ELEMENT_TYPE(int32_t);
CASE_ELEMENT_TYPE(int64_t);
#undef CASE_ELEMENT_TYPE
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Split operator");
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {
template <typename OutputDataArray>
Status SplitSameSplitDimImpl(hipStream_t stream, const size_t element_size, const int block_size_including_axis_dim,
const int block_size_inside_axis_dim, const int64_t split_size, const int num_outputs,
const void* input_data, OutputDataArray output_data, const size_t input_size);
Status SplitImpl(hipStream_t stream, const size_t element_size, const int block_size_including_axis_dim,
const int block_size_inside_axis_dim, const int64_t* split_sizes, const int64_t* split_sizes_range,
const int64_t* axis_dimension_input_output_mapping, const int num_outputs, const void* input_data,
void** output_data, const size_t input_size);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "squeeze.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Squeeze,
kOnnxDomain,
1, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Squeeze);
// explicit support for negative axis.
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Squeeze,
kOnnxDomain,
11, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Squeeze);
// axes is input instead of attribute
ONNX_OPERATOR_KERNEL_EX(
Squeeze,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.InputMemoryType(OrtMemTypeCPUInput, 1),
Squeeze);
Status Squeeze::ComputeInternal(OpKernelContext* ctx) const {
const Tensor* X = ctx->Input<Tensor>(0);
const TensorShape& X_shape = X->Shape();
TensorShapeVector axes;
size_t num_inputs = ctx->InputCount();
if (num_inputs == 2) { //axes is an input
const Tensor* axes_tensor = ctx->Input<Tensor>(1);
ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
"An axes tensor must be a vector tensor.");
auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
const auto* data = axes_tensor->Data<int64_t>();
axes.assign(data, data + nDims);
} else {
axes.assign(axes_.begin(), axes_.end());
}
TensorShapeVector output_shape = ComputeOutputShape(X_shape, axes);
Tensor* Y = ctx->Output(0, TensorShape(output_shape));
const void* input = X->DataRaw();
void* output = Y->MutableDataRaw();
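// The kernel is registered with Alias(0, 0), so the output may share the input buffer; copy only
// when the buffers differ.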
if (input == output)
return Status::OK();
auto count = X->Shape().Size();
auto element_bytes = X->DataType()->Size();
HIP_RETURN_IF_ERROR(hipMemcpyAsync(output, input, count * element_bytes, hipMemcpyDeviceToDevice, Stream()));
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/squeeze.h"
namespace onnxruntime {
namespace rocm {
class Squeeze final : public SqueezeBase, public RocmKernel {
public:
Squeeze(const OpKernelInfo& info) : SqueezeBase(info), RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/tile.h"
#include "core/providers/cpu/tensor/utils.h"
#include "tile_impl.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Tile,
kOnnxDomain,
6,
12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>(),
DataTypeImpl::GetTensorType<MLFloat16>()})
.TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
Tile);
ONNX_OPERATOR_KERNEL_EX(
Tile,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>(),
DataTypeImpl::GetTensorType<MLFloat16>()})
.TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
Tile);
#define CASE_TILE(type) \
case sizeof(type): { \
TileImpl(Stream(), rank, fdm_input_shape, input_strides, \
reinterpret_cast<const typename ToHipType<type>::MappedType*>(input_data), fdm_output_strides, \
reinterpret_cast<typename ToHipType<type>::MappedType*>(output_data), output_tensor.Shape().Size()); \
} break
#define CASE_TILE_MEMCPY(type) \
case sizeof(type): { \
TileMemcpyImpl(Stream(), reinterpret_cast<const typename ToHipType<type>::MappedType*>(input_data), \
reinterpret_cast<typename ToHipType<type>::MappedType*>(output_data), input_shape.Size(), \
num_of_copies_per_batch); \
} break
#define CASE_TILE_BATCHED_MEMCPY(type) \
case sizeof(type): { \
TileBatchedMemcpyImpl(Stream(), reinterpret_cast<const typename ToHipType<type>::MappedType*>(input_data), \
reinterpret_cast<typename ToHipType<type>::MappedType*>(output_data), \
num_of_elements_per_batch, input_shape.Size(), num_of_batch_copies, \
num_of_copies_per_batch); \
} break
Status Tile::ComputeInternal(OpKernelContext* ctx) const {
auto& input_tensor = *ctx->Input<Tensor>(0);
auto& repeats_tensor = *ctx->Input<Tensor>(1);
int32_t rank = static_cast<int32_t>(input_tensor.Shape().NumDimensions());
if (repeats_tensor.Shape().NumDimensions() != 1)
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'repeat' input tensor must be 1 dimensional");
if (repeats_tensor.Shape().Size() != rank)
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'repeat' input tensor must have the same length as the 'input' tensor");
// Calculate the shape of the output tensor
auto* repeats = repeats_tensor.Data<int64_t>();
const auto& input_shape = input_tensor.Shape();
const auto input_dims = input_shape.GetDims();
auto output_dims(input_shape.AsShapeVector());
for (auto axis = 0; axis < rank; axis++)
output_dims[axis] *= repeats[axis];
TensorShape output_shape(output_dims);
auto& output_tensor = *ctx->Output(0, output_shape);
void* output_data = output_tensor.MutableDataRaw();
const void* input_data = input_tensor.DataRaw();
const auto element_size = input_tensor.DataType()->Size();
// Repeat tensor input can have 0 as a valid value
// check if the computed output_shape size is 0 and
// return an empty tensor if so.
if (output_shape.Size() == 0) {
return Status::OK();
}
// Repeat tensor has all 1s in it
if (output_shape == input_shape) {
return HIP_CALL(hipMemcpyAsync(output_tensor.MutableDataRaw(), input_tensor.DataRaw(), input_tensor.SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
}
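// IsTileMemcpy detects repeat patterns that degenerate into plain or batched copies of the input,
// which the specialized memcpy kernels below handle without the general index arithmetic.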
bool is_batched_memcpy = false;
size_t num_of_elements_per_batch = 1;
size_t num_of_copies_per_batch = 1;
size_t num_of_batch_copies = 1;
if (TileOp::IsTileMemcpy(input_shape,
repeats,
rank,
is_batched_memcpy,
num_of_elements_per_batch,
num_of_copies_per_batch,
num_of_batch_copies)) {
if (!is_batched_memcpy) {
switch (element_size) {
CASE_TILE_MEMCPY(float);
CASE_TILE_MEMCPY(double);
CASE_TILE_MEMCPY(MLFloat16);
default:
ORT_THROW("Unsupported input data type with element size: ", element_size);
break;
}
} else {
switch (element_size) {
CASE_TILE_BATCHED_MEMCPY(float);
CASE_TILE_BATCHED_MEMCPY(double);
CASE_TILE_BATCHED_MEMCPY(MLFloat16);
default:
ORT_THROW("Unsupported input data type with element size: ", element_size);
break;
}
}
return Status::OK();
}
TensorPitches input_pitches(input_dims);
TArray<int64_t> input_strides(input_pitches);
TArray<fast_divmod> fdm_input_shape(rank);
for (size_t i = 0; i < input_dims.size(); ++i) {
fdm_input_shape[gsl::narrow_cast<int>(i)] = fast_divmod(gsl::narrow_cast<int>(input_dims[i]));
}
TArray<fast_divmod> fdm_output_strides(rank);
TensorPitches output_pitches(output_dims);
for (auto i = 0; i < rank; i++) {
fdm_output_strides[i] = fast_divmod(static_cast<int>(output_pitches[i]));
}
static_assert(sizeof(float) == sizeof(int32_t), "Float and Int32 are of different sizes");
static_assert(sizeof(double) == sizeof(int64_t), "Double and Int64 are of different sizes");
if (output_tensor.Shape().Size() > 0) {
switch (element_size) {
CASE_TILE(float);
CASE_TILE(double);
CASE_TILE(MLFloat16);
default:
ORT_THROW("Unsupported input data type with element size: ", element_size);
break;
}
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/tile.h"
namespace onnxruntime {
namespace rocm {
struct Tile final : RocmKernel {
explicit Tile(const OpKernelInfo& info) : RocmKernel(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "tile_impl.h"
namespace onnxruntime {
namespace rocm {
#ifdef USE_ROCM
constexpr int num_elements_per_thread = 2;
constexpr int num_threads_per_block = 512;
#else
constexpr int num_elements_per_thread = GridDim::maxElementsPerThread;
constexpr int num_threads_per_block = GridDim::maxThreadsPerBlock;
#endif
template <typename T>
__global__ void _UnRolledTileKernel(const size_t shape_rank, const TArray<fast_divmod> fdm_input_shape,
const TArray<int64_t> input_strides, const T* input_data,
const TArray<fast_divmod> fdm_output_strides, T* output_data, const HIP_LONG N) {
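// For each output element, recover its multi-dimensional coordinate via the output strides and wrap
// each coordinate into the input shape with a modulo to find the source element.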
HIP_LONG start = num_elements_per_thread * num_threads_per_block * blockIdx.x + threadIdx.x;
T value[num_elements_per_thread];
HIP_LONG id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
HIP_LONG input_index = 0;
HIP_LONG offset = id;
#pragma unroll
for (auto dim = 0; dim < fdm_output_strides.Capacity(); ++dim) {
if (dim >= shape_rank) {
break;
}
int out_coord, r;
fdm_output_strides[dim].divmod(offset, out_coord, r);
int in_coord = fdm_input_shape[dim].mod(out_coord);
input_index += input_strides[dim] * in_coord;
offset = r;
}
value[i] = input_data[input_index];
id += num_threads_per_block;
}
}
id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
output_data[id] = value[i];
id += num_threads_per_block;
}
}
}
template <typename T>
void TileImpl(hipStream_t stream, const size_t shape_rank, const TArray<fast_divmod>& fdm_input_shape,
const TArray<int64_t>& input_stride, const T* input_data, const TArray<fast_divmod>& fdm_output_strides,
T* output_data, const size_t N) {
int blocksPerGrid = static_cast<int>(CeilDiv(N, num_threads_per_block * num_elements_per_thread));
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UnRolledTileKernel<T>), blocksPerGrid, num_threads_per_block, 0, stream, shape_rank, fdm_input_shape, input_stride,
input_data, fdm_output_strides,
output_data, static_cast<HIP_LONG>(N));
}
template <typename T>
__global__ void _TileMemcpyKernelFromOutput(const T* input_data, T* output_data,
const fast_divmod divmod_num_input_elements, const HIP_LONG N) {
HIP_LONG start = num_elements_per_thread * num_threads_per_block * blockIdx.x + threadIdx.x;
T value[num_elements_per_thread];
HIP_LONG id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
value[i] = input_data[divmod_num_input_elements.mod(id)];
id += num_threads_per_block;
}
}
id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
output_data[id] = value[i];
id += num_threads_per_block;
}
}
}
template <typename T>
__global__ void _TileMemcpyKernelFromInput(const T* input_data, T* output_data, const HIP_LONG N,
const size_t repeats) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
T input_val = input_data[id];
for (size_t i = 0; i < repeats; ++i) {
output_data[id] = input_val;
id += N;
}
}
template <typename T>
size_t GetVectorizedSize(size_t num_input_elements, size_t num_elements_per_batch, uint64_t address_input,
uint64_t address_output, HIP_LONG& N, int& blocksPerGrid) {
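// Prefer 4- or 2-wide vectorized loads/stores when the per-batch element count and both data
// pointers are suitably aligned; N is rescaled to count vectors instead of scalars.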
constexpr int vec4_alignment = std::alignment_of<aligned_vector<T, 4>>::value;
constexpr int vec2_alignment = std::alignment_of<aligned_vector<T, 2>>::value;
N = static_cast<HIP_LONG>(num_input_elements);
size_t vectorized_size = 1;
if (num_elements_per_batch % 4 == 0 && address_input % vec4_alignment == 0 && address_output % vec4_alignment == 0) {
N /= 4;
vectorized_size = 4;
} else if (num_elements_per_batch % 2 == 0 && address_input % vec2_alignment == 0 &&
address_output % vec2_alignment == 0) {
N /= 2;
vectorized_size = 2;
}
blocksPerGrid = CeilDiv(N, num_threads_per_block);
return vectorized_size;
}
template <typename T>
void TileMemcpyImpl(hipStream_t stream, const T* input_data, T* output_data, const size_t num_input_elements,
const size_t repeats) {
// If the number of blocks derived from the input size is too small to fill all streaming
// multiprocessors, launching one thread per input element gives no performance gain, so we
// fall back to the output-based kernel.
HIP_LONG N;
int blocksPerGrid;
size_t vectorized_size =
GetVectorizedSize<T>(num_input_elements, num_input_elements, reinterpret_cast<uint64_t>(input_data),
reinterpret_cast<uint64_t>(output_data), N, blocksPerGrid);
if (blocksPerGrid < 128) {
N = static_cast<HIP_LONG>(num_input_elements * repeats);
blocksPerGrid = CeilDiv(N, num_threads_per_block * num_elements_per_thread);
hipLaunchKernelGGL(_TileMemcpyKernelFromOutput, blocksPerGrid, num_threads_per_block, 0, stream,
input_data, output_data, fast_divmod(static_cast<int>(num_input_elements)), N);
return;
}
if (vectorized_size == 4) {
using Vec4T = aligned_vector<T, 4>;
hipLaunchKernelGGL(_TileMemcpyKernelFromInput, blocksPerGrid, num_threads_per_block, 0, stream,
reinterpret_cast<const Vec4T*>(input_data), reinterpret_cast<Vec4T*>(output_data), N, repeats);
return;
} else if (vectorized_size == 2) {
using Vec2T = aligned_vector<T, 2>;
hipLaunchKernelGGL(_TileMemcpyKernelFromInput, blocksPerGrid, num_threads_per_block, 0, stream,
reinterpret_cast<const Vec2T*>(input_data), reinterpret_cast<Vec2T*>(output_data), N, repeats);
return;
}
hipLaunchKernelGGL(_TileMemcpyKernelFromInput, blocksPerGrid, num_threads_per_block, 0, stream, input_data, output_data, N, repeats);
}
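// Batched tile kernel launched from the output: each output index is split into
// (output row, element within row), both are wrapped back into the input ranges with
// fast_divmod, and the value is gathered from the corresponding input position.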
template <typename T>
__global__ void _TileBatchedMemcpyKernelFromOutput(const T* input_data, T* output_data,
const fast_divmod divmod_size_output_row,
const size_t size_input_row, const fast_divmod divmod_batch,
const fast_divmod divmod_size_input_row, const HIP_LONG N) {
HIP_LONG start = num_elements_per_thread * num_threads_per_block * blockIdx.x + threadIdx.x;
T value[num_elements_per_thread];
HIP_LONG id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
int batch_idx, element_idx;
divmod_size_output_row.divmod(id, batch_idx, element_idx);
value[i] = input_data[divmod_batch.mod(batch_idx) * size_input_row + divmod_size_input_row.mod(element_idx)];
id += num_threads_per_block;
}
}
id = start;
#pragma unroll
for (int i = 0; i < num_elements_per_thread; ++i) {
if (id < N) {
output_data[id] = value[i];
id += num_threads_per_block;
}
}
}
// Input size is [batch, data], output size is [batch * batch_repeats, data * repeats_per_batch].
// Here size_input_row = data, size_output_row = data * repeats_per_batch,
// size_output_batch = batch * data * repeats_per_batch
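// Each thread loads one (possibly vectorized) input element and scatters it repeats_per_batch
// times within its output row and batch_repeats times across the repeated batches.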
template <typename T>
__global__ void _TileBatchedMemcpyKernelFromInput(const T* input_data, T* output_data,
const fast_divmod divmod_size_input_row,
const HIP_LONG size_input_row, const HIP_LONG size_output_row,
const HIP_LONG size_output_batch, const size_t batch_repeats,
const size_t repeats_per_batch, const HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
T input_val = input_data[id];
HIP_LONG q, r;
divmod_size_input_row.divmod(id, q, r);
HIP_LONG batch_offset = q * size_output_row + r;
for (size_t i = 0; i < batch_repeats; ++i) {
HIP_LONG offset = batch_offset;
for (size_t j = 0; j < repeats_per_batch; ++j) {
output_data[offset] = input_val;
offset += size_input_row;
}
batch_offset += size_output_batch;
}
}
// Input size is [batch, data], output size is [batch * batch_repeats, data * repeats_per_batch].
// Here size_input_row = data, num_input_elements = batch * data
template <typename T>
void TileBatchedMemcpyImpl(hipStream_t stream, const T* input_data, T* output_data, const size_t size_input_row,
const size_t num_input_elements, const size_t batch_repeats,
const size_t repeats_per_batch) {
// If the number of blocks derived from the input size is too small to fill all streaming
// multiprocessors, launching one thread per input element gives no performance gain, so we
// fall back to the output-based kernel.
HIP_LONG N;
int blocksPerGrid;
size_t vectorized_size =
GetVectorizedSize<T>(num_input_elements, size_input_row, reinterpret_cast<uint64_t>(input_data),
reinterpret_cast<uint64_t>(output_data), N, blocksPerGrid);
if (blocksPerGrid < 128) {
N = static_cast<HIP_LONG>(num_input_elements * batch_repeats * repeats_per_batch);
blocksPerGrid = CeilDiv(N, num_threads_per_block * num_elements_per_thread);
hipLaunchKernelGGL(_TileBatchedMemcpyKernelFromOutput, blocksPerGrid, num_threads_per_block, 0, stream,
input_data, output_data, fast_divmod(static_cast<int>(size_input_row * repeats_per_batch)), size_input_row,
fast_divmod(static_cast<int>(num_input_elements / size_input_row)),
fast_divmod(static_cast<int>(size_input_row)), N);
return;
}
HIP_LONG size_input_row_vec = static_cast<HIP_LONG>(size_input_row);
if (vectorized_size == 4) {
using Vec4T = aligned_vector<T, 4>;
size_input_row_vec /= 4;
hipLaunchKernelGGL(_TileBatchedMemcpyKernelFromInput, blocksPerGrid, num_threads_per_block, 0, stream,
reinterpret_cast<const Vec4T*>(input_data), reinterpret_cast<Vec4T*>(output_data),
fast_divmod(size_input_row_vec), size_input_row_vec,
size_input_row_vec * static_cast<HIP_LONG>(repeats_per_batch), N * static_cast<HIP_LONG>(repeats_per_batch),
batch_repeats, repeats_per_batch, N);
return;
} else if (vectorized_size == 2) {
using Vec2T = aligned_vector<T, 2>;
size_input_row_vec /= 2;
hipLaunchKernelGGL(_TileBatchedMemcpyKernelFromInput, blocksPerGrid, num_threads_per_block, 0, stream,
reinterpret_cast<const Vec2T*>(input_data), reinterpret_cast<Vec2T*>(output_data),
fast_divmod(size_input_row_vec), size_input_row_vec,
size_input_row_vec * static_cast<HIP_LONG>(repeats_per_batch), N * static_cast<HIP_LONG>(repeats_per_batch),
batch_repeats, repeats_per_batch, N);
return;
}
hipLaunchKernelGGL(_TileBatchedMemcpyKernelFromInput, blocksPerGrid, num_threads_per_block, 0, stream,
input_data, output_data, fast_divmod(size_input_row_vec), size_input_row_vec,
size_input_row_vec * static_cast<HIP_LONG>(repeats_per_batch), N * static_cast<HIP_LONG>(repeats_per_batch),
batch_repeats, repeats_per_batch, N);
}
#define SPECIALIZED_IMPL(T) \
template void TileImpl<T>(hipStream_t stream, const size_t shape_rank, const TArray<fast_divmod>& fdm_input_shape, \
const TArray<int64_t>& input_stride, const T* input_data, \
const TArray<fast_divmod>& fdm_output_strides, T* output_data, const size_t N); \
template void TileMemcpyImpl<T>(hipStream_t stream, const T* input_data, T* output_data, \
const size_t num_input_elements, const size_t repeats); \
template void TileBatchedMemcpyImpl<T>(hipStream_t stream, const T* input_data, T* output_data, \
const size_t size_input_row, const size_t num_input_elements, \
const size_t batch_repeats, const size_t repeats_per_batch);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void TileImpl(hipStream_t stream, const size_t shape_rank, const TArray<fast_divmod>& fdm_input_shape,
const TArray<int64_t>& input_stride, const T* input_data, const TArray<fast_divmod>& fdm_output_strides,
T* output_data, const size_t N);
template <typename T>
void TileMemcpyImpl(hipStream_t stream, const T* input_data, T* output_data, const size_t num_input_elements,
const size_t repeats);
template <typename T>
void TileBatchedMemcpyImpl(hipStream_t stream, const T* input_data, T* output_data, const size_t size_input_row,
const size_t num_input_elements, const size_t batch_repeats, const size_t repeats_per_batch);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/common/inlined_containers.h"
#include "core/providers/rocm/tensor/transpose.h"
#include "core/providers/rocm/tensor/transpose_impl.h"
#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Transpose,
kOnnxDomain,
1, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Transpose);
ONNX_OPERATOR_KERNEL_EX(
Transpose,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Transpose);
// special case acceleration using rocblas matrix transpose
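// Returns (M, N); both are non-zero only when the permutation reduces to a single 2-D matrix
// transpose: either a plain 2-D transpose, or an NCHW <-> NHWC permutation with N == 1.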
static std::tuple<int, int> TryTransposeWithRocblas(const gsl::span<const size_t>& perm, const TensorShape& input_shape) {
int M = 0;
int N = 0;
if (perm.size() == 4 && input_shape[0] == 1 && perm[0] == 0) {
// NCHW <-> NHWC when N == 1
if ((perm[1] == 2 && perm[2] == 3 && perm[3] == 1) ||
(perm[1] == 3 && perm[2] == 1 && perm[3] == 2)) {
if (perm[1] == 2) {
M = gsl::narrow<int>(input_shape[1]);
N = gsl::narrow<int>(input_shape[2] * input_shape[3]);
} else {
M = gsl::narrow<int>(input_shape[1] * input_shape[2]);
N = gsl::narrow<int>(input_shape[3]);
}
}
} else if (perm.size() == 2 && perm[1] == 0 && perm[0] == 1) {
// 2D matrix transpose
M = gsl::narrow<int>(input_shape[0]);
N = gsl::narrow<int>(input_shape[1]);
}
return std::make_tuple(M, N);
}
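// Performs the 2-D transpose through rocblasTransposeHelper with alpha = 1 and beta = 0, so the
// second (unused) operand does not contribute and the output is simply the transposed input.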
template <typename T>
Status TransposeWithRocblas(hipStream_t stream, rocblas_handle rocblas_handle, const Tensor& input, Tensor& output, int M, int N) {
typedef typename ToHipType<T>::MappedType HipT;
HipT one = ToHipType<T>::FromFloat(1.0f);
HipT zero = ToHipType<T>::FromFloat(0.0f);
const HipT* input_data = reinterpret_cast<const HipT*>(input.Data<T>());
HipT* output_data = reinterpret_cast<HipT*>(output.MutableData<T>());
ROCBLAS_RETURN_IF_ERROR(
rocblasTransposeHelper(stream,
rocblas_handle,
rocblas_operation_transpose, rocblas_operation_transpose, M, N,
&one,
input_data,
N,
&zero,
input_data,
N,
output_data,
M));
return Status::OK();
}
Status Transpose::DoTranspose(const Transpose& transpose_kernel,
const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output) {
return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.Stream(), transpose_kernel.RocblasHandle(), permutations, input, output);
}
Status Transpose::DoTranspose(const hipDeviceProp_t& prop,
hipStream_t stream,
const rocblas_handle rocblas_handle,
const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output,
const TensorShape* input_shape_override,
const TensorShape* output_shape_override) {
// special case when there is a dim value of 0 in the shape.
if (output.Shape().Size() == 0)
return Status::OK();
const auto input_dims = input_shape_override ? input_shape_override->GetDims() : input.Shape().GetDims();
const auto output_dims = output_shape_override ? output_shape_override->GetDims() : output.Shape().GetDims();
auto rank = static_cast<int32_t>(input_dims.size());
// Flatten adjacent dimensions that remain contiguous under the permutation.
// For example: permutations [0, 2, 3, 1] -> [0, 2, 1], permutations [0, 3, 1, 2] -> [0, 2, 1].
auto new_rank = rank;
InlinedVector<size_t> new_permutations(permutations.begin(), permutations.end());
TensorShapeVector new_input_dims = ToShapeVector(input_dims);
TensorShapeVector new_output_dims = ToShapeVector(output_dims);
// Remove all dims with value 1.
std::vector<bool> dims_to_remove(new_rank, false);
int input_pos = 0;
int output_pos = 0;
int perm_pos = 0;
for (int i = 0; i < new_rank; ++i) {
if (new_input_dims[i] != 1) {
new_input_dims[input_pos++] = new_input_dims[i];
} else {
dims_to_remove[i] = true;
}
if (new_output_dims[i] != 1) {
new_output_dims[output_pos++] = new_output_dims[i];
}
}
for (int i = 0; i < new_rank; ++i) {
if (!dims_to_remove[new_permutations[i]]) {
new_permutations[perm_pos++] = new_permutations[i];
}
}
for (int i = new_rank - 1; i >= 0; --i) {
if (dims_to_remove[i]) {
for (int j = 0; j < perm_pos; ++j) {
if (new_permutations[j] > static_cast<size_t>(i)) {
new_permutations[j] -= 1;
}
}
}
}
ORT_ENFORCE(input_pos == output_pos && input_pos == perm_pos);
new_rank = input_pos;
new_input_dims.resize(new_rank);
new_output_dims.resize(new_rank);
new_permutations.resize(new_rank);
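// Merge dimensions that stay adjacent under the permutation: whenever
// new_permutations[i - 1] + 1 == new_permutations[i], the two source dimensions are accessed
// contiguously and can be fused into one, shrinking both shape vectors and the permutation.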
for (auto i = new_rank - 1; i > 0; i--) {
auto curr = new_permutations[i];
auto prev = new_permutations[i - 1];
if (prev + 1 == curr) {
// all dims bigger than curr need to be reduced by 1 due to the merging.
for (auto j = 0; j < new_rank; j++) {
if (new_permutations[j] > curr) {
new_permutations[j] -= 1;
}
}
for (auto j = i + 1; j < new_rank; j++) {
new_permutations[j - 1] = new_permutations[j];
}
// update input dims
new_input_dims[prev] *= new_input_dims[curr];
new_input_dims[curr] = 1;
for (auto j = static_cast<int32_t>(curr + 1); j < new_rank; j++) {
new_input_dims[j - 1] = new_input_dims[j];
}
new_input_dims[new_rank - 1] = 1;
// update output dims
new_output_dims[i - 1] *= new_output_dims[i];
new_output_dims[i] = 1;
for (auto j = i + 1; j < new_rank; j++) {
new_output_dims[j - 1] = new_output_dims[j];
}
new_output_dims[new_rank - 1] = 1;
new_rank--;
}
}
new_permutations.resize(new_rank);
new_input_dims.resize(new_rank);
new_output_dims.resize(new_rank);
if (new_rank <= 1) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.MutableDataRaw(), input.DataRaw(),
input.Shape().Size() * input.DataType()->Size(), hipMemcpyDeviceToDevice,
stream));
return Status::OK();
}
auto element_type = input.GetElementType();
size_t element_size = input.DataType()->Size();
if (element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT ||
element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE ||
element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) {
auto mn = TryTransposeWithRocblas(new_permutations, new_input_dims);
int M = std::get<0>(mn);
int N = std::get<1>(mn);
if (M != 0 && N != 0) {
if (element_type == utils::GetONNXTensorElementDataType<float>()) {
return TransposeWithRocblas<float>(stream, rocblas_handle, input, output, M, N);
} else if (element_type == utils::GetONNXTensorElementDataType<double>()) {
return TransposeWithRocblas<double>(stream, rocblas_handle, input, output, M, N);
} else {
return TransposeWithRocblas<MLFloat16>(stream, rocblas_handle, input, output, M, N);
}
}
}
// Transpose021 has a specialized Transpose3DImpl kernel
dim3 grid_size, block_size;
if (CanDoTranspose3D(prop, static_cast<size_t>(new_rank), new_input_dims, new_permutations, grid_size, block_size)) {
TensorPitches new_input_strides(new_input_dims);
return Transpose3DImpl(stream, element_size, ToConstSpan(new_input_dims), ToConstSpan(new_input_strides),
input.DataRaw(), output.MutableDataRaw(), output.Shape().Size(), grid_size, block_size);
}
// A 3-D transpose can be treated as a special case of a 4-D transpose with the first dimension being 1.
if (new_rank == 3) {
new_permutations[0]++;
new_permutations[1]++;
new_permutations[2]++;
new_permutations.insert(new_permutations.begin(), 0);
new_input_dims.insert(new_input_dims.begin(), 1);
new_output_dims.insert(new_output_dims.begin(), 1);
new_rank = 4;
}
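// The remaining paths are stride based: TensorPitches computes the row-major strides of the
// (possibly promoted) input and output shapes for the 4-D specialized and generic kernels below.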
TensorPitches new_input_strides(new_input_dims);
TensorPitches new_output_strides(new_output_dims);
TArray<int64_t> input_shape(new_input_dims);
TArray<int64_t> tmp_input_strides(new_input_strides);
if (CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(
prop, element_size, new_rank, new_input_dims, new_permutations,
grid_size, block_size)) {
TArray<int64_t> tmp_output_strides(new_rank);
for (auto i = 0; i < new_rank; i++) {
tmp_output_strides[static_cast<int32_t>(new_permutations[i])] = new_output_strides[i];
}
return Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim(
stream, element_size, input_shape, tmp_input_strides, input.DataRaw(),
tmp_output_strides, output.MutableDataRaw(), gsl::narrow<int>(output.Shape().Size()),
grid_size, block_size);
}
// We used to check whether Transpose4DParallelizeOneElementPerThread could be used before falling
// back to the generic case, but tests across many cases showed that it is not faster than the
// generic kernel and is even much slower for some of them.
// General cases
TArray<int64_t> input_strides(new_rank);
for (auto i = 0; i < new_rank; i++) {
input_strides[i] = new_input_strides[new_permutations[i]];
}
TArray<fast_divmod> output_strides(new_rank);
for (auto i = 0; i < new_rank; i++) {
output_strides[i] = fast_divmod(gsl::narrow_cast<int>(new_output_strides[i]));
}
auto status = TransposeImpl(stream, element_size, new_rank, input_strides, input.DataRaw(),
output_strides, output.MutableDataRaw(), gsl::narrow<int>(output.Shape().Size()));
return status;
}
Status Transpose::ComputeInternal(OpKernelContext* ctx) const {
const Tensor* X_ptr = ctx->Input<Tensor>(0);
if (X_ptr == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
const Tensor& X = *X_ptr;
const TensorShape& input_shape = X.Shape();
int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());
TensorShapeVector output_dims(rank);
InlinedVector<size_t> default_perm(rank);
const InlinedVector<size_t>* p_perm = nullptr;
const auto& status = ComputeOutputShape(X, output_dims, default_perm, p_perm);
if (!status.IsOK())
return status;
TensorShape output_shape{output_dims};
Tensor* Y = ctx->Output(0, output_shape);
return DoTranspose(this->GetDeviceProp(), this->Stream(), this->RocblasHandle(), *p_perm, X, *Y);
}
} // namespace rocm
} // namespace onnxruntime