Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
// These macros simplify coding. To add a new op, follow these steps
// (an illustrative sketch follows the BINARY_OPS() list below):
// 1. Add a new entry in BINARY_OPS() list
// 2. (optional) Define templated single element operator in binary_elementwise_ops_impl.cu
// 3. (optional) Implement specialized single element operator
// 4. Add op kernel class definition in binary_elementwise_ops.h
// 5. Add op kernel registration and compute specialization in binary_elementwise_ops.cc
#define BINARY_OPS() \
BINARY_OP_NAME_EXPR(Add, (a + b)) \
BINARY_OP_NAME_EXPR(Sub, (a - b)) \
BINARY_OP_NAME_EXPR(Mul, (a * b)) \
BINARY_OP_NAME_EXPR(Div, (a / b)) \
BINARY_OP_NAME_EXPR(Pow_7, _Pow(a, b)) \
BINARY_OP_NAME_EXPR(And, (a & b)) \
BINARY_OP_NAME_EXPR(Or, (a | b)) \
BINARY_OP_NAME_EXPR(Xor, (a ^ b)) \
BINARY_OP_NAME_EXPR(PRelu, (a > (T)0 ? a : a * b)) \
BINARY_OP_NAME_EXPR(Max, _Max(a, b)) \
BINARY_OP_NAME_EXPR(Min, _Min(a, b)) \
BINARY_OP_NAME_EXPR(Mod, _Mod(a, b)) \
BINARY_OP_NAME_EXPR(Fmod, _Fmod(a, b))
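// Illustrative sketch only (not part of the build): following the steps above, a
// hypothetical new op "AddMul" would start with a new entry in BINARY_OPS(), e.g.
//   BINARY_OP_NAME_EXPR(AddMul, ((a + b) * a))
// followed by the kernel class in binary_elementwise_ops.h and its registration in
// binary_elementwise_ops.cc. "AddMul" and its expression are made up for illustration.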
// NOTE that the .cu files are compiled by the device compiler (hipcc in this ROCm build) and should not refer to any onnxruntime headers
// so struct BinaryElementwisePreparation cannot be used here
#define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \
template <typename T> \
void Impl_##name( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count)
#define BINARY_OP_NAME_EXPR(name, expr) BINARY_ELEMENTWISE_IMPL_DECLARATION(name);
BINARY_OPS()
#undef BINARY_OP_NAME_EXPR
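// For reference, the BINARY_OPS() expansion above runs each entry through
// BINARY_ELEMENTWISE_IMPL_DECLARATION, so e.g. the Add entry declares:
//   template <typename T>
//   void Impl_Add(hipStream_t stream,
//                 int32_t output_rank_or_simple_broadcast,
//                 const TArray<int64_t>* lhs_padded_strides, const T* lhs_data,
//                 const TArray<int64_t>* rhs_padded_strides, const T* rhs_data,
//                 const TArray<fast_divmod>* fdm_output_strides,
//                 const fast_divmod& fdm_H, const fast_divmod& fdm_C,
//                 T* output_data, size_t count);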
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) \
template <typename T, typename T1> \
void ImplT1_##name( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T1* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count)
BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(Pow);
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) \
template <typename T, typename T1, typename T2> \
void ImplT2_##name( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T1* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T2* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count)
#define BINARY_OPS2() \
BINARY_OP_NAME_EXPR2(Greater, (a > b)) \
BINARY_OP_NAME_EXPR2(Equal, (a == b)) \
BINARY_OP_NAME_EXPR2(Less, (a < b)) \
BINARY_OP_NAME_EXPR2(GreaterOrEqual, (a >= b)) \
BINARY_OP_NAME_EXPR2(LessOrEqual, (a <= b))
#define BINARY_OP_NAME_EXPR2(name, expr) BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name);
BINARY_OPS2()
#undef BINARY_OP_NAME_EXPR2
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/math/binary_elementwise_ops_impl.h"
namespace onnxruntime {
namespace rocm {
// define the device functors that perform the computation on scalars
#define OP_FUNCTOR_DEFINITION(name, expr) \
template <class T, class T1, class T2> \
struct OP_##name { \
__device__ __inline__ T operator()(T1 a, T2 b) const { \
return (expr); \
} \
};
#define BINARY_OP_NAME_EXPR(name, expr) \
OP_FUNCTOR_DEFINITION(name, expr)
BINARY_OPS()
OP_FUNCTOR_DEFINITION(Pow, _Pow(a, b))
#undef BINARY_OP_NAME_EXPR
#define BINARY_OP_NAME_EXPR2(name, expr) \
OP_FUNCTOR_DEFINITION(name, expr)
BINARY_OPS2()
#undef BINARY_OP_NAME_EXPR2
#undef OP_FUNCTOR_DEFINITION
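// For reference, OP_FUNCTOR_DEFINITION above turns each BINARY_OPS() entry into a small
// device functor; e.g. the Add entry effectively expands to:
//   template <class T, class T1, class T2>
//   struct OP_Add {
//     __device__ __inline__ T operator()(T1 a, T2 b) const { return (a + b); }
//   };
// These functors are presumably what the element-wise launch code in
// binary_elementwise_ops_impl.cu invokes per output element.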
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/math/clip.h"
#include "core/providers/rocm/math/clip_impl.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
Clip,
kOnnxDomain,
6,
10,
float,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
Clip_6<float>);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Clip,
kOnnxDomain,
11, 11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
Clip);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Clip,
kOnnxDomain,
12, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", BuildKernelDefConstraints<float, double, MLFloat16, int8_t, uint8_t, int64_t, uint64_t>()),
Clip);
ONNX_OPERATOR_KERNEL_EX(
Clip,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", BuildKernelDefConstraints<float, double, MLFloat16, int8_t, uint8_t, int64_t, uint64_t>()),
Clip);
template <typename T>
Status Clip_6<T>::ComputeInternal(OpKernelContext* ctx) const {
const Tensor& X = *ctx->Input<Tensor>(0);
const TensorShape& input_shape{X.Shape()};
Tensor* Y = ctx->Output(0, input_shape);
const size_t count = input_shape.Size();
if (count > 0) {
auto* y_data = Y->MutableData<T>();
const auto* x_data = X.Data<T>();
ClipImpl<T>(Stream(), x_data, y_data, nullptr, nullptr, this->min_, this->max_, count);
}
return Status::OK();
}
namespace clip_internal {
template <typename T>
struct LowMax {
constexpr static T low() {
return std::numeric_limits<T>::lowest();
}
constexpr static T max() {
return std::numeric_limits<T>::max();
}
};
template <>
struct LowMax<MLFloat16> {
static MLFloat16 low() {
return MLFloat16(math::floatToHalf(std::numeric_limits<float>::lowest()));
}
static MLFloat16 max() {
return MLFloat16(math::floatToHalf(std::numeric_limits<float>::max()));
}
};
} // namespace clip_internal
template <typename T>
struct Clip::ComputeImpl {
void operator()(hipStream_t stream, const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const {
auto min_default = clip_internal::LowMax<T>::low();
auto max_default = clip_internal::LowMax<T>::max();
const T* min_data = nullptr;
const T* max_data = nullptr;
// min (input 1) and max (input 2) are optional scalar tensors
if (min) {
ORT_ENFORCE(min->Shape().IsScalar(), "min should be a scalar.");
min_data = min->Data<T>();
}
if (max) {
ORT_ENFORCE(max->Shape().IsScalar(), "max should be a scalar.");
max_data = max->Data<T>();
}
const size_t count = X->Shape().Size();
if (count > 0) {
auto* y_data = Y->MutableData<T>();
const auto* x_data = X->Data<T>();
ClipImpl<T>(stream, x_data, y_data, min_data, max_data, min_default, max_default, count);
}
}
};
Status Clip::ComputeInternal(OpKernelContext* ctx) const {
const auto* X = ctx->Input<Tensor>(0);
const auto* min = ctx->Input<Tensor>(1);
const auto* max = ctx->Input<Tensor>(2);
Tensor* Y = ctx->Output(0, X->Shape());
utils::MLTypeCallDispatcher<float, double, MLFloat16, int8_t, uint8_t, int64_t, uint64_t>
t_disp(X->GetElementType());
t_disp.Invoke<ComputeImpl>(Stream(), X, min, max, Y);
return Status::OK();
}
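// Note on the dispatch above: utils::MLTypeCallDispatcher holds the runtime element type,
// and Invoke<ComputeImpl>(...) instantiates ComputeImpl<T> for the matching T from the
// listed types and calls its operator() with the given arguments. A hand-rolled
// equivalent (sketch only, for illustration) would look like:
//   switch (X->GetElementType()) {
//     case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
//       ComputeImpl<float>{}(Stream(), X, min, max, Y); break;
//     // ... one case per supported type ...
//   }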
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/math/clip.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class Clip_6 final : public onnxruntime::clip_internal::Clip_6Base<T>, public RocmKernel {
public:
explicit Clip_6(const OpKernelInfo& info) : onnxruntime::clip_internal::Clip_6Base<T>(info), RocmKernel{info} {
}
Status ComputeInternal(OpKernelContext* context) const override;
};
// Since opset 11, min and max are inputs.
// Opset 12 adds more type support.
class Clip final : public RocmKernel {
public:
explicit Clip(const OpKernelInfo& info) : RocmKernel{info} {
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
template <typename T>
struct ComputeImpl;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/math/clip_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {
template <typename T>
__global__ void _Clip(const T* input, T* output, const T* min, const T* max, T min_default, T max_default, size_t N) {
auto min_val = (min) ? *min : min_default;
auto max_val = (max) ? *max : max_default;
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
output[id] = (input[id] < min_val) ? min_val : ((input[id] > max_val) ? max_val : input[id]);
}
template <typename T>
void ClipImpl(hipStream_t stream, const T* input_data, T* output_data, const T* min, const T* max, T min_default, T max_default, size_t count) {
typedef typename ToHipType<T>::MappedType HipT;
int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
union ConstAliasUnion {
const T *t;
const HipT *rocmT;
ConstAliasUnion(const T* _t) { t = _t;}
};
union AliasUnion {
T *t;
HipT *rocmT;
AliasUnion(T* _t) { t = _t;}
};
hipLaunchKernelGGL(HIP_KERNEL_NAME(_Clip<HipT>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, ((union ConstAliasUnion)input_data).rocmT,
((union AliasUnion)output_data).rocmT,
((union ConstAliasUnion)min).rocmT,
((union ConstAliasUnion)max).rocmT,
*((union AliasUnion)&min_default).rocmT,
*((union AliasUnion)&max_default).rocmT,
count);
}
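// Note on the unions above: ToHipType<T>::MappedType maps onnxruntime types to the device
// types the kernel is instantiated with (e.g. MLFloat16 -> half), and the unions merely
// reinterpret pointers/values between the two layout-compatible representations. A sketch
// of the equivalent casts (same assumption of layout compatibility the union trick makes):
//   hipLaunchKernelGGL(HIP_KERNEL_NAME(_Clip<HipT>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
//                      reinterpret_cast<const HipT*>(input_data), reinterpret_cast<HipT*>(output_data),
//                      reinterpret_cast<const HipT*>(min), reinterpret_cast<const HipT*>(max),
//                      *reinterpret_cast<HipT*>(&min_default), *reinterpret_cast<HipT*>(&max_default),
//                      count);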
template void ClipImpl<float>(hipStream_t stream, const float* input_data, float* output_data, const float* min, const float* max, float min_default, float max_default, size_t count);
template void ClipImpl<double>(hipStream_t stream, const double* input_data, double* output_data, const double* min, const double* max, double min_default, double max_default, size_t count);
template void ClipImpl<MLFloat16>(hipStream_t stream, const MLFloat16* input_data, MLFloat16* output_data, const MLFloat16* min, const MLFloat16* max, MLFloat16 min_default, MLFloat16 max_default, size_t count);
template void ClipImpl<int8_t>(hipStream_t stream, const int8_t* input_data, int8_t* output_data, const int8_t* min, const int8_t* max, int8_t min_default, int8_t max_default, size_t count);
template void ClipImpl<uint8_t>(hipStream_t stream, const uint8_t* input_data, uint8_t* output_data, const uint8_t* min, const uint8_t* max, uint8_t min_default, uint8_t max_default, size_t count);
template void ClipImpl<int64_t>(hipStream_t stream, const int64_t* input_data, int64_t* output_data, const int64_t* min, const int64_t* max, int64_t min_default, int64_t max_default, size_t count);
template void ClipImpl<uint64_t>(hipStream_t stream, const uint64_t* input_data, uint64_t* output_data, const uint64_t* min, const uint64_t* max, uint64_t min_default, uint64_t max_default, size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/math/clip.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void ClipImpl(hipStream_t stream, const T* input_data, T* output_data, const T* min, const T* max, T min_default, T max_default, size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "cumsum.h"
#include "cumsum_impl.h"
#include "core/providers/cpu/math/cumsum.h"
#include "core/providers/common.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
CumSum,
kOnnxDomain,
11, 13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 1) // 'axis' needs to be on CPU
.TypeConstraint("T", std::vector<MLDataType>{
DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>(),
DataTypeImpl::GetTensorType<uint32_t>(),
DataTypeImpl::GetTensorType<uint64_t>(),
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>()})
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(), DataTypeImpl::GetTensorType<int64_t>()}),
CumSum);
ONNX_OPERATOR_KERNEL_EX(
CumSum,
kOnnxDomain,
14,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 1) // 'axis' needs to be on CPU
.TypeConstraint("T", std::vector<MLDataType>{
DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>(),
DataTypeImpl::GetTensorType<uint32_t>(),
DataTypeImpl::GetTensorType<uint64_t>(),
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<MLFloat16>()}) // MLFloat16 is added in opset 14
.TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(), DataTypeImpl::GetTensorType<int64_t>()}),
CumSum);
Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
const Tensor* input = ctx->Input<Tensor>(0); // input tensor
auto rank = static_cast<int64_t>(input->Shape().NumDimensions()); // the rank of the input/output
if (rank == 0)
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot apply CumSum operator on a scalar");
const Tensor* axis_tensor = ctx->Input<Tensor>(1); // axis input tensor
int64_t axis = 0;
ORT_THROW_IF_ERROR(cumsum_op::GetAxis(axis_tensor, rank, axis));
TensorShape output_shape(input->Shape());
auto& output = *ctx->Output(0, output_shape); // output tensor
// output tensor's size is 0, nothing to fill - return
if (output_shape.Size() == 0)
return Status::OK();
const auto& input_dims = input->Shape().GetDims();
int64_t current_dim = rank - 1;
int64_t input_stride_along_axis = 1;
// axis (and by extension current_dim) can never be negative as this is validated much before
// so no need to add the extra check to make sure current_dim is within bounds of the vector size
while (current_dim > axis) {
input_stride_along_axis *= input_dims[current_dim--];
}
fast_divmod fast_divmod_input_dim_along_axis(static_cast<int>(input_dims[axis]));
fast_divmod fast_divmod_input_stride_along_axis(static_cast<int>(input_stride_along_axis));
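// Worked example (illustrative): for an input of shape [2, 3, 4] with axis = 1, the loop
// above walks current_dim from 2 down to the axis and yields input_stride_along_axis = 4,
// while input_dims[axis] = 3; both are wrapped in fast_divmod so the kernel can recover
// the position along the axis from a flat output index.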
if (input->IsDataType<float>()) {
CumSumImpl(Stream(), reinterpret_cast<const typename ToHipType<float>::MappedType*>(input->Data<float>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToHipType<float>::MappedType*>(output.MutableData<float>()),
output_shape.Size(),
exclusive_,
reverse_);
} else if (input->IsDataType<double>()) {
CumSumImpl(Stream(), reinterpret_cast<const typename ToHipType<double>::MappedType*>(input->Data<double>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToHipType<double>::MappedType*>(output.MutableData<double>()),
output_shape.Size(),
exclusive_,
reverse_);
} else if (input->IsDataType<int32_t>()) {
CumSumImpl(Stream(), reinterpret_cast<const typename ToHipType<int32_t>::MappedType*>(input->Data<int32_t>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToHipType<int32_t>::MappedType*>(output.MutableData<int32_t>()),
output_shape.Size(),
exclusive_,
reverse_);
} else if (input->IsDataType<int64_t>()) {
CumSumImpl(Stream(), reinterpret_cast<const typename ToHipType<int64_t>::MappedType*>(input->Data<int64_t>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToHipType<int64_t>::MappedType*>(output.MutableData<int64_t>()),
output_shape.Size(),
exclusive_,
reverse_);
} else if (input->IsDataType<uint32_t>()) {
CumSumImpl(Stream(), reinterpret_cast<const typename ToHipType<uint32_t>::MappedType*>(input->Data<uint32_t>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToHipType<uint32_t>::MappedType*>(output.MutableData<uint32_t>()),
output_shape.Size(),
exclusive_,
reverse_);
} else if (input->IsDataType<uint64_t>()) {
CumSumImpl(Stream(), reinterpret_cast<const typename ToHipType<uint64_t>::MappedType*>(input->Data<uint64_t>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToHipType<uint64_t>::MappedType*>(output.MutableData<uint64_t>()),
output_shape.Size(),
exclusive_,
reverse_);
} else if (input->IsDataType<MLFloat16>()) {
CumSumImpl(Stream(), reinterpret_cast<const typename ToHipType<MLFloat16>::MappedType*>(input->Data<MLFloat16>()),
fast_divmod_input_dim_along_axis,
fast_divmod_input_stride_along_axis,
reinterpret_cast<typename ToHipType<MLFloat16>::MappedType*>(output.MutableData<MLFloat16>()),
output_shape.Size(),
exclusive_,
reverse_);
} else {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported input data type to the CumSum op: ",
input->DataType());
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
class CumSum final : public RocmKernel {
public:
explicit CumSum(const OpKernelInfo& info) : RocmKernel(info) {
// Process exclusive attribute
int64_t exclusive = 0;
auto status = info.GetAttr("exclusive", &exclusive);
if (status.IsOK()) {
if (exclusive == 1 || exclusive == 0) {
exclusive_ = (exclusive == 1);
} else {
ORT_ENFORCE("attribute exclusive can only be 0 or 1");
}
}
// Process reverse attribute
int64_t reverse = 0;
status = info.GetAttr("reverse", &reverse);
if (status.IsOK()) {
if (reverse == 1 || reverse == 0) {
reverse_ = (reverse == 1);
} else {
ORT_ENFORCE("attribute reverse can only be 0 or 1");
}
}
}
~CumSum() = default;
Status ComputeInternal(OpKernelContext* ctx) const override;
private:
bool exclusive_ = false;
bool reverse_ = false;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/shared_inc/fast_divmod.h"
#include "cumsum_impl.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
__global__ void _CumSumKernel(
const T* input_data,
const fast_divmod fast_divmod_input_dim_along_axis,
const fast_divmod fast_divmod_input_stride_along_axis,
T* output_data,
const int64_t output_size,
const bool exclusive,
const bool reverse) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(indices_index, output_size);
int input_dim_along_axis = fast_divmod_input_dim_along_axis.d_;
int input_stride_along_axis = fast_divmod_input_stride_along_axis.d_;
int axis_dim = 0;
int div = fast_divmod_input_stride_along_axis.div(static_cast<int>(indices_index));
fast_divmod_input_dim_along_axis.divmod(div, div, axis_dim);
int start = 0;
int end = 0;
if (!reverse && !exclusive) {
start = 0;
end = axis_dim;
} else if (reverse && !exclusive) {
start = axis_dim;
end = input_dim_along_axis - 1;
} else if (!reverse && exclusive) {
start = 0;
end = axis_dim - 1;
} else { // reverse && exclusive
start = axis_dim + 1;
end = input_dim_along_axis - 1;
}
// count the number of elements to accumulate the sum
int count = end - start + 1;
if (count <= 0) {
output_data[indices_index] = 0;
return;
}
// adjust start index based on the above identified start dim value along the axis of interest
int data_index = static_cast<int>(indices_index) + (start - axis_dim) * input_stride_along_axis;
T sum = 0;
// accumulate 'count' values starting at data_index, stepping by the stride along the axis
while (count != 0) {
sum += input_data[data_index];
data_index += input_stride_along_axis;
--count;
}
output_data[indices_index] = sum;
}
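// Worked example (illustrative): with input_dim_along_axis = 4 and axis_dim = 2 for the
// current output element,
//   !reverse && !exclusive -> sums positions 0..2 (inclusive prefix sum)
//   reverse  && !exclusive -> sums positions 2..3 (inclusive suffix sum)
//   !reverse &&  exclusive -> sums positions 0..1 (prefix excluding self)
//   reverse  &&  exclusive -> sums positions 3..3 (suffix excluding self)
// and count <= 0 (e.g. exclusive at the first position) writes 0.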
template <typename T>
void CumSumImpl(
hipStream_t stream,
const T* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
T* output_data,
int64_t output_size,
bool exclusive,
bool reverse) {
if (output_size > 0) {
int blocksPerGrid = static_cast<int>((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
hipLaunchKernelGGL(HIP_KERNEL_NAME(_CumSumKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream, input_data,
input_dim_along_axis,
input_stride_along_axis,
output_data,
output_size,
exclusive,
reverse);
}
}
template void CumSumImpl<int32_t>(
hipStream_t stream,
const int32_t* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
int32_t* output_data,
int64_t output_size,
bool exclusive,
bool reverse);
template void CumSumImpl<int64_t>(
hipStream_t stream,
const int64_t* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
int64_t* output_data,
int64_t output_size,
bool exclusive,
bool reverse);
template void CumSumImpl<uint32_t>(
hipStream_t stream,
const uint32_t* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
uint32_t* output_data,
int64_t output_size,
bool exclusive,
bool reverse);
template void CumSumImpl<uint64_t>(
hipStream_t stream,
const uint64_t* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
uint64_t* output_data,
int64_t output_size,
bool exclusive,
bool reverse);
template void CumSumImpl<float>(
hipStream_t stream,
const float* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
float* output_data,
int64_t output_size,
bool exclusive,
bool reverse);
template void CumSumImpl<double>(
hipStream_t stream,
const double* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
double* output_data,
int64_t output_size,
bool exclusive,
bool reverse);
template void CumSumImpl<half>(
hipStream_t stream,
const half* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
half* output_data,
int64_t output_size,
bool exclusive,
bool reverse);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void CumSumImpl(
hipStream_t stream,
const T* input_data,
const fast_divmod& input_dim_along_axis,
const fast_divmod& input_stride_along_axis,
T* output_data,
int64_t output_size,
bool exclusive,
bool reverse);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class Gemm final : public RocmKernel {
using Base = RocmKernel;
public:
Gemm(const OpKernelInfo& info) : RocmKernel(info) {
int64_t temp;
ORT_ENFORCE(info.GetAttr<int64_t>("transA", &temp).IsOK());
trans_A_ = (temp != 0);
ORT_ENFORCE(info.GetAttr<int64_t>("transB", &temp).IsOK());
trans_B_ = (temp != 0);
ORT_ENFORCE(info.GetAttr<float>("alpha", &alpha_).IsOK());
ORT_ENFORCE(info.GetAttr<float>("beta", &beta_).IsOK());
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
bool trans_A_;
bool trans_B_;
float alpha_;
float beta_;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class MatMul final : public RocmKernel {
using Base = RocmKernel;
public:
MatMul(const OpKernelInfo& info)
: RocmKernel(info),
alpha_{info.GetAttrOrDefault<float>("alpha", 1.0f)},
trans_A_{info.GetAttrOrDefault<int64_t>("transA", 0) != 0},
trans_B_{info.GetAttrOrDefault<int64_t>("transB", 0) != 0},
trans_batch_a_{info.GetAttrOrDefault<int64_t>("transBatchA", 0) != 0},
trans_batch_b_{info.GetAttrOrDefault<int64_t>("transBatchB", 0) != 0} {}
Status ComputeInternal(OpKernelContext* context) const override;
private:
const float alpha_;
const bool trans_A_;
const bool trans_B_;
const bool trans_batch_a_;
const bool trans_batch_b_;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "matmul_integer.h"
#include "matmul_integer.cuh"
#include "core/providers/cpu/math/matmul_helper.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"
#include "core/providers/rocm/shared_inc/integer_gemm.h"
#include "core/providers/rocm/rocm_allocator.h"
#include "core/providers/common.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_TYPED_KERNEL_EX(
MatMulInteger,
kOnnxDomain,
10,
int8_t,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 2)
.InputMemoryType(OrtMemTypeCPUInput, 3)
.TypeConstraint("T1", DataTypeImpl::GetTensorType<int8_t>())
.TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>())
.TypeConstraint("T3", DataTypeImpl::GetTensorType<int32_t>()),
MatMulInteger<int8_t, int8_t>);
template <>
Status MatMulInteger<int8_t, int8_t>::ComputeInternal(OpKernelContext* ctx) const {
auto a = ctx->Input<Tensor>(0);
auto b = ctx->Input<Tensor>(1);
ORT_ENFORCE(a != nullptr && b != nullptr);
MatMulComputeHelper helper;
ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
Tensor* Y = ctx->Output(0, helper.OutputShape());
// Bail out early if the output is going to be empty
if (Y->Shape().Size() == 0)
return Status::OK();
const int8_t* a_ptr = a->Data<int8_t>();
const int8_t* b_ptr = b->Data<int8_t>();
int32_t* output_ptr = Y->MutableData<int32_t>();
// validate zero points
int8_t a_offset = 0;
int8_t b_offset = 0;
if (has_a_zero_point_) {
auto a_zero_point = ctx->Input<Tensor>(2);
ORT_ENFORCE(IsScalarOr1ElementVector(a_zero_point),
"MatmulInteger : input1 zero point must be a scalar or 1D tensor of size 1");
a_offset = *(a_zero_point->Data<int8_t>());
}
if (has_b_zero_point_) {
auto b_zero_point = ctx->Input<Tensor>(3);
ORT_ENFORCE(IsScalarOr1ElementVector(b_zero_point),
"MatmulInteger : input2 zero point must be a scalar or 1D tensor of size 1");
b_offset = *(b_zero_point->Data<int8_t>());
}
// offset output c[i,j] to
// k*a_offset*b_offset -
// b_offset * (a[i,0] + a[i,1] ...+a[i,k]) -
// a_offset * (b[0,j] + b[1,j] ... + b[k,j])
// ReduceRowSumOnMatrixA computes the b_offset * (a[i,0] + a[i,1] ...+a[i,k]) part
// ReduceColSumOnMatrixB computes the a_offset * (b[0,j] + b[1,j] ... + b[k,j]) part
// OffsetOutput computes the final offset term and writes it into the output
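// Derivation (for reference): with zero points a_offset and b_offset the quantized
// product expands as
//   sum_k (a[i,k] - a_offset) * (b[k,j] - b_offset)
//     = sum_k a[i,k]*b[k,j]
//       - b_offset * sum_k a[i,k]
//       - a_offset * sum_k b[k,j]
//       + K * a_offset * b_offset
// so the three correction terms above are precomputed and the plain int8 GEMM result is
// accumulated on top of them.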
IAllocatorUniquePtr<int32_t> a_row_buf;
if (b_offset != 0) {
a_row_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.N());
ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(Stream(), a_ptr, a_row_buf.get(), b_offset, helper));
}
IAllocatorUniquePtr<int32_t> b_col_buf;
if (a_offset != 0) {
b_col_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.M());
ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(Stream(), b_ptr, b_col_buf.get(), a_offset, helper));
}
int alpha = 1;
int beta = 0;
if (a_offset != 0 || b_offset != 0) {
ORT_RETURN_IF_ERROR(OffsetOutput(Stream(),
a_row_buf.get(),
b_col_buf.get(),
output_ptr,
a_offset,
b_offset,
helper));
beta = 1;
}
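// With beta == 1 the integer GEMM below accumulates sum_k a[i,k]*b[k,j] on top of the
// correction terms already written by OffsetOutput; with beta == 0 (no zero points) it
// simply overwrites the output.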
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ORT_RETURN_IF_ERROR(GemmInt8(static_cast<int>(helper.M()),
static_cast<int>(helper.N()),
static_cast<int>(helper.K()),
alpha,
beta,
a_ptr + helper.LeftOffsets()[batch],
static_cast<int>(helper.K()),
b_ptr + helper.RightOffsets()[batch],
static_cast<int>(helper.N()),
output_ptr + helper.OutputOffsets()[batch],
static_cast<int>(helper.N()),
this));
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "matmul_integer.cuh"
#include <hipcub/hipcub.hpp>
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {
template <int TPB>
__global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_sum, const int8_t offset, int32_t K) {
int32_t thread_data = 0;
const int8_t* row_ptr = matrix + blockIdx.x * K;
for (int i = threadIdx.x; i < K; i += TPB) {
thread_data += *(row_ptr + i);
}
using BlockReduce = hipcub::BlockReduce<int32_t, TPB>;
__shared__ typename BlockReduce::TempStorage temp_storage;
int32_t sum = BlockReduce(temp_storage).Sum(thread_data);
if (threadIdx.x == 0) {
row_sum[blockIdx.x] = offset * sum;
}
}
Status ReduceRowSumOnMatrixA(hipStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ReduceRowSumOnMatrixAKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.LeftOffsets()[batch],
row_sum + batch * helper.M(),
offset,
static_cast<int>(helper.K()));
}
return HIP_CALL(hipGetLastError());
}
template <int TPB>
__global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_sum, const int8_t offset, int32_t row, int32_t col) {
int32_t thread_data = 0;
const int8_t* col_ptr = matrix + blockIdx.x;
for (int i = threadIdx.x; i < row; i += TPB) {
thread_data += *(col_ptr + i * col);
}
using BlockReduce = hipcub::BlockReduce<int32_t, TPB>;
__shared__ typename BlockReduce::TempStorage temp_storage;
int32_t sum = BlockReduce(temp_storage).Sum(thread_data);
if (threadIdx.x == 0) {
col_sum[blockIdx.x] = offset * sum;
}
}
Status ReduceColSumOnMatrixB(hipStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ReduceColSumOnMatrixBKernel<static_cast<int>(GridDim::maxThreadsPerBlock)><<<static_cast<int>(helper.N()), GridDim::maxThreadsPerBlock, 0, stream>>>(matrix + helper.RightOffsets()[batch],
col_sum + batch * helper.N(),
offset,
static_cast<int32_t>(helper.K()),
static_cast<int32_t>(helper.N()));
}
return HIP_CALL(hipGetLastError());
}
__global__ void ComputeOffsetOfMatrixAB(const int32_t* row_sum,
const int32_t* col_sum,
int32_t* output,
int32_t K_A_B,
int32_t N) {
for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
*(output + blockIdx.x * N + i) = K_A_B - row_sum[blockIdx.x] - col_sum[i];
}
}
__global__ void ComputeOffsetOfMatrixA(const int32_t* col_sum,
int32_t* output,
int32_t N) {
for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
*(output + blockIdx.x * N + i) = -col_sum[i];
}
}
__global__ void ComputeOffsetOfMatrixB(const int32_t* row_sum,
int32_t* output,
int32_t N) {
for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
*(output + blockIdx.x * N + i) = -row_sum[blockIdx.x];
}
}
Status OffsetOutput(hipStream_t stream,
const int32_t* row_sum,
const int32_t* col_sum,
int32_t* output,
const int8_t a_offset,
const int8_t b_offset,
const MatMulComputeHelper& helper) {
if (a_offset && b_offset) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ComputeOffsetOfMatrixAB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
row_sum + batch * helper.M(),
col_sum + batch * helper.N(),
output + helper.OutputOffsets()[batch],
static_cast<int32_t>(helper.K()) * a_offset * b_offset,
static_cast<int32_t>(helper.N()));
}
} else if (a_offset) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ComputeOffsetOfMatrixA<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
col_sum + batch * helper.N(),
output + helper.OutputOffsets()[batch],
static_cast<int32_t>(helper.N()));
}
} else if (b_offset) {
for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
ComputeOffsetOfMatrixB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
row_sum + batch * helper.M(),
output + helper.OutputOffsets()[batch],
static_cast<int32_t>(helper.N()));
}
}
return HIP_CALL(hipGetLastError());
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "matmul_integer.h"
#include "core/providers/cpu/math/matmul_helper.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
Status ReduceRowSumOnMatrixA(hipStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper);
Status ReduceColSumOnMatrixB(hipStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper);
Status OffsetOutput(hipStream_t stream,
const int32_t* row_sum,
const int32_t* col_sum,
int32_t* output,
const int8_t a_offset,
const int8_t b_offset,
const MatMulComputeHelper& helper);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T1, typename T2>
class MatMulInteger final : public RocmKernel {
using Base = RocmKernel;
public:
MatMulInteger(const OpKernelInfo& info) : RocmKernel(info) {
has_a_zero_point_ = false;
has_b_zero_point_ = false;
if (info.GetInputCount() > 2) {
has_a_zero_point_ = true;
}
if (info.GetInputCount() > 3) {
has_b_zero_point_ = true;
}
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
bool has_a_zero_point_;
bool has_b_zero_point_;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/gsl.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T, bool is_log_softmax>
Status SoftMaxComputeHelper(
hipStream_t stream,
const T* input,
const TensorShape& shape,
T* Y,
int64_t axis);
template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_warpwise_softmax_forward(hipStream_t stream, output_t* dst, const input_t* src,
int softmax_elements, int softmax_elements_stride, int batch_count);
template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_blockwise_softmax_forward(hipStream_t stream, output_t* output, const input_t* input,
int softmax_elements, int input_stride, int output_stride, int batch_count);
template <typename T>
class Softmax final : public RocmKernel {
public:
Softmax(const OpKernelInfo& info) : RocmKernel{info} {
const auto& node = info.node();
opset_ = node.SinceVersion();
int64_t axis;
Status status = info.GetAttr<int64_t>("axis", &axis);
if (status.IsOK()) {
axis_ = gsl::narrow_cast<int>(axis);
} else {
if (opset_ < 13) {
axis_ = 1; // opset-12 and below, the default axis value is 1
} else {
axis_ = -1; // opset-13, the default axis value is -1
}
}
log_softmax_ = info.GetKernelDef().OpName() == "LogSoftmax";
// We need to cast away the const as PerThreadRocblasHandle() is currently a non-const method
// TODO: Clean up the ROCMExecutionProvider interface to avoid this
rocm_ep_ = const_cast<ROCMExecutionProvider*>(
static_cast<const ROCMExecutionProvider*>(info.GetExecutionProvider()));
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
int64_t axis_;
bool log_softmax_;
int opset_;
// We need access to the ROCM EP instance to get the rocblas handle to use
// for transposing (if applicable)
ROCMExecutionProvider* rocm_ep_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// The code below is mostly copied from PyTorch's SoftMax.cuh
#pragma once
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {
constexpr int ALIGN_BYTES = 16;
const int max_threads = 1024;
dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
uint64_t block_size = 1;
uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));
// In the vectorized case we want to trade off allowing more of the buffers to be accessed
// in a vectorized way against wanting a larger block size to get better utilisation.
// In general with ILP you can have (ILP-1)/ILP of the buffer accessed vectorised, at the risk
// of having a very small block size. We choose to keep >= 1/2 of the buffer vectorised while
// allowing a larger block size.
if (ILP > 1) {
max_block_size /= 2;
}
while (block_size < (max_block_size)) block_size *= 2;
// Launch at least a single warp - the kernel assumes that.
block_size = std::max(block_size, static_cast<uint64_t>(GPU_WARP_SIZE_HOST));
return dim3(static_cast<unsigned int>(block_size));
}
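// Worked example (illustrative): for dim_size = 4096 and ILP = 4, max_block_size is
// min(4096 / 4, 1024) = 1024, halved to 512 because ILP > 1; block_size then doubles from
// 1 up to 512 and is finally clamped to at least one warp (GPU_WARP_SIZE_HOST).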
////////////////////////////////////////////////////////////////////////////////
// Regular kernel (fast when dim_size is large; requires inner_size == 1)
////////////////////////////////////////////////////////////////////////////////
template <typename T, typename AccumT>
struct MaxFloat
{
__device__ __forceinline__ AccumT operator()(AccumT max, T v) const {
return ::max(max, (AccumT)v);
}
};
template<typename T, typename AccumT>
struct AddFloat
{
__device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
return sum + (AccumT)v;
}
};
template<typename T, typename AccumT>
struct SumExpFloat
{
__device__ __forceinline__ SumExpFloat(AccumT v)
: max_k(v) {}
__device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
return sum + expf((AccumT)v - max_k);
}
const AccumT max_k;
};
template <template<typename> class Reduction, typename AccumT>
__device__ __forceinline__ AccumT
blockReduce(AccumT* smem, AccumT val,
const Reduction<AccumT>& r,
AccumT defaultVal)
{
// To avoid RaW races from chaining blockReduce calls together, we need a sync here
__syncthreads();
smem[threadIdx.x] = val;
__syncthreads();
AccumT warpVal = defaultVal;
// First warp will perform per-warp reductions for the remaining warps
uint32_t mask = (((uint64_t)1) << (blockDim.x / GPU_WARP_SIZE)) - 1;
if (threadIdx.x < GPU_WARP_SIZE) {
int lane = threadIdx.x % GPU_WARP_SIZE;
if (lane < blockDim.x / GPU_WARP_SIZE) {
#pragma unroll
for (int i = 0; i < GPU_WARP_SIZE; ++i) {
warpVal = r(warpVal, smem[lane * GPU_WARP_SIZE + i]);
}
#if !defined(USE_ROCM)
__syncwarp(mask);
#endif
smem[lane] = warpVal;
}
}
__syncthreads();
// First thread will perform a reduction of the above per-warp reductions
AccumT blockVal = defaultVal;
if (threadIdx.x == 0) {
for (int i = 0; i < blockDim.x / GPU_WARP_SIZE; ++i) {
blockVal = r(blockVal, smem[i]);
}
smem[0] = blockVal;
}
// Sync and broadcast
__syncthreads();
return smem[0];
}
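// Worked example (illustrative): with blockDim.x = 256 and GPU_WARP_SIZE = 64 there are
// 4 warps; lanes 0..3 of the first warp each combine one 64-element slice of smem into
// smem[lane], thread 0 then folds those 4 partial values into smem[0], and every thread
// reads the result back after the final __syncthreads().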
template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT>
__device__ __forceinline__ AccumT
ilpReduce(int shift,
T* data,
int size,
const Reduction<T, AccumT>& r,
AccumT defaultVal)
{
using LoadT = aligned_vector<T, ILP>;
AccumT threadVal = defaultVal;
int offset = threadIdx.x;
// if the data pointer is not aligned, rewind to the alignment boundary and let each
// thread handle one leading element before the vectorized loop
if(shift > 0){
data -= shift;
size += shift;
if(threadIdx.x >= shift){
threadVal = r(threadVal, data[offset]);
}
size -= blockDim.x;
data += blockDim.x;
}
int last = size % (ILP * blockDim.x);
T v[ILP];
LoadT* value = reinterpret_cast<LoadT*>(&v);
for (; offset * ILP < (size - last); offset += blockDim.x) {
*value = reinterpret_cast<LoadT*>(data)[offset];
#pragma unroll
for (int j = 0; j < ILP; ++j) {
threadVal = r(threadVal, v[j]);
}
}
offset = size - last + threadIdx.x;
// Epilogue
for (; offset < size; offset += blockDim.x)
threadVal = r(threadVal, data[offset]);
return threadVal;
}
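// Worked example (illustrative): for float data with ALIGN_BYTES = 16 and ILP = 4, an
// input pointer 8 bytes past a 16-byte boundary gives shift = 8 / sizeof(float) = 2.
// The shift branch above then consumes the leading elements one value per thread so the
// remaining pointer is 16-byte aligned, after which the aligned_vector loads kick in;
// WriteFpropResultsVectorized below follows the same pattern for reads and writes.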
/**
* This will apply the Epilogue with vectorized reads & writes when input & output have the same shift
*/
template <int ILP, typename scalar_t, typename accum_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
__device__ __forceinline__ void
WriteFpropResultsVectorized(
int size,
const int shift,
scalar_t *input,
outscalar_t *output,
Epilogue<scalar_t, accum_t, outscalar_t> epilogue) {
using LoadT = aligned_vector<scalar_t, ILP>;
using StoreT = aligned_vector<outscalar_t, ILP>;
int offset = threadIdx.x;
// if unaligned, do one value / thread and move on, guaranteeing aligned reads/writes later
if (shift > 0) {
input -= shift;
output -= shift;
size += shift;
if (threadIdx.x >= shift) {
output[offset] = epilogue(input[offset]);
}
size -= blockDim.x;
input += blockDim.x;
output += blockDim.x;
}
const int last = size % (ILP * blockDim.x);
scalar_t in_v[ILP];
LoadT* in_value = reinterpret_cast<LoadT*>(&in_v);
outscalar_t out_v[ILP];
StoreT* out_value = reinterpret_cast<StoreT*>(&out_v);
for (; offset * ILP < (size - last); offset += blockDim.x) {
*in_value = reinterpret_cast<LoadT*>(input)[offset];
#pragma unroll
for (int j = 0; j < ILP; ++j) {
out_v[j] = epilogue(in_v[j]);
}
reinterpret_cast<StoreT*>(output)[offset] = *out_value;
}
offset = size - last + threadIdx.x;
// handle the tail
for (; offset < size; offset += blockDim.x) {
output[offset] = epilogue(input[offset]);
}
}
/**
* This will apply the Epilogue with non-vectorized reads & writes for the general case
*/
template <int ILP, typename scalar_t, typename accum_t, typename outscalar_t, template<typename, typename, typename> class Epilogue>
__device__ __forceinline__ void
WriteFpropResults(
int classes,
scalar_t *input,
outscalar_t *output,
Epilogue<scalar_t, accum_t, outscalar_t> epilogue) {
int offset = threadIdx.x;
int last = classes % (ILP * blockDim.x);
// Main bulk of loop with ILP
for (; offset < classes - last; offset += blockDim.x * ILP) {
scalar_t tmp[ILP];
#pragma unroll
for (int j = 0; j < ILP; ++j) {
tmp[j] = input[offset + j * blockDim.x];
}
#pragma unroll
for (int j = 0; j < ILP; ++j) {
output[offset + j * blockDim.x] = epilogue(tmp[j]);
}
}
// Remainder - no ILP
for (; offset < classes; offset += blockDim.x) {
output[offset] = epilogue(input[offset]);
}
}
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
template <typename, typename, typename> class Epilogue>
__global__ void
softmax_block_forward(outscalar_t* output, scalar_t* input, int classes, int input_stride, int output_stride) {
extern __shared__ unsigned char smem[];
auto sdata = reinterpret_cast<accscalar_t*>(smem);
using LoadT = aligned_vector<scalar_t, ILP>;
using StoreT = aligned_vector<outscalar_t, ILP>;
// forward pointers to batch[blockIdx.x]
// each block handles a sample in the mini-batch
input += blockIdx.x * input_stride;
output += blockIdx.x * output_stride;
const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
const int output_shift = ((uint64_t)output) % ALIGN_BYTES / sizeof(outscalar_t);
// find the max
accscalar_t threadMax = ilpReduce<MaxFloat, ILP, scalar_t, accscalar_t>(
shift, input, classes, MaxFloat<scalar_t, accscalar_t>(), -std::numeric_limits<accscalar_t>::max());
accscalar_t max_k = blockReduce<Max, accscalar_t>(
sdata, threadMax, Max<accscalar_t>(), -std::numeric_limits<accscalar_t>::max());
// reduce all values
accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
accscalar_t sumAll = blockReduce<Add, accscalar_t>(
sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));
Epilogue<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);
if (shift == output_shift) {
WriteFpropResultsVectorized<ILP, scalar_t, accscalar_t, outscalar_t, Epilogue>(classes, shift, input, output, epilogue);
} else {
WriteFpropResults<ILP, scalar_t, accscalar_t, outscalar_t, Epilogue>(classes, input, output, epilogue);
}
}
template<typename T, typename AccumT, typename OutT>
struct LogSoftMaxForwardEpilogue {
__device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
: max_input(max_input), logsum(logf(sum)) {}
__device__ __forceinline__ OutT operator()(T input) const {
return static_cast<OutT>((AccumT)input - max_input - logsum);
}
const AccumT max_input;
const AccumT logsum;
};
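// For reference, this implements the numerically stable form
//   log_softmax(x_i) = x_i - max_j x_j - log(sum_j exp(x_j - max_j x_j)),
// where max_input and sum are the block-reduced maximum and sum of shifted exponentials
// computed in softmax_block_forward above.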
template<typename T, typename AccumT, typename OutT>
struct SoftMaxForwardEpilogue {
__device__ __forceinline__ SoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
: max_input(max_input)
, sum(sum) {}
__device__ __forceinline__ OutT operator()(T input) const {
return static_cast<OutT>(expf((AccumT)input - max_input) / sum);
}
const AccumT max_input;
const AccumT sum;
};
}
}
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/status.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {
Status SoftmaxForward(miopenHandle_t miopen_handle, const void* alpha, const miopenTensorDescriptor_t input_tensor,
const void* input_data, const void* beta, const miopenTensorDescriptor_t output_tensor,
void* output_data);
Status SoftmaxBackward(miopenHandle_t miopen_handle, bool is_log_softmax, const void* alpha,
const miopenTensorDescriptor_t input_tensor, const void* output_data,
const void* output_grad_data, const void* beta, const miopenTensorDescriptor_t output_tensor,
void* input_grad_data);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "topk.h"
#include "topk_impl.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
TopK,
kOnnxDomain,
1, 9,
kRocmExecutionProvider,
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
TopK<false>);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
TopK,
kOnnxDomain,
10, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create()).InputMemoryType(OrtMemTypeCPUInput, 1).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()).TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),
TopK<true>);
ONNX_OPERATOR_KERNEL_EX(
TopK,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create()).InputMemoryType(OrtMemTypeCPUInput, 1).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()).TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),
TopK<true>);
template <bool inputk>
TopK<inputk>::TopK(const OpKernelInfo& info) : RocmKernel(info) {
info.GetAttrOrDefault<int64_t>("axis", &axis_, -1);
info.GetAttrOrDefault<int64_t>("largest", &largest_, 1);
info.GetAttrOrDefault<int64_t>("sorted", &sorted_, 1);
if (!inputk) {
info.GetAttrOrDefault<int64_t>("k", &K_, 0);
}
}
#define IS_PRIM_TYPE(T) utils::IsPrimitiveDataType<T>(prim_type)
#define TOPKIMPL(T) TopKImpl<T>(this, stream, tensor_X->Data<T>(), \
static_cast<T*>(tensor_V->MutableDataRaw()), \
static_cast<int64_t*>(tensor_I->MutableDataRaw()), \
elem_nums_rocm, \
elem_nums.size(), \
axis, K_, largest_, sorted_, N, dimension)
template <bool inputk>
Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
auto tensor_X = ctx->Input<Tensor>(0);
ORT_ENFORCE(nullptr != tensor_X);
int32_t rank = static_cast<int32_t>(tensor_X->Shape().NumDimensions());
int32_t axis = static_cast<int32_t>(axis_ < 0 ? rank + axis_ : axis_);
ORT_ENFORCE(axis > -1 && axis < rank);
if (inputk) {
auto tensor_K = ctx->Input<Tensor>(1);
ORT_ENFORCE(nullptr != tensor_K);
K_ = *tensor_K->Data<int64_t>();
ORT_ENFORCE(K_ >= 0 && K_ <= tensor_X->Shape().GetDims()[axis]);
}
auto output_shape = tensor_X->Shape();
output_shape[axis] = K_;
auto tensor_V = ctx->Output(0, output_shape);
auto tensor_I = ctx->Output(1, output_shape);
if (0 == K_) {
return Status::OK();
}
auto elem_nums = tensor_X->Shape().AsShapeVector();
auto dimension = elem_nums[axis];
for (auto i = static_cast<int64_t>(elem_nums.size()) - 2; i >= 0; --i) {
elem_nums[i] *= elem_nums[i + 1];
}
auto N = elem_nums[0] / dimension;
TArray<int64_t> elem_nums_rocm(elem_nums);
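// Worked example (illustrative): for an input of shape [2, 3, 4] with axis = 1,
// dimension = 3 is captured before the loop, the suffix-product loop turns elem_nums into
// [24, 12, 4], and N = 24 / 3 = 8 is the number of independent slices that TopKImpl
// processes along the axis.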
auto prim_type = tensor_X->DataType()->AsPrimitiveDataType();
if (prim_type == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
}
hipStream_t stream = this->Stream();
if (IS_PRIM_TYPE(uint8_t)) return TOPKIMPL(uint8_t);
if (IS_PRIM_TYPE(uint16_t)) return TOPKIMPL(uint16_t);
if (IS_PRIM_TYPE(uint32_t)) return TOPKIMPL(uint32_t);
if (IS_PRIM_TYPE(uint64_t)) return TOPKIMPL(uint64_t);
if (IS_PRIM_TYPE(int8_t)) return TOPKIMPL(int8_t);
if (IS_PRIM_TYPE(int16_t)) return TOPKIMPL(int16_t);
if (IS_PRIM_TYPE(int32_t)) return TOPKIMPL(int32_t);
if (IS_PRIM_TYPE(int64_t)) return TOPKIMPL(int64_t);
if (IS_PRIM_TYPE(MLFloat16)) return TOPKIMPL(MLFloat16);
if (IS_PRIM_TYPE(float)) return TOPKIMPL(float);
if (IS_PRIM_TYPE(double)) return TOPKIMPL(double);
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
}
} // namespace rocm
} // namespace onnxruntime