Commit 1a91fcc2 authored by gaoqiong's avatar gaoqiong

add files required for dtk

parent a144865d
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
void GatherImpl(
hipStream_t stream,
const int64_t input_block_size,
const int64_t indices_max,
const fast_divmod& output_block_size,
const fast_divmod& block_size,
const void* indices_data,
size_t index_element_size,
const void* input_data,
size_t element_size,
void* output_data,
const size_t N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/gather_nd.h"
#include "core/providers/rocm/tensor/gather_nd_impl.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
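// Verifies that each tensor shape has at least num_batch_dimensions dimensions and that the
// leading num_batch_dimensions dimensions agree across all of the given shapes.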
Status CheckBatchDimensionsMatch(
size_t num_batch_dimensions,
const std::vector<std::reference_wrapper<TensorShape>>& tensor_shapes) {
for (size_t tensor_shape_idx = 0; tensor_shape_idx < tensor_shapes.size(); ++tensor_shape_idx) {
const TensorShape& tensor_shape = tensor_shapes[tensor_shape_idx];
ORT_RETURN_IF_NOT(
num_batch_dimensions <= tensor_shape.NumDimensions(),
"Number of batch dimensions exceeds tensor rank. ",
"Batch dimension count: ", num_batch_dimensions,
", tensor rank: ", tensor_shape.NumDimensions(),
", tensor index: ", tensor_shape_idx);
}
if (tensor_shapes.empty()) return Status::OK();
const TensorShape& first_tensor_shape = tensor_shapes.front();
for (size_t batch_dimension_idx = 0; batch_dimension_idx < num_batch_dimensions; ++batch_dimension_idx) {
for (size_t tensor_shape_idx = 1; tensor_shape_idx < tensor_shapes.size(); ++tensor_shape_idx) {
const TensorShape& other_tensor_shape = tensor_shapes[tensor_shape_idx];
ORT_RETURN_IF_NOT(
first_tensor_shape[batch_dimension_idx] == other_tensor_shape[batch_dimension_idx],
"Batch dimensions differ at index ", batch_dimension_idx, ": ",
first_tensor_shape[batch_dimension_idx], " != ", other_tensor_shape[batch_dimension_idx],
", tensor indices: 0, ", tensor_shape_idx);
}
}
return Status::OK();
}
template <typename TIndex>
Status GatherNDBase::PrepareCompute(
hipStream_t stream,
const int64_t batch_dims,
const TensorShape& input_shape,
const TensorShape& indices_shape,
const Tensor* indices_tensor,
int64_t& num_slices,
int64_t& slice_size,
IAllocatorUniquePtr<int64_t>& input_slice_offsets_buffer) const {
const auto num_slice_dims = indices_shape[indices_shape.NumDimensions() - 1];
num_slices = indices_shape.SizeToDimension(indices_shape.NumDimensions() - 1);
slice_size = input_shape.SizeFromDimension(batch_dims + num_slice_dims);
const auto num_batches = input_shape.SizeToDimension(batch_dims);
const auto input_batch_stride = input_shape.SizeFromDimension(batch_dims);
const auto num_slices_per_batch = num_slices / num_batches;
const TIndex* const indices_data = indices_tensor->Data<TIndex>();
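// sizes_from_slice_dims[i] is the element stride of input dimension (batch_dims + i), i.e. how
// many input elements one step along that slice dimension spans.
// Illustrative example (not from the original source): input shape [2, 3, 4, 5] with
// batch_dims = 1 and num_slice_dims = 2 gives slice_size = 5 and sizes_from_slice_dims = {20, 5}.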
std::vector<int64_t> sizes_from_slice_dims(num_slice_dims);
{
auto running_product = slice_size;
for (int64_t i = 0; i < num_slice_dims; ++i) {
sizes_from_slice_dims[num_slice_dims - 1 - i] = running_product;
running_product *= input_shape[batch_dims + num_slice_dims - 1 - i];
}
}
auto sizes_from_slice_dims_buffer = GetScratchBuffer<int64_t>(sizes_from_slice_dims.size());
HIP_RETURN_IF_ERROR(hipMemcpyAsync(
sizes_from_slice_dims_buffer.get(),
sizes_from_slice_dims.data(),
sizes_from_slice_dims.size() * sizeof(int64_t),
hipMemcpyHostToDevice, stream));
input_slice_offsets_buffer = GetScratchBuffer<int64_t>(num_slices);
TArray<int64_t> input_dims(input_shape.GetDims());
ComputeSliceOffsetsImpl(
stream,
batch_dims,
input_dims,
num_slices,
num_slices_per_batch,
input_batch_stride,
num_slice_dims,
sizes_from_slice_dims_buffer.get(),
indices_data,
input_slice_offsets_buffer.get());
return Status::OK();
}
#define REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(TIndex, startver, endver) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
GatherND, \
kOnnxDomain, \
startver, \
endver, \
TIndex, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", \
std::vector<MLDataType>{ \
DataTypeImpl::GetTensorType<float>(), \
DataTypeImpl::GetTensorType<double>(), \
DataTypeImpl::GetTensorType<MLFloat16>(), \
DataTypeImpl::GetTensorType<int64_t>(), \
DataTypeImpl::GetTensorType<bool>(), \
}) \
.TypeConstraint("indices", DataTypeImpl::GetTensorType<TIndex>()), \
GatherND<TIndex>);
#define REGISTER_KERNEL_TYPED_GATHER_ND(TIndex, ver) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
GatherND, kOnnxDomain, ver, TIndex, kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", BuildKernelDefConstraints<float, MLFloat16, double, int64_t, BFloat16, bool>()) \
.TypeConstraint("indices", DataTypeImpl::GetTensorType<TIndex>()), \
GatherND<TIndex>);
REGISTER_KERNEL_TYPED_GATHER_ND(int64_t, 13)
REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(int64_t, 12, 12)
REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(int64_t, 11, 11)
template <typename T>
struct GatherNDComputeImpl {
void operator()(hipStream_t stream,
const int64_t num_slices,
const int64_t slice_size,
const void* const kernel_input_data,
void* const kernel_output_data,
int64_t* const input_slice_offsets_data) const {
typedef typename ToHipType<T>::MappedType HipT;
GatherNDImpl<HipT>(stream,
num_slices, kernel_input_data,
kernel_output_data, slice_size,
input_slice_offsets_data);
}
};
template <typename TIndex>
Status GatherND<TIndex>::ComputeInternal(OpKernelContext* context) const {
auto input_tensor = context->Input<Tensor>(0);
auto indices_tensor = context->Input<Tensor>(1);
ORT_RETURN_IF_NOT(input_tensor != nullptr, "input_tensor == nullptr");
ORT_RETURN_IF_NOT(indices_tensor != nullptr, "indices_tensor == nullptr");
auto input_shape = input_tensor->Shape();
auto indices_shape = indices_tensor->Shape();
if (indices_shape.NumDimensions() == 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"indices tensor must has rank larger than 0");
}
auto last_indices_dimension = batch_dims_ + indices_shape[indices_shape.NumDimensions() - 1];
if (last_indices_dimension > static_cast<int64_t>(input_shape.NumDimensions())) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"last dimension of indices must not be larger than rank of input tensor");
}
ORT_RETURN_IF_ERROR(CheckBatchDimensionsMatch(
static_cast<size_t>(batch_dims_), {input_shape, indices_shape}));
// Output shape: indices_shape[:-1] followed by input_shape[last_indices_dimension:]
std::vector<int64_t> shape(indices_shape.GetDims().begin(), indices_shape.GetDims().end() - 1);
shape.insert(shape.end(), input_shape.GetDims().begin() + last_indices_dimension, input_shape.GetDims().end());
auto output_tensor = context->Output(0, TensorShape(shape));
// Bail out early in case the output is going to be empty
if (output_tensor->Shape().Size() == 0) {
return Status::OK();
}
// Compute
int64_t num_slices;
int64_t slice_size;
IAllocatorUniquePtr<int64_t> input_slice_offsets_buffer;
ORT_RETURN_IF_ERROR(PrepareCompute<TIndex>(Stream(),
batch_dims_, input_shape, indices_shape, indices_tensor,
num_slices, slice_size, input_slice_offsets_buffer));
const void* const kernel_input_data = input_tensor->DataRaw();
void* const kernel_output_data = output_tensor->MutableDataRaw();
utils::MLTypeCallDispatcher<float, MLFloat16, double, int64_t, BFloat16, bool> t_disp(input_tensor->GetElementType());
t_disp.Invoke<GatherNDComputeImpl>(Stream(), num_slices, slice_size, kernel_input_data, kernel_output_data,
input_slice_offsets_buffer.get());
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
Status CheckBatchDimensionsMatch(
size_t num_batch_dimensions,
const std::vector<std::reference_wrapper<TensorShape>>& tensor_shapes);
class GatherNDBase : public RocmKernel {
public:
GatherNDBase(const OpKernelInfo& info) : RocmKernel(info) {
info.GetAttrOrDefault("batch_dims", &batch_dims_, static_cast<int64_t>(0));
ORT_ENFORCE(batch_dims_ >= 0);
}
protected:
template <typename TIndex>
Status PrepareCompute(
hipStream_t stream,
const int64_t batch_dims,
const TensorShape& input_shape,
const TensorShape& indices_shape,
const Tensor* indices_tensor,
int64_t& num_slices,
int64_t& slice_size,
IAllocatorUniquePtr<int64_t>& input_slice_offsets_buffer) const;
int64_t batch_dims_;
};
template <typename Tind>
class GatherND final : public GatherNDBase {
public:
GatherND(const OpKernelInfo& info) : GatherNDBase(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/gather_nd_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/atomic/common.cuh"
namespace onnxruntime {
namespace rocm {
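// One thread per slice: the slice's batch contributes batch_idx * input_batch_stride, and each
// (possibly negative) index along a slice dimension adds index * stride of that dimension, with
// the strides supplied precomputed in sizes_from_slice_dims_data.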
template <typename TIndex>
__global__ void _ComputeSliceOffsetsKernel(
const int64_t batch_dims,
const TArray<int64_t> input_dims,
const size_t num_slices,
const size_t num_slices_per_batch,
const size_t input_batch_stride,
const size_t num_slice_dims,
const int64_t* const sizes_from_slice_dims_data, // num_slice_dims elements
const TIndex* const indices_data, // num_slices * num_slice_dims elements
int64_t* const input_slice_offsets_data) { // num_slices elements
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(slice_idx, num_slices)
const size_t batch_idx = slice_idx / num_slices_per_batch;
const size_t base_offset = batch_idx * input_batch_stride;
const TIndex* const slice_indices = indices_data + slice_idx * num_slice_dims;
size_t relative_slice_offset = 0;
for (size_t dim_idx = 0; dim_idx < num_slice_dims; ++dim_idx) {
int64_t index = static_cast<int64_t>(slice_indices[dim_idx]);
const size_t input_dim_idx = batch_dims + dim_idx;
HIP_KERNEL_ASSERT(index >= -input_dims[input_dim_idx] && index < input_dims[input_dim_idx]);
if (index < 0) index += input_dims[input_dim_idx];
relative_slice_offset += index * sizes_from_slice_dims_data[dim_idx];
}
input_slice_offsets_data[slice_idx] = base_offset + relative_slice_offset;
}
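// One thread per output element: element i belongs to slice i / slice_size, whose start offset
// in the input is slice_offsets[i / slice_size]; i % slice_size selects the element within it.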
template <typename T>
__global__ void _GatherNDKernel(
const size_t num_slices,
const T* input_data,
T* output_data,
const size_t slice_size,
const int64_t* slice_offsets) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(i, num_slices * slice_size)
uint64_t slice_offset = slice_offsets[i / slice_size];
output_data[i] = input_data[slice_offset + i % slice_size];
};
template <typename TIndex>
void ComputeSliceOffsetsImpl(
hipStream_t stream,
const int64_t batch_dims,
const TArray<int64_t> input_dims,
const size_t num_slices,
const size_t num_slices_per_batch,
const size_t input_batch_stride,
const size_t num_slice_dims,
const int64_t* const sizes_from_slice_dims_data, // num_slice_dims elements
const TIndex* const indices_data, // num_slices * num_slice_dims elements
int64_t* const input_slice_offsets_data) { // num_slices elements
const unsigned int blocks_per_grid = static_cast<unsigned int>(CeilDiv(num_slices, GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(_ComputeSliceOffsetsKernel, blocks_per_grid, GridDim::maxThreadsPerBlock, 0, stream,
batch_dims,
input_dims,
num_slices,
num_slices_per_batch,
input_batch_stride,
num_slice_dims,
sizes_from_slice_dims_data,
indices_data,
input_slice_offsets_data);
}
template <typename T>
void GatherNDImpl(
hipStream_t stream,
const size_t num_slices,
const void* input_data,
void* output_data,
const size_t slice_size,
const int64_t* input_slice_offsets_data) {
const unsigned int blocks_per_grid = static_cast<unsigned int>(CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock));
hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDKernel<T>), blocks_per_grid, GridDim::maxThreadsPerBlock, 0, stream,
num_slices, static_cast<const T*>(input_data), static_cast<T*>(output_data), slice_size, input_slice_offsets_data);
}
#define SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(TIndex) \
template void ComputeSliceOffsetsImpl<TIndex>( \
hipStream_t stream, \
const int64_t batch_dims, \
const TArray<int64_t> input_dims, \
const size_t num_slices, \
const size_t num_slices_per_batch, \
const size_t input_batch_stride, \
const size_t num_slice_dims, \
const int64_t* const sizes_from_slice_dims_data, \
const TIndex* const indices_data, \
int64_t* const input_slice_offsets_data);
#define SPECIALIZED_IMPL(T) \
template void GatherNDImpl<T>(hipStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data);
SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int32_t)
SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t)
SPECIALIZED_IMPL(bool)
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(int64_t)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(BFloat16)
#endif
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template<typename TIndex>
void ComputeSliceOffsetsImpl(
hipStream_t stream,
const int64_t batch_dims,
const TArray<int64_t> input_dims,
const size_t num_slices,
const size_t num_slices_per_batch,
const size_t input_batch_stride,
const size_t num_slice_dims,
const int64_t* const sizes_from_slice_dims_data, // num_slice_dims elements
const TIndex* const indices_data, // num_slices * num_slice_dims elements
int64_t* const input_slice_offsets_data); // num_slices elements
template <typename T>
void GatherNDImpl(
hipStream_t stream,
const size_t num_slices,
const void* input_data,
void* output_data,
const size_t slice_size,
const int64_t* input_slice_offsets_data);
#ifdef ENABLE_TRAINING
template <typename T>
void GatherNDGradImpl(
hipStream_t stream,
const size_t num_slices,
const void* update_data,
void* output_data,
const size_t slice_size,
const int64_t* input_slice_offsets_data);
#endif
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "identity_op.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Dropout,
kOnnxDomain,
7, 9,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", {DataTypeImpl::GetTensorType<MLFloat16>(),
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>()})
.Alias(0, 0),
IdentityOp<true>);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Dropout,
kOnnxDomain,
10,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", {DataTypeImpl::GetTensorType<MLFloat16>(),
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>()})
.TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>())
.Alias(0, 0),
IdentityOp<true>);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Identity,
kOnnxDomain,
1, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.Alias(0, 0),
IdentityOp<false>);
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Identity,
kOnnxDomain,
13, 13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.Alias(0, 0),
IdentityOp<false>);
ONNX_OPERATOR_KERNEL_EX(
Identity,
kOnnxDomain,
14,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("V", DataTypeImpl::AllFixedSizeTensorAndSequenceTensorTypes())
.Alias(0, 0),
IdentityOp<false>);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <bool is_dropout>
class IdentityOp final : public RocmKernel {
public:
IdentityOp(const OpKernelInfo& info) : RocmKernel(info) {
}
Status ComputeInternal(OpKernelContext* context) const override {
auto X_ml_type = context->InputType(0);
if (X_ml_type->IsTensorType()) {
const Tensor* X = context->Input<Tensor>(0);
if (nullptr == X) {
return Status(common::ONNXRUNTIME, common::FAIL,
"IdentityOp rocm: input count mismatch.");
}
const TensorShape& shape = X->Shape();
Tensor* Y = context->Output(0, shape);
if (nullptr == Y) {
return Status(common::ONNXRUNTIME, common::FAIL,
"IdentityOp rocm: failed to allocate output tensor.");
}
auto X_type = X->DataType();
const void* source = X->DataRaw(X_type);
void* target = Y->MutableDataRaw(X_type);
// If source and target pointers are not equal, we need to copy the data.
if (target != source) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(target, source, X->Shape().Size() * X->DataType()->Size(), hipMemcpyDeviceToDevice, Stream()));
}
if (is_dropout) {
Tensor* mask = context->Output(1, shape);
// A 'nullptr' return means the optional 'mask' output is not used.
if (mask != nullptr) {
// Opset 7 differs from opset 10 in that the type of the 'mask' output
// is tied to the type of the input in opset 7, whereas in opset 10
// 'mask' is always 'bool', so zero-filling the raw buffer works for both.
void* mask_data = mask->MutableDataRaw();
// In 'test'/'inference' mode, there are no input values dropped out
// so fill the buffer with 0/false
HIP_RETURN_IF_ERROR(hipMemsetAsync(mask_data, 0, mask->SizeInBytes(), Stream()));
}
}
} else if (X_ml_type->IsTensorSequenceType()) {
const TensorSeq* X = context->Input<TensorSeq>(0);
ORT_ENFORCE(X != nullptr, "IdentityOp rocm: input tensor is missing.");
TensorSeq* Y = context->Output<TensorSeq>(0);
ORT_ENFORCE(Y != nullptr, "IdentityOp rocm: failed to allocate output tensor sequence.");
if (X == Y) {
return Status::OK();
}
auto X_type = X->DataType();
Y->SetType(X_type);
AllocatorPtr alloc;
auto status = context->GetTempSpaceAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"IdentityOp rocm: unable to get an allocator.");
}
auto X_size = X->Size();
for (size_t i = 0; i < X_size; ++i) {
const Tensor& source_tensor = X->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
source_tensor.Shape(), alloc);
HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor.DataRaw(),
source_tensor.SizeInBytes(),
hipMemcpyDeviceToDevice, Stream()));
Y->Add(std::move(*target_tensor));
}
} else {
return Status(common::ONNXRUNTIME, common::FAIL,
"IdentityOp rocm: unsupported input type.");
}
return Status::OK();
}
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "nonzero_impl.h"
#include "core/providers/rocm/shared_inc/rocm_call.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include <hipcub/hipcub.hpp>
namespace onnxruntime {
namespace rocm {
static const int NONZERO_THREADS_PER_BLOCK = GridDim::maxThreadsPerBlock;
// TODO: check overflow
int NonZeroCalcBlockCount(int64_t x_size) {
return static_cast<int>(CeilDiv(x_size, NONZERO_THREADS_PER_BLOCK));
}
hipError_t NonZeroCalcPrefixSumTempStorageBytes(
hipStream_t stream, int* prefix_counts, int number_of_blocks, size_t& temp_storage_bytes) {
temp_storage_bytes = 0;
return hipcub::DeviceScan::InclusiveSum(nullptr, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, stream);
}
hipError_t NonZeroInclusivePrefixSum(
hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks) {
return hipcub::DeviceScan::InclusiveSum(
d_temp_storage, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, stream);
}
template <typename InputT, int THREADS_PER_BLOCK>
__global__ void NonZeroCountEachBlockKernel(const InputT* x, int64_t x_size, int* count_in_blocks) {
typedef hipcub::BlockReduce<int, THREADS_PER_BLOCK, hipcub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduceT;
__shared__ typename BlockReduceT::TempStorage temp_storage;
int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
// const hipcub::CastOp<bool> cast_to_bool; not supported on amd hipcub
int nz = 0;
if (index < x_size && bool(x[index])) ++nz;
int count = BlockReduceT(temp_storage).Sum(nz);
if (threadIdx.x == 0) {
count_in_blocks[blockIdx.x] = count;
}
}
template <typename InputT, int THREADS_PER_BLOCK>
__global__ void NonZeroOutputPositionsKernel(
const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod> x_strides,
const int* prefix_counts, int nonzero_elements, int64_t* results) {
typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScanT;
__shared__ typename BlockScanT::TempStorage temp_storage;
int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
// const hipcub::CastOp<bool> cast_to_bool; not supported on amd hipcub
int nz = 0;
if (index < x_size && bool(x[index])) ++nz;
int pos_in_block = 0;
BlockScanT(temp_storage).InclusiveSum(nz, pos_in_block);
int result_position = ((blockIdx.x == 0) ? 0 : prefix_counts[blockIdx.x - 1]) + pos_in_block - nz;
if (index < x_size && bool(x[index])) {
int remain = (int)index, dim = 0;
for (int axis = 0, rp = result_position; axis < x_rank; ++axis, rp += nonzero_elements) {
x_strides[axis].divmod(remain, dim, remain);
results[rp] = (int64_t)dim;
}
}
}
constexpr int MAX_DIMS = 16;
template <typename InputT, int THREADS_PER_BLOCK>
__global__ void UnRolledNonZeroOutputPositionsKernel(
const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod> x_strides,
const int* prefix_counts, int nonzero_elements, int64_t* results) {
typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScanT;
__shared__ typename BlockScanT::TempStorage temp_storage;
int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
// const hipcub::CastOp<bool> cast_to_bool; not supported on amd hipcub
int nz = 0;
if (index < x_size && bool(x[index])) ++nz;
int pos_in_block = 0;
BlockScanT(temp_storage).InclusiveSum(nz, pos_in_block);
int result_position = ((blockIdx.x == 0) ? 0 : prefix_counts[blockIdx.x - 1]) + pos_in_block - nz;
if (index < x_size && bool(x[index])) {
int remain = (int)index, dim = 0;
int rp = result_position;
#pragma unroll
for (int axis = 0; axis < MAX_DIMS; ++axis) {
if (axis == x_rank) {
break;
}
x_strides[axis].divmod(remain, dim, remain);
results[rp] = (int64_t)dim;
rp += nonzero_elements;
}
}
}
template <typename InputT>
hipError_t NonZeroCountEachBlock(hipStream_t stream, const InputT* x, int64_t x_size, int* count_in_blocks) {
int num_blocks = NonZeroCalcBlockCount(x_size);
hipLaunchKernelGGL(HIP_KERNEL_NAME(NonZeroCountEachBlockKernel<InputT, NONZERO_THREADS_PER_BLOCK>), num_blocks, NONZERO_THREADS_PER_BLOCK, 0, stream,
x, x_size, count_in_blocks);
return hipSuccess;
}
template <typename InputT>
hipError_t NonZeroOutputPositions(
hipStream_t stream, const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod>& x_strides,
const int* prefix_counts, int nonzero_elements, int64_t* results) {
int num_blocks = NonZeroCalcBlockCount(x_size);
if (x_rank > MAX_DIMS) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(NonZeroOutputPositionsKernel<InputT, NONZERO_THREADS_PER_BLOCK>), num_blocks, NONZERO_THREADS_PER_BLOCK, 0, stream,
x, x_size, x_rank, x_strides,
prefix_counts, nonzero_elements, results);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(UnRolledNonZeroOutputPositionsKernel<InputT, NONZERO_THREADS_PER_BLOCK>), num_blocks, NONZERO_THREADS_PER_BLOCK, 0, stream,
x, x_size, x_rank, x_strides,
prefix_counts, nonzero_elements, results);
}
return hipSuccess;
}
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const bool*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const uint8_t*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const int64_t*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const int32_t*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const float*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const half*, int64_t, int*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const bool*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const uint8_t*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const int64_t*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const int32_t*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const float*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const half*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
int NonZeroCalcBlockCount(int64_t x_size);
hipError_t NonZeroCalcPrefixSumTempStorageBytes(hipStream_t stream, int* prefix_counts, int number_of_blocks, size_t& temp_storage_bytes);
hipError_t NonZeroInclusivePrefixSum(hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks);
// Counts the nonzero elements in each block into counts_in_blocks;
// the counts_in_blocks buffer must be pre-allocated on the GPU by the caller.
template<typename InputT>
hipError_t NonZeroCountEachBlock(hipStream_t stream, const InputT* x, int64_t x_size, int* counts_in_blocks);
// Writes the positions of the nonzero elements using input x and the per-block prefix_counts.
template<typename InputT>
hipError_t NonZeroOutputPositions(
hipStream_t stream, const InputT *x, int64_t x_size, int x_rank, const TArray<fast_divmod>& x_strides,
const int* prefix_counts, int nonzero_elements, int64_t* results);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "nonzero_op.h"
#include "nonzero_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
// kernel builder functions
#define NONZERO_TYPED_KERNEL_WITH_TYPE_NAME(type, type_name) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
NonZero, \
kOnnxDomain, \
9, 12, \
type_name, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
NonZero<type>) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
NonZero, \
kOnnxDomain, \
13, \
type_name, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
NonZero<type>)
#define NONZERO_TYPED_KERNEL(type) \
NONZERO_TYPED_KERNEL_WITH_TYPE_NAME(type, type)
// start with a subset of types, enable more as needed...
NONZERO_TYPED_KERNEL(bool)
NONZERO_TYPED_KERNEL(uint8_t)
//NONZERO_TYPED_KERNEL(uint16_t)
//NONZERO_TYPED_KERNEL(uint32_t)
//NONZERO_TYPED_KERNEL(uint64_t)
//NONZERO_TYPED_KERNEL(int8_t)
//NONZERO_TYPED_KERNEL(int16_t)
NONZERO_TYPED_KERNEL(int32_t)
NONZERO_TYPED_KERNEL(int64_t)
NONZERO_TYPED_KERNEL(MLFloat16)
//NONZERO_TYPED_KERNEL(BFloat16)
NONZERO_TYPED_KERNEL(float)
//NONZERO_TYPED_KERNEL(double)
//NONZERO_TYPED_KERNEL_WITH_TYPE_NAME(std::string, string)
#undef NONZERO_TYPED_KERNEL
#undef NONZERO_TYPED_KERNEL_WITH_TYPE_NAME
template <typename T>
Status NonZero<T>::ComputeInternal(OpKernelContext* context) const {
static const TensorShape kScalarDims{1};
const auto x = context->Input<Tensor>(0);
int nonzero_elements = 0;
const auto& x_shape = x->Shape();
const int x_rank = x_shape.IsScalar() ? 1 : static_cast<int>(x_shape.NumDimensions());
auto x_dims = (x_shape.IsScalar()) ? kScalarDims.GetDims() : x_shape.GetDims();
const int64_t x_size = x_shape.Size();
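// Three-stage GPU pipeline: count the nonzero elements per block, run a hipcub inclusive prefix
// sum over the per-block counts, then scatter the coordinates of each nonzero element into the
// [x_rank, nonzero_elements] output, one column per nonzero element.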
if (x_size > 0) {
auto x_data = reinterpret_cast<const typename ToHipType<T>::MappedType*>(x->Data<T>());
const int number_of_blocks = NonZeroCalcBlockCount(x_size);
auto prefix_buffer = GetScratchBuffer<int>(number_of_blocks);
int* prefix_counts = prefix_buffer.get();
HIP_RETURN_IF_ERROR(NonZeroCountEachBlock(Stream(), x_data, x_size, prefix_counts));
size_t temp_storage_bytes = 0;
HIP_RETURN_IF_ERROR(NonZeroCalcPrefixSumTempStorageBytes(Stream(), prefix_counts, number_of_blocks, temp_storage_bytes));
auto temp_buffer = GetScratchBuffer<uint8_t>(temp_storage_bytes);
auto d_temp_storage = temp_buffer.get();
HIP_RETURN_IF_ERROR(NonZeroInclusivePrefixSum(Stream(), d_temp_storage, temp_storage_bytes, prefix_counts, number_of_blocks));
// hipMemcpyAsync from device memory to pageable host memory will return only once the copy has completed.
HIP_RETURN_IF_ERROR(hipMemcpyAsync(
&nonzero_elements, prefix_counts + number_of_blocks - 1,
sizeof(int), hipMemcpyDeviceToHost, Stream()));
TArray<fast_divmod> fdm_x_strides(x_rank);
TensorPitches x_strides(x_dims);
for (auto i = 0; i < x_rank; i++) {
fdm_x_strides[i] = fast_divmod(static_cast<int>(x_strides[i]));
}
auto* output_tensor = context->Output(0, {x_rank, nonzero_elements});
ORT_ENFORCE(output_tensor, "failed to get first output!");
HIP_RETURN_IF_ERROR(NonZeroOutputPositions(
Stream(), x_data, x_size, x_rank, fdm_x_strides,
prefix_counts, nonzero_elements, output_tensor->MutableData<int64_t>()));
} else {
context->Output(0, {x_rank, nonzero_elements});
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class NonZero final : public RocmKernel {
public:
NonZero(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/onehot.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
// T1: indices, T2: depth, T3: values
#define REGISTER_TYPED_ONE_HOT_OP(in_type, out_type, depth_type) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
OneHot, \
kOnnxDomain, \
11, \
in_type##_##out_type##_##depth_type, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) /* Keep depth in CPU */ \
.InputMemoryType(OrtMemTypeCPUInput, 2) /* Keep values in CPU */ \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<in_type>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<depth_type>()) \
.TypeConstraint("T3", DataTypeImpl::GetTensorType<out_type>()), \
OneHotOp<in_type, out_type, depth_type>);
REGISTER_TYPED_ONE_HOT_OP(int64_t, int64_t, int64_t)
REGISTER_TYPED_ONE_HOT_OP(int64_t, float, int64_t)
REGISTER_TYPED_ONE_HOT_OP(int32_t, float, int32_t)
REGISTER_TYPED_ONE_HOT_OP(int64_t, MLFloat16, int64_t)
REGISTER_TYPED_ONE_HOT_OP(int32_t, MLFloat16, int32_t)
template <typename in_type, typename out_type, typename depth_type>
Status OneHotOp<in_type, out_type, depth_type>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<out_type>::MappedType HipT_Out;
const Tensor* indices = ctx->Input<Tensor>(0);
const Tensor* depth = ctx->Input<Tensor>(1);
const Tensor* values = ctx->Input<Tensor>(2);
ORT_RETURN_IF_ERROR(ValidateInputs(depth, values));
const auto* depth_data = depth->Data<depth_type>();
const auto depth_val = static_cast<int64_t>(
*depth_data); // As per the spec, if 'depth' is of a non-integer type it is cast to int64 before use.
if (depth_val <= 0) {
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Depth must be a positive value.");
}
// prepare output shape
int64_t prefix_dim_size, suffix_dim_size;
TensorShapeVector output_shape;
ORT_RETURN_IF_ERROR(PrepareOutputShape(indices, depth_val, axis_, prefix_dim_size, suffix_dim_size, output_shape));
// allocate output
const auto* values_data = reinterpret_cast<const HipT_Out*>(values->Data<out_type>());
Tensor* output = ctx->Output(0, TensorShape(output_shape));
// edge case where we have a dim with a value of 0
if (output->Shape().Size() == 0)
return Status::OK();
const fast_divmod fdm_suffix(gsl::narrow_cast<int>(suffix_dim_size));
const auto* indices_data = indices->Data<in_type>();
auto* output_data = reinterpret_cast<HipT_Out*>(output->MutableData<out_type>());
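// Fast path: when off_value is zero, zero-fill the whole output once and only scatter on_value.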
if (values_data[0] == HipT_Out(0.f)) {
HIP_RETURN_IF_ERROR(hipMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes(), Stream()));
OneHotWithZeroOffValueImpl(Stream(),
indices_data,
fdm_suffix,
depth_val,
values_data[1],
output_data,
indices->Shape().Size());
return Status::OK();
}
const fast_divmod fdm_depth_suffix(gsl::narrow_cast<int>(depth_val * suffix_dim_size));
OneHotImpl(Stream(),
indices_data, fdm_depth_suffix, fdm_suffix, depth_val,
values_data[1],
values_data[0],
output_data,
output->Shape().Size());
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/tensor/onehot.h"
namespace onnxruntime {
namespace rocm {
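// The OneHot output is laid out as [prefix, depth, suffix] once the axis has been normalized.
// _OneHotImpl assigns one thread per output element: id is split into
// (prefix_index, depth_index, suffix_index) with the precomputed fast_divmod values, and the
// corresponding entry of the indices tensor decides whether the element gets on_value or off_value.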
template <typename in_type, typename out_type>
__global__ void _OneHotImpl(
const in_type* indices_data,
const fast_divmod fdm_depth_suffix,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
const out_type off_value,
out_type* output_data,
HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int prefix_index, prefix_offset;
fdm_depth_suffix.divmod(id, prefix_index, prefix_offset);
int depth_index, suffix_index;
fdm_suffix.divmod(prefix_offset, depth_index, suffix_index);
HIP_LONG indices_index = prefix_index * fdm_suffix.d_ + suffix_index;
// handle index outside the range [-depth, depth-1] case
bool is_valid_range = indices_data[indices_index] >= -depth_val && indices_data[indices_index] < depth_val;
// handle negative index
in_type adjusted_indice = (indices_data[indices_index] + depth_val) % depth_val;
output_data[id] = (is_valid_range && adjusted_indice == in_type(depth_index)) ? on_value : off_value;
}
template<typename in_type, typename out_type>
__global__ void _OneHotWithZeroOffValueImpl(
const in_type* indices_data,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
out_type* output_data,
HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
if (indices_data[id] >= -depth_val && indices_data[id] < depth_val) {
in_type adjusted_index = indices_data[id] >= 0 ? indices_data[id] : indices_data[id] + depth_val;
int q, r;
fdm_suffix.divmod(id, q, r);
output_data[(q * depth_val + adjusted_index) * fdm_suffix.d_ + r] = on_value;
}
}
template <typename in_type, typename out_type>
void OneHotImpl(
hipStream_t stream,
const in_type* indices_data,
const fast_divmod fdm_depth_suffix,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
const out_type off_value,
out_type* output_data,
size_t count) {
int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
HIP_LONG N = static_cast<HIP_LONG>(count);
hipLaunchKernelGGL(HIP_KERNEL_NAME(_OneHotImpl<in_type, out_type>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
indices_data,
fdm_depth_suffix,
fdm_suffix,
depth_val,
on_value,
off_value,
output_data,
N);
}
template <typename in_type, typename out_type>
void OneHotWithZeroOffValueImpl(
hipStream_t stream,
const in_type* indices_data,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
out_type* output_data,
size_t count) {
int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
HIP_LONG N = static_cast<HIP_LONG>(count);
hipLaunchKernelGGL(HIP_KERNEL_NAME(_OneHotWithZeroOffValueImpl<in_type, out_type>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
indices_data,
fdm_suffix,
depth_val,
on_value,
output_data,
N);
}
#define SPECIALIZED_OneHotImpl(in_type, out_type) \
template void OneHotImpl( \
hipStream_t stream, \
const in_type* indices_data, \
const fast_divmod fdm_depth_suffix, \
const fast_divmod fdm_suffix, \
const int64_t depth_val, \
const out_type on_value, \
const out_type off_value, \
out_type* output_data, \
size_t count);
SPECIALIZED_OneHotImpl(int64_t, int64_t)
SPECIALIZED_OneHotImpl(int64_t, float)
SPECIALIZED_OneHotImpl(int32_t, float)
SPECIALIZED_OneHotImpl(int64_t, half)
SPECIALIZED_OneHotImpl(int32_t, half)
#define SPECIALIZED_OneHotWithZeroOffValueImpl(in_type, out_type) \
template void OneHotWithZeroOffValueImpl( \
hipStream_t stream, \
const in_type* indices_data, \
const fast_divmod fdm_suffix, \
const int64_t depth_val, \
const out_type on_value, \
out_type* output_data, \
size_t count);
SPECIALIZED_OneHotWithZeroOffValueImpl(int64_t, int64_t)
SPECIALIZED_OneHotWithZeroOffValueImpl(int64_t, float)
SPECIALIZED_OneHotWithZeroOffValueImpl(int32_t, float)
SPECIALIZED_OneHotWithZeroOffValueImpl(int64_t, half)
SPECIALIZED_OneHotWithZeroOffValueImpl(int32_t, half)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename in_type, typename out_type>
void OneHotImpl(
hipStream_t stream,
const in_type* indices,
const fast_divmod fdm_depth_suffix,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
const out_type off_value,
out_type* output,
size_t count);
template <typename in_type, typename out_type>
void OneHotWithZeroOffValueImpl(
hipStream_t stream,
const in_type* indices,
const fast_divmod fdm_suffix,
const int64_t depth_val,
const out_type on_value,
out_type* output,
size_t count);
template <typename in_type, typename out_type, typename depth_type>
class OneHotOp final : public RocmKernel {
public:
explicit OneHotOp(const OpKernelInfo& info) : RocmKernel(info) {
int64_t tmp_axis;
if (info.GetAttr<int64_t>("axis", &tmp_axis).IsOK()) {
axis_ = tmp_axis;
}
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OneHotOp);
int64_t axis_ = -1;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "pad.h"
#include "pad_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Pad, \
kOnnxDomain, \
2, 10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Pad<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Pad, \
kOnnxDomain, \
11, 12, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Pad<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Pad, \
kOnnxDomain, \
13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Pad<T>);
using PadsVector = PadBase::PadsVector;
static bool IsNCHWInputWithPaddingAlongHAndW(size_t input_rank,
const TArray<int64_t>& lower_pads,
const TArray<int64_t>& upper_pads) {
if (input_rank == 2) { // N = 1 and C = 1
return true;
}
// Is CHW input AND no padding along C dim
if (input_rank == 3 &&
lower_pads[0] == 0 && // start padding along C
upper_pads[0] == 0) { // end padding along C
return true;
}
// Is NCHW input AND no padding along N and C dims
if (input_rank == 4 &&
lower_pads[0] == 0 && lower_pads[1] == 0 && // start padding along N and C
upper_pads[0] == 0 && upper_pads[1] == 0) { // end padding along N and C
return true;
}
return false;
}
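// When padding only touches the spatial (H and W) dimensions, Pad dispatches to
// PadNCHWInputWithPaddingAlongHAndWImpl instead of the generic per-dimension PadImpl.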
template <typename T>
typename ToHipType<T>::MappedType ToCudaValue(const T& value) {
return value;
}
template <>
typename ToHipType<MLFloat16>::MappedType ToCudaValue<MLFloat16>(const MLFloat16& value) {
return *reinterpret_cast<const typename ToHipType<MLFloat16>::MappedType*>(&value.val);
}
template <typename T>
Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<T>::MappedType HipT;
const auto& input_tensor = *ctx->Input<Tensor>(0);
auto const& input_shape = input_tensor.Shape();
int32_t dimension_count = static_cast<int32_t>(input_shape.NumDimensions());
const PadsVector* p_pads = &pads_;
const PadsVector* p_slices = &slices_;
HipT value = ToHipType<T>::FromFloat(value_);
// kOnnxDomain Pad opset >= 11 or kMsDomain opset == 1
PadsVector pads;
PadsVector slices;
if (is_dynamic_) {
const Tensor& pads_tensor = *ctx->Input<Tensor>(1);
const auto pads_tensor_dims = pads_tensor.Shape().GetDims();
ORT_ENFORCE(utils::IsPrimitiveDataType<int64_t>(pads_tensor.DataType()),
"Pads tensor should be an INT64 tensor");
ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1),
"Pads tensor should be a 1D tensor of shape [2 * input_rank] or a 2D tensor of shape [1, 2 * input_rank]");
const int64_t* pads_tensor_raw_data = pads_tensor.Data<int64_t>();
size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
ORT_ENFORCE(pads_size == 2 * static_cast<size_t>(dimension_count),
"Pads tensor size should be equal to twice the input dimension count ");
pads.reserve(2LL * dimension_count);
for (size_t i = 0; i < pads_size; ++i) {
pads.push_back(pads_tensor_raw_data[i]);
}
// Separate out any negative pads into the slices array
slices.resize(pads.size(), 0);
for (size_t index = 0; index < pads.size(); index++) {
if (pads[index] < 0) {
slices[index] = pads[index];
pads[index] = 0;
}
}
T raw_value{};
const Tensor* value_tensor = ctx->Input<Tensor>(2);
if (nullptr != value_tensor) {
ORT_ENFORCE(utils::IsPrimitiveDataType<T>(value_tensor->DataType()) &&
value_tensor->Shape().Size() == 1,
"Value tensor should be a 1D tensor of size 1 with the same type as that of the input tensor");
raw_value = value_tensor->Data<T>()[0];
value = ToCudaValue<T>(raw_value);
}
p_pads = &pads;
p_slices = &slices;
}
TensorPitches input_pitches(input_shape.GetDims());
TArray<int64_t> input_dims(input_shape.GetDims());
TArray<int64_t> input_strides(input_pitches);
auto output_dims(input_shape.AsShapeVector());
ORT_ENFORCE(static_cast<size_t>(dimension_count * 2) == p_pads->size(), "'pads' attribute has wrong number of values");
// Calculate output dimensions, and handle any negative padding
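// Illustrative example (not from the original source): a 2-D input of shape [4, 5] with
// pads = [1, -2, 0, 3] is split into pads = [1, 0, 0, 3] and slices = [0, -2, 0, 0], giving
// lower_pads = [1, -2], upper_pads = [0, 3] and an output shape of [5, 6].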
TArray<int64_t> lower_pads(dimension_count);
TArray<int64_t> upper_pads(dimension_count);
for (auto i = 0; i < dimension_count; i++) {
lower_pads[i] = (*p_pads)[i] + (*p_slices)[i];
upper_pads[i] = (*p_pads)[i + dimension_count] + (*p_slices)[i + dimension_count];
output_dims[i] += lower_pads[i] + upper_pads[i];
}
TensorShape output_shape(output_dims);
// Special case when there is a dim value of 0 in the shape; behavior depends on the mode.
if (input_shape.Size() == 0) {
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
}
auto& output_tensor = *ctx->Output(0, output_shape);
if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) &&
std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) &&
output_shape.Size() > 0) {
HIP_RETURN_IF_ERROR(hipMemcpyAsync(
output_tensor.MutableData<T>(), input_tensor.Data<T>(),
sizeof(typename ToHipType<T>::MappedType) * output_shape.Size(),
hipMemcpyDeviceToDevice, Stream()));
return Status::OK();
}
if (IsNCHWInputWithPaddingAlongHAndW(static_cast<size_t>(dimension_count), lower_pads, upper_pads)) {
// If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW)
// NCHW input
int height_dim = 2;
int width_dim = 3;
if (dimension_count == 3) { // CHW input
height_dim = 1;
width_dim = 2;
} else if (dimension_count == 2) { // HW input
height_dim = 0;
width_dim = 1;
}
PadNCHWInputWithPaddingAlongHAndWImpl(
Stream(),
dimension_count == 4 ? input_dims[0] : 1,
dimension_count == 4 ? input_dims[1] : (dimension_count == 3 ? input_dims[0] : 1),
input_dims[height_dim],
output_dims[height_dim],
input_dims[width_dim],
output_dims[width_dim],
lower_pads[height_dim],
lower_pads[width_dim],
value,
static_cast<int>(mode_),
reinterpret_cast<const typename ToHipType<T>::MappedType*>(input_tensor.Data<T>()),
reinterpret_cast<typename ToHipType<T>::MappedType*>(output_tensor.MutableData<T>()),
output_tensor.Shape().Size());
return Status::OK();
}
TArray<fast_divmod> fdm_output_strides(dimension_count);
TensorPitches output_strides(output_dims);
for (auto i = 0; i < dimension_count; i++) {
fdm_output_strides[i] = fast_divmod(static_cast<int>(output_strides[i]));
}
PadImpl(
Stream(),
dimension_count,
input_dims,
input_strides,
lower_pads,
value,
static_cast<int>(mode_),
reinterpret_cast<const typename ToHipType<T>::MappedType*>(input_tensor.Data<T>()),
fdm_output_strides,
reinterpret_cast<typename ToHipType<T>::MappedType*>(output_tensor.MutableData<T>()),
output_tensor.Shape().Size());
return Status::OK();
}
#define SPECIALIZED_COMPUTE(T) \
REGISTER_KERNEL_TYPED(T) \
template Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
SPECIALIZED_COMPUTE(MLFloat16)
SPECIALIZED_COMPUTE(bool)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/padbase.h"
using onnxruntime::PadBase;
namespace onnxruntime {
namespace rocm {
template <typename T>
class Pad final : public PadBase, public RocmKernel {
public:
Pad(const OpKernelInfo& info) : PadBase(info), RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "pad_impl.h"
namespace onnxruntime {
namespace rocm {
// PadMode enum from core/providers/cpu/tensor/pad.h, cannot use that header because of nvcc/onnxruntime incompatibility
enum class PadMode : int {
Constant = 0,
Reflect,
Edge
};
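// _PadKernel maps one thread to one output element. The flat output index is peeled one
// dimension at a time with fdm_output_strides; an output coordinate that falls in the leading
// or trailing pad region is either replaced by pad_value (Constant), clamped to the nearest
// edge (Edge), or mirrored back into the input (Reflect).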
template <typename T, int pad_mode>
__global__ void _PadKernel(
const size_t shape_rank,
const TArray<int64_t> input_dims,
const TArray<int64_t> input_strides,
const TArray<int64_t> lower_pads,
const T pad_value,
const T* input_data,
const TArray<fast_divmod> fdm_output_strides,
T* output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
HIP_LONG output_index = id;
bool use_pad_value = false;
for (int dim = 0; dim < shape_rank && !use_pad_value; ++dim) {
int out_coord, r;
fdm_output_strides[dim].divmod(output_index, out_coord, r);
output_index = r;
int in_coord = 0;
if (out_coord < lower_pads[dim]) {
switch ((PadMode)pad_mode) {
case PadMode::Constant:
use_pad_value = true;
break;
case PadMode::Edge:
in_coord = 0;
break;
case PadMode::Reflect:
in_coord = lower_pads[dim] - out_coord;
break;
}
} else if (out_coord >= lower_pads[dim] + input_dims[dim]) {
switch ((PadMode)pad_mode) {
case PadMode::Constant:
use_pad_value = true;
break;
case PadMode::Edge:
in_coord = input_dims[dim] - 1;
break;
case PadMode::Reflect:
in_coord = input_dims[dim] - 2 - (out_coord - (lower_pads[dim] + input_dims[dim]));
break;
}
} else {
in_coord = out_coord - lower_pads[dim];
}
input_index += input_strides[dim] * in_coord;
}
output_data[id] = use_pad_value ? (T)pad_value : input_data[input_index];
}
template <typename T, int pad_mode>
__global__ void _PadNCHWInputWithPaddingAlongHAndWKernel(
const int64_t n, // Batch
const int64_t c, // Channel
const int64_t input_height,
const int64_t output_height,
const int64_t input_width,
const int64_t output_width,
const int64_t pad_height_start,
const int64_t pad_width_start,
const T pad_value,
const T* input_data,
T* output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
const int current_output_width = id % output_width;
int nc_index = id / output_width;
const int current_output_height = nc_index % output_height;
nc_index /= output_height;
int current_input_height = current_output_height - pad_height_start;
int current_input_width = current_output_width - pad_width_start;
switch ((PadMode)pad_mode) {
case PadMode::Constant:
output_data[id] = (current_input_height < 0 ||
current_input_width < 0 ||
current_input_height >= input_height ||
current_input_width >= input_width)
? pad_value
: input_data[(nc_index * input_height +
current_input_height) *
input_width +
current_input_width];
break;
case PadMode::Edge:
current_input_height = std::max(0, std::min(current_input_height, static_cast<int>(input_height - 1)));
current_input_width = std::max(0, std::min(current_input_width, static_cast<int>(input_width - 1)));
output_data[id] = input_data[(nc_index * input_height +
current_input_height) *
input_width +
current_input_width];
break;
case PadMode::Reflect:
current_input_height = std::max(current_input_height, -current_input_height);
current_input_height = std::min(static_cast<int>(current_input_height),
2 * static_cast<int>(input_height) - current_input_height - 2);
current_input_width = std::max(current_input_width, -current_input_width);
current_input_width = std::min(static_cast<int>(current_input_width),
2 * static_cast<int>(input_width) - current_input_width - 2);
output_data[id] = input_data[(nc_index * input_height +
current_input_height) *
input_width +
current_input_width];
break;
}
}
template <typename T>
void PadImpl(
hipStream_t stream,
const size_t shape_rank,
const TArray<int64_t>& input_dims,
const TArray<int64_t>& input_strides,
const TArray<int64_t>& lower_pads,
const T pad_value,
const int pad_mode,
const T* input_data,
const TArray<fast_divmod>& fdm_output_strides,
T* output_data,
const size_t N) {
if (N == 0) // special case where there's a dim value of 0 in the output shape
return;
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
switch (pad_mode) {
case 0:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadKernel<T, 0>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_dims, input_strides, lower_pads,
pad_value, input_data, fdm_output_strides, output_data, N);
break;
case 1:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadKernel<T, 1>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_dims, input_strides, lower_pads,
pad_value, input_data, fdm_output_strides, output_data, N);
break;
case 2:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadKernel<T, 2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_dims, input_strides, lower_pads,
pad_value, input_data, fdm_output_strides, output_data, N);
break;
}
}
template <typename T>
void PadNCHWInputWithPaddingAlongHAndWImpl(
hipStream_t stream,
const int64_t n, // Batch
const int64_t c, // Channel
const int64_t input_height,
const int64_t output_height,
const int64_t input_width,
const int64_t output_width,
const int64_t pad_height_start,
const int64_t pad_width_start,
const T pad_value,
const int pad_mode,
const T* input_data,
T* output_data,
const size_t N) {
if (N == 0) // special case where there's a dim value of 0 in the output shape
return;
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
switch (pad_mode) {
case 0:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadNCHWInputWithPaddingAlongHAndWKernel<T, 0>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
n, c, input_height, output_height, input_width, output_width,
pad_height_start, pad_width_start,
pad_value, input_data, output_data, N);
break;
case 1:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadNCHWInputWithPaddingAlongHAndWKernel<T, 1>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
n, c, input_height, output_height, input_width, output_width,
pad_height_start, pad_width_start,
pad_value, input_data, output_data, N);
break;
case 2:
hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadNCHWInputWithPaddingAlongHAndWKernel<T, 2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
n, c, input_height, output_height, input_width, output_width,
pad_height_start, pad_width_start,
pad_value, input_data, output_data, N);
break;
}
}
#define SPECIALIZED_IMPL(T) \
template void PadImpl<T>(hipStream_t stream, const size_t shape_rank, \
const TArray<int64_t>& input_dims, const TArray<int64_t>& input_strides, \
const TArray<int64_t>& lower_pads, \
const T pad_value, \
const int pad_mode, \
const T* input_data, \
const TArray<fast_divmod>& fdm_output_strides, \
T* output_data, \
const size_t N); \
template void PadNCHWInputWithPaddingAlongHAndWImpl<T>(hipStream_t stream, const int64_t n, const int64_t c, \
const int64_t input_height, const int64_t output_height, \
const int64_t input_width, const int64_t output_width, \
const int64_t pad_height_start, \
const int64_t pad_width_start, \
const T pad_value, \
const int pad_mode, \
const T* input_data, T* output_data, \
const size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(bool)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void PadNCHWInputWithPaddingAlongHAndWImpl(
hipStream_t stream,
const int64_t n, // Batch
const int64_t c, // Channel
const int64_t input_height,
const int64_t output_height,
const int64_t input_width,
const int64_t output_width,
const int64_t pad_height_start,
const int64_t pad_width_start,
const T pad_value,
const int pad_mode,
const T* input_data,
T* output_data,
const size_t N);
template <typename T>
void PadImpl(
hipStream_t stream,
const size_t shape_rank,
const TArray<int64_t>& input_dims,
const TArray<int64_t>& input_strides,
const TArray<int64_t>& lower_pads,
const T pad_value,
const int pad_mode,
const T* input_data,
const TArray<fast_divmod>& fdm_output_strides,
T* output_data,
const size_t N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "quantize_linear.h"
#include "quantize_linear.cuh"
namespace onnxruntime {
namespace rocm {
template <class T, class U>
Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<U>::MappedType CudaU;
auto& x = *ctx->Input<Tensor>(0);
auto& y_scale = *ctx->Input<Tensor>(1);
auto* y_zero_point = ctx->Input<Tensor>(2);
auto& y = *ctx->Output(0, x.Shape());
const auto& x_shape = x.Shape();
const CudaU* input = reinterpret_cast<const CudaU*>(x.Data<U>());
T* output = y.MutableData<T>();
// TODO: support per-channel
ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
const auto num_of_elements = x_shape.Size();
ORT_RETURN_IF_ERROR(CudaQuantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements));
return Status::OK();
}
template <class T, class U>
Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToHipType<U>::MappedType CudaU;
auto& x = *ctx->Input<Tensor>(0);
auto& y_scale = *ctx->Input<Tensor>(1);
auto* y_zero_point = ctx->Input<Tensor>(2);
const auto& x_shape = x.Shape();
auto& y = *ctx->Output(0, x_shape);
const T* input = x.Data<T>();
CudaU* output = reinterpret_cast<CudaU*>(y.MutableData<U>());
ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1.");
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
const auto num_of_elements = x_shape.Size();
ORT_RETURN_IF_ERROR(CudaDequantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements));
return Status::OK();
}
// register QuantizeLinear kernels
#define REGISTER_Q_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
QuantizeLinear, \
kOnnxDomain, \
10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
QuantizeLinear<T, float>);
REGISTER_Q_KERNEL_TYPED(int8_t)
REGISTER_Q_KERNEL_TYPED(uint8_t)
// register DequantizeLinear kernels
#define REGISTER_DQ_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
DequantizeLinear, \
kOnnxDomain, \
10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
DequantizeLinear<T, float>);
REGISTER_DQ_KERNEL_TYPED(int8_t)
REGISTER_DQ_KERNEL_TYPED(uint8_t)
// specialize QuantizeLinear::ComputeInternal and DequantizeLinear::ComputeInternal
#define SPECIALIZED_QDQ_COMPUTE(T, U) \
template Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const; \
template Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_QDQ_COMPUTE(int8_t, float)
SPECIALIZED_QDQ_COMPUTE(uint8_t, float)
SPECIALIZED_QDQ_COMPUTE(int8_t, MLFloat16)
SPECIALIZED_QDQ_COMPUTE(uint8_t, MLFloat16)
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "quantize_linear.cuh"
#include <limits>
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {
template <typename T>
struct Round;
template <>
struct Round<float> {
__device__ __forceinline__ int operator()(float v) const {
return __float2int_rn(v);
}
};
template <>
struct Round<half> {
__device__ __forceinline__ int operator()(half v) const {
return __half2int_rn(v);
}
};
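// QuantizeLinearKernel computes y = saturate(round(x / scale) + zero_point) with
// round-to-nearest-even, clamping to the numeric range of OutT. Each thread handles up to
// NumElementsPerThread elements, strided by NumThreadsPerBlock within its block.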
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
__global__ void QuantizeLinearKernel(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, HIP_LONG N, Round<InT> round) {
HIP_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
InT scale = *scale_ptr;
OutT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
int value = round(input[id] / scale) + zero_point;
output[id] = static_cast<OutT>(max(std::numeric_limits<OutT>::min(), min(std::numeric_limits<OutT>::max(), value)));
id += NumThreadsPerBlock;
}
}
}
template <class OutT, class InT>
Status CudaQuantizeLinear(hipStream_t stream, const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
hipLaunchKernelGGL(HIP_KERNEL_NAME(QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input,
output,
scale,
zero_point,
static_cast<int>(num_of_element),
Round<InT>());
return Status::OK();
}
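// DequantizeLinearKernel computes y = (x - zero_point) * scale using the same per-thread
// element striding as the quantize kernel.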
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernel(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, HIP_LONG N) {
HIP_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
OutT scale = *scale_ptr;
InT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
output[id] = static_cast<OutT>(input[id] - zero_point) * scale;
id += NumThreadsPerBlock;
}
}
}
template <class InT, class OutT>
Status CudaDequantizeLinear(hipStream_t stream, const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
hipLaunchKernelGGL(HIP_KERNEL_NAME(DequantizeLinearKernel<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input,
output,
scale,
zero_point,
static_cast<int>(num_of_element));
return Status::OK();
}
template Status CudaQuantizeLinear<int8_t, float>(hipStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinear<uint8_t, float>(hipStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinear<int8_t, half>(hipStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinear<uint8_t, half>(hipStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<int8_t, float>(hipStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<uint8_t, float>(hipStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<int8_t, half>(hipStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinear<uint8_t, half>(hipStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
} // namespace rocm
} // namespace onnxruntime