Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/common/gsl.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/transpose.h"
namespace onnxruntime {
namespace rocm {
class Transpose final : public RocmKernel, public TransposeBase {
public:
Transpose(const OpKernelInfo& info) : RocmKernel(info), TransposeBase(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;
static Status DoTranspose(const Transpose& transpose_kernel,
const gsl::span<const size_t>& permutations, const Tensor& input, Tensor& output);
// `input_shape_override` (if provided) overrides the shape of `input` for compute purposes
// `output_shape_override` (if provided) overrides the shape of `output` for compute purposes
static Status DoTranspose(const hipDeviceProp_t& prop,
hipStream_t stream,
const rocblas_handle rocblas_handle,
const gsl::span<const size_t>& permutations,
const Tensor& input, Tensor& output,
const TensorShape* input_shape_override = nullptr,
const TensorShape* output_shape_override = nullptr);
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "transpose_impl.h"
namespace onnxruntime {
namespace rocm {
constexpr unsigned int kNumElementsPerThread = 4;
constexpr unsigned int kTileSize = 32;
// TileSize is always 32 in the current implementation, but it is kept as a template parameter for future flexibility.
// For each batch, transpose matrix [m, n] to [n, m].
template <typename T, unsigned int TileSize>
__global__ void Transpose3DKernel(const int64_t m, const int64_t n, const int64_t batch_stride, const T* input_data,
T* output_data) {
__shared__ T tile[TileSize][TileSize + 1];
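// Note: the extra column (+1) staggers rows across shared-memory banks so that the transposed
// reads in the write phase below (first index varying across threads) do not all hit the same bank.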
int x = blockIdx.x * TileSize + threadIdx.x;
int y = blockIdx.y * TileSize + threadIdx.y;
if (x < n) {
#pragma unroll
for (unsigned int i = 0; i < TileSize; i += (TileSize / kNumElementsPerThread)) {
int y_idx = y + i;
if (y_idx < m) {
tile[threadIdx.y + i][threadIdx.x] = input_data[blockIdx.z * batch_stride + y_idx * n + x];
}
}
}
__syncthreads();
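// Write phase: swap the block indices so that consecutive threads write consecutive output
// addresses (coalesced), reading the staged tile transposed.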
x = blockIdx.y * TileSize + threadIdx.x;
y = blockIdx.x * TileSize + threadIdx.y;
if (x < m) {
#pragma unroll
for (unsigned int i = 0; i < TileSize; i += (TileSize / kNumElementsPerThread)) {
int y_idx = y + i;
if (y_idx < n) {
output_data[blockIdx.z * batch_stride + y_idx * m + x] = tile[threadIdx.x][threadIdx.y + i];
}
}
}
}
bool CanDoTranspose3D(const hipDeviceProp_t& prop, size_t rank, const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations, dim3& grid_size, dim3& block_size) {
// Permutation is done in the last two dimensions.
if (rank == 3 && permutations[rank - 2] == (rank - 1) && permutations[rank - 1] == (rank - 2)) {
// Normally maxGridSize.x is a large number but maxGridSize.y and maxGridSize.z are limited. Ideally we can check
// the input sizes to see if a dimension is too large so that we can use grid.x for it to avoid returning false.
// But this requires different versions of kernel implementation with different index compute logics.
// Below code is good enough for most of the cases for now, and if we see any case that input_dims[0] or
// input_dims[1] is too large in the future, we will handle it accordingly.
int grid_size_x = CeilDiv(static_cast<int>(input_dims[2]), kTileSize);
int grid_size_y = CeilDiv(static_cast<int>(input_dims[1]), kTileSize);
int grid_size_z = static_cast<int>(input_dims[0]);
if (grid_size_x <= prop.maxGridSize[0] && grid_size_y <= prop.maxGridSize[1] &&
grid_size_z <= prop.maxGridSize[2]) {
block_size = dim3(kTileSize, kTileSize / kNumElementsPerThread);
grid_size = dim3(static_cast<unsigned int>(grid_size_x), static_cast<unsigned int>(grid_size_y),
static_cast<unsigned int>(grid_size_z));
return true;
} else {
return false;
}
}
return false;
}
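// A worked example of the launch configuration above (hypothetical shape, using the default
// kTileSize = 32 and kNumElementsPerThread = 4): for input_dims = [8, 100, 65],
//   grid_size  = (CeilDiv(65, 32), CeilDiv(100, 32), 8) = (3, 4, 8)
//   block_size = (kTileSize, kTileSize / kNumElementsPerThread) = (32, 8)
// so each block transposes one 32x32 tile of one batch, and every thread moves 4 elements.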
#define HANDLE_TRANSPOSE_3D_TILE_DIM(type) \
case sizeof(type): { \
Transpose3DKernel<type, kTileSize> \
<<<grid_size, block_size, 0, stream>>>(input_shape[1], input_shape[2], input_strides[0], \
reinterpret_cast<const ToHipType<type>::MappedType*>(input_data), \
reinterpret_cast<ToHipType<type>::MappedType*>(output_data)); \
} break
Status Transpose3DImpl(hipStream_t stream, size_t element_size, const TArray<int64_t>& input_shape,
const TArray<int64_t>& input_strides, const void* input_data, void* output_data, int64_t N,
const dim3& grid_size, const dim3& block_size) {
switch (element_size) {
HANDLE_TRANSPOSE_3D_TILE_DIM(int8_t);
HANDLE_TRANSPOSE_3D_TILE_DIM(int16_t);
HANDLE_TRANSPOSE_3D_TILE_DIM(int32_t);
HANDLE_TRANSPOSE_3D_TILE_DIM(int64_t);
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
element_size);
}
return Status::OK();
}
template <int element_size>
__global__ void Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim(
const TArray<int64_t> input_strides, const void* input_data,
const TArray<int64_t> output_strides, void* output_data,
int64_t input_shape_2, HIP_LONG N) {
// coordinates will be: [d0, d1, d2, d3]
HIP_LONG d0 = blockIdx.z;
HIP_LONG d1 = blockIdx.y;
HIP_LONG d2 = threadIdx.y + blockIdx.x * blockDim.y;
HIP_LONG d3 = threadIdx.x;
HIP_LONG input_index = (d0 * input_strides[0] +
d1 * input_strides[1] +
d2 * input_strides[2]) /
(4 * sizeof(int) / element_size) +
d3 * input_strides[3];
HIP_LONG output_index = (d0 * output_strides[0] +
d1 * output_strides[1] +
d2 * output_strides[2]) /
(4 * sizeof(int) / element_size) +
d3 * output_strides[3];
const int4* v_input = reinterpret_cast<const int4*>(input_data);
int4* v_output = reinterpret_cast<int4*>(output_data);
if (input_index < N && output_index < N && d2 < input_shape_2) {
v_output[output_index] = v_input[input_index];
}
}
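// Note on the divisions in the kernel above: each thread moves one 16-byte int4, so the strides of
// dims 0..2 (given in element units) are converted to int4 units by dividing by
// 4 * sizeof(int) / element_size. The CanDo... helper below only enables this kernel when
// permutations[3] == 3, so both innermost strides are 1 and d3 already indexes int4 chunks directly.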
bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const hipDeviceProp_t& prop,
size_t element_size,
int32_t rank,
const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size) {
if (rank == 4 &&
// the permutation does not move the last (innermost) dimension.
permutations[3] == 3) {
unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast<unsigned int>(element_size); // int4 is used in the kernel to access data.
// dims[3]: block.x
// dims[2]: block.y + grid.x
// dims[1]: grid.y
// dims[0]: grid.z
if (input_dims[3] / num_elements_per_thread <= prop.maxThreadsPerBlock &&
(input_dims[3] % num_elements_per_thread) == 0 &&
input_dims[1] <= prop.maxGridSize[1] &&
input_dims[0] <= prop.maxGridSize[2]) {
// There are 2 constraints when launching the kernels
// 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock
// 2. block_size_y * num_block_ext >= input_dims[2]
int64_t block_size_x = input_dims[3] / num_elements_per_thread;
int64_t max_block_size_y = prop.maxThreadsPerBlock / block_size_x;
int64_t block_size_y = std::min(input_dims[2], max_block_size_y);
int64_t num_block_ext = CeilDiv(input_dims[2], block_size_y);
if (num_block_ext <= prop.maxGridSize[0]) {
block_size = dim3(static_cast<unsigned int>(block_size_x), static_cast<unsigned int>(block_size_y));
grid_size = dim3(static_cast<unsigned int>(num_block_ext),
static_cast<unsigned int>(input_dims[1]),
static_cast<unsigned int>(input_dims[0]));
return true;
} else {
return false;
}
}
}
return false;
}
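// A worked example of the sizing logic above (hypothetical shape, assuming element_size = 2 (fp16)
// and prop.maxThreadsPerBlock = 1024): num_elements_per_thread = 4 * sizeof(int) / 2 = 8.
// For input_dims = [2, 3, 640, 1024]:
//   block_size_x  = 1024 / 8 = 128
//   block_size_y  = min(640, 1024 / 128) = 8
//   num_block_ext = CeilDiv(640, 8) = 80
//   block_size = (128, 8), grid_size = (80, 3, 2)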
Status Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim(
hipStream_t stream, size_t element_size,
const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides,
const void* input_data, const TArray<int64_t>& output_strides,
void* output_data, int N, const dim3& grid_size, const dim3& block_size) {
unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast<unsigned int>(element_size); // int4 is used in the kernel to access data.
switch (element_size) {
case sizeof(int8_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int8_t)>), grid_size, block_size, 0, stream,
input_strides, input_data,
output_strides, output_data,
input_shape[2],
N / num_elements_per_thread);
break;
case sizeof(int16_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int16_t)>), grid_size, block_size, 0, stream,
input_strides, input_data,
output_strides, output_data,
input_shape[2],
N / num_elements_per_thread);
break;
case sizeof(int32_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int32_t)>), grid_size, block_size, 0, stream,
input_strides, input_data,
output_strides, output_data,
input_shape[2],
N / num_elements_per_thread);
break;
case sizeof(int64_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int64_t)>), grid_size, block_size, 0, stream,
input_strides, input_data,
output_strides, output_data,
input_shape[2],
N / num_elements_per_thread);
break;
default:
// User will not hit this as this kernel is for fixed element size tensors only
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
element_size);
}
return Status::OK();
}
__global__ void Transpose4DKernelParallelizeOneElementPerThread(
const TArray<int64_t> input_strides, const int8_t* input_data,
const TArray<int64_t> output_strides, int8_t* output_data,
size_t element_size, int64_t input_shape_2, HIP_LONG N) {
// coordinates will be: [d0, d1, d2, d3]
HIP_LONG d0 = blockIdx.z;
HIP_LONG d1 = blockIdx.y;
HIP_LONG d2 = threadIdx.y + blockIdx.x * blockDim.y;
HIP_LONG d3 = threadIdx.x;
HIP_LONG input_index = d0 * input_strides[0] +
d1 * input_strides[1] +
d2 * input_strides[2] +
d3 * input_strides[3];
HIP_LONG output_index = d0 * output_strides[0] +
d1 * output_strides[1] +
d2 * output_strides[2] +
d3 * output_strides[3];
if (input_index < N && output_index < N && d2 < input_shape_2) {
const int8_t* input_data_to_be_copied = input_data + (input_index * element_size);
int8_t* output_data_to_be_copied = output_data + (output_index * element_size);
// copy over the bytes
for (size_t iter = 0; iter < element_size; ++iter) {
*output_data_to_be_copied++ = *input_data_to_be_copied++;
}
}
}
bool CanDoTranspose4DParallelizeOneElementPerThread(const hipDeviceProp_t& prop,
size_t element_size,
int32_t rank,
const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size) {
if (rank == 4) {
// dims[3]: block.x
// dims[2]: block.y + grid.x
// dims[1]: grid.y
// dims[0]: grid.z
if (input_dims[3] <= prop.maxThreadsPerBlock &&
input_dims[1] <= prop.maxGridSize[1] &&
input_dims[0] <= prop.maxGridSize[2]) {
// There are 2 constraints when launching the kernels
// 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock
// 2. block_size_y * num_block_ext >= input_dims[2]
int64_t block_size_x = input_dims[3];
int64_t max_block_size_y = prop.maxThreadsPerBlock / block_size_x;
int64_t block_size_y = std::min(input_dims[2], max_block_size_y);
int64_t num_block_ext = CeilDiv(input_dims[2], block_size_y);
if (num_block_ext <= prop.maxGridSize[0]) {
block_size = dim3(static_cast<unsigned int>(block_size_x), static_cast<unsigned int>(block_size_y));
grid_size = dim3(static_cast<unsigned int>(num_block_ext),
static_cast<unsigned int>(input_dims[1]),
static_cast<unsigned int>(input_dims[0]));
return true;
} else {
return false;
}
}
}
return false;
}
Status Transpose4DParallelizeOneElementPerThread(
hipStream_t stream, size_t element_size,
const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides,
const void* input_data, const TArray<int64_t>& output_strides,
void* output_data, int N, const dim3& grid_size, const dim3& block_size) {
if (element_size != sizeof(int8_t) &&
element_size != sizeof(int16_t) &&
element_size != sizeof(int32_t) &&
element_size != sizeof(int64_t)) {
// User will not hit this as this kernel is for fixed element size tensors only
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
element_size);
}
hipLaunchKernelGGL(Transpose4DKernelParallelizeOneElementPerThread, grid_size, block_size, 0, stream,
input_strides, reinterpret_cast<const int8_t*>(input_data),
output_strides, reinterpret_cast<int8_t*>(output_data),
element_size, input_shape[2], N);
return Status::OK();
}
template <typename T>
__global__ void TransposeKernel(int32_t shape_rank, const TArray<int64_t> input_strides,
const T* input_data, const TArray<fast_divmod> output_strides, T* output_data, HIP_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
HIP_LONG output_index = id;
#pragma unroll
for (auto dim = 0; dim < input_strides.Capacity(); ++dim) {
if (dim >= shape_rank) {
break;
}
int out_coord, r;
output_strides[dim].divmod(output_index, out_coord, r);
output_index = r;
input_index += input_strides[dim] * out_coord;
}
output_data[id] = input_data[input_index];
}
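// A worked example of the index math above (a sketch; it assumes the caller passes input strides
// already permuted into output order): transposing a [2, 3] input with permutation [1, 0] gives
// output shape [3, 2], fdm_output_strides = {2, 1} and input_strides = {1, 3}.
// For output id = 4 (output coordinate [2, 0]):
//   divmod(4, 2) -> out_coord = 2, r = 0 -> input_index += 1 * 2 = 2
//   divmod(0, 1) -> out_coord = 0        -> input_index += 3 * 0
// so output[2][0] is read from input linear index 2, i.e. input[0][2], as expected.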
Status TransposeImpl(hipStream_t stream, size_t element_size, int32_t shape_rank, const TArray<int64_t>& input_strides,
const void* input_data, const TArray<fast_divmod>& fdm_output_strides, void* output_data, int N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
switch (element_size) {
case sizeof(int8_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int8_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_strides,
reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
fdm_output_strides,
reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
N);
break;
case sizeof(int16_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int16_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_strides,
reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
fdm_output_strides,
reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
N);
break;
case sizeof(int32_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int32_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_strides,
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
fdm_output_strides,
reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
N);
break;
case sizeof(int64_t):
hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int64_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
shape_rank, input_strides,
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
fdm_output_strides,
reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
N);
break;
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
element_size);
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
bool CanDoTranspose3D(const hipDeviceProp_t& prop,
size_t rank, const gsl::span<const int64_t>& input_dims, const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size);
Status Transpose3DImpl(hipStream_t stream, size_t element_size, const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides, const void* input_data,
void* output_data, int64_t N,
const dim3& grid_size, const dim3& block_size);
bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const hipDeviceProp_t& prop,
size_t element_size,
int32_t rank,
const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size);
Status Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim(hipStream_t stream,
size_t element_size, const TArray<int64_t>& input_shape,
const TArray<int64_t>& input_strides, const void* input_data,
const TArray<int64_t>& output_strides, void* output_data, int N,
const dim3& grid_size, const dim3& block_size);
bool CanDoTranspose4DParallelizeOneElementPerThread(const hipDeviceProp_t& prop,
size_t element_size,
int32_t rank,
const gsl::span<const int64_t>& input_dims,
const gsl::span<const size_t>& permutations,
dim3& grid_size, dim3& block_size);
Status Transpose4DParallelizeOneElementPerThread(hipStream_t stream,
size_t element_size, const TArray<int64_t>& input_shape,
const TArray<int64_t>& input_strides, const void* input_data,
const TArray<int64_t>& output_strides, void* output_data, int N,
const dim3& grid_size, const dim3& block_size);
Status TransposeImpl(hipStream_t stream, size_t element_size, int32_t shape_rank, const TArray<int64_t>& input_strides,
const void* input_data, const TArray<fast_divmod>& fdm_output_strides, void* output_data, int N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/trilu.h"
#include "core/providers/rocm/tensor/trilu_impl.h"
#include "core/providers/cpu/tensor/utils.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_KERNEL_EX(
Trilu,
kOnnxDomain,
14,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 1)
.MayInplace(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Trilu);
Status Trilu::ComputeInternal(OpKernelContext* ctx) const {
const Tensor* input_ptr = ctx->Input<Tensor>(0);
const auto* k = ctx->Input<Tensor>(1);
int64_t k_val = 0;
if (k) {
ORT_ENFORCE(IsScalarOr1ElementVector(k), "k should be a 1-D or 0-D tensor.");
k_val = *(k->Data<int64_t>());
}
if (input_ptr == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
const Tensor& input = *input_ptr;
const auto& shape = input.Shape();
const auto& input_dims = shape.GetDims();
int32_t rank = gsl::narrow_cast<int32_t>(input_dims.size());
if (rank < 2) {
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Input tensor should have a rank of at least 2");
}
Tensor* output = ctx->Output(0, shape);
int64_t matrix_size = input_dims[rank - 1] * input_dims[rank - 2];
if (matrix_size == 0) {
return Status::OK();
}
const fast_divmod row_col_divmod_indices(gsl::narrow_cast<int>(input_dims[rank - 1]));
const fast_divmod batch_divmod_indices(gsl::narrow_cast<int>(matrix_size));
size_t element_size = input.DataType()->Size();
return TriluImpl(
this->Stream(),
upper_,
element_size,
k_val,
input.DataRaw(),
output->MutableDataRaw(),
gsl::narrow<int>(shape.Size()),
batch_divmod_indices,
row_col_divmod_indices);
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
class Trilu final : public RocmKernel {
public:
Trilu(const OpKernelInfo& info) : RocmKernel(info), upper_(info.GetAttrOrDefault<int64_t>("upper", 1) >= 1) {
}
~Trilu() = default;
Status ComputeInternal(OpKernelContext* context) const override;
private:
bool upper_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "trilu_impl.h"
#include <stdio.h>
namespace onnxruntime {
namespace rocm {
template <typename T, bool upper>
__global__ void TriluKernel(
int64_t k,
const T* input_data,
T* output_data,
const HIP_LONG N,
const fast_divmod batch_divmod_indices,
const fast_divmod row_col_divmod_indices) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int row, col;
row_col_divmod_indices.divmod(batch_divmod_indices.mod(id), row, col);
output_data[id] = upper ? (((row + k) <= col) ? input_data[id] : 0) : (((row + k) >= col) ? input_data[id] : 0);
}
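// Example of the predicate above: for a square matrix with k = 0 and upper == true, entries with
// row <= col (the upper triangle including the main diagonal) are kept and the rest are zeroed;
// with k = 1 the main diagonal is zeroed as well, and upper == false keeps row + k >= col instead.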
Status TriluImpl(
hipStream_t stream,
bool upper,
size_t element_size,
int64_t k,
const void* input_data,
void* output_data,
int N,
const fast_divmod& batch_divmod_indices,
const fast_divmod& row_col_divmod_indices) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
switch (element_size) {
case sizeof(int8_t):
if (upper) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int8_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int8_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
}
break;
case sizeof(int16_t):
if (upper) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int16_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int16_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
}
break;
case sizeof(int32_t):
if (upper) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int32_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int32_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
}
break;
case sizeof(int64_t):
if (upper) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int64_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int64_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
k,
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
(HIP_LONG)N,
batch_divmod_indices,
row_col_divmod_indices);
}
break;
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for trilu on ROCM. Element size was ",
element_size);
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
Status TriluImpl(
hipStream_t stream,
bool upper,
size_t element_size,
int64_t k,
const void* input_data,
void* output_data,
int N,
const fast_divmod& batch_divmod_indices,
const fast_divmod& row_col_divmod_indices);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/unsqueeze.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Unsqueeze,
kOnnxDomain,
1, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Unsqueeze);
// explicitly support negative axis
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Unsqueeze,
kOnnxDomain,
11, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Unsqueeze);
// axes is input instead of attribute, support bfloat16
ONNX_OPERATOR_KERNEL_EX(
Unsqueeze,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.Alias(0, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.InputMemoryType(OrtMemTypeCPUInput, 1),
Unsqueeze);
Status Unsqueeze::ComputeInternal(OpKernelContext* ctx) const {
Prepare p;
ORT_RETURN_IF_ERROR(PrepareCompute(ctx, p));
const void* input = p.input_tensor->DataRaw();
void* output = p.output_tensor->MutableDataRaw();
if (input == output)
return Status::OK();
auto count = p.input_tensor->Shape().Size();
auto element_bytes = p.input_tensor->DataType()->Size();
HIP_RETURN_IF_ERROR(hipMemcpyAsync(output, input, count * element_bytes, hipMemcpyDeviceToDevice, Stream()));
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/unsqueeze.h"
namespace onnxruntime {
namespace rocm {
class Unsqueeze final : public UnsqueezeBase, public RocmKernel {
public:
Unsqueeze(const OpKernelInfo& info) : UnsqueezeBase(info), RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "upsample.h"
#include "upsample_impl.h"
#include "core/providers/rocm/tensor/resize_impl.h"
#include "core/providers/cpu/tensor/utils.h"
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
#define REGISTER_VERSIONED_TYPED_KERNEL(T, start, end) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Upsample, \
kOnnxDomain, \
start, \
end, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Upsample<T>)
REGISTER_VERSIONED_TYPED_KERNEL(float, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(double, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(MLFloat16, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 7, 8);
// Upsample was deprecated in opset 10
REGISTER_VERSIONED_TYPED_KERNEL(float, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(double, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(MLFloat16, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 9, 9);
template <typename T>
Status Upsample<T>::BaseCompute(OpKernelContext* context,
const std::vector<float>& roi,
const std::vector<float>& scales,
const gsl::span<const int64_t>& output_dims) const {
const Tensor* X = context->Input<Tensor>(0);
auto X_dims = X->Shape().GetDims();
int32_t rank = static_cast<int32_t>(X_dims.size());
ORT_ENFORCE(static_cast<int32_t>(output_dims.size()) == rank, "Rank of input and output tensor should be same.");
if (rank == 0)
return Status(ONNXRUNTIME, INVALID_ARGUMENT,
is_resize_ ? "Resize: input tensor cannot be scalar." : "Upsample: input tensor cannot be scalar.");
if (rank != static_cast<int32_t>(scales.size()))
return Status(ONNXRUNTIME, INVALID_ARGUMENT,
is_resize_ ? "Resize: input tensor's dimension does not match the scales." : "Upsample: input tensor's dimension does not match the scales.");
if (roi.size() != 2 * X->Shape().GetDims().size())
return Status(ONNXRUNTIME, INVALID_ARGUMENT,
"Resize: size of roi array should be 2 * N where N is the rank of input tensor X.");
Tensor* Y = context->Output(0, output_dims);
// Return early if the output tensor is going to be of size 0
if (Y->Shape().Size() == 0) {
return Status::OK();
}
typedef typename ToHipType<T>::MappedType HipT;
// kernel
TensorPitches input_pitches(X_dims);
TArray<int64_t> input_strides(input_pitches);
TensorPitches output_pitches(output_dims);
TArray<fast_divmod> output_div_pitches(rank);
for (int32_t i = 0; i < rank; ++i) {
output_div_pitches[i] = fast_divmod(gsl::narrow_cast<int>(output_pitches[i]));
}
size_t output_count = Y->Shape().Size();
if (is_resize_) {
TArray<int64_t> input_shape(X_dims);
TArray<int64_t> output_shape(output_dims);
TArray<float, 10> roi_vals(roi);
TArray<float> scales_vals(scales);
size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims);
auto dims_mapping_buffer = GetScratchBuffer<unsigned char>(temp_buffer_size);
void* dims_mapping = reinterpret_cast<void*>(dims_mapping_buffer.get());
ResizeImpl(Stream(), mode_, (int)rank, input_shape, output_shape,
input_strides, output_div_pitches, scales_vals, roi_vals,
reinterpret_cast<const HipT*>(X->Data<T>()),
reinterpret_cast<HipT*>(Y->MutableData<T>()),
output_count, use_extrapolation_, ToHipType<T>::FromFloat(extrapolation_value_),
cubic_coeff_a_, exclude_outside_,
coordinate_transform_mode_, nearest_mode_,
dims_mapping);
} else {
TArray<fast_divmod> scales_div(rank);
for (int32_t i = 0; i < rank; ++i) {
scales_div[i] = fast_divmod(gsl::narrow_cast<int>(ceil(scales[i])));
}
UpampleImpl(Stream(),
mode_,
rank,
(UpsampleMode::LINEAR == mode_) ? (rank == 2 ? X_dims[0] : X_dims[2]) : 0,
input_strides,
output_div_pitches,
scales_div,
reinterpret_cast<const HipT*>(X->Data<T>()),
reinterpret_cast<HipT*>(Y->MutableData<T>()),
output_count);
}
return Status::OK();
}
template <typename T>
Status Upsample<T>::ComputeInternal(OpKernelContext* context) const {
const Tensor* X = context->Input<Tensor>(0);
ORT_ENFORCE(X != nullptr);
TensorShapeVector output_dims(X->Shape().GetDims().size());
std::vector<float> roi_array(X->Shape().GetDims().size() * 2, 0.0f);
if (!roi_cached_) {
bool use_default_roi = true;
if (need_roi_input_) {
ORT_ENFORCE(roi_input_idx_ > 0, "Invalid roi input index.");
const auto* roi = context->Input<Tensor>(roi_input_idx_);
if (roi != nullptr) {
ParseRoiData(roi, roi_array);
use_default_roi = false;
}
}
if (use_default_roi) {
// the default roi ensures all the values in that axis are included in the roi
// normalized roi is thus : [start, end] = [0, 1]
const auto input_dims = X->Shape().GetDims();
size_t input_rank = input_dims.size();
roi_array.resize(input_rank * 2);
for (size_t i = 0; i < input_rank; ++i) {
roi_array[i] = 0;
roi_array[i + input_rank] = 1;
}
}
}
const std::vector<float>& roi = roi_cached_ ? roi_ : roi_array;
if (OpKernel::Node().InputDefs().size() == 1) {
// Compute output shape from scales and input dims
ComputeOutputShape(scales_, X->Shape().GetDims(), output_dims);
return BaseCompute(context, roi, scales_, output_dims);
}
const Tensor* scales = context->Input<Tensor>(scales_input_idx_);
const Tensor* sizes = context->Input<Tensor>(sizes_input_idx_);
if (scales_cached_) {
ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input.");
ComputeOutputShape(scales_, X->Shape().GetDims(), output_dims);
return BaseCompute(context, roi, scales_, output_dims);
}
std::vector<float> scales_array(X->Shape().GetDims().size());
if (scales != nullptr && scales->Shape().Size() != 0) {
// use scales input data
ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input.");
ParseScalesData(scales, scales_array);
ComputeOutputShape(scales_array, X->Shape().GetDims(), output_dims);
} else {
// When the sizes input is available, populate output_dims from it directly.
ORT_ENFORCE(sizes != nullptr && sizes->Shape().Size() != 0,
"Either scales or sizes MUST be provided as input.");
ORT_ENFORCE(sizes->Shape().Size() == static_cast<int64_t>(output_dims.size()),
"Resize: input tensor's rank does not match the output tensor's rank.");
memcpy(output_dims.data(), sizes->Data<int64_t>(), sizes->Shape().Size() * sizeof(int64_t));
ParseScalesDataFromOutputSize(output_dims, X->Shape().GetDims(), scales_array);
}
return BaseCompute(context, roi, scales_array, output_dims);
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/upsamplebase.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class Upsample : public UpsampleBase, public RocmKernel {
public:
Upsample(const OpKernelInfo& info) : UpsampleBase(info), RocmKernel(info) {
}
Status ComputeInternal(OpKernelContext* context) const override;
Status BaseCompute(OpKernelContext* context, const std::vector<float>& roi, const std::vector<float>& scales,
const gsl::span<const int64_t>& output_dims) const;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "upsample_impl.h"
namespace onnxruntime {
namespace rocm {
template <typename T, int RANK>
__global__ void _UpampleNearestKernel(const TArray<int64_t> input_pitches,
const TArray<fast_divmod> output_div_pitches,
const TArray<fast_divmod> scales_div,
const T* __restrict__ input_data,
T* __restrict__ output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
HIP_LONG output_index = id;
int div, mod;
for (int dim = 0; dim < RANK; ++dim) {
output_div_pitches[dim].divmod(output_index, div, mod);
output_index = mod;
if (scales_div[dim].d_ != 1 && div > 0) {
scales_div[dim].divmod(div, div, mod);
}
input_index += input_pitches[dim] * div;
}
output_data[id] = input_data[input_index];
}
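// Example of the nearest-neighbor index math above (1-D case, integer scale): for an input of
// length 3 upsampled by scale 2 (output length 6), output id = 5 yields divmod(5, 1) -> div = 5,
// then scales_div[0].divmod(5, ...) -> div = 2, so output[5] = input[2]; output ids 4 and 5 both
// map to input index 2.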
// The following method supports a 4-D input in 'Linear mode'
// that amounts to 'Bilinear' Upsampling/Resizing in the sense that it assumes
// the scale values for the outermost 2 dimensions are 1.
// This is the common use-case where the 4-D input (batched multi-channel images)
// is usually of shape [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale]
template <typename T>
__global__ void _UpampleBilinear4DInputKernel(const int64_t input_dim2,
const TArray<int64_t> input_pitches,
const TArray<fast_divmod> output_div_pitches,
const TArray<fast_divmod> scales_div,
const T* __restrict__ input_data,
T* __restrict__ output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
// For bilinear mode, scales[0]=scales[1]=1
int mod;
int index_of_dim0, index_of_dim1, index_of_dim2, index_of_dim3;
output_div_pitches[0].divmod(id, index_of_dim0, mod);
output_div_pitches[1].divmod(mod, index_of_dim1, mod);
output_div_pitches[2].divmod(mod, index_of_dim2, mod);
index_of_dim3 = mod;
int index_of_input_dim2, index_of_input_dim3, x_offset, y_offset;
scales_div[2].divmod(index_of_dim2, index_of_input_dim2, y_offset);
scales_div[3].divmod(index_of_dim3, index_of_input_dim3, x_offset);
input_index = index_of_dim0 * input_pitches[0] +
index_of_dim1 * input_pitches[1] +
index_of_input_dim2 * input_pitches[2] +
index_of_input_dim3;
T x00 = input_data[input_index];
T x10, x01, x11;
bool end_of_dim2 = false;
if (index_of_input_dim2 == (input_dim2 - 1)) {
// It's the end in dimension 2
x01 = x00;
end_of_dim2 = true;
} else {
x01 = input_data[input_index + input_pitches[2]];
}
if (index_of_input_dim3 == (input_pitches[2] - 1)) {
// It's the end in dimension 3
x10 = x00;
x11 = x01;
} else {
x10 = input_data[input_index + 1];
x11 = end_of_dim2 ? x10 : input_data[input_index + input_pitches[2] + 1];
}
T y_offset_T = static_cast<T>(y_offset);
T x_offset_T = static_cast<T>(x_offset);
T scales_div2_T = static_cast<T>(scales_div[2].d_);
T scales_div3_T = static_cast<T>(scales_div[3].d_);
T y0 = x00 + static_cast<T>(y_offset_T * (x01 - x00) / scales_div2_T);
T y1 = x10 + static_cast<T>(y_offset_T * (x11 - x10) / scales_div2_T);
output_data[id] = y0 + static_cast<T>(x_offset_T * (y1 - y0) / scales_div3_T);
}
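// In the bilinear blend above, y_offset / scales_div[2].d_ and x_offset / scales_div[3].d_ play
// the role of the usual fractional interpolation weights along dims 2 and 3 (the scales are
// treated as integral here), applied first along dim 2 and then along dim 3.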
// The following method supports a 2-D input in 'Linear mode'
template <typename T>
__global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0,
const TArray<int64_t> input_pitches,
const TArray<fast_divmod> output_div_pitches,
const TArray<fast_divmod> scales_div,
const T* __restrict__ input_data,
T* __restrict__ output_data,
const size_t N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
HIP_LONG input_index = 0;
int mod;
int index_of_dim0, index_of_dim1;
output_div_pitches[0].divmod(id, index_of_dim0, mod);
index_of_dim1 = mod;
int index_of_input_dim0, index_of_input_dim1, x_offset, y_offset;
scales_div[0].divmod(index_of_dim0, index_of_input_dim0, y_offset);
scales_div[1].divmod(index_of_dim1, index_of_input_dim1, x_offset);
input_index = index_of_input_dim0 * input_pitches[0] + index_of_input_dim1;
T x00 = input_data[input_index];
T x10, x01, x11;
bool end_of_dim0 = false;
if (index_of_input_dim0 == (input_dim0 - 1)) {
// It's the end in dimension 0
x01 = x00;
end_of_dim0 = true;
} else {
x01 = input_data[input_index + input_pitches[0]];
}
if (index_of_input_dim1 == (input_pitches[0] - 1)) {
// It's the end in dimension 1
x10 = x00;
x11 = x01;
} else {
x10 = input_data[input_index + 1];
x11 = end_of_dim0 ? x10 : input_data[input_index + input_pitches[0] + 1];
}
T y_offset_T = static_cast<T>(y_offset);
T x_offset_T = static_cast<T>(x_offset);
T scales_div0_T = static_cast<T>(scales_div[0].d_);
T scales_div1_T = static_cast<T>(scales_div[1].d_);
T y0 = x00 + static_cast<T>(y_offset_T * (x01 - x00) / scales_div0_T);
T y1 = x10 + static_cast<T>(y_offset_T * (x11 - x10) / scales_div0_T);
output_data[id] = y0 + static_cast<T>(x_offset_T * (y1 - y0) / scales_div1_T);
}
template <typename T>
void UpampleImpl(hipStream_t stream,
const onnxruntime::UpsampleMode upsample_mode,
const size_t rank,
const int64_t input_dim2,
const TArray<int64_t>& input_pitches,
const TArray<fast_divmod>& output_div_pitches,
const TArray<fast_divmod>& scales_div,
const T* input_data,
T* output_data,
const size_t N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
if (onnxruntime::UpsampleMode::NN == upsample_mode) {
if (rank == 4) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 4>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else if (rank == 3) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 3>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else if (rank == 2) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else if (rank == 1) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 1>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else {
ORT_THROW("Unsupported rank by the Upsample ROCM kernel. Input rank: ", rank);
}
} else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode) {
if (rank == 4) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleBilinear4DInputKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_dim2, input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else if (rank == 2) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleBilinear2DInputKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
input_dim2, input_pitches, output_div_pitches, scales_div,
input_data, output_data, N);
} else {
ORT_THROW("Unsupported rank by the Upsample ROCM kernel. Input rank: ", rank);
}
} else {
// Should never encounter this as Upsample only supports 'Nearest' and 'Linear' modes.
// But if we do encounter this it is best to throw instead of returning silently.
ORT_THROW("Unsupported mode for Upsample: ", upsample_mode);
}
}
#define SPECIALIZED_IMPL(T) \
template void UpampleImpl<T>(hipStream_t stream, \
const onnxruntime::UpsampleMode upsample_mode, \
const size_t rank, \
const int64_t input_dim2, \
const TArray<int64_t>& input_pitches, \
const TArray<fast_divmod>& output_div_pitches, \
const TArray<fast_divmod>& scales_div, \
const T* input_data, \
T* output_data, \
const size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(uint8_t)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
#include "core/providers/cpu/tensor/upsamplebase.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
void UpampleImpl(hipStream_t stream,
const onnxruntime::UpsampleMode upsample_mode,
const size_t rank,
const int64_t input_dim2,
const TArray<int64_t>& input_pitches,
const TArray<fast_divmod>& output_div_pitches,
const TArray<fast_divmod>& scales_div,
const T* input_data,
T* output_data,
const size_t N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "where.h"
#include "where_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {
// kernel builder functions
#define WHERE_TYPED_KERNEL_WITH_TYPE_NAME(T, TName) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Where, \
kOnnxDomain, \
9, \
15, \
TName, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Where<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Where, \
kOnnxDomain, \
16, \
TName, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Where<T>);
// Compute the Where operator's output shape based upon three-way broadcasting.
Status ComputeOutputShape(const std::string& node_name, const TensorShape& cond_shape,
const TensorShape& x_shape, const TensorShape& y_shape, TensorShape& out_shape) {
size_t cond_rank = cond_shape.NumDimensions();
size_t x_rank = x_shape.NumDimensions();
size_t y_rank = y_shape.NumDimensions();
size_t out_rank = std::max(std::max(cond_rank, x_rank), y_rank);
std::vector<int64_t> output_dims(out_rank, 0);
for (size_t i = 0; i < out_rank; ++i) {
int64_t cond_dim = 1;
if (i < cond_rank)
cond_dim = cond_shape[cond_rank - 1 - i];
int64_t x_dim = 1;
if (i < x_rank)
x_dim = x_shape[x_rank - 1 - i];
int64_t y_dim = 1;
if (i < y_rank)
y_dim = y_shape[y_rank - 1 - i];
int64_t out_dim = std::max(std::max(cond_dim, x_dim), y_dim);
// special case to handle a dim of 0 which can be broadcast with a 1
if (out_dim == 1)
out_dim = std::min(std::min(cond_dim, x_dim), y_dim);
if (cond_dim != out_dim && cond_dim != 1)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_name, ": condition operand cannot broadcast on dim ", cond_rank - 1 - i,
" Condition Shape: ", cond_shape.ToString(), ", X Shape: ", x_shape.ToString(), ", Y Shape: ", y_shape.ToString());
if (x_dim != out_dim && x_dim != 1)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_name, ": X operand cannot broadcast on dim ", x_rank - 1 - i,
" Condition Shape: ", cond_shape.ToString(), ", X Shape: ", x_shape.ToString(), ", Y Shape: ", y_shape.ToString());
if (y_dim != out_dim && y_dim != 1)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_name, ": Y operand cannot broadcast on dim ", y_rank - 1 - i,
" Condition Shape: ", cond_shape.ToString(), ", X Shape: ", x_shape.ToString(), ", Y Shape: ", y_shape.ToString());
output_dims[out_rank - 1 - i] = out_dim;
}
out_shape = TensorShape(output_dims);
return Status::OK();
}
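// Worked example of the broadcasting rules above: cond shape [1, 3], X shape [2, 1], Y shape
// [2, 3] -> out_rank = 2; the last dim is max(3, 1, 3) = 3 and the first dim is max(1, 2, 2) = 2,
// so out_shape = [2, 3]; cond broadcasts along dim 0 and X along dim 1.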
struct TernaryElementwisePreparation {
const Tensor* a_tensor = nullptr;
const Tensor* b_tensor = nullptr;
const Tensor* c_tensor = nullptr;
size_t output_rank_or_simple_broadcast = 0; // for no_broadcast cases, output_rank uses SimpleBroadcast enums
TArray<int64_t> a_padded_strides; // left empty when a's shape == output shape
TArray<int64_t> b_padded_strides; // left empty when b's shape == output shape
TArray<int64_t> c_padded_strides; // left empty when c's shape == output shape
TArray<fast_divmod> fdm_output_strides;
BroadcastIndexType a_index_type = BroadcastIndexType::NoBroadcast;
BroadcastIndexType b_index_type = BroadcastIndexType::NoBroadcast;
BroadcastIndexType c_index_type = BroadcastIndexType::NoBroadcast;
TernaryElementwisePreparation(const Tensor* a, const Tensor* b, const Tensor* c)
: a_tensor(a), b_tensor(b), c_tensor(c) {}
Status TernaryElementwiseBroadcastPrepareHelper(const TensorShape& a_shape,
const TensorShape& b_shape,
const TensorShape& c_shape,
const TensorShape& output_shape) {
int32_t a_rank = static_cast<int32_t>(a_shape.NumDimensions());
int32_t b_rank = static_cast<int32_t>(b_shape.NumDimensions());
int32_t c_rank = static_cast<int32_t>(c_shape.NumDimensions());
int32_t out_rank = std::max(std::max(a_rank, b_rank), c_rank);
// early return when shapes match
if (a_shape == b_shape && b_shape == c_shape) {
output_rank_or_simple_broadcast = static_cast<size_t>(SimpleBroadcast::NoBroadcast);
return Status::OK();
}
output_rank_or_simple_broadcast = out_rank;
auto padder = [out_rank](int32_t rank, const TensorShape& shape, TArray<int64_t>& padded_strides) {
padded_strides.SetSize(out_rank);
if (rank > 0) {
TensorPitches pitches(shape.GetDims());
auto offset = out_rank - rank;
for (auto i = offset; i < out_rank; ++i) {
// the stride for broadcast dimension is kept as 0
if (shape.GetDims()[gsl::narrow_cast<size_t>(i) - offset] != 1) {
padded_strides[i] = pitches[gsl::narrow_cast<size_t>(i) - offset];
}
}
}
};
bool has_need_compute = false;
if (a_shape.Size() == 1) {
a_index_type = BroadcastIndexType::Scalar;
} else if (a_shape != output_shape) {
padder(a_rank, a_shape, a_padded_strides);
a_index_type = BroadcastIndexType::NeedCompute;
has_need_compute = true;
}
if (b_shape.Size() == 1) {
b_index_type = BroadcastIndexType::Scalar;
} else if (b_shape != output_shape) {
padder(b_rank, b_shape, b_padded_strides);
b_index_type = BroadcastIndexType::NeedCompute;
has_need_compute = true;
}
if (c_shape.Size() == 1) {
c_index_type = BroadcastIndexType::Scalar;
} else if (c_shape != output_shape) {
padder(c_rank, c_shape, c_padded_strides);
c_index_type = BroadcastIndexType::NeedCompute;
has_need_compute = true;
}
if (!has_need_compute) {
output_rank_or_simple_broadcast = static_cast<size_t>(SimpleBroadcast::NoBroadcast);
return Status::OK();
}
TensorPitches output_pitches(output_shape.GetDims());
fdm_output_strides.SetSize(out_rank);
for (auto i = 0; i < out_rank; ++i) {
fdm_output_strides[i] = fast_divmod(static_cast<int32_t>(output_pitches[i]));
}
return Status::OK();
}
};
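// Worked example of the padding helper above (hypothetical shapes): broadcasting a [3, 1] tensor
// against an output of shape [2, 3, 4] gives out_rank = 3 and padded strides {0, 1, 0}; the
// leading dim and the size-1 dim keep stride 0 (broadcast), while the size-3 dim keeps its pitch
// of 1, and the tensor's index type becomes BroadcastIndexType::NeedCompute.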
template <typename T>
Status Where<T>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<T>::MappedType HipT;
const auto* const condition = context->Input<Tensor>(0);
const auto* const X = context->Input<Tensor>(1);
const auto* const Y = context->Input<Tensor>(2);
ORT_ENFORCE(condition && X && Y, "condition, X, and Y inputs are required!");
auto const& condition_shape = condition->Shape();
auto const& X_shape = X->Shape();
auto const& Y_shape = Y->Shape();
TensorShape output_shape;
ORT_RETURN_IF_ERROR(ComputeOutputShape(Node().Name(), condition_shape, X_shape, Y_shape, output_shape));
auto output_tensor = context->Output(0, output_shape);
if (output_shape.Size() == 0)
return Status::OK();
TernaryElementwisePreparation prepare(condition, X, Y);
ORT_RETURN_IF_ERROR(prepare.TernaryElementwiseBroadcastPrepareHelper(condition_shape, X_shape, Y_shape, output_shape));
WhereImpl<HipT>(
Stream(),
prepare.output_rank_or_simple_broadcast,
prepare.a_index_type,
prepare.a_padded_strides,
reinterpret_cast<const bool*>(prepare.a_tensor->Data<bool>()),
prepare.b_index_type,
prepare.b_padded_strides,
reinterpret_cast<const HipT*>(prepare.b_tensor->Data<T>()),
prepare.c_index_type,
prepare.c_padded_strides,
reinterpret_cast<const HipT*>(prepare.c_tensor->Data<T>()),
prepare.fdm_output_strides,
reinterpret_cast<HipT*>(output_tensor->MutableData<T>()),
output_tensor->Shape().Size());
return Status::OK();
}
#define SPECIALIZED_COMPUTE_WITH_NAME(T, TName) \
WHERE_TYPED_KERNEL_WITH_TYPE_NAME(T, TName) \
template Status Where<T>::ComputeInternal(OpKernelContext* context) const;
#define SPECIALIZED_COMPUTE(T) \
SPECIALIZED_COMPUTE_WITH_NAME(T, T)
SPECIALIZED_COMPUTE(uint8_t)
SPECIALIZED_COMPUTE(int32_t)
SPECIALIZED_COMPUTE(int64_t)
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double_t)
SPECIALIZED_COMPUTE(MLFloat16)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename T>
class Where final : public RocmKernel {
public:
Where(const OpKernelInfo& info) : RocmKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef __GNUC__
#include "onnxruntime_config.h"
#pragma GCC diagnostic ignored "-Wswitch"
#endif
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "where_impl.h"
namespace onnxruntime {
namespace rocm {
// broadcast by computing output coordinate from offset, using fast_divmod
template <typename T, BroadcastIndexType CondIndexType, BroadcastIndexType XIndexType, BroadcastIndexType YIndexType, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void _TenaryElementWise(
size_t output_rank,
const TArray<int64_t> cond_padded_strides,
const bool* cond_data,
const TArray<int64_t> x_padded_strides,
const T* x_data,
const TArray<int64_t> y_padded_strides,
const T* y_data,
const TArray<fast_divmod> fdm_output_strides,
T* output_data,
HIP_LONG N) {
HIP_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
bool cond_value[NumElementsPerThread];
T x_value[NumElementsPerThread];
T y_value[NumElementsPerThread];
HIP_LONG id = start;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
// compute indexes with broadcasting rules: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md
HIP_LONG cond_index = (CondIndexType == BroadcastIndexType::NoBroadcast ? id : 0);
HIP_LONG x_index = (XIndexType == BroadcastIndexType::NoBroadcast ? id : 0);
HIP_LONG y_index = (YIndexType == BroadcastIndexType::NoBroadcast ? id : 0);
HIP_LONG offset = id;
#pragma unroll
for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) {
if (dim >= output_rank) {
break;
}
int q, r;
fdm_output_strides[dim].divmod(offset, q, r);
if (CondIndexType == BroadcastIndexType::NeedCompute) {
cond_index += static_cast<int>(cond_padded_strides[dim]) * q;
}
if (XIndexType == BroadcastIndexType::NeedCompute) {
x_index += static_cast<int>(x_padded_strides[dim]) * q;
}
if (YIndexType == BroadcastIndexType::NeedCompute) {
y_index += static_cast<int>(y_padded_strides[dim]) * q;
}
offset = r;
}
cond_value[i] = cond_data[cond_index];
x_value[i] = x_data[x_index];
y_value[i] = y_data[y_index];
id += NumThreadsPerBlock;
}
}
id = start;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
output_data[id] = cond_value[i] ? x_value[i] : y_value[i];
id += NumThreadsPerBlock;
}
}
}
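// The kernel above follows a two-pass pattern: each thread first gathers up to
// NumElementsPerThread (cond, x, y) triples into registers, then writes the selected values out,
// keeping the global stores contiguous within each iteration.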
// for scalar broadcast or non-broadcast case
template <typename T, BroadcastIndexType CondIndexType, BroadcastIndexType XIndexType, BroadcastIndexType YIndexType, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void _TenaryElementWiseSimple(
const bool* cond_data,
const T* x_data,
const T* y_data,
T* output_data,
HIP_LONG N) {
HIP_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
bool cond_value[NumElementsPerThread];
T x_value[NumElementsPerThread];
T y_value[NumElementsPerThread];
HIP_LONG id = start;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
cond_value[i] = cond_data[CondIndexType == BroadcastIndexType::NoBroadcast ? id : 0];
x_value[i] = x_data[XIndexType == BroadcastIndexType::NoBroadcast ? id : 0];
y_value[i] = y_data[YIndexType == BroadcastIndexType::NoBroadcast ? id : 0];
id += NumThreadsPerBlock;
}
}
id = start;
#pragma unroll
for (int i = 0; i < NumElementsPerThread; i++) {
if (id < N) {
output_data[id] = cond_value[i] ? x_value[i] : y_value[i];
id += NumThreadsPerBlock;
}
}
}
#define HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \
case Y_INDEX_TYPE: { \
_TenaryElementWiseSimple<T, \
COND_INDEX_TYPE, \
X_INDEX_TYPE, \
Y_INDEX_TYPE, \
GridDim::maxThreadsPerBlock, \
GridDim::maxElementsPerThread> \
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(cond_data, \
x_data, \
y_data, \
output_data, \
N); \
} break
#define HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \
case X_INDEX_TYPE: { \
switch (Y_INDEX_TYPE_VAL) { \
HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \
HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \
} \
} break
#define HANDLE_COND_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \
case COND_INDEX_TYPE: { \
switch (X_INDEX_TYPE_VAL) { \
HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \
HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \
} \
} break
#define HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \
case Y_INDEX_TYPE: { \
_TenaryElementWise<T, \
COND_INDEX_TYPE, \
X_INDEX_TYPE, \
Y_INDEX_TYPE, \
GridDim::maxThreadsPerBlock, \
GridDim::maxElementsPerThread> \
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(output_rank_or_simple_broadcast, \
cond_padded_strides, \
cond_data, \
x_padded_strides, \
x_data, \
y_padded_strides, \
y_data, \
fdm_output_strides, \
output_data, \
N); \
} break
#define HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \
case X_INDEX_TYPE: { \
switch (Y_INDEX_TYPE_VAL) { \
HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \
HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \
HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NeedCompute); \
} \
} break
#define HANDLE_COND_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \
case COND_INDEX_TYPE: { \
switch (X_INDEX_TYPE_VAL) { \
HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \
HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \
HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NeedCompute, Y_INDEX_TYPE_VAL); \
} \
} break
template <typename T>
void WhereImpl(
hipStream_t stream,
size_t output_rank_or_simple_broadcast,
BroadcastIndexType cond_index_type,
const TArray<int64_t>& cond_padded_strides,
const bool* cond_data,
BroadcastIndexType x_index_type,
const TArray<int64_t>& x_padded_strides,
const T* x_data,
BroadcastIndexType y_index_type,
const TArray<int64_t>& y_padded_strides,
const T* y_data,
const TArray<fast_divmod>& fdm_output_strides,
T* output_data,
size_t count) {
int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
HIP_LONG N = static_cast<HIP_LONG>(count);
if (output_rank_or_simple_broadcast == static_cast<size_t>(SimpleBroadcast::NoBroadcast)) {
switch (cond_index_type) {
HANDLE_COND_INDEX_TYPE_SIMPLE(BroadcastIndexType::NoBroadcast, x_index_type, y_index_type);
HANDLE_COND_INDEX_TYPE_SIMPLE(BroadcastIndexType::Scalar, x_index_type, y_index_type);
}
} else {
switch (cond_index_type) {
HANDLE_COND_INDEX_TYPE(BroadcastIndexType::NoBroadcast, x_index_type, y_index_type);
HANDLE_COND_INDEX_TYPE(BroadcastIndexType::Scalar, x_index_type, y_index_type);
HANDLE_COND_INDEX_TYPE(BroadcastIndexType::NeedCompute, x_index_type, y_index_type);
}
}
}
#define SPECIALIZED_IMPL(T) \
template void WhereImpl<T>(hipStream_t stream, \
size_t output_rank_or_simple_broadcast, \
BroadcastIndexType cond_index_type, \
const TArray<int64_t>& cond_padded_strides, \
const bool* cond_data, \
BroadcastIndexType x_index_type, \
const TArray<int64_t>& x_padded_strides, \
const T* x_data, \
BroadcastIndexType y_index_type, \
const TArray<int64_t>& y_padded_strides, \
const T* y_data, \
const TArray<fast_divmod>& fdm_output_strides, \
T* output_data, \
size_t count);
SPECIALIZED_IMPL(uint8_t)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(int64_t)
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double_t)
SPECIALIZED_IMPL(half)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {
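// Elementwise select for the Where operator: output[i] = cond[i] ? x[i] : y[i].
// Each input may be used as-is (NoBroadcast), as a scalar (Scalar), or broadcast to the
// output shape via its padded strides (NeedCompute), as indicated by the corresponding
// *_index_type argument.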
template <typename T>
void WhereImpl(
hipStream_t stream,
size_t output_rank_or_simple_broadcast,
BroadcastIndexType cond_index_type,
const TArray<int64_t>& cond_padded_strides,
const bool* cond_data,
BroadcastIndexType x_index_type,
const TArray<int64_t>& x_padded_strides,
const T* x_data,
BroadcastIndexType y_index_type,
const TArray<int64_t>& y_padded_strides,
const T* y_data,
const TArray<fast_divmod>& fdm_output_strides,
T* output_data,
size_t count);
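// Illustrative sketch (not part of the provider): invoking WhereImpl on the simple
// no-broadcast path. On this path the stride arguments are ignored by the kernel, so
// default-constructed TArrays are assumed to be sufficient here.
//
//   TArray<int64_t> unused_strides;
//   TArray<fast_divmod> unused_output_strides;
//   WhereImpl<float>(stream,
//                    static_cast<size_t>(SimpleBroadcast::NoBroadcast),
//                    BroadcastIndexType::NoBroadcast, unused_strides, cond_data,
//                    BroadcastIndexType::NoBroadcast, unused_strides, x_data,
//                    BroadcastIndexType::NoBroadcast, unused_strides, y_data,
//                    unused_output_strides, output_data, count);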
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifndef NDEBUG
namespace onnxruntime {
namespace rocm {
namespace test {
// This header declares test functions implemented in the ROCM EP-side bridge.
bool TestDeferredRelease();
bool TestDeferredReleaseWithoutArena();
bool TestBeamSearchTopK();
} // namespace test
} // namespace rocm
} // namespace onnxruntime
#endif
#ifndef NDEBUG
#include "contrib_ops/rocm/transformers/beam_search_topk.h"
#include <algorithm>
#include <numeric>
#include <queue>
#include <random>
#include <hip/hip_runtime.h>
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
namespace onnxruntime {
namespace rocm {
namespace test {
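// Fills `values` with a known permutation: entry [batch, beam, vocab] gets
// beam + vocab * beam_size, so within each batch the beam_size * vocab_size scores are
// exactly 0 .. beam_size * vocab_size - 1 with no duplicates; each beam's slice is then
// shuffled so the top-k positions are not trivial.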
void FillAndShuffle(std::vector<float>& values, int32_t batch_size, int32_t beam_size, int32_t vocab_size) {
std::random_device rd;
std::mt19937 generator(rd());
for (int32_t batch = 0; batch < batch_size; batch++) {
int32_t batch_base_idx = batch * beam_size * vocab_size;
for (int32_t beam = 0; beam < beam_size; beam++) {
int32_t value = beam;
int32_t beam_base_idx = beam * vocab_size;
for (int32_t vocab = 0; vocab < vocab_size; vocab++) {
        values[batch_base_idx + beam_base_idx + vocab] = static_cast<float>(value);
value += beam_size;
}
std::shuffle(values.begin() + batch_base_idx + beam_base_idx,
values.begin() + batch_base_idx + beam_base_idx + vocab_size,
generator);
}
}
}
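// CPU reference: for each batch, keep a min-heap of size k over the flattened
// [beam_size * vocab_size] scores, then drain it so results are stored in descending
// order. top_k_tokens holds the index within a beam's vocab, top_k_indices the beam.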
void ComputeTopKReference(const std::vector<float>& values,
std::vector<float>& top_k_values,
std::vector<int32_t>& top_k_tokens,
std::vector<int32_t>& top_k_indices,
int32_t batch_size,
int32_t beam_size,
int32_t vocab_size,
int32_t k) {
using VK = std::pair<float, int32_t>;
for (int32_t b = 0; b < batch_size; b++) {
std::priority_queue<VK, std::vector<VK>, std::greater<VK>> queue;
int32_t base_idx = b * beam_size * vocab_size;
    // Initialize the min-heap with the first k elements.
for (int32_t i = 0; i < k; i++) {
queue.push({values[base_idx + i], i});
}
for (int32_t i = k; i < beam_size * vocab_size; i++) {
if (values[base_idx + i] > queue.top().first) {
queue.pop();
queue.push({values[base_idx + i], i});
}
}
int32_t top_k_base_idx = b * k;
for (int32_t i = k - 1; i >= 0; i--) {
top_k_values[top_k_base_idx + i] = queue.top().first;
top_k_tokens[top_k_base_idx + i] = queue.top().second % vocab_size;
top_k_indices[top_k_base_idx + i] = queue.top().second / vocab_size;
queue.pop();
}
}
}
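// End-to-end check of contrib::rocm::BeamSearchTopK: generate shuffled scores on the
// host, compute the expected top-k with the reference above, run the device kernel on
// the default stream, and compare values, tokens and beam indices exactly. Exact
// comparison is safe because the scores are distinct small integers stored as float.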
bool TestBeamSearchTopK() {
int32_t batch_size = 4;
int32_t beam_size = 4;
int32_t vocab_size = 50257;
int32_t k = 2 * beam_size;
int32_t batch_x_beam_x_vocab = batch_size * beam_size * vocab_size;
std::vector<float> values(batch_x_beam_x_vocab);
FillAndShuffle(values, batch_size, beam_size, vocab_size);
std::vector<float> top_k_values_ref(batch_size * k);
std::vector<int32_t> top_k_tokens_ref(batch_size * k);
std::vector<int32_t> top_k_indices_ref(batch_size * k);
ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, beam_size, vocab_size, k);
const int32_t max_vocab_parts = 128;
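  // One device allocation carved into: the input scores, stage-1 temporaries
  // (batch * beam * k values and tokens per vocab partition, up to max_vocab_parts
  // partitions), stage-2 temporaries (batch * beam * k values and tokens), and the
  // three outputs of batch * k entries each. The literal 4 is sizeof(float), which
  // equals sizeof(int32_t).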
size_t buffer_size = batch_x_beam_x_vocab * 4 // input
+ batch_size * beam_size * k * (max_vocab_parts + 1) * 2 * 4 // tmp
+ batch_size * k * 3 * 4; // output size
void* rocm_buffer = nullptr;
hipMalloc(&rocm_buffer, buffer_size);
float* values_device = (float*)rocm_buffer;
float* top_k_1st_values_tmp = (float*)(values_device + batch_x_beam_x_vocab);
int32_t* top_k_1st_tokens_tmp = (int32_t*)(top_k_1st_values_tmp + batch_size * beam_size * k * max_vocab_parts);
float* top_k_2nd_values_tmp = (float*)(top_k_1st_tokens_tmp + batch_size * beam_size * k * max_vocab_parts);
int32_t* top_k_2nd_tokens_tmp = (int32_t*)(top_k_2nd_values_tmp + batch_size * beam_size * k);
float* top_k_value = (float*)(top_k_2nd_tokens_tmp + batch_size * beam_size * k);
int32_t* top_k_token = (int32_t*)(top_k_value + batch_size * k);
int32_t* top_k_indices = (int32_t*)(top_k_token + batch_size * k);
hipMemcpy(values_device, values.data(), batch_x_beam_x_vocab * 4, hipMemcpyHostToDevice);
contrib::rocm::BeamSearchTopK(values_device,
batch_size,
beam_size,
vocab_size,
k,
top_k_1st_values_tmp,
top_k_1st_tokens_tmp,
top_k_2nd_values_tmp,
top_k_2nd_tokens_tmp,
top_k_value,
top_k_token,
top_k_indices,
NULL /*stream*/);
std::vector<float> top_k_values_host(batch_size * k);
std::vector<int32_t> top_k_token_host(batch_size * k);
std::vector<int32_t> top_k_indices_host(batch_size * k);
hipMemcpy(top_k_values_host.data(), top_k_value, batch_size * k * 4, hipMemcpyDeviceToHost);
hipMemcpy(top_k_token_host.data(), top_k_token, batch_size * k * 4, hipMemcpyDeviceToHost);
hipMemcpy(top_k_indices_host.data(), top_k_indices, batch_size * k * 4, hipMemcpyDeviceToHost);
for (int32_t i = 0; i < batch_size * k; i++) {
if (top_k_values_ref[i] != top_k_values_host[i] ||
top_k_tokens_ref[i] != top_k_token_host[i] ||
top_k_indices_ref[i] != top_k_indices_host[i]) {
return false;
}
}
return true;
}
} // namespace test
} // namespace rocm
} // namespace onnxruntime
#endif
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// This test is built only under DEBUG mode because it requires
// extra code in the core of ROCM EP and that code may
// 1. slow down performance critical applications and
// 2. increase binary size of ORT.
#ifndef NDEBUG
#include <iostream>
#include "core/providers/rocm/test/all_tests.h"
#include "core/providers/rocm/rocm_execution_provider.h"
#include "core/providers/rocm/rocm_allocator.h"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
namespace onnxruntime {
namespace rocm {
namespace test {
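// Allocates pinned CPU buffers through the EP during a run, hands each one to
// AddDeferredReleaseCPUPtr, checks that the arena-backed pinned allocator recorded the
// expected number of allocations, and relies on OnRunEnd to flush the deferred list.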
bool TestDeferredRelease() {
// Create ROCM EP.
ROCMExecutionProviderInfo info;
ROCMExecutionProvider ep(info);
// Initialize allocators in EP.
onnxruntime::AllocatorManager allocator_manager;
ep.RegisterAllocator(allocator_manager);
// Allocator that calls hipHostMalloc and hipHostFree.
// For details, see ROCMPinnedAllocator in rocm_allocator.cc.
AllocatorPtr cpu_pinned_alloc = ep.GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
// 10 MB
const size_t n_bytes = 10 * 1000000;
const int64_t n_allocs = 64;
ORT_THROW_IF_ERROR(ep.OnRunStart());
for (size_t i = 0; i < n_allocs; ++i) {
// Allocate 10MB ROCM pinned memory.
auto pinned_buffer = ep.AllocateBufferOnCPUPinned<void>(n_bytes);
// Release it using ROCM callback.
ep.AddDeferredReleaseCPUPtr(pinned_buffer.release());
}
// Memory stats
AllocatorStats stats;
cpu_pinned_alloc->GetStats(&stats);
ORT_ENFORCE(stats.num_allocs == n_allocs);
ORT_THROW_IF_ERROR(ep.OnRunEnd(true));
return true;
}
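// Same flow as TestDeferredRelease, but the pinned allocator is created without a
// BFCArena and pre-registered in the AllocatorManager, so no allocation stats are
// checked; the test only verifies that deferred release completes without error.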
bool TestDeferredReleaseWithoutArena() {
// Create ROCM EP.
ROCMExecutionProviderInfo info;
ROCMExecutionProvider ep(info);
// Initialize allocators in EP.
onnxruntime::AllocatorManager allocator_manager;
OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
// Create allocator without BFCArena
AllocatorCreationInfo pinned_memory_info(
[](OrtDevice::DeviceId device_id) {
return std::make_unique<ROCMPinnedAllocator>(device_id, CUDA_PINNED);
},
pinned_device.Id(),
false /* no arena */);
auto rocm_pinned_alloc = CreateAllocator(pinned_memory_info);
allocator_manager.InsertAllocator(rocm_pinned_alloc);
// Use existing allocator in allocator_manager.
// Also register new allocator created by this EP in allocator_manager.
ep.RegisterAllocator(allocator_manager);
// Allocator that calls hipHostMalloc and hipHostFree.
// For details, see ROCMPinnedAllocator in rocm_allocator.cc.
AllocatorPtr cpu_pinned_alloc = ep.GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);
// 10 MB
const size_t n_bytes = 10 * 1000000;
const int64_t n_allocs = 64;
ORT_THROW_IF_ERROR(ep.OnRunStart());
for (size_t i = 0; i < n_allocs; ++i) {
// Allocate 10MB ROCM pinned memory.
auto pinned_buffer = ep.AllocateBufferOnCPUPinned<void>(n_bytes);
// Release it using ROCM callback.
ep.AddDeferredReleaseCPUPtr(pinned_buffer.release());
}
ORT_THROW_IF_ERROR(ep.OnRunEnd(true));
return true;
}
} // namespace test
} // namespace rocm
} // namespace onnxruntime
#endif