Commit 1a91fcc2 authored by gaoqiong

add files required by dtk

parent a144865d
Pipeline #492 failed with stages in 0 seconds
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/reduction/reduction_functions.h"
#include <algorithm>
#include <cassert>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include "core/common/common.h"
#include "core/providers/rocm/atomic/common.cuh"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/providers/rocm/reduction/reduction_utils.cuh"
#include "core/providers/rocm/cu_inc/unary_elementwise_impl.cuh"
namespace onnxruntime {
namespace rocm {
namespace detail {
constexpr auto MAX_NUM_ELEMENTS_PER_THREAD = 4;
constexpr auto MAX_NUM_WARPS_PER_BLOCK = 8;
constexpr auto MAX_NUM_BLOCKS_IN_GRID_ROW = 256;
constexpr auto MAX_NUM_GRID_ROWS = 32768;
dim3 compute_block_dim(int num_cols) {
const int x = GPU_WARP_SIZE_HOST;
const int y = std::min(MAX_NUM_WARPS_PER_BLOCK, std::max(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * x)));
return dim3(x, y);
}
std::pair<dim3, dim3> compute_grid_and_block_dims(int num_rows, int num_cols) {
const auto block_dim = compute_block_dim(num_cols);
const auto grid_x =
std::min<int>(
MAX_NUM_BLOCKS_IN_GRID_ROW,
std::max<int>(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * block_dim.x * block_dim.y)));
const auto grid_y = std::min(MAX_NUM_GRID_ROWS, num_rows);
const dim3 grid_dim(grid_x, grid_y);
return {grid_dim, block_dim};
}
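// Worked example (illustrative sketch, not part of the original source): assuming
// GPU_WARP_SIZE_HOST == 64 (typical for AMD wavefronts; some targets use 32),
// num_rows == 8 and num_cols == 8192:
//   block_dim = (64, min(8, max(1, 8192 / (4 * 64))))  = (64, 8)  -> 512 threads per block
//   grid_x    = min(256, max(1, 8192 / (4 * 64 * 8)))  = 4
//   grid_y    = min(32768, 8)                          = 8
// so the launch uses a 4x8 grid of 64x8 blocks.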
uintptr_t round_up_to_aligned(uintptr_t original, size_t alignment) {
assert((alignment & (alignment - 1)) == 0);
const size_t alignment_mask = ~(alignment - 1);
return (original + alignment - 1) & alignment_mask;
}
/**
* call_reduce_matrix_columns() intermediate buffer layout
*
* Given buffer element type TBuf, the intermediate buffer layout looks like this:
*
* -----
* m * num_blocks_per_row * sizeof(TBuf) bytes for block reductions per row
* alignment padding bytes as needed
* m * sizeof(int) bytes for block done counts per row
* -----
*/
size_t compute_reduce_matrix_columns_intermediate_buffer_size(
int element_size, int num_rows, int num_cols) {
ORT_ENFORCE(element_size >= 0 && num_rows >= 0 && num_cols >= 0);
const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first;
size_t buffer_size{};
// at the beginning, for sizing purposes, assume we are aligned
buffer_size += static_cast<size_t>(num_rows) * grid_dim.x * element_size;
buffer_size = round_up_to_aligned(buffer_size, alignof(int));
buffer_size += static_cast<size_t>(num_rows) * sizeof(int);
// add padding to give us room to align
buffer_size += alignof(max_align_t) - 1;
return buffer_size;
}
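// Worked example (illustrative sketch, not part of the original source): with
// num_rows == 8, grid_dim.x == 4 (the launch configuration computed above for
// num_cols == 8192) and TBuf == float (element_size == 4):
//   block reductions:  8 * 4 * 4 bytes                     = 128
//   round up to alignof(int) (4 on typical targets)        = 128
//   block done counts: + 8 * sizeof(int)                   = 160
//   alignment padding: + alignof(max_align_t) - 1          = 175 (assuming alignof(max_align_t) == 16)
// The exact total depends on the platform's alignment values.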
template <typename TBuf>
Status get_reduction_buffers(
int num_rows, int num_cols, void* buffer, size_t buffer_size,
TBuf*& block_reductions_buffer, int*& block_done_counts_buffer) {
const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first;
const uintptr_t begin_addr = reinterpret_cast<uintptr_t>(buffer);
const uintptr_t block_reductions_addr =
round_up_to_aligned(begin_addr, alignof(TBuf));
const uintptr_t block_done_counts_buffer_addr =
round_up_to_aligned(
block_reductions_addr + static_cast<size_t>(num_rows) * grid_dim.x * sizeof(TBuf), alignof(int));
const uintptr_t end_addr =
block_done_counts_buffer_addr + static_cast<size_t>(num_rows) * sizeof(int);
const size_t required_size = end_addr - begin_addr;
ORT_RETURN_IF_NOT(
required_size <= buffer_size,
"Buffer size is too small (", buffer_size, " bytes). ",
"At least ", required_size, " bytes are needed from the given base address (", buffer, ").");
block_reductions_buffer = reinterpret_cast<TBuf*>(block_reductions_addr);
block_done_counts_buffer = reinterpret_cast<int*>(block_done_counts_buffer_addr);
return Status::OK();
}
template <typename TIn, typename TOut, typename TBuf, typename TOp, typename TFinalOp, bool DivideResultBySize>
__device__ void reduce_all(
const int num_elements, const TIn* const input, TOut* const output,
TBuf* const block_reductions_buffer, int* const block_done_count_buffer) {
extern __shared__ unsigned char shared_memory_bytes[];
TBuf* shared_memory = reinterpret_cast<TBuf*>(shared_memory_bytes);
// Thread-level indices:
// Linear index of thread in block.
const int tid_in_block = threadIdx.y * blockDim.x + threadIdx.x;
// Total number of threads in a 2-D block.
const int num_threads_in_block = blockDim.x * blockDim.y;
// Warp-level indices:
// Warp index of thread.
const int wid_in_block = tid_in_block / GPU_WARP_SIZE;
// Lane index of thread.
const int lid_in_block = tid_in_block % GPU_WARP_SIZE;
// Warp count per block.
const int num_warps_in_block = num_threads_in_block / GPU_WARP_SIZE;
// Grid-level indices:
// Linear index of block in grid row.
const int bid_in_grid_row = blockIdx.x;
// Linear index of thread in grid row.
const int tid_in_grid_row = bid_in_grid_row * (blockDim.x * blockDim.y) + tid_in_block;
// Total number of blocks in a grid row.
const int num_blocks_in_grid_row = gridDim.x;
// Total number of threads in a grid row with 2-D blocks.
const int num_threads_in_grid_row = num_blocks_in_grid_row * num_threads_in_block;
const auto write_result = [&output, &num_elements](const TOut result) {
// This if-else is controlled by a compile-time template argument, so the unused
// path is optimized out and no branch remains in the actual computation.
if (DivideResultBySize) {
output[0] = TFinalOp()(result / TOut(num_elements));
} else {
output[0] = TFinalOp()(result);
}
};
// Thread-level reduction (storage change: global memory -> register).
// One thread reduces MAX_NUM_ELEMENTS_PER_THREAD elements to a thread register
// in one iteration.
TBuf value = 0;
for (int id = tid_in_grid_row; id < num_elements; id += MAX_NUM_ELEMENTS_PER_THREAD * num_threads_in_grid_row) {
TIn v[MAX_NUM_ELEMENTS_PER_THREAD];
#pragma unroll
for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
const int offset = id + i * num_threads_in_grid_row;
if (offset < num_elements) {
v[i] = input[offset];
}
}
#pragma unroll
for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
const int offset = id + i * num_threads_in_grid_row;
if (offset < num_elements) {
value += TOp()(TBuf(v[i]));
}
}
}
#if __CUDA_ARCH__ >= 700
__syncwarp();
#else
__syncthreads();
#endif
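// Note: __CUDA_ARCH__ is not defined when compiling HIP device code for AMD GPUs,
// so the __syncthreads() branch is the one taken in a typical ROCm build.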
// Warp-level reduction (storage change: register -> register).
// The values in a warp will be summed up to a scalar. After warp-level
// reduction, each block holds num_warps_in_block values in the shared memory.
#pragma unroll
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
value += WARP_SHFL_DOWN(value, stride);
}
// Return early if only one warp is used for reduction.
// Given a fixed number of threads, we prefer filling threads before warps and warps
// before blocks, so we never end up with cases such as:
// 1. two blocks, each with only one warp (32 threads);
// 2. two warps, each with only two threads.
if (num_warps_in_block == 1) {
if (tid_in_grid_row == 0) {
write_result(value);
}
return;
}
if (lid_in_block == 0) {
shared_memory[wid_in_block] = value;
}
__syncthreads();
// Block-level reduction (storage change: shared memory -> global memory).
// The values in a block will be summed up to a scalar.
// Note that the per-warp partial sums are stored in shared memory, one per warp,
// and we keep halving the stride starting from MAX_NUM_WARPS_PER_BLOCK / 2.
// This covers all stored values because the number of warps in a block never
// exceeds MAX_NUM_WARPS_PER_BLOCK (compute_block_dim() caps blockDim.y at that value).
#pragma unroll
for (int stride = MAX_NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) {
if (tid_in_block + stride < num_warps_in_block) {
shared_memory[tid_in_block] += shared_memory[tid_in_block + stride];
}
__syncthreads();
}
// Return early if only one block is used for reduction.
if (num_blocks_in_grid_row == 1) {
if (tid_in_grid_row == 0) {
write_result(shared_memory[0]);
}
return;
}
if (tid_in_block == 0) {
block_reductions_buffer[bid_in_grid_row] = shared_memory[0];
}
__threadfence();
__syncthreads();
// Grid-level reduction. We use the last block to sum up values
// stored in the global block_reductions_buffer.
__shared__ bool is_last_block_done;
if (tid_in_block == 0) {
const int count = atomicAdd(block_done_count_buffer, 1);
is_last_block_done = (count == (num_blocks_in_grid_row - 1));
}
// All threads in each block check whether they belong to the last active block
// (i.e., they read the value of is_last_block_done).
__syncthreads();
// Only the block that observed count == num_blocks_in_grid_row - 1 enters the
// following branch.
if (is_last_block_done) {
const int pow2_bound = least_pow2_bound(num_blocks_in_grid_row);
for (int stride = pow2_bound / 2; stride > 0; stride /= 2) {
if (tid_in_block < stride && tid_in_block + stride < num_blocks_in_grid_row) {
block_reductions_buffer[tid_in_block] += block_reductions_buffer[tid_in_block + stride];
}
__syncthreads();
}
// The first thread in the last block assigns the final output.
if (tid_in_block == 0) {
write_result(block_reductions_buffer[0]);
}
}
}
template <typename TIn, typename TOut, typename TBuf, typename TOp, typename TFinalOp, bool DivideResultBySize>
__global__ void reduce_matrix_columns_kernel(
const int num_rows, const int num_cols, const TIn* const input, TOut* const output,
TBuf* const block_reductions_buffer, int* const block_done_counts_buffer) {
const int num_blocks_in_grid_row = gridDim.x;
const int row_id_in_grid = blockIdx.y;
const int num_grid_rows = gridDim.y;
// one row per iteration
// row_id is int64_t to avoid int overflow in offset calculations
for (int64_t row_id = row_id_in_grid; row_id < num_rows; row_id += num_grid_rows) {
const TIn* const row_data = input + row_id * num_cols;
TOut* const row_output = output + row_id;
TBuf* const row_block_reductions_buffer = block_reductions_buffer + row_id * num_blocks_in_grid_row;
int* const row_block_done_counts_buffer = block_done_counts_buffer + row_id;
reduce_all<TIn, TOut, TBuf, TOp, TFinalOp, DivideResultBySize>(
num_cols, row_data, row_output,
row_block_reductions_buffer, row_block_done_counts_buffer);
}
}
template <typename TIn, typename TOut, typename TOp, typename TFinalOp, bool DivideResultBySize>
Status call_reduce_matrix_columns(
hipStream_t stream, const TIn* input, TOut* output, const int num_rows, const int num_cols, void* buffer, size_t buffer_size) {
ORT_ENFORCE(num_rows >= 0 && num_cols >= 0);
using TBuf = AccumulationType_t<TIn>;
const auto grid_and_block_dims = compute_grid_and_block_dims(num_rows, num_cols);
const dim3& grid_dim = grid_and_block_dims.first;
const dim3& block_dim = grid_and_block_dims.second;
TBuf* block_reductions_buffer;
int* block_done_counts_buffer;
ORT_RETURN_IF_ERROR(get_reduction_buffers(
num_rows, num_cols, buffer, buffer_size,
block_reductions_buffer, block_done_counts_buffer));
// If more than one block is used per grid row, then inter-block reduction is needed.
if (grid_dim.x > 1) {
HIP_RETURN_IF_ERROR(hipMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int), stream));
}
const int shared_mem_size = sizeof(TBuf) * block_dim.x * block_dim.y / GPU_WARP_SIZE_HOST;
hipLaunchKernelGGL(HIP_KERNEL_NAME(reduce_matrix_columns_kernel<TIn, TOut, TBuf, TOp, TFinalOp, DivideResultBySize>), grid_dim, block_dim, shared_mem_size, stream,
num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer);
return Status::OK();
}
} // namespace detail
template <typename TIn, typename TOut>
Status reduce_sum(
hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Identity, Identity, false>(
stream, input, output, 1, size, buffer, buffer_size);
}
template <typename TIn, typename TOut>
Status reduce_square_sum(
hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Square, Identity, false>(
stream, input, output, 1, size, buffer, buffer_size);
}
template <typename TIn, typename TOut>
Status reduce_l2_norm(
hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Square, Sqrt, false>(
stream, input, output, 1, size, buffer, buffer_size);
}
template <typename TIn, typename TOut>
Status reduce_mean(
hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Identity, Identity, true>(
stream, input, output, 1, size, buffer, buffer_size);
}
#define INSTANTIATE_REDUCE_SUM(TIn, TOut) \
template Status reduce_sum<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_SUM(half, half);
INSTANTIATE_REDUCE_SUM(half, float);
INSTANTIATE_REDUCE_SUM(float, float);
INSTANTIATE_REDUCE_SUM(double, double);
INSTANTIATE_REDUCE_SUM(BFloat16, BFloat16);
INSTANTIATE_REDUCE_SUM(BFloat16, float);
#undef INSTANTIATE_REDUCE_SUM
#define INSTANTIATE_REDUCE_SQUARE_SUM(TIn, TOut) \
template Status reduce_square_sum<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_SQUARE_SUM(half, float);
INSTANTIATE_REDUCE_SQUARE_SUM(float, float);
INSTANTIATE_REDUCE_SQUARE_SUM(double, double);
INSTANTIATE_REDUCE_SQUARE_SUM(BFloat16, float);
#undef INSTANTIATE_REDUCE_SQUARE_SUM
#define INSTANTIATE_REDUCE_L2_NORM(TIn, TOut) \
template Status reduce_l2_norm<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_L2_NORM(half, float);
INSTANTIATE_REDUCE_L2_NORM(float, float);
INSTANTIATE_REDUCE_L2_NORM(double, double);
#undef INSTANTIATE_REDUCE_L2_NORM
#define INSTANTIATE_REDUCE_MEAN(TIn, TOut) \
template Status reduce_mean<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_MEAN(half, float);
INSTANTIATE_REDUCE_MEAN(float, float);
INSTANTIATE_REDUCE_MEAN(double, double);
#undef INSTANTIATE_REDUCE_MEAN
namespace detail {
template <typename TIn, typename TOut, typename TBuf>
__global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, int n) {
constexpr int x_load_count_per_thread = 1;
constexpr int y_load_count_per_thread = 4;
const int t_count_x_in_grid = blockDim.x * gridDim.x;
const int t_count_y_in_grid = blockDim.y * gridDim.y;
const int x_grid_stride = t_count_x_in_grid * x_load_count_per_thread;
const int y_grid_stride = t_count_y_in_grid * y_load_count_per_thread;
const int tid_x_in_grid = threadIdx.x + blockDim.x * blockIdx.x;
const int tid_y_in_grid = threadIdx.y + blockDim.y * blockIdx.y;
const int tid_in_block = threadIdx.x + blockDim.x * threadIdx.y;
// Shape is blockDim.y-by-blockDim.x and element type is TBuf.
extern __shared__ unsigned char shared_memory_bytes[];
TBuf* shared_memory = reinterpret_cast<TBuf*>(shared_memory_bytes);
// to prevent int overflow in index calculation for input size m*n
const int64_t n_int64 = static_cast<int64_t>(n);
for (int col = tid_x_in_grid; col < n; col += x_grid_stride) {
shared_memory[tid_in_block] = TBuf(0.0f);
TBuf sum = TBuf(0.0f);
// This loop loads multiple blockDim.y-by-blockDim.x sub-tensors from the input.
for (int row = tid_y_in_grid; row < m; row += y_grid_stride) {
// Thread-level reduction. Each thread loads y_load_count_per_thread values
// and aggregates them.
#pragma unroll y_load_count_per_thread
for (int row_inner = 0; row_inner < y_load_count_per_thread; ++row_inner) {
int row_final = row + row_inner * t_count_y_in_grid;
int col_final = col;
if (row_final < m && col_final < n) {
sum += TBuf(input[row_final * n_int64 + col_final]);
}
}
}
// Write thread-level reduction result into shared memory.
shared_memory[tid_in_block] = sum;
// Wait for all threads to finish their thread-level reductions.
__syncthreads();
// This loop conducts reduction on elements stored in shared memory.
// Each block reduces blockDim.y-by-blockDim.x tensor to 1-by-blockDim.x tensor.
#pragma unroll 4
for (int stride = blockDim.y / 2; stride > 0; stride /= 2) {
if (threadIdx.y < stride) {
shared_memory[tid_in_block] += shared_memory[tid_in_block + stride * blockDim.x];
}
__syncthreads();
}
if (threadIdx.y == 0) {
atomic_add(output + col, TOut(shared_memory[threadIdx.x]));
}
}
}
template <typename TIn, typename TOut, typename TBuf>
Status call_reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) {
ORT_ENFORCE(m >= 0 && n >= 0);
if (reset_initial_output) {
HIP_RETURN_IF_ERROR(hipMemsetAsync(output, 0, n * sizeof(TOut), stream));
}
constexpr int max_num_threads_in_block = 512;
constexpr int max_num_blocks_in_grid = 512;
constexpr int load_count_per_thread = 4;
const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE_HOST)));
const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread)));
const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid));
const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4));
const dim3 grid(grid_x_dim, grid_y_dim, 1);
const dim3 block(block_x_dim, block_y_dim, 1);
reduce_matrix_rows_kernel<TIn, TOut, TBuf><<<grid, block, block.y * block.x * sizeof(TBuf), stream>>>(
input, output, m, n);
return Status::OK();
}
} // namespace detail
template <typename T>
struct OP_Div {
__device__ __inline__ T operator()(const T& a) const {
return a / v_;
}
OP_Div(T v) : v_(v) {}
T v_;
};
template <typename T>
void UnaryDiv(hipStream_t stream, const T* input, T* output, T denominator, size_t count) {
UnaryElementWiseImpl(stream, input, output, OP_Div<T>(denominator), count);
}
#define INSTANTIATE_UNARY_DIV(T) \
template void UnaryDiv<T>(hipStream_t stream, const T* input, T* output, T denominator, size_t count)
INSTANTIATE_UNARY_DIV(half);
INSTANTIATE_UNARY_DIV(float);
INSTANTIATE_UNARY_DIV(double);
INSTANTIATE_UNARY_DIV(BFloat16);
#undef INSTANTIATE_UNARY_DIV
template <typename TIn, typename TOut>
Status reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) {
using TBuf = AccumulationType_t<TIn>;
return detail::call_reduce_matrix_rows<TIn, TOut, TBuf>(stream, input, output, m, n, reset_initial_output);
}
#define INSTANTIATE_REDUCE_MATRIX_ROWS(T) \
template Status reduce_matrix_rows<T, T>(hipStream_t stream, const T* input, T* output, int m, int n, bool reset_initial_output)
INSTANTIATE_REDUCE_MATRIX_ROWS(half);
INSTANTIATE_REDUCE_MATRIX_ROWS(float);
INSTANTIATE_REDUCE_MATRIX_ROWS(double);
INSTANTIATE_REDUCE_MATRIX_ROWS(BFloat16);
#undef INSTANTIATE_REDUCE_MATRIX_ROWS
template <typename TIn, typename TOut>
Status reduce_matrix_columns(hipStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) {
return detail::call_reduce_matrix_columns<TIn, TOut, Identity, Identity, false>(
stream, input, output, m, n, buffer, buffer_size);
}
#define INSTANTIATE_REDUCE_MATRIX_COLUMNS(T) \
template Status reduce_matrix_columns<T, T>(hipStream_t stream, const T* input, T* output, int m, int n, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_MATRIX_COLUMNS(half);
INSTANTIATE_REDUCE_MATRIX_COLUMNS(float);
INSTANTIATE_REDUCE_MATRIX_COLUMNS(double);
INSTANTIATE_REDUCE_MATRIX_COLUMNS(BFloat16);
#undef INSTANTIATE_REDUCE_MATRIX_COLUMNS
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/accumulation_type.h"
namespace onnxruntime {
namespace rocm {
namespace detail {
size_t compute_reduce_matrix_columns_intermediate_buffer_size(
int element_size, int num_rows, int num_cols);
} // namespace detail
/**
* Computes the size in bytes of the intermediate buffer needed by reduce_matrix_columns().
* @tparam TIn The input data type.
* @param m The number of matrix rows.
* @param n The number of matrix columns.
* @return The size of the intermediate buffer.
*/
template <typename TIn>
size_t compute_reduce_matrix_columns_buffer_size(int m, int n) {
using TBuf = AccumulationType_t<TIn>;
return detail::compute_reduce_matrix_columns_intermediate_buffer_size(
sizeof(TBuf), m, n);
}
/**
* Computes the size in bytes of the intermediate buffer needed by the reduce_x() functions.
* @tparam TIn The input data type.
* @param size The number of elements.
* @return The size of the intermediate buffer.
*/
template <typename TIn>
size_t compute_reduction_buffer_size(int size) {
using TBuf = AccumulationType_t<TIn>;
return detail::compute_reduce_matrix_columns_intermediate_buffer_size(
sizeof(TBuf), 1, size);
}
/** Computes the sum of the given elements. */
template <typename TIn, typename TOut>
Status reduce_sum(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);
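/**
 * Usage sketch (illustrative only, not part of the original source). It assumes
 * `d_in` and `d_out` are valid float device pointers, `size` is the element count,
 * and `stream` is a valid hipStream_t; error handling is abbreviated.
 *
 *   size_t scratch_bytes = compute_reduction_buffer_size<float>(size);
 *   void* scratch = nullptr;
 *   HIP_CALL_THROW(hipMalloc(&scratch, scratch_bytes));
 *   ORT_RETURN_IF_ERROR(reduce_sum(stream, d_in, d_out, size, scratch, scratch_bytes));
 *   HIP_CALL_THROW(hipFree(scratch));
 */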
/** Computes the sum of the squares of the given elements. */
template <typename TIn, typename TOut>
Status reduce_square_sum(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);
/** Computes the L2 norm of the given elements. */
template <typename TIn, typename TOut>
Status reduce_l2_norm(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);
/** Computes the mean of the given elements. */
template <typename TIn, typename TOut>
Status reduce_mean(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);
enum class ApplicableMatrixReduction {
// can use reduce_matrix_rows()
Rows,
// can use reduce_matrix_columns()
Columns,
// no optimized matrix reduction function applies
None,
};
/**
* Determines whether a MIOpen reduction can be computed by an optimized matrix reduction function.
* @param miopen_reduce_op The MIOpen reduction op type.
* @param dims The input dimensions.
* @param axes The reduction axes.
* @param[out] m If matrix reduction is possible, the number of matrix rows to use.
* @param[out] n If matrix reduction is possible, the number of matrix columns to use.
* @return The type of matrix reduction that can be done.
*/
ApplicableMatrixReduction get_applicable_matrix_reduction(
const miopenReduceTensorOp_t miopen_reduce_op,
gsl::span<const int64_t> dims, gsl::span<const int64_t> axes,
int& m, int& n);
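/**
 * Example (illustrative reading of the mapping, not from the original source):
 * for dims = {2, 3, 4},
 *   reducing axes {0} can be handled by reduce_matrix_rows()    with m = 2, n = 12;
 *   reducing axes {2} can be handled by reduce_matrix_columns() with m = 6, n = 4;
 *   reducing axes {1} matches neither pattern, so None is returned.
 * The exact set of supported ops and axis patterns is determined by the implementation.
 */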
/**
* Reduces the rows in a row-major matrix to a single row containing the sum of each column.
* @param input The input data.
* @param output The output data.
* @param m The number of matrix rows.
* @param n The number of matrix columns.
* @param reset_initial_output Whether to reset (i.e., zero) the output values first.
*/
template <typename TIn, typename TOut>
Status reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true);
/**
* Reduces the columns in a row-major matrix to a single column containing the sum of each row.
* @param input The input data.
* @param output The output data.
* @param m The number of matrix rows.
* @param n The number of matrix columns.
* @param buffer The intermediate buffer.
* @param buffer_size The size of the intermediate buffer in bytes.
*/
template <typename TIn, typename TOut>
Status reduce_matrix_columns(hipStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size);
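/**
 * Example of reduce_matrix_rows() and reduce_matrix_columns() above (illustrative
 * only): for the row-major 2x3 matrix {{1, 2, 3}, {4, 5, 6}},
 *   reduce_matrix_rows()    produces the column sums {5, 7, 9};
 *   reduce_matrix_columns() produces the row sums    {6, 15}.
 */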
/** Apply unary elementwise division. */
template <typename T>
void UnaryDiv(hipStream_t stream, const T* input, T* output, T denominator, size_t count);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/optional.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/reduction/reduction_ops.h"
#include "core/providers/rocm/reduction/reduction_functions.h"
namespace onnxruntime {
namespace rocm {
namespace ReductionOps {
// Implementation that holds the core logic of reduction op processing
// `input_shape_override` is the input shape for compute purposes (if provided)
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
std::unique_ptr<Tensor> ReduceCompute(ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op, AllocatorPtr allocator,
const Tensor& input, gsl::span<const int64_t> axes,
bool keep_dims, bool calculate_log, bool calculate_sqt, bool log_sum_exp,
bool fast_reduction, const TensorShape* input_shape_override = nullptr);
} // namespace ReductionOps
// Holds some metadata that will be used during actual reduction op compute time
struct PrepareReduceMetadata {
int64_t input_count;
int64_t output_count;
// This holds the output dims without any reduced dims squeezed (even if keep_dims == 1)
TensorShapeVector output_dims;
// This holds the output dims with reduced dims squeezed (if keep_dims == 1)
TensorShapeVector squeezed_output_dims;
TensorShapeVector input_dims_miopen;
TensorShapeVector output_dims_miopen;
};
template <bool allow_multi_axes>
class ReduceKernel : public RocmKernel, public ReduceKernelBase<allow_multi_axes> {
protected:
ReduceKernel(
const OpKernelInfo& info,
optional<int64_t> keep_dims_override = {})
: RocmKernel(info),
ReduceKernelBase<allow_multi_axes>(info, keep_dims_override),
calculate_log_(false),
calculate_sqt_(false),
log_sum_exp_(false),
fast_reduction_(false) {
// We need to cast away the const as PerThreadMiopenHandle() is currently a non-const method
// TODO: Clean up the ROCMExecutionProvider interface to avoid this
rocm_ep_ = const_cast<ROCMExecutionProvider*>(static_cast<const ROCMExecutionProvider*>(info.GetExecutionProvider()));
}
// Only Max and Min need ReduceTensorIndices set to MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES, per the MIOpen library manual.
// Only Max and Min have an indices output; the indices must be set to nullptr for the other ops.
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
Status ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;
// Used by ReduceSumTraining which will have axes as input
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
Status ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;
template <typename T, typename OutT, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceKernelShared(
const T* X,
const TensorShape& input_shape,
OutT* Y,
const TensorShape& output_shape,
miopenReduceTensorOp_t miopen_reduce_op,
TensorShapeVector& output_dims) const;
using ReduceKernelBase<allow_multi_axes>::axes_;
using ReduceKernelBase<allow_multi_axes>::keepdims_;
using ReduceKernelBase<allow_multi_axes>::noop_with_empty_axes_;
bool calculate_log_;
bool calculate_sqt_;
bool log_sum_exp_;
// Indicates if this reduction can be delegated to our highly-optimized reduction kernels.
// Those efficient kernels are defined/implemented in reduction_functions.h/.cu.
bool fast_reduction_;
// We need access to the ROCM EP instance to get the MIOpen handle
ROCMExecutionProvider* rocm_ep_;
};
template <typename T>
class ArgMax final : public ReduceKernel<false> {
public:
ArgMax(const OpKernelInfo& info) : ReduceKernel<false>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES>(ctx, MIOPEN_REDUCE_TENSOR_MAX);
}
};
template <typename T>
class ArgMin final : public ReduceKernel<false> {
public:
ArgMin(const OpKernelInfo& info) : ReduceKernel<false>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES>(ctx, MIOPEN_REDUCE_TENSOR_MIN);
}
};
template <typename T>
class ReduceL1 final : public ReduceKernel<true> {
public:
ReduceL1(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_NORM1);
}
};
template <typename T>
class ReduceL2 final : public ReduceKernel<true> {
public:
ReduceL2(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_NORM2);
}
};
template <typename T>
class ReduceMax final : public ReduceKernel<true> {
public:
ReduceMax(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MAX);
}
};
template <typename T>
class ReduceMean final : public ReduceKernel<true> {
public:
ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info) {
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_AVG);
}
};
template <typename T>
class ReduceMin final : public ReduceKernel<true> {
public:
ReduceMin(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MIN);
}
};
template <typename T>
class ReduceProd final : public ReduceKernel<true> {
public:
ReduceProd(const OpKernelInfo& info) : ReduceKernel<true>(info) {}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MUL);
}
};
template <typename T>
class ReduceSum final : public ReduceKernel<true> {
public:
ReduceSum(const OpKernelInfo& info) : ReduceKernel<true>(info) {
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceLogSum final : public ReduceKernel<true> {
public:
ReduceLogSum(const OpKernelInfo& info) : ReduceKernel<true>(info) {
ReduceKernel<true>::calculate_log_ = true;
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceSumSquare final : public ReduceKernel<true> {
public:
ReduceSumSquare(const OpKernelInfo& info) : ReduceKernel<true>(info) {
ReduceKernel<true>::calculate_sqt_ = true;
fast_reduction_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
template <typename T>
class ReduceLogSumExp final : public ReduceKernel<true> {
public:
ReduceLogSumExp(const OpKernelInfo& info) : ReduceKernel<true>(info) {
ReduceKernel<true>::log_sum_exp_ = true;
}
Status ComputeInternal(OpKernelContext* ctx) const override {
return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
}
};
Status PrepareForReduce(const Tensor* X,
bool keepdims,
gsl::span<const int64_t> axes,
PrepareReduceMetadata& prepare_reduce_metadata,
const TensorShape* input_shape_override = nullptr);
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
/*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op,
gsl::span<const int64_t> axes,
bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
const TensorShape* input_shape_override = nullptr);
// MIOpen's reduction descriptor miopenReduceTensorDescriptor_t is a pointer, so it is
// safer to wrap it in MiopenReduceDescriptor, which releases the descriptor automatically.
// An implicit conversion from MiopenReduceDescriptor to miopenReduceTensorDescriptor_t
// is implemented below, so it can be passed seamlessly to MIOpen calls.
class MiopenReduceDescriptor final {
public:
MiopenReduceDescriptor() : desc_(nullptr) {
}
~MiopenReduceDescriptor() {
if (desc_ != nullptr) {
miopenDestroyReduceTensorDescriptor(desc_);
desc_ = nullptr;
}
}
MiopenReduceDescriptor(const MiopenReduceDescriptor&) = delete;
MiopenReduceDescriptor& operator=(const MiopenReduceDescriptor&) = delete;
Status Set(miopenReduceTensorOp_t op, miopenDataType_t type, miopenReduceTensorIndices_t indices) {
if (!desc_)
MIOPEN_RETURN_IF_ERROR(miopenCreateReduceTensorDescriptor(&desc_));
MIOPEN_RETURN_IF_ERROR(miopenSetReduceTensorDescriptor(
desc_,
op,
type,
MIOPEN_PROPAGATE_NAN,
indices,
MIOPEN_32BIT_INDICES)); // currently only the 32-bit (unsigned int) type is supported.
return Status::OK();
}
operator miopenReduceTensorDescriptor_t() const { return desc_; }
private:
miopenReduceTensorDescriptor_t desc_;
};
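// Usage sketch (illustrative only, not part of the original source; assumes the
// surrounding tensor descriptors are set up elsewhere):
//
//   MiopenReduceDescriptor reduce_desc;
//   ORT_RETURN_IF_ERROR(reduce_desc.Set(MIOPEN_REDUCE_TENSOR_ADD, miopenFloat,
//                                       MIOPEN_REDUCE_TENSOR_NO_INDICES));
//   // The implicit conversion operator lets reduce_desc be passed wherever a
//   // miopenReduceTensorDescriptor_t is expected, e.g. in miopenReduceTensor().
//   miopenReduceTensorDescriptor_t raw_desc = reduce_desc;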
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {
__forceinline__ __host__ __device__ int least_pow2_bound(int value) {
unsigned int value_ = static_cast<unsigned int>(value);
--value_;
value_ |= value_ >> 1;
value_ |= value_ >> 2;
value_ |= value_ >> 4;
value_ |= value_ >> 8;
value_ |= value_ >> 16;
return static_cast<int>(++value_);
}
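// Examples (illustrative): least_pow2_bound(1) == 1, least_pow2_bound(5) == 8,
// least_pow2_bound(8) == 8, least_pow2_bound(9) == 16.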
struct Square {
template <typename T>
__forceinline__ __device__ T operator()(const T& value) {
return value * value;
}
};
struct Sqrt {
template <typename T>
__forceinline__ __device__ T operator()(const T& value) {
return _Sqrt(value);
}
};
struct Identity {
template <typename T>
__forceinline__ __device__ T operator()(const T& value) {
return value;
}
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/rocm_check_memory.h"
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
void CheckIfMemoryOnCurrentGpuDevice(const void* ptr) {
hipPointerAttribute_t attrs;
HIP_CALL_THROW(hipPointerGetAttributes(&attrs, ptr));
int current_device;
HIP_CALL_THROW(hipGetDevice(&current_device));
ORT_ENFORCE(attrs.device == current_device,
"Current ROCM device is ", current_device,
" but the memory of pointer ", ptr,
" is allocated on device ", attrs.device);
}
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
namespace onnxruntime {
// Throw if "ptr" is not allocated on the ROCM device obtained by hipGetDevice.
void CheckIfMemoryOnCurrentGpuDevice(const void* ptr);
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/rocm_graph.h"
#include "core/providers/rocm/rocm_common.h"
#include <hip/hip_runtime_api.h>
#include <hip/driver_types.h>
namespace onnxruntime {
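// Note: the CUDA_VERSION guards in this file come from the hipified CUDA EP
// implementation. If CUDA_VERSION is not defined by any included header, the
// preprocessor takes the ORT_THROW branches below, so graph capture is unavailable.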
ROCMGraph::ROCMGraph(hipStream_t stream) : stream_(stream) {
#if (defined(CUDA_VERSION) && CUDA_VERSION < 10000)
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}
void ROCMGraph::SetStream(hipStream_t stream) {
stream_ = stream;
}
void ROCMGraph::CaptureBegin() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
ORT_ENFORCE(!has_graph_exec_,
"This rocm graph has already captured a graph. "
"Create a new instance to capture a new graph.");
HIP_CALL_THROW(hipStreamSynchronize(stream_));
// For now, the rocm graph can only work with a single thread. In the future, we
// will support multiple threads. For multiple threads with multiple graphs
// and streams, `hipStreamCaptureModeGlobal` needs to be changed to
// `hipStreamCaptureModeThreadLocal`.
HIP_CALL_THROW(hipStreamBeginCapture(stream_, hipStreamCaptureModeGlobal));
#else
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}
void ROCMGraph::CaptureEnd() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
HIP_CALL_THROW(hipStreamEndCapture(stream_, &graph_));
if (graph_ == NULL) {
ORT_THROW("ROCMGraph::CaptureEnd: graph_ is NULL");
}
has_graph_ = true;
HIP_CALL_THROW(hipGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
has_graph_exec_ = true;
HIP_CALL_THROW(hipGraphDestroy(graph_));
has_graph_ = false;
#else
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}
Status ROCMGraph::Replay() {
// Although this function is not thread safe, the lock is not needed here because
// ROCM EP maintains a separate rocm graph per thread
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
LOGS_DEFAULT(INFO) << "Replaying ROCM graph on stream " << stream_;
HIP_RETURN_IF_ERROR(hipGraphLaunch(graph_exec_, stream_));
HIP_RETURN_IF_ERROR(hipStreamSynchronize(stream_));
#else
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
return Status::OK();
}
void ROCMGraph::Reset() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
if (has_graph_) {
HIP_CALL_THROW(hipGraphDestroy(graph_));
has_graph_ = false;
}
if (has_graph_exec_) {
HIP_CALL_THROW(hipGraphExecDestroy(graph_exec_));
has_graph_exec_ = false;
}
#else
ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}
ROCMGraph::~ROCMGraph() {
Reset();
}
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/platform/ort_mutex.h"
#include "core/providers/rocm/rocm_pch.h"
namespace onnxruntime {
using CaptureId_t = unsigned long long;
struct ROCMGraph {
ROCMGraph() {}
ROCMGraph(hipStream_t stream);
~ROCMGraph();
void SetStream(hipStream_t stream);
void CaptureBegin();
void CaptureEnd();
Status Replay();
void Reset();
private:
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
hipGraph_t graph_ = NULL;
hipGraphExec_t graph_exec_ = NULL;
#endif
bool has_graph_ = false;
bool has_graph_exec_ = false;
hipStream_t stream_ = nullptr; // Does not own the stream
};
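// Usage sketch (illustrative only, not part of the original source; capture only
// succeeds in a build where the CUDA_VERSION-guarded paths are enabled, otherwise
// these calls throw):
//
//   ROCMGraph graph(stream);
//   graph.CaptureBegin();
//   // ... enqueue work on `stream` ...
//   graph.CaptureEnd();
//   ORT_RETURN_IF_ERROR(graph.Replay());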
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <memory>
#include "core/providers/providers.h"
struct OrtROCMProviderOptions;
struct OrtROCMProviderOptionsV2;
namespace onnxruntime {
// defined in provider_bridge_ort.cc
struct CudaProviderFactoryCreator {
static std::shared_ptr<IExecutionProviderFactory> Create(const OrtROCMProviderOptions* provider_options);
static std::shared_ptr<IExecutionProviderFactory> Create(const OrtROCMProviderOptionsV2* provider_options);
};
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <hip/hip_fp16.h>
#include "core/framework/float16.h"
namespace onnxruntime {
namespace rocm {
// specifies the auxiliary type to use for accumulation of the given type
template <typename T>
struct AccumulationType;
template <>
struct AccumulationType<half> { using type = float; };
template <>
struct AccumulationType<float> { using type = float; };
template <>
struct AccumulationType<double> { using type = double; };
template <>
struct AccumulationType<BFloat16> { using type = float; };
template <typename T>
using AccumulationType_t = typename AccumulationType<T>::type;
} // namespace rocm
} // namespace onnxruntime
//
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <iostream>
#include <limits>
#include <hip/hip_runtime.h>
#include <cmath>
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {
// The code below is based on section 4 (Unsigned division) of the paper https://gmplib.org/~tege/divcnst-pldi94.pdf
// In current ORT, fast_divmod is used for calculating the position of an element in a tensor,
// so the unsigned integer division from the paper is good enough for ORT. The advantage is that the division is
// very simple, so the GPU compiler can easily unroll loops in which divmod is called.
struct fast_divmod {
fast_divmod(int d = 1) {
d_ = d == 0 ? 1 : d;
ORT_ENFORCE(d_ >= 1 && d_ <= static_cast<uint32_t>(std::numeric_limits<int>::max()));
for (l_ = 0; l_ < 32; l_++)
if ((1U << l_) >= d_) break;
uint64_t one = 1;
uint64_t m = ((one << 32) * ((one << l_) - d_)) / d_ + 1;
M_ = static_cast<uint32_t>(m);
// according to the paper, the value of m' should fit in an unsigned integer.
ORT_ENFORCE(M_ > 0 && M_ == m);
}
__host__ __device__ inline int div(int n) const {
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
uint32_t t = __umulhi(M_, n);
return (t + n) >> l_;
#else
// Using uint64_t for t, then t + n won't overflow.
uint64_t t = ((uint64_t)M_ * n) >> 32;
return static_cast<int>((t + n) >> l_);
#endif
}
__host__ __device__ inline int mod(int n) const {
return n - div(n) * d_;
}
__host__ __device__ inline void divmod(int n, int& q, int& r) const {
q = div(n);
r = n - q * d_;
}
uint32_t d_; // divisor
uint32_t M_; // m' in the paper.
uint32_t l_; // l_ = ceil(log2(d_))
};
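// Worked example (illustrative): for d == 3,
//   l_ == 2 (since 1 << 2 >= 3) and M_ == (2^32 * (4 - 3)) / 3 + 1 == 1431655766, so
//   div(7): t = umulhi(M_, 7) = 2 and (2 + 7) >> 2 == 2;  mod(7) == 7 - 2 * 3 == 1.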
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
Status GemmInt8(int m,
int n,
int k,
int32_t alpha_matmul,
int32_t beta_matmul,
const int8_t* a,
int lda,
const int8_t* b,
int ldb,
int32_t* c,
int ldc,
const RocmKernel* rocm_kernel);
}
}
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// for things shared between nvcc and onnxruntime
// as currently nvcc cannot compile all onnxruntime headers
#pragma once
#include <hip/hip_fp16.h>
#include <limits>
#include <memory>
#include <type_traits>
#include <vector>
#include "core/common/gsl.h"
#include "core/framework/float16.h"
#include "core/providers/rocm/shared_inc/fast_divmod.h"
namespace onnxruntime {
namespace rocm {
enum class SimpleBroadcast : int32_t {
NoBroadcast = (int32_t)-1,
LeftScalar = (int32_t)-2,
RightScalar = (int32_t)-3,
RightPerChannelBatch1 = (int32_t)-4,
RightPerChannelBatchN = (int32_t)-5,
};
enum class BroadcastIndexType : int32_t {
NoBroadcast = (int32_t)0,
Scalar = (int32_t)1,
NeedCompute = (int32_t)2,
};
template <typename T>
class IConstantBuffer {
public:
virtual ~IConstantBuffer(){};
virtual const T* GetBuffer(hipStream_t stream, size_t count) = 0;
};
template <typename T>
std::unique_ptr<IConstantBuffer<T>> CreateConstantOnes();
template <typename T>
void Fill(hipStream_t stream, T* output, T value, int64_t count);
/*
This is a utility wrapper for a fixed-capacity array of arbitrary type.
It is commonly used for passing a small list of metadata during a rocm kernel launch.
It is better to pass the array by value than to issue another hipMemcpy to move the data to the device.
*/
template <typename T, int32_t capacity = 8>
struct TArray {
#if defined(USE_ROCM)
#define TARRAY_CONSTRUCTOR_SPECIFIERS __host__ __device__
#else
#define TARRAY_CONSTRUCTOR_SPECIFIERS
#endif
TARRAY_CONSTRUCTOR_SPECIFIERS TArray() = default;
TARRAY_CONSTRUCTOR_SPECIFIERS TArray(const TArray&) = default;
TARRAY_CONSTRUCTOR_SPECIFIERS TArray& operator=(const TArray&) = default;
#undef TARRAY_CONSTRUCTOR_SPECIFIERS
TArray(int32_t size) : size_(size), data_() {
ORT_ENFORCE(
0 <= size && size <= capacity,
"TArray size must be within range [0, ", capacity, "]. Actual: ", size);
}
TArray(const std::vector<T>& vec) : TArray(static_cast<int32_t>(vec.size())) {
static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable.");
memcpy(data_, vec.data(), vec.size() * sizeof(T));
}
TArray(gsl::span<const T> vec) : TArray(static_cast<int32_t>(vec.size())) {
static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable.");
memcpy(data_, vec.data(), vec.size() * sizeof(T));
}
void SetSize(int32_t size) {
ORT_ENFORCE(
0 <= size && size <= capacity,
"TArray size must be within range [0, ", capacity, "]. Actual: ", size);
size_ = size;
}
__host__ __device__ int32_t Size() const {
return size_;
}
__host__ __device__ T& operator[](int32_t index) {
return data_[index];
}
__host__ __device__ __forceinline__ const T& operator[](int32_t index) const {
return data_[index];
}
__host__ __device__ T* Data() {
return data_;
}
__host__ __device__ const T* Data() const {
return data_;
}
static constexpr int32_t Capacity() { return capacity; };
private:
int32_t size_ = 0;
T data_[capacity] = {};
};
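// Usage sketch (illustrative only, not part of the original source): metadata such
// as strides can be packed into a TArray on the host and passed to a kernel by value:
//
//   std::vector<fast_divmod> strides_vec = ...;   // at most `capacity` entries
//   TArray<fast_divmod> strides(strides_vec);
//   hipLaunchKernelGGL(some_kernel, grid, block, 0, stream, strides, ...);
//
// where `some_kernel` is a hypothetical kernel taking a `TArray<fast_divmod>` by value.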
// Bitmask tensor is uint_32 type.
using BitmaskElementType = uint32_t;
constexpr int kNumBitsPerBitmaskElement = std::numeric_limits<BitmaskElementType>::digits;
template <typename T>
struct NumericLimits {
__inline__ __host__ __device__ static T Min() {
return std::numeric_limits<T>::lowest();
}
__inline__ __host__ __device__ static T Max() {
return std::numeric_limits<T>::max();
}
};
template <>
struct NumericLimits<MLFloat16> {
__inline__ __host__ __device__ static half Min() {
return -65504.0;
}
__inline__ __host__ __device__ static half Max() {
return 65504.0;
}
};
template <>
struct NumericLimits<half> {
__inline__ __host__ __device__ static half Min() {
return -65504.0;
}
__inline__ __host__ __device__ static half Max() {
return 65504.0;
}
};
template <>
struct NumericLimits<float> {
__inline__ __host__ __device__ static float Min() {
return -INFINITY;
}
__inline__ __host__ __device__ static float Max() {
return INFINITY;
}
};
template <>
struct NumericLimits<double> {
__inline__ __host__ __device__ static double Min() {
return -HUGE_VAL;
}
__inline__ __host__ __device__ static double Max() {
return HUGE_VAL;
}
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "cast_op.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
using namespace ONNX_NAMESPACE;
using namespace onnxruntime::common;
namespace onnxruntime {
namespace rocm {
const std::vector<MLDataType> castOpTypeConstraints{
DataTypeImpl::GetTensorType<MLFloat16>(),
DataTypeImpl::GetTensorType<BFloat16>(),
DataTypeImpl::GetTensorType<float>(),
DataTypeImpl::GetTensorType<double>(),
DataTypeImpl::GetTensorType<int8_t>(),
DataTypeImpl::GetTensorType<int16_t>(),
DataTypeImpl::GetTensorType<int32_t>(),
DataTypeImpl::GetTensorType<int64_t>(),
DataTypeImpl::GetTensorType<uint8_t>(),
DataTypeImpl::GetTensorType<uint16_t>(),
DataTypeImpl::GetTensorType<uint32_t>(),
DataTypeImpl::GetTensorType<uint64_t>(),
DataTypeImpl::GetTensorType<bool>()
};
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Cast, \
kOnnxDomain, \
6, 8, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", castOpTypeConstraints), \
Cast<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Cast, \
kOnnxDomain, \
9, 12, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", castOpTypeConstraints), \
Cast<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Cast, \
kOnnxDomain, \
13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", castOpTypeConstraints), \
Cast<T>);
template <typename SrcT>
Status Cast<SrcT>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToHipType<SrcT>::MappedType CudaSrcT;
const Tensor* X = context->Input<Tensor>(0);
const TensorShape& shape = X->Shape();
Tensor* Y = context->Output(0, shape);
const auto* x_data = reinterpret_cast<const CudaSrcT*>(X->Data<SrcT>());
size_t count = shape.Size();
#define CASE(TP_TYPE, DstT) \
case TP_TYPE: \
if (count > 0) { \
Impl_Cast<CudaSrcT, typename ToHipType<DstT>::MappedType>( \
Stream(), \
x_data, \
reinterpret_cast<typename ToHipType<DstT>::MappedType*>(Y->MutableData<DstT>()), \
count); \
} \
break;
switch (to_) {
CASE(TensorProto_DataType_FLOAT16, MLFloat16)
CASE(TensorProto_DataType_BFLOAT16, BFloat16)
CASE(TensorProto_DataType_FLOAT, float)
CASE(TensorProto_DataType_DOUBLE, double)
CASE(TensorProto_DataType_INT8, int8_t)
CASE(TensorProto_DataType_INT16, int16_t)
CASE(TensorProto_DataType_INT32, int32_t)
CASE(TensorProto_DataType_INT64, int64_t)
CASE(TensorProto_DataType_UINT8, uint8_t)
CASE(TensorProto_DataType_UINT16, uint16_t)
CASE(TensorProto_DataType_UINT32, uint32_t)
CASE(TensorProto_DataType_UINT64, uint64_t)
CASE(TensorProto_DataType_BOOL, bool)
case TensorProto_DataType_STRING:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Casting to and from strings is not supported yet.");
case TensorProto_DataType_UNDEFINED:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Cast op must have 'to' argument of type DataType");
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unexpected 'to' argument value: ", to_);
}
return Status::OK();
}
#define SPECIALIZE_IMPL(T) \
REGISTER_KERNEL_TYPED(T) \
template Status Cast<T>::ComputeInternal(OpKernelContext* context) const;
SPECIALIZE_IMPL(MLFloat16)
SPECIALIZE_IMPL(float)
SPECIALIZE_IMPL(double)
SPECIALIZE_IMPL(int8_t)
SPECIALIZE_IMPL(int16_t)
SPECIALIZE_IMPL(int32_t)
SPECIALIZE_IMPL(int64_t)
SPECIALIZE_IMPL(uint8_t)
SPECIALIZE_IMPL(uint16_t)
SPECIALIZE_IMPL(uint32_t)
SPECIALIZE_IMPL(uint64_t)
SPECIALIZE_IMPL(bool)
SPECIALIZE_IMPL(BFloat16)
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
template <typename SrcT>
class Cast final : public RocmKernel {
public:
Cast(const OpKernelInfo& info) : RocmKernel(info) {
int64_t to;
Status status = info.GetAttr("to", &to);
ORT_ENFORCE(status.IsOK(), "Attribute to is not set.");
to_ = gsl::narrow_cast<ONNX_NAMESPACE::TensorProto_DataType>(to);
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
ONNX_NAMESPACE::TensorProto_DataType to_;
};
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "compress.h"
#include "compress_impl.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
Compress,
kOnnxDomain,
9, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
Compress);
// explicit negative axis support
ONNX_OPERATOR_KERNEL_EX(
Compress,
kOnnxDomain,
11,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
.TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
Compress);
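// Behavior example (illustrative, following the ONNX Compress specification):
// for input shape {2, 3}, axis = 1 and condition = {true, false, true}, the output
// shape is {2, 2} and each row keeps its first and third elements; with no axis
// attribute the input is flattened before the condition is applied.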
Status Compress::ComputeInternal(OpKernelContext* ctx) const {
const Tensor* input_tensor = ctx->Input<Tensor>(0);
ORT_ENFORCE(input_tensor);
size_t rank = input_tensor->Shape().NumDimensions();
auto input_dimensions = input_tensor->Shape().GetDims();
int64_t axis = 0;
if (has_axis_) {
axis = HandleNegativeAxis(axis_, rank);
}
const Tensor* condition = ctx->Input<Tensor>(1);
ORT_ENFORCE(condition);
auto condition_length = condition->Shape().Size();
auto condition_data = condition->Data<bool>();
// if an axis is given, we need to compress along dimension[axis]; otherwise we compress the flattened input data
int64_t input_size = input_tensor->Shape().Size();
int64_t compress_input_length = has_axis_ ? input_dimensions[axis] : input_size;
int64_t valid_condition_length = compress_input_length < condition_length ? compress_input_length : condition_length;
auto condition_cumulative_sum_buffer = GetScratchBuffer<int32_t>(gsl::narrow<size_t>(valid_condition_length));
auto condition_cumulative_sum = condition_cumulative_sum_buffer.get();
size_t temp_storage_bytes = 0;
HIP_RETURN_IF_ERROR(CompressCalcPrefixSumTempStorageBytes(Stream(),
reinterpret_cast<const int8_t*>(condition_data),
condition_cumulative_sum,
gsl::narrow<int>(valid_condition_length),
temp_storage_bytes));
auto temp_buffer = GetScratchBuffer<uint8_t>(temp_storage_bytes);
auto d_temp_storage = temp_buffer.get();
HIP_RETURN_IF_ERROR(CompressInclusivePrefixSum(Stream(),
d_temp_storage,
temp_storage_bytes,
reinterpret_cast<const int8_t*>(condition_data),
condition_cumulative_sum,
gsl::narrow<int>(valid_condition_length)));
// hipMemcpyAsync from device memory to pageable host memory will return only once the copy has completed.
int32_t positive_condition_count = 0;
HIP_RETURN_IF_ERROR(hipMemcpyAsync(&positive_condition_count, condition_cumulative_sum + valid_condition_length - 1, sizeof(int32_t), hipMemcpyDeviceToHost, Stream()));
std::vector<int64_t> output_dims(input_dimensions.begin(), input_dimensions.end());
if (has_axis_) {
output_dims[axis] = positive_condition_count;
} else {
output_dims.resize(1);
output_dims[0] = positive_condition_count;
}
TensorShape output_shape(output_dims);
auto output_tensor = ctx->Output(0, output_shape);
if (positive_condition_count <= 0) {
return Status::OK();
}
auto element_bytes = input_tensor->DataType()->Size();
int64_t axis_right_stride = 1;
if (has_axis_) {
for (auto i = static_cast<size_t>(axis + 1); i < rank; ++i) {
axis_right_stride *= input_dimensions[i];
}
}
ORT_RETURN_IF_ERROR(CompressImpl(Stream(),
element_bytes,
gsl::narrow_cast<int32_t>(valid_condition_length),
gsl::narrow_cast<int32_t>(axis_right_stride),
has_axis_ ? gsl::narrow_cast<int32_t>(input_dimensions[axis])
: gsl::narrow_cast<int32_t>(input_size),
gsl::narrow_cast<int32_t>(positive_condition_count),
condition_cumulative_sum,
condition_data,
input_tensor->DataRaw(),
output_tensor->MutableDataRaw(),
input_size));
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/shared_library/provider_api.h"
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {
class Compress final : public RocmKernel {
public:
Compress(const OpKernelInfo& info) : RocmKernel(info) {
has_axis_ = info.GetAttr("axis", &axis_).IsOK();
}
Status ComputeInternal(OpKernelContext* context) const override;
private:
int64_t axis_;
bool has_axis_;
};
} // namespace rocm
} // namespace onnxruntime
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <hipcub/hipcub.hpp>
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
// TODO: fix the warnings
#ifdef _MSC_VER
#pragma warning(disable : 4244)
#endif
#include "core/providers/rocm/tensor/compress_impl.h"
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
namespace onnxruntime {
namespace rocm {
// This cast is for the transform iterator. The iterator's value type affects the accumulator type width
// in InclusiveSum(). By default, the accumulator type matches the input, but for int8_t
// the sum overflows quickly, so we want the source type to match the output (int32_t).
// see https://github.com/NVIDIA/cub/issues/384
struct CastToInt32 : public thrust::unary_function<int8_t, int32_t> {
__host__ __device__ int32_t operator()(int8_t v) const {
return static_cast<int32_t>(v);
}
};
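// Illustrative note (not part of the original source): with, say, 200 `true` entries
// in the condition, an int8_t accumulator would wrap around at 128, while the int32_t
// accumulator selected by this transform iterator yields the correct inclusive sums.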
hipError_t CompressCalcPrefixSumTempStorageBytes(hipStream_t stream, const int8_t* condition_data, int32_t* condition_cumulative_sum, int length, size_t& temp_storage_bytes) {
auto input_iter = thrust::make_transform_iterator(condition_data, CastToInt32());
return hipcub::DeviceScan::InclusiveSum(
nullptr, temp_storage_bytes, input_iter, condition_cumulative_sum, length, stream);
}
hipError_t CompressInclusivePrefixSum(hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, const int8_t* condition_data, int32_t* condition_cumulative_sum, int length) {
auto input_iter = thrust::make_transform_iterator(condition_data, CastToInt32());
return hipcub::DeviceScan::InclusiveSum(
d_temp_storage, temp_storage_bytes, input_iter, condition_cumulative_sum, length, stream);
}
template <typename T>
__global__ void _CompressKernel(const int32_t valid_condition_length,
const fast_divmod axis_right_stride_div,
const fast_divmod input_axis_included_stride_div,
const int32_t output_axis_included_stride,
const int32_t* condition_cumulative_sum,
const bool* condition_data,
const T* input_data,
T* output_data,
const HIP_LONG N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  HIP_LONG output_index = 0;
  int div, mod;
  // Split the flat input index into the outer block before the compress axis (div)
  // and the offset within that block (mod).
  input_axis_included_stride_div.divmod(id, div, mod);
  output_index = output_axis_included_stride * div;
  // Split the remaining offset into the index along the compress axis (div) and the
  // offset to the right of the axis (mod).
  axis_right_stride_div.divmod(mod, div, mod);
  if (div < valid_condition_length && condition_data[div]) {
    // The inclusive prefix sum gives the 1-based position of this kept slice in the output.
    output_index += (condition_cumulative_sum[div] - 1) * axis_right_stride_div.d_ + mod;
    output_data[output_index] = input_data[id];
  }
}
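// Worked example with illustrative numbers: for an input of shape [2, 3, 4] compressed along
// axis 1 with condition [1, 0, 1], we have axis_right_stride = 4, input_axis_included_stride = 12,
// output_axis_included_stride = 8 (two slices kept) and condition_cumulative_sum = [1, 1, 2].
// The element at flat input index id = 1*12 + 2*4 + 3 = 23 (outer block 1, axis index 2,
// inner offset 3) is kept because condition[2] is true, and lands at
// output_index = 8*1 + (2 - 1)*4 + 3 = 15, i.e. outer block 1, output axis index 1, inner offset 3.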
Status CompressImpl(hipStream_t stream,
const size_t element_bytes,
const int32_t valid_condition_length,
const int32_t axis_right_stride,
const int32_t input_axis_dim_length,
const int32_t output_axis_dim_length,
const int32_t* condition_cumulative_sum,
const bool* condition_data,
const void* input_data,
void* output_data,
const size_t N) {
  // One thread per input element; round N up to a whole number of blocks.
  int blocksPerGrid = static_cast<int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  fast_divmod axis_right_stride_div(axis_right_stride);
  // Size of one block spanning the compress axis, for the input shape and for the
  // (already compressed) output shape respectively.
  fast_divmod input_axis_included_stride_div(axis_right_stride * input_axis_dim_length);
  int output_axis_included_stride = axis_right_stride * output_axis_dim_length;
  // Compress only moves bytes, so dispatch on the element byte width: all element types
  // of the same size share one kernel instantiation.
  switch (element_bytes) {
case sizeof(int8_t):
hipLaunchKernelGGL(_CompressKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
valid_condition_length,
axis_right_stride_div,
input_axis_included_stride_div,
output_axis_included_stride,
condition_cumulative_sum,
condition_data,
reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
(HIP_LONG)N);
break;
case sizeof(int16_t):
hipLaunchKernelGGL(_CompressKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
valid_condition_length,
axis_right_stride_div,
input_axis_included_stride_div,
output_axis_included_stride,
condition_cumulative_sum,
condition_data,
reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
(HIP_LONG)N);
break;
case sizeof(int32_t):
hipLaunchKernelGGL(_CompressKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
valid_condition_length,
axis_right_stride_div,
input_axis_included_stride_div,
output_axis_included_stride,
condition_cumulative_sum,
condition_data,
reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
(HIP_LONG)N);
break;
case sizeof(int64_t):
hipLaunchKernelGGL(_CompressKernel, blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
valid_condition_length,
axis_right_stride_div,
input_axis_included_stride_div,
output_axis_included_stride,
condition_cumulative_sum,
condition_data,
reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
(HIP_LONG)N);
break;
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Compress operator");
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {
hipError_t CompressCalcPrefixSumTempStorageBytes(hipStream_t stream, const int8_t* condition_data,
int32_t* condition_cumulative_sum, int length, size_t& temp_storage_bytes);
hipError_t CompressInclusivePrefixSum(hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes,
const int8_t* condition_data, int32_t* condition_cumulative_sum, int length);
Status CompressImpl(hipStream_t stream,
const size_t element_bytes,
const int32_t valid_condition_length,
const int32_t axis_right_stride,
const int32_t input_axis_dim_length,
const int32_t output_axis_dim_length,
const int32_t* condition_cumulative_sum,
const bool* condition_data,
const void* input_data,
void* output_data,
const size_t N);
} // namespace rocm
} // namespace onnxruntime
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/concat.h"
#include "core/providers/rocm/tensor/concat_impl.h"
namespace onnxruntime {
namespace rocm {
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Concat,
kOnnxDomain,
4, 10,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Concat);
// Opset 11 explicitly supports negative axis values.
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Concat,
kOnnxDomain,
11, 12,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Concat);
ONNX_OPERATOR_KERNEL_EX(Concat,
kOnnxDomain,
13,
kRocmExecutionProvider,
(*KernelDefBuilder::Create())
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
Concat);
Status Concat::ComputeInternal(OpKernelContext* ctx) const {
auto input_count = Node().InputArgCount().front();
// Hold pointers to the input tensors to be used in the PrepareForCompute() step
InlinedTensorsVector input_tensors;
input_tensors.reserve(input_count);
for (int i = 0; i < input_count; ++i) {
input_tensors.push_back(ctx->Input<Tensor>(i));
}
Prepare p;
ORT_RETURN_IF_ERROR(PrepareForCompute(ctx, input_tensors, p));
// Return at this point if output tensor is going to be empty
if (p.output_num_elements == 0)
return Status::OK();
std::vector<int64_t> concat_sizes;
concat_sizes.reserve(input_count);
RocmAsyncBuffer<const void*> input_ptr(this, input_count);
gsl::span<const void*> input_ptr_cpuspan = input_ptr.CpuSpan();
  // For each index along the concat axis of the output, record which input it comes from.
  std::vector<int64_t> axis_dimension_input_output_mapping(p.output_tensor->Shape()[p.axis]);
  int index = 0;
for (int i = 0; i < input_count; ++i) {
const auto& input = p.inputs[i];
concat_sizes.push_back(input.tensor->Shape()[p.axis]);
input_ptr_cpuspan[i] = input.tensor->DataRaw();
for (int j = 0; j < input.tensor->Shape()[p.axis]; ++j) {
axis_dimension_input_output_mapping.at(index++) = i;
}
}
auto element_bytes = p.output_tensor->DataType()->Size();
int block_size_inside_axis_dim = static_cast<int>(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]);
int block_size_including_axis_dim = static_cast<int>(p.output_axis_pitch);
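  // Illustrative example with assumed shapes: if the output has shape [2, 5, 4] and the concat
  // axis is 1, and output_axis_pitch is the product of the output dims from the axis onward
  // (5*4 = 20), then block_size_including_axis_dim = 20 and block_size_inside_axis_dim = 4,
  // so each input contributes (its own axis length)*4 contiguous elements per outer row.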
  if (std::all_of(concat_sizes.begin(), concat_sizes.end(), [&](int64_t size) { return size == concat_sizes[0]; })) {
    // All inputs share the same size along the concat axis, so the uniform-size kernel applies.
    // For a small number of inputs, pass the pointers by kernel argument in a TArray and skip
    // the extra host-to-device copy.
    if (input_count <= 32) {
      TArray<const void*, 32> input_ptr_array(input_count);
      for (int i = 0; i < input_count; ++i) input_ptr_array[i] = input_ptr_cpuspan[i];
ORT_RETURN_IF_ERROR(ConcatSameConcatDimImpl(
Stream(), element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim, concat_sizes[0],
p.output_tensor->MutableDataRaw(), input_ptr_array, static_cast<size_t>(p.output_num_elements)));
} else {
ORT_RETURN_IF_ERROR(input_ptr.CopyToGpu());
ORT_RETURN_IF_ERROR(ConcatSameConcatDimImpl(
Stream(), element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim, concat_sizes[0],
p.output_tensor->MutableDataRaw(), input_ptr.GpuPtr(), static_cast<size_t>(p.output_num_elements)));
}
  } else {
    // Inputs differ along the concat axis: upload the per-input sizes, their inclusive prefix
    // sums (cumulative extent along the output axis), and the axis-to-input mapping so the
    // kernel can locate the source input for every output element.
    RocmAsyncBuffer<int64_t> concat_sizes_gpu(this, concat_sizes);
    RocmAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, axis_dimension_input_output_mapping);
    std::vector<int64_t> concat_sizes_range(concat_sizes);
    for (size_t i = 1; i < concat_sizes_range.size(); ++i) {
      concat_sizes_range[i] += concat_sizes_range[i - 1];
    }
RocmAsyncBuffer<int64_t> concat_sizes_range_gpu(this, concat_sizes_range);
ORT_RETURN_IF_ERROR(concat_sizes_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(axis_dimension_input_output_mapping_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(concat_sizes_range_gpu.CopyToGpu());
ORT_RETURN_IF_ERROR(input_ptr.CopyToGpu());
ORT_RETURN_IF_ERROR(ConcatImpl(Stream(), element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim,
concat_sizes_gpu.GpuPtr(), concat_sizes_range_gpu.GpuPtr(),
axis_dimension_input_output_mapping_gpu.GpuPtr(), p.output_tensor->MutableDataRaw(),
input_ptr.GpuPtr(), static_cast<size_t>(p.output_num_elements)));
}
return Status::OK();
}
} // namespace rocm
} // namespace onnxruntime