gaoqiong / onnxruntime_v14 · Commits · 1a91fcc2

Commit 1a91fcc2, authored Jul 25, 2023 by gaoqiong

    add files required by dtk

parent a144865d
Pipeline #492 failed with stages in 0 seconds
Changes: 280 · Pipelines: 1

Showing 20 changed files with 1968 additions and 0 deletions (+1968 -0)
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu   +515 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.h    +111 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_ops.h          +273 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh      +43  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_check_memory.cc               +18  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_check_memory.h                +9   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_graph.cc                      +89  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_graph.h                       +37  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_provider_factory_creator.h    +19  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/accumulation_type.h     +28  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/fast_divmod.h           +62  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/integer_gemm.h          +23  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/rocm_utils.h            +172 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/cast_op.cc                  +123 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/cast_op.h                   +29  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress.cc                 +114 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress.h                  +25  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress_impl.cu            +143 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress_impl.h             +30  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/concat.cc                   +105 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.cu  0 → 100644

#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/reduction/reduction_functions.h"

#include <algorithm>

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>

#include "core/common/common.h"
#include "core/providers/rocm/atomic/common.cuh"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/providers/rocm/reduction/reduction_utils.cuh"
#include "core/providers/rocm/cu_inc/unary_elementwise_impl.cuh"

namespace onnxruntime {
namespace rocm {

namespace detail {
constexpr auto MAX_NUM_ELEMENTS_PER_THREAD = 4;
constexpr auto MAX_NUM_WARPS_PER_BLOCK = 8;
constexpr auto MAX_NUM_BLOCKS_IN_GRID_ROW = 256;
constexpr auto MAX_NUM_GRID_ROWS = 32768;

dim3 compute_block_dim(int num_cols) {
  const int x = GPU_WARP_SIZE_HOST;
  const int y = std::min(MAX_NUM_WARPS_PER_BLOCK, std::max(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * x)));
  return dim3(x, y);
}

std::pair<dim3, dim3> compute_grid_and_block_dims(int num_rows, int num_cols) {
  const auto block_dim = compute_block_dim(num_cols);
  const auto grid_x =
      std::min<int>(
          MAX_NUM_BLOCKS_IN_GRID_ROW,
          std::max<int>(1, num_cols / (MAX_NUM_ELEMENTS_PER_THREAD * block_dim.x * block_dim.y)));
  const auto grid_y = std::min(MAX_NUM_GRID_ROWS, num_rows);
  const dim3 grid_dim(grid_x, grid_y);
  return {grid_dim, block_dim};
}

uintptr_t round_up_to_aligned(uintptr_t original, size_t alignment) {
  assert((alignment & (alignment - 1)) == 0);
  const size_t alignment_mask = ~(alignment - 1);
  return (original + alignment - 1) & alignment_mask;
}

/**
 * call_reduce_matrix_columns() intermediate buffer layout
 *
 * Given buffer element type TBuf, the intermediate buffer layout looks like this:
 *
 * -----
 * m * num_blocks_per_row * sizeof(TBuf) bytes for block reductions per row
 * alignment padding bytes as needed
 * m * sizeof(int) bytes for block done counts per row
 * -----
 */

size_t compute_reduce_matrix_columns_intermediate_buffer_size(
    int element_size, int num_rows, int num_cols) {
  ORT_ENFORCE(element_size >= 0 && num_rows >= 0 && num_cols >= 0);

  const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first;

  size_t buffer_size{};

  // at the beginning, for sizing purposes, assume we are aligned
  buffer_size += static_cast<size_t>(num_rows) * grid_dim.x * element_size;
  buffer_size = round_up_to_aligned(buffer_size, alignof(int));
  buffer_size += static_cast<size_t>(num_rows) * sizeof(int);

  // add padding to give us room to align
  buffer_size += alignof(max_align_t) - 1;

  return buffer_size;
}

template <typename TBuf>
Status get_reduction_buffers(
    int num_rows, int num_cols, void* buffer, size_t buffer_size,
    TBuf*& block_reductions_buffer, int*& block_done_counts_buffer) {
  const auto grid_dim = compute_grid_and_block_dims(num_rows, num_cols).first;

  const uintptr_t begin_addr = reinterpret_cast<uintptr_t>(buffer);
  const uintptr_t block_reductions_addr =
      round_up_to_aligned(begin_addr, alignof(TBuf));
  const uintptr_t block_done_counts_buffer_addr =
      round_up_to_aligned(
          block_reductions_addr + static_cast<size_t>(num_rows) * grid_dim.x * sizeof(TBuf), alignof(int));
  const uintptr_t end_addr =
      block_done_counts_buffer_addr + static_cast<size_t>(num_rows) * sizeof(int);
  const size_t required_size = end_addr - begin_addr;

  ORT_RETURN_IF_NOT(
      required_size <= buffer_size,
      "Buffer size is too small (", buffer_size, " bytes). ",
      "At least ", required_size, " bytes are needed from the given base address (", buffer, ").");

  block_reductions_buffer = reinterpret_cast<TBuf*>(block_reductions_addr);
  block_done_counts_buffer = reinterpret_cast<int*>(block_done_counts_buffer_addr);

  return Status::OK();
}

template <typename TIn, typename TOut, typename TBuf, typename TOp, typename TFinalOp, bool DivideResultBySize>
__device__ void reduce_all(
    const int num_elements, const TIn* const input, TOut* const output,
    TBuf* const block_reductions_buffer, int* const block_done_count_buffer) {
  extern __shared__ unsigned char shared_memory_bytes[];
  TBuf* shared_memory = reinterpret_cast<TBuf*>(shared_memory_bytes);

  // Thread-level indices:
  // Linear index of thread in block.
  const int tid_in_block = threadIdx.y * blockDim.x + threadIdx.x;
  // Total number of threads in a 2-D block.
  const int num_threads_in_block = blockDim.x * blockDim.y;

  // Warp-level indices:
  // Warp index of thread.
  const int wid_in_block = tid_in_block / GPU_WARP_SIZE;
  // Lane index of thread.
  const int lid_in_block = tid_in_block % GPU_WARP_SIZE;
  // Warp count per block.
  const int num_warps_in_block = num_threads_in_block / GPU_WARP_SIZE;

  // Grid-level indices:
  // Linear index of block in grid row.
  const int bid_in_grid_row = blockIdx.x;
  // Linear index of thread in grid row.
  const int tid_in_grid_row = bid_in_grid_row * (blockDim.x * blockDim.y) + tid_in_block;
  // Total number of blocks in a grid row.
  const int num_blocks_in_grid_row = gridDim.x;
  // Total number of threads in a grid row with 2-D blocks.
  const int num_threads_in_grid_row = num_blocks_in_grid_row * num_threads_in_block;

  const auto write_result = [&output, &num_elements](const TOut result) {
    // Compile-time if-else branch controlled by the template argument can be
    // optimized out, so there will be no branch in the real computation phase.
    if (DivideResultBySize) {
      output[0] = TFinalOp()(result / TOut(num_elements));
    } else {
      output[0] = TFinalOp()(result);
    }
  };

  // Thread-level reduction (storage change: global memory -> register).
  // One thread reduces MAX_NUM_ELEMENTS_PER_THREAD elements to a thread register
  // in one iteration.
  TBuf value = 0;
  for (int id = tid_in_grid_row; id < num_elements; id += MAX_NUM_ELEMENTS_PER_THREAD * num_threads_in_grid_row) {
    TIn v[MAX_NUM_ELEMENTS_PER_THREAD];

#pragma unroll
    for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
      const int offset = id + i * num_threads_in_grid_row;
      if (offset < num_elements) {
        v[i] = input[offset];
      }
    }

#pragma unroll
    for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
      const int offset = id + i * num_threads_in_grid_row;
      if (offset < num_elements) {
        value += TOp()(TBuf(v[i]));
      }
    }
  }

#if __CUDA_ARCH__ >= 700
  __syncwarp();
#else
  __syncthreads();
#endif

  // Warp-level reduction (storage change: register -> register).
  // The values in a warp will be summed up to a scalar. After warp-level
  // reduction, each block holds num_warps_in_block values in the shared memory.
#pragma unroll
  for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
    value += WARP_SHFL_DOWN(value, stride);
  }

  // Return early if only one warp is used for reduction.
  // Given a fixed amount of threads, we prefer threads over warps over blocks so that we never have cases such as
  // 1. two blocks and each of them has only 1 warp (32 threads).
  // 2. two warps and each of them has only 2 threads.
  if (num_warps_in_block == 1) {
    if (tid_in_grid_row == 0) {
      write_result(value);
    }
    return;
  }

  if (lid_in_block == 0) {
    shared_memory[wid_in_block] = value;
  }

  __syncthreads();

  // Block-level reduction (storage change: shared memory -> global memory).
  // The values in a block will be summed up to a scalar.
  // Note that the values are stored in the shared memory.
  // Here we assume that the size of shared_memory is smaller
  // than num_warps_in_block, so we just keep halving the number
  // of threads in each iteration. Our assumption is always true because
  // the size of shared_memory equals the number of warps.
#pragma unroll
  for (int stride = MAX_NUM_WARPS_PER_BLOCK / 2; stride > 0; stride /= 2) {
    if (tid_in_block + stride < num_warps_in_block) {
      shared_memory[tid_in_block] += shared_memory[tid_in_block + stride];
    }

    __syncthreads();
  }

  // Return early if only one block is used for reduction.
  if (num_blocks_in_grid_row == 1) {
    if (tid_in_grid_row == 0) {
      write_result(shared_memory[0]);
    }
    return;
  }

  if (tid_in_block == 0) {
    block_reductions_buffer[bid_in_grid_row] = shared_memory[0];
  }

  __threadfence();
  __syncthreads();

  // Grid-level reduction. We use the last block to sum up values
  // stored in the global block_reductions_buffer.
  __shared__ bool is_last_block_done;

  if (tid_in_block == 0) {
    const int count = atomicAdd(block_done_count_buffer, 1);
    is_last_block_done = (count == (num_blocks_in_grid_row - 1));
  }

  // All threads in each block see if they belong to the last active block
  // (i.e., the value of is_last_block_done).
  __syncthreads();

  // Only the block which saw that count equals num_blocks_in_grid_row - 1 can
  // enter the following block.
  if (is_last_block_done) {
    const int pow2_bound = least_pow2_bound(num_blocks_in_grid_row);
    for (int stride = pow2_bound / 2; stride > 0; stride /= 2) {
      if (tid_in_block < stride && tid_in_block + stride < num_blocks_in_grid_row) {
        block_reductions_buffer[tid_in_block] += block_reductions_buffer[tid_in_block + stride];
      }

      __syncthreads();
    }

    // The first thread in the last block assigns the final output.
    if (tid_in_block == 0) {
      write_result(block_reductions_buffer[0]);
    }
  }
}

template <typename TIn, typename TOut, typename TBuf, typename TOp, typename TFinalOp, bool DivideResultBySize>
__global__ void reduce_matrix_columns_kernel(
    const int num_rows, const int num_cols, const TIn* const input, TOut* const output,
    TBuf* const block_reductions_buffer, int* const block_done_counts_buffer) {
  const int num_blocks_in_grid_row = gridDim.x;
  const int row_id_in_grid = blockIdx.y;
  const int num_grid_rows = gridDim.y;

  // one row per iteration
  // row_id is int64_t to avoid int overflow in offset calculations
  for (int64_t row_id = row_id_in_grid; row_id < num_rows; row_id += num_grid_rows) {
    const TIn* const row_data = input + row_id * num_cols;
    TOut* const row_output = output + row_id;
    TBuf* const row_block_reductions_buffer = block_reductions_buffer + row_id * num_blocks_in_grid_row;
    int* const row_block_done_counts_buffer = block_done_counts_buffer + row_id;

    reduce_all<TIn, TOut, TBuf, TOp, TFinalOp, DivideResultBySize>(
        num_cols, row_data, row_output,
        row_block_reductions_buffer, row_block_done_counts_buffer);
  }
}

template <typename TIn, typename TOut, typename TOp, typename TFinalOp, bool DivideResultBySize>
Status call_reduce_matrix_columns(
    hipStream_t stream, const TIn* input, TOut* output,
    const int num_rows, const int num_cols, void* buffer, size_t buffer_size) {
  ORT_ENFORCE(num_rows >= 0 && num_cols >= 0);

  using TBuf = AccumulationType_t<TIn>;

  const auto grid_and_block_dims = compute_grid_and_block_dims(num_rows, num_cols);
  const dim3& grid_dim = grid_and_block_dims.first;
  const dim3& block_dim = grid_and_block_dims.second;

  TBuf* block_reductions_buffer;
  int* block_done_counts_buffer;
  ORT_RETURN_IF_ERROR(get_reduction_buffers(
      num_rows, num_cols, buffer, buffer_size,
      block_reductions_buffer, block_done_counts_buffer));

  // If more than one block is used per grid row, then inter-block reduction is needed.
  if (grid_dim.x > 1) {
    HIP_RETURN_IF_ERROR(hipMemsetAsync(block_done_counts_buffer, 0, num_rows * sizeof(int), stream));
  }

  const int shared_mem_size = sizeof(TBuf) * block_dim.x * block_dim.y / GPU_WARP_SIZE_HOST;
  hipLaunchKernelGGL(
      HIP_KERNEL_NAME(reduce_matrix_columns_kernel<TIn, TOut, TBuf, TOp, TFinalOp, DivideResultBySize>),
      grid_dim, block_dim, shared_mem_size, stream,
      num_rows, num_cols, input, output, block_reductions_buffer, block_done_counts_buffer);

  return Status::OK();
}
}  // namespace detail

template <typename TIn, typename TOut>
Status reduce_sum(
    hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
  return detail::call_reduce_matrix_columns<TIn, TOut, Identity, Identity, false>(
      stream, input, output, 1, size, buffer, buffer_size);
}

template <typename TIn, typename TOut>
Status reduce_square_sum(
    hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
  return detail::call_reduce_matrix_columns<TIn, TOut, Square, Identity, false>(
      stream, input, output, 1, size, buffer, buffer_size);
}

template <typename TIn, typename TOut>
Status reduce_l2_norm(
    hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
  return detail::call_reduce_matrix_columns<TIn, TOut, Square, Sqrt, false>(
      stream, input, output, 1, size, buffer, buffer_size);
}

template <typename TIn, typename TOut>
Status reduce_mean(
    hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size) {
  return detail::call_reduce_matrix_columns<TIn, TOut, Identity, Identity, true>(
      stream, input, output, 1, size, buffer, buffer_size);
}

#define INSTANTIATE_REDUCE_SUM(TIn, TOut) \
  template Status reduce_sum<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_SUM(half, half);
INSTANTIATE_REDUCE_SUM(half, float);
INSTANTIATE_REDUCE_SUM(float, float);
INSTANTIATE_REDUCE_SUM(double, double);
INSTANTIATE_REDUCE_SUM(BFloat16, BFloat16);
INSTANTIATE_REDUCE_SUM(BFloat16, float);
#undef INSTANTIATE_REDUCE_SUM

#define INSTANTIATE_REDUCE_SQUARE_SUM(TIn, TOut) \
  template Status reduce_square_sum<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_SQUARE_SUM(half, float);
INSTANTIATE_REDUCE_SQUARE_SUM(float, float);
INSTANTIATE_REDUCE_SQUARE_SUM(double, double);
INSTANTIATE_REDUCE_SQUARE_SUM(BFloat16, float);
#undef INSTANTIATE_REDUCE_SQUARE_SUM

#define INSTANTIATE_REDUCE_L2_NORM(TIn, TOut) \
  template Status reduce_l2_norm<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_L2_NORM(half, float);
INSTANTIATE_REDUCE_L2_NORM(float, float);
INSTANTIATE_REDUCE_L2_NORM(double, double);
#undef INSTANTIATE_REDUCE_L2_NORM

#define INSTANTIATE_REDUCE_MEAN(TIn, TOut) \
  template Status reduce_mean<TIn, TOut>(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_MEAN(half, float);
INSTANTIATE_REDUCE_MEAN(float, float);
INSTANTIATE_REDUCE_MEAN(double, double);
#undef INSTANTIATE_REDUCE_MEAN

namespace detail {
template <typename TIn, typename TOut, typename TBuf>
__global__ void reduce_matrix_rows_kernel(const TIn* input, TOut* output, int m, int n) {
  constexpr int x_load_count_per_thread = 1;
  constexpr int y_load_count_per_thread = 4;
  const int t_count_x_in_grid = blockDim.x * gridDim.x;
  const int t_count_y_in_grid = blockDim.y * gridDim.y;
  const int x_grid_stride = t_count_x_in_grid * x_load_count_per_thread;
  const int y_grid_stride = t_count_y_in_grid * y_load_count_per_thread;
  const int tid_x_in_grid = threadIdx.x + blockDim.x * blockIdx.x;
  const int tid_y_in_grid = threadIdx.y + blockDim.y * blockIdx.y;
  const int tid_in_block = threadIdx.x + blockDim.x * threadIdx.y;

  // Shape is blockDim.y-by-blockDim.x and element type is TBuf.
  extern __shared__ unsigned char shared_memory_bytes[];
  TBuf* shared_memory = reinterpret_cast<TBuf*>(shared_memory_bytes);

  // to prevent int overflow in index calculation for input size m*n
  const int64_t n_int64 = static_cast<int64_t>(n);

  for (int col = tid_x_in_grid; col < n; col += x_grid_stride) {
    shared_memory[tid_in_block] = TBuf(0.0f);

    TBuf sum = TBuf(0.0f);
    // This loop loads multiple blockDim.y-by-blockDim.x sub-tensors from the input.
    for (int row = tid_y_in_grid; row < m; row += y_grid_stride) {
      // Thread-level reduction. Each thread loads y_load_count_per_thread values
      // and aggregates them.
#pragma unroll y_load_count_per_thread
      for (int row_inner = 0; row_inner < y_load_count_per_thread; ++row_inner) {
        int row_final = row + row_inner * t_count_y_in_grid;
        int col_final = col;
        if (row_final < m && col_final < n) {
          sum += TBuf(input[row_final * n_int64 + col_final]);
        }
      }
    }

    // Write thread-level reduction result into shared memory.
    shared_memory[tid_in_block] = sum;

    // Wait for all threads to finish their thread-level reductions.
    __syncthreads();

    // This loop conducts reduction on elements stored in shared memory.
    // Each block reduces a blockDim.y-by-blockDim.x tensor to a 1-by-blockDim.x tensor.
#pragma unroll 4
    for (int stride = blockDim.y / 2; stride > 0; stride /= 2) {
      if (threadIdx.y < stride) {
        shared_memory[tid_in_block] += shared_memory[tid_in_block + stride * blockDim.x];
      }
      __syncthreads();
    }

    if (threadIdx.y == 0) {
      atomic_add(output + col, TOut(shared_memory[threadIdx.x]));
    }
  }
}

template <typename TIn, typename TOut, typename TBuf>
Status call_reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) {
  ORT_ENFORCE(m >= 0 && n >= 0);

  if (reset_initial_output) {
    HIP_RETURN_IF_ERROR(hipMemsetAsync(output, 0, n * sizeof(TOut), stream));
  }

  constexpr int max_num_threads_in_block = 512;
  constexpr int max_num_blocks_in_grid = 512;
  constexpr int load_count_per_thread = 4;

  const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE_HOST)));
  const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread)));
  const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid));
  const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4));

  const dim3 grid(grid_x_dim, grid_y_dim, 1);
  const dim3 block(block_x_dim, block_y_dim, 1);

  reduce_matrix_rows_kernel<TIn, TOut, TBuf><<<grid, block, block.y * block.x * sizeof(TBuf), stream>>>(
      input, output, m, n);

  return Status::OK();
}
}  // namespace detail

template <typename T>
struct OP_Div {
  __device__ __inline__ T operator()(const T& a) const {
    return a / v_;
  }

  OP_Div(T v) : v_(v) {}

  T v_;
};

template <typename T>
void UnaryDiv(hipStream_t stream, const T* input, T* output, T denominator, size_t count) {
  UnaryElementWiseImpl(stream, input, output, OP_Div<T>(denominator), count);
}

#define INSTANTIATE_UNARY_DIV(T) \
  template void UnaryDiv<T>(hipStream_t stream, const T* input, T* output, T denominator, size_t count)
INSTANTIATE_UNARY_DIV(half);
INSTANTIATE_UNARY_DIV(float);
INSTANTIATE_UNARY_DIV(double);
INSTANTIATE_UNARY_DIV(BFloat16);
#undef INSTANTIATE_UNARY_DIV

template <typename TIn, typename TOut>
Status reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output) {
  using TBuf = AccumulationType_t<TIn>;
  return detail::call_reduce_matrix_rows<TIn, TOut, TBuf>(
      stream, input, output, m, n, reset_initial_output);
}

#define INSTANTIATE_REDUCE_MATRIX_ROWS(T) \
  template Status reduce_matrix_rows<T, T>(hipStream_t stream, const T* input, T* output, int m, int n, bool reset_initial_output)
INSTANTIATE_REDUCE_MATRIX_ROWS(half);
INSTANTIATE_REDUCE_MATRIX_ROWS(float);
INSTANTIATE_REDUCE_MATRIX_ROWS(double);
INSTANTIATE_REDUCE_MATRIX_ROWS(BFloat16);
#undef INSTANTIATE_REDUCE_MATRIX_ROWS

template <typename TIn, typename TOut>
Status reduce_matrix_columns(hipStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size) {
  return detail::call_reduce_matrix_columns<TIn, TOut, Identity, Identity, false>(
      stream, input, output, m, n, buffer, buffer_size);
}

#define INSTANTIATE_REDUCE_MATRIX_COLUMNS(T) \
  template Status reduce_matrix_columns<T, T>(hipStream_t stream, const T* input, T* output, int m, int n, void* buffer, size_t buffer_size)
INSTANTIATE_REDUCE_MATRIX_COLUMNS(half);
INSTANTIATE_REDUCE_MATRIX_COLUMNS(float);
INSTANTIATE_REDUCE_MATRIX_COLUMNS(double);
INSTANTIATE_REDUCE_MATRIX_COLUMNS(BFloat16);
#undef INSTANTIATE_REDUCE_MATRIX_COLUMNS

}  // namespace rocm
}  // namespace onnxruntime
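
For orientation, the row-reduction entry point added above can be driven from host code roughly as follows. This is a minimal sketch, not part of the commit; it assumes a valid hipStream_t named stream and that the caller owns the device allocations.

// Sketch only: sum each column of a 3x4 row-major float matrix (m=3, n=4).
float* d_in = nullptr;   // 3 * 4 input elements on the device
float* d_out = nullptr;  // 4 output elements, one per column
HIP_CALL_THROW(hipMalloc(reinterpret_cast<void**>(&d_in), 3 * 4 * sizeof(float)));
HIP_CALL_THROW(hipMalloc(reinterpret_cast<void**>(&d_out), 4 * sizeof(float)));
// ... copy input data into d_in ...
auto status = onnxruntime::rocm::reduce_matrix_rows<float, float>(
    stream, d_in, d_out, /*m=*/3, /*n=*/4, /*reset_initial_output=*/true);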
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_functions.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/accumulation_type.h"

namespace onnxruntime {
namespace rocm {

namespace detail {
size_t compute_reduce_matrix_columns_intermediate_buffer_size(
    int element_size, int num_rows, int num_cols);
}  // namespace detail

/**
 * Computes the size in bytes of the intermediate buffer needed by reduce_matrix_columns().
 * @tparam TIn The input data type.
 * @param m The number of matrix rows.
 * @param n The number of matrix columns.
 * @return The size of the intermediate buffer.
 */
template <typename TIn>
size_t compute_reduce_matrix_columns_buffer_size(int m, int n) {
  using TBuf = AccumulationType_t<TIn>;
  return detail::compute_reduce_matrix_columns_intermediate_buffer_size(
      sizeof(TBuf), m, n);
}

/**
 * Computes the size in bytes of the intermediate buffer needed by the reduce_x() functions.
 * @tparam TIn The input data type.
 * @param size The number of elements.
 * @return The size of the intermediate buffer.
 */
template <typename TIn>
size_t compute_reduction_buffer_size(int size) {
  using TBuf = AccumulationType_t<TIn>;
  return detail::compute_reduce_matrix_columns_intermediate_buffer_size(
      sizeof(TBuf), 1, size);
}

/** Computes the sum of the given elements. */
template <typename TIn, typename TOut>
Status reduce_sum(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);

/** Computes the sum of the squares of the given elements. */
template <typename TIn, typename TOut>
Status reduce_square_sum(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);

/** Computes the L2 norm of the given elements. */
template <typename TIn, typename TOut>
Status reduce_l2_norm(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);

/** Computes the mean of the given elements. */
template <typename TIn, typename TOut>
Status reduce_mean(hipStream_t stream, const TIn* input, TOut* output, int size, void* buffer, size_t buffer_size);

enum class ApplicableMatrixReduction {
  // can use reduce_matrix_rows()
  Rows,
  // can use reduce_matrix_columns()
  Columns,
  // no optimized matrix reduction function applies
  None,
};

/**
 * Determines whether a MIOpen reduction can be computed by an optimized matrix reduction function.
 * @param miopen_reduce_op The MIOpen reduction op type.
 * @param dims The input dimensions.
 * @param axes The reduction axes.
 * @param[out] m If matrix reduction is possible, the number of matrix rows to use.
 * @param[out] n If matrix reduction is possible, the number of matrix columns to use.
 * @return The type of matrix reduction that can be done.
 */
ApplicableMatrixReduction get_applicable_matrix_reduction(
    const miopenReduceTensorOp_t miopen_reduce_op,
    gsl::span<const int64_t> dims, gsl::span<const int64_t> axes,
    int& m, int& n);

/**
 * Reduces the rows in a row-major matrix to a single row containing the sum of each column.
 * @param input The input data.
 * @param output The output data.
 * @param m The number of matrix rows.
 * @param n The number of matrix columns.
 * @param reset_initial_output Whether to reset (i.e., zero) the output values first.
 */
template <typename TIn, typename TOut>
Status reduce_matrix_rows(hipStream_t stream, const TIn* input, TOut* output, int m, int n, bool reset_initial_output = true);

/**
 * Reduces the columns in a row-major matrix to a single column containing the sum of each row.
 * @param input The input data.
 * @param output The output data.
 * @param m The number of matrix rows.
 * @param n The number of matrix columns.
 * @param buffer The intermediate buffer.
 * @param buffer_size The size of the intermediate buffer in bytes.
 */
template <typename TIn, typename TOut>
Status reduce_matrix_columns(hipStream_t stream, const TIn* input, TOut* output, int m, int n, void* buffer, size_t buffer_size);

/** Apply unary elementwise division. */
template <typename T>
void UnaryDiv(hipStream_t stream, const T* input, T* output, T denominator, size_t count);

}  // namespace rocm
}  // namespace onnxruntime
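
A minimal sketch (not part of the commit) of how the buffer-sizing helper and reduce_sum declared above fit together; it assumes a valid hipStream_t named stream and device pointers d_in (size elements) and d_sum (1 element) allocated by the caller.

// Sketch only: reduce `size` device floats to a single sum.
const int size = 1 << 20;
const size_t scratch_bytes = onnxruntime::rocm::compute_reduction_buffer_size<float>(size);
void* d_scratch = nullptr;
HIP_CALL_THROW(hipMalloc(&d_scratch, scratch_bytes));
auto status = onnxruntime::rocm::reduce_sum<float, float>(
    stream, d_in, d_sum, size, d_scratch, scratch_bytes);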
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_ops.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once
#include "core/common/optional.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/reduction/reduction_ops.h"
#include "core/providers/rocm/reduction/reduction_functions.h"

namespace onnxruntime {
namespace rocm {

namespace ReductionOps {

// Implementation that holds the core logic of reduction op processing
// `input_shape_override` is the input shape for compute purposes (if provided)
template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
std::unique_ptr<Tensor> ReduceCompute(ROCMExecutionProvider& rocm_ep, miopenReduceTensorOp_t miopen_reduce_op,
                                      AllocatorPtr allocator,
                                      const Tensor& input, gsl::span<const int64_t> axes,
                                      bool keep_dims, bool calculate_log, bool calculate_sqt,
                                      bool log_sum_exp, bool fast_reduction,
                                      const TensorShape* input_shape_override = nullptr);

}  // namespace ReductionOps

// Holds some metadata that will be used during actual reduction op compute time
struct PrepareReduceMetadata {
  int64_t input_count;
  int64_t output_count;
  // This holds the output dims without any reduced dims squeezed (even if keep_dims == 1)
  TensorShapeVector output_dims;
  // This holds the output dims with reduced dims squeezed (if keep_dims == 1)
  TensorShapeVector squeezed_output_dims;
  TensorShapeVector input_dims_miopen;
  TensorShapeVector output_dims_miopen;
};

template <bool allow_multi_axes>
class ReduceKernel : public RocmKernel, public ReduceKernelBase<allow_multi_axes> {
 protected:
  ReduceKernel(const OpKernelInfo& info, optional<int64_t> keep_dims_override = {})
      : RocmKernel(info),
        ReduceKernelBase<allow_multi_axes>(info, keep_dims_override),
        calculate_log_(false),
        calculate_sqt_(false),
        log_sum_exp_(false),
        fast_reduction_(false) {
    // We need to cast away the const as PerThreadMiopenHandle() is currently a non-const method
    // TODO: Clean up the ROCMExecutionProvider interface to avoid this
    rocm_ep_ = const_cast<ROCMExecutionProvider*>(
        static_cast<const ROCMExecutionProvider*>(info.GetExecutionProvider()));
  }

  // Only Max/Min need to set ReduceTensorIndices to MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES as per the miopen library manual
  // Only Max/Min will have an indices output; the indices need to be set to nullptr for other ops
  template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
  Status ComputeImpl(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;

  // Used by ReduceSumTraining which will have axes as input
  template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices = MIOPEN_REDUCE_TENSOR_NO_INDICES>
  Status ComputeImplEx(OpKernelContext* ctx, miopenReduceTensorOp_t miopen_reduce_op) const;

  template <typename T, typename OutT, miopenReduceTensorIndices_t ReduceTensorIndices>
  Status ReduceKernelShared(
      const T* X,
      const TensorShape& input_shape,
      OutT* Y,
      const TensorShape& output_shape,
      miopenReduceTensorOp_t miopen_reduce_op,
      TensorShapeVector& output_dims) const;

  using ReduceKernelBase<allow_multi_axes>::axes_;
  using ReduceKernelBase<allow_multi_axes>::keepdims_;
  using ReduceKernelBase<allow_multi_axes>::noop_with_empty_axes_;

  bool calculate_log_;
  bool calculate_sqt_;
  bool log_sum_exp_;
  // Indicates if this reduction can be delegated to our highly-optimized reduction kernels.
  // Those efficient kernels are defined/implemented in reduction_functions.h/.cu.
  bool fast_reduction_;

  // We need access to the ROCM EP instance to get the miopen handle
  ROCMExecutionProvider* rocm_ep_;
};

template <typename T>
class ArgMax final : public ReduceKernel<false> {
 public:
  ArgMax(const OpKernelInfo& info) : ReduceKernel<false>(info) {}

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES>(ctx, MIOPEN_REDUCE_TENSOR_MAX);
  }
};

template <typename T>
class ArgMin final : public ReduceKernel<false> {
 public:
  ArgMin(const OpKernelInfo& info) : ReduceKernel<false>(info) {}

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T, MIOPEN_REDUCE_TENSOR_FLATTENED_INDICES>(ctx, MIOPEN_REDUCE_TENSOR_MIN);
  }
};

template <typename T>
class ReduceL1 final : public ReduceKernel<true> {
 public:
  ReduceL1(const OpKernelInfo& info) : ReduceKernel<true>(info) {}

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_NORM1);
  }
};

template <typename T>
class ReduceL2 final : public ReduceKernel<true> {
 public:
  ReduceL2(const OpKernelInfo& info) : ReduceKernel<true>(info) {}

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_NORM2);
  }
};

template <typename T>
class ReduceMax final : public ReduceKernel<true> {
 public:
  ReduceMax(const OpKernelInfo& info) : ReduceKernel<true>(info) {}

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MAX);
  }
};

template <typename T>
class ReduceMean final : public ReduceKernel<true> {
 public:
  ReduceMean(const OpKernelInfo& info) : ReduceKernel<true>(info) {
    fast_reduction_ = true;
  }

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_AVG);
  }
};

template <typename T>
class ReduceMin final : public ReduceKernel<true> {
 public:
  ReduceMin(const OpKernelInfo& info) : ReduceKernel<true>(info) {}

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MIN);
  }
};

template <typename T>
class ReduceProd final : public ReduceKernel<true> {
 public:
  ReduceProd(const OpKernelInfo& info) : ReduceKernel<true>(info) {}

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_MUL);
  }
};

template <typename T>
class ReduceSum final : public ReduceKernel<true> {
 public:
  ReduceSum(const OpKernelInfo& info) : ReduceKernel<true>(info) {
    fast_reduction_ = true;
  }

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
  }
};

template <typename T>
class ReduceLogSum final : public ReduceKernel<true> {
 public:
  ReduceLogSum(const OpKernelInfo& info) : ReduceKernel<true>(info) {
    ReduceKernel<true>::calculate_log_ = true;
    fast_reduction_ = true;
  }

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
  }
};

template <typename T>
class ReduceSumSquare final : public ReduceKernel<true> {
 public:
  ReduceSumSquare(const OpKernelInfo& info) : ReduceKernel<true>(info) {
    ReduceKernel<true>::calculate_sqt_ = true;
    fast_reduction_ = true;
  }

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
  }
};

template <typename T>
class ReduceLogSumExp final : public ReduceKernel<true> {
 public:
  ReduceLogSumExp(const OpKernelInfo& info) : ReduceKernel<true>(info) {
    ReduceKernel<true>::log_sum_exp_ = true;
  }

  Status ComputeInternal(OpKernelContext* ctx) const override {
    return ComputeImpl<T>(ctx, MIOPEN_REDUCE_TENSOR_ADD);
  }
};

Status PrepareForReduce(const Tensor* X,
                        bool keepdims,
                        gsl::span<const int64_t> axes,
                        PrepareReduceMetadata& prepare_reduce_metadata,
                        const TensorShape* input_shape_override = nullptr);

template <typename T, miopenReduceTensorIndices_t ReduceTensorIndices>
Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
                         /*out*/ Tensor& output, miopenReduceTensorOp_t miopen_reduce_op,
                         gsl::span<const int64_t> axes,
                         bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
                         const TensorShape* input_shape_override = nullptr);

// ROCM's reduction descriptor miopenReduceTensorDescriptor_t is a pointer, so
// it's safer to wrap it with an automatic memory deleter as MiopenReduceDescriptor.
// An implicit caster from MiopenReduceDescriptor to miopenReduceTensorDescriptor_t
// is implemented below, so ROCM can work with it seamlessly.
class MiopenReduceDescriptor final {
 public:
  MiopenReduceDescriptor() : desc_(nullptr) {
  }

  ~MiopenReduceDescriptor() {
    if (desc_ != nullptr) {
      miopenDestroyReduceTensorDescriptor(desc_);
      desc_ = nullptr;
    }
  }

  MiopenReduceDescriptor(const MiopenReduceDescriptor&) = delete;
  MiopenReduceDescriptor& operator=(const MiopenReduceDescriptor&) = delete;

  Status Set(miopenReduceTensorOp_t op, miopenDataType_t type, miopenReduceTensorIndices_t indices) {
    if (!desc_)
      MIOPEN_RETURN_IF_ERROR(miopenCreateReduceTensorDescriptor(&desc_));

    MIOPEN_RETURN_IF_ERROR(miopenSetReduceTensorDescriptor(
        desc_,
        op,
        type,
        MIOPEN_PROPAGATE_NAN,
        indices,
        MIOPEN_32BIT_INDICES));  // currently only the 32-bit (unsigned int) type is supported.
    return Status::OK();
  }

  operator miopenReduceTensorDescriptor_t() const { return desc_; }

 private:
  miopenReduceTensorDescriptor_t desc_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/reduction/reduction_utils.cuh  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/rocm/cu_inc/common.cuh"

namespace onnxruntime {
namespace rocm {

__forceinline__ __host__ __device__ int least_pow2_bound(int value) {
  unsigned int value_ = static_cast<unsigned int>(value);
  --value_;
  value_ |= value_ >> 1;
  value_ |= value_ >> 2;
  value_ |= value_ >> 4;
  value_ |= value_ >> 8;
  value_ |= value_ >> 16;
  return static_cast<int>(++value_);
}

struct Square {
  template <typename T>
  __forceinline__ __device__ T operator()(const T& value) {
    return value * value;
  }
};

struct Sqrt {
  template <typename T>
  __forceinline__ __device__ T operator()(const T& value) {
    return _Sqrt(value);
  }
};

struct Identity {
  template <typename T>
  __forceinline__ __device__ T operator()(const T& value) {
    return value;
  }
};

}  // namespace rocm
}  // namespace onnxruntime
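
Since least_pow2_bound is both __host__ and __device__, its behaviour can be checked on the host. A small illustration (not part of the commit, assumes <cassert>):

// Sketch only: least_pow2_bound rounds up to the next power of two,
// leaving exact powers of two unchanged.
assert(onnxruntime::rocm::least_pow2_bound(1) == 1);
assert(onnxruntime::rocm::least_pow2_bound(5) == 8);
assert(onnxruntime::rocm::least_pow2_bound(64) == 64);
assert(onnxruntime::rocm::least_pow2_bound(100) == 128);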
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_check_memory.cc  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/rocm_check_memory.h"
#include "core/providers/rocm/rocm_common.h"

namespace onnxruntime {
void CheckIfMemoryOnCurrentGpuDevice(const void* ptr) {
  hipPointerAttribute_t attrs;
  HIP_CALL_THROW(hipPointerGetAttributes(&attrs, ptr));
  int current_device;
  HIP_CALL_THROW(hipGetDevice(&current_device));
  ORT_ENFORCE(attrs.device == current_device,
              "Current ROCM device is ", current_device,
              " but the memory of pointer ", ptr,
              " is allocated on device ", attrs.device);
}
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_check_memory.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

namespace onnxruntime {
// Throw if "ptr" is not allocated on the ROCM device obtained by hipGetDevice.
void CheckIfMemoryOnCurrentGpuDevice(const void* ptr);
}  // namespace onnxruntime
\ No newline at end of file
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_graph.cc  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/rocm_graph.h"

#include "core/providers/rocm/rocm_common.h"
#include <hip/hip_runtime_api.h>
#include <hip/driver_types.h>

namespace onnxruntime {

ROCMGraph::ROCMGraph(hipStream_t stream) : stream_(stream) {
#if (defined(CUDA_VERSION) && CUDA_VERSION < 10000)
  ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}

void ROCMGraph::SetStream(hipStream_t stream) {
  stream_ = stream;
}

void ROCMGraph::CaptureBegin() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
  ORT_ENFORCE(!has_graph_exec_,
              "This rocm graph has already captured a graph. "
              "Create a new instance to capture a new graph.");

  HIP_CALL_THROW(hipStreamSynchronize(stream_));
  // For now rocm graph can only work with a single thread. In the future, we
  // will support multiple threads. For multiple threads with multiple graphs
  // and streams, `hipStreamCaptureModeGlobal` needs to be changed to
  // `hipStreamCaptureModeThreadLocal`
  HIP_CALL_THROW(hipStreamBeginCapture(stream_, hipStreamCaptureModeGlobal));
#else
  ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}

void ROCMGraph::CaptureEnd() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
  HIP_CALL_THROW(hipStreamEndCapture(stream_, &graph_));
  if (graph_ == NULL) {
    ORT_THROW("ROCMGraph::CaptureEnd: graph_ is NULL");
  }

  has_graph_ = true;
  HIP_CALL_THROW(hipGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
  has_graph_exec_ = true;
  HIP_CALL_THROW(hipGraphDestroy(graph_));
  has_graph_ = false;
#else
  ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}

Status ROCMGraph::Replay() {
  // Although this function is not thread safe, the lock is not needed here because
  // ROCM EP maintains a separate rocm graph per thread
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
  LOGS_DEFAULT(INFO) << "Replaying ROCM graph on stream " << stream_;
  HIP_RETURN_IF_ERROR(hipGraphLaunch(graph_exec_, stream_));
  HIP_RETURN_IF_ERROR(hipStreamSynchronize(stream_));
#else
  ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
  return Status::OK();
}

void ROCMGraph::Reset() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
  if (has_graph_) {
    HIP_CALL_THROW(hipGraphDestroy(graph_));
    has_graph_ = false;
  }
  if (has_graph_exec_) {
    HIP_CALL_THROW(hipGraphExecDestroy(graph_exec_));
    has_graph_exec_ = false;
  }
#else
  ORT_THROW("ROCM graphs can only be used in Onnxruntime built with ROCM >= 10.0");
#endif
}

ROCMGraph::~ROCMGraph() {
  Reset();
}

}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_graph.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/common/common.h"
#include "core/platform/ort_mutex.h"
#include "core/providers/rocm/rocm_pch.h"

namespace onnxruntime {

using CaptureId_t = unsigned long long;

struct ROCMGraph {
  ROCMGraph(){};
  ROCMGraph(hipStream_t stream);
  ~ROCMGraph();

  void SetStream(hipStream_t stream);
  void CaptureBegin();
  void CaptureEnd();
  Status Replay();
  void Reset();

 private:
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
  hipGraph_t graph_ = NULL;
  hipGraphExec_t graph_exec_ = NULL;
#endif

  bool has_graph_ = false;
  bool has_graph_exec_ = false;

  hipStream_t stream_ = nullptr;  // Does not own the stream
};

}  // namespace onnxruntime
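
A sketch (not part of the commit) of the intended capture/replay sequence, assuming a build in which the CUDA_VERSION >= 10000 guard above is satisfied and stream is a valid hipStream_t:

// Sketch only: capture work enqueued on `stream`, then replay it.
onnxruntime::ROCMGraph graph(stream);
graph.CaptureBegin();
// ... enqueue the kernels/copies to be captured on `stream` ...
graph.CaptureEnd();                       // instantiates the executable graph
onnxruntime::Status s = graph.Replay();   // launches the captured work and synchronizes
graph.Reset();                            // optional: release graph resources early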
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/rocm_provider_factory_creator.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <memory>

#include "core/providers/providers.h"

struct OrtROCMProviderOptions;
struct OrtROCMProviderOptionsV2;

namespace onnxruntime {
// defined in provider_bridge_ort.cc
struct CudaProviderFactoryCreator {
  static std::shared_ptr<IExecutionProviderFactory> Create(const OrtROCMProviderOptions* provider_options);
  static std::shared_ptr<IExecutionProviderFactory> Create(const OrtROCMProviderOptionsV2* provider_options);
};
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/accumulation_type.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <hip/hip_fp16.h>
#include "core/framework/float16.h"

namespace onnxruntime {
namespace rocm {

// specifies the auxiliary type to use for accumulation of the given type
template <typename T>
struct AccumulationType;
template <>
struct AccumulationType<half> {
  using type = float;
};
template <>
struct AccumulationType<float> {
  using type = float;
};
template <>
struct AccumulationType<double> {
  using type = double;
};
template <>
struct AccumulationType<BFloat16> {
  using type = float;
};

template <typename T>
using AccumulationType_t = typename AccumulationType<T>::type;

}  // namespace rocm
}  // namespace onnxruntime
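
The mapping can be stated as compile-time checks. Illustration only (not part of the commit); assumes C++17 and <type_traits> in addition to the header above.

// Sketch only: reduced-precision inputs accumulate in float; float/double keep their own type.
static_assert(std::is_same_v<onnxruntime::rocm::AccumulationType_t<half>, float>);
static_assert(std::is_same_v<onnxruntime::rocm::AccumulationType_t<onnxruntime::BFloat16>, float>);
static_assert(std::is_same_v<onnxruntime::rocm::AccumulationType_t<float>, float>);
static_assert(std::is_same_v<onnxruntime::rocm::AccumulationType_t<double>, double>);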
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/fast_divmod.h  0 → 100644

//
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once
#include <iostream>
#include <limits>
#include <hip/hip_runtime.h>
#include <cmath>

#include "core/common/common.h"

namespace onnxruntime {
namespace rocm {

// The code below is based on section 4 "Unsigned division" of the paper https://gmplib.org/~tege/divcnst-pldi94.pdf
// In current ORT, fast_divmod is used for calculating the position of an element in a tensor,
// so unsigned integer division from the paper is good enough for ORT. The advantage is that div is very simple,
// so the GPU compiler can unroll loops easily when divmod is called in a loop.
struct fast_divmod {
  fast_divmod(int d = 1) {
    d_ = d == 0 ? 1 : d;
    ORT_ENFORCE(d_ >= 1 && d_ <= static_cast<uint32_t>(std::numeric_limits<int>::max()));

    for (l_ = 0; l_ < 32; l_++)
      if ((1U << l_) >= d_) break;

    uint64_t one = 1;
    uint64_t m = ((one << 32) * ((one << l_) - d_)) / d_ + 1;
    M_ = static_cast<uint32_t>(m);
    // according to the paper, the value of m' should fit in an unsigned integer.
    ORT_ENFORCE(M_ > 0 && M_ == m);
  }

  __host__ __device__ inline int div(int n) const {
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
    uint32_t t = __umulhi(M_, n);
    return (t + n) >> l_;
#else
    // Using uint64_t for t, then t + n won't overflow.
    uint64_t t = ((uint64_t)M_ * n) >> 32;
    return static_cast<int>((t + n) >> l_);
#endif
  }

  __host__ __device__ inline int mod(int n) const {
    return n - div(n) * d_;
  }

  __host__ __device__ inline void divmod(int n, int& q, int& r) const {
    q = div(n);
    r = n - q * d_;
  }

  uint32_t d_;  // divisor
  uint32_t M_;  // m' in the paper.
  uint32_t l_;  // l_ = ceil(log2(d_))
};

}  // namespace rocm
}  // namespace onnxruntime
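
An illustration (not from the commit) of the intended usage: the divisor is fixed when the struct is constructed, after which quotient and remainder of a flat index are recovered without a general integer division on the device fast path.

// Sketch only: map a flat element index to (row, col) for a row width of 37.
onnxruntime::rocm::fast_divmod cols_div(37);  // divisor chosen at construction time
int row = 0, col = 0;
cols_div.divmod(1000, row, col);  // row == 27 (1000 / 37), col == 1 (1000 % 37)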
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/integer_gemm.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {
Status GemmInt8(int m, int n, int k,
                int32_t alpha_matmul, int32_t beta_matmul,
                const int8_t* a, int lda, const int8_t* b, int ldb, int32_t* c, int ldc,
                const RocmKernel* rocm_kernel);
}
}
\ No newline at end of file
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/shared_inc/rocm_utils.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

// for things shared between nvcc and onnxruntime
// as currently nvcc cannot compile all onnxruntime headers

#pragma once

#include <hip/hip_fp16.h>
#include <memory>
#include <type_traits>
#include <vector>

#include "core/common/gsl.h"
#include "core/framework/float16.h"
#include "core/providers/rocm/shared_inc/fast_divmod.h"

namespace onnxruntime {
namespace rocm {

enum class SimpleBroadcast : int32_t {
  NoBroadcast = (int32_t)-1,
  LeftScalar = (int32_t)-2,
  RightScalar = (int32_t)-3,
  RightPerChannelBatch1 = (int32_t)-4,
  RightPerChannelBatchN = (int32_t)-5,
};

enum class BroadcastIndexType : int32_t {
  NoBroadcast = (int32_t)0,
  Scalar = (int32_t)1,
  NeedCompute = (int32_t)2,
};

template <typename T>
class IConstantBuffer {
 public:
  virtual ~IConstantBuffer(){};
  virtual const T* GetBuffer(hipStream_t stream, size_t count) = 0;
};

template <typename T>
std::unique_ptr<IConstantBuffer<T>> CreateConstantOnes();

template <typename T>
void Fill(hipStream_t stream, T* output, T value, int64_t count);

/*
  This is a utility wrapper for an arbitrary type array.
  Commonly used for passing a small list of metadata during a rocm kernel launch.
  It's better to pass the array by value than having another cuMemcpy to pass the data to device.
*/
template <typename T, int32_t capacity = 8>
struct TArray {
#if defined(USE_ROCM)
#define TARRAY_CONSTRUCTOR_SPECIFIERS __host__ __device__
#else
#define TARRAY_CONSTRUCTOR_SPECIFIERS
#endif

  TARRAY_CONSTRUCTOR_SPECIFIERS TArray() = default;
  TARRAY_CONSTRUCTOR_SPECIFIERS TArray(const TArray&) = default;
  TARRAY_CONSTRUCTOR_SPECIFIERS TArray& operator=(const TArray&) = default;

#undef TARRAY_CONSTRUCTOR_SPECIFIERS

  TArray(int32_t size) : size_(size), data_() {
    ORT_ENFORCE(
        0 <= size && size <= capacity,
        "TArray size must be within range [0, ", capacity, "]. Actual: ", size);
  }

  TArray(const std::vector<T>& vec) : TArray(static_cast<int32_t>(vec.size())) {
    static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable.");
    memcpy(data_, vec.data(), vec.size() * sizeof(T));
  }

  TArray(gsl::span<const T> vec) : TArray(static_cast<int32_t>(vec.size())) {
    static_assert(std::is_trivially_copyable<T>::value, "T must be trivially copyable.");
    memcpy(data_, vec.data(), vec.size() * sizeof(T));
  }

  void SetSize(int32_t size) {
    ORT_ENFORCE(
        0 <= size && size <= capacity,
        "TArray size must be within range [0, ", capacity, "]. Actual: ", size);
    size_ = size;
  }

  __host__ __device__ int32_t Size() const {
    return size_;
  }

  __host__ __device__ T& operator[](int32_t index) {
    return data_[index];
  }

  __host__ __device__ __forceinline__ const T& operator[](int32_t index) const {
    return data_[index];
  }

  __host__ __device__ T* Data() {
    return data_;
  }

  __host__ __device__ const T* Data() const {
    return data_;
  }

  static constexpr int32_t Capacity() { return capacity; };

 private:
  int32_t size_ = 0;
  T data_[capacity] = {};
};

// Bitmask tensor is uint_32 type.
using BitmaskElementType = uint32_t;
constexpr int kNumBitsPerBitmaskElement = std::numeric_limits<BitmaskElementType>::digits;

template <typename T>
struct NumericLimits {
  __inline__ __host__ __device__ static T Min() {
    return std::numeric_limits<T>::lowest();
  }
  __inline__ __host__ __device__ static T Max() {
    return std::numeric_limits<T>::max();
  }
};

template <>
struct NumericLimits<MLFloat16> {
  __inline__ __host__ __device__ static half Min() {
    return -65504.0;
  }
  __inline__ __host__ __device__ static half Max() {
    return 65504.0;
  }
};

template <>
struct NumericLimits<half> {
  __inline__ __host__ __device__ static half Min() {
    return -65504.0;
  }
  __inline__ __host__ __device__ static half Max() {
    return 65504.0;
  }
};

template <>
struct NumericLimits<float> {
  __inline__ __host__ __device__ static float Min() {
    return -INFINITY;
  }
  __inline__ __host__ __device__ static float Max() {
    return INFINITY;
  }
};

template <>
struct NumericLimits<double> {
  __inline__ __host__ __device__ static double Min() {
    return -HUGE_VAL;
  }
  __inline__ __host__ __device__ static double Max() {
    return HUGE_VAL;
  }
};

}  // namespace rocm
}  // namespace onnxruntime
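
A sketch (not part of the commit) of how TArray is typically filled on the host and handed to a kernel by value; my_kernel is a hypothetical kernel name used only for illustration.

// Sketch only: pack a small stride list into a TArray (default capacity 8;
// larger inputs would trip the ORT_ENFORCE above) and pass it by value at launch.
std::vector<int64_t> strides = {12, 4, 1};
onnxruntime::rocm::TArray<int64_t> strides_arr(strides);
// hipLaunchKernelGGL(my_kernel, grid, block, 0, stream, strides_arr /*, ... */);  // hypothetical launch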
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/cast_op.cc
0 → 100644
View file @
1a91fcc2
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "cast_op.h"
#include "core/providers/rocm/math/unary_elementwise_ops_impl.h"
using
namespace
ONNX_NAMESPACE
;
using
namespace
onnxruntime
::
common
;
namespace
onnxruntime
{
namespace
rocm
{
const
std
::
vector
<
MLDataType
>
castOpTypeConstraints
{
DataTypeImpl
::
GetTensorType
<
MLFloat16
>
(),
DataTypeImpl
::
GetTensorType
<
BFloat16
>
(),
DataTypeImpl
::
GetTensorType
<
float
>
(),
DataTypeImpl
::
GetTensorType
<
double
>
(),
DataTypeImpl
::
GetTensorType
<
int8_t
>
(),
DataTypeImpl
::
GetTensorType
<
int16_t
>
(),
DataTypeImpl
::
GetTensorType
<
int32_t
>
(),
DataTypeImpl
::
GetTensorType
<
int64_t
>
(),
DataTypeImpl
::
GetTensorType
<
uint8_t
>
(),
DataTypeImpl
::
GetTensorType
<
uint16_t
>
(),
DataTypeImpl
::
GetTensorType
<
uint32_t
>
(),
DataTypeImpl
::
GetTensorType
<
uint64_t
>
(),
DataTypeImpl
::
GetTensorType
<
bool
>
()
};
#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Cast, \
kOnnxDomain, \
6, 8, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", castOpTypeConstraints), \
Cast<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Cast, \
kOnnxDomain, \
9, 12, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", castOpTypeConstraints), \
Cast<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Cast, \
kOnnxDomain, \
13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", castOpTypeConstraints), \
Cast<T>);
template
<
typename
SrcT
>
Status
Cast
<
SrcT
>::
ComputeInternal
(
OpKernelContext
*
context
)
const
{
typedef
typename
ToHipType
<
SrcT
>::
MappedType
CudaSrcT
;
const
Tensor
*
X
=
context
->
Input
<
Tensor
>
(
0
);
const
TensorShape
&
shape
=
X
->
Shape
();
Tensor
*
Y
=
context
->
Output
(
0
,
shape
);
const
auto
*
x_data
=
reinterpret_cast
<
const
CudaSrcT
*>
(
X
->
Data
<
SrcT
>
());
size_t
count
=
shape
.
Size
();
#define CASE(TP_TYPE, DstT) \
case TP_TYPE: \
if (count > 0) { \
Impl_Cast<CudaSrcT, typename ToHipType<DstT>::MappedType>( \
Stream(), \
x_data, \
reinterpret_cast<typename ToHipType<DstT>::MappedType*>(Y->MutableData<DstT>()), \
count); \
} \
break;
switch
(
to_
)
{
CASE
(
TensorProto_DataType_FLOAT16
,
MLFloat16
)
CASE
(
TensorProto_DataType_BFLOAT16
,
BFloat16
)
CASE
(
TensorProto_DataType_FLOAT
,
float
)
CASE
(
TensorProto_DataType_DOUBLE
,
double
)
CASE
(
TensorProto_DataType_INT8
,
int8_t
)
CASE
(
TensorProto_DataType_INT16
,
int16_t
)
CASE
(
TensorProto_DataType_INT32
,
int32_t
)
CASE
(
TensorProto_DataType_INT64
,
int64_t
)
CASE
(
TensorProto_DataType_UINT8
,
uint8_t
)
CASE
(
TensorProto_DataType_UINT16
,
uint16_t
)
CASE
(
TensorProto_DataType_UINT32
,
uint32_t
)
CASE
(
TensorProto_DataType_UINT64
,
uint64_t
)
CASE
(
TensorProto_DataType_BOOL
,
bool
)
case
TensorProto_DataType_STRING
:
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
"Casting to and from strings is not supported yet."
);
case
TensorProto_DataType_UNDEFINED
:
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
"Cast op must have 'to' argument of type DataType"
);
default:
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
"Unexpected 'to' argument value: "
,
to_
);
}
return
Status
::
OK
();
}
#define SPECIALIZE_IMPL(T) \
REGISTER_KERNEL_TYPED(T) \
template Status Cast<T>::ComputeInternal(OpKernelContext* context) const;
SPECIALIZE_IMPL
(
MLFloat16
)
SPECIALIZE_IMPL
(
float
)
SPECIALIZE_IMPL
(
double
)
SPECIALIZE_IMPL
(
int8_t
)
SPECIALIZE_IMPL
(
int16_t
)
SPECIALIZE_IMPL
(
int32_t
)
SPECIALIZE_IMPL
(
int64_t
)
SPECIALIZE_IMPL
(
uint8_t
)
SPECIALIZE_IMPL
(
uint16_t
)
SPECIALIZE_IMPL
(
uint32_t
)
SPECIALIZE_IMPL
(
uint64_t
)
SPECIALIZE_IMPL
(
bool
)
SPECIALIZE_IMPL
(
BFloat16
)
}
// namespace rocm
}
// namespace onnxruntime
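Note on the dispatch pattern: the CASE macro above bridges the runtime `to_` attribute to a compile-time destination type, with each case label instantiating Impl_Cast for one (SrcT, DstT) pair. The following standalone sketch illustrates the same switch-based dispatch in plain C++; every name in it (ConvertBuffer, CastTo, DstType) is hypothetical and only mirrors the pattern, it is not part of this commit or of the onnxruntime API.

// Hypothetical illustration of runtime-to-compile-time type dispatch (not onnxruntime code).
#include <cstdint>
#include <cstdio>
#include <vector>

enum class DstType { kFloat, kInt32, kUInt8 };

// Compile-time typed conversion loop; stands in for Impl_Cast.
template <typename SrcT, typename DstT>
void ConvertBuffer(const SrcT* src, DstT* dst, size_t count) {
  for (size_t i = 0; i < count; ++i) dst[i] = static_cast<DstT>(src[i]);
}

// The runtime 'to' value selects which DstT instantiation runs, one case per supported type.
template <typename SrcT>
bool CastTo(DstType to, const SrcT* src, void* dst, size_t count) {
  switch (to) {
    case DstType::kFloat:
      ConvertBuffer(src, static_cast<float*>(dst), count);
      return true;
    case DstType::kInt32:
      ConvertBuffer(src, static_cast<int32_t*>(dst), count);
      return true;
    case DstType::kUInt8:
      ConvertBuffer(src, static_cast<uint8_t*>(dst), count);
      return true;
    default:
      return false;  // unsupported destination type
  }
}

int main() {
  std::vector<double> src{1.5, 2.25, 3.0};
  std::vector<int32_t> dst(src.size());
  CastTo(DstType::kInt32, src.data(), dst.data(), src.size());
  std::printf("%d %d %d\n", static_cast<int>(dst[0]), static_cast<int>(dst[1]),
              static_cast<int>(dst[2]));  // prints: 1 2 3
  return 0;
}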
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/cast_op.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

template <typename SrcT>
class Cast final : public RocmKernel {
 public:
  Cast(const OpKernelInfo& info) : RocmKernel(info) {
    int64_t to;
    Status status = info.GetAttr("to", &to);
    ORT_ENFORCE(status.IsOK(), "Attribute to is not set.");
    to_ = gsl::narrow_cast<ONNX_NAMESPACE::TensorProto_DataType>(to);
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  ONNX_NAMESPACE::TensorProto_DataType to_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress.cc
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "compress.h"
#include "compress_impl.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Compress,
    kOnnxDomain,
    9,
    10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
    Compress);

// explicit negative axis support
ONNX_OPERATOR_KERNEL_EX(
    Compress,
    kOnnxDomain,
    11,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>()),
    Compress);

Status Compress::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor* input_tensor = ctx->Input<Tensor>(0);
  ORT_ENFORCE(input_tensor);
  size_t rank = input_tensor->Shape().NumDimensions();
  auto input_dimensions = input_tensor->Shape().GetDims();
  int64_t axis = 0;
  if (has_axis_) {
    axis = HandleNegativeAxis(axis_, rank);
  }

  const Tensor* condition = ctx->Input<Tensor>(1);
  ORT_ENFORCE(condition);
  auto condition_length = condition->Shape().Size();
  auto condition_data = condition->Data<bool>();

  // if has axis, we need to compress on dimension[axis], otherwise compress on the flattened input data
  int64_t input_size = input_tensor->Shape().Size();
  int64_t compress_input_length = has_axis_ ? input_dimensions[axis] : input_size;
  int64_t valid_condition_length = compress_input_length < condition_length ? compress_input_length : condition_length;

  auto condition_cumulative_sum_buffer = GetScratchBuffer<int32_t>(gsl::narrow<size_t>(valid_condition_length));
  auto condition_cumulative_sum = condition_cumulative_sum_buffer.get();

  size_t temp_storage_bytes = 0;
  HIP_RETURN_IF_ERROR(CompressCalcPrefixSumTempStorageBytes(
      Stream(),
      reinterpret_cast<const int8_t*>(condition_data),
      condition_cumulative_sum,
      gsl::narrow<int>(valid_condition_length),
      temp_storage_bytes));

  auto temp_buffer = GetScratchBuffer<uint8_t>(temp_storage_bytes);
  auto d_temp_storage = temp_buffer.get();
  HIP_RETURN_IF_ERROR(CompressInclusivePrefixSum(
      Stream(),
      d_temp_storage,
      temp_storage_bytes,
      reinterpret_cast<const int8_t*>(condition_data),
      condition_cumulative_sum,
      gsl::narrow<int>(valid_condition_length)));

  // hipMemcpyAsync from device memory to pageable host memory will return only once the copy has completed.
  int32_t positive_condition_count = 0;
  HIP_RETURN_IF_ERROR(hipMemcpyAsync(
      &positive_condition_count,
      condition_cumulative_sum + valid_condition_length - 1,
      sizeof(int32_t),
      hipMemcpyDeviceToHost,
      Stream()));

  std::vector<int64_t> output_dims(input_dimensions.begin(), input_dimensions.end());
  if (has_axis_) {
    output_dims[axis] = positive_condition_count;
  } else {
    output_dims.resize(1);
    output_dims[0] = positive_condition_count;
  }

  TensorShape output_shape(output_dims);
  auto output_tensor = ctx->Output(0, output_shape);
  if (positive_condition_count <= 0) {
    return Status::OK();
  }

  auto element_bytes = input_tensor->DataType()->Size();

  int64_t axis_right_stride = 1;
  if (has_axis_) {
    for (auto i = static_cast<size_t>(axis + 1); i < rank; ++i) {
      axis_right_stride *= input_dimensions[i];
    }
  }

  ORT_RETURN_IF_ERROR(CompressImpl(
      Stream(),
      element_bytes,
      gsl::narrow_cast<int32_t>(valid_condition_length),
      gsl::narrow_cast<int32_t>(axis_right_stride),
      has_axis_ ? gsl::narrow_cast<int32_t>(input_dimensions[axis]) : gsl::narrow_cast<int32_t>(input_size),
      gsl::narrow_cast<int32_t>(positive_condition_count),
      condition_cumulative_sum,
      condition_data,
      input_tensor->DataRaw(),
      output_tensor->MutableDataRaw(),
      input_size));

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
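Note on the index arithmetic: the inclusive prefix sum over the condition yields both the output length (its last element, copied back as positive_condition_count) and, for every kept element, its destination slot `cumsum - 1`. The sketch below is a hypothetical CPU reference for the flattened (no-axis) case only; the function name CompressReference and all variables are illustrative and not part of this commit.

// Hypothetical CPU reference for Compress without an axis: keep input[i] where
// condition[i] is true, writing it to position cumsum[i] - 1 in the output.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<float> CompressReference(const std::vector<float>& input,
                                     const std::vector<bool>& condition) {
  size_t valid_len = std::min(input.size(), condition.size());

  // Inclusive prefix sum of the condition, widened to int32_t (mirrors the GPU scan).
  std::vector<int32_t> cumsum(valid_len, 0);
  int32_t running = 0;
  for (size_t i = 0; i < valid_len; ++i) {
    running += condition[i] ? 1 : 0;
    cumsum[i] = running;
  }

  std::vector<float> output(valid_len ? static_cast<size_t>(cumsum.back()) : 0);
  for (size_t i = 0; i < valid_len; ++i) {
    if (condition[i]) {
      output[cumsum[i] - 1] = input[i];  // same placement rule as the GPU kernel
    }
  }
  return output;
}

int main() {
  std::vector<float> x{10, 20, 30, 40, 50};
  std::vector<bool> cond{true, false, true, true, false};
  for (float v : CompressReference(x, cond)) std::printf("%g ", v);  // prints: 10 30 40
  std::printf("\n");
  return 0;
}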
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

class Compress final : public RocmKernel {
 public:
  Compress(const OpKernelInfo& info) : RocmKernel(info) {
    has_axis_ = info.GetAttr("axis", &axis_).IsOK();
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  int64_t axis_;
  bool has_axis_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress_impl.cu
0 → 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <hipcub/hipcub.hpp>
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
//TODO:fix the warnings
#ifdef _MSC_VER
#pragma warning(disable : 4244)
#endif
#include "core/providers/rocm/tensor/compress_impl.h"
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
namespace
onnxruntime
{
namespace
rocm
{
// This cast is for transform iterator. This type affects the accumulator type width
// in InclusiveSum(). By default, the accumulator type matches the input, but for int8_t
// the sum overflows quickly, so we want the source type to match the output (int32_t).
// see https://github.com/NVIDIA/cub/issues/384
struct
CastToInt32
:
public
thrust
::
unary_function
<
int8_t
,
int32_t
>
{
__host__
__device__
int32_t
operator
()(
int8_t
v
)
const
{
return
static_cast
<
int32_t
>
(
v
);
}
};
hipError_t
CompressCalcPrefixSumTempStorageBytes
(
hipStream_t
stream
,
const
int8_t
*
condition_data
,
int32_t
*
condition_cumulative_sum
,
int
length
,
size_t
&
temp_storage_bytes
)
{
auto
input_iter
=
thrust
::
make_transform_iterator
(
condition_data
,
CastToInt32
());
return
hipcub
::
DeviceScan
::
InclusiveSum
(
nullptr
,
temp_storage_bytes
,
input_iter
,
condition_cumulative_sum
,
length
,
stream
);
}
hipError_t
CompressInclusivePrefixSum
(
hipStream_t
stream
,
void
*
d_temp_storage
,
size_t
temp_storage_bytes
,
const
int8_t
*
condition_data
,
int32_t
*
condition_cumulative_sum
,
int
length
)
{
auto
input_iter
=
thrust
::
make_transform_iterator
(
condition_data
,
CastToInt32
());
return
hipcub
::
DeviceScan
::
InclusiveSum
(
d_temp_storage
,
temp_storage_bytes
,
input_iter
,
condition_cumulative_sum
,
length
,
stream
);
}
template
<
typename
T
>
__global__
void
_CompressKernel
(
const
int32_t
valid_condition_length
,
const
fast_divmod
axis_right_stride_div
,
const
fast_divmod
input_axis_included_stride_div
,
const
int32_t
output_axis_included_stride
,
const
int32_t
*
condition_cumulative_sum
,
const
bool
*
condition_data
,
const
T
*
input_data
,
T
*
output_data
,
const
HIP_LONG
N
)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT
(
id
,
N
);
HIP_LONG
output_index
=
0
;
int
div
,
mod
;
input_axis_included_stride_div
.
divmod
(
id
,
div
,
mod
);
output_index
=
output_axis_included_stride
*
div
;
axis_right_stride_div
.
divmod
(
mod
,
div
,
mod
);
if
(
div
<
valid_condition_length
&&
condition_data
[
div
])
{
output_index
+=
(
condition_cumulative_sum
[
div
]
-
1
)
*
axis_right_stride_div
.
d_
+
mod
;
output_data
[
output_index
]
=
input_data
[
id
];
}
}
Status
CompressImpl
(
hipStream_t
stream
,
const
size_t
element_bytes
,
const
int32_t
valid_condition_length
,
const
int32_t
axis_right_stride
,
const
int32_t
input_axis_dim_length
,
const
int32_t
output_axis_dim_length
,
const
int32_t
*
condition_cumulative_sum
,
const
bool
*
condition_data
,
const
void
*
input_data
,
void
*
output_data
,
const
size_t
N
)
{
int
blocksPerGrid
=
(
int
)(
ceil
(
static_cast
<
float
>
(
N
)
/
GridDim
::
maxThreadsPerBlock
));
fast_divmod
axis_right_stride_div
(
axis_right_stride
);
fast_divmod
input_axis_included_stride_div
(
axis_right_stride
*
input_axis_dim_length
);
int
output_axis_included_stride
=
axis_right_stride
*
output_axis_dim_length
;
switch
(
element_bytes
)
{
case
sizeof
(
int8_t
):
hipLaunchKernelGGL
(
_CompressKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
valid_condition_length
,
axis_right_stride_div
,
input_axis_included_stride_div
,
output_axis_included_stride
,
condition_cumulative_sum
,
condition_data
,
reinterpret_cast
<
const
ToHipType
<
int8_t
>::
MappedType
*>
(
input_data
),
reinterpret_cast
<
ToHipType
<
int8_t
>::
MappedType
*>
(
output_data
),
(
HIP_LONG
)
N
);
break
;
case
sizeof
(
int16_t
):
hipLaunchKernelGGL
(
_CompressKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
valid_condition_length
,
axis_right_stride_div
,
input_axis_included_stride_div
,
output_axis_included_stride
,
condition_cumulative_sum
,
condition_data
,
reinterpret_cast
<
const
ToHipType
<
int16_t
>::
MappedType
*>
(
input_data
),
reinterpret_cast
<
ToHipType
<
int16_t
>::
MappedType
*>
(
output_data
),
(
HIP_LONG
)
N
);
break
;
case
sizeof
(
int32_t
):
hipLaunchKernelGGL
(
_CompressKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
valid_condition_length
,
axis_right_stride_div
,
input_axis_included_stride_div
,
output_axis_included_stride
,
condition_cumulative_sum
,
condition_data
,
reinterpret_cast
<
const
ToHipType
<
int32_t
>::
MappedType
*>
(
input_data
),
reinterpret_cast
<
ToHipType
<
int32_t
>::
MappedType
*>
(
output_data
),
(
HIP_LONG
)
N
);
break
;
case
sizeof
(
int64_t
):
hipLaunchKernelGGL
(
_CompressKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
valid_condition_length
,
axis_right_stride_div
,
input_axis_included_stride_div
,
output_axis_included_stride
,
condition_cumulative_sum
,
condition_data
,
reinterpret_cast
<
const
ToHipType
<
int64_t
>::
MappedType
*>
(
input_data
),
reinterpret_cast
<
ToHipType
<
int64_t
>::
MappedType
*>
(
output_data
),
(
HIP_LONG
)
N
);
break
;
default:
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
"Type not supported for Compress operator"
);
}
return
Status
::
OK
();
}
}
// namespace rocm
}
// namespace onnxruntime
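Note on CastToInt32: the widening it performs is easy to verify on the host. If the scan accumulated in the input type, an int8_t flag array with more than 127 set entries would wrap around, which is exactly why the transform iterator presents each element as int32_t. The snippet below is a plain C++ illustration of that difference only; it is not part of this commit and does not use hipCUB.

// Hypothetical host-side illustration of why the prefix sum widens int8_t to int32_t:
// accumulating 200 set flags in an int8_t wraps around, while an int32_t does not.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int8_t> flags(200, 1);  // 200 "true" conditions stored as bytes

  int8_t narrow_sum = 0;  // accumulator matches the input type: overflows past 127
  int32_t wide_sum = 0;   // accumulator matches the widened output type
  for (int8_t f : flags) {
    narrow_sum = static_cast<int8_t>(narrow_sum + f);
    wide_sum += static_cast<int32_t>(f);
  }

  std::printf("int8_t accumulator:  %d\n", static_cast<int>(narrow_sum));  // wrapped value (-56)
  std::printf("int32_t accumulator: %d\n", wide_sum);                      // 200
  return 0;
}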
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/compress_impl.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"

namespace onnxruntime {
namespace rocm {

hipError_t CompressCalcPrefixSumTempStorageBytes(hipStream_t stream, const int8_t* condition_data,
                                                 int32_t* condition_cumulative_sum, int length,
                                                 size_t& temp_storage_bytes);

hipError_t CompressInclusivePrefixSum(hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes,
                                      const int8_t* condition_data, int32_t* condition_cumulative_sum, int length);

Status CompressImpl(hipStream_t stream,
                    const size_t element_bytes,
                    const int32_t valid_condition_length,
                    const int32_t axis_right_stride,
                    const int32_t input_axis_dim_length,
                    const int32_t output_axis_dim_length,
                    const int32_t* condition_cumulative_sum,
                    const bool* condition_data,
                    const void* input_data,
                    void* output_data,
                    const size_t N);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/concat.cc
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/concat.h"
#include "core/providers/rocm/tensor/concat_impl.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Concat,
    kOnnxDomain,
    4,
    10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Concat);

// opset 11 explicitly supports negative axis
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Concat,
    kOnnxDomain,
    11,
    12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Concat);

ONNX_OPERATOR_KERNEL_EX(
    Concat,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Concat);

Status Concat::ComputeInternal(OpKernelContext* ctx) const {
  auto input_count = Node().InputArgCount().front();

  // Hold pointers to the input tensors to be used in the PrepareForCompute() step
  InlinedTensorsVector input_tensors;
  input_tensors.reserve(input_count);
  for (int i = 0; i < input_count; ++i) {
    input_tensors.push_back(ctx->Input<Tensor>(i));
  }

  Prepare p;
  ORT_RETURN_IF_ERROR(PrepareForCompute(ctx, input_tensors, p));

  // Return at this point if output tensor is going to be empty
  if (p.output_num_elements == 0)
    return Status::OK();

  std::vector<int64_t> concat_sizes;
  concat_sizes.reserve(input_count);

  RocmAsyncBuffer<const void*> input_ptr(this, input_count);
  gsl::span<const void*> input_ptr_cpuspan = input_ptr.CpuSpan();

  std::vector<int64_t> axis_dimension_input_output_mapping(p.output_tensor->Shape()[p.axis]);
  int index = 0;
  for (int i = 0; i < input_count; ++i) {
    const auto& input = p.inputs[i];
    concat_sizes.push_back(input.tensor->Shape()[p.axis]);
    input_ptr_cpuspan[i] = input.tensor->DataRaw();
    for (int j = 0; j < input.tensor->Shape()[p.axis]; ++j) {
      axis_dimension_input_output_mapping.at(index++) = i;
    }
  }

  auto element_bytes = p.output_tensor->DataType()->Size();
  int block_size_inside_axis_dim = static_cast<int>(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]);
  int block_size_including_axis_dim = static_cast<int>(p.output_axis_pitch);

  if (std::all_of(concat_sizes.begin(), concat_sizes.end(),
                  [&](int64_t size) { return size == concat_sizes[0]; })) {
    if (input_count <= 32) {
      TArray<const void*, 32> input_ptr_array(input_count);
      for (int i = 0; i < input_count; ++i)
        input_ptr_array[i] = input_ptr_cpuspan[i];
      ORT_RETURN_IF_ERROR(ConcatSameConcatDimImpl(
          Stream(), element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim,
          concat_sizes[0], p.output_tensor->MutableDataRaw(), input_ptr_array,
          static_cast<size_t>(p.output_num_elements)));
    } else {
      ORT_RETURN_IF_ERROR(input_ptr.CopyToGpu());
      ORT_RETURN_IF_ERROR(ConcatSameConcatDimImpl(
          Stream(), element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim,
          concat_sizes[0], p.output_tensor->MutableDataRaw(), input_ptr.GpuPtr(),
          static_cast<size_t>(p.output_num_elements)));
    }
  } else {
    RocmAsyncBuffer<int64_t> concat_sizes_gpu(this, concat_sizes);
    RocmAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, axis_dimension_input_output_mapping);

    std::vector<int64_t> concat_sizes_range(concat_sizes);
    for (size_t i = 1; i < concat_sizes_range.size(); ++i) {
      concat_sizes_range[i] += concat_sizes_range[i - 1];
    }

    RocmAsyncBuffer<int64_t> concat_sizes_range_gpu(this, concat_sizes_range);
    ORT_RETURN_IF_ERROR(concat_sizes_gpu.CopyToGpu());
    ORT_RETURN_IF_ERROR(axis_dimension_input_output_mapping_gpu.CopyToGpu());
    ORT_RETURN_IF_ERROR(concat_sizes_range_gpu.CopyToGpu());
    ORT_RETURN_IF_ERROR(input_ptr.CopyToGpu());

    ORT_RETURN_IF_ERROR(ConcatImpl(
        Stream(), element_bytes, block_size_including_axis_dim, block_size_inside_axis_dim,
        concat_sizes_gpu.GpuPtr(), concat_sizes_range_gpu.GpuPtr(),
        axis_dimension_input_output_mapping_gpu.GpuPtr(), p.output_tensor->MutableDataRaw(),
        input_ptr.GpuPtr(), static_cast<size_t>(p.output_num_elements)));
  }

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
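Note on the general (unequal-size) path: the two helper tables built above fully determine the gather. axis_dimension_input_output_mapping names the source input for each output position along the axis, and the cumulative concat_sizes_range gives where that input's slices start, so the offset inside the input is the output position minus the previous range entry. The sketch below reproduces that bookkeeping on the host for the simple 1-D case; ConcatReference and its variables are hypothetical illustrations, not part of this commit.

// Hypothetical CPU sketch of the Concat bookkeeping for the unequal-size path (1-D case):
// build the per-output-position input mapping and cumulative ranges, then gather.
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<float> ConcatReference(const std::vector<std::vector<float>>& inputs) {
  std::vector<int64_t> sizes, range;  // per-input length and inclusive running total
  std::vector<int64_t> pos_to_input;  // output position -> which input it comes from
  int64_t total = 0;
  for (size_t i = 0; i < inputs.size(); ++i) {
    sizes.push_back(static_cast<int64_t>(inputs[i].size()));
    total += sizes.back();
    range.push_back(total);
    for (int64_t j = 0; j < sizes.back(); ++j) pos_to_input.push_back(static_cast<int64_t>(i));
  }

  std::vector<float> output(static_cast<size_t>(total));
  for (int64_t k = 0; k < total; ++k) {
    int64_t input_index = pos_to_input[k];
    int64_t start = (input_index == 0) ? 0 : range[input_index - 1];
    output[k] = inputs[input_index][k - start];  // offset inside the chosen input
  }
  return output;
}

int main() {
  std::vector<std::vector<float>> inputs{{1, 2}, {3}, {4, 5, 6}};
  for (float v : ConcatReference(inputs)) std::printf("%g ", v);  // prints: 1 2 3 4 5 6
  std::printf("\n");
  return 0;
}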