gaoqiong / onnxruntime_v14 · Commits

Commit 1a91fcc2, authored Jul 25, 2023 by gaoqiong

    add files required by dtk

parent a144865d
Pipeline #492 failed with stages in 0 seconds
Changes 280 · Pipelines 1

Showing 20 changed files with 1974 additions and 0 deletions (+1974 -0)
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/transpose.h  +36 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/transpose_impl.cu  +370 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/transpose_impl.h  +45 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/trilu.cc  +63 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/trilu.h  +21 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/trilu_impl.cu  +123 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/trilu_impl.h  +23 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/unsqueeze.cc  +59 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/unsqueeze.h  +18 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/upsample.cc  +187 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/upsample.h  +25 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/upsample_impl.cu  +220 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/upsample_impl.h  +26 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where.cc  +220 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where.h  +18 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where_impl.cu  +244 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where_impl.h  +30 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/all_tests.h  +16 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/beam_search_topk.cc  +138 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/rocm_execution_provider_test.cc  +92 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/transpose.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/common/gsl.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/transpose.h"

namespace onnxruntime {
namespace rocm {

class Transpose final : public RocmKernel, public TransposeBase {
 public:
  Transpose(const OpKernelInfo& info) : RocmKernel(info), TransposeBase(info) {
  }

  Status ComputeInternal(OpKernelContext* context) const override;

  static Status DoTranspose(const Transpose& transpose_kernel,
                            const gsl::span<const size_t>& permutations,
                            const Tensor& input, Tensor& output);

  // `input_shape_override` (if provided) overrides the shape of `input` for compute purposes
  // `output_shape_override` (if provided) overrides the shape of `output` for compute purposes
  static Status DoTranspose(const hipDeviceProp_t& prop,
                            hipStream_t stream,
                            const rocblas_handle rocblas_handle,
                            const gsl::span<const size_t>& permutations,
                            const Tensor& input, Tensor& output,
                            const TensorShape* input_shape_override = nullptr,
                            const TensorShape* output_shape_override = nullptr);
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/transpose_impl.cu  0 → 100644

#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/cu_inc/common.cuh"
#include "transpose_impl.h"

namespace onnxruntime {
namespace rocm {

constexpr unsigned int kNumElementsPerThread = 4;
constexpr unsigned int kTileSize = 32;

// TileSize for current implementation is always 32, but still use template parameter to make it flexible for future.
// For each batch, transpose matrix [m, n] to [n, m].
template <typename T, unsigned int TileSize>
__global__ void Transpose3DKernel(const int64_t m, const int64_t n, const int64_t batch_stride,
                                  const T* input_data, T* output_data) {
  __shared__ T tile[TileSize][TileSize + 1];

  int x = blockIdx.x * TileSize + threadIdx.x;
  int y = blockIdx.y * TileSize + threadIdx.y;

  if (x < n) {
#pragma unroll
    for (unsigned int i = 0; i < TileSize; i += (TileSize / kNumElementsPerThread)) {
      int y_idx = y + i;
      if (y_idx < m) {
        tile[threadIdx.y + i][threadIdx.x] = input_data[blockIdx.z * batch_stride + y_idx * n + x];
      }
    }
  }

  __syncthreads();

  x = blockIdx.y * TileSize + threadIdx.x;
  y = blockIdx.x * TileSize + threadIdx.y;

  if (x < m) {
#pragma unroll
    for (unsigned int i = 0; i < TileSize; i += (TileSize / kNumElementsPerThread)) {
      int y_idx = y + i;
      if (y_idx < n) {
        output_data[blockIdx.z * batch_stride + y_idx * m + x] = tile[threadIdx.x][threadIdx.y + i];
      }
    }
  }
}

bool CanDoTranspose3D(const hipDeviceProp_t& prop,
                      size_t rank, const gsl::span<const int64_t>& input_dims,
                      const gsl::span<const size_t>& permutations, dim3& grid_size, dim3& block_size) {
  // Permutation is done in the last two dimensions.
  if (rank == 3 && permutations[rank - 2] == (rank - 1) && permutations[rank - 1] == (rank - 2)) {
    // Normally maxGridSize.x is a large number but maxGridSize.y and maxGridSize.z are limited. Ideally we can check
    // the input sizes to see if a dimension is too large so that we can use grid.x for it to avoid returning false.
    // But this requires different versions of kernel implementation with different index compute logics.
    // Below code is good enough for most of the cases for now, and if we see any case that input_dims[0] or
    // input_dims[1] is too large in the future, we will handle it accordingly.
    int grid_size_x = CeilDiv(static_cast<int>(input_dims[2]), kTileSize);
    int grid_size_y = CeilDiv(static_cast<int>(input_dims[1]), kTileSize);
    int grid_size_z = static_cast<int>(input_dims[0]);
    if (grid_size_x <= prop.maxGridSize[0] && grid_size_y <= prop.maxGridSize[1] && grid_size_z <= prop.maxGridSize[2]) {
      block_size = dim3(kTileSize, kTileSize / kNumElementsPerThread);
      grid_size = dim3(static_cast<unsigned int>(grid_size_x),
                       static_cast<unsigned int>(grid_size_y),
                       static_cast<unsigned int>(grid_size_z));
      return true;
    } else {
      return false;
    }
  }
  return false;
}

#define HANDLE_TRANSPOSE_3D_TILE_DIM(type)                                                                        \
  case sizeof(type): {                                                                                            \
    Transpose3DKernel<type, kTileSize>                                                                            \
        <<<grid_size, block_size, 0, stream>>>(input_shape[1], input_shape[2], input_strides[0],                  \
                                               reinterpret_cast<const ToHipType<type>::MappedType*>(input_data),  \
                                               reinterpret_cast<ToHipType<type>::MappedType*>(output_data));      \
  } break

Status Transpose3DImpl(hipStream_t stream, size_t element_size,
                       const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides,
                       const void* input_data, void* output_data, int64_t N,
                       const dim3& grid_size, const dim3& block_size) {
  switch (element_size) {
    HANDLE_TRANSPOSE_3D_TILE_DIM(int8_t);
    HANDLE_TRANSPOSE_3D_TILE_DIM(int16_t);
    HANDLE_TRANSPOSE_3D_TILE_DIM(int32_t);
    HANDLE_TRANSPOSE_3D_TILE_DIM(int64_t);
    default:
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
                             element_size);
  }

  return Status::OK();
}

template <int element_size>
__global__ void Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim(
    const TArray<int64_t> input_strides, const void* input_data,
    const TArray<int64_t> output_strides, void* output_data,
    int64_t input_shape_2, HIP_LONG N) {
  // coordinates will be: [d0, d1, d2, d3]
  HIP_LONG d0 = blockIdx.z;
  HIP_LONG d1 = blockIdx.y;
  HIP_LONG d2 = threadIdx.y + blockIdx.x * blockDim.y;
  HIP_LONG d3 = threadIdx.x;

  HIP_LONG input_index = (d0 * input_strides[0] + d1 * input_strides[1] + d2 * input_strides[2]) /
                             (4 * sizeof(int) / element_size) +
                         d3 * input_strides[3];

  HIP_LONG output_index = (d0 * output_strides[0] + d1 * output_strides[1] + d2 * output_strides[2]) /
                              (4 * sizeof(int) / element_size) +
                          d3 * output_strides[3];

  const int4* v_input = reinterpret_cast<const int4*>(input_data);
  int4* v_output = reinterpret_cast<int4*>(output_data);

  if (input_index < N && output_index < N && d2 < input_shape_2) {
    v_output[output_index] = v_input[input_index];
  }
}

bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const hipDeviceProp_t& prop,
                                                                        size_t element_size,
                                                                        int32_t rank,
                                                                        const gsl::span<const int64_t>& input_dims,
                                                                        const gsl::span<const size_t>& permutations,
                                                                        dim3& grid_size, dim3& block_size) {
  if (rank == 4 &&
      // the permutations is not on the last dimension.
      permutations[3] == 3) {
    unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast<unsigned int>(element_size);  // int4 is used in the kernel to access data.

    // dims[3]: block.x
    // dims[2]: block.y + grid.x
    // dims[1]: grid.y
    // dims[0]: grid.z
    if (input_dims[3] / num_elements_per_thread <= prop.maxThreadsPerBlock &&
        (input_dims[3] % num_elements_per_thread) == 0 &&
        input_dims[1] <= prop.maxGridSize[1] &&
        input_dims[0] <= prop.maxGridSize[2]) {
      // There are 2 constrains when luanching the kernels
      // 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock
      // 2. block_size_y * num_block_ext >= input_dims[2]
      int64_t block_size_x = input_dims[3] / num_elements_per_thread;
      int64_t max_block_size_y = prop.maxThreadsPerBlock / block_size_x;
      int64_t block_size_y = min(input_dims[2], max_block_size_y);
      int64_t num_block_ext = CeilDiv(input_dims[2], block_size_y);

      if (num_block_ext <= prop.maxGridSize[0]) {
        block_size = dim3(static_cast<unsigned int>(block_size_x), static_cast<unsigned int>(block_size_y));
        grid_size = dim3(static_cast<unsigned int>(num_block_ext),
                         static_cast<unsigned int>(input_dims[1]),
                         static_cast<unsigned int>(input_dims[0]));
        return true;
      } else {
        return false;
      }
    }
  }
  return false;
}

Status Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim(
    hipStream_t stream, size_t element_size,
    const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides, const void* input_data,
    const TArray<int64_t>& output_strides, void* output_data, int N,
    const dim3& grid_size, const dim3& block_size) {
  unsigned int num_elements_per_thread = 4 * sizeof(int) / static_cast<unsigned int>(element_size);  // int4 is used in the kernel to access data.
  switch (element_size) {
    case sizeof(int8_t):
      hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int8_t)>),
                         grid_size, block_size, 0, stream,
                         input_strides, input_data, output_strides, output_data,
                         input_shape[2], N / num_elements_per_thread);
      break;
    case sizeof(int16_t):
      hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int16_t)>),
                         grid_size, block_size, 0, stream,
                         input_strides, input_data, output_strides, output_data,
                         input_shape[2], N / num_elements_per_thread);
      break;
    case sizeof(int32_t):
      hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int32_t)>),
                         grid_size, block_size, 0, stream,
                         input_strides, input_data, output_strides, output_data,
                         input_shape[2], N / num_elements_per_thread);
      break;
    case sizeof(int64_t):
      hipLaunchKernelGGL(HIP_KERNEL_NAME(Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<sizeof(int64_t)>),
                         grid_size, block_size, 0, stream,
                         input_strides, input_data, output_strides, output_data,
                         input_shape[2], N / num_elements_per_thread);
      break;
    default:
      // User will not hit this as this kernel is for fixed element size tensors only
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
                             element_size);
  }

  return Status::OK();
}

__global__ void Transpose4DKernelParallelizeOneElementPerThread(
    const TArray<int64_t> input_strides, const int8_t* input_data,
    const TArray<int64_t> output_strides, int8_t* output_data,
    size_t element_size, int64_t input_shape_2, HIP_LONG N) {
  // coordinates will be: [d0, d1, d2, d3]
  HIP_LONG d0 = blockIdx.z;
  HIP_LONG d1 = blockIdx.y;
  HIP_LONG d2 = threadIdx.y + blockIdx.x * blockDim.y;
  HIP_LONG d3 = threadIdx.x;

  HIP_LONG input_index = d0 * input_strides[0] + d1 * input_strides[1] + d2 * input_strides[2] + d3 * input_strides[3];
  HIP_LONG output_index = d0 * output_strides[0] + d1 * output_strides[1] + d2 * output_strides[2] + d3 * output_strides[3];

  if (input_index < N && output_index < N && d2 < input_shape_2) {
    const int8_t* input_data_to_be_copied = input_data + (input_index * element_size);
    int8_t* output_data_to_be_copied = output_data + (output_index * element_size);

    // copy over the bytes
    for (size_t iter = 0; iter < element_size; ++iter) {
      *output_data_to_be_copied++ = *input_data_to_be_copied++;
    }
  }
}

bool CanDoTranspose4DParallelizeOneElementPerThread(const hipDeviceProp_t& prop,
                                                    size_t element_size,
                                                    int32_t rank,
                                                    const gsl::span<const int64_t>& input_dims,
                                                    const gsl::span<const size_t>& permutations,
                                                    dim3& grid_size, dim3& block_size) {
  if (rank == 4) {
    // dims[3]: block.x
    // dims[2]: block.y + grid.x
    // dims[1]: grid.y
    // dims[0]: grid.z
    if (input_dims[3] <= prop.maxThreadsPerBlock &&
        input_dims[1] <= prop.maxGridSize[1] &&
        input_dims[0] <= prop.maxGridSize[2]) {
      // There are 2 constrains when luanching the kernels
      // 1. block_size_x * block_size_y <= prop.maxThreadsPerBlock
      // 2. block_size_y * num_block_ext >= input_dims[2]
      int64_t block_size_x = input_dims[3];
      int64_t max_block_size_y = prop.maxThreadsPerBlock / block_size_x;
      int64_t block_size_y = std::min(input_dims[2], max_block_size_y);
      int64_t num_block_ext = CeilDiv(input_dims[2], block_size_y);

      if (num_block_ext <= prop.maxGridSize[0]) {
        block_size = dim3(static_cast<unsigned int>(block_size_x), static_cast<unsigned int>(block_size_y));
        grid_size = dim3(static_cast<unsigned int>(num_block_ext),
                         static_cast<unsigned int>(input_dims[1]),
                         static_cast<unsigned int>(input_dims[0]));
        return true;
      } else {
        return false;
      }
    }
  }
  return false;
}

Status Transpose4DParallelizeOneElementPerThread(
    hipStream_t stream, size_t element_size,
    const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides, const void* input_data,
    const TArray<int64_t>& output_strides, void* output_data, int N,
    const dim3& grid_size, const dim3& block_size) {
  if (element_size != sizeof(int8_t) &&
      element_size != sizeof(int16_t) &&
      element_size != sizeof(int32_t) &&
      element_size != sizeof(int64_t)) {
    // User will not hit this as this kernel is for fixed element size tensors only
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
                           element_size);
  }

  hipLaunchKernelGGL(Transpose4DKernelParallelizeOneElementPerThread,
                     grid_size, block_size, 0, stream,
                     input_strides, reinterpret_cast<const int8_t*>(input_data),
                     output_strides, reinterpret_cast<int8_t*>(output_data),
                     element_size, input_shape[2], N);

  return Status::OK();
}

template <typename T>
__global__ void TransposeKernel(int32_t shape_rank, const TArray<int64_t> input_strides,
                                const T* input_data, const TArray<fast_divmod> output_strides,
                                T* output_data, HIP_LONG N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  HIP_LONG input_index = 0;
  HIP_LONG output_index = id;

#pragma unroll
  for (auto dim = 0; dim < input_strides.Capacity(); ++dim) {
    if (dim >= shape_rank) {
      break;
    }
    int out_coord, r;
    output_strides[dim].divmod(output_index, out_coord, r);
    output_index = r;
    input_index += input_strides[dim] * out_coord;
  }

  output_data[id] = input_data[input_index];
}

Status TransposeImpl(hipStream_t stream, size_t element_size, int32_t shape_rank, const TArray<int64_t>& input_strides,
                     const void* input_data, const TArray<fast_divmod>& fdm_output_strides, void* output_data, int N) {
  int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  switch (element_size) {
    case sizeof(int8_t):
      hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int8_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         shape_rank, input_strides,
                         reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
                         fdm_output_strides,
                         reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data), N);
      break;
    case sizeof(int16_t):
      hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int16_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         shape_rank, input_strides,
                         reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
                         fdm_output_strides,
                         reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data), N);
      break;
    case sizeof(int32_t):
      hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int32_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         shape_rank, input_strides,
                         reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
                         fdm_output_strides,
                         reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data), N);
      break;
    case sizeof(int64_t):
      hipLaunchKernelGGL(HIP_KERNEL_NAME(TransposeKernel<int64_t>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         shape_rank, input_strides,
                         reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
                         fdm_output_strides,
                         reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data), N);
      break;
    default:
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
                             element_size);
  }

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
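For readers skimming the diff, here is a minimal host-only sketch of how CanDoTranspose3D above sizes the launch: the last two dimensions are covered by 32x32 tiles (each thread handling kNumElementsPerThread rows of a tile) and the batch dimension becomes grid.z. Dim3, SizeTranspose3D, and the main routine are illustrative stand-ins and are not part of the commit.

// Standalone sketch (not part of the commit); plain C++ types stand in for HIP's dim3/hipDeviceProp_t.
#include <cassert>
#include <cstdint>
#include <iostream>

namespace sketch {

constexpr unsigned kTileSize = 32;
constexpr unsigned kNumElementsPerThread = 4;

struct Dim3 { unsigned x, y, z; };  // stand-in for HIP's dim3

constexpr int64_t CeilDiv(int64_t a, int64_t b) { return (a + b - 1) / b; }

// For a [batch, m, n] tensor transposed in its last two dims, one block covers a
// 32x32 tile and each thread copies kNumElementsPerThread rows of that tile.
inline void SizeTranspose3D(int64_t batch, int64_t m, int64_t n, Dim3& grid, Dim3& block) {
  block = {kTileSize, kTileSize / kNumElementsPerThread, 1};  // 32 x 8 threads per block
  grid = {static_cast<unsigned>(CeilDiv(n, kTileSize)),       // tiles along n
          static_cast<unsigned>(CeilDiv(m, kTileSize)),       // tiles along m
          static_cast<unsigned>(batch)};                      // one z-slice per batch
}

}  // namespace sketch

int main() {
  sketch::Dim3 grid{}, block{};
  sketch::SizeTranspose3D(/*batch=*/8, /*m=*/100, /*n=*/70, grid, block);
  assert(grid.x == 3 && grid.y == 4 && grid.z == 8);  // ceil(70/32)=3, ceil(100/32)=4
  std::cout << "grid " << grid.x << "x" << grid.y << "x" << grid.z
            << ", block " << block.x << "x" << block.y << "\n";
  return 0;
}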
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/transpose_impl.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

bool CanDoTranspose3D(const hipDeviceProp_t& prop,
                      size_t rank, const gsl::span<const int64_t>& input_dims,
                      const gsl::span<const size_t>& permutations, dim3& grid_size, dim3& block_size);
Status Transpose3DImpl(hipStream_t stream, size_t element_size,
                       const TArray<int64_t>& input_shape, const TArray<int64_t>& input_strides,
                       const void* input_data, void* output_data, int64_t N,
                       const dim3& grid_size, const dim3& block_size);
bool CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(const hipDeviceProp_t& prop,
                                                                        size_t element_size, int32_t rank,
                                                                        const gsl::span<const int64_t>& input_dims,
                                                                        const gsl::span<const size_t>& permutations,
                                                                        dim3& grid_size, dim3& block_size);
Status Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim(hipStream_t stream, size_t element_size,
                                                                     const TArray<int64_t>& input_shape,
                                                                     const TArray<int64_t>& input_strides,
                                                                     const void* input_data,
                                                                     const TArray<int64_t>& output_strides,
                                                                     void* output_data, int N,
                                                                     const dim3& grid_size, const dim3& block_size);
bool CanDoTranspose4DParallelizeOneElementPerThread(const hipDeviceProp_t& prop,
                                                    size_t element_size, int32_t rank,
                                                    const gsl::span<const int64_t>& input_dims,
                                                    const gsl::span<const size_t>& permutations,
                                                    dim3& grid_size, dim3& block_size);
Status Transpose4DParallelizeOneElementPerThread(hipStream_t stream, size_t element_size,
                                                 const TArray<int64_t>& input_shape,
                                                 const TArray<int64_t>& input_strides,
                                                 const void* input_data,
                                                 const TArray<int64_t>& output_strides,
                                                 void* output_data, int N,
                                                 const dim3& grid_size, const dim3& block_size);
Status TransposeImpl(hipStream_t stream, size_t element_size, int32_t shape_rank, const TArray<int64_t>& input_strides,
                     const void* input_data, const TArray<fast_divmod>& fdm_output_strides, void* output_data, int N);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/trilu.cc  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/trilu.h"
#include "core/providers/rocm/tensor/trilu_impl.h"
#include "core/providers/cpu/tensor/utils.h"

using namespace onnxruntime::common;

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_KERNEL_EX(
    Trilu,
    kOnnxDomain,
    14,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .MayInplace(0, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Trilu);

Status Trilu::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor* input_ptr = ctx->Input<Tensor>(0);
  const auto* k = ctx->Input<Tensor>(1);
  int64_t k_val = 0;
  if (k) {
    ORT_ENFORCE(IsScalarOr1ElementVector(k), "k should be a 1-D or 0-D tensor.");
    k_val = *(k->Data<int64_t>());
  }

  if (input_ptr == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
  const Tensor& input = *input_ptr;
  const auto& shape = input.Shape();
  const auto& input_dims = shape.GetDims();
  int32_t rank = gsl::narrow_cast<int32_t>(input_dims.size());

  if (rank < 2) {
    return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Input tensor should have a rank of at least 2");
  }

  Tensor* output = ctx->Output(0, shape);
  int64_t matrix_size = input_dims[rank - 1] * input_dims[rank - 2];
  if (matrix_size == 0) {
    return Status::OK();
  }

  const fast_divmod row_col_divmod_indices(gsl::narrow_cast<int>(input_dims[rank - 1]));
  const fast_divmod batch_divmod_indices(gsl::narrow_cast<int>(matrix_size));

  size_t element_size = input.DataType()->Size();
  return TriluImpl(
      this->Stream(),
      upper_,
      element_size,
      k_val,
      input.DataRaw(),
      output->MutableDataRaw(),
      gsl::narrow<int>(shape.Size()),
      batch_divmod_indices,
      row_col_divmod_indices);
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/trilu.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

class Trilu final : public RocmKernel {
 public:
  Trilu(const OpKernelInfo& info) : RocmKernel(info), upper_(info.GetAttrOrDefault<int64_t>("upper", 1) >= 1) {
  }
  ~Trilu() = default;
  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  bool upper_;
};

}  // namespace rocm
}  // namespace onnxruntime
\ No newline at end of file
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/trilu_impl.cu  0 → 100644

#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/cu_inc/common.cuh"
#include "trilu_impl.h"
#include <stdio.h>

namespace onnxruntime {
namespace rocm {

template <typename T, bool upper>
__global__ void TriluKernel(
    int64_t k,
    const T* input_data,
    T* output_data,
    const HIP_LONG N,
    const fast_divmod batch_divmod_indices,
    const fast_divmod row_col_divmod_indices) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int row, col;
  row_col_divmod_indices.divmod(batch_divmod_indices.mod(id), row, col);
  output_data[id] = upper ? (((row + k) <= col) ? input_data[id] : 0) : (((row + k) >= col) ? input_data[id] : 0);
}

Status TriluImpl(
    hipStream_t stream,
    bool upper,
    size_t element_size,
    int64_t k,
    const void* input_data,
    void* output_data,
    int N,
    const fast_divmod& batch_divmod_indices,
    const fast_divmod& row_col_divmod_indices) {
  int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  switch (element_size) {
    case sizeof(int8_t):
      if (upper) {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int8_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           k,
                           reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
                           reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
                           (HIP_LONG)N, batch_divmod_indices, row_col_divmod_indices);
      } else {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int8_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           k,
                           reinterpret_cast<const ToHipType<int8_t>::MappedType*>(input_data),
                           reinterpret_cast<ToHipType<int8_t>::MappedType*>(output_data),
                           (HIP_LONG)N, batch_divmod_indices, row_col_divmod_indices);
      }
      break;
    case sizeof(int16_t):
      if (upper) {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int16_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           k,
                           reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
                           reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
                           (HIP_LONG)N, batch_divmod_indices, row_col_divmod_indices);
      } else {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int16_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           k,
                           reinterpret_cast<const ToHipType<int16_t>::MappedType*>(input_data),
                           reinterpret_cast<ToHipType<int16_t>::MappedType*>(output_data),
                           (HIP_LONG)N, batch_divmod_indices, row_col_divmod_indices);
      }
      break;
    case sizeof(int32_t):
      if (upper) {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int32_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           k,
                           reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
                           reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
                           (HIP_LONG)N, batch_divmod_indices, row_col_divmod_indices);
      } else {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int32_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           k,
                           reinterpret_cast<const ToHipType<int32_t>::MappedType*>(input_data),
                           reinterpret_cast<ToHipType<int32_t>::MappedType*>(output_data),
                           (HIP_LONG)N, batch_divmod_indices, row_col_divmod_indices);
      }
      break;
    case sizeof(int64_t):
      if (upper) {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int64_t, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           k,
                           reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
                           reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
                           (HIP_LONG)N, batch_divmod_indices, row_col_divmod_indices);
      } else {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(TriluKernel<int64_t, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           k,
                           reinterpret_cast<const ToHipType<int64_t>::MappedType*>(input_data),
                           reinterpret_cast<ToHipType<int64_t>::MappedType*>(output_data),
                           (HIP_LONG)N, batch_divmod_indices, row_col_divmod_indices);
      }
      break;
    default:
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for transpose on ROCM. Element size was ",
                             element_size);
  }
  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
\ No newline at end of file
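A standalone CPU sketch (not part of the commit) of the keep/zero predicate used by TriluKernel above: element (row, col) of each matrix is kept when row + k <= col (upper) or row + k >= col (lower). TriluReference and the main routine are illustrative names.

// CPU reference sketch of the Trilu predicate; hypothetical helper, not the provider code.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int> TriluReference(const std::vector<int>& input, int64_t rows, int64_t cols,
                                int64_t k, bool upper) {
  std::vector<int> output(input.size(), 0);
  for (int64_t row = 0; row < rows; ++row) {
    for (int64_t col = 0; col < cols; ++col) {
      const bool keep = upper ? (row + k) <= col : (row + k) >= col;
      if (keep) output[row * cols + col] = input[row * cols + col];
    }
  }
  return output;
}

int main() {
  std::vector<int> m = {1, 2, 3,
                        4, 5, 6,
                        7, 8, 9};
  auto upper = TriluReference(m, 3, 3, /*k=*/0, /*upper=*/true);
  for (int64_t r = 0; r < 3; ++r) {
    for (int64_t c = 0; c < 3; ++c) std::cout << upper[r * 3 + c] << ' ';
    std::cout << '\n';  // prints the upper triangle: 1 2 3 / 0 5 6 / 0 0 9
  }
  return 0;
}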
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/trilu_impl.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

Status TriluImpl(
    hipStream_t stream,
    bool upper,
    size_t element_size,
    int64_t k,
    const void* input_data,
    void* output_data,
    int N,
    const fast_divmod& batch_divmod_indices,
    const fast_divmod& row_col_divmod_indices);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/unsqueeze.cc  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/unsqueeze.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Unsqueeze,
    kOnnxDomain,
    1, 10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .Alias(0, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Unsqueeze);

// explicitly support negative axis
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Unsqueeze,
    kOnnxDomain,
    11, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .Alias(0, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Unsqueeze);

// axes is input instead of attribute, support bfloat16
ONNX_OPERATOR_KERNEL_EX(
    Unsqueeze,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .Alias(0, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    Unsqueeze);

Status Unsqueeze::ComputeInternal(OpKernelContext* ctx) const {
  Prepare p;
  ORT_RETURN_IF_ERROR(PrepareCompute(ctx, p));

  const void* input = p.input_tensor->DataRaw();
  void* output = p.output_tensor->MutableDataRaw();
  if (input == output)
    return Status::OK();

  auto count = p.input_tensor->Shape().Size();
  auto element_bytes = p.input_tensor->DataType()->Size();
  HIP_RETURN_IF_ERROR(hipMemcpyAsync(output, input, count * element_bytes, hipMemcpyDeviceToDevice, Stream()));

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
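The kernel above only issues a device-to-device copy because inserting axes of size 1 never changes the flat memory layout; only the shape metadata differs. A host-side sketch of that shape derivation follows; UnsqueezeShape is a hypothetical helper written for illustration, not part of the commit, and it assumes the axes are already sorted and normalized to non-negative values.

// Host-side sketch (not part of the commit) of how an unsqueezed shape would be built.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> UnsqueezeShape(const std::vector<int64_t>& input_dims,
                                    const std::vector<int64_t>& axes /* sorted, normalized */) {
  std::vector<int64_t> output_dims;
  size_t in = 0, ax = 0;
  const size_t out_rank = input_dims.size() + axes.size();
  for (size_t i = 0; i < out_rank; ++i) {
    if (ax < axes.size() && static_cast<int64_t>(i) == axes[ax]) {
      output_dims.push_back(1);  // inserted axis of size 1
      ++ax;
    } else {
      output_dims.push_back(input_dims[in++]);
    }
  }
  return output_dims;
}

int main() {
  auto out = UnsqueezeShape({3, 4}, {0, 3});
  assert((out == std::vector<int64_t>{1, 3, 4, 1}));  // element count (12) is unchanged
  return 0;
}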
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/unsqueeze.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/unsqueeze.h"

namespace onnxruntime {
namespace rocm {

class Unsqueeze final : public UnsqueezeBase, public RocmKernel {
 public:
  Unsqueeze(const OpKernelInfo& info) : UnsqueezeBase(info), RocmKernel(info) {}
  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/upsample.cc  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "upsample.h"
#include "upsample_impl.h"
#include "core/providers/rocm/tensor/resize_impl.h"
#include "core/providers/cpu/tensor/utils.h"

using namespace onnxruntime::common;

namespace onnxruntime {
namespace rocm {

#define REGISTER_VERSIONED_TYPED_KERNEL(T, start, end)            \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                        \
      Upsample,                                                   \
      kOnnxDomain,                                                \
      start,                                                      \
      end,                                                        \
      T,                                                          \
      kRocmExecutionProvider,                                     \
      (*KernelDefBuilder::Create())                               \
          .InputMemoryType(OrtMemTypeCPUInput, 1)                 \
          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
      Upsample<T>)

REGISTER_VERSIONED_TYPED_KERNEL(float, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(double, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(MLFloat16, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 7, 8);
REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 7, 8);

// Upsample was deprecated in opset 10
REGISTER_VERSIONED_TYPED_KERNEL(float, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(double, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(MLFloat16, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(int32_t, 9, 9);
REGISTER_VERSIONED_TYPED_KERNEL(uint8_t, 9, 9);

template <typename T>
Status Upsample<T>::BaseCompute(OpKernelContext* context,
                                const std::vector<float>& roi,
                                const std::vector<float>& scales,
                                const gsl::span<const int64_t>& output_dims) const {
  const Tensor* X = context->Input<Tensor>(0);
  auto X_dims = X->Shape().GetDims();
  int32_t rank = static_cast<int32_t>(X_dims.size());

  ORT_ENFORCE(static_cast<int32_t>(output_dims.size()) == rank, "Rank of input and output tensor should be same.");
  if (rank == 0)
    return Status(ONNXRUNTIME, INVALID_ARGUMENT,
                  is_resize_ ? "Resize: input tensor cannot be scalar."
                             : "Upsample: input tensor cannot be scalar.");
  if (rank != static_cast<int32_t>(scales.size()))
    return Status(ONNXRUNTIME, INVALID_ARGUMENT,
                  is_resize_ ? "Resize: input tensor's dimension does not match the scales."
                             : "Upsample: input tensor's dimension does not match the scales.");
  if (roi.size() != 2 * X->Shape().GetDims().size())
    return Status(ONNXRUNTIME, INVALID_ARGUMENT,
                  "Resize: size of roi array should be 2 * N where N is the rank of input tensor X.");

  Tensor* Y = context->Output(0, output_dims);

  // Return early if the output tensor is going to be of size 0
  if (Y->Shape().Size() == 0) {
    return Status::OK();
  }

  typedef typename ToHipType<T>::MappedType HipT;

  // kernel
  TensorPitches input_pitches(X_dims);
  TArray<int64_t> input_strides(input_pitches);

  TensorPitches output_pitches(output_dims);
  TArray<fast_divmod> output_div_pitches(rank);

  for (int32_t i = 0; i < rank; ++i) {
    output_div_pitches[i] = fast_divmod(gsl::narrow_cast<int>(output_pitches[i]));
  }
  size_t output_count = Y->Shape().Size();

  if (is_resize_) {
    TArray<int64_t> input_shape(X_dims);
    TArray<int64_t> output_shape(output_dims);
    TArray<float, 10> roi_vals(roi);
    TArray<float> scales_vals(scales);

    size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims);
    auto dims_mapping_buffer = GetScratchBuffer<unsigned char>(temp_buffer_size);
    void* dims_mapping = reinterpret_cast<void*>(dims_mapping_buffer.get());
    ResizeImpl(Stream(), mode_, (int)rank, input_shape, output_shape,
               input_strides, output_div_pitches, scales_vals, roi_vals,
               reinterpret_cast<const HipT*>(X->Data<T>()),
               reinterpret_cast<HipT*>(Y->MutableData<T>()),
               output_count, use_extrapolation_, ToHipType<T>::FromFloat(extrapolation_value_),
               cubic_coeff_a_, exclude_outside_,
               coordinate_transform_mode_, nearest_mode_,
               dims_mapping);
  } else {
    TArray<fast_divmod> scales_div(rank);

    for (int32_t i = 0; i < rank; ++i) {
      scales_div[i] = fast_divmod(gsl::narrow_cast<int>(ceil(scales[i])));
    }

    UpampleImpl(Stream(),
                mode_,
                rank,
                (UpsampleMode::LINEAR == mode_) ? (rank == 2 ? X_dims[0] : X_dims[2]) : 0,
                input_strides,
                output_div_pitches,
                scales_div,
                reinterpret_cast<const HipT*>(X->Data<T>()),
                reinterpret_cast<HipT*>(Y->MutableData<T>()),
                output_count);
  }

  return Status::OK();
}

template <typename T>
Status Upsample<T>::ComputeInternal(OpKernelContext* context) const {
  const Tensor* X = context->Input<Tensor>(0);
  ORT_ENFORCE(X != nullptr);

  TensorShapeVector output_dims(X->Shape().GetDims().size());
  std::vector<float> roi_array(X->Shape().GetDims().size() * 2, 0.0f);
  if (!roi_cached_) {
    bool use_default_roi = true;
    if (need_roi_input_) {
      ORT_ENFORCE(roi_input_idx_ > 0, "Invalid roi input index.");
      const auto* roi = context->Input<Tensor>(roi_input_idx_);
      if (roi != nullptr) {
        ParseRoiData(roi, roi_array);
        use_default_roi = false;
      }
    }
    if (use_default_roi) {
      // default roi includes ensures all the values in that axis are included in the roi
      // normalized roi is thus : [start, end] = [0, 1]
      const auto input_dims = X->Shape().GetDims();
      size_t input_rank = input_dims.size();
      roi_array.resize(input_rank * 2);
      for (size_t i = 0; i < input_rank; ++i) {
        roi_array[i] = 0;
        roi_array[i + input_rank] = 1;
      }
    }
  }

  const std::vector<float>& roi = roi_cached_ ? roi_ : roi_array;

  if (OpKernel::Node().InputDefs().size() == 1) {
    // Compute output shape from scales and input dims
    ComputeOutputShape(scales_, X->Shape().GetDims(), output_dims);
    return BaseCompute(context, roi, scales_, output_dims);
  }

  const Tensor* scales = context->Input<Tensor>(scales_input_idx_);
  const Tensor* sizes = context->Input<Tensor>(sizes_input_idx_);

  if (scales_cached_) {
    ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input.");
    ComputeOutputShape(scales_, X->Shape().GetDims(), output_dims);
    return BaseCompute(context, roi, scales_, output_dims);
  }

  std::vector<float> scales_array(X->Shape().GetDims().size());

  if (scales != nullptr && scales->Shape().Size() != 0) {
    // use scales input data
    ORT_ENFORCE(sizes == nullptr, "Only one of scales or sizes must be provided as input.");
    ParseScalesData(scales, scales_array);
    ComputeOutputShape(scales_array, X->Shape().GetDims(), output_dims);
  } else {
    // When sizes input is available directly populate it into the output_dims array.
    ORT_ENFORCE(sizes != nullptr && sizes->Shape().Size() != 0,
                "Either scales or sizes MUST be provided as input.");
    ORT_ENFORCE(sizes->Shape().Size() == static_cast<int64_t>(output_dims.size()),
                "Resize: input tensor's rank does not match the output tensor's rank.");
    memcpy(output_dims.data(), sizes->Data<int64_t>(), sizes->Shape().Size() * sizeof(int64_t));
    ParseScalesDataFromOutputSize(output_dims, X->Shape().GetDims(), scales_array);
  }

  return BaseCompute(context, roi, scales_array, output_dims);
}

}  // namespace rocm
}  // namespace onnxruntime
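ComputeOutputShape used above is inherited from the shared UpsampleBase and is not shown in this commit. As a rough orientation only, the sketch below assumes the usual rule output_dim = floor(input_dim * scale); OutputDimsFromScales and the main routine are hypothetical names introduced for illustration.

// Sketch (not part of the commit), assuming output_dim = floor(input_dim * scale).
#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<int64_t> OutputDimsFromScales(const std::vector<int64_t>& input_dims,
                                          const std::vector<float>& scales) {
  assert(input_dims.size() == scales.size());  // BaseCompute enforces matching ranks
  std::vector<int64_t> output_dims(input_dims.size());
  for (size_t i = 0; i < input_dims.size(); ++i) {
    output_dims[i] = static_cast<int64_t>(std::floor(static_cast<double>(input_dims[i]) * scales[i]));
  }
  return output_dims;
}

int main() {
  // NCHW image upscaled 2x in H and W, matching the [1.0, 1.0, height_scale, width_scale]
  // layout described in the bilinear kernel comments of upsample_impl.cu below.
  auto out = OutputDimsFromScales({1, 3, 32, 32}, {1.0f, 1.0f, 2.0f, 2.0f});
  assert((out == std::vector<int64_t>{1, 3, 64, 64}));
  return 0;
}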
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/upsample.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/upsamplebase.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
class Upsample : public UpsampleBase, public RocmKernel {
 public:
  Upsample(const OpKernelInfo& info) : UpsampleBase(info), RocmKernel(info) {
  }

  Status ComputeInternal(OpKernelContext* context) const override;
  Status BaseCompute(OpKernelContext* context,
                     const std::vector<float>& roi,
                     const std::vector<float>& scales,
                     const gsl::span<const int64_t>& output_dims) const;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/upsample_impl.cu  0 → 100644

#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/cu_inc/common.cuh"
#include "upsample_impl.h"

namespace onnxruntime {
namespace rocm {

template <typename T, int RANK>
__global__ void _UpampleNearestKernel(const TArray<int64_t> input_pitches,
                                      const TArray<fast_divmod> output_div_pitches,
                                      const TArray<fast_divmod> scales_div,
                                      const T* __restrict__ input_data,
                                      T* __restrict__ output_data,
                                      const size_t N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  HIP_LONG input_index = 0;
  HIP_LONG output_index = id;

  int div, mod;
  for (int dim = 0; dim < RANK; ++dim) {
    output_div_pitches[dim].divmod(output_index, div, mod);
    output_index = mod;
    if (scales_div[dim].d_ != 1 && div > 0) {
      scales_div[dim].divmod(div, div, mod);
    }
    input_index += input_pitches[dim] * div;
  }

  output_data[id] = input_data[input_index];
}

// The following method supports a 4-D input in 'Linear mode'
// that amounts to 'Bilinear' Upsampling/Resizing in the sense that it assumes
// the scale values for the outermost 2 dimensions are 1.
// This is the common use-case where the 4-D input (batched multi-channel images)
// is usually of shape [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale]
template <typename T>
__global__ void _UpampleBilinear4DInputKernel(const int64_t input_dim2,
                                              const TArray<int64_t> input_pitches,
                                              const TArray<fast_divmod> output_div_pitches,
                                              const TArray<fast_divmod> scales_div,
                                              const T* __restrict__ input_data,
                                              T* __restrict__ output_data,
                                              const size_t N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  HIP_LONG input_index = 0;

  // For bilinear mode, scales[0]=scales[1]=1
  int mod;
  int index_of_dim0, index_of_dim1, index_of_dim2, index_of_dim3;
  output_div_pitches[0].divmod(id, index_of_dim0, mod);
  output_div_pitches[1].divmod(mod, index_of_dim1, mod);
  output_div_pitches[2].divmod(mod, index_of_dim2, mod);
  index_of_dim3 = mod;
  int index_of_input_dim2, index_of_input_dim3, x_offset, y_offset;
  scales_div[2].divmod(index_of_dim2, index_of_input_dim2, y_offset);
  scales_div[3].divmod(index_of_dim3, index_of_input_dim3, x_offset);

  input_index = index_of_dim0 * input_pitches[0] +
                index_of_dim1 * input_pitches[1] +
                index_of_input_dim2 * input_pitches[2] +
                index_of_input_dim3;

  T x00 = input_data[input_index];
  T x10, x01, x11;

  bool end_of_dim2 = false;
  if (index_of_input_dim2 == (input_dim2 - 1)) {
    // It's the end in dimension 2
    x01 = x00;
    end_of_dim2 = true;
  } else {
    x01 = input_data[input_index + input_pitches[2]];
  }

  if (index_of_input_dim3 == (input_pitches[2] - 1)) {
    // It's the end in dimension 3
    x10 = x00;
    x11 = x01;
  } else {
    x10 = input_data[input_index + 1];
    x11 = end_of_dim2 ? x10 : input_data[input_index + input_pitches[2] + 1];
  }

  T y_offset_T = static_cast<T>(y_offset);
  T x_offset_T = static_cast<T>(x_offset);
  T scales_div2_T = static_cast<T>(scales_div[2].d_);
  T scales_div3_T = static_cast<T>(scales_div[3].d_);
  T y0 = x00 + static_cast<T>(y_offset_T * (x01 - x00) / scales_div2_T);
  T y1 = x10 + static_cast<T>(y_offset_T * (x11 - x10) / scales_div2_T);

  output_data[id] = y0 + static_cast<T>(x_offset_T * (y1 - y0) / scales_div3_T);
}

// The following method supports a 2-D input in 'Linear mode'
template <typename T>
__global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0,
                                              const TArray<int64_t> input_pitches,
                                              const TArray<fast_divmod> output_div_pitches,
                                              const TArray<fast_divmod> scales_div,
                                              const T* __restrict__ input_data,
                                              T* __restrict__ output_data,
                                              const size_t N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  HIP_LONG input_index = 0;

  int mod;
  int index_of_dim0, index_of_dim1;
  output_div_pitches[0].divmod(id, index_of_dim0, mod);
  index_of_dim1 = mod;
  int index_of_input_dim0, index_of_input_dim1, x_offset, y_offset;
  scales_div[0].divmod(index_of_dim0, index_of_input_dim0, y_offset);
  scales_div[1].divmod(index_of_dim1, index_of_input_dim1, x_offset);

  input_index = index_of_input_dim0 * input_pitches[0] + index_of_input_dim1;

  T x00 = input_data[input_index];
  T x10, x01, x11;

  bool end_of_dim0 = false;
  if (index_of_input_dim0 == (input_dim0 - 1)) {
    // It's the end in dimension 0
    x01 = x00;
    end_of_dim0 = true;
  } else {
    x01 = input_data[input_index + input_pitches[0]];
  }

  if (index_of_input_dim1 == (input_pitches[0] - 1)) {
    // It's the end in dimension 1
    x10 = x00;
    x11 = x01;
  } else {
    x10 = input_data[input_index + 1];
    x11 = end_of_dim0 ? x10 : input_data[input_index + input_pitches[0] + 1];
  }

  T y_offset_T = static_cast<T>(y_offset);
  T x_offset_T = static_cast<T>(x_offset);
  T scales_div0_T = static_cast<T>(scales_div[0].d_);
  T scales_div1_T = static_cast<T>(scales_div[1].d_);
  T y0 = x00 + static_cast<T>(y_offset_T * (x01 - x00) / scales_div0_T);
  T y1 = x10 + static_cast<T>(y_offset_T * (x11 - x10) / scales_div0_T);

  output_data[id] = y0 + static_cast<T>(x_offset_T * (y1 - y0) / scales_div1_T);
}

template <typename T>
void UpampleImpl(hipStream_t stream,
                 const onnxruntime::UpsampleMode upsample_mode,
                 const size_t rank,
                 const int64_t input_dim2,
                 const TArray<int64_t>& input_pitches,
                 const TArray<fast_divmod>& output_div_pitches,
                 const TArray<fast_divmod>& scales_div,
                 const T* input_data,
                 T* output_data,
                 const size_t N) {
  int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  if (onnxruntime::UpsampleMode::NN == upsample_mode) {
    if (rank == 4) {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 4>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         input_pitches, output_div_pitches, scales_div,
                         input_data, output_data, N);
    } else if (rank == 3) {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 3>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         input_pitches, output_div_pitches, scales_div,
                         input_data, output_data, N);
    } else if (rank == 2) {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         input_pitches, output_div_pitches, scales_div,
                         input_data, output_data, N);
    } else if (rank == 1) {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleNearestKernel<T, 1>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         input_pitches, output_div_pitches, scales_div,
                         input_data, output_data, N);
    } else {
      ORT_THROW("Unsupported rank by the Upsample ROCM kernel. Input rank: ", rank);
    }
  } else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode) {
    if (rank == 4) {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleBilinear4DInputKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         input_dim2, input_pitches, output_div_pitches, scales_div,
                         input_data, output_data, N);
    } else if (rank == 2) {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_UpampleBilinear2DInputKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         input_dim2, input_pitches, output_div_pitches, scales_div,
                         input_data, output_data, N);
    } else {
      ORT_THROW("Unsupported rank by the Upsample ROCM kernel. Input rank: ", rank);
    }
  } else {
    // Should never encounter this as Upsample only supports 'Nearest' and 'Linear' modes.
    // But if we do encounter this it is best to throw instead of returning silently.
    ORT_THROW("Unsupported mode for Upsample: ", upsample_mode);
  }
}

#define SPECIALIZED_IMPL(T)                                                    \
  template void UpampleImpl<T>(hipStream_t stream,                            \
                               const onnxruntime::UpsampleMode upsample_mode, \
                               const size_t rank,                             \
                               const int64_t input_dim2,                      \
                               const TArray<int64_t>& input_pitches,          \
                               const TArray<fast_divmod>& output_div_pitches, \
                               const TArray<fast_divmod>& scales_div,         \
                               const T* input_data,                           \
                               T* output_data,                                \
                               const size_t N);

SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(uint8_t)

}  // namespace rocm
}  // namespace onnxruntime
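A CPU sketch (not part of the commit) of the integer-offset bilinear blend used by _UpampleBilinear2DInputKernel above: each output coordinate is split into an input index and a fractional offset via integer div/mod against the integer scale, neighbours are clamped at the border, and the two axis blends are combined. BilinearSample and the main routine are illustrative names.

// CPU reference sketch of the bilinear blend; hypothetical helper, not the provider code.
#include <cstdint>
#include <iostream>
#include <vector>

float BilinearSample(const std::vector<float>& in, int64_t h, int64_t w,
                     int64_t out_y, int64_t out_x, int64_t scale_y, int64_t scale_x) {
  const int64_t iy = out_y / scale_y, ry = out_y % scale_y;  // input row + offset
  const int64_t ix = out_x / scale_x, rx = out_x % scale_x;  // input col + offset
  const auto at = [&](int64_t y, int64_t x) { return in[y * w + x]; };
  const float x00 = at(iy, ix);
  const float x01 = (iy == h - 1) ? x00 : at(iy + 1, ix);    // clamp at bottom edge
  const float x10 = (ix == w - 1) ? x00 : at(iy, ix + 1);    // clamp at right edge
  const float x11 = (ix == w - 1) ? x01 : (iy == h - 1) ? x10 : at(iy + 1, ix + 1);
  const float y0 = x00 + static_cast<float>(ry) * (x01 - x00) / static_cast<float>(scale_y);
  const float y1 = x10 + static_cast<float>(ry) * (x11 - x10) / static_cast<float>(scale_y);
  return y0 + static_cast<float>(rx) * (y1 - y0) / static_cast<float>(scale_x);
}

int main() {
  const std::vector<float> img = {0.f, 2.f,
                                  4.f, 6.f};                  // 2x2 input
  // Upscale 2x: odd output rows/cols fall halfway between input samples.
  std::cout << BilinearSample(img, 2, 2, 1, 1, 2, 2) << "\n"; // prints 3 (average of all four)
  return 0;
}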
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/upsample_impl.h  0 → 100644

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
#include "core/providers/cpu/tensor/upsamplebase.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
void UpampleImpl(hipStream_t stream,
                 const onnxruntime::UpsampleMode upsample_mode,
                 const size_t rank,
                 const int64_t input_dim2,
                 const TArray<int64_t>& input_pitches,
                 const TArray<fast_divmod>& output_div_pitches,
                 const TArray<fast_divmod>& scales_div,
                 const T* input_data,
                 T* output_data,
                 const size_t N);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where.cc
0 → 100644
View file @
1a91fcc2
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "where.h"
#include "where_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace
onnxruntime
{
namespace
rocm
{
// kernel builder functions
#define WHERE_TYPED_KERNEL_WITH_TYPE_NAME(T, TName) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Where, \
kOnnxDomain, \
9, \
15, \
TName, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Where<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Where, \
kOnnxDomain, \
16, \
TName, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Where<T>);
// Compute where operator output shape based upon three way broad-casting.
Status
ComputeOutputShape
(
const
std
::
string
&
node_name
,
const
TensorShape
&
cond_shape
,
const
TensorShape
&
x_shape
,
const
TensorShape
&
y_shape
,
TensorShape
&
out_shape
)
{
size_t
cond_rank
=
cond_shape
.
NumDimensions
();
size_t
x_rank
=
x_shape
.
NumDimensions
();
size_t
y_rank
=
y_shape
.
NumDimensions
();
size_t
out_rank
=
std
::
max
(
std
::
max
(
cond_rank
,
x_rank
),
y_rank
);
std
::
vector
<
int64_t
>
output_dims
(
out_rank
,
0
);
for
(
size_t
i
=
0
;
i
<
out_rank
;
++
i
)
{
int64_t
cond_dim
=
1
;
if
(
i
<
cond_rank
)
cond_dim
=
cond_shape
[
cond_rank
-
1
-
i
];
int64_t
x_dim
=
1
;
if
(
i
<
x_rank
)
x_dim
=
x_shape
[
x_rank
-
1
-
i
];
int64_t
y_dim
=
1
;
if
(
i
<
y_rank
)
y_dim
=
y_shape
[
y_rank
-
1
-
i
];
int64_t
out_dim
=
std
::
max
(
std
::
max
(
cond_dim
,
x_dim
),
y_dim
);
// special case to handle a dim of 0 which can be broadcast with a 1
if
(
out_dim
==
1
)
out_dim
=
std
::
min
(
std
::
min
(
cond_dim
,
x_dim
),
y_dim
);
if
(
cond_dim
!=
out_dim
&&
cond_dim
!=
1
)
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
node_name
,
": condition operand cannot broadcast on dim "
,
cond_rank
-
1
-
i
,
" Condition Shape: "
,
cond_shape
.
ToString
(),
", X Shape: "
,
x_shape
.
ToString
(),
", Y Shape: "
,
y_shape
.
ToString
());
if
(
x_dim
!=
out_dim
&&
x_dim
!=
1
)
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
node_name
,
": X operand cannot broadcast on dim "
,
x_rank
-
1
-
i
,
" Condition Shape: "
,
cond_shape
.
ToString
(),
", X Shape: "
,
x_shape
.
ToString
(),
", Y Shape: "
,
y_shape
.
ToString
());
if
(
y_dim
!=
out_dim
&&
y_dim
!=
1
)
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
node_name
,
": Y operand cannot broadcast on dim "
,
y_rank
-
1
-
i
,
" Condition Shape: "
,
cond_shape
.
ToString
(),
", X Shape: "
,
x_shape
.
ToString
(),
", Y Shape: "
,
y_shape
.
ToString
());
output_dims
[
out_rank
-
1
-
i
]
=
out_dim
;
}
out_shape
=
TensorShape
(
output_dims
);
return
Status
::
OK
();
}
struct
TernaryElementwisePreparation
{
const
Tensor
*
a_tensor
=
nullptr
;
const
Tensor
*
b_tensor
=
nullptr
;
const
Tensor
*
c_tensor
=
nullptr
;
size_t
output_rank_or_simple_broadcast
=
0
;
// for no_broadcast cases, output_rank uses SimpleBroadcast enums
TArray
<
int64_t
>
a_padded_strides
;
// for a shape == output shape, this is nullptr
TArray
<
int64_t> b_padded_strides;                 // for b shape == output shape, this is nullptr
  TArray<int64_t> c_padded_strides;          // for c shape == output shape, this is nullptr
  TArray<fast_divmod> fdm_output_strides;
  BroadcastIndexType a_index_type = BroadcastIndexType::NoBroadcast;
  BroadcastIndexType b_index_type = BroadcastIndexType::NoBroadcast;
  BroadcastIndexType c_index_type = BroadcastIndexType::NoBroadcast;

  TernaryElementwisePreparation(const Tensor* a, const Tensor* b, const Tensor* c)
      : a_tensor(a), b_tensor(b), c_tensor(c) {}

  Status TernaryElementwiseBroadcastPrepareHelper(const TensorShape& a_shape,
                                                  const TensorShape& b_shape,
                                                  const TensorShape& c_shape,
                                                  const TensorShape& output_shape) {
    int32_t a_rank = static_cast<int32_t>(a_shape.NumDimensions());
    int32_t b_rank = static_cast<int32_t>(b_shape.NumDimensions());
    int32_t c_rank = static_cast<int32_t>(c_shape.NumDimensions());
    int32_t out_rank = std::max(std::max(a_rank, b_rank), c_rank);

    // early return when shapes match
    if (a_shape == b_shape && b_shape == c_shape) {
      output_rank_or_simple_broadcast = static_cast<size_t>(SimpleBroadcast::NoBroadcast);
      return Status::OK();
    }

    output_rank_or_simple_broadcast = out_rank;

    auto padder = [out_rank](int32_t rank, const TensorShape& shape, TArray<int64_t>& padded_strides) {
      padded_strides.SetSize(out_rank);
      if (rank > 0) {
        TensorPitches pitches(shape.GetDims());
        auto offset = out_rank - rank;
        for (auto i = offset; i < out_rank; ++i) {
          // the stride for broadcast dimension is kept as 0
          if (shape.GetDims()[gsl::narrow_cast<size_t>(i) - offset] != 1) {
            padded_strides[i] = pitches[gsl::narrow_cast<size_t>(i) - offset];
          }
        }
      }
    };

    bool has_need_compute = false;
    if (a_shape.Size() == 1) {
      a_index_type = BroadcastIndexType::Scalar;
    } else if (a_shape != output_shape) {
      padder(a_rank, a_shape, a_padded_strides);
      a_index_type = BroadcastIndexType::NeedCompute;
      has_need_compute = true;
    }

    if (b_shape.Size() == 1) {
      b_index_type = BroadcastIndexType::Scalar;
    } else if (b_shape != output_shape) {
      padder(b_rank, b_shape, b_padded_strides);
      b_index_type = BroadcastIndexType::NeedCompute;
      has_need_compute = true;
    }

    if (c_shape.Size() == 1) {
      c_index_type = BroadcastIndexType::Scalar;
    } else if (c_shape != output_shape) {
      padder(c_rank, c_shape, c_padded_strides);
      c_index_type = BroadcastIndexType::NeedCompute;
      has_need_compute = true;
    }

    if (!has_need_compute) {
      output_rank_or_simple_broadcast = static_cast<size_t>(SimpleBroadcast::NoBroadcast);
      return Status::OK();
    }

    TensorPitches output_pitches(output_shape.GetDims());
    fdm_output_strides.SetSize(out_rank);
    for (auto i = 0; i < out_rank; ++i) {
      fdm_output_strides[i] = fast_divmod(static_cast<int32_t>(output_pitches[i]));
    }

    return Status::OK();
  }
};

template <typename T>
Status Where<T>::ComputeInternal(OpKernelContext* context) const {
  typedef typename ToHipType<T>::MappedType HipT;

  const auto* const condition = context->Input<Tensor>(0);
  const auto* const X = context->Input<Tensor>(1);
  const auto* const Y = context->Input<Tensor>(2);
  ORT_ENFORCE(condition && X && Y, "condition, X, and Y inputs are required!");

  auto const& condition_shape = condition->Shape();
  auto const& X_shape = X->Shape();
  auto const& Y_shape = Y->Shape();

  TensorShape output_shape;
  ORT_RETURN_IF_ERROR(ComputeOutputShape(Node().Name(), condition_shape, X_shape, Y_shape, output_shape));
  auto output_tensor = context->Output(0, output_shape);

  if (output_shape.Size() == 0)
    return Status::OK();

  TernaryElementwisePreparation prepare(condition, X, Y);
  ORT_RETURN_IF_ERROR(prepare.TernaryElementwiseBroadcastPrepareHelper(condition_shape, X_shape, Y_shape, output_shape));

  WhereImpl<HipT>(
      Stream(),
      prepare.output_rank_or_simple_broadcast,
      prepare.a_index_type,
      prepare.a_padded_strides,
      reinterpret_cast<const bool*>(prepare.a_tensor->Data<bool>()),
      prepare.b_index_type,
      prepare.b_padded_strides,
      reinterpret_cast<const HipT*>(prepare.b_tensor->Data<T>()),
      prepare.c_index_type,
      prepare.c_padded_strides,
      reinterpret_cast<const HipT*>(prepare.c_tensor->Data<T>()),
      prepare.fdm_output_strides,
      reinterpret_cast<HipT*>(output_tensor->MutableData<T>()),
      output_tensor->Shape().Size());

  return Status::OK();
}

#define SPECIALIZED_COMPUTE_WITH_NAME(T, TName) \
  WHERE_TYPED_KERNEL_WITH_TYPE_NAME(T, TName)   \
  template Status Where<T>::ComputeInternal(OpKernelContext* context) const;

#define SPECIALIZED_COMPUTE(T) \
  SPECIALIZED_COMPUTE_WITH_NAME(T, T)

SPECIALIZED_COMPUTE(uint8_t)
SPECIALIZED_COMPUTE(int32_t)
SPECIALIZED_COMPUTE(int64_t)
SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double_t)
SPECIALIZED_COMPUTE(MLFloat16)

}  // namespace rocm
}  // namespace onnxruntime
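The stride-padding scheme used by TernaryElementwiseBroadcastPrepareHelper is easiest to see on the host. Below is a minimal standalone sketch (plain standard C++ with ordinary division and modulo; the helper names PaddedStrides and InputOffset are illustrative, not ORT APIs, and no fast_divmod or TArray types are used) of how a linear output offset is mapped back to an input offset when the input is broadcast: dimensions of size 1 keep a stride of 0, so they contribute nothing to the input offset.

#include <cstdint>
#include <iostream>
#include <vector>

// Right-align `shape` against `out_rank` dims and compute strides,
// with 0 for broadcast (size-1) dimensions, mirroring the `padder` lambda above.
std::vector<int64_t> PaddedStrides(const std::vector<int64_t>& shape, size_t out_rank) {
  std::vector<int64_t> strides(out_rank, 0);
  size_t offset = out_rank - shape.size();
  // Build pitches from the innermost dimension outward.
  std::vector<int64_t> pitches(shape.size());
  int64_t running = 1;
  for (size_t i = shape.size(); i-- > 0;) {
    pitches[i] = running;
    running *= shape[i];
  }
  for (size_t i = offset; i < out_rank; ++i) {
    if (shape[i - offset] != 1) strides[i] = pitches[i - offset];
  }
  return strides;
}

// Map a linear offset into the output to a linear offset into the (broadcast) input.
int64_t InputOffset(int64_t output_offset,
                    const std::vector<int64_t>& output_strides,
                    const std::vector<int64_t>& input_padded_strides) {
  int64_t input_offset = 0;
  for (size_t dim = 0; dim < output_strides.size(); ++dim) {
    int64_t q = output_offset / output_strides[dim];  // coordinate along this dim
    output_offset %= output_strides[dim];
    input_offset += input_padded_strides[dim] * q;    // broadcast dims add 0
  }
  return input_offset;
}

int main() {
  // Output shape {2, 3}; input shape {3} is broadcast along the first dim.
  std::vector<int64_t> output_strides = {3, 1};
  std::vector<int64_t> x_strides = PaddedStrides({3}, 2);  // -> {0, 1}
  for (int64_t i = 0; i < 6; ++i)
    std::cout << i << " -> " << InputOffset(i, output_strides, x_strides) << "\n";
  // Prints 0->0, 1->1, 2->2, 3->0, 4->1, 5->2.
  return 0;
}

The GPU kernel in where_impl.cu performs the same walk per element, but replaces the division and modulo with precomputed fast_divmod values.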
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
class Where final : public RocmKernel {
 public:
  Where(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where_impl.cu
0 → 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifdef __GNUC__
#include "onnxruntime_config.h"
#pragma GCC diagnostic ignored "-Wswitch"
#endif
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "where_impl.h"
namespace onnxruntime {
namespace rocm {

// broadcast by computing output coordinate from offset, using fast_divmod
template <typename T, BroadcastIndexType CondIndexType, BroadcastIndexType XIndexType, BroadcastIndexType YIndexType,
          int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void _TenaryElementWise(
    size_t output_rank,
    const TArray<int64_t> cond_padded_strides,
    const bool* cond_data,
    const TArray<int64_t> x_padded_strides,
    const T* x_data,
    const TArray<int64_t> y_padded_strides,
    const T* y_data,
    const TArray<fast_divmod> fdm_output_strides,
    T* output_data,
    HIP_LONG N) {
  HIP_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
  bool cond_value[NumElementsPerThread];
  T x_value[NumElementsPerThread];
  T y_value[NumElementsPerThread];

  HIP_LONG id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      // compute indexes with broadcasting rules: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md
      HIP_LONG cond_index = (CondIndexType == BroadcastIndexType::NoBroadcast ? id : 0);
      HIP_LONG x_index = (XIndexType == BroadcastIndexType::NoBroadcast ? id : 0);
      HIP_LONG y_index = (YIndexType == BroadcastIndexType::NoBroadcast ? id : 0);
      HIP_LONG offset = id;
#pragma unroll
      for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) {
        if (dim >= output_rank) {
          break;
        }
        int q, r;
        fdm_output_strides[dim].divmod(offset, q, r);
        if (CondIndexType == BroadcastIndexType::NeedCompute) {
          cond_index += static_cast<int>(cond_padded_strides[dim]) * q;
        }
        if (XIndexType == BroadcastIndexType::NeedCompute) {
          x_index += static_cast<int>(x_padded_strides[dim]) * q;
        }
        if (YIndexType == BroadcastIndexType::NeedCompute) {
          y_index += static_cast<int>(y_padded_strides[dim]) * q;
        }
        offset = r;
      }

      cond_value[i] = cond_data[cond_index];
      x_value[i] = x_data[x_index];
      y_value[i] = y_data[y_index];

      id += NumThreadsPerBlock;
    }
  }

  id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      output_data[id] = cond_value[i] ? x_value[i] : y_value[i];
      id += NumThreadsPerBlock;
    }
  }
}

// for scalar broadcast or non-broadcast case
template <typename T, BroadcastIndexType CondIndexType, BroadcastIndexType XIndexType, BroadcastIndexType YIndexType,
          int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void _TenaryElementWiseSimple(
    const bool* cond_data,
    const T* x_data,
    const T* y_data,
    T* output_data,
    HIP_LONG N) {
  HIP_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
  bool cond_value[NumElementsPerThread];
  T x_value[NumElementsPerThread];
  T y_value[NumElementsPerThread];

  HIP_LONG id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      cond_value[i] = cond_data[CondIndexType == BroadcastIndexType::NoBroadcast ? id : 0];
      x_value[i] = x_data[XIndexType == BroadcastIndexType::NoBroadcast ? id : 0];
      y_value[i] = y_data[YIndexType == BroadcastIndexType::NoBroadcast ? id : 0];
      id += NumThreadsPerBlock;
    }
  }

  id = start;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      output_data[id] = cond_value[i] ? x_value[i] : y_value[i];
      id += NumThreadsPerBlock;
    }
  }
}
#define HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \
case Y_INDEX_TYPE: { \
_TenaryElementWiseSimple<T, \
COND_INDEX_TYPE, \
X_INDEX_TYPE, \
Y_INDEX_TYPE, \
GridDim::maxThreadsPerBlock, \
GridDim::maxElementsPerThread> \
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(cond_data, \
x_data, \
y_data, \
output_data, \
N); \
} break
#define HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \
case X_INDEX_TYPE: { \
switch (Y_INDEX_TYPE_VAL) { \
HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \
HANDLE_Y_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \
} \
} break
#define HANDLE_COND_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \
case COND_INDEX_TYPE: { \
switch (X_INDEX_TYPE_VAL) { \
HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \
HANDLE_X_INDEX_TYPE_SIMPLE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \
} \
} break
#define HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE) \
case Y_INDEX_TYPE: { \
_TenaryElementWise<T, \
COND_INDEX_TYPE, \
X_INDEX_TYPE, \
Y_INDEX_TYPE, \
GridDim::maxThreadsPerBlock, \
GridDim::maxElementsPerThread> \
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(output_rank_or_simple_broadcast, \
cond_padded_strides, \
cond_data, \
x_padded_strides, \
x_data, \
y_padded_strides, \
y_data, \
fdm_output_strides, \
output_data, \
N); \
} break
#define HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, Y_INDEX_TYPE_VAL) \
case X_INDEX_TYPE: { \
switch (Y_INDEX_TYPE_VAL) { \
HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NoBroadcast); \
HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::Scalar); \
HANDLE_Y_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE, BroadcastIndexType::NeedCompute); \
} \
} break
#define HANDLE_COND_INDEX_TYPE(COND_INDEX_TYPE, X_INDEX_TYPE_VAL, Y_INDEX_TYPE_VAL) \
case COND_INDEX_TYPE: { \
switch (X_INDEX_TYPE_VAL) { \
HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NoBroadcast, Y_INDEX_TYPE_VAL); \
HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::Scalar, Y_INDEX_TYPE_VAL); \
HANDLE_X_INDEX_TYPE(COND_INDEX_TYPE, BroadcastIndexType::NeedCompute, Y_INDEX_TYPE_VAL); \
} \
} break
template <typename T>
void WhereImpl(
    hipStream_t stream,
    size_t output_rank_or_simple_broadcast,
    BroadcastIndexType cond_index_type,
    const TArray<int64_t>& cond_padded_strides,
    const bool* cond_data,
    BroadcastIndexType x_index_type,
    const TArray<int64_t>& x_padded_strides,
    const T* x_data,
    BroadcastIndexType y_index_type,
    const TArray<int64_t>& y_padded_strides,
    const T* y_data,
    const TArray<fast_divmod>& fdm_output_strides,
    T* output_data,
    size_t count) {
  int blocksPerGrid = static_cast<int>(CeilDiv(count, GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
  HIP_LONG N = static_cast<HIP_LONG>(count);
  if (output_rank_or_simple_broadcast == static_cast<size_t>(SimpleBroadcast::NoBroadcast)) {
    switch (cond_index_type) {
      HANDLE_COND_INDEX_TYPE_SIMPLE(BroadcastIndexType::NoBroadcast, x_index_type, y_index_type);
      HANDLE_COND_INDEX_TYPE_SIMPLE(BroadcastIndexType::Scalar, x_index_type, y_index_type);
    }
  } else {
    switch (cond_index_type) {
      HANDLE_COND_INDEX_TYPE(BroadcastIndexType::NoBroadcast, x_index_type, y_index_type);
      HANDLE_COND_INDEX_TYPE(BroadcastIndexType::Scalar, x_index_type, y_index_type);
      HANDLE_COND_INDEX_TYPE(BroadcastIndexType::NeedCompute, x_index_type, y_index_type);
    }
  }
}
#define SPECIALIZED_IMPL(T) \
template void WhereImpl<T>(hipStream_t stream, \
size_t output_rank_or_simple_broadcast, \
BroadcastIndexType cond_index_type, \
const TArray<int64_t>& cond_padded_strides, \
const bool* cond_data, \
BroadcastIndexType x_index_type, \
const TArray<int64_t>& x_padded_strides, \
const T* x_data, \
BroadcastIndexType y_index_type, \
const TArray<int64_t>& y_padded_strides, \
const T* y_data, \
const TArray<fast_divmod>& fdm_output_strides, \
T* output_data, \
size_t count);
SPECIALIZED_IMPL(uint8_t)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(int64_t)
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double_t)
SPECIALIZED_IMPL(half)

}  // namespace rocm
}  // namespace onnxruntime
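For reference, the launch shape chosen by WhereImpl works out as follows: each block owns a tile of NumThreadsPerBlock * NumElementsPerThread elements, and each thread strides through its tile in steps of NumThreadsPerBlock. The snippet below is a minimal host-side sketch of that arithmetic in plain C++; the constants kThreadsPerBlock and kElementsPerThread are illustrative placeholders, not the actual GridDim values defined by onnxruntime.

#include <cstdio>

// Illustrative values; onnxruntime's GridDim defines its own constants.
constexpr long long kThreadsPerBlock = 256;
constexpr long long kElementsPerThread = 4;

constexpr long long CeilDiv(long long a, long long b) { return (a + b - 1) / b; }

int main() {
  long long count = 10000;  // total output elements
  long long blocksPerGrid = CeilDiv(count, kThreadsPerBlock * kElementsPerThread);
  std::printf("blocksPerGrid = %lld\n", blocksPerGrid);  // 10 for 10000 elements

  // Elements visited by one thread (blockIdx.x = 2, threadIdx.x = 7):
  long long blockIdx_x = 2, threadIdx_x = 7;
  long long start = kElementsPerThread * kThreadsPerBlock * blockIdx_x + threadIdx_x;
  for (long long i = 0; i < kElementsPerThread; ++i) {
    long long id = start + i * kThreadsPerBlock;  // same stride the kernel uses
    if (id < count) std::printf("element %lld\n", id);  // 2055, 2311, 2567, 2823
  }
  return 0;
}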
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/where_impl.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
void WhereImpl(
    hipStream_t stream,
    size_t output_rank_or_simple_broadcast,
    BroadcastIndexType cond_index_type,
    const TArray<int64_t>& cond_padded_strides,
    const bool* cond_data,
    BroadcastIndexType x_index_type,
    const TArray<int64_t>& x_padded_strides,
    const T* x_data,
    BroadcastIndexType y_index_type,
    const TArray<int64_t>& y_padded_strides,
    const T* y_data,
    const TArray<fast_divmod>& fdm_output_strides,
    T* output_data,
    size_t count);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/all_tests.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#ifndef NDEBUG
namespace onnxruntime {
namespace rocm {
namespace test {
// Test header provides function declarations in EP-side bridge.
bool TestDeferredRelease();
bool TestDeferredReleaseWithoutArena();
bool TestBeamSearchTopK();
}  // namespace test
}  // namespace rocm
}  // namespace onnxruntime
#endif
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/beam_search_topk.cc
0 → 100644
#ifndef NDEBUG
#include "contrib_ops/rocm/transformers/beam_search_topk.h"
#include <algorithm>
#include <numeric>
#include <queue>
#include <random>
#include <hip/hip_runtime.h>
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
namespace onnxruntime {
namespace rocm {
namespace test {

void FillAndShuffle(std::vector<float>& values, int32_t batch_size, int32_t beam_size, int32_t vocab_size) {
  std::random_device rd;
  std::mt19937 generator(rd());
  for (int32_t batch = 0; batch < batch_size; batch++) {
    int32_t batch_base_idx = batch * beam_size * vocab_size;
    for (int32_t beam = 0; beam < beam_size; beam++) {
      int32_t value = beam;
      int32_t beam_base_idx = beam * vocab_size;
      for (int32_t vocab = 0; vocab < vocab_size; vocab++) {
        values[batch_base_idx + beam_base_idx + vocab] = (float)(value);
        value += beam_size;
      }
      std::shuffle(values.begin() + batch_base_idx + beam_base_idx,
                   values.begin() + batch_base_idx + beam_base_idx + vocab_size,
                   generator);
    }
  }
}

void ComputeTopKReference(const std::vector<float>& values,
                          std::vector<float>& top_k_values,
                          std::vector<int32_t>& top_k_tokens,
                          std::vector<int32_t>& top_k_indices,
                          int32_t batch_size,
                          int32_t beam_size,
                          int32_t vocab_size,
                          int32_t k) {
  using VK = std::pair<float, int32_t>;

  for (int32_t b = 0; b < batch_size; b++) {
    std::priority_queue<VK, std::vector<VK>, std::greater<VK>> queue;
    int32_t base_idx = b * beam_size * vocab_size;

    // initialize queue with k elements
    for (int32_t i = 0; i < k; i++) {
      queue.push({values[base_idx + i], i});
    }
    for (int32_t i = k; i < beam_size * vocab_size; i++) {
      if (values[base_idx + i] > queue.top().first) {
        queue.pop();
        queue.push({values[base_idx + i], i});
      }
    }

    int32_t top_k_base_idx = b * k;
    for (int32_t i = k - 1; i >= 0; i--) {
      top_k_values[top_k_base_idx + i] = queue.top().first;
      top_k_tokens[top_k_base_idx + i] = queue.top().second % vocab_size;
      top_k_indices[top_k_base_idx + i] = queue.top().second / vocab_size;
      queue.pop();
    }
  }
}

bool TestBeamSearchTopK() {
  int32_t batch_size = 4;
  int32_t beam_size = 4;
  int32_t vocab_size = 50257;
  int32_t k = 2 * beam_size;
  int32_t batch_x_beam_x_vocab = batch_size * beam_size * vocab_size;

  std::vector<float> values(batch_x_beam_x_vocab);
  FillAndShuffle(values, batch_size, beam_size, vocab_size);

  std::vector<float> top_k_values_ref(batch_size * k);
  std::vector<int32_t> top_k_tokens_ref(batch_size * k);
  std::vector<int32_t> top_k_indices_ref(batch_size * k);
  ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref,
                       batch_size, beam_size, vocab_size, k);

  const int32_t max_vocab_parts = 128;
  size_t buffer_size = batch_x_beam_x_vocab * 4                                      // input
                       + batch_size * beam_size * k * (max_vocab_parts + 1) * 2 * 4  // tmp
                       + batch_size * k * 3 * 4;                                     // output size
  void* rocm_buffer = nullptr;
  hipMalloc(&rocm_buffer, buffer_size);

  float* values_device = (float*)rocm_buffer;
  float* top_k_1st_values_tmp = (float*)(values_device + batch_x_beam_x_vocab);
  int32_t* top_k_1st_tokens_tmp = (int32_t*)(top_k_1st_values_tmp + batch_size * beam_size * k * max_vocab_parts);
  float* top_k_2nd_values_tmp = (float*)(top_k_1st_tokens_tmp + batch_size * beam_size * k * max_vocab_parts);
  int32_t* top_k_2nd_tokens_tmp = (int32_t*)(top_k_2nd_values_tmp + batch_size * beam_size * k);
  float* top_k_value = (float*)(top_k_2nd_tokens_tmp + batch_size * beam_size * k);
  int32_t* top_k_token = (int32_t*)(top_k_value + batch_size * k);
  int32_t* top_k_indices = (int32_t*)(top_k_token + batch_size * k);

  hipMemcpy(values_device, values.data(), batch_x_beam_x_vocab * 4, hipMemcpyHostToDevice);
  contrib::rocm::BeamSearchTopK(values_device,
                                batch_size,
                                beam_size,
                                vocab_size,
                                k,
                                top_k_1st_values_tmp,
                                top_k_1st_tokens_tmp,
                                top_k_2nd_values_tmp,
                                top_k_2nd_tokens_tmp,
                                top_k_value,
                                top_k_token,
                                top_k_indices,
                                NULL /*stream*/);

  std::vector<float> top_k_values_host(batch_size * k);
  std::vector<int32_t> top_k_token_host(batch_size * k);
  std::vector<int32_t> top_k_indices_host(batch_size * k);
  hipMemcpy(top_k_values_host.data(), top_k_value, batch_size * k * 4, hipMemcpyDeviceToHost);
  hipMemcpy(top_k_token_host.data(), top_k_token, batch_size * k * 4, hipMemcpyDeviceToHost);
  hipMemcpy(top_k_indices_host.data(), top_k_indices, batch_size * k * 4, hipMemcpyDeviceToHost);

  for (int32_t i = 0; i < batch_size * k; i++) {
    if (top_k_values_ref[i] != top_k_values_host[i] ||
        top_k_tokens_ref[i] != top_k_token_host[i] ||
        top_k_indices_ref[i] != top_k_indices_host[i]) {
      return false;
    }
  }

  return true;
}

}  // namespace test
}  // namespace rocm
}  // namespace onnxruntime
#endif
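As a sanity check on the scratch-buffer layout carved out in TestBeamSearchTopK, the arithmetic below reproduces the region sizes implied by the test's parameters (batch_size = 4, beam_size = 4, vocab_size = 50257, k = 8, max_vocab_parts = 128). This is only a worked example of the sizes the test code computes, assuming 4-byte elements throughout, matching the literal 4s in the test.

#include <cstdio>

int main() {
  const long long batch_size = 4, beam_size = 4, vocab_size = 50257;
  const long long k = 2 * beam_size;   // 8
  const long long max_vocab_parts = 128;
  const long long elem = 4;            // bytes per float / int32_t

  // Input logits for every (batch, beam, vocab) entry:
  const long long input_bytes = batch_size * beam_size * vocab_size * elem;
  // First- and second-stage value/token scratch, as carved out in the test:
  const long long tmp_bytes = batch_size * beam_size * k * (max_vocab_parts + 1) * 2 * elem;
  // Final top-k values, tokens, and beam indices:
  const long long output_bytes = batch_size * k * 3 * elem;

  std::printf("input  : %lld bytes\n", input_bytes);   // 3,216,448
  std::printf("tmp    : %lld bytes\n", tmp_bytes);     // 132,096
  std::printf("output : %lld bytes\n", output_bytes);  // 384
  std::printf("total  : %lld bytes\n", input_bytes + tmp_bytes + output_bytes);
  return 0;
}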
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/test/rocm_execution_provider_test.cc
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// This test is built only under DEBUG mode because it requires
// extra code in the core of ROCM EP and that code may
// 1. slow down performance critical applications and
// 2. increase binary size of ORT.
#ifndef NDEBUG
#include <iostream>
#include "core/providers/rocm/test/all_tests.h"
#include "core/providers/rocm/rocm_execution_provider.h"
#include "core/providers/rocm/rocm_allocator.h"
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wunused-result"
namespace onnxruntime {
namespace rocm {
namespace test {

bool TestDeferredRelease() {
  // Create ROCM EP.
  ROCMExecutionProviderInfo info;
  ROCMExecutionProvider ep(info);

  // Initialize allocators in EP.
  onnxruntime::AllocatorManager allocator_manager;
  ep.RegisterAllocator(allocator_manager);
  // Allocator for call hipHostMalloc and hipHostFree
  // For details, see ROCMPinnedAllocator in rocm_allocator.cc.
  AllocatorPtr cpu_pinned_alloc = ep.GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);

  // 10 MB
  const size_t n_bytes = 10 * 1000000;
  const int64_t n_allocs = 64;
  ORT_THROW_IF_ERROR(ep.OnRunStart());
  for (size_t i = 0; i < n_allocs; ++i) {
    // Allocate 10MB ROCM pinned memory.
    auto pinned_buffer = ep.AllocateBufferOnCPUPinned<void>(n_bytes);
    // Release it using ROCM callback.
    ep.AddDeferredReleaseCPUPtr(pinned_buffer.release());
  }

  // Memory stats
  AllocatorStats stats;
  cpu_pinned_alloc->GetStats(&stats);
  ORT_ENFORCE(stats.num_allocs == n_allocs);
  ORT_THROW_IF_ERROR(ep.OnRunEnd(true));
  return true;
}

bool TestDeferredReleaseWithoutArena() {
  // Create ROCM EP.
  ROCMExecutionProviderInfo info;
  ROCMExecutionProvider ep(info);

  // Initialize allocators in EP.
  onnxruntime::AllocatorManager allocator_manager;
  OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID};
  // Create allocator without BFCArena
  AllocatorCreationInfo pinned_memory_info(
      [](OrtDevice::DeviceId device_id) {
        return std::make_unique<ROCMPinnedAllocator>(device_id, CUDA_PINNED);
      },
      pinned_device.Id(),
      false /* no arena */);
  auto rocm_pinned_alloc = CreateAllocator(pinned_memory_info);
  allocator_manager.InsertAllocator(rocm_pinned_alloc);

  // Use existing allocator in allocator_manager.
  // Also register new allocator created by this EP in allocator_manager.
  ep.RegisterAllocator(allocator_manager);
  // Allocator for call hipHostMalloc and hipHostFree
  // For details, see ROCMPinnedAllocator in rocm_allocator.cc.
  AllocatorPtr cpu_pinned_alloc = ep.GetAllocator(DEFAULT_CPU_ALLOCATOR_DEVICE_ID, OrtMemTypeCPU);

  // 10 MB
  const size_t n_bytes = 10 * 1000000;
  const int64_t n_allocs = 64;
  ORT_THROW_IF_ERROR(ep.OnRunStart());
  for (size_t i = 0; i < n_allocs; ++i) {
    // Allocate 10MB ROCM pinned memory.
    auto pinned_buffer = ep.AllocateBufferOnCPUPinned<void>(n_bytes);
    // Release it using ROCM callback.
    ep.AddDeferredReleaseCPUPtr(pinned_buffer.release());
  }

  ORT_THROW_IF_ERROR(ep.OnRunEnd(true));
  return true;
}

}  // namespace test
}  // namespace rocm
}  // namespace onnxruntime
#endif
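The deferred-release pattern these two tests exercise is independent of the EP machinery: pinned host buffers handed to an in-flight GPU run cannot be freed immediately, so they are queued and released only when the run ends. The following is a minimal standalone sketch of that idea in plain C++; the DeferredReleaseQueue type is hypothetical, and malloc/free stand in for pinned allocations, whereas the real EP uses AddDeferredReleaseCPUPtr and OnRunEnd and synchronizes with the HIP stream before freeing.

#include <cstdlib>
#include <vector>

// Hypothetical helper: collects raw pointers during a run and frees them
// only when the run is declared finished, mimicking AddDeferredReleaseCPUPtr.
class DeferredReleaseQueue {
 public:
  void AddDeferredRelease(void* p) { pending_.push_back(p); }
  void OnRunEnd() {
    // The real EP waits for the GPU stream here before freeing pinned memory.
    for (void* p : pending_) std::free(p);
    pending_.clear();
  }

 private:
  std::vector<void*> pending_;
};

int main() {
  DeferredReleaseQueue queue;
  const size_t n_bytes = 10 * 1000000;  // 10 MB, as in the tests above
  for (int i = 0; i < 64; ++i) {
    void* buffer = std::malloc(n_bytes);  // stand-in for a pinned allocation
    queue.AddDeferredRelease(buffer);     // defer the free until the run ends
  }
  queue.OnRunEnd();  // all 64 buffers are released here
  return 0;
}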