gaoqiong / onnxruntime_v14 / Commits / 1a91fcc2
Commit 1a91fcc2 authored Jul 25, 2023 by gaoqiong
add files required by dtk
parent a144865d
Pipeline #492 failed with stages in 0 seconds
Changes 280 · Pipelines 1
Showing 20 changed files with 1887 additions and 0 deletions (+1887 -0)
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.cuh        +20   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.h          +29   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reshape.cc                 +57   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reshape.h                  +77   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize.cc                  +51   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize.h                   +24   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize_impl.cu             +837  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize_impl.h              +40   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence.cc        +70   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence.h         +36   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence_impl.cu   +99   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence_impl.h    +24   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements.cc        +122  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements.h         +28   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements_impl.h    +19   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd.cc              +89   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd.h               +20   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd_impl.cu         +133  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd_impl.h          +23   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/sequence_op.cc             +89   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.cuh  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "quantize_linear.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {

template <class T, class U>
Status CudaQuantizeLinear(hipStream_t stream, const U* input, T* output, const U* scale,
                          const T* zero_point, size_t num_of_element);

template <class T, class U>
Status CudaDequantizeLinear(hipStream_t stream, const T* input, U* output, const U* scale,
                            const T* zero_point, size_t num_of_element);

}  // namespace rocm
}  // namespace onnxruntime
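For context on what these declarations compute (standard ONNX QuantizeLinear / DequantizeLinear semantics; background, not something stated in this header): quantization maps y = saturate(round(x / scale) + zero_point) into the integer type T, and dequantization maps x_hat = (y - zero_point) * scale back to the float type U. For example, with scale = 0.5 and zero_point = 10 (uint8), x = 3.2 quantizes to round(3.2 / 0.5) + 10 = 16, which dequantizes back to (16 - 10) * 0.5 = 3.0.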
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <class T, class U = float>
class QuantizeLinear final : public RocmKernel {
 public:
  QuantizeLinear(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
};

template <class T, class U = float>
class DequantizeLinear final : public RocmKernel {
 public:
  DequantizeLinear(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reshape.cc  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "reshape.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_KERNEL_EX(
    Reshape,
    kOnnxDomain,
    14,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
        .Alias(0, 0)
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    Reshape);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Reshape,
    kOnnxDomain,
    13, 13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
        .Alias(0, 0)
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    Reshape);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Reshape,
    kOnnxDomain,
    5, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("shape", DataTypeImpl::GetTensorType<int64_t>())
        .Alias(0, 0)
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    Reshape);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Reshape,
    kOnnxDomain,
    1, 4,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .Alias(0, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Reshape_1);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reshape.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/reshape_helper.h"
namespace onnxruntime {
namespace rocm {

class Reshape final : public RocmKernel {
 public:
  Reshape(const OpKernelInfo& info)
      : RocmKernel(info),
        allow_zero_(info.GetAttrOrDefault("allowzero", static_cast<int64_t>(0)) == 1) {
  }

  Status ComputeInternal(OpKernelContext* context) const override {
    // Copy the second input tensor into the shape vector
    const Tensor* shapeTensor = context->Input<Tensor>(1);
    if (shapeTensor == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
    if (shapeTensor->Shape().NumDimensions() != 1)
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "A shape tensor must be a vector tensor, got ",
                             shapeTensor->Shape().NumDimensions(), " dimensions");
    auto data_span = shapeTensor->template DataAsSpan<int64_t>();
    TensorShapeVector shape(data_span.begin(), data_span.end());
    const Tensor* X = context->Input<Tensor>(0);
    if (X == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
    const TensorShape& X_shape = X->Shape();

    ReshapeHelper helper(X_shape, shape, allow_zero_);

    Tensor* Y = context->Output(0, TensorShape(shape));
    const void* source = X->DataRaw();
    void* target = Y->MutableDataRaw();
    // If source and target pointers are not equal (non-inplace operation), we need to copy the data.
    if (target != source) {
      ORT_RETURN_IF_ERROR(CopyTensor(*X, *Y));
    }

    return Status::OK();
  }

 private:
  bool allow_zero_;
};

class Reshape_1 final : public RocmKernel {
 public:
  Reshape_1(const OpKernelInfo& info) : RocmKernel(info) {
    Status status = info.GetAttrs("shape", shape_);
    ORT_ENFORCE(status.IsOK(), "Attribute shape is not set.");
  }

  Status ComputeInternal(OpKernelContext* context) const override {
    TensorShapeVector shape = shape_;
    const Tensor* X = context->Input<Tensor>(0);
    const TensorShape& X_shape = X->Shape();

    ReshapeHelper helper(X_shape, shape);

    Tensor* Y = context->Output(0, TensorShape(shape));
    const void* source = X->DataRaw();
    void* target = Y->MutableDataRaw();
    // If source and target pointers are not equal (non-inplace operation), we need to copy the data.
    if (target != source) {
      ORT_RETURN_IF_ERROR(CopyTensor(*X, *Y));
    }

    return Status::OK();
  }

 private:
  TensorShapeVector shape_;
};

}  // namespace rocm
}  // namespace onnxruntime
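As a quick reminder of the shape semantics that ReshapeHelper resolves here (standard ONNX Reshape behaviour, not specific to this file): a 0 in the target shape copies the corresponding input dimension when allowzero is 0, and a single -1 is inferred from the remaining element count. For example, with allowzero = 0:

    input shape  : [2, 3, 4]   (24 elements)
    target shape : [0, -1]     // 0 -> copy dim 0 (= 2), -1 -> 24 / 2 = 12
    output shape : [2, 12]

With allowzero = 1 a literal 0 is instead kept as a zero-sized output dimension.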
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize.cc  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "resize.h"
namespace onnxruntime {
namespace rocm {

#define REGISTER_KERNEL_TYPED(T) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Resize, \
kOnnxDomain, \
10, 10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
Resize<T>); \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
Resize, \
kOnnxDomain, \
11, 12, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.InputMemoryType(OrtMemTypeCPUInput, 3) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()), \
Resize<T>); \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
Resize, \
kOnnxDomain, \
13, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 1) \
.InputMemoryType(OrtMemTypeCPUInput, 2) \
.InputMemoryType(OrtMemTypeCPUInput, 3) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()), \
Resize<T>);
REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)
REGISTER_KERNEL_TYPED(int32_t)
REGISTER_KERNEL_TYPED(uint8_t)

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/tensor/upsample.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
class Resize : public Upsample<T> {
 public:
  Resize(const OpKernelInfo& info) : Upsample<T>(info) {
  }

  Status ComputeInternal(OpKernelContext* context) const override {
    return Upsample<T>::ComputeInternal(context);
  }
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize_impl.cu  0 → 100644
#include "hip/hip_runtime.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/tensor/resize_impl.h"
namespace onnxruntime {
namespace rocm {

using onnxruntime::ResizeCoordinateTransformationMode;
using onnxruntime::ResizeNearestMode;
using onnxruntime::UpsampleMode;

struct NearestPixel_SIMPLE {
  __device__ __forceinline__ int operator()(float x_original, bool is_down_sampling) const {
    if (is_down_sampling) {
      return static_cast<int>(_Ceil(x_original));
    }
    return static_cast<int>(x_original);
  }
};

struct NearestPixel_ROUND_PREFER_FLOOR {
  __device__ __forceinline__ int operator()(float x_original, bool) const {
    if (x_original == static_cast<int>(x_original) + 0.5f) {
      return static_cast<int>(_Floor(x_original));
    }
    return static_cast<int>(roundf(x_original));
  }
};

struct NearestPixel_ROUND_PREFER_CEIL {
  __device__ __forceinline__ int operator()(float x_original, bool) const {
    return static_cast<int>(roundf(x_original));
  }
};

struct NearestPixel_FLOOR {
  __device__ __forceinline__ int operator()(float x_original, bool) const {
    return static_cast<int>(_Floor(x_original));
  }
};

struct NearestPixel_CEIL {
  __device__ __forceinline__ int operator()(float x_original, bool) const {
    return static_cast<int>(_Ceil(x_original));
  }
};

struct TransformCoordinate_ASYMMETRIC {
  __device__ __forceinline__ float operator()(float x_resized, float x_scale, float, float, float, float) const {
    return x_resized / x_scale;
  }
};

struct TransformCoordinate_HALF_PIXEL {
  __device__ __forceinline__ float operator()(float x_resized, float x_scale, float, float, float, float) const {
    return ((x_resized + 0.5f) / x_scale) - 0.5f;
  }
};

struct TransformCoordinate_PYTORCH_HALF_PIXEL {
  __device__ __forceinline__ float operator()(float x_resized, float x_scale, float length_resized, float, float, float) const {
    return length_resized > 1 ? (x_resized + 0.5f) / x_scale - 0.5f : 0.0f;
  }
};

struct TransformCoordinate_TF_HALF_PIXEL_FOR_NN {
  __device__ __forceinline__ float operator()(float x_resized, float x_scale, float, float, float, float) const {
    return (x_resized + 0.5f) / x_scale;
  }
};

struct TransformCoordinate_ALIGN_CORNERS {
  __device__ __forceinline__ float operator()(float x_resized, float, float length_resized, float length_original, float, float) const {
    return length_resized == 1 ? 0 : x_resized * (length_original - 1) / (length_resized - 1);
  }
};

struct TransformCoordinate_TF_CROP_AND_RESIZE {
  __device__ __forceinline__ float operator()(float x_resized, float, float length_resized, float length_original, float roi_start, float roi_end) const {
    auto orig = length_resized > 1
                    ? roi_start * (length_original - 1) + (x_resized * (roi_end - roi_start) * (length_original - 1)) / (length_resized - 1)
                    : 0.5 * (roi_start + roi_end) * (length_original - 1);
    return static_cast<float>(orig);
  }
};
#define CASE_TYPE_USING_HINT(enum_type, type, HINT, ...) \
case enum_type: { \
using HINT = type; \
return __VA_ARGS__(); \
}
#define CASE_TYPE_COORD(enum_type, type, ...) \
CASE_TYPE_USING_HINT(enum_type, type, coord_t, __VA_ARGS__)
#define DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
    /* don't use TYPE again in case it is an expensive or side-effect op */ \
switch (the_type) { \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::HALF_PIXEL, TransformCoordinate_HALF_PIXEL, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ASYMMETRIC, TransformCoordinate_ASYMMETRIC, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::PYTORCH_HALF_PIXEL, TransformCoordinate_PYTORCH_HALF_PIXEL, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::ALIGN_CORNERS, TransformCoordinate_ALIGN_CORNERS, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_HALF_PIXEL_FOR_NN, TransformCoordinate_TF_HALF_PIXEL_FOR_NN, __VA_ARGS__) \
CASE_TYPE_COORD(ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE, TransformCoordinate_TF_CROP_AND_RESIZE, __VA_ARGS__) \
default: \
ORT_THROW("unknown ResizeCoordinateTransformationMode"); \
} \
}()
#define CASE_TYPE_NEAREST(enum_type, type, ...) \
CASE_TYPE_USING_HINT(enum_type, type, nearest_t, __VA_ARGS__)
#define DISPATCH_RESIZE_NEAREST_MODE(TYPE, ...) \
[&] { \
const auto& the_type = TYPE; \
    /* don't use TYPE again in case it is an expensive or side-effect op */ \
switch (the_type) { \
CASE_TYPE_NEAREST(ResizeNearestMode::SIMPLE, NearestPixel_SIMPLE, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_FLOOR, NearestPixel_ROUND_PREFER_FLOOR, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::ROUND_PREFER_CEIL, NearestPixel_ROUND_PREFER_CEIL, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::FLOOR, NearestPixel_FLOOR, __VA_ARGS__) \
CASE_TYPE_NEAREST(ResizeNearestMode::CEIL, NearestPixel_CEIL, __VA_ARGS__) \
default: \
ORT_THROW("unknown ResizeNearestMode"); \
} \
}()
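Reviewer note: the two DISPATCH_* macros above wrap a switch in an immediately-invoked lambda so each case can bind the selected functor type to a local alias (coord_t / nearest_t) before running the caller-supplied lambda. A minimal stand-alone sketch of the same idiom follows; Mode, FunctorA, FunctorB, hint_t and DISPATCH_MODE are made-up names for illustration only, not part of this file:

#include <cstdio>

enum class Mode { A, B };
struct FunctorA { int operator()(int v) const { return v + 1; } };
struct FunctorB { int operator()(int v) const { return v * 2; } };

// Immediately-invoked lambda around a switch: each case binds hint_t and
// returns the result of the caller-supplied lambda, as DISPATCH_RESIZE_* does.
#define DISPATCH_MODE(TYPE, ...)                                        \
  [&] {                                                                 \
    switch (TYPE) {                                                     \
      case Mode::A: { using hint_t = FunctorA; return __VA_ARGS__(); }  \
      case Mode::B: { using hint_t = FunctorB; return __VA_ARGS__(); }  \
    }                                                                   \
    return 0; /* unreachable for valid Mode values */                   \
  }()

int main() {
  Mode m = Mode::B;
  // The lambda argument is pasted inside each case, so hint_t is visible there.
  int out = DISPATCH_MODE(m, [&] { return hint_t()(21); });  // binds FunctorB, yields 42
  std::printf("%d\n", out);
  return 0;
}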
struct NearestMappingInfo {
  int origin_;
  int extrapolate_;
};

template <typename T, typename CudaFunctionOriginalCoordinate, typename CudaFunctionNearestPixel>
__global__ void _ResizeNearestMappingKernel2D(
    const int input_height, const int input_width,
    const int output_height, const int output_width,
    const float scales_height, const float scales_width,
    const float roi_start_height, const float roi_end_height,
    const float roi_start_width, const float roi_end_width,
    const bool extrapolation_enabled,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    const CudaFunctionNearestPixel& calc_nearest_pixel,
    NearestMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, output_height + output_width);
  if (id >= 0 && id < output_height) {  // for Height
    int dim = id;

    // only apply co-ordinate transformation if scale != 1.0
    if (scales_height == 1.0f) {
      dims_mapping[id].extrapolate_ = 0;
    } else {
      float orig_coord = transform_coordinate(static_cast<float>(dim), scales_height,
                                              static_cast<float>(output_height), static_cast<float>(input_height),
                                              roi_start_height, roi_end_height);
      dims_mapping[id].extrapolate_ = static_cast<int>(
          extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_height - 1)));
      dim = calc_nearest_pixel(orig_coord, scales_height < 1);
      if (dim >= input_height) dim = input_height - 1;
      if (dim < 0) dim = 0;
    }

    dims_mapping[id].origin_ = dim;
  } else {
    int dim = id - output_height;

    // only apply co-ordinate transformation if scale != 1.0
    if (scales_width == 1.0f) {
      dims_mapping[id].extrapolate_ = 0;
    } else {
      float orig_coord = transform_coordinate(static_cast<float>(dim), scales_width,
                                              static_cast<float>(output_width), static_cast<float>(input_width),
                                              roi_start_width, roi_end_width);
      dims_mapping[id].extrapolate_ = static_cast<int>(
          extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_width - 1)));
      dim = calc_nearest_pixel(orig_coord, scales_width < 1);
      if (dim >= input_width) dim = input_width - 1;
      if (dim < 0) dim = 0;
    }

    dims_mapping[id].origin_ = dim;
    return;
  }
}
template <typename T, typename CudaFunctionOriginalCoordinate, typename CudaFunctionNearestPixel>
__global__ void _ResizeNearestMappingKernel(
    const size_t rank,
    const TArray<int64_t> input_shape,
    const TArray<int64_t> output_shape,
    const TArray<float> scales,
    const TArray<float, 10> roi,
    const size_t total_dim_sum,
    bool extrapolation_enabled,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    const CudaFunctionNearestPixel& calc_nearest_pixel,
    int64_t* prefix_dim_sum,
    NearestMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, total_dim_sum);
  int64_t dim_sum = 0;
  for (int axis = 0; axis < rank; ++axis) {
    if (id == dim_sum) {
      prefix_dim_sum[axis] = dim_sum;
    }
    if (id >= dim_sum && id < dim_sum + output_shape[axis]) {
      int dim = id - dim_sum;

      // only apply co-ordinate transformation if scale != 1.0
      if (scales[axis] == 1.0f) {
        dims_mapping[id].extrapolate_ = 0;
      } else {
        float orig_coord = transform_coordinate(static_cast<float>(dim), scales[axis],
                                                static_cast<float>(output_shape[axis]),
                                                static_cast<float>(input_shape[axis]),
                                                roi[axis], roi[axis + rank]);
        dims_mapping[id].extrapolate_ = static_cast<int>(
            extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_shape[axis] - 1)));
        dim = calc_nearest_pixel(orig_coord, scales[axis] < 1);
        if (dim >= input_shape[axis]) dim = input_shape[axis] - 1;
        if (dim < 0) dim = 0;
      }

      dims_mapping[id].origin_ = dim;
      return;
    }
    dim_sum += output_shape[axis];
  }
}
template <typename T, bool UseExtrapolation>
__global__ void _ResizeNearestKernel2D(
    const int64_t output_height, const int64_t output_width,
    const int64_t input_stride_image, const int input_stride_row,
    const fast_divmod output_stride_image, const fast_divmod output_stride_row,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    const NearestMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int imageid, h, w, output_index;
  output_stride_image.divmod(static_cast<int>(id), imageid, output_index);
  output_stride_row.divmod(output_index, h, w);
  if (UseExtrapolation) {
    if (dims_mapping[h].extrapolate_ + dims_mapping[output_height + w].extrapolate_) {
      output_data[id] = extrapolation_value;
      return;
    }
  }
  int input_index = input_stride_image * imageid +
                    input_stride_row * dims_mapping[h].origin_ +
                    dims_mapping[output_height + w].origin_;
  output_data[id] = input_data[input_index];
}
template <typename T>
__global__ void _ResizeNearestKernel(
    const int rank,
    const TArray<int64_t> input_strides,
    const TArray<fast_divmod> output_div_pitches,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    const int64_t* prefix_dim_sum,
    const NearestMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int output_index = static_cast<int>(id);
  int input_index = 0;
  int extrapolation_occured = 0;
  for (int axis = 0; axis < rank; ++axis) {
    int dim = 0;
    output_div_pitches[axis].divmod(output_index, dim, output_index);
    const NearestMappingInfo& mi = dims_mapping[prefix_dim_sum[axis] + dim];
    extrapolation_occured += mi.extrapolate_;
    input_index += input_strides[axis] * mi.origin_;
  }
  output_data[id] = extrapolation_occured ? extrapolation_value : input_data[input_index];
}
struct LinearMappingInfo {
  int origin_;
  float weight_;
  int extrapolate_;
};

template <typename T, typename CudaFunctionOriginalCoordinate>
__global__ void _ResizeBilinearCoordinateMapping(
    int64_t input_height, int64_t input_width,
    int64_t output_height, int64_t output_width,
    float scale_height, float scale_width,
    float roi_height_start, float roi_height_end,
    float roi_width_start, float roi_width_end,
    const size_t SumHW, bool extrapolation_enabled,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    LinearMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW);
  if (id < output_height) {  // y = id
    float input_y = scale_height == 1 ? static_cast<float>(id)
                                      : transform_coordinate(static_cast<float>(id), scale_height,
                                                             static_cast<float>(output_height),
                                                             static_cast<float>(input_height),
                                                             roi_height_start, roi_height_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast<float>(input_height - 1)));
    input_y = max(0.0f, min(input_y, static_cast<float>(input_height - 1)));
    int y_int = static_cast<int>(input_y);
    dims_mapping[id].origin_ = y_int;
    dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int;
  } else {  // x = id - output_height
    float input_x = scale_width == 1 ? static_cast<float>(id - output_height)
                                     : transform_coordinate(static_cast<float>(id - output_height), scale_width,
                                                            static_cast<float>(output_width),
                                                            static_cast<float>(input_width),
                                                            roi_width_start, roi_width_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast<float>(input_width - 1)));
    input_x = max(0.0f, min(input_x, static_cast<float>(input_width - 1)));
    int x_int = static_cast<int>(input_x);
    dims_mapping[id].origin_ = x_int;
    dims_mapping[id].weight_ = (x_int >= input_width - 1) ? 0.5f : input_x - x_int;
  }
}
// The following method supports a 2-D or 4-D input in 'Linear mode'. The last two dimensions are [H, W].
// The scale values for the outer dimensions (all except the last two) are 1.
template <typename T>
__global__ void _ResizeBilinearKernel(
    int64_t input_height, int64_t input_width,
    int64_t output_height, int64_t output_width,
    fast_divmod div_output_width, fast_divmod div_output_image,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    LinearMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int bxc, output_image_index;
  div_output_image.divmod(id, bxc, output_image_index);
  HIP_LONG input_index = bxc * input_height * input_width;
  int output_y, output_x;
  div_output_width.divmod(output_image_index, output_y, output_x);

  if (dims_mapping[output_y].extrapolate_ || dims_mapping[output_x + output_height].extrapolate_) {
    output_data[id] = extrapolation_value;
    return;
  }
  float y_offset_0 = dims_mapping[output_y].weight_;
  int y_int = dims_mapping[output_y].origin_;
  float x_offset_0 = dims_mapping[output_x + output_height].weight_;
  int x_int = dims_mapping[output_x + output_height].origin_;
  input_index += y_int * input_width + x_int;

  T x00 = input_data[input_index];
  bool end_of_h = (y_int >= input_height - 1);
  bool end_of_w = (x_int >= input_width - 1);
  T x10 = end_of_w ? x00 : input_data[input_index + 1];
  T x01 = end_of_h ? x00 : input_data[input_index + input_width];
  T x11 = end_of_w ? x01 : (end_of_h ? x10 : input_data[input_index + input_width + 1]);
  float y_offset_1 = 1.0f - y_offset_0;
  float x_offset_1 = 1.0f - x_offset_0;
  output_data[id] =
      x00 * static_cast<T>(y_offset_1 * x_offset_1) +
      x01 * static_cast<T>(y_offset_0 * x_offset_1) +
      x10 * static_cast<T>(y_offset_1 * x_offset_0) +
      x11 * static_cast<T>(y_offset_0 * x_offset_0);
}
template <typename T, typename CudaFunctionOriginalCoordinate>
__global__ void _ResizeTrilinearCoordinateMapping(
    int64_t input_depth, int64_t input_height, int64_t input_width,
    int64_t output_depth, int64_t output_height, int64_t output_width,
    float scale_depth, float scale_height, float scale_width,
    float roi_depth_start, float roi_depth_end,
    float roi_height_start, float roi_height_end,
    float roi_width_start, float roi_width_end,
    const size_t SumDHW, bool extrapolation_enabled,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    LinearMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumDHW);
  if (id < output_depth) {  // z = id
    float input_z = scale_depth == 1 ? static_cast<float>(id)
                                     : transform_coordinate(static_cast<float>(id), scale_depth,
                                                            static_cast<float>(output_depth),
                                                            static_cast<float>(input_depth),
                                                            roi_depth_start, roi_depth_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_z < 0 || input_z > static_cast<float>(input_depth - 1)));
    input_z = max(0.0f, min(input_z, static_cast<float>(input_depth - 1)));
    int z_int = static_cast<int>(input_z);
    dims_mapping[id].origin_ = z_int;
    dims_mapping[id].weight_ = (z_int >= input_depth - 1) ? 0.5f : input_z - z_int;
  } else if (id >= output_depth && id < (output_depth + output_height)) {  // y = id - output_depth
    float input_y = scale_height == 1 ? static_cast<float>(id - output_depth)
                                      : transform_coordinate(static_cast<float>(id - output_depth), scale_height,
                                                             static_cast<float>(output_height),
                                                             static_cast<float>(input_height),
                                                             roi_height_start, roi_height_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast<float>(input_height - 1)));
    input_y = max(0.0f, min(input_y, static_cast<float>(input_height - 1)));
    int y_int = static_cast<int>(input_y);
    dims_mapping[id].origin_ = y_int;
    dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int;
  } else {  // x = id - output_depth - output_height
    float input_x = scale_width == 1 ? static_cast<float>(id - output_depth - output_height)
                                     : transform_coordinate(static_cast<float>(id - output_depth - output_height), scale_width,
                                                            static_cast<float>(output_width),
                                                            static_cast<float>(input_width),
                                                            roi_width_start, roi_width_end);
    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast<float>(input_width - 1)));
    input_x = max(0.0f, min(input_x, static_cast<float>(input_width - 1)));
    int x_int = static_cast<int>(input_x);
    dims_mapping[id].origin_ = x_int;
    dims_mapping[id].weight_ = (x_int >= input_width - 1) ? 0.5f : input_x - x_int;
  }
}
// The following method supports a 3-D or 5-D input in 'Linear mode'. The last three dimensions are [D, H, W].
// The scale values for the outer dimensions (all except the last three) are 1.
template <typename T>
__global__ void _ResizeTrilinearKernel(
    int64_t input_depth, int64_t input_height, int64_t input_width,
    int64_t output_depth, int64_t output_height, int64_t output_width,
    fast_divmod div_output_height, fast_divmod div_output_width, fast_divmod div_output_image,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    LinearMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int bxc, output_image_index;
  div_output_image.divmod(id, bxc, output_image_index);
  HIP_LONG input_index = bxc * input_depth * input_height * input_width;
  int output_z, output_y, output_x, temp;
  div_output_height.divmod(output_image_index, output_z, temp);
  div_output_width.divmod(temp, output_y, output_x);

  if (dims_mapping[output_z].extrapolate_ ||
      dims_mapping[output_y + output_depth].extrapolate_ ||
      dims_mapping[output_x + output_depth + output_height].extrapolate_) {
    output_data[id] = extrapolation_value;
    return;
  }
  float z_offset_0 = dims_mapping[output_z].weight_;
  int z_int = dims_mapping[output_z].origin_;
  float y_offset_0 = dims_mapping[output_y + output_depth].weight_;
  int y_int = dims_mapping[output_y + output_depth].origin_;
  float x_offset_0 = dims_mapping[output_x + output_depth + output_height].weight_;
  int x_int = dims_mapping[output_x + output_depth + output_height].origin_;
  input_index += z_int * input_height * input_width + y_int * input_width + x_int;

  T x000 = input_data[input_index];
  bool end_of_h = (y_int >= input_height - 1);
  bool end_of_w = (x_int >= input_width - 1);
  T x100 = end_of_w ? x000 : input_data[input_index + 1];
  T x010 = end_of_h ? x000 : input_data[input_index + input_width];
  T x110 = end_of_w ? x010 : (end_of_h ? x100 : input_data[input_index + input_width + 1]);

  bool end_of_d = (z_int >= input_depth - 1);
  if (!end_of_d) {
    input_index = input_index + input_height * input_width;
  }
  T x001 = end_of_d ? x000 : input_data[input_index];
  T x101 = end_of_w ? x001 : input_data[input_index + 1];
  T x011 = end_of_h ? x001 : input_data[input_index + input_width];
  T x111 = end_of_w ? x011 : (end_of_h ? x101 : input_data[input_index + input_width + 1]);

  float z_offset_1 = 1.0f - z_offset_0;
  float y_offset_1 = 1.0f - y_offset_0;
  float x_offset_1 = 1.0f - x_offset_0;
  output_data[id] =
      x000 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_1) +
      x010 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_1) +
      x100 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_0) +
      x110 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_0) +
      x001 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_1) +
      x011 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_1) +
      x101 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_0) +
      x111 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_0);
}
template <typename T>
__device__ __forceinline__ float CubicInterpolationRowwise(
    const T* image, int x, int y, int input_height, int input_width,
    float coeff0, float coeff1, float coeff2, float coeff3) {
  int row_index = max(0, min(y, input_height - 1)) * input_width;
  return coeff0 * static_cast<float>(image[row_index + max(0, min(x - 1, input_width - 1))]) +
         coeff1 * static_cast<float>(image[row_index + max(0, min(x, input_width - 1))]) +
         coeff2 * static_cast<float>(image[row_index + max(0, min(x + 1, input_width - 1))]) +
         coeff3 * static_cast<float>(image[row_index + max(0, min(x + 2, input_width - 1))]);
}

struct CubicMappingInfo {
  int origin_;
  int extrapolate_;
  float coeff0_;
  float coeff1_;
  float coeff2_;
  float coeff3_;
};
template <typename T, typename CudaFunctionOriginalCoordinate>
__global__ void _ResizeCubicCoordinateMapping(
    int64_t input_height, int64_t input_width,
    int64_t output_height, int64_t output_width,
    float scale_height, float scale_width,
    float roi_height_start, float roi_height_end,
    float roi_width_start, float roi_width_end,
    const size_t SumHW, bool extrapolation_enabled,
    float cubic_coeff_a, bool exclude_outside,
    const CudaFunctionOriginalCoordinate& transform_coordinate,
    CubicMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW);
  auto& dm = dims_mapping[id];
  bool is_y_axis = (id < output_height);
  int max_input_coord = static_cast<int>(is_y_axis ? input_height : input_width);

  float scale = is_y_axis ? scale_height : scale_width;
  float input_coordinat = scale == 1
                              ? (is_y_axis ? id : id - output_height)
                              : transform_coordinate(
                                    static_cast<float>(is_y_axis ? id : id - output_height),
                                    scale,
                                    static_cast<float>(is_y_axis ? output_height : output_width),
                                    static_cast<float>(max_input_coord),
                                    (is_y_axis ? roi_height_start : roi_width_start),
                                    (is_y_axis ? roi_height_end : roi_width_end));
  int coord_int = static_cast<int>(_Floor(input_coordinat));
  float s_coord = abs(input_coordinat - coord_int);
  float coeff_sum = 1.0f;
  float coeff_0 = static_cast<float>(((cubic_coeff_a * (s_coord + 1) - 5 * cubic_coeff_a) * (s_coord + 1) + 8 * cubic_coeff_a) * (s_coord + 1) - 4 * cubic_coeff_a);
  float coeff_1 = static_cast<float>(((cubic_coeff_a + 2) * s_coord - (cubic_coeff_a + 3)) * s_coord * s_coord + 1);
  float coeff_2 = static_cast<float>(((cubic_coeff_a + 2) * (1 - s_coord) - (cubic_coeff_a + 3)) * (1 - s_coord) * (1 - s_coord) + 1);
  float coeff_3 = static_cast<float>(((cubic_coeff_a * (2 - s_coord) - 5 * cubic_coeff_a) * (2 - s_coord) + 8 * cubic_coeff_a) * (2 - s_coord) - 4 * cubic_coeff_a);
  if (exclude_outside) {
    coeff_0 = (coord_int - 1 < 0 || coord_int - 1 >= max_input_coord) ? 0.0 : coeff_0;
    coeff_1 = (coord_int + 0 < 0 || coord_int + 0 >= max_input_coord) ? 0.0 : coeff_1;
    coeff_2 = (coord_int + 1 < 0 || coord_int + 1 >= max_input_coord) ? 0.0 : coeff_2;
    coeff_3 = (coord_int + 2 < 0 || coord_int + 2 >= max_input_coord) ? 0.0 : coeff_3;
    coeff_sum = coeff_0 + coeff_1 + coeff_2 + coeff_3;
  }
  dm.origin_ = coord_int;
  dm.coeff0_ = coeff_0 / coeff_sum;
  dm.coeff1_ = coeff_1 / coeff_sum;
  dm.coeff2_ = coeff_2 / coeff_sum;
  dm.coeff3_ = coeff_3 / coeff_sum;
  dm.extrapolate_ = (int)(extrapolation_enabled && (input_coordinat < 0 || input_coordinat > static_cast<float>(max_input_coord - 1)));
}
template <typename T>
__global__ void _ResizeBiCubicKernel(
    int64_t input_height, int64_t input_width,
    int64_t output_height, int64_t output_width,
    fast_divmod div_output_width, fast_divmod div_output_image,
    const T* input_data, T* output_data,
    const size_t N,
    const T extrapolation_value,
    CubicMappingInfo* dims_mapping) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  int bxc, output_image_index, output_x, output_y;
  div_output_image.divmod(id, bxc, output_image_index);
  HIP_LONG input_index = bxc * input_height * input_width;
  div_output_width.divmod(output_image_index, output_y, output_x);

  CubicMappingInfo& y_info = dims_mapping[output_y];
  CubicMappingInfo& x_info = dims_mapping[output_x + output_height];
  if (y_info.extrapolate_ || x_info.extrapolate_) {
    output_data[id] = extrapolation_value;
    return;
  }

  float w0 = x_info.coeff0_;
  float w1 = x_info.coeff1_;
  float w2 = x_info.coeff2_;
  float w3 = x_info.coeff3_;
  int x_int = x_info.origin_;
  int y_int = y_info.origin_;
  const T* image = input_data + input_index;
  output_data[id] = y_info.coeff0_ * CubicInterpolationRowwise(image, x_int, y_int - 1, input_height, input_width, w0, w1, w2, w3) +
                    y_info.coeff1_ * CubicInterpolationRowwise(image, x_int, y_int, input_height, input_width, w0, w1, w2, w3) +
                    y_info.coeff2_ * CubicInterpolationRowwise(image, x_int, y_int + 1, input_height, input_width, w0, w1, w2, w3) +
                    y_info.coeff3_ * CubicInterpolationRowwise(image, x_int, y_int + 2, input_height, input_width, w0, w1, w2, w3);
}
size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode,
                            const gsl::span<const int64_t>& output_dims) {
  switch (upsample_mode) {
    case UpsampleMode::NN:
      return sizeof(int64_t) * output_dims.size() +
             sizeof(NearestMappingInfo) * static_cast<size_t>(std::accumulate(output_dims.begin(), output_dims.end(), (int64_t)0));
    case UpsampleMode::LINEAR:
      return sizeof(LinearMappingInfo) * static_cast<size_t>(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0));
    case UpsampleMode::CUBIC:
      return sizeof(CubicMappingInfo) * static_cast<size_t>(std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, (int64_t)0));
  }
  return 0;
}
template <typename T>
void ResizeNearestImpl(
    hipStream_t stream,
    const int rank,
    TArray<int64_t>& input_shape,
    TArray<int64_t>& output_shape,
    TArray<int64_t>& input_strides,
    TArray<fast_divmod>& output_div_pitches,
    TArray<float>& scales_vals,
    TArray<float, 10>& roi_vals,
    const T* input_data,
    T* output_data,
    const size_t N,
    bool extrapolation_enabled,
    const T extrapolation_value,
    float cubic_coeff_a,
    ResizeCoordinateTransformationMode transform_coordinate,
    ResizeNearestMode calc_nearest_pixel,
    int64_t* /* prefix_dim_sum */,
    NearestMappingInfo* dims_mapping) {
  unsigned int blocksPerGrid = static_cast<unsigned int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));

  bool could2d = rank >= 2 &&
                 transform_coordinate != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE &&
                 std::all_of(scales_vals.Data(), scales_vals.Data() + (rank - 2), [](float v) { return v == 1.0; });
  if (could2d) {
    int64_t output_height = output_shape[rank - 2];
    int64_t output_width = output_shape[rank - 1];
    fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3]
                                              : fast_divmod(static_cast<int>(output_height * output_width));
    int blocksPerDimsMappingGrid = static_cast<int>(ceil((output_height + output_width) / 32.0));

    DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(transform_coordinate, [&]() {
      DISPATCH_RESIZE_NEAREST_MODE(calc_nearest_pixel, [&]() {
        hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestMappingKernel2D<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                           static_cast<int>(input_shape[rank - 2]), static_cast<int>(input_shape[rank - 1]),
                           static_cast<int>(output_height), static_cast<int>(output_width),
                           scales_vals[rank - 2], scales_vals[rank - 1],
                           roi_vals[rank - 2], roi_vals[rank - 2 + rank],
                           roi_vals[rank - 1], roi_vals[rank - 1 + rank],
                           extrapolation_enabled, coord_t(), nearest_t(),
                           dims_mapping);
      });
    });
    if (extrapolation_enabled) {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestKernel2D<T, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         output_height, output_width,
                         input_shape[rank - 2] * input_shape[rank - 1], static_cast<int>(input_shape[rank - 1]),
                         div_output_image, output_div_pitches[rank - 2],
                         input_data, output_data, N,
                         extrapolation_value,
                         dims_mapping);
    } else {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestKernel2D<T, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         output_height, output_width,
                         input_shape[rank - 2] * input_shape[rank - 1], static_cast<int>(input_shape[rank - 1]),
                         div_output_image, output_div_pitches[rank - 2],
                         input_data, output_data, N,
                         extrapolation_value,
                         dims_mapping);
    }
    return;
  }

  int64_t total_dim_sum = std::accumulate(output_shape.Data(), output_shape.Data() + rank, (int64_t)0);
  int blocksPerDimsMappingGrid = (int)(ceil(static_cast<double>(total_dim_sum) / 32));
  DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(transform_coordinate, [&]() {
    DISPATCH_RESIZE_NEAREST_MODE(calc_nearest_pixel, [&]() {
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestMappingKernel<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                         rank, input_shape, output_shape,
                         scales_vals, roi_vals,
                         total_dim_sum, extrapolation_enabled,
                         coord_t(), nearest_t(),
                         reinterpret_cast<int64_t*>(dims_mapping),
                         reinterpret_cast<NearestMappingInfo*>(reinterpret_cast<int64_t*>(dims_mapping) + rank));
    });
  });
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeNearestKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     rank, input_strides, output_div_pitches,
                     input_data, output_data, N,
                     extrapolation_value,
                     reinterpret_cast<const int64_t*>(dims_mapping),
                     reinterpret_cast<const NearestMappingInfo*>(reinterpret_cast<int64_t*>(dims_mapping) + rank));
  return;
}
template <typename T>
void ResizeImpl(
    hipStream_t stream,
    const UpsampleMode upsample_mode,
    const int rank,
    TArray<int64_t>& input_shape,
    TArray<int64_t>& output_shape,
    TArray<int64_t>& input_strides,
    TArray<fast_divmod>& output_div_pitches,
    TArray<float>& scales_vals,
    TArray<float, 10>& roi_vals,
    const T* input_data,
    T* output_data,
    const size_t N,
    bool extrapolation_enabled,
    const T extrapolation_value,
    float cubic_coeff_a,
    bool exclude_outside,
    ResizeCoordinateTransformationMode coordinate_transform_mode,
    ResizeNearestMode nearest_mode,
    void* dims_mapping) {
  bool isSame = std::all_of(scales_vals.Data(), scales_vals.Data() + rank, [](float v) { return v == 1.0f; }) &&
                (coordinate_transform_mode != ResizeCoordinateTransformationMode::TF_CROP_AND_RESIZE);
  if (isSame) {
    HIP_CALL_THROW(hipMemcpyAsync(output_data, input_data, N * sizeof(T), hipMemcpyDeviceToDevice, stream));
    return;
  }

  if (upsample_mode == UpsampleMode::NN) {
    ResizeNearestImpl(
        stream, rank, input_shape, output_shape, input_strides, output_div_pitches,
        scales_vals, roi_vals, input_data, output_data, N,
        extrapolation_enabled, extrapolation_value, cubic_coeff_a,
        coordinate_transform_mode, nearest_mode,
        reinterpret_cast<int64_t*>(dims_mapping),
        reinterpret_cast<NearestMappingInfo*>(reinterpret_cast<int64_t*>(dims_mapping) + rank));
    return;
  }

  // We support a special case of bilinear or bicubic if the input data is 4D with the outer 2 scales being 1.0.
  // We would have validated the outer scale values by the time execution reaches this.
  bool is_2D = (rank == 2 || rank == 4);

  // We support a special case of trilinear or tricubic if the input data is 5D with the outer 2 scales being 1.0.
  // We would have validated the outer scale values by the time execution reaches this.
  bool is_3D = (rank == 3 || rank == 5);

  // Should not hit this as we have already validated input rank/scales and we provide verbose error messages
  // to the user.
  ORT_ENFORCE(is_2D || is_3D, "Only bilinear/trilinear and bicubic modes are supported in Resize");

  int blocksPerGrid = static_cast<int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  fast_divmod div_output_image;
  if (is_2D) {
    div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(gsl::narrow_cast<int>(N));
  } else if (is_3D) {
    div_output_image = (rank > 3) ? output_div_pitches[rank - 4] : fast_divmod(gsl::narrow_cast<int>(N));
  }
  int64_t output_depth = is_3D ? output_shape[rank - 3] : 0;
  int64_t output_height = output_shape[rank - 2];
  int64_t output_width = output_shape[rank - 1];
  int blocksPerDimsMappingGrid = static_cast<int>(ceil((output_depth + output_height + output_width) / 32.0));

  switch (upsample_mode) {
    case UpsampleMode::LINEAR:
      if (is_2D) {
        DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() {
          hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeBilinearCoordinateMapping<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                             input_shape[rank - 2], input_shape[rank - 1],
                             output_height, output_width,
                             scales_vals[rank - 2], scales_vals[rank - 1],
                             roi_vals[rank - 2], roi_vals[rank - 2 + rank],
                             roi_vals[rank - 1], roi_vals[rank - 1 + rank],
                             output_height + output_width, extrapolation_enabled, coord_t(),
                             reinterpret_cast<LinearMappingInfo*>(dims_mapping));
        });
        hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeBilinearKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           input_shape[rank - 2], input_shape[rank - 1],
                           output_height, output_width,
                           output_div_pitches[rank - 2], div_output_image,
                           input_data, output_data, N,
                           extrapolation_value,
                           reinterpret_cast<LinearMappingInfo*>(dims_mapping));
        return;
      } else if (is_3D) {
        DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() {
          hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeTrilinearCoordinateMapping<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                             input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1],
                             output_depth, output_height, output_width,
                             scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1],
                             roi_vals[rank - 3], roi_vals[rank - 3 + rank],
                             roi_vals[rank - 2], roi_vals[rank - 2 + rank],
                             roi_vals[rank - 1], roi_vals[rank - 1 + rank],
                             output_depth + output_height + output_width, extrapolation_enabled, coord_t(),
                             reinterpret_cast<LinearMappingInfo*>(dims_mapping));
        });
        hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeTrilinearKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1],
                           output_depth, output_height, output_width,
                           output_div_pitches[rank - 3], output_div_pitches[rank - 2], div_output_image,
                           input_data, output_data, N,
                           extrapolation_value,
                           reinterpret_cast<LinearMappingInfo*>(dims_mapping));
        return;
      }
      ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize");
      break;
    case UpsampleMode::CUBIC:
      if (is_2D) {
        DISPATCH_RESIZE_COORDINATE_TRANSFORMATION_MODE(coordinate_transform_mode, [&]() {
          hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeCubicCoordinateMapping<T>), blocksPerDimsMappingGrid, 32, 0, stream,
                             input_shape[rank - 2], input_shape[rank - 1],
                             output_height, output_width,
                             scales_vals[rank - 2], scales_vals[rank - 1],
                             roi_vals[rank - 2], roi_vals[rank - 2 + rank],
                             roi_vals[rank - 1], roi_vals[rank - 1 + rank],
                             output_height + output_width, extrapolation_enabled,
                             cubic_coeff_a, exclude_outside, coord_t(),
                             reinterpret_cast<CubicMappingInfo*>(dims_mapping));
        });
        hipLaunchKernelGGL(HIP_KERNEL_NAME(_ResizeBiCubicKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                           input_shape[rank - 2], input_shape[rank - 1],
                           output_height, output_width,
                           output_div_pitches[rank - 2], div_output_image,
                           input_data, output_data, N,
                           extrapolation_value,
                           reinterpret_cast<CubicMappingInfo*>(dims_mapping));
        return;
      }
      ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize");
    case UpsampleMode::NN:
      ORT_THROW("Only bilinear/trilinear and bicubic modes are supported in Resize");
  }
}
#define SPECIALIZED_IMPL(T) \
template void ResizeImpl<T>( \
hipStream_t stream, \
const UpsampleMode upsample_mode, \
const int rank, \
TArray<int64_t>& input_shape, \
TArray<int64_t>& output_shape, \
TArray<int64_t>& input_strides, \
TArray<fast_divmod>& output_div_pitches, \
TArray<float>& scales_vals, \
TArray<float, 10>& roi_vals, \
const T* input_data, \
T* output_data, \
const size_t N, \
bool extrapolation_enabled, \
const T extrapolation_value, \
float cubic_coeff_a, \
bool exclude_outside, \
ResizeCoordinateTransformationMode coordinate_transform_mode, \
ResizeNearestMode nearest_mode, \
void* dims_mapping);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(int32_t)
SPECIALIZED_IMPL(uint8_t)

}  // namespace rocm
}  // namespace onnxruntime
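For quick reference, the coordinate transformations implemented by the TransformCoordinate_* functors near the top of this file (x_r is the resized/output coordinate, s the scale, L_o and L_r the original and resized lengths) are:

    half_pixel           : x_o = (x_r + 0.5) / s - 0.5
    asymmetric           : x_o = x_r / s
    pytorch_half_pixel   : x_o = L_r > 1 ? (x_r + 0.5) / s - 0.5 : 0
    tf_half_pixel_for_nn : x_o = (x_r + 0.5) / s
    align_corners        : x_o = L_r == 1 ? 0 : x_r * (L_o - 1) / (L_r - 1)
    tf_crop_and_resize   : x_o = L_r > 1 ? roi_start*(L_o-1) + x_r*(roi_end-roi_start)*(L_o-1)/(L_r-1)
                                         : 0.5*(roi_start+roi_end)*(L_o-1)

This is a restatement of the functor bodies above, collected in one place for readability.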
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/resize_impl.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"
#include "core/providers/cpu/tensor/upsamplebase.h"
#include "core/providers/rocm/rocm_common.h"
namespace onnxruntime {
namespace rocm {

size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode,
                            const gsl::span<const int64_t>& output_dims);

template <typename T>
void ResizeImpl(
    hipStream_t stream,
    const onnxruntime::UpsampleMode upsample_mode,
    const int rank,
    TArray<int64_t>& input_shape,
    TArray<int64_t>& output_shape,
    TArray<int64_t>& input_strides,
    TArray<fast_divmod>& output_div_pitches,
    TArray<float>& scales_vals,
    TArray<float, 10>& roi,
    const T* input_data,
    T* output_data,
    const size_t N,
    bool extrapolation_enabled,
    const T extrapolation_value,
    float cubic_coeff_a,
    bool exclude_outside,
    onnxruntime::ResizeCoordinateTransformationMode coordinate_transform_mode,
    onnxruntime::ResizeNearestMode nearest_mode,
    void* dims_mapping);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence.cc  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "reverse_sequence.h"
#include "reverse_sequence_impl.h"
#include "core/providers/cpu/tensor/utils.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_KERNEL_EX(
    ReverseSequence,
    kOnnxDomain,
    10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    ReverseSequenceOp);
#define ReverseSequenceCallCudaImplTypeAs(T, TEqual) \
if (X.IsDataType<T>()) { \
HIP_RETURN_IF_ERROR(ReverseSequenceCudaImpl( \
Stream(), \
reinterpret_cast<const typename ToHipType<TEqual>::MappedType*>(X.Data<T>()), \
seq_lengths.Data<int64_t>(), \
reinterpret_cast<typename ToHipType<TEqual>::MappedType*>(Y.MutableData<T>()), \
gsl::narrow<int>(batch_size), gsl::narrow<int>(max_seq_len), gsl::narrow<int>(element_size), \
time_major_)); \
return Status::OK(); \
}
Status ReverseSequenceOp::ComputeInternal(OpKernelContext* context) const {
  const auto& X = *context->Input<Tensor>(0);
  const auto& dims = X.Shape();

  const auto batch_size = time_major_ ? dims[1] : dims[0];
  const auto max_seq_len = time_major_ ? dims[0] : dims[1];
  const auto element_size = dims.SizeFromDimension(2);

  const auto& seq_lengths = *context->Input<Tensor>(1);
  const auto& seq_len_shape = seq_lengths.Shape();

  if (seq_len_shape.NumDimensions() != 1 || seq_len_shape[0] != batch_size) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "sequence_lens shape must be {batch_size}. Got:",
                           seq_len_shape, ". batch_size=", batch_size);
  }
  auto& Y = *context->Output(0, dims);

  ReverseSequenceCallCudaImplTypeAs(float, int32_t);
  ReverseSequenceCallCudaImplTypeAs(int32_t, int32_t);
  ReverseSequenceCallCudaImplTypeAs(uint32_t, int32_t);

  ReverseSequenceCallCudaImplTypeAs(MLFloat16, int16_t);
  ReverseSequenceCallCudaImplTypeAs(int16_t, int16_t);
  ReverseSequenceCallCudaImplTypeAs(uint16_t, int16_t);

  ReverseSequenceCallCudaImplTypeAs(int8_t, int8_t);
  ReverseSequenceCallCudaImplTypeAs(uint8_t, int8_t);
  ReverseSequenceCallCudaImplTypeAs(bool, int8_t);

  ReverseSequenceCallCudaImplTypeAs(int64_t, int64_t);
  ReverseSequenceCallCudaImplTypeAs(double, int64_t);
  ReverseSequenceCallCudaImplTypeAs(uint64_t, int64_t);

  return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED,
                         "Type for ", X.DataType(), " is not supported yet in ReverseSequence.");
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence.h  0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

class ReverseSequenceOp final : public RocmKernel {
 public:
  ReverseSequenceOp(const OpKernelInfo& info) : RocmKernel(info) {
    int64_t batch_axis;
    int64_t time_axis;
    ORT_ENFORCE(info.GetAttr<int64_t>("batch_axis", &batch_axis).IsOK());
    ORT_ENFORCE(info.GetAttr<int64_t>("time_axis", &time_axis).IsOK());

    ORT_ENFORCE(batch_axis < 2, "Invalid batch_axis of ", batch_axis, ". Must be 0 or 1");
    ORT_ENFORCE(time_axis < 2, "Invalid time_axis of ", time_axis, ". Must be 0 or 1");
    ORT_ENFORCE(batch_axis != time_axis,
                "time_axis and batch_axis must have different values but both are ", time_axis);

    time_major_ = time_axis == 0;
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  bool time_major_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence_impl.cu  0 → 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "reverse_sequence_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/common/common.h"
namespace onnxruntime {
namespace rocm {

static const int kReverseSequenceElementsPerThread = 4;

template <typename T, bool time_major>
__global__ void ReverseSequenceImplKernel(
    const T* x_data,
    const int64_t* seq_len_data,
    T* y_data,
    const int batch_size,
    const int max_seq_len,
    const int element_size,
    const int group_count,
    const fast_divmod fdm_grouped_stride_0,
    const fast_divmod fdm_grouped_stride_1) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(grouped_index, group_count);
  int batch_id, seq_id, gid = grouped_index;
  if (time_major) {
    fdm_grouped_stride_0.divmod(gid, seq_id, gid);
    fdm_grouped_stride_1.divmod(gid, batch_id, gid);
  } else {
    fdm_grouped_stride_0.divmod(gid, batch_id, gid);
    fdm_grouped_stride_1.divmod(gid, seq_id, gid);
  }
  int eid = gid * kReverseSequenceElementsPerThread;
  int target_seq_id = (seq_id < (int)seq_len_data[batch_id]) ? ((int)seq_len_data[batch_id] - 1 - seq_id) : seq_id;
  int flat_src_idx, flat_target_idx;
  if (time_major) {
    flat_src_idx = seq_id * batch_size * element_size + batch_id * element_size + eid;
    flat_target_idx = target_seq_id * batch_size * element_size + batch_id * element_size + eid;
  } else {
    flat_src_idx = batch_id * max_seq_len * element_size + seq_id * element_size + eid;
    flat_target_idx = batch_id * max_seq_len * element_size + target_seq_id * element_size + eid;
  }
  y_data[flat_target_idx] = x_data[flat_src_idx];
#pragma unroll
  for (int i = 1; i < kReverseSequenceElementsPerThread; ++i) {
    if (eid + i < element_size) {
      y_data[flat_target_idx + i] = x_data[flat_src_idx + i];
    }
  }
}

template <typename T>
hipError_t ReverseSequenceCudaImpl(
    hipStream_t stream,
    const T* x_data,
    const int64_t* seq_len_data,
    T* y_data,
    const int batch_size,
    const int max_seq_len,
    const int element_size,
    const bool time_major) {
  int element_group_size = CeilDiv(element_size, kReverseSequenceElementsPerThread);
  fast_divmod fdm_grouped_stride_1(element_group_size);
  fast_divmod fdm_grouped_stride_0(element_group_size * ((time_major) ? batch_size : max_seq_len));
  int group_count = batch_size * max_seq_len * element_group_size;
  int blocksPerGrid = CeilDiv(group_count, GridDim::maxThreadsPerBlock);
  if (time_major) {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(ReverseSequenceImplKernel<T, true>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                       x_data, seq_len_data, y_data,
                       batch_size, max_seq_len, element_size, group_count,
                       fdm_grouped_stride_0, fdm_grouped_stride_1);
  } else {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(ReverseSequenceImplKernel<T, false>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                       x_data, seq_len_data, y_data,
                       batch_size, max_seq_len, element_size, group_count,
                       fdm_grouped_stride_0, fdm_grouped_stride_1);
  }
  return hipSuccess;
}
#define InstantiateReverseSequenceImpl(T) \
template hipError_t ReverseSequenceCudaImpl( \
hipStream_t stream, \
const T* x_data, \
const int64_t* seq_len_data, \
T* y_data, \
const int batch_size, \
const int max_seq_len, \
const int element_size, \
const bool time_major)
InstantiateReverseSequenceImpl
(
int64_t
);
InstantiateReverseSequenceImpl
(
int32_t
);
InstantiateReverseSequenceImpl
(
int16_t
);
InstantiateReverseSequenceImpl
(
int8_t
);
}
// namespace rocm
}
// namespace onnxruntime
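For reference, here is a plain-CPU sketch of the same index math the kernel performs (a verification aid only; ReverseSequenceReference is not part of the ORT sources, and the layout assumptions are stated in the comments). Positions inside the valid prefix of each batch entry are mirrored to seq_len - 1 - seq, while padding positions past seq_len are copied unchanged.

#include <cstdint>
#include <vector>

// Layouts match the kernel: time-major is [max_seq_len, batch_size, element_size],
// batch-major is [batch_size, max_seq_len, element_size].
template <typename T>
void ReverseSequenceReference(const T* x, const int64_t* seq_len, T* y,
                              int batch_size, int max_seq_len, int element_size,
                              bool time_major) {
  for (int batch = 0; batch < batch_size; ++batch) {
    for (int seq = 0; seq < max_seq_len; ++seq) {
      int target_seq = (seq < static_cast<int>(seq_len[batch]))
                           ? static_cast<int>(seq_len[batch]) - 1 - seq
                           : seq;  // padding is copied as-is
      for (int e = 0; e < element_size; ++e) {
        int64_t src, dst;
        if (time_major) {
          src = (int64_t)seq * batch_size * element_size + (int64_t)batch * element_size + e;
          dst = (int64_t)target_seq * batch_size * element_size + (int64_t)batch * element_size + e;
        } else {
          src = (int64_t)batch * max_seq_len * element_size + (int64_t)seq * element_size + e;
          dst = (int64_t)batch * max_seq_len * element_size + (int64_t)target_seq * element_size + e;
        }
        y[dst] = x[src];
      }
    }
  }
}

int main() {
  // Batch-major [2, 3, 1] input with seq_len {3, 2}.
  std::vector<float> x{1, 2, 3, 4, 5, 6}, y(6);
  std::vector<int64_t> seq_len{3, 2};
  ReverseSequenceReference(x.data(), seq_len.data(), y.data(), 2, 3, 1, /*time_major=*/false);
  // y is {3, 2, 1, 5, 4, 6}: all of batch 0 and the first 2 entries of batch 1 are reversed.
}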
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/reverse_sequence_impl.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
hipError_t ReverseSequenceCudaImpl(
    hipStream_t stream,
    const T* x_data,
    const int64_t* seq_len_data,
    T* y_data,
    const int batch_size,
    const int max_seq_len,
    const int element_size,
    const bool time_major);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements.cc
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/scatter_elements.h"

#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/rocm/tensor/gather_elements.h"
#include "core/providers/rocm/tensor/gather_elements_impl.h"
#include "core/providers/rocm/tensor/scatter_elements_impl.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Scatter, kOnnxDomain, 9, 10, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("Tind", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                        DataTypeImpl::GetTensorType<int64_t>()}),
    ScatterElements);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    ScatterElements, kOnnxDomain, 11, 12, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("Tind", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                        DataTypeImpl::GetTensorType<int64_t>()}),
    ScatterElements);

ONNX_OPERATOR_KERNEL_EX(
    ScatterElements, kOnnxDomain, 13, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("Tind", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                        DataTypeImpl::GetTensorType<int64_t>()}),
    ScatterElements);

#define CASE_SCATTER_ELEMENTS_IMPL(type)                                                                          \
  case sizeof(type): {                                                                                            \
    const type* indices_data = reinterpret_cast<const type*>(indices_data_raw);                                   \
    ORT_RETURN_IF_ERROR(ScatterElementsImpl(stream, input_data, indices_data, updates_data, output_data, args));  \
  } break

template <typename T>
struct ScatterElements::ComputeImpl {
  Status operator()(hipStream_t stream, const void* input_data_raw, const void* updates_data_raw,
                    const void* indices_data_raw, void* output_data_raw, const size_t index_element_size,
                    const GatherScatterElementsArgs& args) const {
    typedef typename ToHipType<T>::MappedType HipT;
    const HipT* input_data = reinterpret_cast<const HipT*>(input_data_raw);
    const HipT* updates_data = reinterpret_cast<const HipT*>(updates_data_raw);
    HipT* output_data = reinterpret_cast<HipT*>(output_data_raw);
    switch (index_element_size) {
      CASE_SCATTER_ELEMENTS_IMPL(int32_t);
      CASE_SCATTER_ELEMENTS_IMPL(int64_t);
      // should not reach here as we validate if the all relevant types are supported in the Compute method
      default:
        ORT_THROW("Unsupported indices element size by the ScatterElements ROCM kernel");
    }
    return Status::OK();
  }
};

#undef CASE_SCATTER_ELEMENTS_IMPL

Status ScatterElements::ComputeInternal(OpKernelContext* context) const {
  const auto* input_tensor = context->Input<Tensor>(0);
  const auto& input_shape = input_tensor->Shape();
  const int64_t input_size = input_shape.Size();
  const int64_t input_rank = static_cast<int64_t>(input_shape.NumDimensions());
  const int64_t axis = HandleNegativeAxis(axis_, input_rank);

  const auto* indices_tensor = context->Input<Tensor>(1);
  const auto* updates_tensor = context->Input<Tensor>(2);

  if (input_tensor->DataType() != updates_tensor->DataType()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "data type is different from updates type");
  }

  const auto& indices_shape = indices_tensor->Shape();
  auto indices_dims = indices_shape.GetDims();
  auto updates_dims = updates_tensor->Shape().GetDims();
  if (indices_dims.size() != updates_dims.size()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Indices and updates must have the same rank");
  }
  for (size_t i = 0; i < indices_dims.size(); ++i) {
    if (indices_dims[i] != updates_dims[i]) {
      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Indices vs updates dimensions differs at position=", i,
                             " ", indices_dims[i], " vs ", updates_dims[i]);
    }
  }

  // Validate input shapes and ranks (invoke the static method in the CPU GatherElements kernel that hosts the shared
  // checks)
  ORT_RETURN_IF_ERROR(onnxruntime::GatherElements::ValidateInputShapes(input_shape, indices_shape, axis));

  auto* output_tensor = context->Output(0, input_shape);
  if (input_size == 0) return Status::OK();

  GatherScatterElementsArgs args;
  args.input_size = input_size;
  args.indices_size = indices_shape.Size();
  TensorShapeVector input_shape_vec = input_shape.AsShapeVector();
  TensorShapeVector indices_shape_vec = indices_shape.AsShapeVector();
  CoalesceDimensions(input_shape_vec, indices_shape_vec, nullptr, axis, args);

  // Use element size instead of concrete types so we can specialize less template functions to reduce binary size.
  int dtype = GetElementType(input_tensor->DataType()->Size());
  if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
    ORT_THROW("Unsupported element size by the ScatterElements ROCM kernel");
  }

  utils::MLTypeCallDispatcher<int8_t, MLFloat16, float, double> t_disp(dtype);
  return t_disp.InvokeRet<Status, ComputeImpl>(Stream(), input_tensor->DataRaw(), updates_tensor->DataRaw(),
                                               indices_tensor->DataRaw(), output_tensor->MutableDataRaw(),
                                               indices_tensor->DataType()->Size(), args);
}

}  // namespace rocm
}  // namespace onnxruntime
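The GPU path above copies the input to the output and applies the updates through ScatterElementsImpl; the semantics it implements are the standard ONNX ScatterElements ones. The following is a minimal host-side sketch for rank-2 tensors (illustration only; ScatterElements2D is a hypothetical helper, not part of these sources), using the example from the ONNX operator documentation.

#include <cstdint>
#include <iostream>
#include <vector>

// Copy of `data` with updates scattered along `axis`; negative indices wrap, per the ONNX spec.
std::vector<float> ScatterElements2D(std::vector<float> data, int rows, int cols,
                                     const std::vector<int64_t>& indices,
                                     const std::vector<float>& updates,
                                     int ind_rows, int ind_cols, int axis) {
  for (int i = 0; i < ind_rows; ++i) {
    for (int j = 0; j < ind_cols; ++j) {
      int64_t idx = indices[i * ind_cols + j];
      if (axis == 0) {
        if (idx < 0) idx += rows;
        data[idx * cols + j] = updates[i * ind_cols + j];
      } else {
        if (idx < 0) idx += cols;
        data[i * cols + idx] = updates[i * ind_cols + j];
      }
    }
  }
  return data;
}

int main() {
  // 3x3 zeros, scatter a 2x3 update along axis 0 (the ONNX ScatterElements doc example).
  std::vector<float> data(9, 0.0f);
  std::vector<int64_t> indices{1, 0, 2, 0, 2, 1};
  std::vector<float> updates{1.0f, 1.1f, 1.2f, 2.0f, 2.1f, 2.2f};
  auto out = ScatterElements2D(data, 3, 3, indices, updates, 2, 3, /*axis=*/0);
  for (int r = 0; r < 3; ++r) {
    for (int c = 0; c < 3; ++c) std::cout << out[r * 3 + c] << ' ';
    std::cout << '\n';  // rows: 2 1.1 0 / 1 0 2.2 / 0 2.1 1.2
  }
}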
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

class ScatterElements final : public RocmKernel {
 public:
  ScatterElements(const OpKernelInfo& info) : RocmKernel(info) {
    ORT_ENFORCE(info.GetAttr<int64_t>("axis", &axis_).IsOK(), "Missing/Invalid 'axis' attribute value");
  }
  ~ScatterElements() = default;
  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  template <typename T>
  struct ComputeImpl;

  int64_t axis_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_elements_impl.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

struct GatherScatterElementsArgs;

template <typename T, typename TIndex>
Status ScatterElementsImpl(hipStream_t stream, const T* input_data, const TIndex* indices_data,
                           const T* updates_data, T* output_data, const GatherScatterElementsArgs& args);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd.cc
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/scatter_nd.h"
#include "core/providers/rocm/tensor/scatter_nd_impl.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/providers/cpu/tensor/utils.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    ScatterND, kOnnxDomain, 11, 12, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .MayInplace(0, 0),
    ScatterND);

ONNX_OPERATOR_KERNEL_EX(
    ScatterND, kOnnxDomain, 13, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .MayInplace(0, 0),
    ScatterND);

Status ScatterND::ComputeInternal(OpKernelContext* context) const {
  const auto* input_tensor = context->Input<Tensor>(0);
  const auto* indices_tensor = context->Input<Tensor>(1);
  const auto* updates_tensor = context->Input<Tensor>(2);

  const auto& input_shape = input_tensor->Shape();
  const auto& indices_shape = indices_tensor->Shape();
  const auto& updates_shape = updates_tensor->Shape();

  // Validate input shapes
  ORT_RETURN_IF_ERROR(onnxruntime::ScatterND::ValidateShapes(input_shape, indices_shape, updates_shape));

  auto* output_tensor = context->Output(0, input_shape);

  const void* input_data = input_tensor->DataRaw();
  void* output_data = output_tensor->MutableDataRaw();

  size_t element_size = input_tensor->DataType()->Size();

  if (input_data != output_data) {
    // TODO: Run benchmarks to determine if a dedicated kernel doing data copy will be faster than invoking hipMemcpy ?
    HIP_RETURN_IF_ERROR(
        hipMemcpyAsync(output_data, input_data, input_tensor->SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
  }

  // Bail out early
  if (indices_shape.Size() == 0) {
    return Status::OK();
  }

  auto last_index_dimension = indices_shape[indices_shape.NumDimensions() - 1];

  // We need element counts for each dimension and the input dim value for each dimension
  // for the range [0, last_index_dimension).
  // To avoid multiple GPU data transfers, we combine this into one array and send it through
  TensorPitches input_strides(input_shape);
  std::vector<int64_t> element_counts_and_input_dims(last_index_dimension * 2, 0LL);
  for (int64_t i = 0; i < last_index_dimension; ++i) {
    element_counts_and_input_dims[i] = input_strides[i];
    element_counts_and_input_dims[i + last_index_dimension] = input_shape[i];
  }
  RocmAsyncBuffer<int64_t> element_counts_and_input_dims_gpu(this, element_counts_and_input_dims);
  ORT_RETURN_IF_ERROR(element_counts_and_input_dims_gpu.CopyToGpu());

  ORT_RETURN_IF_ERROR(ScatterNDImpl(
      Stream(),
      output_data,
      element_size,
      indices_shape.Size() / static_cast<size_t>(last_index_dimension),
      indices_tensor->Data<int64_t>(),  // only int64_t is supported for indices as per the onnx spec
      last_index_dimension,
      element_counts_and_input_dims_gpu.GpuPtr(),
      updates_tensor->DataRaw(),
      input_shape.SizeFromDimension(last_index_dimension)));

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
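The element_counts_and_input_dims buffer built above packs, for the first last_index_dimension axes, each axis's element pitch in the first half and that axis's size in the second half, so the kernel can turn one index tuple into a flat slice offset with a single dot product after wrapping or clamping out-of-range indices. Below is a small host-side sketch of that lookup; SliceOffset is a hypothetical helper used only for illustration.

#include <cstdint>
#include <iostream>
#include <vector>

int64_t SliceOffset(const std::vector<int64_t>& counts_and_dims,  // size = 2 * k
                    const int64_t* index_tuple, int64_t k) {
  int64_t offset = 0;
  for (int64_t d = 0; d < k; ++d) {
    int64_t idx = index_tuple[d];
    int64_t dim = counts_and_dims[d + k];
    // Same policy as the kernel: wrap negative indices, clamp anything out of range.
    if (idx < -dim) idx = 0;
    else if (idx < 0) idx += dim;
    else if (idx >= dim) idx = dim - 1;
    offset += idx * counts_and_dims[d];  // counts_and_dims[d] is the pitch of axis d
  }
  return offset;
}

int main() {
  // Input shape [4, 4, 4] with indices last dimension k = 2 -> pitches {16, 4}, dims {4, 4}.
  std::vector<int64_t> counts_and_dims{16, 4, 4, 4};
  int64_t tuple[2] = {2, -1};                                // -1 wraps to 3
  std::cout << SliceOffset(counts_and_dims, tuple, 2) << '\n';  // 2*16 + 3*4 = 44
}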
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/scatter_nd.h"

namespace onnxruntime {
namespace rocm {

class ScatterND final : public RocmKernel {
 public:
  explicit ScatterND(const OpKernelInfo& info) : RocmKernel(info) {}
  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd_impl.cu
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/scatter_nd_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/atomic/common.cuh"
namespace
onnxruntime
{
namespace
rocm
{
template
<
typename
T
>
__global__
void
_ScatterNDKernel
(
T
*
output_data
,
const
size_t
num_indices
,
const
int64_t
*
indices_data
,
const
int64_t
last_index_dimension
,
const
int64_t
*
element_counts_and_input_dims
,
const
T
*
updates_data
,
const
size_t
num_updates_elements
)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT
(
id
,
num_indices
);
// Compute the base offset into the output data
int64_t
data_offset
=
0
;
size_t
indices_start
=
last_index_dimension
*
id
;
size_t
indices_end
=
indices_start
+
last_index_dimension
;
for
(
size_t
i
=
indices_start
;
i
<
indices_end
;
++
i
)
{
int64_t
index
=
indices_data
[
i
];
int64_t
element_count_dim
=
element_counts_and_input_dims
[
i
-
indices_start
];
int64_t
dim_value
=
element_counts_and_input_dims
[
i
-
indices_start
+
last_index_dimension
];
// Clamp the index if out of range
// This would have been an error in the CPU kernel, but throwing in the ROCM EP
// is hard. This is the approach taken by other frameworks for out of bound indices
// in their corresponding GPU backends as well.
// index >= -dim_value && index < dim_value
if
(
index
>=
0
)
{
if
(
index
>=
dim_value
)
{
index
=
dim_value
-
1
;
}
}
else
{
if
(
index
<
-
dim_value
)
{
index
=
0
;
}
else
{
index
+=
dim_value
;
}
}
data_offset
+=
(
index
*
element_count_dim
);
}
const
T
*
updates_data_base
=
updates_data
+
num_updates_elements
*
id
;
T
*
output_data_base
=
output_data
+
data_offset
;
for
(
size_t
i
=
0
;
i
<
num_updates_elements
;
++
i
)
{
output_data_base
[
i
]
=
updates_data_base
[
i
];
}
}
Status
ScatterNDImpl
(
hipStream_t
stream
,
void
*
output_data
,
const
size_t
element_size
,
const
size_t
num_indices
,
const
int64_t
*
indices_data
,
const
int64_t
last_index_dimension
,
const
int64_t
*
element_counts_and_input_dims
,
const
void
*
updates_data
,
const
size_t
num_updates_elements
)
{
if
(
num_indices
==
0
)
return
Status
::
OK
();
// Parallelize on number of indices
int
blocksPerGrid
=
static_cast
<
int
>
(
ceil
(
static_cast
<
float
>
(
num_indices
)
/
GridDim
::
maxThreadsPerBlock
));
switch
(
element_size
)
{
case
sizeof
(
int8_t
):
hipLaunchKernelGGL
(
_ScatterNDKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
reinterpret_cast
<
int8_t
*>
(
output_data
),
num_indices
,
indices_data
,
last_index_dimension
,
element_counts_and_input_dims
,
reinterpret_cast
<
const
int8_t
*>
(
updates_data
),
num_updates_elements
);
break
;
case
sizeof
(
int16_t
):
hipLaunchKernelGGL
(
_ScatterNDKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
reinterpret_cast
<
int16_t
*>
(
output_data
),
num_indices
,
indices_data
,
last_index_dimension
,
element_counts_and_input_dims
,
reinterpret_cast
<
const
int16_t
*>
(
updates_data
),
num_updates_elements
);
break
;
case
sizeof
(
int32_t
):
hipLaunchKernelGGL
(
_ScatterNDKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
reinterpret_cast
<
int32_t
*>
(
output_data
),
num_indices
,
indices_data
,
last_index_dimension
,
element_counts_and_input_dims
,
reinterpret_cast
<
const
int32_t
*>
(
updates_data
),
num_updates_elements
);
break
;
case
sizeof
(
int64_t
):
hipLaunchKernelGGL
(
_ScatterNDKernel
,
blocksPerGrid
,
GridDim
::
maxThreadsPerBlock
,
0
,
stream
,
reinterpret_cast
<
int64_t
*>
(
output_data
),
num_indices
,
indices_data
,
last_index_dimension
,
element_counts_and_input_dims
,
reinterpret_cast
<
const
int64_t
*>
(
updates_data
),
num_updates_elements
);
break
;
default:
// Shouldn't hit this
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
"Type not supported for ScatterND operator"
);
}
return
Status
::
OK
();
}
}
// namespace rocm
}
// namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/scatter_nd_impl.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

Status ScatterNDImpl(
    hipStream_t stream,
    void* output_data,
    const size_t element_size,
    const size_t num_indices,
    const int64_t* indices_data,
    const int64_t last_index_dimension,
    const int64_t* element_counts_and_input_dims,
    const void* updates_data,
    const size_t num_updates_elements);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/sequence_op.cc
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "sequence_op.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_KERNEL_EX(
    SequenceAt, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("I", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>()}),
    SequenceAt);

ONNX_OPERATOR_KERNEL_EX(
    SequenceConstruct, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes()),
    SequenceConstruct);

ONNX_OPERATOR_KERNEL_EX(
    SequenceEmpty, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes()),
    SequenceEmpty);

ONNX_OPERATOR_KERNEL_EX(
    SequenceLength, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .OutputMemoryType(OrtMemTypeCPUInput, 0)
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
        .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),
    SequenceLength);

ONNX_OPERATOR_KERNEL_EX(
    ConcatFromSequence, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes()),
    ConcatFromSequence);

ONNX_OPERATOR_KERNEL_EX(
    SequenceErase, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
        .TypeConstraint("I", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>()}),
    SequenceErase);

ONNX_OPERATOR_KERNEL_EX(
    SequenceInsert, kOnnxDomain, 11, kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 2)
        .TypeConstraint("S", DataTypeImpl::AllFixedSizeSequenceTensorTypes())
        .TypeConstraint("I", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>()}),
    SequenceInsert);

}  // namespace rocm
}  // namespace onnxruntime