gaoqiong / onnxruntime_v14 / Commits / 1a91fcc2
Commit 1a91fcc2 authored Jul 25, 2023 by gaoqiong

add files required by dtk

parent a144865d
Pipeline #492 failed with stages in 0 seconds
Changes: 280
Pipelines: 1
Showing 20 changed files with 1657 additions and 0 deletions
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/binary_elementwise_ops_impl.h  +100 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/binary_elementwise_ops_impl_functors.cuh  +40 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip.cc  +127 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip.h  +35 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip_impl.cu  +51 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip_impl.h  +15 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum.cc  +144 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum.h  +47 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum_impl.cu  +168 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum_impl.h  +24 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/gemm.h  +36 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul.h  +33 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cc  +114 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cu  +132 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cuh  +25 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.h  +34 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax.h  +69 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax_blockwise_impl.cuh  +335 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax_common.h  +22 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/topk.cc  +106 -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/binary_elementwise_ops_impl.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {
// These macros simplify coding. To add a new op, follow these steps:
// 1. Add a new entry in BINARY_OPS() list
// 2. (optional) Define templated single element operator in binary_elementwise_ops_impl.cu
// 3. (optional) Implement specialized single element operator
// 4. Add op kernel class definition in binary_elementwise_ops.h
// 5. Add op kernel registration and compute specialization in binary_elementwise_ops.cc
#define BINARY_OPS() \
BINARY_OP_NAME_EXPR(Add, (a + b)) \
BINARY_OP_NAME_EXPR(Sub, (a - b)) \
BINARY_OP_NAME_EXPR(Mul, (a * b)) \
BINARY_OP_NAME_EXPR(Div, (a / b)) \
BINARY_OP_NAME_EXPR(Pow_7, _Pow(a, b)) \
BINARY_OP_NAME_EXPR(And, (a & b)) \
BINARY_OP_NAME_EXPR(Or, (a | b)) \
BINARY_OP_NAME_EXPR(Xor, (a ^ b)) \
BINARY_OP_NAME_EXPR(PRelu, (a > (T)0 ? a : a * b)) \
BINARY_OP_NAME_EXPR(Max, _Max(a, b)) \
BINARY_OP_NAME_EXPR(Min, _Min(a, b)) \
BINARY_OP_NAME_EXPR(Mod, _Mod(a, b)) \
BINARY_OP_NAME_EXPR(Fmod, _Fmod(a, b))
// NOTE that cu files are compiled with nvcc and should not refer to any onnxruntime headers
// so struct BinaryElementwisePreparation cannot be used here
#define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \
template <typename T> \
void Impl_##name( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count)
#define BINARY_OP_NAME_EXPR(name, expr) BINARY_ELEMENTWISE_IMPL_DECLARATION(name);
BINARY_OPS()
#undef BINARY_OP_NAME_EXPR
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(name) \
template <typename T, typename T1> \
void ImplT1_##name( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T1* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count)
BINARY_ELEMENTWISE_IMPL_DECLARATION_T1(Pow);
#define BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name) \
template <typename T, typename T1, typename T2> \
void ImplT2_##name( \
hipStream_t stream, \
int32_t output_rank_or_simple_broadcast, \
const TArray<int64_t>* lhs_padded_strides, \
const T1* lhs_data, \
const TArray<int64_t>* rhs_padded_strides, \
const T2* rhs_data, \
const TArray<fast_divmod>* fdm_output_strides, \
const fast_divmod& fdm_H, \
const fast_divmod& fdm_C, \
T* output_data, \
size_t count)
#define BINARY_OPS2() \
BINARY_OP_NAME_EXPR2(Greater, (a > b)) \
BINARY_OP_NAME_EXPR2(Equal, (a == b)) \
BINARY_OP_NAME_EXPR2(Less, (a < b)) \
BINARY_OP_NAME_EXPR2(GreaterOrEqual, (a >= b)) \
BINARY_OP_NAME_EXPR2(LessOrEqual, (a <= b))
#define BINARY_OP_NAME_EXPR2(name, expr) BINARY_ELEMENTWISE_IMPL_DECLARATION_T2(name);
BINARY_OPS2()
#undef BINARY_OP_NAME_EXPR2
}  // namespace rocm
}  // namespace onnxruntime
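Note (not part of the committed file): the comment at the top of this header describes an X-macro pattern — each BINARY_OP_NAME_EXPR entry in BINARY_OPS() becomes one Impl_<name> declaration. As a rough, hand-written sketch of what the preprocessor emits inside this header (where TArray, fast_divmod and hipStream_t are already visible), the Add entry expands to approximately:

// Approximate expansion of BINARY_OP_NAME_EXPR(Add, (a + b)) via
// BINARY_ELEMENTWISE_IMPL_DECLARATION(Add); formatting is illustrative only.
template <typename T>
void Impl_Add(
    hipStream_t stream,
    int32_t output_rank_or_simple_broadcast,
    const TArray<int64_t>* lhs_padded_strides,
    const T* lhs_data,
    const TArray<int64_t>* rhs_padded_strides,
    const T* rhs_data,
    const TArray<fast_divmod>* fdm_output_strides,
    const fast_divmod& fdm_H,
    const fast_divmod& fdm_C,
    T* output_data,
    size_t count);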
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/binary_elementwise_ops_impl_functors.cuh
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/math/binary_elementwise_ops_impl.h"
namespace onnxruntime {
namespace rocm {
// define the device functors that perform the computation on scalars
#define OP_FUNCTOR_DEFINITION(name, expr) \
template <class T, class T1, class T2> \
struct OP_##name { \
__device__ __inline__ T operator()(T1 a, T2 b) const { \
return (expr); \
} \
};
#define BINARY_OP_NAME_EXPR(name, expr) \
OP_FUNCTOR_DEFINITION(name, expr)
BINARY_OPS()
OP_FUNCTOR_DEFINITION(Pow, _Pow(a, b))
#undef BINARY_OP_NAME_EXPR
#define BINARY_OP_NAME_EXPR2(name, expr) \
OP_FUNCTOR_DEFINITION(name, expr)
BINARY_OPS2()
#undef BINARY_OP_NAME_EXPR2
#undef OP_FUNCTOR_DEFINITION
}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip.cc
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/math/clip.h"
#include "core/providers/rocm/math/clip_impl.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    Clip,
    kOnnxDomain,
    6, 10,
    float,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
    Clip_6<float>);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Clip,
    kOnnxDomain,
    11, 11,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
    Clip);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Clip,
    kOnnxDomain,
    12, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", BuildKernelDefConstraints<float, double, MLFloat16, int8_t, uint8_t, int64_t, uint64_t>()),
    Clip);

ONNX_OPERATOR_KERNEL_EX(
    Clip,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", BuildKernelDefConstraints<float, double, MLFloat16, int8_t, uint8_t, int64_t, uint64_t>()),
    Clip);

template <typename T>
Status Clip_6<T>::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor& X = *ctx->Input<Tensor>(0);
  const TensorShape& input_shape{X.Shape()};
  Tensor* Y = ctx->Output(0, input_shape);
  const size_t count = input_shape.Size();

  if (count > 0) {
    auto* y_data = Y->MutableData<T>();
    const auto* x_data = X.Data<T>();
    ClipImpl<T>(Stream(), x_data, y_data, nullptr, nullptr, this->min_, this->max_, count);
  }

  return Status::OK();
}

namespace clip_internal {
template <typename T>
struct LowMax {
  constexpr static T low() {
    return std::numeric_limits<T>::lowest();
  }
  constexpr static T max() {
    return std::numeric_limits<T>::max();
  }
};

template <>
struct LowMax<MLFloat16> {
  static MLFloat16 low() {
    return MLFloat16(math::floatToHalf(std::numeric_limits<float>::lowest()));
  }
  static MLFloat16 max() {
    return MLFloat16(math::floatToHalf(std::numeric_limits<float>::max()));
  }
};
}  // namespace clip_internal

template <typename T>
struct Clip::ComputeImpl {
  void operator()(hipStream_t stream, const Tensor* X, const Tensor* min, const Tensor* max, Tensor* Y) const {
    auto min_default = clip_internal::LowMax<T>::low();
    auto max_default = clip_internal::LowMax<T>::max();

    const T* min_data = nullptr;
    const T* max_data = nullptr;
    // 1-2 Input on CPU
    if (min) {
      ORT_ENFORCE(min->Shape().IsScalar(), "min should be a scalar.");
      min_data = min->Data<T>();
    }
    if (max) {
      ORT_ENFORCE(max->Shape().IsScalar(), "max should be a scalar.");
      max_data = max->Data<T>();
    }

    const size_t count = X->Shape().Size();
    if (count > 0) {
      auto* y_data = Y->MutableData<T>();
      const auto* x_data = X->Data<T>();
      ClipImpl<T>(stream, x_data, y_data, min_data, max_data, min_default, max_default, count);
    }
  }
};

Status Clip::ComputeInternal(OpKernelContext* ctx) const {
  const auto* X = ctx->Input<Tensor>(0);
  const auto* min = ctx->Input<Tensor>(1);
  const auto* max = ctx->Input<Tensor>(2);
  Tensor* Y = ctx->Output(0, X->Shape());

  utils::MLTypeCallDispatcher<float, double, MLFloat16, int8_t, uint8_t, int64_t, uint64_t>
      t_disp(X->GetElementType());
  t_disp.Invoke<ComputeImpl>(Stream(), X, min, max, Y);

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
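Aside (not part of the commit): the element-wise behaviour that Clip::ComputeImpl forwards to ClipImpl can be summarised with a small host-side reference. The helper name below is hypothetical and only illustrates the optional scalar min/max inputs and the numeric-limits defaults supplied by clip_internal::LowMax.

// Hedged CPU reference for the clamp performed by the _Clip device kernel.
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

template <typename T>
std::vector<T> ClipReference(const std::vector<T>& x, const T* min, const T* max) {
  const T lo = min ? *min : std::numeric_limits<T>::lowest();  // default when min input is absent
  const T hi = max ? *max : std::numeric_limits<T>::max();     // default when max input is absent
  std::vector<T> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] = std::min(std::max(x[i], lo), hi);  // same clamp as output[id] in the kernel
  }
  return y;
}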
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/math/clip.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
class Clip_6 final : public onnxruntime::clip_internal::Clip_6Base<T>, public RocmKernel {
 public:
  explicit Clip_6(const OpKernelInfo& info)
      : onnxruntime::clip_internal::Clip_6Base<T>(info), RocmKernel{info} {
  }

  Status ComputeInternal(OpKernelContext* context) const override;
};

// Since version 11. Min and Max are inputs
// version 12 adds type support
class Clip final : public RocmKernel {
 public:
  explicit Clip(const OpKernelInfo& info) : RocmKernel{info} {
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  template <typename T>
  struct ComputeImpl;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip_impl.cu
new file mode 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/math/clip_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {

template <typename T>
__global__ void _Clip(const T* input, T* output, const T* min, const T* max, T min_default, T max_default, size_t N) {
  auto min_val = (min) ? *min : min_default;
  auto max_val = (max) ? *max : max_default;
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  output[id] = (input[id] < min_val) ? min_val : ((input[id] > max_val) ? max_val : input[id]);
}

template <typename T>
void ClipImpl(hipStream_t stream, const T* input_data, T* output_data, const T* min, const T* max,
              T min_default, T max_default, size_t count) {
  typedef typename ToHipType<T>::MappedType HipT;
  int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
  union ConstAliasUnion {
    const T* t;
    const HipT* rocmT;
    ConstAliasUnion(const T* _t) { t = _t; }
  };
  union AliasUnion {
    T* t;
    HipT* rocmT;
    AliasUnion(T* _t) { t = _t; }
  };
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_Clip<HipT>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     ((union ConstAliasUnion)input_data).rocmT,
                     ((union AliasUnion)output_data).rocmT,
                     ((union ConstAliasUnion)min).rocmT,
                     ((union ConstAliasUnion)max).rocmT,
                     *((union AliasUnion)&min_default).rocmT,
                     *((union AliasUnion)&max_default).rocmT,
                     count);
}

template void ClipImpl<float>(hipStream_t stream, const float* input_data, float* output_data, const float* min, const float* max, float min_default, float max_default, size_t count);
template void ClipImpl<double>(hipStream_t stream, const double* input_data, double* output_data, const double* min, const double* max, double min_default, double max_default, size_t count);
template void ClipImpl<MLFloat16>(hipStream_t stream, const MLFloat16* input_data, MLFloat16* output_data, const MLFloat16* min, const MLFloat16* max, MLFloat16 min_default, MLFloat16 max_default, size_t count);
template void ClipImpl<int8_t>(hipStream_t stream, const int8_t* input_data, int8_t* output_data, const int8_t* min, const int8_t* max, int8_t min_default, int8_t max_default, size_t count);
template void ClipImpl<uint8_t>(hipStream_t stream, const uint8_t* input_data, uint8_t* output_data, const uint8_t* min, const uint8_t* max, uint8_t min_default, uint8_t max_default, size_t count);
template void ClipImpl<int64_t>(hipStream_t stream, const int64_t* input_data, int64_t* output_data, const int64_t* min, const int64_t* max, int64_t min_default, int64_t max_default, size_t count);
template void ClipImpl<uint64_t>(hipStream_t stream, const uint64_t* input_data, uint64_t* output_data, const uint64_t* min, const uint64_t* max, uint64_t min_default, uint64_t max_default, size_t count);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/clip_impl.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/math/clip.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
void ClipImpl(hipStream_t stream, const T* input_data, T* output_data, const T* min, const T* max,
              T min_default, T max_default, size_t count);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum.cc
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "cumsum.h"
#include "cumsum_impl.h"
#include "core/providers/cpu/math/cumsum.h"
#include "core/providers/common.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    CumSum,
    kOnnxDomain,
    11, 13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)  // 'axis' needs to be on CPU
        .TypeConstraint("T", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>(),
                                                     DataTypeImpl::GetTensorType<uint32_t>(),
                                                     DataTypeImpl::GetTensorType<uint64_t>(),
                                                     DataTypeImpl::GetTensorType<float>(),
                                                     DataTypeImpl::GetTensorType<double>()})
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                      DataTypeImpl::GetTensorType<int64_t>()}),
    CumSum);

ONNX_OPERATOR_KERNEL_EX(
    CumSum,
    kOnnxDomain,
    14,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)  // 'axis' needs to be on CPU
        .TypeConstraint("T", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                     DataTypeImpl::GetTensorType<int64_t>(),
                                                     DataTypeImpl::GetTensorType<uint32_t>(),
                                                     DataTypeImpl::GetTensorType<uint64_t>(),
                                                     DataTypeImpl::GetTensorType<float>(),
                                                     DataTypeImpl::GetTensorType<double>(),
                                                     DataTypeImpl::GetTensorType<MLFloat16>()})  // MLFloat16 is added in opset 14
        .TypeConstraint("T2", std::vector<MLDataType>{DataTypeImpl::GetTensorType<int32_t>(),
                                                      DataTypeImpl::GetTensorType<int64_t>()}),
    CumSum);

Status CumSum::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor* input = ctx->Input<Tensor>(0);                       // input tensor
  auto rank = static_cast<int64_t>(input->Shape().NumDimensions());  // the rank of the input/output
  if (rank == 0)
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot apply CumSum operator on a scalar");

  const Tensor* axis_tensor = ctx->Input<Tensor>(1);  // axis input tensor
  int64_t axis = 0;
  ORT_THROW_IF_ERROR(cumsum_op::GetAxis(axis_tensor, rank, axis));

  TensorShape output_shape(input->Shape());
  auto& output = *ctx->Output(0, output_shape);  // output tensor

  // output tensor's size is 0, nothing to fill - return
  if (output_shape.Size() == 0)
    return Status::OK();

  const auto& input_dims = input->Shape().GetDims();

  int64_t current_dim = rank - 1;
  int64_t input_stride_along_axis = 1;

  // axis (and by extension current_dim) can never be negative as this is validated much before
  // so no need to add the extra check to make sure current_dim is within bounds of the vector size
  while (current_dim > axis) {
    input_stride_along_axis *= input_dims[current_dim--];
  }

  fast_divmod fast_divmod_input_dim_along_axis(static_cast<int>(input_dims[axis]));
  fast_divmod fast_divmod_input_stride_along_axis(static_cast<int>(input_stride_along_axis));

  if (input->IsDataType<float>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<float>::MappedType*>(input->Data<float>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<float>::MappedType*>(output.MutableData<float>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<double>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<double>::MappedType*>(input->Data<double>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<double>::MappedType*>(output.MutableData<double>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<int32_t>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<int32_t>::MappedType*>(input->Data<int32_t>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<int32_t>::MappedType*>(output.MutableData<int32_t>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<int64_t>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<int64_t>::MappedType*>(input->Data<int64_t>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<int64_t>::MappedType*>(output.MutableData<int64_t>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<uint32_t>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<uint32_t>::MappedType*>(input->Data<uint32_t>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<uint32_t>::MappedType*>(output.MutableData<uint32_t>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<uint64_t>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<uint64_t>::MappedType*>(input->Data<uint64_t>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<uint64_t>::MappedType*>(output.MutableData<uint64_t>()),
               output_shape.Size(), exclusive_, reverse_);
  } else if (input->IsDataType<MLFloat16>()) {
    CumSumImpl(Stream(),
               reinterpret_cast<const typename ToHipType<MLFloat16>::MappedType*>(input->Data<MLFloat16>()),
               fast_divmod_input_dim_along_axis, fast_divmod_input_stride_along_axis,
               reinterpret_cast<typename ToHipType<MLFloat16>::MappedType*>(output.MutableData<MLFloat16>()),
               output_shape.Size(), exclusive_, reverse_);
  } else {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Unsupported input data type to the CumSum op: ", input->DataType());
  }

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

class CumSum final : public RocmKernel {
 public:
  explicit CumSum(const OpKernelInfo& info) : RocmKernel(info) {
    // Process exclusive attribute
    int64_t exclusive = 0;
    auto status = info.GetAttr("exclusive", &exclusive);
    if (status.IsOK()) {
      if (exclusive == 1 || exclusive == 0) {
        exclusive_ = (exclusive == 1);
      } else {
        ORT_ENFORCE("attribute exclusive can only be 0 or 1");
      }
    }

    // Process reverse attribute
    int64_t reverse = 0;
    status = info.GetAttr("reverse", &reverse);
    if (status.IsOK()) {
      if (reverse == 1 || reverse == 0) {
        reverse_ = (reverse == 1);
      } else {
        ORT_ENFORCE("attribute reverse can only be 0 or 1");
      }
    }
  }

  ~CumSum() = default;

  Status ComputeInternal(OpKernelContext* ctx) const override;

 private:
  bool exclusive_ = false;
  bool reverse_ = false;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum_impl.cu
new file mode 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/shared_inc/fast_divmod.h"
#include "cumsum_impl.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
__global__ void _CumSumKernel(const T* input_data,
                              const fast_divmod fast_divmod_input_dim_along_axis,
                              const fast_divmod fast_divmod_input_stride_along_axis,
                              T* output_data,
                              const int64_t output_size,
                              const bool exclusive,
                              const bool reverse) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(indices_index, output_size);

  int input_dim_along_axis = fast_divmod_input_dim_along_axis.d_;
  int input_stride_along_axis = fast_divmod_input_stride_along_axis.d_;

  int axis_dim = 0;
  int div = fast_divmod_input_stride_along_axis.div(static_cast<int>(indices_index));
  fast_divmod_input_dim_along_axis.divmod(div, div, axis_dim);

  int start = 0;
  int end = 0;

  if (!reverse && !exclusive) {
    start = 0;
    end = axis_dim;
  } else if (reverse && !exclusive) {
    start = axis_dim;
    end = input_dim_along_axis - 1;
  } else if (!reverse && exclusive) {
    start = 0;
    end = axis_dim - 1;
  } else {  // reverse && exclusive
    start = axis_dim + 1;
    end = input_dim_along_axis - 1;
  }

  // count the number of elements to accumulate the sum
  int count = end - start + 1;

  if (count <= 0) {
    output_data[indices_index] = 0;
    return;
  }

  // adjust start index based on the above identified start dim value along the axis of interest
  int data_index = static_cast<int>(indices_index) + (start - axis_dim) * input_stride_along_axis;

  T sum = 0;

  // keep accumulating values from the start index for 'count' times and skip appropriately
  while (count != 0) {
    sum += input_data[data_index];
    data_index += input_stride_along_axis;
    --count;
  }

  output_data[indices_index] = sum;
}

template <typename T>
void CumSumImpl(hipStream_t stream,
                const T* input_data,
                const fast_divmod& input_dim_along_axis,
                const fast_divmod& input_stride_along_axis,
                T* output_data,
                int64_t output_size,
                bool exclusive,
                bool reverse) {
  if (output_size > 0) {
    int blocksPerGrid = static_cast<int>((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock);
    hipLaunchKernelGGL(HIP_KERNEL_NAME(_CumSumKernel<T>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                       input_data, input_dim_along_axis, input_stride_along_axis,
                       output_data, output_size, exclusive, reverse);
  }
}

template void CumSumImpl<int32_t>(hipStream_t stream, const int32_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, int32_t* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<int64_t>(hipStream_t stream, const int64_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, int64_t* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<uint32_t>(hipStream_t stream, const uint32_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, uint32_t* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<uint64_t>(hipStream_t stream, const uint64_t* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, uint64_t* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<float>(hipStream_t stream, const float* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, float* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<double>(hipStream_t stream, const double* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, double* output_data, int64_t output_size, bool exclusive, bool reverse);
template void CumSumImpl<half>(hipStream_t stream, const half* input_data, const fast_divmod& input_dim_along_axis, const fast_divmod& input_stride_along_axis, half* output_data, int64_t output_size, bool exclusive, bool reverse);

}  // namespace rocm
}  // namespace onnxruntime
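Aside (not part of the commit): the start/end selection in _CumSumKernel is easiest to read against a 1-D host reference. The function below is hypothetical and mirrors the four exclusive/reverse cases handled by the kernel above.

// Hedged 1-D CPU reference: for output position i, sum input[start..end],
// where start/end follow the same case analysis as the device kernel.
#include <cstdint>
#include <vector>

std::vector<int64_t> CumSum1D(const std::vector<int64_t>& x, bool exclusive, bool reverse) {
  const int n = static_cast<int>(x.size());
  std::vector<int64_t> y(n, 0);
  for (int i = 0; i < n; ++i) {
    int start, end;
    if (!reverse && !exclusive) {        // inclusive prefix sum
      start = 0; end = i;
    } else if (reverse && !exclusive) {  // inclusive suffix sum
      start = i; end = n - 1;
    } else if (!reverse && exclusive) {  // exclusive prefix sum
      start = 0; end = i - 1;
    } else {                             // exclusive suffix sum
      start = i + 1; end = n - 1;
    }
    for (int j = start; j <= end; ++j) y[i] += x[j];  // empty window leaves y[i] == 0
  }
  return y;
}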
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/cumsum_impl.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
void CumSumImpl(hipStream_t stream,
                const T* input_data,
                const fast_divmod& input_dim_along_axis,
                const fast_divmod& input_stride_along_axis,
                T* output_data,
                int64_t output_size,
                bool exclusive,
                bool reverse);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/gemm.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
class Gemm final : public RocmKernel {
  using Base = RocmKernel;

 public:
  Gemm(const OpKernelInfo& info) : RocmKernel(info) {
    int64_t temp;
    ORT_ENFORCE(info.GetAttr<int64_t>("transA", &temp).IsOK());
    trans_A_ = (temp != 0);

    ORT_ENFORCE(info.GetAttr<int64_t>("transB", &temp).IsOK());
    trans_B_ = (temp != 0);

    ORT_ENFORCE(info.GetAttr<float>("alpha", &alpha_).IsOK());
    ORT_ENFORCE(info.GetAttr<float>("beta", &beta_).IsOK());
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  bool trans_A_;
  bool trans_B_;
  float alpha_;
  float beta_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <typename T>
class MatMul final : public RocmKernel {
  using Base = RocmKernel;

 public:
  MatMul(const OpKernelInfo& info)
      : RocmKernel(info),
        alpha_{info.GetAttrOrDefault<float>("alpha", 1.0f)},
        trans_A_{info.GetAttrOrDefault<int64_t>("transA", 0) != 0},
        trans_B_{info.GetAttrOrDefault<int64_t>("transB", 0) != 0},
        trans_batch_a_{info.GetAttrOrDefault<int64_t>("transBatchA", 0) != 0},
        trans_batch_b_{info.GetAttrOrDefault<int64_t>("transBatchB", 0) != 0} {}

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  const float alpha_;
  const bool trans_A_;
  const bool trans_B_;
  const bool trans_batch_a_;
  const bool trans_batch_b_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cc
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "matmul_integer.h"
#include "matmul_integer.cuh"
#include "core/providers/cpu/math/matmul_helper.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"
#include "core/providers/rocm/shared_inc/integer_gemm.h"
#include "core/providers/rocm/rocm_allocator.h"
#include "core/providers/common.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_TYPED_KERNEL_EX(
    MatMulInteger,
    kOnnxDomain,
    10,
    int8_t,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 2)
        .InputMemoryType(OrtMemTypeCPUInput, 3)
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int8_t>())
        .TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>())
        .TypeConstraint("T3", DataTypeImpl::GetTensorType<int32_t>()),
    MatMulInteger<int8_t, int8_t>);

template <>
Status MatMulInteger<int8_t, int8_t>::ComputeInternal(OpKernelContext* ctx) const {
  auto a = ctx->Input<Tensor>(0);
  auto b = ctx->Input<Tensor>(1);
  ORT_ENFORCE(a != nullptr && b != nullptr);

  MatMulComputeHelper helper;
  ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
  Tensor* Y = ctx->Output(0, helper.OutputShape());

  // Bail out early if the output is going to be empty
  if (Y->Shape().Size() == 0)
    return Status::OK();

  const int8_t* a_ptr = a->Data<int8_t>();
  const int8_t* b_ptr = b->Data<int8_t>();
  int32_t* output_ptr = Y->MutableData<int32_t>();

  // validate zero points
  int8_t a_offset = 0;
  int8_t b_offset = 0;
  if (has_a_zero_point_) {
    auto a_zero_point = ctx->Input<Tensor>(2);
    ORT_ENFORCE(IsScalarOr1ElementVector(a_zero_point),
                "MatmulInteger : input1 zero point must be a scalar or 1D tensor of size 1");
    a_offset = *(a_zero_point->Data<int8_t>());
  }
  if (has_b_zero_point_) {
    auto b_zero_point = ctx->Input<Tensor>(3);
    ORT_ENFORCE(IsScalarOr1ElementVector(b_zero_point),
                "MatmulInteger : input2 zero point must be a scalar or 1D tensor of size 1");
    b_offset = *(b_zero_point->Data<int8_t>());
  }

  // offset output c[i,j] to
  //  k*a_offset*b_offset -
  //  b_offset * (a[i,0] + a[i,1] ...+a[i,k]) -
  //  a_offset * (b[0,j] + b[1,j] ... + b[k,j])
  // ReduceRowSumOnMatrixA computes the b_offset * (a[i,0] + a[i,1] ...+a[i,k]) part
  // ReduceColSumOnMatrixB computes the a_offset * (b[0,j] + b[1,j] ... + b[k,j]) part
  // OffsetOutput computes the final result
  IAllocatorUniquePtr<int32_t> a_row_buf;
  if (b_offset != 0) {
    a_row_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.N());
    ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(Stream(), a_ptr, a_row_buf.get(), b_offset, helper));
  }

  IAllocatorUniquePtr<int32_t> b_col_buf;
  if (a_offset != 0) {
    b_col_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.M());
    ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(Stream(), b_ptr, b_col_buf.get(), a_offset, helper));
  }

  int alpha = 1;
  int beta = 0;
  if (a_offset != 0 || b_offset != 0) {
    ORT_RETURN_IF_ERROR(OffsetOutput(Stream(),
                                     a_row_buf.get(),
                                     b_col_buf.get(),
                                     output_ptr,
                                     a_offset,
                                     b_offset,
                                     helper));
    beta = 1;
  }

  for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
    ORT_RETURN_IF_ERROR(GemmInt8(static_cast<int>(helper.M()),
                                 static_cast<int>(helper.N()),
                                 static_cast<int>(helper.K()),
                                 alpha, beta,
                                 a_ptr + helper.LeftOffsets()[batch], static_cast<int>(helper.K()),
                                 b_ptr + helper.RightOffsets()[batch], static_cast<int>(helper.N()),
                                 output_ptr + helper.OutputOffsets()[batch], static_cast<int>(helper.N()),
                                 this));
  }

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
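Aside (not part of the commit): the zero-point correction described in the comment block of ComputeInternal follows from expanding (a - a_offset)(b - b_offset). The hypothetical scalar reference below shows the identity that ReduceRowSumOnMatrixA, ReduceColSumOnMatrixB, OffsetOutput and the beta = 1 GemmInt8 call implement together for one output element.

// Hedged scalar reference: sum_k (a[k]-a_offset)*(b[k]-b_offset)
//   = sum_k a*b  -  b_offset*rowsum(a)  -  a_offset*colsum(b)  +  K*a_offset*b_offset
#include <cstdint>
#include <vector>

int32_t QuantizedDot(const std::vector<int8_t>& a_row, const std::vector<int8_t>& b_col,
                     int8_t a_offset, int8_t b_offset) {
  const int32_t K = static_cast<int32_t>(a_row.size());
  int32_t raw = 0, row_sum = 0, col_sum = 0;
  for (int32_t k = 0; k < K; ++k) {
    raw += static_cast<int32_t>(a_row[k]) * b_col[k];  // what the int8 GEMM produces
    row_sum += a_row[k];
    col_sum += b_col[k];
  }
  // the remaining three terms are what OffsetOutput precomputes before the GEMM adds into them
  return raw - b_offset * row_sum - a_offset * col_sum + K * a_offset * b_offset;
}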
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cu
new file mode 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "matmul_integer.cuh"
#include <hipcub/hipcub.hpp>
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {

template <int TPB>
__global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_sum, const int8_t offset, int32_t K) {
  int32_t thread_data = 0;
  const int8_t* row_ptr = matrix + blockIdx.x * K;
  for (int i = threadIdx.x; i < K; i += TPB) {
    thread_data += *(row_ptr + i);
  }

  using BlockReduce = hipcub::BlockReduce<int32_t, TPB>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int32_t sum = BlockReduce(temp_storage).Sum(thread_data);

  if (threadIdx.x == 0) {
    row_sum[blockIdx.x] = offset * sum;
  }
}

Status ReduceRowSumOnMatrixA(hipStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
  for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
    ReduceRowSumOnMatrixAKernel<static_cast<int>(GridDim::maxThreadsPerBlock)>
        <<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
            matrix + helper.LeftOffsets()[batch],
            row_sum + batch * helper.M(),
            offset,
            static_cast<int>(helper.K()));
  }

  return HIP_CALL(hipGetLastError());
}

template <int TPB>
__global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_sum, const int8_t offset, int32_t row, int32_t col) {
  int32_t thread_data = 0;
  const int8_t* col_ptr = matrix + blockIdx.x;
  for (int i = threadIdx.x; i < row; i += TPB) {
    thread_data += *(col_ptr + i * col);
  }

  using BlockReduce = hipcub::BlockReduce<int32_t, TPB>;
  __shared__ typename BlockReduce::TempStorage temp_storage;
  int32_t sum = BlockReduce(temp_storage).Sum(thread_data);

  if (threadIdx.x == 0) {
    col_sum[blockIdx.x] = offset * sum;
  }
}

Status ReduceColSumOnMatrixB(hipStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
  for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
    ReduceColSumOnMatrixBKernel<static_cast<int>(GridDim::maxThreadsPerBlock)>
        <<<static_cast<int>(helper.N()), GridDim::maxThreadsPerBlock, 0, stream>>>(
            matrix + helper.RightOffsets()[batch],
            col_sum + batch * helper.N(),
            offset,
            static_cast<int32_t>(helper.K()),
            static_cast<int32_t>(helper.N()));
  }

  return HIP_CALL(hipGetLastError());
}

__global__ void ComputeOffsetOfMatrixAB(const int32_t* row_sum, const int32_t* col_sum, int32_t* output, int32_t K_A_B, int32_t N) {
  for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
    *(output + blockIdx.x * N + i) = K_A_B - row_sum[blockIdx.x] - col_sum[i];
  }
}

__global__ void ComputeOffsetOfMatrixA(const int32_t* col_sum, int32_t* output, int32_t N) {
  for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
    *(output + blockIdx.x * N + i) = -col_sum[i];
  }
}

__global__ void ComputeOffsetOfMatrixB(const int32_t* row_sum, int32_t* output, int32_t N) {
  for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
    *(output + blockIdx.x * N + i) = -row_sum[blockIdx.x];
  }
}

Status OffsetOutput(hipStream_t stream,
                    const int32_t* row_sum,
                    const int32_t* col_sum,
                    int32_t* output,
                    const int8_t a_offset,
                    const int8_t b_offset,
                    const MatMulComputeHelper& helper) {
  if (a_offset && b_offset) {
    for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
      ComputeOffsetOfMatrixAB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
          row_sum + batch * helper.M(),
          col_sum + batch * helper.N(),
          output + helper.OutputOffsets()[batch],
          static_cast<int32_t>(helper.K()) * a_offset * b_offset,
          static_cast<int32_t>(helper.N()));
    }
  } else if (a_offset) {
    for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
      ComputeOffsetOfMatrixA<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
          col_sum + batch * helper.N(),
          output + helper.OutputOffsets()[batch],
          static_cast<int32_t>(helper.N()));
    }
  } else if (b_offset) {
    for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
      ComputeOffsetOfMatrixB<<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0, stream>>>(
          row_sum + batch * helper.M(),
          output + helper.OutputOffsets()[batch],
          static_cast<int32_t>(helper.N()));
    }
  }

  return HIP_CALL(hipGetLastError());
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.cuh
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "matmul_integer.h"
#include "core/providers/cpu/math/matmul_helper.h"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"
namespace onnxruntime {
namespace rocm {

Status ReduceRowSumOnMatrixA(hipStream_t stream, const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper);

Status ReduceColSumOnMatrixB(hipStream_t stream, const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper);

Status OffsetOutput(hipStream_t stream,
                    const int32_t* row_sum,
                    const int32_t* col_sum,
                    int32_t* output,
                    const int8_t a_offset,
                    const int8_t b_offset,
                    const MatMulComputeHelper& helper);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/matmul_integer.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <typename T1, typename T2>
class MatMulInteger final : public RocmKernel {
  using Base = RocmKernel;

 public:
  MatMulInteger(const OpKernelInfo& info) : RocmKernel(info) {
    has_a_zero_point_ = false;
    has_b_zero_point_ = false;
    if (info.GetInputCount() > 2) {
      has_a_zero_point_ = true;
    }
    if (info.GetInputCount() > 3) {
      has_b_zero_point_ = true;
    }
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  bool has_a_zero_point_;
  bool has_b_zero_point_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/gsl.h"
#include "core/providers/rocm/rocm_kernel.h"
namespace onnxruntime {
namespace rocm {

template <typename T, bool is_log_softmax>
Status SoftMaxComputeHelper(
    hipStream_t stream,
    const T* input,
    const TensorShape& shape,
    T* Y,
    int64_t axis);

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_warpwise_softmax_forward(hipStream_t stream, output_t* dst, const input_t* src,
                                       int softmax_elements, int softmax_elements_stride, int batch_count);

template <typename input_t, typename output_t, typename acc_t, bool is_log_softmax>
void dispatch_blockwise_softmax_forward(hipStream_t stream, output_t* output, const input_t* input,
                                        int softmax_elements, int input_stride, int output_stride, int batch_count);

template <typename T>
class Softmax final : public RocmKernel {
 public:
  Softmax(const OpKernelInfo& info) : RocmKernel{info} {
    const auto& node = info.node();
    opset_ = node.SinceVersion();

    int64_t axis;
    Status status = info.GetAttr<int64_t>("axis", &axis);

    if (status.IsOK()) {
      axis_ = gsl::narrow_cast<int>(axis);
    } else {
      if (opset_ < 13) {
        axis_ = 1;  // opset-12 and below, the default axis value is 1
      } else {
        axis_ = -1;  // opset-13, the default axis value is -1
      }
    }

    log_softmax_ = info.GetKernelDef().OpName() == "LogSoftmax";

    // We need to cast away the const as PerThreadRocblasHandle() is currently a non-const method
    // TODO: Clean up the ROCMExecutionProvider interface to avoid this
    rocm_ep_ = const_cast<ROCMExecutionProvider*>(
        static_cast<const ROCMExecutionProvider*>(info.GetExecutionProvider()));
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  int64_t axis_;
  bool log_softmax_;
  int opset_;

  // We need to access to the ROCM EP instance to get the rocblas handle to use
  // for transposing(if applicable)
  ROCMExecutionProvider* rocm_ep_;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax_blockwise_impl.cuh
new file mode 100644
#include "hip/hip_runtime.h"
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// The code below is mostly copied from Pytorch SoftMax.cuh
#pragma once
#include "core/providers/rocm/cu_inc/common.cuh"
namespace onnxruntime {
namespace rocm {

constexpr int ALIGN_BYTES = 16;
const int max_threads = 1024;

dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) {
  uint64_t block_size = 1;
  uint64_t max_block_size = std::min(dim_size / ILP, static_cast<uint64_t>(max_threads));

  // In the vectorized case we want to trade off allowing more of the buffers to be accessed
  // in a vectorized way against wanting a larger block size to get better utilisation.
  // In general with ILP you can have (ILP-1)/ILP of the buffer accessed vectorised, at the risk
  // of having a very small block size. We choose to keep >= 1/2 of the buffer vectorised while
  // allowing a larger block size.
  if (ILP > 1) {
    max_block_size /= 2;
  }

  while (block_size < (max_block_size)) block_size *= 2;
  // Launch at least a single warp - the kernel assumes that.
  block_size = std::max(block_size, static_cast<uint64_t>(GPU_WARP_SIZE_HOST));
  return dim3(static_cast<unsigned int>(block_size));
}

////////////////////////////////////////////////////////////////////////////////
// Regular kernel (fast when dim_size is large; requires inner_size == 1)
////////////////////////////////////////////////////////////////////////////////

template <typename T, typename AccumT>
struct MaxFloat {
  __device__ __forceinline__ AccumT operator()(AccumT max, T v) const {
    return ::max(max, (AccumT)v);
  }
};

template <typename T, typename AccumT>
struct AddFloat {
  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
    return sum + (AccumT)v;
  }
};

template <typename T, typename AccumT>
struct SumExpFloat {
  __device__ __forceinline__ SumExpFloat(AccumT v)
      : max_k(v) {}

  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
    return sum + expf((AccumT)v - max_k);
  }

  const AccumT max_k;
};

template <template <typename> class Reduction, typename AccumT>
__device__ __forceinline__ AccumT
blockReduce(AccumT* smem, AccumT val,
            const Reduction<AccumT>& r,
            AccumT defaultVal) {
  // To avoid RaW races from chaining blockReduce calls together, we need a sync here
  __syncthreads();

  smem[threadIdx.x] = val;

  __syncthreads();

  AccumT warpVal = defaultVal;

  // First warp will perform per-warp reductions for the remaining warps
  uint32_t mask = (((uint64_t)1) << (blockDim.x / GPU_WARP_SIZE)) - 1;
  if (threadIdx.x < GPU_WARP_SIZE) {
    int lane = threadIdx.x % GPU_WARP_SIZE;
    if (lane < blockDim.x / GPU_WARP_SIZE) {
#pragma unroll
      for (int i = 0; i < GPU_WARP_SIZE; ++i) {
        warpVal = r(warpVal, smem[lane * GPU_WARP_SIZE + i]);
      }
#if !defined(USE_ROCM)
      __syncwarp(mask);
#endif
      smem[lane] = warpVal;
    }
  }

  __syncthreads();

  // First thread will perform a reduction of the above per-warp reductions
  AccumT blockVal = defaultVal;

  if (threadIdx.x == 0) {
    for (int i = 0; i < blockDim.x / GPU_WARP_SIZE; ++i) {
      blockVal = r(blockVal, smem[i]);
    }
    smem[0] = blockVal;
  }

  // Sync and broadcast
  __syncthreads();
  return smem[0];
}

template <template <typename, typename> class Reduction, int ILP, typename T, typename AccumT>
__device__ __forceinline__ AccumT
ilpReduce(int shift,
          T* data,
          int size,
          const Reduction<T, AccumT>& r,
          AccumT defaultVal) {
  using LoadT = aligned_vector<T, ILP>;
  AccumT threadVal = defaultVal;
  int offset = threadIdx.x;

  // shift and do 1
  if (shift > 0) {
    data -= shift;
    size += shift;
    if (threadIdx.x >= shift) {
      threadVal = r(threadVal, data[offset]);
    }
    size -= blockDim.x;
    data += blockDim.x;
  }
  int last = size % (ILP * blockDim.x);

  T v[ILP];
  LoadT* value = reinterpret_cast<LoadT*>(&v);

  for (; offset * ILP < (size - last); offset += blockDim.x) {
    *value = reinterpret_cast<LoadT*>(data)[offset];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      threadVal = r(threadVal, v[j]);
    }
  }

  offset = size - last + threadIdx.x;
  // Epilogue
  for (; offset < size; offset += blockDim.x)
    threadVal = r(threadVal, data[offset]);

  return threadVal;
}

/**
 * This will apply the Epilogue with vectorized reads & writes when input & output have the same shift
 */
template <int ILP, typename scalar_t, typename accum_t, typename outscalar_t,
          template <typename, typename, typename> class Epilogue>
__device__ __forceinline__ void
WriteFpropResultsVectorized(int size,
                            const int shift,
                            scalar_t* input,
                            outscalar_t* output,
                            Epilogue<scalar_t, accum_t, outscalar_t> epilogue) {
  using LoadT = aligned_vector<scalar_t, ILP>;
  using StoreT = aligned_vector<outscalar_t, ILP>;

  int offset = threadIdx.x;

  // if unaligned, do one value / thread and move on, guaranteeing aligned reads/writes later
  if (shift > 0) {
    input -= shift;
    output -= shift;
    size += shift;

    if (threadIdx.x >= shift) {
      output[offset] = epilogue(input[offset]);
    }
    size -= blockDim.x;
    input += blockDim.x;
    output += blockDim.x;
  }

  const int last = size % (ILP * blockDim.x);

  scalar_t in_v[ILP];
  LoadT* in_value = reinterpret_cast<LoadT*>(&in_v);

  outscalar_t out_v[ILP];
  StoreT* out_value = reinterpret_cast<StoreT*>(&out_v);

  for (; offset * ILP < (size - last); offset += blockDim.x) {
    *in_value = reinterpret_cast<LoadT*>(input)[offset];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      out_v[j] = epilogue(in_v[j]);
    }

    reinterpret_cast<StoreT*>(output)[offset] = *out_value;
  }

  offset = size - last + threadIdx.x;
  // handle the tail
  for (; offset < size; offset += blockDim.x) {
    output[offset] = epilogue(input[offset]);
  }
}

/**
 * This will apply the Epilogue with non-vectrorized reads & writes for the general case
 */
template <int ILP, typename scalar_t, typename accum_t, typename outscalar_t,
          template <typename, typename, typename> class Epilogue>
__device__ __forceinline__ void
WriteFpropResults(int classes,
                  scalar_t* input,
                  outscalar_t* output,
                  Epilogue<scalar_t, accum_t, outscalar_t> epilogue) {
  int offset = threadIdx.x;

  int last = classes % (ILP * blockDim.x);

  // Main bulk of loop with ILP
  for (; offset < classes - last; offset += blockDim.x * ILP) {
    scalar_t tmp[ILP];

#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      tmp[j] = input[offset + j * blockDim.x];
    }
#pragma unroll
    for (int j = 0; j < ILP; ++j) {
      output[offset + j * blockDim.x] = epilogue(tmp[j]);
    }
  }

  // Remainder - no ILP
  for (; offset < classes; offset += blockDim.x) {
    output[offset] = epilogue(input[offset]);
  }
}

template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
          template <typename, typename, typename> class Epilogue>
__global__ void softmax_block_forward(outscalar_t* output, scalar_t* input, int classes,
                                      int input_stride, int output_stride) {
  extern __shared__ unsigned char smem[];
  auto sdata = reinterpret_cast<accscalar_t*>(smem);

  using LoadT = aligned_vector<scalar_t, ILP>;
  using StoreT = aligned_vector<outscalar_t, ILP>;

  // forward pointers to batch[blockIdx.x]
  // each block handles a sample in the mini-batch
  input += blockIdx.x * input_stride;
  output += blockIdx.x * output_stride;

  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
  const int output_shift = ((uint64_t)output) % ALIGN_BYTES / sizeof(outscalar_t);

  // find the max
  accscalar_t threadMax = ilpReduce<MaxFloat, ILP, scalar_t, accscalar_t>(
      shift, input, classes, MaxFloat<scalar_t, accscalar_t>(), -std::numeric_limits<accscalar_t>::max());
  accscalar_t max_k = blockReduce<Max, accscalar_t>(
      sdata, threadMax, Max<accscalar_t>(), -std::numeric_limits<accscalar_t>::max());

  // reduce all values
  accscalar_t threadExp = ilpReduce<SumExpFloat, ILP, scalar_t, accscalar_t>(
      shift, input, classes, SumExpFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
  accscalar_t sumAll = blockReduce<Add, accscalar_t>(
      sdata, threadExp, Add<accscalar_t>(), static_cast<accscalar_t>(0));

  Epilogue<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);

  if (shift == output_shift) {
    WriteFpropResultsVectorized<ILP, scalar_t, accscalar_t, outscalar_t, Epilogue>(classes, shift, input, output, epilogue);
  } else {
    WriteFpropResults<ILP, scalar_t, accscalar_t, outscalar_t, Epilogue>(classes, input, output, epilogue);
  }
}

template <typename T, typename AccumT, typename OutT>
struct LogSoftMaxForwardEpilogue {
  __device__ __forceinline__ LogSoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
      : max_input(max_input), logsum(logf(sum)) {}

  __device__ __forceinline__ OutT operator()(T input) const {
    return static_cast<OutT>((AccumT)input - max_input - logsum);
  }

  const AccumT max_input;
  const AccumT logsum;
};

template <typename T, typename AccumT, typename OutT>
struct SoftMaxForwardEpilogue {
  __device__ __forceinline__ SoftMaxForwardEpilogue(AccumT max_input, AccumT sum)
      : max_input(max_input), sum(sum) {}

  __device__ __forceinline__ OutT operator()(T input) const {
    return static_cast<OutT>(expf((AccumT)input - max_input) / sum);
  }

  const AccumT max_input;
  const AccumT sum;
};

}
}
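Aside (not part of the commit): per sample, the blockwise kernel computes a max-subtracted exp-sum and then applies one of the two epilogues above. The hypothetical single-row host reference below summarises that numerically stable formulation.

// Hedged single-row CPU reference for the blockwise softmax / log-softmax forward pass.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

std::vector<float> SoftmaxRowReference(const std::vector<float>& x, bool is_log_softmax) {
  float max_k = -std::numeric_limits<float>::max();
  for (float v : x) max_k = std::max(max_k, v);  // MaxFloat + blockReduce<Max>
  float sum = 0.f;
  for (float v : x) sum += std::exp(v - max_k);  // SumExpFloat + blockReduce<Add>
  std::vector<float> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    y[i] = is_log_softmax ? (x[i] - max_k - std::log(sum))  // LogSoftMaxForwardEpilogue
                          : std::exp(x[i] - max_k) / sum;   // SoftMaxForwardEpilogue
  }
  return y;
}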
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/softmax_common.h
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/status.h"
#include "core/providers/rocm/miopen_common.h"
namespace onnxruntime {
namespace rocm {

Status SoftmaxForward(miopenHandle_t miopen_handle, const void* alpha, const miopenTensorDescriptor_t input_tensor,
                      const void* input_data, const void* beta, const miopenTensorDescriptor_t output_tensor,
                      void* output_data);

Status SoftmaxBackward(miopenHandle_t miopen_handle, bool is_log_softmax, const void* alpha,
                       const miopenTensorDescriptor_t input_tensor, const void* output_data,
                       const void* output_grad_data, const void* beta,
                       const miopenTensorDescriptor_t output_tensor, void* input_grad_data);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/math/topk.cc
new file mode 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "topk.h"
#include "topk_impl.h"
namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    TopK,
    kOnnxDomain,
    1, 9,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    TopK<false>);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    TopK,
    kOnnxDomain,
    10, 10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),
    TopK<true>);

ONNX_OPERATOR_KERNEL_EX(
    TopK,
    kOnnxDomain,
    11,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()),
    TopK<true>);

template <bool inputk>
TopK<inputk>::TopK(const OpKernelInfo& info) : RocmKernel(info) {
  info.GetAttrOrDefault<int64_t>("axis", &axis_, -1);
  info.GetAttrOrDefault<int64_t>("largest", &largest_, 1);
  info.GetAttrOrDefault<int64_t>("sorted", &sorted_, 1);
  if (!inputk) {
    info.GetAttrOrDefault<int64_t>("k", &K_, 0);
  }
}

#define IS_PRIM_TYPE(T) utils::IsPrimitiveDataType<T>(prim_type)
#define TOPKIMPL(T) TopKImpl<T>(this, stream, tensor_X->Data<T>(),                 \
                                static_cast<T*>(tensor_V->MutableDataRaw()),       \
                                static_cast<int64_t*>(tensor_I->MutableDataRaw()), \
                                elem_nums_rocm,                                    \
                                elem_nums.size(),                                  \
                                axis, K_, largest_, sorted_, N, dimension)

template <bool inputk>
Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
  auto tensor_X = ctx->Input<Tensor>(0);
  ORT_ENFORCE(nullptr != tensor_X);
  int32_t rank = static_cast<int32_t>(tensor_X->Shape().NumDimensions());
  int32_t axis = static_cast<int32_t>(axis_ < 0 ? rank + axis_ : axis_);
  ORT_ENFORCE(axis > -1 && axis < rank);

  if (inputk) {
    auto tensor_K = ctx->Input<Tensor>(1);
    ORT_ENFORCE(nullptr != tensor_K);
    K_ = *tensor_K->Data<int64_t>();
    ORT_ENFORCE(K_ >= 0 && K_ <= tensor_X->Shape().GetDims()[axis]);
  }

  auto output_shape = tensor_X->Shape();
  output_shape[axis] = K_;
  auto tensor_V = ctx->Output(0, output_shape);
  auto tensor_I = ctx->Output(1, output_shape);

  if (0 == K_) {
    return Status::OK();
  }

  auto elem_nums = tensor_X->Shape().AsShapeVector();
  auto dimension = elem_nums[axis];
  for (auto i = static_cast<int64_t>(elem_nums.size()) - 2; i >= 0; --i) {
    elem_nums[i] *= elem_nums[i + 1];
  }

  auto N = elem_nums[0] / dimension;
  TArray<int64_t> elem_nums_rocm(elem_nums);

  auto prim_type = tensor_X->DataType()->AsPrimitiveDataType();
  if (prim_type == nullptr) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
  }

  hipStream_t stream = this->Stream();
  if (IS_PRIM_TYPE(uint8_t)) return TOPKIMPL(uint8_t);
  if (IS_PRIM_TYPE(uint16_t)) return TOPKIMPL(uint16_t);
  if (IS_PRIM_TYPE(uint32_t)) return TOPKIMPL(uint32_t);
  if (IS_PRIM_TYPE(uint64_t)) return TOPKIMPL(uint64_t);
  if (IS_PRIM_TYPE(int8_t)) return TOPKIMPL(int8_t);
  if (IS_PRIM_TYPE(int16_t)) return TOPKIMPL(int16_t);
  if (IS_PRIM_TYPE(int32_t)) return TOPKIMPL(int32_t);
  if (IS_PRIM_TYPE(int64_t)) return TOPKIMPL(int64_t);
  if (IS_PRIM_TYPE(MLFloat16)) return TOPKIMPL(MLFloat16);
  if (IS_PRIM_TYPE(float)) return TOPKIMPL(float);
  if (IS_PRIM_TYPE(double)) return TOPKIMPL(double);

  return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator");
}

}  // namespace rocm
}  // namespace onnxruntime