gaoqiong / onnxruntime_v14 / Commits

Commit 1a91fcc2, authored Jul 25, 2023 by gaoqiong

add files required for dtk
parent a144865d
Pipeline #492 failed with stages in 0 seconds
Changes: 280 · Pipelines: 1
Showing 20 changed files with 1909 additions and 0 deletions (+1909, -0)
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_impl.h        +25   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_nd.cc         +196  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_nd.h          +46   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu    +121  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.h     +44   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/identity_op.cc       +64   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/identity_op.h        +90   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/nonzero_impl.cu      +147  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/nonzero_impl.h       +30   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/nonzero_op.cc        +100  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/nonzero_op.h         +20   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/onehot.cc            +91   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/onehot.cu            +137  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/onehot.h             +52   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/pad.cc               +234  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/pad.h                +22   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/pad_impl.cu          +236  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/pad_impl.h           +43   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.cc   +107  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.cu   +104  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_impl.h  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

void GatherImpl(
    hipStream_t stream,
    const int64_t input_block_size,
    const int64_t indices_max,
    const fast_divmod& output_block_size,
    const fast_divmod& block_size,
    const void* indices_data,
    size_t index_element_size,
    const void* input_data,
    size_t element_size,
    void* output_data,
    const size_t N);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_nd.cc  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/gather_nd.h"
#include "core/providers/rocm/tensor/gather_nd_impl.h"
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

Status CheckBatchDimensionsMatch(
    size_t num_batch_dimensions,
    const std::vector<std::reference_wrapper<TensorShape>>& tensor_shapes) {
  for (size_t tensor_shape_idx = 0; tensor_shape_idx < tensor_shapes.size(); ++tensor_shape_idx) {
    const TensorShape& tensor_shape = tensor_shapes[tensor_shape_idx];
    ORT_RETURN_IF_NOT(
        num_batch_dimensions <= tensor_shape.NumDimensions(),
        "Number of batch dimensions exceeds tensor rank. ",
        "Batch dimension count: ", num_batch_dimensions,
        ", tensor rank: ", tensor_shape.NumDimensions(),
        ", tensor index: ", tensor_shape_idx);
  }

  if (tensor_shapes.empty()) return Status::OK();

  const TensorShape& first_tensor_shape = tensor_shapes.front();
  for (size_t batch_dimension_idx = 0; batch_dimension_idx < num_batch_dimensions; ++batch_dimension_idx) {
    for (size_t tensor_shape_idx = 1; tensor_shape_idx < tensor_shapes.size(); ++tensor_shape_idx) {
      const TensorShape& other_tensor_shape = tensor_shapes[tensor_shape_idx];
      ORT_RETURN_IF_NOT(
          first_tensor_shape[batch_dimension_idx] == other_tensor_shape[batch_dimension_idx],
          "Batch dimensions differ at index ", batch_dimension_idx, ": ",
          first_tensor_shape[batch_dimension_idx], " != ", other_tensor_shape[batch_dimension_idx],
          ", tensor indices: 0, ", tensor_shape_idx);
    }
  }

  return Status::OK();
}

template <typename TIndex>
Status GatherNDBase::PrepareCompute(
    hipStream_t stream,
    const int64_t batch_dims,
    const TensorShape& input_shape,
    const TensorShape& indices_shape,
    const Tensor* indices_tensor,
    int64_t& num_slices,
    int64_t& slice_size,
    IAllocatorUniquePtr<int64_t>& input_slice_offsets_buffer) const {
  const auto num_slice_dims = indices_shape[indices_shape.NumDimensions() - 1];
  num_slices = indices_shape.SizeToDimension(indices_shape.NumDimensions() - 1);
  slice_size = input_shape.SizeFromDimension(batch_dims + num_slice_dims);
  const auto num_batches = input_shape.SizeToDimension(batch_dims);
  const auto input_batch_stride = input_shape.SizeFromDimension(batch_dims);
  const auto num_slices_per_batch = num_slices / num_batches;

  const TIndex* const indices_data = indices_tensor->Data<TIndex>();

  std::vector<int64_t> sizes_from_slice_dims(num_slice_dims);
  {
    auto running_product = slice_size;
    for (int64_t i = 0; i < num_slice_dims; ++i) {
      sizes_from_slice_dims[num_slice_dims - 1 - i] = running_product;
      running_product *= input_shape[batch_dims + num_slice_dims - 1 - i];
    }
  }

  auto sizes_from_slice_dims_buffer = GetScratchBuffer<int64_t>(sizes_from_slice_dims.size());
  HIP_RETURN_IF_ERROR(hipMemcpyAsync(
      sizes_from_slice_dims_buffer.get(),
      sizes_from_slice_dims.data(),
      sizes_from_slice_dims.size() * sizeof(int64_t),
      hipMemcpyHostToDevice, stream));

  input_slice_offsets_buffer = GetScratchBuffer<int64_t>(num_slices);

  TArray<int64_t> input_dims(input_shape.GetDims());
  ComputeSliceOffsetsImpl(
      stream,
      batch_dims,
      input_dims,
      num_slices,
      num_slices_per_batch,
      input_batch_stride,
      num_slice_dims,
      sizes_from_slice_dims_buffer.get(),
      indices_data,
      input_slice_offsets_buffer.get());

  return Status::OK();
}

#define REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(TIndex, startver, endver)    \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                     \
      GatherND,                                                                \
      kOnnxDomain,                                                             \
      startver,                                                                \
      endver,                                                                  \
      TIndex,                                                                  \
      kRocmExecutionProvider,                                                  \
      (*KernelDefBuilder::Create())                                            \
          .TypeConstraint("T",                                                 \
                          std::vector<MLDataType>{                             \
                              DataTypeImpl::GetTensorType<float>(),            \
                              DataTypeImpl::GetTensorType<double>(),           \
                              DataTypeImpl::GetTensorType<MLFloat16>(),        \
                              DataTypeImpl::GetTensorType<int64_t>(),          \
                              DataTypeImpl::GetTensorType<bool>(),             \
                          })                                                   \
          .TypeConstraint("indices", DataTypeImpl::GetTensorType<TIndex>()),   \
      GatherND<TIndex>);

#define REGISTER_KERNEL_TYPED_GATHER_ND(TIndex, ver)                                                           \
  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                                               \
      GatherND, kOnnxDomain, ver, TIndex, kRocmExecutionProvider,                                              \
      (*KernelDefBuilder::Create())                                                                            \
          .TypeConstraint("T", BuildKernelDefConstraints<float, MLFloat16, double, int64_t, BFloat16, bool>()) \
          .TypeConstraint("indices", DataTypeImpl::GetTensorType<TIndex>()),                                   \
      GatherND<TIndex>);

REGISTER_KERNEL_TYPED_GATHER_ND(int64_t, 13)
REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(int64_t, 12, 12)
REGISTER_KERNEL_VERSIONED_TYPED_GATHER_ND(int64_t, 11, 11)

template <typename T>
struct GatherNDComputeImpl {
  void operator()(hipStream_t stream,
                  const int64_t num_slices,
                  const int64_t slice_size,
                  const void* const kernel_input_data,
                  void* const kernel_output_data,
                  int64_t* const input_slice_offsets_data) const {
    typedef typename ToHipType<T>::MappedType HipT;
    GatherNDImpl<HipT>(stream, num_slices, kernel_input_data,
                       kernel_output_data, slice_size, input_slice_offsets_data);
  }
};

template <typename TIndex>
Status GatherND<TIndex>::ComputeInternal(OpKernelContext* context) const {
  auto input_tensor = context->Input<Tensor>(0);
  auto indices_tensor = context->Input<Tensor>(1);
  ORT_RETURN_IF_NOT(input_tensor != nullptr, "input_tensor == nullptr");
  ORT_RETURN_IF_NOT(indices_tensor != nullptr, "indices_tensor == nullptr");

  auto input_shape = input_tensor->Shape();
  auto indices_shape = indices_tensor->Shape();

  if (indices_shape.NumDimensions() == 0) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "indices tensor must has rank larger than 0");
  }

  auto last_indices_dimension = batch_dims_ + indices_shape[indices_shape.NumDimensions() - 1];
  if (last_indices_dimension > static_cast<int64_t>(input_shape.NumDimensions())) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "last dimension of indices must not be larger than rank of input tensor");
  }

  ORT_RETURN_IF_ERROR(CheckBatchDimensionsMatch(
      static_cast<size_t>(batch_dims_), {input_shape, indices_shape}));

  // Output shape
  std::vector<int64_t> shape(indices_shape.GetDims().begin(),
                             indices_shape.GetDims().end() - 1);
  shape.insert(shape.end(),
               input_shape.GetDims().begin() + last_indices_dimension,
               input_shape.GetDims().end());

  auto output_tensor = context->Output(0, TensorShape(shape));

  // Bail out early in case the output is going to be empty
  if (output_tensor->Shape().Size() == 0) {
    return Status::OK();
  }

  // Compute
  int64_t num_slices;
  int64_t slice_size;
  IAllocatorUniquePtr<int64_t> input_slice_offsets_buffer;
  ORT_RETURN_IF_ERROR(PrepareCompute<TIndex>(
      Stream(), batch_dims_, input_shape, indices_shape, indices_tensor,
      num_slices, slice_size, input_slice_offsets_buffer));

  const void* const kernel_input_data = input_tensor->DataRaw();
  void* const kernel_output_data = output_tensor->MutableDataRaw();

  utils::MLTypeCallDispatcher<float, MLFloat16, double, int64_t, BFloat16, bool>
      t_disp(input_tensor->GetElementType());
  t_disp.Invoke<GatherNDComputeImpl>(Stream(), num_slices, slice_size,
                                     kernel_input_data, kernel_output_data,
                                     input_slice_offsets_buffer.get());

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
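For reference, the output-shape rule used by GatherND<TIndex>::ComputeInternal above (indices shape minus its last axis, followed by the trailing input dimensions) can be checked in isolation. A minimal host-only sketch in plain C++; the helper name and the example values are mine, not part of the file:

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical helper mirroring the shape rule in GatherND<TIndex>::ComputeInternal:
// output shape = indices_shape[:-1] ++ input_shape[batch_dims + indices_shape[-1]:]
std::vector<int64_t> GatherNDOutputShape(const std::vector<int64_t>& input_shape,
                                         const std::vector<int64_t>& indices_shape,
                                         int64_t batch_dims) {
  const int64_t last_indices_dimension = batch_dims + indices_shape.back();
  std::vector<int64_t> shape(indices_shape.begin(), indices_shape.end() - 1);
  shape.insert(shape.end(), input_shape.begin() + last_indices_dimension, input_shape.end());
  return shape;
}

int main() {
  // input [2, 3, 4], indices [2, 1, 2], batch_dims = 0 -> output [2, 1, 4]
  for (int64_t d : GatherNDOutputShape({2, 3, 4}, {2, 1, 2}, 0)) std::cout << d << ' ';
  std::cout << '\n';
}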
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_nd.h  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

Status CheckBatchDimensionsMatch(
    size_t num_batch_dimensions,
    const std::vector<std::reference_wrapper<TensorShape>>& tensor_shapes);

class GatherNDBase : public RocmKernel {
 public:
  GatherNDBase(const OpKernelInfo& info) : RocmKernel(info) {
    info.GetAttrOrDefault("batch_dims", &batch_dims_, static_cast<int64_t>(0));
    ORT_ENFORCE(batch_dims_ >= 0);
  }

 protected:
  template <typename TIndex>
  Status PrepareCompute(
      hipStream_t stream,
      const int64_t batch_dims,
      const TensorShape& input_shape,
      const TensorShape& indices_shape,
      const Tensor* indices_tensor,
      int64_t& num_slices,
      int64_t& slice_size,
      IAllocatorUniquePtr<int64_t>& input_slice_offsets_buffer) const;

  int64_t batch_dims_;
};

template <typename Tind>
class GatherND final : public GatherNDBase {
 public:
  GatherND(const OpKernelInfo& info) : GatherNDBase(info) {}
  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.cu  (new file, mode 0 → 100644)

#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/gather_nd_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/atomic/common.cuh"

namespace onnxruntime {
namespace rocm {

template <typename TIndex>
__global__ void _ComputeSliceOffsetsKernel(
    const int64_t batch_dims,
    const TArray<int64_t> input_dims,
    const size_t num_slices,
    const size_t num_slices_per_batch,
    const size_t input_batch_stride,
    const size_t num_slice_dims,
    const int64_t* const sizes_from_slice_dims_data,  // num_slice_dims elements
    const TIndex* const indices_data,                 // num_slices * num_slice_dims elements
    int64_t* const input_slice_offsets_data) {        // num_slices elements
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(slice_idx, num_slices)
  const size_t batch_idx = slice_idx / num_slices_per_batch;
  const size_t base_offset = batch_idx * input_batch_stride;

  const TIndex* const slice_indices = indices_data + slice_idx * num_slice_dims;
  size_t relative_slice_offset = 0;
  for (size_t dim_idx = 0; dim_idx < num_slice_dims; ++dim_idx) {
    int64_t index = static_cast<int64_t>(slice_indices[dim_idx]);
    const size_t input_dim_idx = batch_dims + dim_idx;
    HIP_KERNEL_ASSERT(index >= -input_dims[input_dim_idx] && index < input_dims[input_dim_idx]);
    if (index < 0) index += input_dims[input_dim_idx];
    relative_slice_offset += index * sizes_from_slice_dims_data[dim_idx];
  }

  input_slice_offsets_data[slice_idx] = base_offset + relative_slice_offset;
}

template <typename T>
__global__ void _GatherNDKernel(
    const size_t num_slices,
    const T* input_data,
    T* output_data,
    const size_t slice_size,
    const int64_t* slice_offsets) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(i, num_slices * slice_size)
  uint64_t slice_offset = slice_offsets[i / slice_size];
  output_data[i] = input_data[slice_offset + i % slice_size];
};

template <typename TIndex>
void ComputeSliceOffsetsImpl(
    hipStream_t stream,
    const int64_t batch_dims,
    const TArray<int64_t> input_dims,
    const size_t num_slices,
    const size_t num_slices_per_batch,
    const size_t input_batch_stride,
    const size_t num_slice_dims,
    const int64_t* const sizes_from_slice_dims_data,  // num_slice_dims elements
    const TIndex* const indices_data,                 // num_slices * num_slice_dims elements
    int64_t* const input_slice_offsets_data) {        // num_slices elements
  const unsigned int blocks_per_grid =
      static_cast<unsigned int>(CeilDiv(num_slices, GridDim::maxThreadsPerBlock));
  hipLaunchKernelGGL(_ComputeSliceOffsetsKernel, blocks_per_grid, GridDim::maxThreadsPerBlock, 0, stream,
                     batch_dims, input_dims, num_slices, num_slices_per_batch, input_batch_stride,
                     num_slice_dims, sizes_from_slice_dims_data, indices_data, input_slice_offsets_data);
}

template <typename T>
void GatherNDImpl(
    hipStream_t stream,
    const size_t num_slices,
    const void* input_data,
    void* output_data,
    const size_t slice_size,
    const int64_t* input_slice_offsets_data) {
  const unsigned int blocks_per_grid =
      static_cast<unsigned int>(CeilDiv(num_slices * slice_size, GridDim::maxThreadsPerBlock));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_GatherNDKernel<T>), blocks_per_grid, GridDim::maxThreadsPerBlock, 0, stream,
                     num_slices, static_cast<const T*>(input_data), static_cast<T*>(output_data),
                     slice_size, input_slice_offsets_data);
}

#define SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(TIndex)    \
  template void ComputeSliceOffsetsImpl<TIndex>(          \
      hipStream_t stream,                                 \
      const int64_t batch_dims,                           \
      const TArray<int64_t> input_dims,                   \
      const size_t num_slices,                            \
      const size_t num_slices_per_batch,                  \
      const size_t input_batch_stride,                    \
      const size_t num_slice_dims,                        \
      const int64_t* const sizes_from_slice_dims_data,    \
      const TIndex* const indices_data,                   \
      int64_t* const input_slice_offsets_data);

#define SPECIALIZED_IMPL(T) \
  template void GatherNDImpl<T>(hipStream_t stream, const size_t num_slices, const void* input_data, void* output_data, const size_t slice_size, const int64_t* input_slice_offsets_data);

SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int32_t)
SPECIALIZED_COMPUTE_SLICE_OFFSETS_IMPL(int64_t)

SPECIALIZED_IMPL(bool)
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(int64_t)
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(BFloat16)
#endif

}  // namespace rocm
}  // namespace onnxruntime
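To see what _ComputeSliceOffsetsKernel produces, the same per-slice offset arithmetic can be run on the host. A minimal sketch in plain C++ (the function name, variable names, and example values are illustrative, not part of the file above):

#include <cstdint>
#include <cstdio>
#include <vector>

// Host-side mirror of the per-slice offset computation: for each slice of indices,
// accumulate index[d] * sizes_from_slice_dims[d] and add the batch base offset.
std::vector<int64_t> ComputeSliceOffsetsHost(int64_t batch_dims,
                                             const std::vector<int64_t>& input_dims,
                                             const std::vector<int64_t>& indices,  // num_slices * num_slice_dims values
                                             size_t num_slice_dims,
                                             size_t num_slices_per_batch,
                                             int64_t input_batch_stride,
                                             const std::vector<int64_t>& sizes_from_slice_dims) {
  const size_t num_slices = indices.size() / num_slice_dims;
  std::vector<int64_t> offsets(num_slices);
  for (size_t slice_idx = 0; slice_idx < num_slices; ++slice_idx) {
    const size_t batch_idx = slice_idx / num_slices_per_batch;
    int64_t offset = static_cast<int64_t>(batch_idx) * input_batch_stride;
    for (size_t d = 0; d < num_slice_dims; ++d) {
      int64_t index = indices[slice_idx * num_slice_dims + d];
      if (index < 0) index += input_dims[batch_dims + d];  // negative indices wrap, as in the kernel
      offset += index * sizes_from_slice_dims[d];
    }
    offsets[slice_idx] = offset;
  }
  return offsets;
}

int main() {
  // input shape [3, 4], indices shape [2, 1], batch_dims = 0:
  // slice_size = 4, sizes_from_slice_dims = {4}; indices {1, -1} -> offsets {4, 8}
  auto offsets = ComputeSliceOffsetsHost(0, {3, 4}, {1, -1}, 1, 2, 12, {4});
  for (int64_t o : offsets) std::printf("%lld ", static_cast<long long>(o));
  std::printf("\n");
}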
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/gather_nd_impl.h  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

template <typename TIndex>
void ComputeSliceOffsetsImpl(
    hipStream_t stream,
    const int64_t batch_dims,
    const TArray<int64_t> input_dims,
    const size_t num_slices,
    const size_t num_slices_per_batch,
    const size_t input_batch_stride,
    const size_t num_slice_dims,
    const int64_t* const sizes_from_slice_dims_data,  // num_slice_dims elements
    const TIndex* const indices_data,                 // num_slices * num_slice_dims elements
    int64_t* const input_slice_offsets_data);         // num_slices elements

template <typename T>
void GatherNDImpl(
    hipStream_t stream,
    const size_t num_slices,
    const void* input_data,
    void* output_data,
    const size_t slice_size,
    const int64_t* input_slice_offsets_data);

#ifdef ENABLE_TRAINING
template <typename T>
void GatherNDGradImpl(
    hipStream_t stream,
    const size_t num_slices,
    const void* update_data,
    void* output_data,
    const size_t slice_size,
    const int64_t* input_slice_offsets_data);
#endif

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/identity_op.cc  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "identity_op.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Dropout,
    kOnnxDomain,
    7, 9,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", {DataTypeImpl::GetTensorType<MLFloat16>(),
                              DataTypeImpl::GetTensorType<float>(),
                              DataTypeImpl::GetTensorType<double>()})
        .Alias(0, 0),
    IdentityOp<true>);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Dropout,
    kOnnxDomain,
    10, 11,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", {DataTypeImpl::GetTensorType<MLFloat16>(),
                              DataTypeImpl::GetTensorType<float>(),
                              DataTypeImpl::GetTensorType<double>()})
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<bool>())
        .Alias(0, 0),
    IdentityOp<true>);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Identity,
    kOnnxDomain,
    1, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .Alias(0, 0),
    IdentityOp<false>);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Identity,
    kOnnxDomain,
    13, 13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .Alias(0, 0),
    IdentityOp<false>);

ONNX_OPERATOR_KERNEL_EX(
    Identity,
    kOnnxDomain,
    14,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("V", DataTypeImpl::AllFixedSizeTensorAndSequenceTensorTypes())
        .Alias(0, 0),
    IdentityOp<false>);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/identity_op.h  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

template <bool is_dropout>
class IdentityOp final : public RocmKernel {
 public:
  IdentityOp(const OpKernelInfo& info) : RocmKernel(info) {
  }

  Status ComputeInternal(OpKernelContext* context) const override {
    auto X_ml_type = context->InputType(0);
    if (X_ml_type->IsTensorType()) {
      const Tensor* X = context->Input<Tensor>(0);
      if (nullptr == X) {
        return Status(common::ONNXRUNTIME, common::FAIL,
                      "IdentityOp rocm: input count mismatch.");
      }
      const TensorShape& shape = X->Shape();
      Tensor* Y = context->Output(0, shape);
      if (nullptr == Y) {
        return Status(common::ONNXRUNTIME, common::FAIL,
                      "IdentityOp rocm: failed to allocate output tensor.");
      }
      auto X_type = X->DataType();

      const void* source = X->DataRaw(X_type);
      void* target = Y->MutableDataRaw(X_type);
      // If source and target pointers are not equal, we need to copy the data.
      if (target != source) {
        HIP_RETURN_IF_ERROR(hipMemcpyAsync(target, source, X->Shape().Size() * X->DataType()->Size(),
                                           hipMemcpyDeviceToDevice, Stream()));
      }

      if (is_dropout) {
        Tensor* mask = context->Output(1, shape);
        // a 'nullptr' returned would make it an unused optional output
        if (mask != nullptr) {
          // Opset 7 differs with Opset 10 in that the type of the 'mask'
          // output is tied with the type of the input in Opset 7 whereas
          // the type of 'mask' in Opset 10 is 'bool' always
          // so we have a common solution
          void* mask_data = mask->MutableDataRaw();
          // In 'test'/'inference' mode, there are no input values dropped out
          // so fill the buffer with 0/false
          HIP_RETURN_IF_ERROR(hipMemsetAsync(mask_data, 0, mask->SizeInBytes(), Stream()));
        }
      }
    } else if (X_ml_type->IsTensorSequenceType()) {
      const TensorSeq* X = context->Input<TensorSeq>(0);
      ORT_ENFORCE(X != nullptr, "IdentityOp rocm: input tensor is missing.");
      TensorSeq* Y = context->Output<TensorSeq>(0);
      ORT_ENFORCE(Y != nullptr, "IdentityOp rocm: failed to allocate output tensor sequence.");
      if (X == Y) {
        return Status::OK();
      }
      auto X_type = X->DataType();
      Y->SetType(X_type);
      AllocatorPtr alloc;
      auto status = context->GetTempSpaceAllocator(&alloc);
      if (!status.IsOK()) {
        return Status(common::ONNXRUNTIME, common::FAIL,
                      "IdentityOp rocm: unable to get an allocator.");
      }
      auto X_size = X->Size();
      for (size_t i = 0; i < X_size; ++i) {
        const Tensor& source_tensor = X->Get(i);
        std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
                                                               source_tensor.Shape(), alloc);
        HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_tensor->MutableDataRaw(),
                                           source_tensor.DataRaw(),
                                           source_tensor.SizeInBytes(),
                                           hipMemcpyDeviceToDevice, Stream()));
        Y->Add(std::move(*target_tensor));
      }
    } else {
      return Status(common::ONNXRUNTIME, common::FAIL,
                    "IdentityOp rocm: unsupported input type.");
    }
    return Status::OK();
  }
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/nonzero_impl.cu  (new file, mode 0 → 100644)

#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "nonzero_impl.h"
#include "core/providers/rocm/shared_inc/rocm_call.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include <hipcub/hipcub.hpp>

namespace onnxruntime {
namespace rocm {

static const int NONZERO_THREADS_PER_BLOCK = GridDim::maxThreadsPerBlock;

// TODO: check overflow
int NonZeroCalcBlockCount(int64_t x_size) {
  return static_cast<int>(CeilDiv(x_size, NONZERO_THREADS_PER_BLOCK));
}

hipError_t NonZeroCalcPrefixSumTempStorageBytes(hipStream_t stream, int* prefix_counts,
                                                int number_of_blocks, size_t& temp_storage_bytes) {
  temp_storage_bytes = 0;
  return hipcub::DeviceScan::InclusiveSum(
      nullptr, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, stream);
}

hipError_t NonZeroInclusivePrefixSum(hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes,
                                     int* prefix_counts, int number_of_blocks) {
  return hipcub::DeviceScan::InclusiveSum(
      d_temp_storage, temp_storage_bytes, prefix_counts, prefix_counts, number_of_blocks, stream);
}

template <typename InputT, int THREADS_PER_BLOCK>
__global__ void NonZeroCountEachBlockKernel(const InputT* x, int64_t x_size, int* count_in_blocks) {
  typedef hipcub::BlockReduce<int, THREADS_PER_BLOCK, hipcub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduceT;
  __shared__ typename BlockReduceT::TempStorage temp_storage;
  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
  // const hipcub::CastOp<bool> cast_to_bool; not supported on amd hipcub
  int nz = 0;
  if (index < x_size && bool(x[index])) ++nz;
  int count = BlockReduceT(temp_storage).Sum(nz);
  if (threadIdx.x == 0) {
    count_in_blocks[blockIdx.x] = count;
  }
}

template <typename InputT, int THREADS_PER_BLOCK>
__global__ void NonZeroOutputPositionsKernel(
    const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod> x_strides,
    const int* prefix_counts, int nonzero_elements, int64_t* results) {
  typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScanT;
  __shared__ typename BlockScanT::TempStorage temp_storage;
  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
  // const hipcub::CastOp<bool> cast_to_bool; not supported on amd hipcub
  int nz = 0;
  if (index < x_size && bool(x[index])) ++nz;
  int pos_in_block = 0;
  BlockScanT(temp_storage).InclusiveSum(nz, pos_in_block);
  int result_position = ((blockIdx.x == 0) ? 0 : prefix_counts[blockIdx.x - 1]) + pos_in_block - nz;
  if (index < x_size && bool(x[index])) {
    int remain = (int)index, dim = 0;
    for (int axis = 0, rp = result_position; axis < x_rank; ++axis, rp += nonzero_elements) {
      x_strides[axis].divmod(remain, dim, remain);
      results[rp] = (int64_t)dim;
    }
  }
}

constexpr int MAX_DIMS = 16;

template <typename InputT, int THREADS_PER_BLOCK>
__global__ void UnRolledNonZeroOutputPositionsKernel(
    const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod> x_strides,
    const int* prefix_counts, int nonzero_elements, int64_t* results) {
  typedef hipcub::BlockScan<int, THREADS_PER_BLOCK> BlockScanT;
  __shared__ typename BlockScanT::TempStorage temp_storage;
  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
  // const hipcub::CastOp<bool> cast_to_bool; not supported on amd hipcub
  int nz = 0;
  if (index < x_size && bool(x[index])) ++nz;
  int pos_in_block = 0;
  BlockScanT(temp_storage).InclusiveSum(nz, pos_in_block);
  int result_position = ((blockIdx.x == 0) ? 0 : prefix_counts[blockIdx.x - 1]) + pos_in_block - nz;
  if (index < x_size && bool(x[index])) {
    int remain = (int)index, dim = 0;
    int rp = result_position;
#pragma unroll
    for (int axis = 0; axis < MAX_DIMS; ++axis) {
      if (axis == x_rank) {
        break;
      }
      x_strides[axis].divmod(remain, dim, remain);
      results[rp] = (int64_t)dim;
      rp += nonzero_elements;
    }
  }
}

template <typename InputT>
hipError_t NonZeroCountEachBlock(hipStream_t stream, const InputT* x, int64_t x_size, int* count_in_blocks) {
  int num_blocks = NonZeroCalcBlockCount(x_size);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(NonZeroCountEachBlockKernel<InputT, NONZERO_THREADS_PER_BLOCK>),
                     num_blocks, NONZERO_THREADS_PER_BLOCK, 0, stream,
                     x, x_size, count_in_blocks);
  return hipSuccess;
}

template <typename InputT>
hipError_t NonZeroOutputPositions(
    hipStream_t stream, const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod>& x_strides,
    const int* prefix_counts, int nonzero_elements, int64_t* results) {
  int num_blocks = NonZeroCalcBlockCount(x_size);
  if (x_rank > MAX_DIMS) {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(NonZeroOutputPositionsKernel<InputT, NONZERO_THREADS_PER_BLOCK>),
                       num_blocks, NONZERO_THREADS_PER_BLOCK, 0, stream,
                       x, x_size, x_rank, x_strides, prefix_counts, nonzero_elements, results);
  } else {
    hipLaunchKernelGGL(HIP_KERNEL_NAME(UnRolledNonZeroOutputPositionsKernel<InputT, NONZERO_THREADS_PER_BLOCK>),
                       num_blocks, NONZERO_THREADS_PER_BLOCK, 0, stream,
                       x, x_size, x_rank, x_strides, prefix_counts, nonzero_elements, results);
  }
  return hipSuccess;
}

template hipError_t NonZeroCountEachBlock(hipStream_t stream, const bool*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const uint8_t*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const int64_t*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const int32_t*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const float*, int64_t, int*);
template hipError_t NonZeroCountEachBlock(hipStream_t stream, const half*, int64_t, int*);

template hipError_t NonZeroOutputPositions(hipStream_t stream, const bool*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const uint8_t*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const int64_t*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const int32_t*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const float*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);
template hipError_t NonZeroOutputPositions(hipStream_t stream, const half*, int64_t, int, const TArray<fast_divmod>&, const int*, int, int64_t*);

}  // namespace rocm
}  // namespace onnxruntime
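The entry points above implement NonZero in two passes: count nonzero elements per block, run an inclusive prefix sum over the per-block counts, then scatter each nonzero element's coordinates into a [rank, nonzero_count] result. A small host-only C++ sketch of the equivalent result layout, without HIP or the block decomposition (names and values are illustrative only):

#include <cstdint>
#include <cstdio>
#include <vector>

// Emits the coordinates of nonzero elements of a row-major tensor as a
// [rank, nonzero_count] array, the same layout NonZeroOutputPositions writes.
std::vector<int64_t> NonZeroHost(const std::vector<float>& x, const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) strides[i] = strides[i + 1] * dims[i + 1];

  std::vector<size_t> nonzero_flat;  // flat indices of nonzero elements, in order
  for (size_t i = 0; i < x.size(); ++i)
    if (x[i] != 0.0f) nonzero_flat.push_back(i);

  const size_t count = nonzero_flat.size();
  std::vector<int64_t> results(dims.size() * count);
  for (size_t p = 0; p < count; ++p) {
    int64_t remain = static_cast<int64_t>(nonzero_flat[p]);
    for (size_t axis = 0; axis < dims.size(); ++axis) {
      results[axis * count + p] = remain / strides[axis];  // coordinate along this axis
      remain %= strides[axis];
    }
  }
  return results;
}

int main() {
  // 2x3 input with nonzeros at (0,1) and (1,2) -> results = {0, 1, 1, 2}
  auto r = NonZeroHost({0, 5, 0, 0, 0, 7}, {2, 3});
  for (int64_t v : r) std::printf("%lld ", static_cast<long long>(v));
  std::printf("\n");
}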
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/nonzero_impl.h  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

int NonZeroCalcBlockCount(int64_t x_size);

hipError_t NonZeroCalcPrefixSumTempStorageBytes(hipStream_t stream, int* prefix_counts, int number_of_blocks, size_t&);

hipError_t NonZeroInclusivePrefixSum(hipStream_t stream, void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks);

// count nonzero elements in each block into counts_in_blocks,
// the counts_in_blocks buffer is pre-allocated on gpu first.
template <typename InputT>
hipError_t NonZeroCountEachBlock(hipStream_t stream, const InputT* x, int64_t x_size, int* counts_in_blocks);

// output nonzero positions using input x and prefix_counts for each blocks
template <typename InputT>
hipError_t NonZeroOutputPositions(
    hipStream_t stream, const InputT* x, int64_t x_size, int x_rank, const TArray<fast_divmod>& x_strides,
    const int* prefix_counts, int nonzero_elements, int64_t* results);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/nonzero_op.cc  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "nonzero_op.h"
#include "nonzero_impl.h"
#include "core/providers/cpu/tensor/utils.h"

namespace onnxruntime {
namespace rocm {

// kernel builder functions
#define NONZERO_TYPED_KERNEL_WITH_TYPE_NAME(type, type_name)                                  \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                                    \
      NonZero,                                                                                \
      kOnnxDomain,                                                                            \
      9, 12,                                                                                  \
      type_name,                                                                              \
      kRocmExecutionProvider,                                                                 \
      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
      NonZero<type>)                                                                          \
  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                              \
      NonZero,                                                                                \
      kOnnxDomain,                                                                            \
      13,                                                                                     \
      type_name,                                                                              \
      kRocmExecutionProvider,                                                                 \
      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<type>()), \
      NonZero<type>)

#define NONZERO_TYPED_KERNEL(type) \
  NONZERO_TYPED_KERNEL_WITH_TYPE_NAME(type, type)

// start with a subset of types, enable more as needed...
NONZERO_TYPED_KERNEL(bool)
NONZERO_TYPED_KERNEL(uint8_t)
//NONZERO_TYPED_KERNEL(uint16_t)
//NONZERO_TYPED_KERNEL(uint32_t)
//NONZERO_TYPED_KERNEL(uint64_t)
//NONZERO_TYPED_KERNEL(int8_t)
//NONZERO_TYPED_KERNEL(int16_t)
NONZERO_TYPED_KERNEL(int32_t)
NONZERO_TYPED_KERNEL(int64_t)
NONZERO_TYPED_KERNEL(MLFloat16)
//NONZERO_TYPED_KERNEL(BFloat16)
NONZERO_TYPED_KERNEL(float)
//NONZERO_TYPED_KERNEL(double)
//NONZERO_TYPED_KERNEL_WITH_TYPE_NAME(std::string, string)

#undef NONZERO_TYPED_KERNEL
#undef NONZERO_TYPED_KERNEL_WITH_TYPE_NAME

template <typename T>
Status NonZero<T>::ComputeInternal(OpKernelContext* context) const {
  static const TensorShape kScalarDims{1};
  const auto x = context->Input<Tensor>(0);
  int nonzero_elements = 0;
  const auto& x_shape = x->Shape();
  const int x_rank = x_shape.IsScalar() ? 1 : static_cast<int>(x_shape.NumDimensions());
  auto x_dims = (x_shape.IsScalar()) ? kScalarDims.GetDims() : x_shape.GetDims();

  const int64_t x_size = x_shape.Size();
  if (x_size > 0) {
    auto x_data = reinterpret_cast<const typename ToHipType<T>::MappedType*>(x->Data<T>());

    const int number_of_blocks = NonZeroCalcBlockCount(x_size);
    auto prefix_buffer = GetScratchBuffer<int>(number_of_blocks);
    int* prefix_counts = prefix_buffer.get();
    HIP_RETURN_IF_ERROR(NonZeroCountEachBlock(Stream(), x_data, x_size, prefix_counts));

    size_t temp_storage_bytes = 0;
    HIP_RETURN_IF_ERROR(NonZeroCalcPrefixSumTempStorageBytes(Stream(), prefix_counts, number_of_blocks, temp_storage_bytes));
    auto temp_buffer = GetScratchBuffer<uint8_t>(temp_storage_bytes);
    auto d_temp_storage = temp_buffer.get();
    HIP_RETURN_IF_ERROR(NonZeroInclusivePrefixSum(Stream(), d_temp_storage, temp_storage_bytes, prefix_counts, number_of_blocks));

    // hipMemcpyAsync from device memory to pageable host memory will return only once the copy has completed.
    HIP_RETURN_IF_ERROR(hipMemcpyAsync(
        &nonzero_elements, prefix_counts + number_of_blocks - 1, sizeof(int), hipMemcpyDeviceToHost, Stream()));

    TArray<fast_divmod> fdm_x_strides(x_rank);
    TensorPitches x_strides(x_dims);
    for (auto i = 0; i < x_rank; i++) {
      fdm_x_strides[i] = fast_divmod(static_cast<int>(x_strides[i]));
    }

    auto* output_tensor = context->Output(0, {x_rank, nonzero_elements});
    ORT_ENFORCE(output_tensor, "failed to get first output!");

    HIP_RETURN_IF_ERROR(NonZeroOutputPositions(
        Stream(), x_data, x_size, x_rank, fdm_x_strides, prefix_counts, nonzero_elements,
        output_tensor->MutableData<int64_t>()));
  } else {
    context->Output(0, {x_rank, nonzero_elements});
  }
  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/nonzero_op.h  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
class NonZero final : public RocmKernel {
 public:
  NonZero(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/onehot.cc  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/onehot.h"

using namespace onnxruntime::common;

namespace onnxruntime {
namespace rocm {

// T1: indices, T2: depth, T3: values
#define REGISTER_TYPED_ONE_HOT_OP(in_type, out_type, depth_type)                 \
  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                 \
      OneHot,                                                                    \
      kOnnxDomain,                                                               \
      11,                                                                        \
      in_type##_##out_type##_##depth_type,                                       \
      kRocmExecutionProvider,                                                    \
      (*KernelDefBuilder::Create())                                              \
          .InputMemoryType(OrtMemTypeCPUInput, 1) /* Keep depth in CPU */        \
          .InputMemoryType(OrtMemTypeCPUInput, 2) /* Keep values in CPU */       \
          .TypeConstraint("T1", DataTypeImpl::GetTensorType<in_type>())          \
          .TypeConstraint("T2", DataTypeImpl::GetTensorType<depth_type>())       \
          .TypeConstraint("T3", DataTypeImpl::GetTensorType<out_type>()),        \
      OneHotOp<in_type, out_type, depth_type>);

REGISTER_TYPED_ONE_HOT_OP(int64_t, int64_t, int64_t)
REGISTER_TYPED_ONE_HOT_OP(int64_t, float, int64_t)
REGISTER_TYPED_ONE_HOT_OP(int32_t, float, int32_t)
REGISTER_TYPED_ONE_HOT_OP(int64_t, MLFloat16, int64_t)
REGISTER_TYPED_ONE_HOT_OP(int32_t, MLFloat16, int32_t)

template <typename in_type, typename out_type, typename depth_type>
Status OneHotOp<in_type, out_type, depth_type>::ComputeInternal(OpKernelContext* ctx) const {
  typedef typename ToHipType<out_type>::MappedType HipT_Out;

  const Tensor* indices = ctx->Input<Tensor>(0);
  const Tensor* depth = ctx->Input<Tensor>(1);
  const Tensor* values = ctx->Input<Tensor>(2);

  ORT_RETURN_IF_ERROR(ValidateInputs(depth, values));

  const auto* depth_data = depth->Data<depth_type>();
  const auto depth_val = static_cast<int64_t>(*depth_data);  // As per spec in case 'depth' is of non-integer type, it will be casted to int64 before use.
  if (depth_val <= 0) {
    return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Depth is negative.");
  }

  // prepare output shape
  int64_t prefix_dim_size, suffix_dim_size;
  TensorShapeVector output_shape;
  ORT_RETURN_IF_ERROR(PrepareOutputShape(indices, depth_val, axis_, prefix_dim_size, suffix_dim_size, output_shape));

  // allocate output
  const auto* values_data = reinterpret_cast<const HipT_Out*>(values->Data<out_type>());
  Tensor* output = ctx->Output(0, TensorShape(output_shape));

  // edge case where we have a dim with a value of 0
  if (output->Shape().Size() == 0)
    return Status::OK();

  const fast_divmod fdm_suffix(gsl::narrow_cast<int>(suffix_dim_size));
  const auto* indices_data = indices->Data<in_type>();
  auto* output_data = reinterpret_cast<HipT_Out*>(output->MutableData<out_type>());

  if (values_data[0] == HipT_Out(0.f)) {
    HIP_RETURN_IF_ERROR(hipMemsetAsync(output->MutableDataRaw(), 0, output->SizeInBytes(), Stream()));
    OneHotWithZeroOffValueImpl(Stream(), indices_data, fdm_suffix, depth_val, values_data[1],
                               output_data, indices->Shape().Size());
    return Status::OK();
  }

  const fast_divmod fdm_depth_suffix(gsl::narrow_cast<int>(depth_val * suffix_dim_size));
  OneHotImpl(Stream(), indices_data, fdm_depth_suffix, fdm_suffix, depth_val, values_data[1], values_data[0],
             output_data, output->Shape().Size());

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/onehot.cu  (new file, mode 0 → 100644)

#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/tensor/onehot.h"

namespace onnxruntime {
namespace rocm {

template <typename in_type, typename out_type>
__global__ void _OneHotImpl(
    const in_type* indices_data,
    const fast_divmod fdm_depth_suffix,
    const fast_divmod fdm_suffix,
    const int64_t depth_val,
    const out_type on_value,
    const out_type off_value,
    out_type* output_data,
    HIP_LONG N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);

  int prefix_index, prefix_offset;
  fdm_depth_suffix.divmod(id, prefix_index, prefix_offset);
  int depth_index, suffix_index;
  fdm_suffix.divmod(prefix_offset, depth_index, suffix_index);

  HIP_LONG indices_index = prefix_index * fdm_suffix.d_ + suffix_index;

  // handle index outside the range [-depth, depth-1] case
  bool is_valid_range = indices_data[indices_index] >= -depth_val && indices_data[indices_index] < depth_val;

  // handle negative index
  in_type adjusted_indice = (indices_data[indices_index] + depth_val) % depth_val;

  output_data[id] = (is_valid_range && adjusted_indice == in_type(depth_index)) ? on_value : off_value;
}

template <typename in_type, typename out_type>
__global__ void _OneHotWithZeroOffValueImpl(
    const in_type* indices_data,
    const fast_divmod fdm_suffix,
    const int64_t depth_val,
    const out_type on_value,
    out_type* output_data,
    HIP_LONG N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);

  if (indices_data[id] >= -depth_val && indices_data[id] < depth_val) {
    in_type adjusted_index = indices_data[id] >= 0 ? indices_data[id] : indices_data[id] + depth_val;
    int q, r;
    fdm_suffix.divmod(id, q, r);
    output_data[(q * depth_val + adjusted_index) * fdm_suffix.d_ + r] = on_value;
  }
}

template <typename in_type, typename out_type>
void OneHotImpl(
    hipStream_t stream,
    const in_type* indices_data,
    const fast_divmod fdm_depth_suffix,
    const fast_divmod fdm_suffix,
    const int64_t depth_val,
    const out_type on_value,
    const out_type off_value,
    out_type* output_data,
    size_t count) {
  int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
  HIP_LONG N = static_cast<HIP_LONG>(count);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_OneHotImpl<in_type, out_type>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     indices_data, fdm_depth_suffix, fdm_suffix, depth_val, on_value, off_value, output_data, N);
}

template <typename in_type, typename out_type>
void OneHotWithZeroOffValueImpl(
    hipStream_t stream,
    const in_type* indices_data,
    const fast_divmod fdm_suffix,
    const int64_t depth_val,
    const out_type on_value,
    out_type* output_data,
    size_t count) {
  int blocksPerGrid = (int)(ceil(static_cast<float>(count) / GridDim::maxThreadsPerBlock));
  HIP_LONG N = static_cast<HIP_LONG>(count);
  hipLaunchKernelGGL(HIP_KERNEL_NAME(_OneHotWithZeroOffValueImpl<in_type, out_type>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     indices_data, fdm_suffix, depth_val, on_value, output_data, N);
}

#define SPECIALIZED_OneHotImpl(in_type, out_type) \
  template void OneHotImpl(                       \
      hipStream_t stream,                         \
      const in_type* indices_data,                \
      const fast_divmod fdm_depth_suffix,         \
      const fast_divmod fdm_suffix,               \
      const int64_t depth_val,                    \
      const out_type on_value,                    \
      const out_type off_value,                   \
      out_type* output_data,                      \
      size_t count);

SPECIALIZED_OneHotImpl(int64_t, int64_t)
SPECIALIZED_OneHotImpl(int64_t, float)
SPECIALIZED_OneHotImpl(int32_t, float)
SPECIALIZED_OneHotImpl(int64_t, half)
SPECIALIZED_OneHotImpl(int32_t, half)

#define SPECIALIZED_OneHotWithZeroOffValueImpl(in_type, out_type) \
  template void OneHotWithZeroOffValueImpl(                       \
      hipStream_t stream,                                         \
      const in_type* indices_data,                                \
      const fast_divmod fdm_suffix,                               \
      const int64_t depth_val,                                    \
      const out_type on_value,                                    \
      out_type* output_data,                                      \
      size_t count);

SPECIALIZED_OneHotWithZeroOffValueImpl(int64_t, int64_t)
SPECIALIZED_OneHotWithZeroOffValueImpl(int64_t, float)
SPECIALIZED_OneHotWithZeroOffValueImpl(int32_t, float)
SPECIALIZED_OneHotWithZeroOffValueImpl(int64_t, half)
SPECIALIZED_OneHotWithZeroOffValueImpl(int32_t, half)

}  // namespace rocm
}  // namespace onnxruntime
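The index math in _OneHotImpl decomposes each flat output index into (prefix, depth, suffix) coordinates and writes on_value where the depth coordinate equals the (wrapped) index. A small host-side sketch of the same decomposition in plain C++ (not part of the provider; the helper name and example values are illustrative):

#include <cstdint>
#include <cstdio>
#include <vector>

// Fills output of shape [prefix, depth, suffix] the way _OneHotImpl does:
// on_value where the depth coordinate equals the wrapped index, off_value elsewhere.
std::vector<float> OneHotHost(const std::vector<int64_t>& indices,  // size = prefix * suffix
                              int64_t prefix, int64_t depth, int64_t suffix,
                              float on_value, float off_value) {
  std::vector<float> output(prefix * depth * suffix, off_value);
  for (int64_t id = 0; id < static_cast<int64_t>(output.size()); ++id) {
    const int64_t prefix_index = id / (depth * suffix);
    const int64_t prefix_offset = id % (depth * suffix);
    const int64_t depth_index = prefix_offset / suffix;
    const int64_t suffix_index = prefix_offset % suffix;
    const int64_t idx = indices[prefix_index * suffix + suffix_index];
    const bool valid = idx >= -depth && idx < depth;
    const int64_t adjusted = ((idx % depth) + depth) % depth;  // wrap negative indices
    if (valid && adjusted == depth_index) output[id] = on_value;
  }
  return output;
}

int main() {
  // indices {1, -1}, depth 3, suffix 1 -> rows one-hot at positions 1 and 2
  auto out = OneHotHost({1, -1}, /*prefix=*/2, /*depth=*/3, /*suffix=*/1, 1.0f, 0.0f);
  for (float v : out) std::printf("%.0f ", v);
  std::printf("\n");  // 0 1 0 0 0 1
}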
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/onehot.h  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"

namespace onnxruntime {
namespace rocm {

template <typename in_type, typename out_type>
void OneHotImpl(
    hipStream_t stream,
    const in_type* indices,
    const fast_divmod fdm_depth_suffix,
    const fast_divmod fdm_suffix,
    const int64_t depth_val,
    const out_type on_value,
    const out_type off_value,
    out_type* output,
    size_t count);

template <typename in_type, typename out_type>
void OneHotWithZeroOffValueImpl(
    hipStream_t stream,
    const in_type* indices,
    const fast_divmod fdm_suffix,
    const int64_t depth_val,
    const out_type on_value,
    out_type* output,
    size_t count);

template <typename in_type, typename out_type, typename depth_type>
class OneHotOp final : public RocmKernel {
 public:
  explicit OneHotOp(const OpKernelInfo& info) : RocmKernel(info) {
    int64_t tmp_axis;
    if (info.GetAttr<int64_t>("axis", &tmp_axis).IsOK()) {
      axis_ = tmp_axis;
    }
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OneHotOp);

  int64_t axis_ = -1;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/pad.cc  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "pad.h"
#include "pad_impl.h"
#include "core/providers/cpu/tensor/utils.h"

namespace onnxruntime {
namespace rocm {

#define REGISTER_KERNEL_TYPED(T)                                  \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                        \
      Pad,                                                        \
      kOnnxDomain,                                                \
      2, 10,                                                      \
      T,                                                          \
      kRocmExecutionProvider,                                     \
      (*KernelDefBuilder::Create())                               \
          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
      Pad<T>);                                                    \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                        \
      Pad,                                                        \
      kOnnxDomain,                                                \
      11, 12,                                                     \
      T,                                                          \
      kRocmExecutionProvider,                                     \
      (*KernelDefBuilder::Create())                               \
          .InputMemoryType(OrtMemTypeCPUInput, 1)                 \
          .InputMemoryType(OrtMemTypeCPUInput, 2)                 \
          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
      Pad<T>);                                                    \
  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
      Pad,                                                        \
      kOnnxDomain,                                                \
      13,                                                         \
      T,                                                          \
      kRocmExecutionProvider,                                     \
      (*KernelDefBuilder::Create())                               \
          .InputMemoryType(OrtMemTypeCPUInput, 1)                 \
          .InputMemoryType(OrtMemTypeCPUInput, 2)                 \
          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
      Pad<T>);

using PadsVector = PadBase::PadsVector;

static bool IsNCHWInputWithPaddingAlongHAndW(size_t input_rank,
                                             const TArray<int64_t>& lower_pads,
                                             const TArray<int64_t>& upper_pads) {
  if (input_rank == 2) {  // N = 1 and C = 1
    return true;
  }

  // Is CHW input AND no padding along C dim
  if (input_rank == 3 &&
      lower_pads[0] == 0 &&  // start padding along C
      upper_pads[0] == 0) {  // end padding along C
    return true;
  }

  // Is NCHW input AND no padding along N and C dims
  if (input_rank == 4 &&
      lower_pads[0] == 0 && lower_pads[1] == 0 &&  // start padding along N and C
      upper_pads[0] == 0 && upper_pads[1] == 0) {  // end padding along N and C
    return true;
  }

  return false;
}

template <typename T>
typename ToHipType<T>::MappedType ToCudaValue(const T& value) {
  return value;
}

template <>
typename ToHipType<MLFloat16>::MappedType ToCudaValue<MLFloat16>(const MLFloat16& value) {
  return *reinterpret_cast<const typename ToHipType<MLFloat16>::MappedType*>(&value.val);
}

template <typename T>
Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
  typedef typename ToHipType<T>::MappedType HipT;

  const auto& input_tensor = *ctx->Input<Tensor>(0);
  auto const& input_shape = input_tensor.Shape();
  int32_t dimension_count = static_cast<int32_t>(input_shape.NumDimensions());
  const PadsVector* p_pads = &pads_;
  const PadsVector* p_slices = &slices_;

  HipT value = ToHipType<T>::FromFloat(value_);

  // kOnnxDomain Pad opset >= 11 (Or) kMsDomain opset == 1
  PadsVector pads;
  PadsVector slices;
  if (is_dynamic_) {
    const Tensor& pads_tensor = *ctx->Input<Tensor>(1);
    const auto pads_tensor_dims = pads_tensor.Shape().GetDims();
    ORT_ENFORCE(utils::IsPrimitiveDataType<int64_t>(pads_tensor.DataType()),
                "Pads tensor should be an INT64 tensor");
    ORT_ENFORCE(pads_tensor_dims.size() == 1 || (pads_tensor_dims.size() == 2 && pads_tensor_dims[0] == 1),
                "Pads tensor should be a 1D tensor of shape [2 * input_rank] or a 2D tensor of shape [1, 2 * input_rank]");

    const int64_t* pads_tensor_raw_data = pads_tensor.Data<int64_t>();
    size_t pads_size = static_cast<size_t>(pads_tensor.Shape().Size());
    ORT_ENFORCE(pads_size == 2 * static_cast<size_t>(dimension_count),
                "Pads tensor size should be equal to twice the input dimension count ");

    pads.reserve(2LL * dimension_count);
    for (size_t i = 0; i < pads_size; ++i) {
      pads.push_back(pads_tensor_raw_data[i]);
    }

    // Separate out any negative pads into the slices array
    slices.resize(pads.size(), 0);
    for (size_t index = 0; index < pads.size(); index++) {
      if (pads[index] < 0) {
        slices[index] = pads[index];
        pads[index] = 0;
      }
    }

    T raw_value{};
    const Tensor* value_tensor = ctx->Input<Tensor>(2);
    if (nullptr != value_tensor) {
      ORT_ENFORCE(utils::IsPrimitiveDataType<T>(value_tensor->DataType()) &&
                      value_tensor->Shape().Size() == 1,
                  "Value tensor should be a 1D tensor of size 1 with the same type as that of the input tensor");
      raw_value = value_tensor->Data<T>()[0];
      value = ToCudaValue<T>(raw_value);
    }
    p_pads = &pads;
    p_slices = &slices;
  }

  TensorPitches input_pitches(input_shape.GetDims());
  TArray<int64_t> input_dims(input_shape.GetDims());
  TArray<int64_t> input_strides(input_pitches);

  auto output_dims(input_shape.AsShapeVector());
  ORT_ENFORCE(static_cast<size_t>(dimension_count * 2) == p_pads->size(),
              "'pads' attribute has wrong number of values");

  // Calculate output dimensions, and handle any negative padding
  TArray<int64_t> lower_pads(dimension_count);
  TArray<int64_t> upper_pads(dimension_count);
  for (auto i = 0; i < dimension_count; i++) {
    lower_pads[i] = (*p_pads)[i] + (*p_slices)[i];
    upper_pads[i] = (*p_pads)[i + dimension_count] + (*p_slices)[i + dimension_count];
    output_dims[i] += lower_pads[i] + upper_pads[i];
  }
  TensorShape output_shape(output_dims);

  // special case when there is a dim value of 0 in the shape. behavior depends on mode
  if (input_shape.Size() == 0) {
    ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
  }

  auto& output_tensor = *ctx->Output(0, output_shape);

  if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) &&
      std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) &&
      output_shape.Size() > 0) {
    HIP_RETURN_IF_ERROR(hipMemcpyAsync(output_tensor.MutableData<T>(), input_tensor.Data<T>(),
                                       sizeof(typename ToHipType<T>::MappedType) * output_shape.Size(),
                                       hipMemcpyDeviceToDevice, Stream()));
    return Status::OK();
  }

  if (IsNCHWInputWithPaddingAlongHAndW(static_cast<size_t>(dimension_count), lower_pads, upper_pads)) {
    // If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW)
    // NCHW input
    int height_dim = 2;
    int width_dim = 3;
    if (dimension_count == 3) {  // CHW input
      height_dim = 1;
      width_dim = 2;
    } else if (dimension_count == 2) {  // HW input
      height_dim = 0;
      width_dim = 1;
    }

    PadNCHWInputWithPaddingAlongHAndWImpl(
        Stream(),
        dimension_count == 4 ? input_dims[0] : 1,
        dimension_count == 4 ? input_dims[1] : (dimension_count == 3 ? input_dims[0] : 1),
        input_dims[height_dim], output_dims[height_dim],
        input_dims[width_dim], output_dims[width_dim],
        lower_pads[height_dim], lower_pads[width_dim],
        value,
        static_cast<int>(mode_),
        reinterpret_cast<const typename ToHipType<T>::MappedType*>(input_tensor.Data<T>()),
        reinterpret_cast<typename ToHipType<T>::MappedType*>(output_tensor.MutableData<T>()),
        output_tensor.Shape().Size());

    return Status::OK();
  }

  TArray<fast_divmod> fdm_output_strides(dimension_count);
  TensorPitches output_strides(output_dims);
  for (auto i = 0; i < dimension_count; i++) {
    fdm_output_strides[i] = fast_divmod(static_cast<int>(output_strides[i]));
  }

  PadImpl(
      Stream(),
      dimension_count,
      input_dims,
      input_strides,
      lower_pads,
      value,
      static_cast<int>(mode_),
      reinterpret_cast<const typename ToHipType<T>::MappedType*>(input_tensor.Data<T>()),
      fdm_output_strides,
      reinterpret_cast<typename ToHipType<T>::MappedType*>(output_tensor.MutableData<T>()),
      output_tensor.Shape().Size());

  return Status::OK();
}

#define SPECIALIZED_COMPUTE(T) \
  REGISTER_KERNEL_TYPED(T)     \
  template Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const;

SPECIALIZED_COMPUTE(float)
SPECIALIZED_COMPUTE(double)
SPECIALIZED_COMPUTE(MLFloat16)
SPECIALIZED_COMPUTE(bool)

}  // namespace rocm
};  // namespace onnxruntime
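Pad<T>::ComputeInternal above folds negative entries of the dynamic pads input into a separate slices array and then derives the output dimensions from the combined lower/upper pads. A host-only sketch of that bookkeeping in plain C++ (illustrative names and values, not the provider API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Splits a [2 * rank] pads vector into non-negative pads and non-positive slices,
// then computes output dims the same way Pad<T>::ComputeInternal does.
std::vector<int64_t> PaddedOutputDims(std::vector<int64_t> pads, const std::vector<int64_t>& input_dims) {
  const size_t rank = input_dims.size();
  std::vector<int64_t> slices(pads.size(), 0);
  for (size_t i = 0; i < pads.size(); ++i) {
    if (pads[i] < 0) {  // negative pad means cropping: move it into slices
      slices[i] = pads[i];
      pads[i] = 0;
    }
  }
  std::vector<int64_t> output_dims(input_dims);
  for (size_t i = 0; i < rank; ++i) {
    const int64_t lower = pads[i] + slices[i];
    const int64_t upper = pads[i + rank] + slices[i + rank];
    output_dims[i] += lower + upper;
  }
  return output_dims;
}

int main() {
  // input [4, 5], pads {1, -1, 2, 0}: dim 0 gains 1 in front and 2 at the back; dim 1 loses 1 in front
  for (int64_t d : PaddedOutputDims({1, -1, 2, 0}, {4, 5})) std::printf("%lld ", static_cast<long long>(d));
  std::printf("\n");  // 7 4
}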
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/pad.h  (new file, mode 0 → 100644)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/padbase.h"

using onnxruntime::PadBase;

namespace onnxruntime {
namespace rocm {

template <typename T>
class Pad final : public PadBase, public RocmKernel {
 public:
  Pad(const OpKernelInfo& info) : PadBase(info), RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/pad_impl.cu
0 → 100644
View file @
1a91fcc2
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "pad_impl.h"
namespace
onnxruntime
{
namespace
rocm
{
// PadMode enum from core/providers/cpu/tensor/pad.h, cannot use that header because of nvcc/onnxruntime incompatibility
enum
class
PadMode
:
int
{
Constant
=
0
,
Reflect
,
Edge
};
template
<
typename
T
,
int
pad_mode
>
__global__
void
_PadKernel
(
const
size_t
shape_rank
,
const
TArray
<
int64_t
>
input_dims
,
const
TArray
<
int64_t
>
input_strides
,
const
TArray
<
int64_t
>
lower_pads
,
const
T
pad_value
,
const
T
*
input_data
,
const
TArray
<
fast_divmod
>
fdm_output_strides
,
T
*
output_data
,
const
size_t
N
)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT
(
id
,
N
);
HIP_LONG
input_index
=
0
;
HIP_LONG
output_index
=
id
;
bool
use_pad_value
=
false
;
for
(
int
dim
=
0
;
dim
<
shape_rank
&&
!
use_pad_value
;
++
dim
)
{
int
out_coord
,
r
;
fdm_output_strides
[
dim
].
divmod
(
output_index
,
out_coord
,
r
);
output_index
=
r
;
int
in_coord
=
0
;
if
(
out_coord
<
lower_pads
[
dim
])
{
switch
((
PadMode
)
pad_mode
)
{
case
PadMode
::
Constant
:
use_pad_value
=
true
;
break
;
case
PadMode
::
Edge
:
in_coord
=
0
;
break
;
case
PadMode
::
Reflect
:
in_coord
=
lower_pads
[
dim
]
-
out_coord
;
break
;
}
}
else
if
(
out_coord
>=
lower_pads
[
dim
]
+
input_dims
[
dim
])
{
switch
((
PadMode
)
pad_mode
)
{
case
PadMode
::
Constant
:
use_pad_value
=
true
;
break
;
case
PadMode
::
Edge
:
in_coord
=
input_dims
[
dim
]
-
1
;
break
;
case
PadMode
::
Reflect
:
in_coord
=
input_dims
[
dim
]
-
2
-
(
out_coord
-
(
lower_pads
[
dim
]
+
input_dims
[
dim
]));
break
;
}
}
else
{
in_coord
=
out_coord
-
lower_pads
[
dim
];
}
input_index
+=
input_strides
[
dim
]
*
in_coord
;
}
output_data
[
id
]
=
use_pad_value
?
(
T
)
pad_value
:
input_data
[
input_index
];
}
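The loop above is the heart of the generic pad kernel: for each axis, an output coordinate that falls in the leading or trailing pad region is either replaced by the pad value (Constant), clamped to the nearest edge (Edge), or mirrored back into range (Reflect); interior coordinates are simply shifted by the lower pad. A self-contained CPU sketch of the same per-axis mapping, useful as a reference when checking the kernel (the helper name and use of std::optional are illustrative, not part of this file):

#include <cstdint>
#include <optional>

enum class PadMode : int { Constant = 0, Reflect, Edge };

// Returns the input coordinate to read for one axis, or std::nullopt when the
// constant pad value should be written instead. Mirrors the switch in _PadKernel.
std::optional<int64_t> MapPadCoord(int64_t out_coord, int64_t lower_pad,
                                   int64_t input_dim, PadMode mode) {
  if (out_coord < lower_pad) {  // leading pad region
    if (mode == PadMode::Constant) return std::nullopt;
    if (mode == PadMode::Edge) return 0;
    return lower_pad - out_coord;  // Reflect: mirror around the first element
  }
  if (out_coord >= lower_pad + input_dim) {  // trailing pad region
    if (mode == PadMode::Constant) return std::nullopt;
    if (mode == PadMode::Edge) return input_dim - 1;
    return input_dim - 2 - (out_coord - (lower_pad + input_dim));  // Reflect
  }
  return out_coord - lower_pad;  // interior: plain shift by the lower pad
}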
template <typename T, int pad_mode>
__global__ void _PadNCHWInputWithPaddingAlongHAndWKernel(
    const int64_t n,  // Batch
    const int64_t c,  // Channel
    const int64_t input_height,
    const int64_t output_height,
    const int64_t input_width,
    const int64_t output_width,
    const int64_t pad_height_start,
    const int64_t pad_width_start,
    const T pad_value,
    const T* input_data,
    T* output_data,
    const size_t N) {
  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
  const int current_output_width = id % output_width;
  int nc_index = id / output_width;
  const int current_output_height = nc_index % output_height;
  nc_index /= output_height;

  int current_input_height = current_output_height - pad_height_start;
  int current_input_width = current_output_width - pad_width_start;

  switch ((PadMode)pad_mode) {
    case PadMode::Constant:
      output_data[id] = (current_input_height < 0 || current_input_width < 0 ||
                         current_input_height >= input_height || current_input_width >= input_width)
                            ? pad_value
                            : input_data[(nc_index * input_height + current_input_height) * input_width +
                                         current_input_width];
      break;
    case PadMode::Edge:
      current_input_height = std::max(0, std::min(current_input_height, static_cast<int>(input_height - 1)));
      current_input_width = std::max(0, std::min(current_input_width, static_cast<int>(input_width - 1)));
      output_data[id] = input_data[(nc_index * input_height + current_input_height) * input_width +
                                   current_input_width];
      break;
    case PadMode::Reflect:
      current_input_height = std::max(current_input_height, -current_input_height);
      current_input_height = std::min(static_cast<int>(current_input_height),
                                      2 * static_cast<int>(input_height) - current_input_height - 2);
      current_input_width = std::max(current_input_width, -current_input_width);
      current_input_width = std::min(static_cast<int>(current_input_width),
                                     2 * static_cast<int>(input_width) - current_input_width - 2);
      output_data[id] = input_data[(nc_index * input_height + current_input_height) * input_width +
                                   current_input_width];
      break;
  }
}
template <typename T>
void PadImpl(
    hipStream_t stream,
    const size_t shape_rank,
    const TArray<int64_t>& input_dims,
    const TArray<int64_t>& input_strides,
    const TArray<int64_t>& lower_pads,
    const T pad_value,
    const int pad_mode,
    const T* input_data,
    const TArray<fast_divmod>& fdm_output_strides,
    T* output_data,
    const size_t N) {
  if (N == 0)  // special case where there's a dim value of 0 in the output shape
    return;

  int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  switch (pad_mode) {
    case 0:
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadKernel<T, 0>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         shape_rank, input_dims, input_strides, lower_pads, pad_value,
                         input_data, fdm_output_strides, output_data, N);
      break;
    case 1:
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadKernel<T, 1>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         shape_rank, input_dims, input_strides, lower_pads, pad_value,
                         input_data, fdm_output_strides, output_data, N);
      break;
    case 2:
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadKernel<T, 2>), blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                         shape_rank, input_dims, input_strides, lower_pads, pad_value,
                         input_data, fdm_output_strides, output_data, N);
      break;
  }
}
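One small implementation note: the grid size here is computed with a floating-point ceil(N / maxThreadsPerBlock), which is adequate for typical sizes but loses precision once N exceeds what a float can represent exactly; the integer ceiling division used elsewhere in this commit (CeilDiv in quantize_linear.cu) sidesteps that. A minimal sketch of the integer form, assuming only a positive block size (the helper name is hypothetical):

#include <cstddef>

// Integer ceiling division: the smallest block count with
// blocks * threads_per_block >= n, with no float rounding involved.
constexpr int BlocksPerGrid(std::size_t n, std::size_t threads_per_block) {
  return static_cast<int>((n + threads_per_block - 1) / threads_per_block);
}

static_assert(BlocksPerGrid(1, 256) == 1, "one partial block");
static_assert(BlocksPerGrid(256, 256) == 1, "exactly one full block");
static_assert(BlocksPerGrid(257, 256) == 2, "spills into a second block");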
template <typename T>
void PadNCHWInputWithPaddingAlongHAndWImpl(
    hipStream_t stream,
    const int64_t n,  // Batch
    const int64_t c,  // Channel
    const int64_t input_height,
    const int64_t output_height,
    const int64_t input_width,
    const int64_t output_width,
    const int64_t pad_height_start,
    const int64_t pad_width_start,
    const T pad_value,
    const int pad_mode,
    const T* input_data,
    T* output_data,
    const size_t N) {
  if (N == 0)  // special case where there's a dim value of 0 in the output shape
    return;

  int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
  switch (pad_mode) {
    case 0:
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadNCHWInputWithPaddingAlongHAndWKernel<T, 0>), blocksPerGrid,
                         GridDim::maxThreadsPerBlock, 0, stream,
                         n, c, input_height, output_height, input_width, output_width,
                         pad_height_start, pad_width_start, pad_value, input_data, output_data, N);
      break;
    case 1:
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadNCHWInputWithPaddingAlongHAndWKernel<T, 1>), blocksPerGrid,
                         GridDim::maxThreadsPerBlock, 0, stream,
                         n, c, input_height, output_height, input_width, output_width,
                         pad_height_start, pad_width_start, pad_value, input_data, output_data, N);
      break;
    case 2:
      hipLaunchKernelGGL(HIP_KERNEL_NAME(_PadNCHWInputWithPaddingAlongHAndWKernel<T, 2>), blocksPerGrid,
                         GridDim::maxThreadsPerBlock, 0, stream,
                         n, c, input_height, output_height, input_width, output_width,
                         pad_height_start, pad_width_start, pad_value, input_data, output_data, N);
      break;
  }
}
#define SPECIALIZED_IMPL(T) \
template void PadImpl<T>(hipStream_t stream, const size_t shape_rank, \
const TArray<int64_t>& input_dims, const TArray<int64_t>& input_strides, \
const TArray<int64_t>& lower_pads, \
const T pad_value, \
const int pad_mode, \
const T* input_data, \
const TArray<fast_divmod>& fdm_output_strides, \
T* output_data, \
const size_t N); \
template void PadNCHWInputWithPaddingAlongHAndWImpl<T>(hipStream_t stream, const int64_t n, const int64_t c, \
const int64_t input_height, const int64_t output_height, \
const int64_t input_width, const int64_t output_width, \
const int64_t pad_height_start, \
const int64_t pad_width_start, \
const T pad_value, \
const int pad_mode, \
const T* input_data, T* output_data, \
const size_t N);
SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
SPECIALIZED_IMPL(half)
SPECIALIZED_IMPL(bool)

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/pad_impl.h
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
void PadNCHWInputWithPaddingAlongHAndWImpl(
    hipStream_t stream,
    const int64_t n,  // Batch
    const int64_t c,  // Channel
    const int64_t input_height,
    const int64_t output_height,
    const int64_t input_width,
    const int64_t output_width,
    const int64_t pad_height_start,
    const int64_t pad_width_start,
    const T pad_value,
    const int pad_mode,
    const T* input_data,
    T* output_data,
    const size_t N);

template <typename T>
void PadImpl(
    hipStream_t stream,
    const size_t shape_rank,
    const TArray<int64_t>& input_dims,
    const TArray<int64_t>& input_strides,
    const TArray<int64_t>& lower_pads,
    const T pad_value,
    const int pad_mode,
    const T* input_data,
    const TArray<fast_divmod>& fdm_output_strides,
    T* output_data,
    const size_t N);

}  // namespace rocm
}  // namespace onnxruntime
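Note that only the lower pads are passed to these entry points: together with the input dimensions they determine where the interior region starts, while the upper pads are implied by the output shape the caller allocates. A minimal sketch of that relation (the pad values are hypothetical; the layout follows the usual ONNX Pad convention of lower pads then upper pads, and is not code from this file):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> input_dims = {3, 4};   // hypothetical 2-D input
  std::vector<int64_t> lower_pads = {1, 2};   // pads before each axis
  std::vector<int64_t> upper_pads = {0, 1};   // pads after each axis
  std::vector<int64_t> output_dims(input_dims.size());
  for (size_t i = 0; i < input_dims.size(); ++i) {
    output_dims[i] = lower_pads[i] + input_dims[i] + upper_pads[i];
  }
  assert(output_dims[0] == 4 && output_dims[1] == 7);  // 1+3+0 and 2+4+1
  return 0;
}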
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.cc
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "quantize_linear.h"
#include "quantize_linear.cuh"

namespace onnxruntime {
namespace rocm {

template <class T, class U>
Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
  typedef typename ToHipType<U>::MappedType CudaU;

  auto& x = *ctx->Input<Tensor>(0);
  auto& y_scale = *ctx->Input<Tensor>(1);
  auto* y_zero_point = ctx->Input<Tensor>(2);
  auto& y = *ctx->Output(0, x.Shape());

  const auto& x_shape = x.Shape();

  const CudaU* input = reinterpret_cast<const CudaU*>(x.Data<U>());
  T* output = y.MutableData<T>();

  // TO DO: support per-channel
  ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
  ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point),
              "y_zero_point must be a scalar or 1D tensor of size 1.");

  const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
  const auto num_of_elements = x_shape.Size();

  ORT_RETURN_IF_ERROR(CudaQuantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements));

  return Status::OK();
}
template <class T, class U>
Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
  typedef typename ToHipType<U>::MappedType CudaU;

  auto& x = *ctx->Input<Tensor>(0);
  auto& y_scale = *ctx->Input<Tensor>(1);
  auto* y_zero_point = ctx->Input<Tensor>(2);

  const auto& x_shape = x.Shape();
  auto& y = *ctx->Output(0, x_shape);

  const T* input = x.Data<T>();
  CudaU* output = reinterpret_cast<CudaU*>(y.MutableData<U>());

  ORT_ENFORCE(IsScalarOr1ElementVector(&y_scale), "y_scale must be a scalar or 1D tensor of size 1.");
  ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point),
              "y_zero_point must be a scalar or 1D tensor of size 1.");

  const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
  const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
  const auto num_of_elements = x_shape.Size();

  ORT_RETURN_IF_ERROR(CudaDequantizeLinear(Stream(), input, output, scale, zero_point, num_of_elements));

  return Status::OK();
}
// register QuantizeLinear kernels
#define REGISTER_Q_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
QuantizeLinear, \
kOnnxDomain, \
10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
QuantizeLinear<T, float>);
REGISTER_Q_KERNEL_TYPED(int8_t)
REGISTER_Q_KERNEL_TYPED(uint8_t)
// register DequantizeLinear kernels
#define REGISTER_DQ_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
DequantizeLinear, \
kOnnxDomain, \
10, \
T, \
kRocmExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
DequantizeLinear<T, float>);
REGISTER_DQ_KERNEL_TYPED(int8_t)
REGISTER_DQ_KERNEL_TYPED(uint8_t)
// specialize QuantizeLinear::ComputeInternal and DequantizeLinear::ComputeInternal
#define SPECIALIZED_QDQ_COMPUTE(T, U) \
template Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const; \
template Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const;
SPECIALIZED_QDQ_COMPUTE(int8_t, float)
SPECIALIZED_QDQ_COMPUTE(uint8_t, float)
SPECIALIZED_QDQ_COMPUTE(int8_t, MLFloat16)
SPECIALIZED_QDQ_COMPUTE(uint8_t, MLFloat16)

}  // namespace rocm
}  // namespace onnxruntime
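For reference, the element-wise math these kernels implement is q = saturate(round(x / scale) + zero_point) for QuantizeLinear and x_hat = (q - zero_point) * scale for DequantizeLinear, with round-to-nearest-even on the GPU (__float2int_rn). A self-contained CPU sketch of the same formulas, assuming a uint8 output range (helper names are illustrative), which can serve as a reference when validating the ROCm kernels:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Reference quantize: round-to-nearest-even, then saturate to [0, 255].
uint8_t QuantizeRef(float x, float scale, uint8_t zero_point) {
  int v = static_cast<int>(std::nearbyint(x / scale)) + zero_point;
  return static_cast<uint8_t>(std::clamp(v, 0, 255));
}

// Reference dequantize: recovers x only up to the quantization step (scale).
float DequantizeRef(uint8_t q, float scale, uint8_t zero_point) {
  return static_cast<float>(q - zero_point) * scale;
}

int main() {
  const float scale = 0.1f;
  const uint8_t zero_point = 128;
  uint8_t q = QuantizeRef(1.23f, scale, zero_point);   // -> 140
  float x_hat = DequantizeRef(q, scale, zero_point);   // -> ~1.2
  std::printf("q=%u x_hat=%f\n", static_cast<unsigned>(q), x_hat);
  return 0;
}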
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/quantize_linear.cu
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "quantize_linear.cuh"
#include <limits>
#include "core/providers/rocm/cu_inc/common.cuh"
namespace
onnxruntime
{
namespace
rocm
{
template
<
typename
T
>
struct
Round
;
template
<
>
struct
Round
<
float
>
{
__device__
__forceinline__
int
operator
()(
float
v
)
const
{
return
__float2int_rn
(
v
);
}
};
template
<
>
struct
Round
<
half
>
{
__device__
__forceinline__
int
operator
()(
half
v
)
const
{
return
__half2int_rn
(
v
);
}
};
template
<
int
NumThreadsPerBlock
,
int
NumElementsPerThread
,
typename
OutT
,
typename
InT
>
__global__
void
QuantizeLinearKernel
(
const
InT
*
input
,
OutT
*
output
,
const
InT
*
scale_ptr
,
const
OutT
*
zero_point_ptr
,
HIP_LONG
N
,
Round
<
InT
>
round
)
{
HIP_LONG
id
=
NumElementsPerThread
*
NumThreadsPerBlock
*
blockIdx
.
x
+
threadIdx
.
x
;
InT
scale
=
*
scale_ptr
;
OutT
zero_point
=
zero_point_ptr
!=
nullptr
?
*
zero_point_ptr
:
0
;
#pragma unroll
for
(
int
i
=
0
;
i
<
NumElementsPerThread
;
i
++
)
{
if
(
id
<
N
)
{
int
value
=
round
(
input
[
id
]
/
scale
)
+
zero_point
;
output
[
id
]
=
static_cast
<
OutT
>
(
max
(
std
::
numeric_limits
<
OutT
>::
min
(),
min
(
std
::
numeric_limits
<
OutT
>::
max
(),
value
)));
id
+=
NumThreadsPerBlock
;
}
}
}
template <class OutT, class InT>
Status CudaQuantizeLinear(hipStream_t stream, const InT* input, OutT* output, const InT* scale,
                          const OutT* zero_point, size_t num_of_element) {
  if (num_of_element <= 0)
    return Status::OK();

  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
                                               GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(QuantizeLinearKernel<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>),
                     blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     input, output, scale, zero_point, static_cast<int>(num_of_element), Round<InT>());

  return Status::OK();
}
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernel(const InT* input, OutT* output, const OutT* scale_ptr,
                                       const InT* zero_point_ptr, HIP_LONG N) {
  HIP_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;

  OutT scale = *scale_ptr;
  InT zero_point = zero_point_ptr != nullptr ? *zero_point_ptr : 0;
#pragma unroll
  for (int i = 0; i < NumElementsPerThread; i++) {
    if (id < N) {
      output[id] = static_cast<OutT>(input[id] - zero_point) * scale;
      id += NumThreadsPerBlock;
    }
  }
}
template <class InT, class OutT>
Status CudaDequantizeLinear(hipStream_t stream, const InT* input, OutT* output, const OutT* scale,
                            const InT* zero_point, size_t num_of_element) {
  if (num_of_element <= 0)
    return Status::OK();

  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
                                               GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
  hipLaunchKernelGGL(HIP_KERNEL_NAME(DequantizeLinearKernel<InT, OutT, GridDim::maxThreadsPerBlock,
                                                            GridDim::maxElementsPerThread>),
                     blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream,
                     input, output, scale, zero_point, static_cast<int>(num_of_element));

  return Status::OK();
}
template Status CudaQuantizeLinear<int8_t, float>(hipStream_t stream, const float* input, int8_t* output,
                                                  const float* scale, const int8_t* zero_point,
                                                  size_t num_of_element);
template Status CudaQuantizeLinear<uint8_t, float>(hipStream_t stream, const float* input, uint8_t* output,
                                                   const float* scale, const uint8_t* zero_point,
                                                   size_t num_of_element);
template Status CudaQuantizeLinear<int8_t, half>(hipStream_t stream, const half* input, int8_t* output,
                                                 const half* scale, const int8_t* zero_point,
                                                 size_t num_of_element);
template Status CudaQuantizeLinear<uint8_t, half>(hipStream_t stream, const half* input, uint8_t* output,
                                                  const half* scale, const uint8_t* zero_point,
                                                  size_t num_of_element);

template Status CudaDequantizeLinear<int8_t, float>(hipStream_t stream, const int8_t* input, float* output,
                                                    const float* scale, const int8_t* zero_point,
                                                    size_t num_of_element);
template Status CudaDequantizeLinear<uint8_t, float>(hipStream_t stream, const uint8_t* input, float* output,
                                                     const float* scale, const uint8_t* zero_point,
                                                     size_t num_of_element);
template Status CudaDequantizeLinear<int8_t, half>(hipStream_t stream, const int8_t* input, half* output,
                                                   const half* scale, const int8_t* zero_point,
                                                   size_t num_of_element);
template Status CudaDequantizeLinear<uint8_t, half>(hipStream_t stream, const uint8_t* input, half* output,
                                                    const half* scale, const uint8_t* zero_point,
                                                    size_t num_of_element);

}  // namespace rocm
}  // namespace onnxruntime
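The explicit instantiations above exist because the kernel templates are defined only in this hipcc-compiled translation unit, while quantize_linear.cc calls them through declarations; every (quantized type, float/half) pair the .cc file uses therefore has to be emitted here, or the build would fail at link time. A tiny standalone sketch of the same mechanism (names and types are illustrative, not from this commit):

#include <cstdio>

// Explicit instantiation in miniature: the template definition lives in this
// translation unit, and each concrete type pair that other translation units
// link against must be spelled out, just as the instantiation list above does.
template <class OutT, class InT>
OutT ScaleCast(InT v, InT scale) {
  return static_cast<OutT>(v / scale);
}

template int ScaleCast<int, float>(float, float);        // emitted symbol
template long ScaleCast<long, double>(double, double);   // emitted symbol

int main() {
  std::printf("%d\n", ScaleCast<int, float>(6.0f, 2.0f));  // prints 3
  return 0;
}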