gaoqiong / onnxruntime_v14 - Commits

Commit 1a91fcc2
Authored Jul 25, 2023 by gaoqiong

    add files required by dtk

Parent: a144865d
Pipeline #492: failed with stages in 0 seconds
Changes: 280 | Pipelines: 1
Showing 20 changed files with 2220 additions and 0 deletions.
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/sequence_op.h      +271  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/shape_op.cc        +48   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/size.cc            +35   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/slice.cc           +239  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/slice.h            +42   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/slice_impl.cu      +133  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/slice_impl.h       +36   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/space_depth_ops.cc +186  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/space_depth_ops.h  +42   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/split.cc           +125  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/split.h            +18   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/split_impl.cu      +162  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/split_impl.h       +23   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/squeeze.cc         +77   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/squeeze.h          +18   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/tile.cc            +167  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/tile.h             +18   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/tile_impl.cu       +273  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/tile_impl.h        +25   -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/transpose.cc       +282  -0
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/sequence_op.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/rocm/tensor/concat.h"
#include "core/providers/rocm/tensor/concat_impl.h"

namespace onnxruntime {
namespace rocm {

class SequenceAt final : public RocmKernel {
 public:
  SequenceAt(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* context) const override {
    const TensorSeq* X = context->Input<TensorSeq>(0);
    const Tensor* I = context->Input<Tensor>(1);
    int64_t idx = -1;
    if (I->IsDataType<int32_t>()) {
      idx = static_cast<int64_t>(I->Data<int32_t>()[0]);
    } else {
      idx = I->Data<int64_t>()[0];
    }

    int64_t sequence_size = static_cast<int64_t>(X->Size());
    if (idx < 0) {
      idx = sequence_size + idx;
    }
    ORT_ENFORCE(idx >= 0 && idx < sequence_size, "SequenceAt GPU: Invalid sequence index.");

    const Tensor& source_tensor = X->Get(idx);
    auto source_type = source_tensor.DataType();
    const void* source_addr = source_tensor.DataRaw(source_type);

    Tensor* target_tensor = context->Output(0, source_tensor.Shape());
    void* target_addr = target_tensor->MutableDataRaw(source_type);

    if (source_addr != target_addr) {
      HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_addr, source_addr, source_tensor.SizeInBytes(),
                                         hipMemcpyDeviceToDevice, Stream()));
    }
    return Status::OK();
  }
};  // SequenceAt

class SequenceConstruct final : public RocmKernel {
 public:
  SequenceConstruct(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* context) const override {
    auto num_inputs = Node().InputArgCount().front();
    ORT_ENFORCE(num_inputs >= 1, "Must have 1 or more inputs");

    MLDataType first_dtype = context->Input<Tensor>(0)->DataType();

    AllocatorPtr alloc;
    ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
                "SequenceConstruct GPU: Unable to get an allocator.");

    TensorSeq* Y = context->Output<TensorSeq>(0);
    Y->SetType(first_dtype);
    Y->Reserve(num_inputs);
    for (int input_idx = 0; input_idx < num_inputs; ++input_idx) {
      const auto* source_tensor = context->Input<Tensor>(input_idx);
      std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor->DataType(),
                                                             source_tensor->Shape(), alloc);
      HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_tensor->MutableDataRaw(),
                                         source_tensor->DataRaw(),
                                         source_tensor->SizeInBytes(),
                                         hipMemcpyDeviceToDevice, Stream()));
      Y->Add(std::move(*target_tensor));  // Add will check for type consistency
    }
    return Status::OK();
  }
};  // SequenceConstruct

class SequenceEmpty final : public RocmKernel {
 public:
  SequenceEmpty(const OpKernelInfo& info) : RocmKernel(info) {
    if (!info.GetAttr("dtype", &dtype_).IsOK()) {
      dtype_ = ONNX_NAMESPACE::TensorProto_DataType_FLOAT;
    }
  }

  Status ComputeInternal(OpKernelContext* context) const override {
    TensorSeq* Y = context->Output<TensorSeq>(0);
#ifdef SHARED_PROVIDER
    Y->SetType(DataTypeImpl::GetTypeFromOnnxType(static_cast<int>(dtype_)));
#else
    Y->SetType(DataTypeImpl::TensorTypeFromONNXEnum(static_cast<int>(dtype_))->GetElementType());
#endif
    return Status::OK();
  }

 private:
  int64_t dtype_{};
};  // SequenceEmpty

class SequenceLength final : public RocmKernel {
 public:
  SequenceLength(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* context) const override {
    const TensorSeq* X = context->Input<TensorSeq>(0);
    Tensor* Y = context->Output(0, {});
    Y->MutableData<int64_t>()[0] = static_cast<int64_t>(X->Size());
    return Status::OK();
  }
};  // SequenceLength

class ConcatFromSequence final : public RocmKernel, public ConcatBase {
 public:
  ConcatFromSequence(const OpKernelInfo& info) : RocmKernel(info), ConcatBase(info, true) {}

  Status ComputeInternal(OpKernelContext* context) const override {
    const TensorSeq* X = context->Input<TensorSeq>(0);
    int64_t input_count = static_cast<int64_t>(X->Size());
    InlinedTensorsVector input_tensors;
    for (int64_t i = 0; i < input_count; ++i) {
      input_tensors.push_back(&X->Get(i));
    }

    Prepare p;
    ORT_RETURN_IF_ERROR(PrepareForCompute(context, input_tensors, p));
    if (0 == p.output_num_elements) {
      return Status::OK();
    }

    int64_t initial_output_offset = 0;
    auto element_bytes = p.output_tensor->DataType()->Size();
    for (int input_index = 0; input_index < input_count; input_index++) {
      const auto& prep = p.inputs[input_index];
      if (prep.num_elements == 0) {
        continue;
      }

      auto input_axis_pitch = prep.axis_pitch;
      const uint8_t* input = static_cast<const uint8_t*>(prep.tensor->DataRaw());
      auto input_size = prep.num_elements;
      uint8_t* output = static_cast<uint8_t*>(p.output_tensor->MutableDataRaw());

      int64_t cur_out_offset = 0;
      int64_t cur_in_offset = 0;
      for (size_t idx_copy = 0, end = input_size / input_axis_pitch; idx_copy < end; ++idx_copy) {
        HIP_RETURN_IF_ERROR(hipMemcpyAsync(
            output + (initial_output_offset + cur_out_offset) * element_bytes,
            input + cur_in_offset * element_bytes,
            input_axis_pitch * element_bytes,
            hipMemcpyDeviceToDevice, Stream()));
        cur_out_offset += p.output_axis_pitch;
        cur_in_offset += input_axis_pitch;
      }

      initial_output_offset += input_axis_pitch;
    }
    return Status::OK();
  }
};  // ConcatFromSequence

class SequenceErase final : public RocmKernel {
 public:
  SequenceErase(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* context) const override {
    const TensorSeq* X = context->Input<TensorSeq>(0);
    int64_t X_size = static_cast<int64_t>(X->Size());
    int64_t idx = X_size - 1;
    const Tensor* I = context->Input<Tensor>(1);
    if (I != nullptr) {
      if (I->IsDataType<int32_t>()) {
        idx = static_cast<int64_t>(I->Data<int32_t>()[0]);
      } else {
        idx = I->Data<int64_t>()[0];
      }
      if (idx < 0) {
        idx = X_size + idx;
      }
      ORT_ENFORCE(idx >= 0 && idx < X_size, "SequenceErase GPU: Invalid sequence index.");
    }

    AllocatorPtr alloc;
    ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
                "SequenceErase GPU: Unable to get an allocator.");

    TensorSeq* Y = context->Output<TensorSeq>(0);
    Y->SetType(X->DataType());
    Y->Reserve(X_size - 1);
    for (int64_t i = 0; i < X_size; ++i) {
      if (i == idx) {
        continue;
      }
      const Tensor& source_tensor = X->Get(i);
      std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
                                                             source_tensor.Shape(), alloc);
      HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_tensor->MutableDataRaw(),
                                         source_tensor.DataRaw(),
                                         source_tensor.SizeInBytes(),
                                         hipMemcpyDeviceToDevice, Stream()));
      Y->Add(std::move(*target_tensor));  // Add will check for type consistency
    }
    return Status::OK();
  }
};  // SequenceErase

class SequenceInsert final : public RocmKernel {
 public:
  SequenceInsert(const OpKernelInfo& info) : RocmKernel(info) {}

  Status ComputeInternal(OpKernelContext* context) const override {
    const TensorSeq* S = context->Input<TensorSeq>(0);
    int64_t S_size = static_cast<int64_t>(S->Size());
    int64_t idx = S_size;
    const Tensor* I = context->Input<Tensor>(2);
    if (I != nullptr) {
      if (I->IsDataType<int32_t>()) {
        idx = static_cast<int64_t>(I->Data<int32_t>()[0]);
      } else {
        idx = I->Data<int64_t>()[0];
      }
      if (idx < 0) {
        idx = S_size + idx;
      }
      ORT_ENFORCE(idx >= 0 && idx <= S_size, "SequenceInsert GPU: Invalid sequence index.");
    }

    const Tensor* X = context->Input<Tensor>(1);
    AllocatorPtr alloc;
    ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
                "SequenceInsert GPU: Unable to get an allocator.");

    std::unique_ptr<Tensor> tensor_to_be_inserted = Tensor::Create(X->DataType(), X->Shape(), alloc);
    HIP_RETURN_IF_ERROR(hipMemcpyAsync(tensor_to_be_inserted->MutableDataRaw(),
                                       X->DataRaw(),
                                       X->SizeInBytes(),
                                       hipMemcpyDeviceToDevice, Stream()));

    TensorSeq* Y = context->Output<TensorSeq>(0);
    Y->SetType(S->DataType());
    Y->Reserve(S_size + 1);
    for (int64_t i = 0; i < S_size; ++i) {
      if (i == idx) {
        Y->Add(std::move(*tensor_to_be_inserted));  // Add will check for type consistency
      }
      const Tensor& source_tensor = S->Get(i);
      std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
                                                             source_tensor.Shape(), alloc);
      HIP_RETURN_IF_ERROR(hipMemcpyAsync(target_tensor->MutableDataRaw(),
                                         source_tensor.DataRaw(),
                                         source_tensor.SizeInBytes(),
                                         hipMemcpyDeviceToDevice, Stream()));
      Y->Add(std::move(*target_tensor));  // Add will check for type consistency
    }
    if (idx == S_size) {
      Y->Add(std::move(*tensor_to_be_inserted));  // Add will check for type consistency
    }
    return Status::OK();
  }
};  // SequenceInsert

}  // namespace rocm
}  // namespace onnxruntime
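The three index-taking kernels above (SequenceAt, SequenceErase, SequenceInsert) share the same indexing convention: a negative index counts back from the end of the sequence and is normalized before being range-checked with ORT_ENFORCE. A minimal standalone sketch of that normalization follows; NormalizeSequenceIndex is a hypothetical helper introduced only for illustration, not a function from this commit.

// Standalone sketch of the negative-index normalization inlined by the
// sequence kernels above. Hypothetical helper, not part of the commit.
#include <cstdint>
#include <iostream>

int64_t NormalizeSequenceIndex(int64_t idx, int64_t sequence_size) {
  if (idx < 0) {
    idx = sequence_size + idx;  // e.g. -1 refers to the last element
  }
  return idx;  // caller still range-checks: 0 <= idx < sequence_size (or <= for insert)
}

int main() {
  std::cout << NormalizeSequenceIndex(-1, 5) << "\n";  // prints 4
  std::cout << NormalizeSequenceIndex(2, 5) << "\n";   // prints 2
  return 0;
}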
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/shape_op.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/cpu/tensor/shape_op.h"
#include "core/providers/rocm/rocm_fwd.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Shape,
    kOnnxDomain,
    1, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        // properly force CPU/GPU synch inside the kernel
        .OutputMemoryType(OrtMemTypeCPUInput, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
    Shape);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Shape,
    kOnnxDomain,
    13, 14,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        // properly force CPU/GPU synch inside the kernel
        .OutputMemoryType(OrtMemTypeCPUInput, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
    Shape);

ONNX_OPERATOR_KERNEL_EX(
    Shape,
    kOnnxDomain,
    15,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        // properly force CPU/GPU synch inside the kernel
        .OutputMemoryType(OrtMemTypeCPUInput, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
    Shape);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/size.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/cpu/tensor/size.h"
#include "core/providers/rocm/rocm_fwd.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Size,
    kOnnxDomain,
    1, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .OutputMemoryType(OrtMemTypeCPUInput, 0)
        .TypeConstraint("T", DataTypeImpl::AllTensorTypes())
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
    Size);

ONNX_OPERATOR_KERNEL_EX(
    Size,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        // properly force CPU/GPU synch inside the kernel
        .OutputMemoryType(OrtMemTypeCPUInput, 0)
        .TypeConstraint("T", DataTypeImpl::AllTensorTypes())
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
    Size);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/slice.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/slice.h"
#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/rocm/tensor/slice_impl.h"

namespace onnxruntime {
namespace rocm {

// This really doesn't need to be a typed registration, as the indices come from attributes and can only be int64.
// Leaving as is to maintain the original incorrect registration setup (pre 02/2022).
#define REGISTER_VERSIONED_TYPED_SLICE(TIND)                             \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                               \
      Slice,                                                             \
      kOnnxDomain,                                                       \
      1, 9,                                                              \
      TIND,                                                              \
      kRocmExecutionProvider,                                            \
      (*KernelDefBuilder::Create())                                      \
          .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), \
      Slice<false>);

REGISTER_VERSIONED_TYPED_SLICE(int64_t)

#define REGISTER_V10_TYPED_SLICE(TIND)                                   \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                               \
      Slice,                                                             \
      kOnnxDomain,                                                       \
      10, 10,                                                            \
      TIND,                                                              \
      kRocmExecutionProvider,                                            \
      (*KernelDefBuilder::Create())                                      \
          .InputMemoryType(OrtMemTypeCPUInput, 1)                        \
          .InputMemoryType(OrtMemTypeCPUInput, 2)                        \
          .InputMemoryType(OrtMemTypeCPUInput, 3)                        \
          .InputMemoryType(OrtMemTypeCPUInput, 4)                        \
          .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())  \
          .TypeConstraint("Tind", DataTypeImpl::GetTensorType<TIND>()),  \
      Slice<true>);

REGISTER_V10_TYPED_SLICE(int32_t)
REGISTER_V10_TYPED_SLICE(int64_t)

#define REGISTER_V12_TYPED_SLICE(TIND)                                   \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                               \
      Slice,                                                             \
      kOnnxDomain,                                                       \
      11, 12,                                                            \
      TIND,                                                              \
      kRocmExecutionProvider,                                            \
      (*KernelDefBuilder::Create())                                      \
          .InputMemoryType(OrtMemTypeCPUInput, 1)                        \
          .InputMemoryType(OrtMemTypeCPUInput, 2)                        \
          .InputMemoryType(OrtMemTypeCPUInput, 3)                        \
          .InputMemoryType(OrtMemTypeCPUInput, 4)                        \
          .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())  \
          .TypeConstraint("Tind", DataTypeImpl::GetTensorType<TIND>()),  \
      Slice<true>);

REGISTER_V12_TYPED_SLICE(int32_t)
REGISTER_V12_TYPED_SLICE(int64_t)

#define REGISTER_V13_TYPED_SLICE(TIND)                                   \
  ONNX_OPERATOR_TYPED_KERNEL_EX(                                         \
      Slice,                                                             \
      kOnnxDomain,                                                       \
      13,                                                                \
      TIND,                                                              \
      kRocmExecutionProvider,                                            \
      (*KernelDefBuilder::Create())                                      \
          .InputMemoryType(OrtMemTypeCPUInput, 1)                        \
          .InputMemoryType(OrtMemTypeCPUInput, 2)                        \
          .InputMemoryType(OrtMemTypeCPUInput, 3)                        \
          .InputMemoryType(OrtMemTypeCPUInput, 4)                        \
          .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())  \
          .TypeConstraint("Tind", DataTypeImpl::GetTensorType<TIND>()),  \
      Slice<true>);

REGISTER_V13_TYPED_SLICE(int32_t)
REGISTER_V13_TYPED_SLICE(int64_t)

static Status SliceImpCore(hipStream_t stream,
                           const void* input_data, void* output_data,
                           size_t element_size, size_t dimension_count,
                           const TArray<int64_t>& starts_buffer, const TArray<int64_t>& steps_buffer,
                           const TArray<int64_t>& input_strides, const TArray<fast_divmod>& output_strides,
                           const TensorShape& output_shape) {
  if (output_shape.Size() == 0) {
    return Status::OK();
  }

  return SliceImpl(stream,
                   element_size,
                   gsl::narrow_cast<int32_t>(dimension_count),
                   starts_buffer,
                   steps_buffer,
                   input_strides,
                   output_strides,
                   input_data,
                   output_data,
                   output_shape.Size());
}

namespace SliceRocm {

static Status ComputeSliceStrides(const TensorShape& input_shape, TArray<int64_t>& input_strides,
                                  TArray<fast_divmod>& output_strides,
                                  SliceOp::PrepareForComputeMetadata& compute_metadata) {
  // If we were able to coalesce the input and output shapes, use the new shapes to compute the strides.
  const auto input_dimensions = input_shape.GetDims();
  size_t rank = compute_metadata.p_flattened_input_dims_ ? compute_metadata.p_flattened_input_dims_->size()
                                                         : input_dimensions.size();
  input_strides.SetSize(gsl::narrow_cast<int32_t>(rank));
  const gsl::span<int64_t> input_strides_span = gsl::make_span(input_strides.Data(), input_strides.Size());
  if (compute_metadata.p_flattened_input_dims_) {
    ORT_ENFORCE(TensorPitches::Calculate(input_strides_span, compute_metadata.flattened_input_dims_));
  } else {
    ORT_ENFORCE(TensorPitches::Calculate(input_strides_span, input_dimensions));
  }

  const auto output_dims = gsl::make_span(compute_metadata.p_flattened_output_dims_ != nullptr
                                              ? compute_metadata.flattened_output_dims_
                                              : compute_metadata.output_dims_);
  TensorPitches original_output_strides(output_dims);
  output_strides.SetSize(gsl::narrow_cast<int32_t>(original_output_strides.size()));
  for (int32_t i = 0, limit = static_cast<int32_t>(original_output_strides.size()); i < limit; ++i) {
    output_strides[i] = fast_divmod(gsl::narrow_cast<int>(original_output_strides[i]));
  }

  return Status::OK();
}

Status Impl(hipStream_t stream,
            const void* input_data,
            const TensorShape& input_shape,
            void* output_data,
            SliceOp::PrepareForComputeMetadata& compute_metadata,
            size_t element_size) {
  const auto input_dimensions = input_shape.GetDims();
  size_t dimension_count = input_dimensions.size();

  TArray<int64_t> starts_buffer(compute_metadata.starts_);
  TArray<int64_t> steps_buffer(compute_metadata.steps_);
  TArray<int64_t> input_strides;
  TArray<fast_divmod> output_strides;
  ORT_RETURN_IF_ERROR(ComputeSliceStrides(input_shape, input_strides, output_strides, compute_metadata));

  TensorShape output_shape(compute_metadata.output_dims_);

  ORT_RETURN_IF_ERROR(SliceImpCore(stream,
                                   input_data, output_data,
                                   element_size, gsl::narrow_cast<int32_t>(dimension_count),
                                   starts_buffer, steps_buffer, input_strides, output_strides,
                                   output_shape));

  return Status::OK();
}

}  // namespace SliceRocm

template <bool dynamic>
Status Slice<dynamic>::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor* input_tensor = GetSlicedOrUnslicedTensor(ctx);
  ORT_ENFORCE(nullptr != input_tensor);
  const auto& input_shape = input_tensor->Shape();
  const auto input_dimensions = input_shape.GetDims();

  if (input_dimensions.empty())
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Cannot slice scalars");

  SliceOp::PrepareForComputeMetadata compute_metadata(input_dimensions);

  if (dynamic) {
    TensorShapeVector input_starts, input_ends, input_axes, input_steps;
    ORT_RETURN_IF_ERROR(FillInputVectors(ctx, input_starts, input_ends, input_axes, input_steps));
    ORT_RETURN_IF_ERROR(PrepareForCompute(input_starts, input_ends, input_axes, input_steps, compute_metadata));
  } else {
    ORT_RETURN_IF_ERROR(PrepareForCompute(StartsAttribute(), EndsAttribute(), AxesAttribute(), compute_metadata));
  }

  TensorShape output_shape(compute_metadata.output_dims_);

  TArray<int64_t> starts_buffer(compute_metadata.starts_);
  TArray<int64_t> steps_buffer(compute_metadata.steps_);
  TArray<int64_t> input_strides;
  TArray<fast_divmod> output_strides;
  ORT_RETURN_IF_ERROR(SliceRocm::ComputeSliceStrides(input_shape, input_strides, output_strides, compute_metadata));

  // It may seem that we could use `SliceImpCore()` directly, but we need to go through `CallSliceImp()` because
  // `ComputeInternal()` is shared between the inferencing and training kernels, and the training kernel overrides
  // `CallSliceImp()`.
  ORT_RETURN_IF_ERROR(CallSliceImp(input_tensor->DataType()->Size(), input_dimensions.size(), starts_buffer,
                                   steps_buffer, input_strides, output_strides, ctx, output_shape));

  return Status::OK();
}

template <bool dynamic>
const Tensor* Slice<dynamic>::GetSlicedOrUnslicedTensor(OpKernelContext* ctx) const {
  return ctx->Input<Tensor>(0);
}

template <bool dynamic>
Status Slice<dynamic>::FillInputVectors(OpKernelContext* ctx, TensorShapeVector& input_starts,
                                        TensorShapeVector& input_ends, TensorShapeVector& input_axes,
                                        TensorShapeVector& input_steps) const {
  return FillVectorsFromInput(*ctx->Input<Tensor>(1), *ctx->Input<Tensor>(2), ctx->Input<Tensor>(3),
                              ctx->Input<Tensor>(4), input_starts, input_ends, input_axes, input_steps);
}

template <bool dynamic>
Status Slice<dynamic>::CallSliceImp(size_t element_size, size_t dimension_count,
                                    const TArray<int64_t>& starts_buffer,
                                    const TArray<int64_t>& steps_buffer,
                                    const TArray<int64_t>& input_strides,
                                    const TArray<fast_divmod>& output_strides,
                                    OpKernelContext* ctx,
                                    const TensorShape& output_shape) const {
  const auto* input_tensor = ctx->Input<Tensor>(0);
  auto* output_tensor = ctx->Output(0, output_shape);

  return SliceImpCore(Stream(),
                      input_tensor->DataRaw(),
                      output_tensor->MutableDataRaw(),
                      element_size,
                      gsl::narrow_cast<int32_t>(dimension_count),
                      starts_buffer,
                      steps_buffer,
                      input_strides,
                      output_strides,
                      output_shape);
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/slice.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/slice.h"
#include "core/providers/cpu/tensor/utils.h"

namespace onnxruntime {
namespace rocm {

namespace SliceRocm {

Status Impl(hipStream_t stream,
            const void* input_data,
            const TensorShape& input_shape,
            void* output_data,
            SliceOp::PrepareForComputeMetadata& prepare_metadata,
            size_t element_size);

}  // namespace SliceRocm

template <bool dynamic>
class Slice : public RocmKernel, public SliceBase {
 public:
  Slice(const OpKernelInfo& info) : RocmKernel(info), SliceBase(info, dynamic) {}

  Status ComputeInternal(OpKernelContext* ctx) const override;

 private:
  virtual const Tensor* GetSlicedOrUnslicedTensor(OpKernelContext* ctx) const;
  virtual Status FillInputVectors(OpKernelContext* ctx, TensorShapeVector& input_starts,
                                  TensorShapeVector& input_ends, TensorShapeVector& input_axes,
                                  TensorShapeVector& input_steps) const;

  virtual Status CallSliceImp(size_t element_size, size_t dimension_count,
                              const TArray<int64_t>& starts_buffer,
                              const TArray<int64_t>& steps_buffer,
                              const TArray<int64_t>& input_strides,
                              const TArray<fast_divmod>& output_strides,
                              OpKernelContext* ctx,
                              const TensorShape& output_shape) const;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/slice_impl.cu (new file, mode 100644)
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
#include "core/providers/rocm/tensor/slice_impl.h"
namespace
onnxruntime
{
namespace
rocm
{
namespace
{
#ifdef USE_ROCM
constexpr
int
kNumElementsPerThread
=
2
;
constexpr
int
kNumThreadsPerBlock
=
512
;
#else
constexpr
int
kNumElementsPerThread
=
GridDim
::
maxElementsPerThread
;
constexpr
int
kNumThreadsPerBlock
=
GridDim
::
maxThreadsPerBlock
;
#endif
}
// namespace
template
<
bool
is_grad
,
int
DIMS
,
typename
T
>
__global__
void
_SliceKernel
(
const
TArray
<
int64_t
>
starts
,
const
TArray
<
int64_t
>
steps
,
const
TArray
<
int64_t
>
input_strides
,
const
TArray
<
fast_divmod
>
output_strides
,
const
T
*
input_data
,
T
*
output_data
,
const
HIP_LONG
N
)
{
HIP_LONG
start
=
kNumElementsPerThread
*
kNumThreadsPerBlock
*
blockIdx
.
x
+
threadIdx
.
x
;
T
values
[
kNumElementsPerThread
];
HIP_LONG
id
;
if
(
is_grad
)
{
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
kNumElementsPerThread
;
++
i
)
{
if
(
id
<
N
)
{
values
[
i
]
=
input_data
[
id
];
id
+=
kNumThreadsPerBlock
;
}
}
}
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
kNumElementsPerThread
;
++
i
)
{
if
(
id
<
N
)
{
HIP_LONG
input_index
=
0
;
int
div
;
int
mod
=
id
;
int
dim
=
0
;
#pragma unroll
for
(;
dim
<
DIMS
-
1
;
++
dim
)
{
output_strides
[
dim
].
divmod
(
mod
,
div
,
mod
);
input_index
+=
(
starts
[
dim
]
+
div
*
steps
[
dim
])
*
input_strides
[
dim
];
}
input_index
+=
starts
[
dim
]
+
mod
*
steps
[
dim
];
if
(
is_grad
)
{
output_data
[
input_index
]
=
values
[
i
];
}
else
{
values
[
i
]
=
input_data
[
input_index
];
}
id
+=
kNumThreadsPerBlock
;
}
}
if
(
!
is_grad
)
{
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
kNumElementsPerThread
;
++
i
)
{
if
(
id
<
N
)
{
output_data
[
id
]
=
values
[
i
];
id
+=
kNumThreadsPerBlock
;
}
}
}
}
template
<
bool
is_grad
>
Status
SliceImplEx
(
hipStream_t
stream
,
const
size_t
element_size
,
const
int32_t
dimension_count
,
const
TArray
<
int64_t
>&
starts
,
const
TArray
<
int64_t
>&
steps
,
const
TArray
<
int64_t
>&
input_strides
,
const
TArray
<
fast_divmod
>&
output_strides
,
const
void
*
input_data
,
void
*
output_data
,
const
size_t
N
)
{
int
blocksPerGrid
=
static_cast
<
int
>
(
CeilDiv
(
N
,
kNumThreadsPerBlock
*
kNumElementsPerThread
));
switch
(
element_size
)
{
#define HANDLE_DIMS(ELEMENT_TYPE, DIMS) \
case DIMS: { \
hipLaunchKernelGGL(HIP_KERNEL_NAME(_SliceKernel<is_grad, DIMS, ELEMENT_TYPE>), blocksPerGrid, kNumThreadsPerBlock, 0, stream, \
starts, steps, input_strides, output_strides, \
reinterpret_cast<const ToHipType<ELEMENT_TYPE>::MappedType*>(input_data), \
reinterpret_cast<ToHipType<ELEMENT_TYPE>::MappedType*>(output_data), (HIP_LONG)N); \
} break
#define HANDLE_ELEMENT_TYPE(ELEMENT_TYPE) \
case sizeof(ELEMENT_TYPE): { \
switch (dimension_count) { \
HANDLE_DIMS(ELEMENT_TYPE, 1); \
HANDLE_DIMS(ELEMENT_TYPE, 2); \
HANDLE_DIMS(ELEMENT_TYPE, 3); \
HANDLE_DIMS(ELEMENT_TYPE, 4); \
HANDLE_DIMS(ELEMENT_TYPE, 5); \
HANDLE_DIMS(ELEMENT_TYPE, 6); \
HANDLE_DIMS(ELEMENT_TYPE, 7); \
HANDLE_DIMS(ELEMENT_TYPE, 8); \
} \
} break
HANDLE_ELEMENT_TYPE
(
int8_t
);
HANDLE_ELEMENT_TYPE
(
int16_t
);
HANDLE_ELEMENT_TYPE
(
int32_t
);
HANDLE_ELEMENT_TYPE
(
int64_t
);
default:
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
"Type not supported for Slice operator"
);
#undef HANDLE_ELEMENT_TYPE
#undef HANDLE_DIMS
}
return
Status
::
OK
();
}
Status
SliceImpl
(
hipStream_t
stream
,
const
size_t
element_size
,
const
int32_t
dimension_count
,
const
TArray
<
int64_t
>&
starts
,
const
TArray
<
int64_t
>&
steps
,
const
TArray
<
int64_t
>&
input_strides
,
const
TArray
<
fast_divmod
>&
output_strides
,
const
void
*
input_data
,
void
*
output_data
,
const
size_t
N
)
{
return
SliceImplEx
<
false
>
(
stream
,
element_size
,
dimension_count
,
starts
,
steps
,
input_strides
,
output_strides
,
input_data
,
output_data
,
N
);
}
#ifdef ENABLE_TRAINING
Status
SliceImplGrad
(
hipStream_t
stream
,
const
size_t
element_size
,
const
int32_t
dimension_count
,
const
TArray
<
int64_t
>&
starts
,
const
TArray
<
int64_t
>&
steps
,
const
TArray
<
int64_t
>&
input_strides
,
const
TArray
<
fast_divmod
>&
output_strides
,
const
void
*
input_data
,
void
*
output_data
,
const
size_t
N
)
{
return
SliceImplEx
<
true
>
(
stream
,
element_size
,
dimension_count
,
starts
,
steps
,
input_strides
,
output_strides
,
input_data
,
output_data
,
N
);
}
#endif // ENABLE_TRAINING
}
// namespace rocm
}
// namespace onnxruntime
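_SliceKernel above maps each output linear index to an input offset by peeling off per-dimension coordinates with fast_divmod and applying the per-dimension start and step against the input strides. The following CPU-side sketch reproduces the same index arithmetic with ordinary integer division instead of fast_divmod; SliceInputIndex is a hypothetical helper written only for illustration, not code from this commit.

// CPU-side sketch of the output -> input index mapping used by _SliceKernel.
// Hypothetical helper, not part of the commit.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t SliceInputIndex(int64_t output_id,
                        const std::vector<int64_t>& starts,
                        const std::vector<int64_t>& steps,
                        const std::vector<int64_t>& input_strides,
                        const std::vector<int64_t>& output_strides) {
  const size_t dims = starts.size();
  int64_t input_index = 0;
  int64_t mod = output_id;
  size_t dim = 0;
  for (; dim + 1 < dims; ++dim) {
    int64_t div = mod / output_strides[dim];  // coordinate along this dimension
    mod = mod % output_strides[dim];
    input_index += (starts[dim] + div * steps[dim]) * input_strides[dim];
  }
  // the innermost dimension has stride 1 in both tensors
  input_index += starts[dim] + mod * steps[dim];
  return input_index;
}

int main() {
  // Slice a 4x5 row-major tensor with starts={1,2}, steps={1,2}; the output is 3x2.
  std::vector<int64_t> starts{1, 2}, steps{1, 2}, in_strides{5, 1}, out_strides{2, 1};
  for (int64_t id = 0; id < 6; ++id) {
    std::cout << SliceInputIndex(id, starts, steps, in_strides, out_strides) << ' ';
  }
  std::cout << "\n";  // prints: 7 9 12 14 17 19
  return 0;
}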
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/slice_impl.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

Status SliceImpl(hipStream_t stream,
                 const size_t element_size,
                 const int32_t dimension_count,
                 const TArray<int64_t>& starts,
                 const TArray<int64_t>& steps,
                 const TArray<int64_t>& input_strides,
                 const TArray<fast_divmod>& output_strides,
                 const void* input_data,
                 void* output_data,
                 const size_t N);

#ifdef ENABLE_TRAINING
Status SliceImplGrad(hipStream_t stream,
                     const size_t element_size,
                     const int32_t dimension_count,
                     const TArray<int64_t>& starts,
                     const TArray<int64_t>& steps,
                     const TArray<int64_t>& input_strides,
                     const TArray<fast_divmod>& output_strides,
                     const void* input_data,
                     void* output_data,
                     const size_t N);
#endif  // ENABLE_TRAINING

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/space_depth_ops.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <vector>

#include "space_depth_ops.h"
#include "core/providers/rocm/tensor/transpose.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    SpaceToDepth,
    kOnnxDomain,
    1, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
                              DataTypeImpl::GetTensorType<double>(),
                              DataTypeImpl::GetTensorType<MLFloat16>()}),
    SpaceToDepth);

ONNX_OPERATOR_KERNEL_EX(
    SpaceToDepth,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
                              DataTypeImpl::GetTensorType<double>(),
                              DataTypeImpl::GetTensorType<MLFloat16>()}),
    SpaceToDepth);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    DepthToSpace,
    kOnnxDomain,
    1, 10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
                              DataTypeImpl::GetTensorType<double>(),
                              DataTypeImpl::GetTensorType<MLFloat16>()}),
    DepthToSpace);

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    DepthToSpace,
    kOnnxDomain,
    11, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
                              DataTypeImpl::GetTensorType<double>(),
                              DataTypeImpl::GetTensorType<MLFloat16>()}),
    DepthToSpace);

ONNX_OPERATOR_KERNEL_EX(
    DepthToSpace,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
                              DataTypeImpl::GetTensorType<double>(),
                              DataTypeImpl::GetTensorType<MLFloat16>()}),
    DepthToSpace);

static Status SpaceDepthOpCudaImpl(const hipDeviceProp_t& prop,
                                   hipStream_t stream,
                                   const rocblas_handle rocblas_handle,
                                   const Tensor& input, Tensor& output,
                                   const std::vector<size_t>& permutation,
                                   const int64_t batch_size,
                                   const int64_t in_dim1, const int64_t in_dim2, const int64_t in_dim3,
                                   const int64_t in_dim4, const int64_t in_dim5,
                                   const TensorShape& virtual_output_shape) {
  TensorShape virtual_input_shape{batch_size, in_dim1, in_dim2, in_dim3, in_dim4, in_dim5};
  return Transpose::DoTranspose(prop, stream, rocblas_handle, permutation, input, output,
                                &virtual_input_shape, &virtual_output_shape);
}

Status SpaceToDepth::ComputeInternal(OpKernelContext* context) const {
  const auto* tensor_pointer = context->Input<Tensor>(0);
  if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
  const Tensor& input = *tensor_pointer;

  int64_t batch = -1;
  int64_t input_depth = -1;
  int64_t input_height = -1;
  int64_t input_width = -1;
  int64_t output_depth = -1;
  int64_t output_height = -1;
  int64_t output_width = -1;

  ORT_RETURN_IF_ERROR(InputValidationsAndOutputDimsCalc(input,
                                                        batch,
                                                        input_depth, input_height, input_width,
                                                        output_depth, output_height, output_width,
                                                        true));

  // We use the "actual" output shape to construct the output tensor
  Tensor& output = *context->Output(0, {batch, output_depth, output_height, output_width});

  // We will pass in the "virtual" output shape to be used by DoTranspose() in SpaceDepthOpCudaImpl(...)
  TensorShape virtual_output_shape{batch, blocksize_, blocksize_, input_depth,
                                   input_height / blocksize_, input_width / blocksize_};

  std::vector<size_t> permutation = {0, 3, 5, 1, 2, 4};

  ORT_RETURN_IF_ERROR(SpaceDepthOpCudaImpl(GetDeviceProp(), Stream(), RocblasHandle(), input, output, permutation,
                                           batch,
                                           input_depth, input_height / blocksize_, blocksize_,
                                           input_width / blocksize_, blocksize_,
                                           virtual_output_shape));

  return Status::OK();
}

Status DepthToSpace::ComputeInternal(OpKernelContext* context) const {
  const auto* tensor_pointer = context->Input<Tensor>(0);
  if (tensor_pointer == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
  const Tensor& input = *tensor_pointer;

  int64_t batch = -1;
  int64_t input_depth = -1;
  int64_t input_height = -1;
  int64_t input_width = -1;
  int64_t output_depth = -1;
  int64_t output_height = -1;
  int64_t output_width = -1;

  ORT_RETURN_IF_ERROR(InputValidationsAndOutputDimsCalc(input,
                                                        batch,
                                                        input_depth, input_height, input_width,
                                                        output_depth, output_height, output_width,
                                                        false));

  // We use the "actual" output shape to construct the output tensor
  Tensor& output = *context->Output(0, {batch, output_depth, output_height, output_width});

  // We will pass in the "virtual" output shape to be used by DoTranspose() in SpaceDepthOpCudaImpl(...)
  TensorShape virtual_output_shape{batch, input_depth / blocksize_ / blocksize_, input_height,
                                   blocksize_, input_width, blocksize_};

  std::vector<size_t> permutation;
  permutation.reserve(6);
  permutation.push_back(0);

  if (is_dcr_) {
    permutation.push_back(3);
    permutation.push_back(4);
    permutation.push_back(1);
    permutation.push_back(5);
    permutation.push_back(2);
  } else {
    permutation.push_back(1);
    permutation.push_back(4);
    permutation.push_back(2);
    permutation.push_back(5);
    permutation.push_back(3);
  }

  int64_t dim1 = is_dcr_ ? blocksize_ : input_depth / blocksize_ / blocksize_;
  int64_t dim3 = is_dcr_ ? input_depth / blocksize_ / blocksize_ : blocksize_;

  ORT_RETURN_IF_ERROR(SpaceDepthOpCudaImpl(GetDeviceProp(), Stream(), RocblasHandle(), input, output, permutation,
                                           batch,
                                           dim1, blocksize_, dim3, input_height, input_width,
                                           virtual_output_shape));

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/space_depth_ops.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/space_depth_ops.h"

namespace onnxruntime {
namespace rocm {

class SpaceToDepth final : public RocmKernel, SpaceDepthBase {
 public:
  explicit SpaceToDepth(const OpKernelInfo& info) : RocmKernel(info), SpaceDepthBase(info) {
  }

  Status ComputeInternal(OpKernelContext* context) const override;
};

class DepthToSpace final : public RocmKernel, SpaceDepthBase {
 public:
  explicit DepthToSpace(const OpKernelInfo& info) : RocmKernel(info), SpaceDepthBase(info) {
    std::string mode;
    // if mode doesn't exist, then it is the default "DCR" mode
    // (or) it is an opset < 11 model for which the only mode is "DCR" mode
    if (info.GetAttr("mode", &mode).IsOK()) {
      if (mode == "CRD")
        is_dcr_ = false;
      else if (mode != "DCR")
        ORT_THROW("DepthToSpace op: only 'DCR' and 'CRD' modes are supported");
    }
  }

  Status ComputeInternal(OpKernelContext* context) const override;

 private:
  bool is_dcr_ = true;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/split.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/split.h"
#include "core/providers/rocm/tensor/split_impl.h"
#include "core/providers/cpu/tensor/utils.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(Split,
                                  kOnnxDomain,
                                  2, 10,
                                  kRocmExecutionProvider,
                                  (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
                                  Split);

// explicitly supports negative axis
ONNX_OPERATOR_VERSIONED_KERNEL_EX(Split,
                                  kOnnxDomain,
                                  11, 12,
                                  kRocmExecutionProvider,
                                  (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
                                  Split);

// explicitly supports 'split' as optional input
ONNX_OPERATOR_KERNEL_EX(Split,
                        kOnnxDomain,
                        13,
                        kRocmExecutionProvider,
                        (*KernelDefBuilder::Create())
                            .InputMemoryType(OrtMemTypeCPUInput, 1)
                            .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
                        Split);

Status Split::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor* input_tensor = ctx->Input<Tensor>(0);
  ORT_ENFORCE(input_tensor);
  auto& input_shape = input_tensor->Shape();
  auto num_outputs = ctx->OutputCount();
  int64_t axis = HandleNegativeAxis(axis_, input_shape.NumDimensions());
  int before_dims = 0;
  int block_size_including_axis_dim = 0;
  int block_size_inside_axis_dim = 0;

  std::vector<int64_t> split_sizes(num_outputs);
  const Tensor* split_tensor = ctx->Input<Tensor>(1);
  if (split_tensor) {
    ORT_ENFORCE(split_tensor->Shape().NumDimensions() == 1, "An split tensor must be a vector tensor.");
    auto nDims = static_cast<size_t>(split_tensor->Shape()[0]);
    const int64_t* data = split_tensor->Data<int64_t>();
    split_sizes.assign(data, data + nDims);
  } else {
    split_sizes.assign(split_sizes_.begin(), split_sizes_.end());
  }

  ORT_RETURN_IF_ERROR(PrepareForCompute(input_shape,
                                        num_outputs,
                                        axis,
                                        before_dims,
                                        block_size_including_axis_dim,
                                        block_size_inside_axis_dim,
                                        split_sizes));

  auto input_data = input_tensor->DataRaw();
  auto input_dims = input_shape.GetDims();
  auto output_dimensions{input_shape.AsShapeVector()};

  RocmAsyncBuffer<void*> output_ptr(this, num_outputs);
  gsl::span<void*> output_ptr_span = output_ptr.CpuSpan();
  TensorShapeVector axis_dimension_input_output_mapping(input_dims[axis]);
  int index = 0;
  for (int i = 0; i < num_outputs; ++i) {
    // update size of dimension for axis we're splitting on
    auto split_size = gsl::narrow<int>(split_sizes[i]);
    output_dimensions[axis] = split_size;

    Tensor* output = ctx->Output(i, TensorShape{output_dimensions});
    auto output_data = output->MutableDataRaw();
    output_ptr_span[i] = output_data;
    for (int j = 0; j < split_size; ++j) {
      axis_dimension_input_output_mapping.at(index++) = i;
    }
  }

  if (input_tensor->Shape().Size() <= 0) return Status::OK();

  size_t element_size = input_tensor->DataType()->Size();
  if (std::all_of(split_sizes.begin(), split_sizes.end(),
                  [&](int64_t size) { return size == split_sizes[0]; })) {
    if (num_outputs <= 32) {
      TArray<void*, 32> output_ptr_array(num_outputs);
      for (int i = 0; i < num_outputs; ++i) output_ptr_array[i] = output_ptr_span[i];
      ORT_RETURN_IF_ERROR(SplitSameSplitDimImpl(Stream(), element_size, block_size_including_axis_dim,
                                                block_size_inside_axis_dim, split_sizes[0], num_outputs, input_data,
                                                output_ptr_array, static_cast<size_t>(input_shape.Size())));
    } else {
      ORT_RETURN_IF_ERROR(output_ptr.CopyToGpu());
      ORT_RETURN_IF_ERROR(SplitSameSplitDimImpl(Stream(), element_size, block_size_including_axis_dim,
                                                block_size_inside_axis_dim, split_sizes[0], num_outputs, input_data,
                                                output_ptr.GpuPtr(), static_cast<size_t>(input_shape.Size())));
    }
  } else {
    ORT_RETURN_IF_ERROR(output_ptr.CopyToGpu());

    RocmAsyncBuffer<int64_t> split_sizes_gpu(this, split_sizes);
    ORT_RETURN_IF_ERROR(split_sizes_gpu.CopyToGpu());

    std::vector<int64_t> split_sizes_range(split_sizes);
    for (size_t i = 1; i < split_sizes_range.size(); ++i) {
      split_sizes_range[i] += split_sizes_range[i - 1];
    }

    RocmAsyncBuffer<int64_t> split_sizes_range_gpu(this, split_sizes_range);
    ORT_RETURN_IF_ERROR(split_sizes_range_gpu.CopyToGpu());

    RocmAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, axis_dimension_input_output_mapping);
    ORT_RETURN_IF_ERROR(axis_dimension_input_output_mapping_gpu.CopyToGpu());

    ORT_RETURN_IF_ERROR(SplitImpl(Stream(),
                                  element_size,
                                  block_size_including_axis_dim,
                                  block_size_inside_axis_dim,
                                  split_sizes_gpu.GpuPtr(),
                                  split_sizes_range_gpu.GpuPtr(),
                                  axis_dimension_input_output_mapping_gpu.GpuPtr(),
                                  num_outputs,
                                  input_data,
                                  output_ptr.GpuPtr(),
                                  static_cast<size_t>(input_shape.Size())));
  }

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
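For the unequal-split path, Split::ComputeInternal above feeds SplitImpl two auxiliary arrays: an inclusive prefix sum of split_sizes (split_sizes_range) and a per-axis-index to output-index mapping. The standalone sketch below shows how those two arrays relate to the split sizes; BuildSplitMetadata is a hypothetical helper written for illustration, not a function from this commit.

// Standalone sketch of the auxiliary arrays built for the unequal-split path.
// Hypothetical helper, not part of the commit.
#include <cstdint>
#include <iostream>
#include <vector>

void BuildSplitMetadata(const std::vector<int64_t>& split_sizes,
                        std::vector<int64_t>& split_sizes_range,
                        std::vector<int64_t>& axis_to_output) {
  // Inclusive prefix sum: split_sizes_range[i] is the first axis index owned by output i+1.
  split_sizes_range = split_sizes;
  for (size_t i = 1; i < split_sizes_range.size(); ++i) {
    split_sizes_range[i] += split_sizes_range[i - 1];
  }
  // Map every index along the split axis to the output tensor that receives it.
  axis_to_output.clear();
  for (size_t output = 0; output < split_sizes.size(); ++output) {
    for (int64_t j = 0; j < split_sizes[output]; ++j) {
      axis_to_output.push_back(static_cast<int64_t>(output));
    }
  }
}

int main() {
  std::vector<int64_t> split_sizes{2, 3, 1};
  std::vector<int64_t> range, mapping;
  BuildSplitMetadata(split_sizes, range, mapping);
  for (auto v : range) std::cout << v << ' ';   // prints: 2 5 6
  std::cout << "| ";
  for (auto v : mapping) std::cout << v << ' ';  // prints: 0 0 1 1 1 2
  std::cout << "\n";
  return 0;
}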
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/split.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/split.h"

namespace onnxruntime {
namespace rocm {

class Split final : public RocmKernel, public SplitBase {
 public:
  Split(const OpKernelInfo& info) : RocmKernel(info), SplitBase(info) {}

  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/split_impl.cu (new file, mode 100644)
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/tensor/split_impl.h"
#include "core/providers/rocm/cu_inc/common.cuh"
#include "core/providers/rocm/rocm_common.h"
namespace
onnxruntime
{
namespace
rocm
{
namespace
{
#ifdef USE_ROCM
constexpr
int
kNumElementsPerThread
=
2
;
constexpr
int
kNumThreadsPerBlock
=
512
;
#else
constexpr
int
kNumElementsPerThread
=
GridDim
::
maxElementsPerThread
;
constexpr
int
kNumThreadsPerBlock
=
GridDim
::
maxThreadsPerBlock
;
#endif
}
// namespace
template
<
typename
T
,
typename
OutputDataArray
>
__global__
void
_SplitKernelSameSplitDim
(
const
fast_divmod
block_size_including_axis_dim_div
,
const
fast_divmod
block_size_inside_axis_dim_div
,
const
fast_divmod
split_dim_size
,
const
int
num_outputs
,
const
T
*
input_data
,
OutputDataArray
output_data
,
const
HIP_LONG
N
)
{
HIP_LONG
start
=
kNumElementsPerThread
*
kNumThreadsPerBlock
*
blockIdx
.
x
+
threadIdx
.
x
;
T
value
[
kNumElementsPerThread
];
HIP_LONG
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
kNumElementsPerThread
;
++
i
)
{
if
(
id
<
N
)
{
value
[
i
]
=
input_data
[
id
];
id
+=
kNumThreadsPerBlock
;
}
}
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
kNumElementsPerThread
;
++
i
)
{
if
(
id
<
N
)
{
int
outer_block_index
,
block_index
,
offset
,
output_index
,
block_offset
;
block_size_including_axis_dim_div
.
divmod
(
id
,
outer_block_index
,
offset
);
block_size_inside_axis_dim_div
.
divmod
(
offset
,
block_index
,
offset
);
split_dim_size
.
divmod
(
block_index
,
output_index
,
block_offset
);
HIP_LONG
output_pos
=
(
outer_block_index
*
split_dim_size
.
d_
+
block_offset
)
*
block_size_inside_axis_dim_div
.
d_
+
offset
;
reinterpret_cast
<
T
*>
(
output_data
[
output_index
])[
output_pos
]
=
value
[
i
];
id
+=
kNumThreadsPerBlock
;
}
}
}
template
<
typename
OutputDataArray
>
Status
SplitSameSplitDimImpl
(
hipStream_t
stream
,
const
size_t
element_size
,
const
int
block_size_including_axis_dim
,
const
int
block_size_inside_axis_dim
,
const
int64_t
split_size
,
const
int
num_outputs
,
const
void
*
input_data
,
OutputDataArray
output_data
,
const
size_t
input_size
)
{
HIP_LONG
N
=
static_cast
<
HIP_LONG
>
(
input_size
);
int
blocksPerGrid
=
CeilDiv
(
N
,
kNumElementsPerThread
*
kNumThreadsPerBlock
);
fast_divmod
block_size_including_axis_dim_div
=
fast_divmod
(
block_size_including_axis_dim
);
fast_divmod
block_size_inside_axis_dim_div
=
fast_divmod
(
block_size_inside_axis_dim
);
fast_divmod
split_size_div
=
fast_divmod
(
static_cast
<
int
>
(
split_size
));
switch
(
element_size
)
{
#define CASE_ELEMENT_TYPE(type) \
case sizeof(type): { \
hipLaunchKernelGGL(_SplitKernelSameSplitDim, blocksPerGrid, kNumThreadsPerBlock, 0, stream, \
block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_size_div, num_outputs, \
reinterpret_cast<const ToHipType<type>::MappedType*>(input_data), output_data, N); \
} break
CASE_ELEMENT_TYPE
(
int8_t
);
CASE_ELEMENT_TYPE
(
int16_t
);
CASE_ELEMENT_TYPE
(
int32_t
);
CASE_ELEMENT_TYPE
(
int64_t
);
#undef CASE_ELEMENT_TYPE
default:
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
"Type not supported for Slice operator"
);
}
return
Status
::
OK
();
}
template
Status
SplitSameSplitDimImpl
<
void
**
>(
hipStream_t
stream
,
const
size_t
element_size
,
const
int
block_size_including_axis_dim
,
const
int
block_size_inside_axis_dim
,
const
int64_t
split_size
,
const
int
num_outputs
,
const
void
*
input_data
,
void
**
output_data
,
const
size_t
input_size
);
template
Status
SplitSameSplitDimImpl
<
TArray
<
void
*
,
32
>
>
(
hipStream_t
stream
,
const
size_t
element_size
,
const
int
block_size_including_axis_dim
,
const
int
block_size_inside_axis_dim
,
const
int64_t
split_size
,
const
int
num_outputs
,
const
void
*
input_data
,
TArray
<
void
*
,
32
>
output_data
,
const
size_t
input_size
);
template
<
typename
T
>
__global__
void
_SplitKernel
(
const
fast_divmod
block_size_including_axis_dim_div
,
const
fast_divmod
block_size_inside_axis_dim_div
,
const
int64_t
*
split_sizes
,
const
int64_t
*
split_sizes_range
,
const
int64_t
*
axis_dimension_input_output_mapping
,
const
int
num_outputs
,
const
T
*
input_data
,
void
**
output_data
,
const
HIP_LONG
N
)
{
HIP_LONG
start
=
kNumElementsPerThread
*
kNumThreadsPerBlock
*
blockIdx
.
x
+
threadIdx
.
x
;
T
value
[
kNumElementsPerThread
];
HIP_LONG
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
kNumElementsPerThread
;
++
i
)
{
if
(
id
<
N
)
{
value
[
i
]
=
input_data
[
id
];
id
+=
kNumThreadsPerBlock
;
}
}
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
kNumElementsPerThread
;
++
i
)
{
if
(
id
<
N
)
{
int
outer_block_index
,
block_index
,
offset
;
block_size_including_axis_dim_div
.
divmod
(
id
,
outer_block_index
,
offset
);
block_size_inside_axis_dim_div
.
divmod
(
offset
,
block_index
,
offset
);
int
output_index
=
axis_dimension_input_output_mapping
[
block_index
];
int64_t
range_left
=
(
output_index
==
0
)
?
0
:
split_sizes_range
[
output_index
-
1
];
int
block_offset
=
block_index
-
static_cast
<
int
>
(
range_left
);
HIP_LONG
output_pos
=
(
outer_block_index
*
split_sizes
[
output_index
]
+
block_offset
)
*
block_size_inside_axis_dim_div
.
d_
+
offset
;
reinterpret_cast
<
T
*>
(
output_data
[
output_index
])[
output_pos
]
=
value
[
i
];
id
+=
kNumThreadsPerBlock
;
}
}
}
Status
SplitImpl
(
hipStream_t
stream
,
const
size_t
element_size
,
const
int
block_size_including_axis_dim
,
const
int
block_size_inside_axis_dim
,
const
int64_t
*
split_sizes
,
const
int64_t
*
split_sizes_range
,
const
int64_t
*
axis_dimension_input_output_mapping
,
const
int
num_outputs
,
const
void
*
input_data
,
void
**
output_data
,
const
size_t
input_size
)
{
HIP_LONG
N
=
static_cast
<
HIP_LONG
>
(
input_size
);
int
blocksPerGrid
=
CeilDiv
(
N
,
kNumElementsPerThread
*
kNumThreadsPerBlock
);
fast_divmod
block_size_including_axis_dim_div
=
fast_divmod
(
block_size_including_axis_dim
);
fast_divmod
block_size_inside_axis_dim_div
=
fast_divmod
(
block_size_inside_axis_dim
);
switch
(
element_size
)
{
#define CASE_ELEMENT_TYPE(type) \
case sizeof(type): { \
hipLaunchKernelGGL(_SplitKernel, blocksPerGrid, kNumThreadsPerBlock, 0, stream, \
block_size_including_axis_dim_div, block_size_inside_axis_dim_div, split_sizes, split_sizes_range, \
axis_dimension_input_output_mapping, num_outputs, \
reinterpret_cast<const ToHipType<type>::MappedType*>(input_data), output_data, N); \
} break
CASE_ELEMENT_TYPE
(
int8_t
);
CASE_ELEMENT_TYPE
(
int16_t
);
CASE_ELEMENT_TYPE
(
int32_t
);
CASE_ELEMENT_TYPE
(
int64_t
);
#undef CASE_ELEMENT_TYPE
default:
return
ORT_MAKE_STATUS
(
ONNXRUNTIME
,
FAIL
,
"Type not supported for Slice operator"
);
}
return
Status
::
OK
();
}
}
// namespace rocm
}
// namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/split_impl.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"
#include "core/common/common.h"

namespace onnxruntime {
namespace rocm {

template <typename OutputDataArray>
Status SplitSameSplitDimImpl(hipStream_t stream, const size_t element_size, const int block_size_including_axis_dim,
                             const int block_size_inside_axis_dim, const int64_t split_size, const int num_outputs,
                             const void* input_data, OutputDataArray output_data, const size_t input_size);

Status SplitImpl(hipStream_t stream, const size_t element_size, const int block_size_including_axis_dim,
                 const int block_size_inside_axis_dim, const int64_t* split_sizes, const int64_t* split_sizes_range,
                 const int64_t* axis_dimension_input_output_mapping, const int num_outputs, const void* input_data,
                 void** output_data, const size_t input_size);

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/squeeze.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "squeeze.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Squeeze,
    kOnnxDomain,
    1, 10,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .Alias(0, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Squeeze);

// explicit support for negative axis.
ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Squeeze,
    kOnnxDomain,
    11, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .Alias(0, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Squeeze);

// axes is input instead of attribute
ONNX_OPERATOR_KERNEL_EX(
    Squeeze,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .Alias(0, 0)
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes())
        .InputMemoryType(OrtMemTypeCPUInput, 1),
    Squeeze);

Status Squeeze::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor* X = ctx->Input<Tensor>(0);
  const TensorShape& X_shape = X->Shape();

  TensorShapeVector axes;
  size_t num_inputs = ctx->InputCount();
  if (num_inputs == 2) {  // axes is an input
    const Tensor* axes_tensor = ctx->Input<Tensor>(1);
    ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
    ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
    auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
    const auto* data = axes_tensor->Data<int64_t>();
    axes.assign(data, data + nDims);
  } else {
    axes.assign(axes_.begin(), axes_.end());
  }

  TensorShapeVector output_shape = ComputeOutputShape(X_shape, axes);

  Tensor* Y = ctx->Output(0, TensorShape(output_shape));

  const void* input = X->DataRaw();
  void* output = Y->MutableDataRaw();
  if (input == output) return Status::OK();

  auto count = X->Shape().Size();
  auto element_bytes = X->DataType()->Size();
  HIP_RETURN_IF_ERROR(hipMemcpyAsync(output, input, count * element_bytes, hipMemcpyDeviceToDevice, Stream()));

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/squeeze.h (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/squeeze.h"

namespace onnxruntime {
namespace rocm {

class Squeeze final : public SqueezeBase, public RocmKernel {
 public:
  Squeeze(const OpKernelInfo& info) : SqueezeBase(info), RocmKernel(info) {}
  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/tile.cc (new file, mode 100644)
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/rocm/tensor/tile.h"
#include "core/providers/cpu/tensor/utils.h"
#include "tile_impl.h"

using namespace onnxruntime::common;

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Tile,
    kOnnxDomain,
    6, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
                              DataTypeImpl::GetTensorType<double>(),
                              DataTypeImpl::GetTensorType<int32_t>(),
                              DataTypeImpl::GetTensorType<int64_t>(),
                              DataTypeImpl::GetTensorType<MLFloat16>()})
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
    Tile);

ONNX_OPERATOR_KERNEL_EX(
    Tile,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .InputMemoryType(OrtMemTypeCPUInput, 1)
        .TypeConstraint("T", {DataTypeImpl::GetTensorType<float>(),
                              DataTypeImpl::GetTensorType<double>(),
                              DataTypeImpl::GetTensorType<int32_t>(),
                              DataTypeImpl::GetTensorType<int64_t>(),
                              DataTypeImpl::GetTensorType<MLFloat16>()})
        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int64_t>()),
    Tile);

#define CASE_TILE(type)                                                                                            \
  case sizeof(type): {                                                                                             \
    TileImpl(Stream(), rank, fdm_input_shape, input_strides,                                                       \
             reinterpret_cast<const typename ToHipType<type>::MappedType*>(input_data), fdm_output_strides,        \
             reinterpret_cast<typename ToHipType<type>::MappedType*>(output_data), output_tensor.Shape().Size());  \
  } break

#define CASE_TILE_MEMCPY(type)                                                                                \
  case sizeof(type): {                                                                                        \
    TileMemcpyImpl(Stream(), reinterpret_cast<const typename ToHipType<type>::MappedType*>(input_data),       \
                   reinterpret_cast<typename ToHipType<type>::MappedType*>(output_data), input_shape.Size(),  \
                   num_of_copies_per_batch);                                                                  \
  } break

#define CASE_TILE_BATCHED_MEMCPY(type)                                                                               \
  case sizeof(type): {                                                                                               \
    TileBatchedMemcpyImpl(Stream(), reinterpret_cast<const typename ToHipType<type>::MappedType*>(input_data),       \
                          reinterpret_cast<typename ToHipType<type>::MappedType*>(output_data),                      \
                          num_of_elements_per_batch, input_shape.Size(), num_of_batch_copies,                        \
                          num_of_copies_per_batch);                                                                  \
  } break

Status Tile::ComputeInternal(OpKernelContext* ctx) const {
  auto& input_tensor = *ctx->Input<Tensor>(0);
  auto& repeats_tensor = *ctx->Input<Tensor>(1);
  int32_t rank = static_cast<int32_t>(input_tensor.Shape().NumDimensions());

  if (repeats_tensor.Shape().NumDimensions() != 1)
    return Status(ONNXRUNTIME, INVALID_ARGUMENT, "'repeat' input tensor must be 1 dimensional");
  if (repeats_tensor.Shape().Size() != rank)
    return Status(ONNXRUNTIME, INVALID_ARGUMENT,
                  "'repeat' input tensor must have the same length as the 'input' tensor");

  // Calculate the shape of the output tensor
  auto* repeats = repeats_tensor.Data<int64_t>();
  const auto& input_shape = input_tensor.Shape();
  const auto input_dims = input_shape.GetDims();
  auto output_dims(input_shape.AsShapeVector());
  for (auto axis = 0; axis < rank; axis++)
    output_dims[axis] *= repeats[axis];

  TensorShape output_shape(output_dims);
  auto& output_tensor = *ctx->Output(0, output_shape);

  void* output_data = output_tensor.MutableDataRaw();
  const void* input_data = input_tensor.DataRaw();
  const auto element_size = input_tensor.DataType()->Size();

  // Repeat tensor input can have 0 as a valid value
  // check if the computed output_shape size is 0 and
  // return an empty tensor if so.
  if (output_shape.Size() == 0) {
    return Status::OK();
  }

  // Repeat tensor has all 1s in it
  if (output_shape == input_shape) {
    return HIP_CALL(hipMemcpyAsync(output_tensor.MutableDataRaw(), input_tensor.DataRaw(),
                                   input_tensor.SizeInBytes(), hipMemcpyDeviceToDevice, Stream()));
  }

  bool is_batched_memcpy = false;
  size_t num_of_elements_per_batch = 1;
  size_t num_of_copies_per_batch = 1;
  size_t num_of_batch_copies = 1;
  if (TileOp::IsTileMemcpy(input_shape, repeats, rank, is_batched_memcpy, num_of_elements_per_batch,
                           num_of_copies_per_batch, num_of_batch_copies)) {
    if (!is_batched_memcpy) {
      switch (element_size) {
        CASE_TILE_MEMCPY(float);
        CASE_TILE_MEMCPY(double);
        CASE_TILE_MEMCPY(MLFloat16);
        default:
          ORT_THROW("Unsupported value attribute datatype with sizeof=: ", element_size);
          break;
      }
    } else {
      switch (element_size) {
        CASE_TILE_BATCHED_MEMCPY(float);
        CASE_TILE_BATCHED_MEMCPY(double);
        CASE_TILE_BATCHED_MEMCPY(MLFloat16);
        default:
          ORT_THROW("Unsupported value attribute datatype with sizeof=: ", element_size);
          break;
      }
    }
    return Status::OK();
  }

  TensorPitches input_pitches(input_dims);
  TArray<int64_t> input_strides(input_pitches);

  TArray<fast_divmod> fdm_input_shape(rank);
  for (size_t i = 0; i < input_dims.size(); ++i) {
    fdm_input_shape[gsl::narrow_cast<int>(i)] = fast_divmod(gsl::narrow_cast<int>(input_dims[i]));
  }

  TArray<fast_divmod> fdm_output_strides(rank);
  TensorPitches output_pitches(output_dims);
  for (auto i = 0; i < rank; i++) {
    fdm_output_strides[i] = fast_divmod(static_cast<int>(output_pitches[i]));
  }

  static_assert(sizeof(float) == sizeof(int32_t), "Float and Int32 are of different sizes");
  static_assert(sizeof(double) == sizeof(int64_t), "Double and Int64 are of different sizes");

  if (output_tensor.Shape().Size() > 0) {
    switch (element_size) {
      CASE_TILE(float);
      CASE_TILE(double);
      CASE_TILE(MLFloat16);
      default:
        ORT_THROW("Unsupported value attribute datatype with sizeof=: ", element_size);
        break;
    }
  }

  return Status::OK();
}

}  // namespace rocm
}  // namespace onnxruntime
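For reference, the tiling math that the device kernels in tile_impl.cu implement can be sketched on the host as follows. This is an illustrative reference only, assuming standard ONNX Tile semantics; TileReference is a hypothetical name, and the real device code uses fast_divmod instead of plain division and modulo for the per-dimension coordinate math.

// Illustrative sketch only: each output coordinate maps back to the input by
// taking the coordinate modulo the input dim along every axis.
#include <cstdint>
#include <vector>

std::vector<float> TileReference(const std::vector<float>& input,
                                 const std::vector<int64_t>& in_dims,
                                 const std::vector<int64_t>& repeats) {
  const size_t rank = in_dims.size();
  std::vector<int64_t> out_dims(rank), in_strides(rank), out_strides(rank);
  for (size_t i = 0; i < rank; ++i) out_dims[i] = in_dims[i] * repeats[i];
  in_strides[rank - 1] = out_strides[rank - 1] = 1;
  for (size_t i = rank - 1; i > 0; --i) {
    in_strides[i - 1] = in_strides[i] * in_dims[i];
    out_strides[i - 1] = out_strides[i] * out_dims[i];
  }
  int64_t out_size = 1;
  for (auto d : out_dims) out_size *= d;

  std::vector<float> output(out_size);
  for (int64_t o = 0; o < out_size; ++o) {
    int64_t remaining = o, in_index = 0;
    for (size_t d = 0; d < rank; ++d) {
      const int64_t out_coord = remaining / out_strides[d];
      remaining %= out_strides[d];
      in_index += (out_coord % in_dims[d]) * in_strides[d];  // wrap into the input
    }
    output[o] = input[in_index];
  }
  return output;
}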
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/tile.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/shared_library/provider_api.h"
#include "core/providers/rocm/rocm_kernel.h"
#include "core/providers/cpu/tensor/tile.h"

namespace onnxruntime {
namespace rocm {

struct Tile final : RocmKernel {
  explicit Tile(const OpKernelInfo& info) : RocmKernel(info) {
  }

  Status ComputeInternal(OpKernelContext* context) const override;
};

}  // namespace rocm
}  // namespace onnxruntime
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/tile_impl.cu
0 → 100644
#include "hip/hip_runtime.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/rocm/cu_inc/common.cuh"
#include "tile_impl.h"
namespace
onnxruntime
{
namespace
rocm
{
#ifdef USE_ROCM
constexpr
int
num_elements_per_thread
=
2
;
constexpr
int
num_threads_per_block
=
512
;
#else
constexpr
int
num_elements_per_thread
=
GridDim
::
maxElementsPerThread
;
constexpr
int
num_threads_per_block
=
GridDim
::
maxThreadsPerBlock
;
#endif
template
<
typename
T
>
__global__
void
_UnRolledTileKernel
(
const
size_t
shape_rank
,
const
TArray
<
fast_divmod
>
fdm_input_shape
,
const
TArray
<
int64_t
>
input_strides
,
const
T
*
input_data
,
const
TArray
<
fast_divmod
>
fdm_output_strides
,
T
*
output_data
,
const
HIP_LONG
N
)
{
HIP_LONG
start
=
num_elements_per_thread
*
num_threads_per_block
*
blockIdx
.
x
+
threadIdx
.
x
;
T
value
[
num_elements_per_thread
];
HIP_LONG
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
num_elements_per_thread
;
++
i
)
{
if
(
id
<
N
)
{
HIP_LONG
input_index
=
0
;
HIP_LONG
offset
=
id
;
#pragma unroll
for
(
auto
dim
=
0
;
dim
<
fdm_output_strides
.
Capacity
();
++
dim
)
{
if
(
dim
>=
shape_rank
)
{
break
;
}
int
out_coord
,
r
;
fdm_output_strides
[
dim
].
divmod
(
offset
,
out_coord
,
r
);
int
in_coord
=
fdm_input_shape
[
dim
].
mod
(
out_coord
);
input_index
+=
input_strides
[
dim
]
*
in_coord
;
offset
=
r
;
}
value
[
i
]
=
input_data
[
input_index
];
id
+=
num_threads_per_block
;
}
}
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
num_elements_per_thread
;
++
i
)
{
if
(
id
<
N
)
{
output_data
[
id
]
=
value
[
i
];
id
+=
num_threads_per_block
;
}
}
}
template
<
typename
T
>
void
TileImpl
(
hipStream_t
stream
,
const
size_t
shape_rank
,
const
TArray
<
fast_divmod
>&
fdm_input_shape
,
const
TArray
<
int64_t
>&
input_stride
,
const
T
*
input_data
,
const
TArray
<
fast_divmod
>&
fdm_output_strides
,
T
*
output_data
,
const
size_t
N
)
{
int
blocksPerGrid
=
static_cast
<
int
>
(
CeilDiv
(
N
,
num_threads_per_block
*
num_elements_per_thread
));
hipLaunchKernelGGL
(
HIP_KERNEL_NAME
(
_UnRolledTileKernel
<
T
>
),
blocksPerGrid
,
num_threads_per_block
,
0
,
stream
,
shape_rank
,
fdm_input_shape
,
input_stride
,
input_data
,
fdm_output_strides
,
output_data
,
static_cast
<
HIP_LONG
>
(
N
));
}
template
<
typename
T
>
__global__
void
_TileMemcpyKernelFromOutput
(
const
T
*
input_data
,
T
*
output_data
,
const
fast_divmod
divmod_num_input_elements
,
const
HIP_LONG
N
)
{
HIP_LONG
start
=
num_elements_per_thread
*
num_threads_per_block
*
blockIdx
.
x
+
threadIdx
.
x
;
T
value
[
num_elements_per_thread
];
HIP_LONG
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
num_elements_per_thread
;
++
i
)
{
if
(
id
<
N
)
{
value
[
i
]
=
input_data
[
divmod_num_input_elements
.
mod
(
id
)];
id
+=
num_threads_per_block
;
}
}
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
num_elements_per_thread
;
++
i
)
{
if
(
id
<
N
)
{
output_data
[
id
]
=
value
[
i
];
id
+=
num_threads_per_block
;
}
}
}
template
<
typename
T
>
__global__
void
_TileMemcpyKernelFromInput
(
const
T
*
input_data
,
T
*
output_data
,
const
HIP_LONG
N
,
const
size_t
repeats
)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT
(
id
,
N
);
T
input_val
=
input_data
[
id
];
for
(
size_t
i
=
0
;
i
<
repeats
;
++
i
)
{
output_data
[
id
]
=
input_val
;
id
+=
N
;
}
}
template
<
typename
T
>
size_t
GetVectorizedSize
(
size_t
num_input_elements
,
size_t
num_elements_per_batch
,
uint64_t
address_input
,
uint64_t
address_output
,
HIP_LONG
&
N
,
int
&
blocksPerGrid
)
{
constexpr
int
vec4_alignment
=
std
::
alignment_of
<
aligned_vector
<
T
,
4
>>::
value
;
constexpr
int
vec2_alignment
=
std
::
alignment_of
<
aligned_vector
<
T
,
2
>>::
value
;
N
=
static_cast
<
HIP_LONG
>
(
num_input_elements
);
size_t
vectorized_size
=
1
;
if
(
num_elements_per_batch
%
4
==
0
&&
address_input
%
vec4_alignment
==
0
&&
address_output
%
vec4_alignment
==
0
)
{
N
/=
4
;
vectorized_size
=
4
;
}
else
if
(
num_elements_per_batch
%
2
==
0
&&
address_input
%
vec2_alignment
==
0
&&
address_output
%
vec2_alignment
==
0
)
{
N
/=
2
;
vectorized_size
=
2
;
}
blocksPerGrid
=
CeilDiv
(
N
,
num_threads_per_block
);
return
vectorized_size
;
}
template
<
typename
T
>
void
TileMemcpyImpl
(
hipStream_t
stream
,
const
T
*
input_data
,
T
*
output_data
,
const
size_t
num_input_elements
,
const
size_t
repeats
)
{
// If the block number from input size is too small to fill all streaming multiprocessors,
// it won't have perf gain to launch from inputs. In this case we will use the output based kernel.
HIP_LONG
N
;
int
blocksPerGrid
;
size_t
vectorized_size
=
GetVectorizedSize
<
T
>
(
num_input_elements
,
num_input_elements
,
reinterpret_cast
<
uint64_t
>
(
input_data
),
reinterpret_cast
<
uint64_t
>
(
output_data
),
N
,
blocksPerGrid
);
if
(
blocksPerGrid
<
128
)
{
N
=
static_cast
<
HIP_LONG
>
(
num_input_elements
*
repeats
);
blocksPerGrid
=
CeilDiv
(
N
,
num_threads_per_block
*
num_elements_per_thread
);
hipLaunchKernelGGL
(
_TileMemcpyKernelFromOutput
,
blocksPerGrid
,
num_threads_per_block
,
0
,
stream
,
input_data
,
output_data
,
fast_divmod
(
static_cast
<
int
>
(
num_input_elements
)),
N
);
return
;
}
if
(
vectorized_size
==
4
)
{
using
Vec4T
=
aligned_vector
<
T
,
4
>
;
hipLaunchKernelGGL
(
_TileMemcpyKernelFromInput
,
blocksPerGrid
,
num_threads_per_block
,
0
,
stream
,
reinterpret_cast
<
const
Vec4T
*>
(
input_data
),
reinterpret_cast
<
Vec4T
*>
(
output_data
),
N
,
repeats
);
return
;
}
else
if
(
vectorized_size
==
2
)
{
using
Vec2T
=
aligned_vector
<
T
,
2
>
;
hipLaunchKernelGGL
(
_TileMemcpyKernelFromInput
,
blocksPerGrid
,
num_threads_per_block
,
0
,
stream
,
reinterpret_cast
<
const
Vec2T
*>
(
input_data
),
reinterpret_cast
<
Vec2T
*>
(
output_data
),
N
,
repeats
);
return
;
}
hipLaunchKernelGGL
(
_TileMemcpyKernelFromInput
,
blocksPerGrid
,
num_threads_per_block
,
0
,
stream
,
input_data
,
output_data
,
N
,
repeats
);
}
template
<
typename
T
>
__global__
void
_TileBatchedMemcpyKernelFromOutput
(
const
T
*
input_data
,
T
*
output_data
,
const
fast_divmod
divmod_size_output_row
,
const
size_t
size_input_row
,
const
fast_divmod
divmod_batch
,
const
fast_divmod
divmod_size_input_row
,
const
HIP_LONG
N
)
{
HIP_LONG
start
=
num_elements_per_thread
*
num_threads_per_block
*
blockIdx
.
x
+
threadIdx
.
x
;
T
value
[
num_elements_per_thread
];
HIP_LONG
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
num_elements_per_thread
;
++
i
)
{
if
(
id
<
N
)
{
int
batch_idx
,
element_idx
;
divmod_size_output_row
.
divmod
(
id
,
batch_idx
,
element_idx
);
value
[
i
]
=
input_data
[
divmod_batch
.
mod
(
batch_idx
)
*
size_input_row
+
divmod_size_input_row
.
mod
(
element_idx
)];
id
+=
num_threads_per_block
;
}
}
id
=
start
;
#pragma unroll
for
(
int
i
=
0
;
i
<
num_elements_per_thread
;
++
i
)
{
if
(
id
<
N
)
{
output_data
[
id
]
=
value
[
i
];
id
+=
num_threads_per_block
;
}
}
}
// Input size is [batch, data], output size is [batch * batch_repeats, data * repeats_per_batch].
// Here size_input_row = data, size_output_row = data * repeats_per_batch,
// size_output_batch = batch * data * repeats_per_batch
template
<
typename
T
>
__global__
void
_TileBatchedMemcpyKernelFromInput
(
const
T
*
input_data
,
T
*
output_data
,
const
fast_divmod
divmod_size_input_row
,
const
HIP_LONG
size_input_row
,
const
HIP_LONG
size_output_row
,
const
HIP_LONG
size_output_batch
,
const
size_t
batch_repeats
,
const
size_t
repeats_per_batch
,
const
HIP_LONG
N
)
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT
(
id
,
N
);
T
input_val
=
input_data
[
id
];
HIP_LONG
q
,
r
;
divmod_size_input_row
.
divmod
(
id
,
q
,
r
);
HIP_LONG
batch_offset
=
q
*
size_output_row
+
r
;
for
(
size_t
i
=
0
;
i
<
batch_repeats
;
++
i
)
{
HIP_LONG
offset
=
batch_offset
;
for
(
size_t
j
=
0
;
j
<
repeats_per_batch
;
++
j
)
{
output_data
[
offset
]
=
input_val
;
offset
+=
size_input_row
;
}
batch_offset
+=
size_output_batch
;
}
}
// Input size is [batch, data], output size is [batch * batch_repeats, data * repeats_per_batch].
// Here size_input_row = data, num_input_elements = batch * data
template
<
typename
T
>
void
TileBatchedMemcpyImpl
(
hipStream_t
stream
,
const
T
*
input_data
,
T
*
output_data
,
const
size_t
size_input_row
,
const
size_t
num_input_elements
,
const
size_t
batch_repeats
,
const
size_t
repeats_per_batch
)
{
// If the block number from input size is too small to fill all streaming multiprocessors,
// it won't have perf gain to launch from inputs. In this case we will use the output based kernel.
HIP_LONG
N
;
int
blocksPerGrid
;
size_t
vectorized_size
=
GetVectorizedSize
<
T
>
(
num_input_elements
,
size_input_row
,
reinterpret_cast
<
uint64_t
>
(
input_data
),
reinterpret_cast
<
uint64_t
>
(
output_data
),
N
,
blocksPerGrid
);
if
(
blocksPerGrid
<
128
)
{
N
=
static_cast
<
HIP_LONG
>
(
num_input_elements
*
batch_repeats
*
repeats_per_batch
);
blocksPerGrid
=
CeilDiv
(
N
,
num_threads_per_block
*
num_elements_per_thread
);
hipLaunchKernelGGL
(
_TileBatchedMemcpyKernelFromOutput
,
blocksPerGrid
,
num_threads_per_block
,
0
,
stream
,
input_data
,
output_data
,
fast_divmod
(
static_cast
<
int
>
(
size_input_row
*
repeats_per_batch
)),
size_input_row
,
fast_divmod
(
static_cast
<
int
>
(
num_input_elements
/
size_input_row
)),
fast_divmod
(
static_cast
<
int
>
(
size_input_row
)),
N
);
return
;
}
HIP_LONG
size_input_row_vec
=
static_cast
<
HIP_LONG
>
(
size_input_row
);
if
(
vectorized_size
==
4
)
{
using
Vec4T
=
aligned_vector
<
T
,
4
>
;
size_input_row_vec
/=
4
;
hipLaunchKernelGGL
(
_TileBatchedMemcpyKernelFromInput
,
blocksPerGrid
,
num_threads_per_block
,
0
,
stream
,
reinterpret_cast
<
const
Vec4T
*>
(
input_data
),
reinterpret_cast
<
Vec4T
*>
(
output_data
),
fast_divmod
(
size_input_row_vec
),
size_input_row_vec
,
size_input_row_vec
*
static_cast
<
HIP_LONG
>
(
repeats_per_batch
),
N
*
static_cast
<
HIP_LONG
>
(
repeats_per_batch
),
batch_repeats
,
repeats_per_batch
,
N
);
return
;
}
else
if
(
vectorized_size
==
2
)
{
using
Vec2T
=
aligned_vector
<
T
,
2
>
;
size_input_row_vec
/=
2
;
hipLaunchKernelGGL
(
_TileBatchedMemcpyKernelFromInput
,
blocksPerGrid
,
num_threads_per_block
,
0
,
stream
,
reinterpret_cast
<
const
Vec2T
*>
(
input_data
),
reinterpret_cast
<
Vec2T
*>
(
output_data
),
fast_divmod
(
size_input_row_vec
),
size_input_row_vec
,
size_input_row_vec
*
static_cast
<
HIP_LONG
>
(
repeats_per_batch
),
N
*
static_cast
<
HIP_LONG
>
(
repeats_per_batch
),
batch_repeats
,
repeats_per_batch
,
N
);
return
;
}
hipLaunchKernelGGL
(
_TileBatchedMemcpyKernelFromInput
,
blocksPerGrid
,
num_threads_per_block
,
0
,
stream
,
input_data
,
output_data
,
fast_divmod
(
size_input_row_vec
),
size_input_row_vec
,
size_input_row_vec
*
static_cast
<
HIP_LONG
>
(
repeats_per_batch
),
N
*
static_cast
<
HIP_LONG
>
(
repeats_per_batch
),
batch_repeats
,
repeats_per_batch
,
N
);
}
#define SPECIALIZED_IMPL(T) \
template void TileImpl<T>(hipStream_t stream, const size_t shape_rank, const TArray<fast_divmod>& fdm_input_shape, \
const TArray<int64_t>& input_stride, const T* input_data, \
const TArray<fast_divmod>& fdm_output_strides, T* output_data, const size_t N); \
template void TileMemcpyImpl<T>(hipStream_t stream, const T* input_data, T* output_data, \
const size_t num_input_elements, const size_t repeats); \
template void TileBatchedMemcpyImpl<T>(hipStream_t stream, const T* input_data, T* output_data, \
const size_t size_input_row, const size_t num_input_elements, \
const size_t batch_repeats, const size_t repeats_per_batch);
SPECIALIZED_IMPL
(
float
)
SPECIALIZED_IMPL
(
double
)
SPECIALIZED_IMPL
(
half
)
}
// namespace rocm
}
// namespace onnxruntime
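A minimal sketch of the decision GetVectorizedSize makes above: a 2- or 4-wide vector type is only used when the element count divides evenly by the vector width and both pointers are aligned for that vector type; otherwise the code falls back to scalar accesses. aligned_vector_sketch and PickVectorWidth below are illustrative stand-ins, not the library's types.

// Illustrative sketch only, assuming T is a type whose size times the vector
// width is a power of two (as for float, double, half).
#include <cstddef>
#include <cstdint>

template <typename T, int N>
struct alignas(sizeof(T) * N) aligned_vector_sketch { T val[N]; };

template <typename T>
int PickVectorWidth(size_t count, const void* in, const void* out) {
  const auto a_in = reinterpret_cast<uintptr_t>(in);
  const auto a_out = reinterpret_cast<uintptr_t>(out);
  constexpr size_t align4 = alignof(aligned_vector_sketch<T, 4>);
  constexpr size_t align2 = alignof(aligned_vector_sketch<T, 2>);
  if (count % 4 == 0 && a_in % align4 == 0 && a_out % align4 == 0) return 4;
  if (count % 2 == 0 && a_in % align2 == 0 && a_out % align2 == 0) return 2;
  return 1;  // fall back to scalar accesses
}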
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/tile_impl.h
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <stdint.h>
#include "core/providers/rocm/shared_inc/rocm_utils.h"

namespace onnxruntime {
namespace rocm {

template <typename T>
void TileImpl(hipStream_t stream, const size_t shape_rank, const TArray<fast_divmod>& fdm_input_shape,
              const TArray<int64_t>& input_stride, const T* input_data,
              const TArray<fast_divmod>& fdm_output_strides, T* output_data, const size_t N);

template <typename T>
void TileMemcpyImpl(hipStream_t stream, const T* input_data, T* output_data, const size_t num_input_elements,
                    const size_t repeats);

template <typename T>
void TileBatchedMemcpyImpl(hipStream_t stream, const T* input_data, T* output_data, const size_t size_input_row,
                           const size_t num_input_elements, const size_t batch_repeats,
                           const size_t repeats_per_batch);

}  // namespace rocm
}  // namespace onnxruntime
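The fast_divmod parameters in these declarations carry precomputed constants so the device code can divide by a fixed integer without hardware integer division. Functionally it is equivalent to the following sketch; divmod_sketch is a hypothetical stand-in, not the real implementation.

// Illustrative sketch only: the semantic contract of fast_divmod.
struct divmod_sketch {
  int d;
  explicit divmod_sketch(int divisor) : d(divisor) {}
  void divmod(int x, int& q, int& r) const { q = x / d; r = x - q * d; }
  int div(int x) const { return x / d; }
  int mod(int x) const { return x % d; }
};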
build/Linux/Release/amdgpu/onnxruntime/core/providers/rocm/tensor/transpose.cc
0 → 100644
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/common/inlined_containers.h"
#include "core/providers/rocm/tensor/transpose.h"
#include "core/providers/rocm/tensor/transpose_impl.h"
#include "core/providers/cpu/tensor/utils.h"
#include "core/providers/rocm/shared_inc/fpgeneric.h"

namespace onnxruntime {
namespace rocm {

ONNX_OPERATOR_VERSIONED_KERNEL_EX(
    Transpose,
    kOnnxDomain,
    1, 12,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Transpose);

ONNX_OPERATOR_KERNEL_EX(
    Transpose,
    kOnnxDomain,
    13,
    kRocmExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
    Transpose);

// special case acceleration using rocblas matrix transpose
static std::tuple<int, int> TryTransposeWithRocblas(const gsl::span<const size_t>& perm,
                                                    const TensorShape& input_shape) {
  int M = 0;
  int N = 0;

  if (perm.size() == 4 && input_shape[0] == 1 && perm[0] == 0) {
    // NCHW <-> NHWC when N == 1
    if ((perm[1] == 2 && perm[2] == 3 && perm[3] == 1) ||
        (perm[1] == 3 && perm[2] == 1 && perm[3] == 2)) {
      if (perm[1] == 2) {
        M = gsl::narrow<int>(input_shape[1]);
        N = gsl::narrow<int>(input_shape[2] * input_shape[3]);
      } else {
        M = gsl::narrow<int>(input_shape[1] * input_shape[2]);
        N = gsl::narrow<int>(input_shape[3]);
      }
    }
  } else if (perm.size() == 2 && perm[1] == 0 && perm[0] == 1) {
    // 2D matrix transpose
    M = gsl::narrow<int>(input_shape[0]);
    N = gsl::narrow<int>(input_shape[1]);
  }

  return std::make_tuple(M, N);
}

template <typename T>
Status TransposeWithRocblas(hipStream_t stream, rocblas_handle rocblas_handle, const Tensor& input,
                            Tensor& output, int M, int N) {
  typedef typename ToHipType<T>::MappedType HipT;
  HipT one = ToHipType<T>::FromFloat(1.0f);
  HipT zero = ToHipType<T>::FromFloat(0.0f);
  const HipT* input_data = reinterpret_cast<const HipT*>(input.Data<T>());
  HipT* output_data = reinterpret_cast<HipT*>(output.MutableData<T>());
  ROCBLAS_RETURN_IF_ERROR(
      rocblasTransposeHelper(stream, rocblas_handle, rocblas_operation_transpose, rocblas_operation_transpose,
                             M, N, &one, input_data, N, &zero, input_data, N, output_data, M));
  return Status::OK();
}

Status Transpose::DoTranspose(const Transpose& transpose_kernel, const gsl::span<const size_t>& permutations,
                              const Tensor& input, Tensor& output) {
  return Transpose::DoTranspose(transpose_kernel.GetDeviceProp(), transpose_kernel.Stream(),
                                transpose_kernel.RocblasHandle(), permutations, input, output);
}

Status Transpose::DoTranspose(const hipDeviceProp_t& prop, hipStream_t stream,
                              const rocblas_handle rocblas_handle, const gsl::span<const size_t>& permutations,
                              const Tensor& input, Tensor& output, const TensorShape* input_shape_override,
                              const TensorShape* output_shape_override) {
  // special case when there is a dim value of 0 in the shape.
  if (output.Shape().Size() == 0)
    return Status::OK();

  const auto input_dims = input_shape_override ? input_shape_override->GetDims() : input.Shape().GetDims();
  const auto output_dims = output_shape_override ? output_shape_override->GetDims() : output.Shape().GetDims();
  auto rank = static_cast<int32_t>(input_dims.size());

  // flatten the adjacent dimensions which are contiguous
  // for example: permutations[0, 2, 3, 1] -> [0, 2, 1], permutations[0, 3, 1, 2] -> [0, 2, 1]
  auto new_rank = rank;
  InlinedVector<size_t> new_permutations(permutations.begin(), permutations.end());
  TensorShapeVector new_input_dims = ToShapeVector(input_dims);
  TensorShapeVector new_output_dims = ToShapeVector(output_dims);

  // Remove all dims with value 1.
  std::vector<bool> dims_to_remove(new_rank, false);
  int input_pos = 0;
  int output_pos = 0;
  int perm_pos = 0;
  for (int i = 0; i < new_rank; ++i) {
    if (new_input_dims[i] != 1) {
      new_input_dims[input_pos++] = new_input_dims[i];
    } else {
      dims_to_remove[i] = true;
    }
    if (new_output_dims[i] != 1) {
      new_output_dims[output_pos++] = new_output_dims[i];
    }
  }
  for (int i = 0; i < new_rank; ++i) {
    if (!dims_to_remove[new_permutations[i]]) {
      new_permutations[perm_pos++] = new_permutations[i];
    }
  }
  for (int i = new_rank - 1; i >= 0; --i) {
    if (dims_to_remove[i]) {
      for (int j = 0; j < perm_pos; ++j) {
        if (new_permutations[j] > static_cast<size_t>(i)) {
          new_permutations[j] -= 1;
        }
      }
    }
  }
  ORT_ENFORCE(input_pos == output_pos && input_pos == perm_pos);
  new_rank = input_pos;
  new_input_dims.resize(new_rank);
  new_output_dims.resize(new_rank);
  new_permutations.resize(new_rank);

  for (auto i = new_rank - 1; i > 0; i--) {
    auto curr = new_permutations[i];
    auto prev = new_permutations[i - 1];
    if (prev + 1 == curr) {
      // all dims bigger than curr need to be reduced by 1 due to the merging.
      for (auto j = 0; j < new_rank; j++) {
        if (new_permutations[j] > curr) {
          new_permutations[j] -= 1;
        }
      }
      for (auto j = i + 1; j < new_rank; j++) {
        new_permutations[j - 1] = new_permutations[j];
      }

      // update input dims
      new_input_dims[prev] *= new_input_dims[curr];
      new_input_dims[curr] = 1;
      for (auto j = static_cast<int32_t>(curr + 1); j < new_rank; j++) {
        new_input_dims[j - 1] = new_input_dims[j];
      }
      new_input_dims[new_rank - 1] = 1;

      // update output dims
      new_output_dims[i - 1] *= new_output_dims[i];
      new_output_dims[i] = 1;
      for (auto j = i + 1; j < new_rank; j++) {
        new_output_dims[j - 1] = new_output_dims[j];
      }
      new_output_dims[new_rank - 1] = 1;

      new_rank--;
    }
  }
  new_permutations.resize(new_rank);
  new_input_dims.resize(new_rank);
  new_output_dims.resize(new_rank);

  if (new_rank <= 1) {
    HIP_RETURN_IF_ERROR(hipMemcpyAsync(output.MutableDataRaw(), input.DataRaw(),
                                       input.Shape().Size() * input.DataType()->Size(),
                                       hipMemcpyDeviceToDevice, stream));
    return Status::OK();
  }

  auto element_type = input.GetElementType();
  size_t element_size = input.DataType()->Size();
  if (element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT ||
      element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE ||
      element_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) {
    auto mn = TryTransposeWithRocblas(new_permutations, new_input_dims);
    int M = std::get<0>(mn);
    int N = std::get<1>(mn);
    if (M != 0 && N != 0) {
      if (element_type == utils::GetONNXTensorElementDataType<float>()) {
        return TransposeWithRocblas<float>(stream, rocblas_handle, input, output, M, N);
      } else if (element_type == utils::GetONNXTensorElementDataType<double>()) {
        return TransposeWithRocblas<double>(stream, rocblas_handle, input, output, M, N);
      } else {
        return TransposeWithRocblas<MLFloat16>(stream, rocblas_handle, input, output, M, N);
      }
    }
  }

  // Transpose021 has a specialized Transpose3DImpl kernel
  dim3 grid_size, block_size;
  if (CanDoTranspose3D(prop, static_cast<size_t>(new_rank), new_input_dims, new_permutations, grid_size,
                       block_size)) {
    TensorPitches new_input_strides(new_input_dims);
    return Transpose3DImpl(stream, element_size, ToConstSpan(new_input_dims), ToConstSpan(new_input_strides),
                           input.DataRaw(), output.MutableDataRaw(), output.Shape().Size(), grid_size,
                           block_size);
  }

  // A 3D transpose can be treated as a special case of a 4D transpose with the first dimension being 1.
  if (new_rank == 3) {
    new_permutations[0]++;
    new_permutations[1]++;
    new_permutations[2]++;
    new_permutations.insert(new_permutations.begin(), 0);
    new_input_dims.insert(new_input_dims.begin(), 1);
    new_output_dims.insert(new_output_dims.begin(), 1);
    new_rank = 4;
  }

  TensorPitches new_input_strides(new_input_dims);
  TensorPitches new_output_strides(new_output_dims);

  TArray<int64_t> input_shape(new_input_dims);
  TArray<int64_t> tmp_input_strides(new_input_strides);

  if (CanDoTranspose4DParallelizeMultipleElementsPerThreadInInnermostDim(
          prop, element_size, new_rank, new_input_dims, new_permutations, grid_size, block_size)) {
    TArray<int64_t> tmp_output_strides(new_rank);
    for (auto i = 0; i < new_rank; i++) {
      tmp_output_strides[static_cast<int32_t>(new_permutations[i])] = new_output_strides[i];
    }
    return Transpose4DParallelizeMultipleElementsPerThreadInInnermostDim(
        stream, element_size, input_shape, tmp_input_strides, input.DataRaw(), tmp_output_strides,
        output.MutableDataRaw(), gsl::narrow<int>(output.Shape().Size()), grid_size, block_size);
  }

  // We used to check whether Transpose4DParallelizeOneElementPerThread could be used before falling back to the
  // generic case, but tests on many cases show that it is not faster than the generic case, and is even much
  // slower for some cases.

  // General cases
  TArray<int64_t> input_strides(new_rank);
  for (auto i = 0; i < new_rank; i++) {
    input_strides[i] = new_input_strides[new_permutations[i]];
  }

  TArray<fast_divmod> output_strides(new_rank);
  for (auto i = 0; i < new_rank; i++) {
    output_strides[i] = fast_divmod(gsl::narrow_cast<int>(new_output_strides[i]));
  }

  auto status = TransposeImpl(stream, element_size, new_rank, input_strides, input.DataRaw(), output_strides,
                              output.MutableDataRaw(), gsl::narrow<int>(output.Shape().Size()));

  return status;
}

Status Transpose::ComputeInternal(OpKernelContext* ctx) const {
  const Tensor* X_ptr = ctx->Input<Tensor>(0);
  if (X_ptr == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "input count mismatch");
  const Tensor& X = *X_ptr;
  const TensorShape& input_shape = X.Shape();
  int32_t rank = gsl::narrow_cast<int32_t>(input_shape.NumDimensions());

  TensorShapeVector output_dims(rank);
  InlinedVector<size_t> default_perm(rank);
  const InlinedVector<size_t>* p_perm = nullptr;
  const auto& status = ComputeOutputShape(X, output_dims, default_perm, p_perm);
  if (!status.IsOK())
    return status;

  TensorShape output_shape{output_dims};
  Tensor* Y = ctx->Output(0, output_shape);

  return DoTranspose(this->GetDeviceProp(), this->Stream(), this->RocblasHandle(), *p_perm, X, *Y);
}

}  // namespace rocm
}  // namespace onnxruntime
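As a reading aid for the general fallback path above (TransposeImpl), here is a host-side sketch of the same stride-permutation indexing: each output linear index is decomposed with the output strides, and the corresponding input offset is accumulated with the permuted input strides. TransposeReference is an illustrative name and assumes float data; the real kernel works on raw bytes of any fixed-size element type.

// Illustrative sketch only.
#include <cstdint>
#include <vector>

void TransposeReference(const std::vector<int64_t>& in_dims,
                        const std::vector<size_t>& perm,
                        const float* input, float* output) {
  const size_t rank = in_dims.size();
  std::vector<int64_t> in_strides(rank, 1), out_dims(rank), out_strides(rank, 1);
  for (size_t i = rank - 1; i > 0; --i) in_strides[i - 1] = in_strides[i] * in_dims[i];
  for (size_t i = 0; i < rank; ++i) out_dims[i] = in_dims[perm[i]];
  for (size_t i = rank - 1; i > 0; --i) out_strides[i - 1] = out_strides[i] * out_dims[i];

  int64_t total = 1;
  for (auto d : out_dims) total *= d;

  for (int64_t o = 0; o < total; ++o) {
    int64_t remaining = o, in_offset = 0;
    for (size_t d = 0; d < rank; ++d) {
      const int64_t coord = remaining / out_strides[d];
      remaining %= out_strides[d];
      in_offset += coord * in_strides[perm[d]];  // gather along the permuted axis
    }
    output[o] = input[in_offset];
  }
}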