GitLab — gaoqiong / composable_kernel_ROCM — Commits

Commit 55a89c74, authored Dec 16, 2023 by Jun Liu

    Merge branch 'develop' into amd-develop

Parents: 0dacd895, dcedf363
Changes: 61

Showing 20 changed files with 1460 additions and 328 deletions (+1460 / -328)
Changed files (first 20 shown):

    docs/sphinx/requirements.in                                                                         +1    -1
    docs/sphinx/requirements.txt                                                                        +9    -9
    docs/wrapper.rst                                                                                    +73   -0
    example/62_conv_fwd_activ/CMakeLists.txt                                                            +2    -0
    example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp               +294  -0
    example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc                                          +1    -1
    example/64_tensor_transforms/CMakeLists.txt                                                         +0    -2
    include/ck/host_utility/device_prop.hpp                                                             +1    -1
    include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp        +69   -66
    include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp          +66   -77
    include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp                            +87   -0
    include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp   +27   -5
    include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp    +1    -2
    include/ck/tensor_operation/gpu/device/tensor_layout.hpp                                            +0    -6
    include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp                                  +2    -0
    include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp                       +7    -8
    include/ck/utility/tuple_helper.hpp                                                                 +12   -0
    include/ck/wrapper/layout.hpp                                                                       +159  -150
    include/ck/wrapper/tensor.hpp                                                                       +314  -0
    include/ck/wrapper/utils/layout_utils.hpp                                                           +335  -0
docs/sphinx/requirements.in

-rocm-docs-core>=0.20.0
+rocm-docs-core==0.30.1
 sphinxcontrib-bibtex==2.6.1
docs/sphinx/requirements.txt

@@ -16,7 +16,7 @@ beautifulsoup4==4.11.2
     # via pydata-sphinx-theme
 breathe==4.34.0
     # via rocm-docs-core
-certifi==2022.12.7
+certifi==2023.7.22
     # via requests
 cffi==1.15.1
     # via
@@ -26,7 +26,7 @@ charset-normalizer==3.1.0
     # via requests
 click==8.1.3
     # via sphinx-external-toc
-cryptography==40.0.2
+cryptography==41.0.6
     # via pyjwt
 deprecated==1.2.13
     # via pygithub
@@ -42,7 +42,7 @@ fastjsonschema==2.18.0
     # via rocm-docs-core
 gitdb==4.0.10
     # via gitpython
-gitpython==3.1.35
+gitpython==3.1.37
     # via rocm-docs-core
 idna==3.4
     # via requests
@@ -88,9 +88,9 @@ pydata-sphinx-theme==0.13.3
     # via
     #   rocm-docs-core
     #   sphinx-book-theme
-pygithub==1.58.2
+pygithub==1.58.1
     # via rocm-docs-core
-pygments==2.14.0
+pygments==2.15.0
     # via
     #   accessible-pygments
     #   pydata-sphinx-theme
@@ -109,11 +109,11 @@ pyyaml==6.0
     #   pybtex
     #   rocm-docs-core
     #   sphinx-external-toc
-requests==2.28.2
+requests==2.31.0
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==0.27.0
+rocm-docs-core==0.30.1
     # via -r requirements.in
 six==1.16.0
     # via
@@ -141,7 +141,7 @@ sphinx-book-theme==1.0.1
     # via rocm-docs-core
 sphinx-copybutton==0.5.1
     # via rocm-docs-core
-sphinx-design==0.3.0
+sphinx-design==0.4.1
     # via rocm-docs-core
 sphinx-external-toc==0.3.1
     # via rocm-docs-core
@@ -163,7 +163,7 @@ sphinxcontrib-serializinghtml==1.1.5
     # via sphinx
 typing-extensions==4.5.0
     # via pydata-sphinx-theme
-urllib3==1.26.15
+urllib3==1.26.18
     # via requests
 wrapt==1.15.0
     # via deprecated
docs/wrapper.rst  (new file, 0 → 100644)

===============
Wrapper
===============

-------------------------------------
Description
-------------------------------------

.. note::
   The wrapper is under development and its functionality is limited.

CK provides a lightweight wrapper for more complex operations implemented in
the library. It allows indexing of nested layouts through a simple interface
(avoiding complex descriptor transformations) and memory access (using Tensor).

Example:

.. code-block:: c

    const auto shape_4x2x4    = ck::make_tuple(4, ck::make_tuple(2, 4));
    const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
    const auto layout         = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);

    std::array<ck::index_t, 32> data;
    auto tensor = ck::wrapper::make_tensor<ck::wrapper::MemoryTypeEnum::Generic>(&data[0], layout);

    for(ck::index_t w = 0; w < size(tensor); w++) {
        tensor(w) = w;
    }

    // slice() == slice(0, -1) (whole dimension)
    auto tensor_slice = tensor(ck::wrapper::slice(1, 3),
                               ck::make_tuple(ck::wrapper::slice(), ck::wrapper::slice()));

    std::cout << "dims:2,(2,4) strides:2,(1,8)" << std::endl;
    for(ck::index_t h = 0; h < ck::wrapper::size<0>(tensor_slice); h++)
    {
        for(ck::index_t w = 0; w < ck::wrapper::size<1>(tensor_slice); w++)
        {
            std::cout << tensor_slice(h, w) << " ";
        }
        std::cout << std::endl;
    }

Output::

    dims:2,(2,4) strides:2,(1,8)
    1 5 9 13 17 21 25 29
    2 6 10 14 18 22 26 30

-------------------------------------
Layout
-------------------------------------

.. doxygenstruct:: ck::wrapper::Layout

-------------------------------------
Layout helpers
-------------------------------------

.. doxygenfile:: layout_utils.hpp

-------------------------------------
Tensor
-------------------------------------

.. doxygenstruct:: ck::wrapper::Tensor

-------------------------------------
Tensor helpers
-------------------------------------

.. doxygenfile:: tensor_utils.hpp
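A note on the printed values above: the nested shape (4, (2, 4)) with strides (2, (1, 8)) maps a logical index (d0, (d1, d2)) to the flat offset 2*d0 + 1*d1 + 8*d2, and the 1-D fill `tensor(w) = w` walks the logical elements with the leftmost dimension fastest. The following is a minimal plain-C++ sketch (no CK types, illustrative names only) that reproduces the first row of the example output from that addressing rule:

    #include <array>
    #include <iostream>

    // Offset rule implied by shape (4, (2, 4)) and strides (2, (1, 8)).
    int offset(int d0, int d1, int d2) { return 2 * d0 + 1 * d1 + 8 * d2; }

    int main()
    {
        std::array<int, 32> data{};
        // Column-major fill, mirroring `tensor(w) = w`: leftmost dimension varies fastest.
        int w = 0;
        for(int d2 = 0; d2 < 4; ++d2)
            for(int d1 = 0; d1 < 2; ++d1)
                for(int d0 = 0; d0 < 4; ++d0)
                    data[offset(d0, d1, d2)] = w++;

        // Row h = 0 of the slice (d0 == 1) as printed in the example: 1 5 9 13 17 21 25 29
        for(int d2 = 0; d2 < 4; ++d2)
            for(int d1 = 0; d1 < 2; ++d1)
                std::cout << data[offset(1, d1, d2)] << " ";
        std::cout << std::endl;
    }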
example/62_conv_fwd_activ/CMakeLists.txt

@@ -42,6 +42,8 @@ foreach(gpu IN LISTS GPU_TARGETS)
         # ScaleAdd ScaleAdd Relu
         add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16.cpp)
         add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_fp16)
+        add_example_executable(example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16 convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp)
+        add_example_dependencies(example_convnd_fwd_activ_xdl example_convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16)
         set(target 1)
     endif()
 endforeach()
example/62_conv_fwd_activ/convnd_fwd_xdl_scaleadd_scaleadd_relu_bcasted_bias_fp16.cpp
0 → 100644
View file @
55a89c74
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
constexpr
ck
::
index_t
NDimSpatial
=
3
;
using
InDataType
=
ck
::
half_t
;
using
WeiDataType
=
ck
::
half_t
;
using
AccDataType
=
float
;
using
CShuffleDataType
=
ck
::
half_t
;
using
OutDataType
=
ck
::
half_t
;
template
<
ck
::
index_t
...
Is
>
using
S
=
ck
::
Sequence
<
Is
...
>
;
using
InLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWGC
;
using
WeiLayout
=
ck
::
tensor_layout
::
convolution
::
GKZYXC
;
using
OutLayout
=
ck
::
tensor_layout
::
convolution
::
NDHWGK
;
using
BiasLayout
=
ck
::
tensor_layout
::
convolution
::
G_K
;
using
InElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
WeiElementOp
=
ck
::
tensor_operation
::
element_wise
::
PassThrough
;
using
OutElementOp
=
ck
::
tensor_operation
::
element_wise
::
ScaleAddScaleAddRelu
;
static
constexpr
auto
ConvSpec
=
ck
::
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
;
static
constexpr
auto
GemmSpec
=
ck
::
tensor_operation
::
device
::
GemmSpecialization
::
MNKPadding
;
template
<
typename
OutElementOp
>
using
DeviceGroupedConvNDFwdInstance
=
ck
::
tensor_operation
::
device
::
DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
<
NDimSpatial
,
InLayout
,
WeiLayout
,
ck
::
Tuple
<
OutLayout
,
BiasLayout
>
,
OutLayout
,
InDataType
,
WeiDataType
,
AccDataType
,
CShuffleDataType
,
ck
::
Tuple
<
OutDataType
,
OutDataType
>
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
ConvSpec
,
// ConvForwardSpecialization
GemmSpec
,
// GemmSpecialization
1
,
//
256
,
// BlockSize
128
,
// MPerBlock
256
,
// NPerBlock
32
,
// KPerBlock
8
,
// AK1
8
,
// BK1
32
,
// MPerXdl
32
,
// NPerXdl
2
,
// MXdlPerWave
4
,
// NXdlPerWave
S
<
4
,
64
,
1
>
,
// ABlockTransferThreadClusterLengths_AK0_M_AK1
S
<
1
,
0
,
2
>
,
// ABlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// ABlockTransferSrcAccessOrder
2
,
// ABlockTransferSrcVectorDim
8
,
// ABlockTransferSrcScalarPerVector
8
,
// ABlockTransferDstScalarPerVector_AK1
1
,
// ABlockLdsExtraM
S
<
4
,
64
,
1
>
,
// BBlockTransferThreadClusterLengths_BK0_N_BK1
S
<
1
,
0
,
2
>
,
// BBlockTransferThreadClusterArrangeOrder
S
<
1
,
0
,
2
>
,
// BBlockTransferSrcAccessOrder
2
,
// BBlockTransferSrcVectorDim
8
,
// BBlockTransferSrcScalarPerVector
8
,
// BBlockTransferDstScalarPerVector_BK1
1
,
// BBlockLdsExtraN
1
,
1
,
S
<
1
,
32
,
1
,
8
>
,
8
>
;
using
DeviceGroupedConvNDFwdActivInstance
=
DeviceGroupedConvNDFwdInstance
<
OutElementOp
>
;
namespace
{
// Use custom implementation to pass two more tensors for post op
template
<
ck
::
index_t
NDimSpatial
,
typename
InDataType
,
typename
WeiDataType
,
typename
OutDataType
,
typename
InElementOp
,
typename
WeiElementOp
,
typename
OutElementOp
,
typename
DeviceConvNDFwdInstance
>
bool
run_grouped_conv_fwd
(
bool
do_verification
,
int
init_method
,
bool
time_kernel
,
const
ck
::
utils
::
conv
::
ConvParam
&
conv_param
,
const
HostTensorDescriptor
&
in_g_n_c_wis_desc
,
const
HostTensorDescriptor
&
wei_g_k_c_xs_desc
,
const
HostTensorDescriptor
&
out_g_n_k_wos_desc
,
const
InElementOp
&
in_element_op
,
const
WeiElementOp
&
wei_element_op
,
const
OutElementOp
&
out_element_op
)
{
constexpr
ck
::
index_t
NumDs
=
2
;
const
ck
::
index_t
G
=
out_g_n_k_wos_desc
.
GetLengths
()[
0
];
const
ck
::
index_t
K
=
out_g_n_k_wos_desc
.
GetLengths
()[
2
];
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
bias_g_k_lengths
;
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
bias_g_k_strides
;
// Fill other lenghts than G,K with 1 and strides with 0
bias_g_k_lengths
.
fill
(
1
);
bias_g_k_strides
.
fill
(
0
);
bias_g_k_lengths
[
0
]
=
G
;
bias_g_k_lengths
[
2
]
=
K
;
bias_g_k_strides
[
0
]
=
K
;
// stride to G
bias_g_k_strides
[
2
]
=
1
;
// stride to K
const
auto
broadcasted_bias_desc
=
HostTensorDescriptor
(
bias_g_k_lengths
,
bias_g_k_strides
);
// y = relu ( alpha1 * conv(x) + alpha2 * z + bias )
Tensor
<
InDataType
>
in
(
in_g_n_c_wis_desc
);
Tensor
<
WeiDataType
>
wei
(
wei_g_k_c_xs_desc
);
Tensor
<
OutDataType
>
out_host
(
out_g_n_k_wos_desc
);
Tensor
<
OutDataType
>
out_device
(
out_g_n_k_wos_desc
);
std
::
array
<
Tensor
<
OutDataType
>
,
NumDs
>
d_tensors
=
{
Tensor
<
OutDataType
>
(
out_g_n_k_wos_desc
),
Tensor
<
OutDataType
>
(
broadcasted_bias_desc
)};
std
::
cout
<<
"in: "
<<
in
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"wei: "
<<
wei
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"out: "
<<
out_host
.
mDesc
<<
std
::
endl
;
std
::
cout
<<
"z_tensor: "
<<
d_tensors
[
0
].
mDesc
<<
std
::
endl
;
std
::
cout
<<
"bias_tensor: "
<<
d_tensors
[
1
].
mDesc
<<
std
::
endl
;
// Make sure that we allocated only G * K values for bias
assert
(
static_cast
<
ck
::
index_t
>
(
d_tensors
[
1
].
mData
.
size
())
==
G
*
K
);
switch
(
init_method
)
{
case
0
:
break
;
case
1
:
in
.
GenerateTensorValue
(
GeneratorTensor_2
<
InDataType
>
{
-
2
,
2
});
wei
.
GenerateTensorValue
(
GeneratorTensor_2
<
WeiDataType
>
{
-
2
,
2
});
d_tensors
[
0
].
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
2
,
2
});
d_tensors
[
1
].
GenerateTensorValue
(
GeneratorTensor_2
<
OutDataType
>
{
-
2
,
2
});
break
;
default:
in
.
GenerateTensorValue
(
GeneratorTensor_3
<
InDataType
>
{
-
1.0
,
1.0
});
wei
.
GenerateTensorValue
(
GeneratorTensor_3
<
WeiDataType
>
{
-
0.05
,
0.05
});
d_tensors
[
0
].
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
-
0.05
,
0.05
});
d_tensors
[
1
].
GenerateTensorValue
(
GeneratorTensor_3
<
OutDataType
>
{
-
0.05
,
0.05
});
}
DeviceMem
in_device_buf
(
sizeof
(
InDataType
)
*
in
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
wei_device_buf
(
sizeof
(
WeiDataType
)
*
wei
.
mDesc
.
GetElementSpaceSize
());
DeviceMem
z_buf
(
sizeof
(
OutDataType
)
*
d_tensors
[
0
].
mDesc
.
GetElementSpaceSize
());
DeviceMem
bias_buf
(
sizeof
(
OutDataType
)
*
d_tensors
[
1
].
mDesc
.
GetElementSpaceSize
());
DeviceMem
out_device_buf
(
sizeof
(
OutDataType
)
*
out_device
.
mDesc
.
GetElementSpaceSize
());
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
z_buf
.
ToDevice
(
d_tensors
[
0
].
mData
.
data
());
bias_buf
.
ToDevice
(
d_tensors
[
1
].
mData
.
data
());
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
a_g_n_c_wis_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
b_g_k_c_xs_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_lengths
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
e_g_n_k_wos_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_strides
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
conv_filter_dilations
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_left_pads
{};
std
::
array
<
ck
::
index_t
,
NDimSpatial
>
input_right_pads
{};
auto
copy
=
[](
const
auto
&
x
,
auto
&
y
)
{
ck
::
ranges
::
copy
(
x
,
y
.
begin
());
};
copy
(
in_g_n_c_wis_desc
.
GetLengths
(),
a_g_n_c_wis_lengths
);
copy
(
in_g_n_c_wis_desc
.
GetStrides
(),
a_g_n_c_wis_strides
);
copy
(
wei_g_k_c_xs_desc
.
GetLengths
(),
b_g_k_c_xs_lengths
);
copy
(
wei_g_k_c_xs_desc
.
GetStrides
(),
b_g_k_c_xs_strides
);
copy
(
out_g_n_k_wos_desc
.
GetLengths
(),
e_g_n_k_wos_lengths
);
copy
(
out_g_n_k_wos_desc
.
GetStrides
(),
e_g_n_k_wos_strides
);
copy
(
conv_param
.
conv_filter_strides_
,
conv_filter_strides
);
copy
(
conv_param
.
conv_filter_dilations_
,
conv_filter_dilations
);
copy
(
conv_param
.
input_left_pads_
,
input_left_pads
);
copy
(
conv_param
.
input_right_pads_
,
input_right_pads
);
const
std
::
array
<
const
void
*
,
NumDs
>
ds
=
{
z_buf
.
GetDeviceBuffer
(),
bias_buf
.
GetDeviceBuffer
()};
auto
conv
=
DeviceConvNDFwdInstance
{};
auto
invoker
=
conv
.
MakeInvoker
();
auto
argument
=
conv
.
MakeArgument
(
in_device_buf
.
GetDeviceBuffer
(),
wei_device_buf
.
GetDeviceBuffer
(),
ds
,
out_device_buf
.
GetDeviceBuffer
(),
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
NumDs
>
{
e_g_n_k_wos_lengths
,
bias_g_k_lengths
},
std
::
array
<
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>
,
NumDs
>
{
e_g_n_k_wos_strides
,
bias_g_k_strides
},
e_g_n_k_wos_lengths
,
e_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
,
in_element_op
,
wei_element_op
,
out_element_op
);
if
(
!
conv
.
IsSupportedArgument
(
argument
))
{
throw
std
::
runtime_error
(
"The device op with the specified compilation parameters does "
"not support this convolution problem."
);
}
float
avg_time
=
invoker
.
Run
(
argument
,
StreamConfig
{
nullptr
,
time_kernel
});
std
::
size_t
flop
=
conv_param
.
GetFlops
()
+
G
*
K
+
conv_param
.
GetOutputByte
<
OutDataType
>
()
/
sizeof
(
OutDataType
);
std
::
size_t
num_btype
=
conv_param
.
GetByte
<
InDataType
,
WeiDataType
,
OutDataType
>
()
+
G
*
K
*
sizeof
(
OutDataType
)
+
conv_param
.
GetOutputByte
<
OutDataType
>
();
float
tflops
=
static_cast
<
float
>
(
flop
)
/
1.E9
/
avg_time
;
float
gb_per_sec
=
num_btype
/
1.E6
/
avg_time
;
std
::
cout
<<
"Perf: "
<<
avg_time
<<
" ms, "
<<
tflops
<<
" TFlops, "
<<
gb_per_sec
<<
" GB/s, "
<<
conv
.
GetTypeString
()
<<
std
::
endl
;
if
(
do_verification
)
{
auto
ref_conv
=
ck
::
tensor_operation
::
host
::
ReferenceConvFwd
<
NDimSpatial
,
InDataType
,
WeiDataType
,
OutDataType
,
InElementOp
,
WeiElementOp
,
OutElementOp
,
0
,
/*Num A Elementwise Tensors*/
0
,
/*Num B Elementwise Tensors*/
NumDs
>
();
auto
ref_invoker
=
ref_conv
.
MakeInvoker
();
auto
ref_argument
=
ref_conv
.
MakeArgument
(
in
,
wei
,
out_host
,
conv_param
.
conv_filter_strides_
,
conv_param
.
conv_filter_dilations_
,
conv_param
.
input_left_pads_
,
conv_param
.
input_right_pads_
,
in_element_op
,
wei_element_op
,
out_element_op
,
{},
{},
d_tensors
);
ref_invoker
.
Run
(
ref_argument
);
out_device_buf
.
FromDevice
(
out_device
.
mData
.
data
());
return
ck
::
utils
::
check_err
(
out_device
,
out_host
,
"Error: incorrect results!"
);
}
return
true
;
}
}
// namespace
#include "run_convnd_fwd_activ_example.inc"
int
main
(
int
argc
,
char
*
argv
[])
{
return
!
run_convnd_fwd_example
(
argc
,
argv
);
}
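The fused epilogue used by this example is the elementwise rule stated in its comment, y = relu(alpha1 * conv(x) + alpha2 * z + bias). The following is a minimal host-side sketch of that combination only (illustrative names, hypothetical flat buffers, a single group, K as the fastest-varying output dimension as in packed NDHWGK, and placeholder scale values); in the example itself the device op applies ScaleAddScaleAddRelu while writing the output tile.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Sketch of y = relu(alpha1 * c + alpha2 * z + bias_k), bias broadcast per output channel k.
    std::vector<float> apply_scaleadd_scaleadd_relu(const std::vector<float>& conv_out,
                                                    const std::vector<float>& z,
                                                    const std::vector<float>& bias, // K values
                                                    std::size_t K,
                                                    float alpha1 = 1.f,
                                                    float alpha2 = 1.f)
    {
        std::vector<float> y(conv_out.size());
        for(std::size_t i = 0; i < conv_out.size(); ++i)
        {
            const float v = alpha1 * conv_out[i] + alpha2 * z[i] + bias[i % K];
            y[i]          = std::max(v, 0.f); // relu
        }
        return y;
    }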
example/62_conv_fwd_activ/run_convnd_fwd_activ_example.inc

@@ -24,7 +24,7 @@ bool run_convnd_fwd_example(int argc, char* argv[])
     // Following shapes are selected to avoid overflow. Expect inf in case of
     // size increase for some elementwise ops.
     ck::utils::conv::ConvParam conv_param{
-        3, 1, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
+        3, 2, 16, 128, 8, {3, 3, 3}, {17, 17, 17}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};

     if(argc == 1)
     {
example/64_tensor_transforms/CMakeLists.txt  (deleted, 100644 → 0)

-add_example_executable(example_tensor_transform tensor_transform.cpp)
-add_example_executable(example_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
include/ck/host_utility/device_prop.hpp

@@ -26,7 +26,7 @@ inline std::string get_device_name()
     }
     const std::string raw_name(props.gcnArchName);

-    // https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
+    // https://github.com/ROCm/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
     static std::map<std::string, std::string> device_name_map = {
         {"Ellesmere", "gfx803"},
         {"Baffin", "gfx803"},
include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_abd_xdl_cshuffle.hpp

@@ -14,6 +14,7 @@
 #include "ck/tensor_operation/gpu/device/device_contraction_multiple_abd.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_abd_xdl_cshuffle.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"

@@ -500,22 +501,29 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
             // for sanity check of vector memory access
             for(index_t i = 0; i < NumATensor; ++i)
             {
-                a_mz_stride_[i] = a_ms_ks_strides[i][NumDimM - 1];
-                a_kz_stride_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1];
+                as_mz_consecutive_[i] = a_ms_ks_strides[i][NumDimM - 1] == 1;
+                as_kz_consecutive_[i] = a_ms_ks_strides[i][NumDimM + NumDimK - 1] == 1;
+                as_max_read_elems_[i] =
+                    CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths[i], a_ms_ks_strides[i]);
             }

             for(index_t i = 0; i < NumBTensor; ++i)
             {
-                b_nz_stride_[i] = b_ns_ks_strides[i][NumDimN - 1];
-                b_kz_stride_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1];
+                bs_nz_consecutive_[i] = b_ns_ks_strides[i][NumDimN - 1] == 1;
+                bs_kz_consecutive_[i] = b_ns_ks_strides[i][NumDimN + NumDimK - 1] == 1;
+                bs_max_read_elems_[i] =
+                    CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths[i], b_ns_ks_strides[i]);
             }

             for(index_t i = 0; i < NumDTensor; ++i)
             {
-                ds_nz_stride_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1];
+                ds_nz_consecutive_[i] = d_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
+                ds_max_read_elems_[i] =
+                    CalculateMaxRead<NumDimM, NumDimN>(d_ms_ns_lengths[i], d_ms_ns_strides[i]);
             }

-            e_nz_stride_ = e_ms_ns_stride[NumDimM + NumDimN - 1];
+            e_nz_consecutive_  = e_ms_ns_stride[NumDimM + NumDimN - 1] == 1;
+            e_max_write_elems_ = CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_length, e_ms_ns_stride);
        }

        // pointers

@@ -545,16 +553,19 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
         BElementwiseOperation b_element_op_;
         CDEElementwiseOperation cde_element_op_;

-        // Strides for the last M/N/K dimensions of A/B/Ds/E
-        // for sanity check of vector load/store
-        std::array<index_t, NumATensor> a_mz_stride_;
-        std::array<index_t, NumATensor> a_kz_stride_;
-        std::array<index_t, NumBTensor> b_nz_stride_;
-        std::array<index_t, NumBTensor> b_kz_stride_;
-        std::array<index_t, NumDTensor> ds_nz_stride_;
-        index_t e_nz_stride_;
+        // Describe whether the last part of a given dimension of A/B/D/E is consecutive
+        // in the memory or not.
+        std::array<bool, NumATensor> as_mz_consecutive_;
+        std::array<bool, NumATensor> as_kz_consecutive_;
+        std::array<bool, NumBTensor> bs_nz_consecutive_;
+        std::array<bool, NumBTensor> bs_kz_consecutive_;
+        std::array<bool, NumDTensor> ds_nz_consecutive_;
+        bool e_nz_consecutive_;
+
+        std::array<index_t, NumATensor> as_max_read_elems_;
+        std::array<index_t, NumBTensor> bs_max_read_elems_;
+        std::array<index_t, NumDTensor> ds_max_read_elems_;
+        index_t e_max_write_elems_;
     };

     // Invoker

@@ -643,73 +654,65 @@ struct DeviceContractionMultipleABD_Xdl_CShuffle
         // check vector load/store
         {
-            bool all_valid = true;
+            bool valid_as_access = true;
             static_for<0, NumATensor, 1>{}([&](auto i) {
                 // vector memory access of A: could be on M or AK1 dimension
-                if constexpr(ABlockTransferSrcVectorDim == 1)
-                {
-                    if(!(arg.a_mz_stride_[i] == 1 &&
-                         arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I1) % ABlockTransferSrcScalarPerVector == 0))
-                    {
-                        all_valid = false;
-                    }
-                }
-                else
-                {
-                    if(!(arg.a_kz_stride_[i] == 1 &&
-                         arg.as_grid_desc_ak0_m_ak1_[i].GetLength(I2) % ABlockTransferSrcScalarPerVector == 0))
-                    {
-                        all_valid = false;
-                    }
-                }
+                const bool valid_a_vector_size =
+                    arg.as_max_read_elems_[i] % ABlockTransferSrcScalarPerVector == 0;
+                const bool valid_a_access_dim_m =
+                    ABlockTransferSrcVectorDim == 1 && arg.as_mz_consecutive_[i];
+                const bool valid_a_access_dim_k =
+                    ABlockTransferSrcVectorDim == 2 && arg.as_kz_consecutive_[i];
+                const bool valid_a_access_dim = valid_a_access_dim_m || valid_a_access_dim_k;
+                if(!(valid_a_vector_size && valid_a_access_dim))
+                {
+                    valid_as_access = false;
+                }
             });
+            if(!valid_as_access)
+            {
+                return false;
+            }

             // vector memory access of B: could be on N or BK1 dimension
+            bool valid_bs_access = true;
             static_for<0, NumBTensor, 1>{}([&](auto i) {
-                if constexpr(BBlockTransferSrcVectorDim == 1)
-                {
-                    if(!(arg.b_nz_stride_[i] == 1 &&
-                         arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I1) % BBlockTransferSrcScalarPerVector == 0))
-                    {
-                        all_valid = false;
-                    }
-                }
-                else
-                {
-                    if(!(arg.b_kz_stride_[i] == 1 &&
-                         arg.bs_grid_desc_bk0_n_bk1_[i].GetLength(I2) % BBlockTransferSrcScalarPerVector == 0))
-                    {
-                        all_valid = false;
-                    }
-                }
+                const bool valid_b_vector_size =
+                    arg.bs_max_read_elems_[i] % BBlockTransferSrcScalarPerVector == 0;
+                const bool valid_b_access_dim_n =
+                    BBlockTransferSrcVectorDim == 1 && arg.bs_nz_consecutive_[i];
+                const bool valid_b_access_dim_k =
+                    BBlockTransferSrcVectorDim == 2 && arg.bs_kz_consecutive_[i];
+                const bool valid_b_access_dim = valid_b_access_dim_n || valid_b_access_dim_k;
+                if(!(valid_b_vector_size && valid_b_access_dim))
+                {
+                    valid_bs_access = false;
+                }
             });
+            if(!valid_bs_access)
+            {
+                return false;
+            }

             // check vector load of Ds
+            bool valid_ds_access = true;
             static_for<0, NumDTensor, 1>{}([&](auto i) {
-                if(!(arg.ds_nz_stride_[i] == 1 &&
-                     arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) %
-                             CDEBlockTransferScalarPerVector_NPerBlock == 0))
+                const bool valid_d_vector_size =
+                    arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
+                // Vector read of Ds is always on N dimension.
+                const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
+                if(!(valid_d_vector_size && valid_d_access_dim))
                 {
-                    all_valid = false;
+                    valid_ds_access = false;
                 }
             });
+            if(!valid_ds_access)
+            {
+                return false;
+            }

-            // vector memory access of E: always on NPerBlock dimension
-            if(!(arg.e_nz_stride_ == 1 &&
-                 arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
-                         CDEBlockTransferScalarPerVector_NPerBlock == 0))
-            {
-                all_valid = false;
-            }
-
-            if(!all_valid)
+            // vector memory access of E: always on NPerBlock dimension
+            const bool valid_e_vector_size =
+                arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
+            // Vector write of E is always on N dimension.
+            const bool valid_e_access_dim = arg.e_nz_consecutive_;
+            if(!(valid_e_vector_size && valid_e_access_dim))
             {
                 return false;
             }
include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp

@@ -13,6 +13,7 @@
 #include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"

@@ -183,7 +184,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
             return generate_tuple([&](auto i) { return vec[i]; }, num);
         };

-        const auto a_ms_ns_lengths = to_tuple(a_ms_ks_lengths_vec, Number<NumDimM + NumDimK>{});
+        const auto a_ms_ks_lengths = to_tuple(a_ms_ks_lengths_vec, Number<NumDimM + NumDimK>{});
         const auto a_ms_ks_strides = to_tuple(a_ms_ks_strides_vec, Number<NumDimM + NumDimK>{});

         // dimension Ids for M0, M1, ...

@@ -194,14 +195,14 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
             typename arithmetic_sequence_gen<NumDimM, NumDimM + NumDimK, 1>::type{};

         // lengths for M0, M1, ...
-        const auto mLengths = get_container_subset(a_ms_ns_lengths, mDimIds);
+        const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds);

         // lengths for K0, K1, ...
-        const auto kLengths = get_container_subset(a_ms_ns_lengths, kDimIds);
+        const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds);

         // naive tensor A[M0, M1, M2, ..., K0, K1, K2...]
         const auto a_grid_desc_ms_ks =
-            make_naive_tensor_descriptor(a_ms_ns_lengths, a_ms_ks_strides);
+            make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides);

         // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...]
         const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor(

@@ -383,7 +384,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                  const void* p_b_grid,
                  std::array<const void*, NumDTensor> p_ds_grid,
                  void* p_e_grid,
-                 const std::vector<index_t>& a_ms_ns_lengths,
+                 const std::vector<index_t>& a_ms_ks_lengths,
                  const std::vector<index_t>& a_ms_ks_strides,
                  const std::vector<index_t>& b_ns_ks_lengths,
                  const std::vector<index_t>& b_ns_ks_strides,

@@ -398,7 +399,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
               p_b_grid_{static_cast<const BDataType*>(p_b_grid)},
               p_ds_grid_{},
               p_e_grid_{static_cast<EDataType*>(p_e_grid)},
-              a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ns_lengths, a_ms_ks_strides)},
+              a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ks_lengths, a_ms_ks_strides)},
              b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_ns_ks_lengths, b_ns_ks_strides)},
              ds_grid_desc_m_n_{},
              e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_ms_ns_lengths, e_ms_ns_strides)},

@@ -411,13 +412,7 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
-              cde_element_op_{cde_element_op},
-              a_mz_stride_{},
-              a_kz_stride_{},
-              b_nz_stride_{},
-              b_kz_stride_{},
-              ds_nz_stride_{},
-              e_nz_stride_{}
+              cde_element_op_{cde_element_op}
         {
             // populate pointer, batch stride, desc for Ds
             static_for<0, NumDTensor, 1>{}([&](auto i) {

@@ -448,18 +443,26 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
             }

             // for sanity check of vector memory access
-            a_mz_stride_ = a_ms_ks_strides[NumDimM - 1];
-            a_kz_stride_ = a_ms_ks_strides[NumDimM + NumDimK - 1];
+            a_mz_consecutive_ = a_ms_ks_strides[NumDimM - 1] == 1;
+            a_kz_consecutive_ = a_ms_ks_strides[NumDimM + NumDimK - 1] == 1;
+            a_max_read_elems_ = CalculateMaxRead<NumDimM, NumDimK>(a_ms_ks_lengths, a_ms_ks_strides);

-            b_nz_stride_ = b_ns_ks_strides[NumDimN - 1];
-            b_kz_stride_ = b_ns_ks_strides[NumDimN + NumDimK - 1];
+            b_nz_consecutive_ = b_ns_ks_strides[NumDimN - 1] == 1;
+            b_kz_consecutive_ = b_ns_ks_strides[NumDimN + NumDimK - 1] == 1;
+            b_max_read_elems_ = CalculateMaxRead<NumDimN, NumDimK>(b_ns_ks_lengths, b_ns_ks_strides);

             for(index_t i = 0; i < NumDTensor; ++i)
             {
-                ds_nz_stride_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1];
+                ds_nz_consecutive_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1] == 1;
+                ds_max_read_elems_[i] =
+                    CalculateMaxRead<NumDimM, NumDimN>(ds_ms_ns_lengths[i], ds_ms_ns_strides[i]);
             }

-            e_nz_stride_ = e_ms_ns_strides[NumDimM + NumDimN - 1];
+            e_nz_consecutive_  = e_ms_ns_strides[NumDimM + NumDimN - 1] == 1;
+            e_max_write_elems_ = CalculateMaxRead<NumDimM, NumDimN>(e_ms_ns_lengths, e_ms_ns_strides);
         }

         void Print() const

@@ -499,15 +502,19 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
         BElementwiseOperation b_element_op_;
         CDEElementwiseOperation cde_element_op_;

-        // Strides for the last M/N/K dimensions of A/B/Ds/E
-        // for sanity check of vector load/store
-        index_t a_mz_stride_;
-        index_t a_kz_stride_;
-        index_t b_nz_stride_;
-        index_t b_kz_stride_;
-        std::array<index_t, NumDTensor> ds_nz_stride_;
-        index_t e_mz_stride_;
-        index_t e_nz_stride_;
+        // Describe whether the last part of a given dimension of A/B/D/E is consecutive
+        // in the memory or not.
+        bool a_mz_consecutive_;
+        bool a_kz_consecutive_;
+        bool b_nz_consecutive_;
+        bool b_kz_consecutive_;
+        std::array<bool, NumDTensor> ds_nz_consecutive_;
+        bool e_nz_consecutive_;
+
+        index_t a_max_read_elems_;
+        index_t b_max_read_elems_;
+        std::array<index_t, NumDTensor> ds_max_read_elems_;
+        index_t e_max_write_elems_;
     };

     // Invoker

@@ -616,65 +623,47 @@ struct DeviceContractionMultipleD_Xdl_CShuffle
                           (BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2),
                       "wrong!");

         // vector memory access of A: could be on M or AK1 dimension
-        if constexpr(ABlockTransferSrcVectorDim == 1)
-        {
-            if(!(arg.a_mz_stride_ == 1 &&
-                 arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0))
-            {
-                return false;
-            }
-        }
-        else
-        {
-            if(!(arg.a_kz_stride_ == 1 &&
-                 arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0))
-            {
-                return false;
-            }
-        }
+        const bool valid_a_vector_size  = arg.a_max_read_elems_ % ABlockTransferSrcScalarPerVector == 0;
+        const bool valid_a_access_dim_m = ABlockTransferSrcVectorDim == 1 && arg.a_mz_consecutive_;
+        const bool valid_a_access_dim_k = ABlockTransferSrcVectorDim == 2 && arg.a_kz_consecutive_;
+        const bool valid_a_access_dim   = valid_a_access_dim_m || valid_a_access_dim_k;
+        if(!(valid_a_vector_size && valid_a_access_dim))
+        {
+            return false;
+        }

         // vector memory access of B: could be on N or BK1 dimension
-        if constexpr(BBlockTransferSrcVectorDim == 1)
-        {
-            if(!(arg.b_nz_stride_ == 1 &&
-                 arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0))
-            {
-                return false;
-            }
-        }
-        else
-        {
-            if(!(arg.b_kz_stride_ == 1 &&
-                 arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0))
-            {
-                return false;
-            }
-        }
+        const bool valid_b_vector_size  = arg.b_max_read_elems_ % BBlockTransferSrcScalarPerVector == 0;
+        const bool valid_b_access_dim_n = BBlockTransferSrcVectorDim == 1 && arg.b_nz_consecutive_;
+        const bool valid_b_access_dim_k = BBlockTransferSrcVectorDim == 2 && arg.b_kz_consecutive_;
+        const bool valid_b_access_dim   = valid_b_access_dim_n || valid_b_access_dim_k;
+        if(!(valid_b_vector_size && valid_b_access_dim))
+        {
+            return false;
+        }

         // vector memory access of Ds: always on NPerBlock dimension
-        bool valid_d_access = true;
+        bool valid_ds_access = true;
         static_for<0, NumDTensor, 1>{}([&](auto i) {
-            if(!(arg.ds_nz_stride_[i] == 1 &&
-                 arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) %
-                         CDEBlockTransferScalarPerVector_NPerBlock == 0))
+            const bool valid_d_vector_size =
+                arg.ds_max_read_elems_[i] % CDEBlockTransferScalarPerVector_NPerBlock == 0;
+            // Vector read of Ds is always on N dimension.
+            const bool valid_d_access_dim = arg.ds_nz_consecutive_[i];
+            if(!(valid_d_vector_size && valid_d_access_dim))
             {
-                valid_d_access = false;
+                valid_ds_access = false;
             }
         });
-        if(valid_d_access == false)
+        if(!valid_ds_access)
         {
             return false;
         }

         // vector memory access of E: always on NPerBlock dimension
-        if(!(arg.e_nz_stride_ == 1 &&
-             arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) %
-                     CDEBlockTransferScalarPerVector_NPerBlock == 0))
+        const bool valid_e_vector_size =
+            arg.e_max_write_elems_ % CDEBlockTransferScalarPerVector_NPerBlock == 0;
+        // Vector write of E is always on N dimension.
+        const bool valid_e_access_dim = arg.e_nz_consecutive_;
+        if(!(valid_e_vector_size && valid_e_access_dim))
         {
             return false;
         }

@@ -692,7 +681,7 @@ (MakeArgument) and @@ -729,7 +718,7 @@ (MakeArgumentPointer)
                      const void* p_b,
                      std::array<const void*, NumDTensor> p_ds,
                      void* p_e,
-                     const std::vector<index_t>& a_ms_ns_lengths,
+                     const std::vector<index_t>& a_ms_ks_lengths,
                      const std::vector<index_t>& a_ms_ks_strides,
                      const std::vector<index_t>& b_ns_ks_lengths,
                      const std::vector<index_t>& b_ns_ks_strides,

@@ -708,7 +697,7 @@ and @@ -745,7 +734,7 @@ (forwarded arguments)
                         p_b,
                         p_ds,
                         p_e,
-                        a_ms_ns_lengths,
+                        a_ms_ks_lengths,
                         a_ms_ks_strides,
                         b_ns_ks_lengths,
                         b_ns_ks_strides,
include/ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp  (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cassert>
#include <sstream>
#include <vector>

#include "ck/ck.hpp"

namespace ck {
namespace tensor_operation {
namespace device {

/**
 * Calculates the maximum number of subsequent elements of the fast changing dimension
 * that are consecutive in memory.
 *
 * Example:
 *   NumDimM = 2, NumDimK = 3
 *   A shape   = [  2,   3,  4, 5, 6]
 *   A strides = [360, 120, 30, 6, 1]
 *               |  M     | |   K    |
 *   It follows from strides that K is FCD and all the subsequent elements of K are consecutive
 *   in memory.
 *   But if strides were [360, 120, 6, 24, 1], then only 6 subsequent elements of K would be
 *   consecutive in memory.
 *
 * Assumes that the dimensions are split into two groups of `NumDim1` and `NumDim2` dimensions.
 */
template <index_t NumDim1, index_t NumDim2>
auto CalculateMaxRead(const std::vector<index_t>& lengths, const std::vector<index_t>& strides)
{
    if(lengths.size() != NumDim1 + NumDim2)
    {
        std::ostringstream err;
        err << "Incorrect number of lengths in " << __FILE__ << ":" << __LINE__
            << ", in function: " << __func__;
        throw std::runtime_error(err.str());
    }
    if(strides.size() != NumDim1 + NumDim2)
    {
        std::ostringstream err;
        err << "Incorrect number of strides in " << __FILE__ << ":" << __LINE__
            << ", in function: " << __func__;
        throw std::runtime_error(err.str());
    }

    // Determine the beginning and end idx of the group representing the FCD.
    index_t begin_idx, end_idx;
    if(strides[NumDim1 - 1] == 1)
    {
        begin_idx = 0;
        end_idx   = NumDim1 - 1;
    }
    else if(strides[NumDim1 + NumDim2 - 1] == 1)
    {
        begin_idx = NumDim1;
        end_idx   = NumDim1 + NumDim2 - 1;
    }
    else
    {
        // The dimension consecutive in memory is not the last dimension of any group, so only
        // one element can be read/written at once.
        return 1;
    }

    index_t consecutive_stride = 1;
    for(index_t dim_idx = end_idx; dim_idx >= begin_idx; --dim_idx)
    {
        if(strides[dim_idx] == consecutive_stride)
        {
            consecutive_stride *= lengths[dim_idx];
        }
        else
        {
            break;
        }
    }
    const index_t max_subsequent_elems = consecutive_stride;
    return max_subsequent_elems;
}

} // namespace device
} // namespace tensor_operation
} // namespace ck
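For intuition, the two stride layouts from the comment above give 120 and 6 consecutive elements respectively. The following is a small host-side check of exactly those two cases (a sketch only; it assumes the file is compiled inside a CK build where ck::index_t is the usual 32-bit signed integer):

    #include <iostream>
    #include <vector>

    #include "ck/tensor_operation/gpu/device/impl/device_contraction_utils.hpp"

    int main()
    {
        using ck::index_t;
        const std::vector<index_t> lengths{2, 3, 4, 5, 6};

        // All of K (4 * 5 * 6 = 120 elements) is consecutive in memory.
        std::cout << ck::tensor_operation::device::CalculateMaxRead<2, 3>(
                         lengths, std::vector<index_t>{360, 120, 30, 6, 1})
                  << std::endl; // prints 120

        // Only the innermost K dimension (6 elements) is consecutive.
        std::cout << ck::tensor_operation::device::CalculateMaxRead<2, 3>(
                         lengths, std::vector<index_t>{360, 120, 6, 24, 1})
                  << std::endl; // prints 6
    }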
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp

@@ -357,15 +357,17 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         return out_gemmm_gemmn_desc;
     }

+    // Shape of Ds and E must be aligned. Strides can be different.
+    // Pass e_g_n_k_wos_lengths for logical broadcast.
     static auto MakeDsGridDescriptor_M_N(
-        const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_lengths,
+        const std::array<index_t, NDimSpatial + 3>& e_g_n_k_wos_lengths,
         const std::array<std::array<index_t, NDimSpatial + 3>, NumDTensor>& ds_g_n_k_wos_strides)
     {
         return generate_tuple(
             [&](auto i) {
                 using DLayout = remove_cvref_t<tuple_element_t<i.value, DsLayout>>;

-                return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(ds_g_n_k_wos_lengths[i],
-                                                                  ds_g_n_k_wos_strides[i]);
+                return DeviceOp::MakeEGridDescriptor_M_N<DLayout>(e_g_n_k_wos_lengths,
+                                                                  ds_g_n_k_wos_strides[i]);
             },
             Number<NumDTensor>{});

@@ -569,7 +571,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 // D desc
                 ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N<DLayout>(
-                    ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]);
+                    e_g_n_k_wos_lengths, ds_g_n_k_wos_strides[i]);
             });

             compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0];

@@ -916,8 +918,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
                 is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
                 is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
-                is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::GK> ||
-                is_same_v<DLayout, ctc::G_K>)
+                is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
             {
                 const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];

@@ -925,6 +926,27 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                 {
                     valid = false;
                 }
+                if constexpr(is_same_v<DLayout, ctc::G_K>)
+                {
+                    // G and K must be the same
+                    if(arg.ds_g_n_k_wos_lengths_[i][0] != arg.e_g_n_k_wos_lengths_[0] ||
+                       arg.ds_g_n_k_wos_lengths_[i][2] != arg.e_g_n_k_wos_lengths_[2])
+                    {
+                        valid = false;
+                    }
+                }
+                else
+                {
+                    // E and D must have the same shape
+                    for(index_t d = 0; d < NDimSpatial + 3; d++)
+                    {
+                        if(arg.ds_g_n_k_wos_lengths_[i][d] != arg.e_g_n_k_wos_lengths_[d])
+                        {
+                            valid = false;
+                        }
+                    }
+                }
             }
             else
             {
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp

@@ -631,8 +631,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
                 is_same_v<DLayout, ctc::G_NDHW_K> || is_same_v<DLayout, ctc::GNWK> ||
                 is_same_v<DLayout, ctc::GNHWK> || is_same_v<DLayout, ctc::GNDHWK> ||
                 is_same_v<DLayout, ctc::NWGK> || is_same_v<DLayout, ctc::NHWGK> ||
-                is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::GK> ||
-                is_same_v<DLayout, ctc::G_K>)
+                is_same_v<DLayout, ctc::NDHWGK> || is_same_v<DLayout, ctc::G_K>)
             {
                 const index_t K = arg.ds_g_n_k_wos_lengths_[i][2];
include/ck/tensor_operation/gpu/device/tensor_layout.hpp

@@ -308,12 +308,6 @@ struct GNDHWK : public BaseTensorLayout
     static constexpr const char* name = "GNDHWK";
 };

-// for output bias
-struct GK : public BaseTensorLayout
-{
-    static constexpr const char* name = "GK";
-};
-
 // output tensor
 // packed NWGK/NHWGK/NDHWGK
 struct NWGK : public BaseTensorLayout
include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp

@@ -50,7 +50,9 @@ __global__ void
     ignore = p_in_global;
     ignore = out_grid_desc;
     ignore = p_out_global;
+    ignore = batch_count;
     ignore = block_2_tile_map;
+    ignore = compute_ptr_offset_of_batch;
 #endif
 }
include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp

@@ -522,22 +522,21 @@ struct TransformConvFwdToGemm
     // for output bias
     template <typename CLayout,
-              typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::GK> ||
-                                          is_same_v<CLayout, tensor_layout::convolution::G_K>,
-                                      bool>::type = false>
-    static auto MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
-                                    const std::array<index_t, NDimSpatial + 3>& /* c_g_n_k_wos_strides */)
+              typename std::enable_if<is_same_v<CLayout, tensor_layout::convolution::G_K>,
+                                      bool>::type = false>
+    static auto MakeCDescriptor_M_N(const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_lengths,
+                                    const std::array<index_t, NDimSpatial + 3>& c_g_n_k_wos_strides)
     {
-        const index_t N = c_g_n_k_wos_lengths[1];
-        const index_t K = c_g_n_k_wos_lengths[2];
+        const index_t N       = c_g_n_k_wos_lengths[1];
+        const index_t K       = c_g_n_k_wos_lengths[2];
+        const index_t KStride = c_g_n_k_wos_strides[2];

         const index_t NHoWo =
             N * ck::accumulate_n<index_t>(
                     c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

         const auto out_gemmm_gemmn_desc =
-            make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, I1));
+            make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, KStride));

         return out_gemmm_gemmn_desc;
     }
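The bias descriptor built above presents a physically tiny bias (one value per output channel per group) as a full (NHoWo, K) GEMM operand: stride I0 = 0 along the M (output pixel) dimension broadcasts the same value to every row, while KStride walks the real buffer. The following is a minimal plain-C++ sketch of that addressing rule (illustrative names only, not the CK descriptor machinery):

    #include <cstdint>
    #include <iostream>

    // offset(m, k) = 0 * m + KStride * k  ->  the row index m never contributes,
    // so every output row re-reads the same K bias values.
    std::int64_t bias_offset(std::int64_t m, std::int64_t k, std::int64_t k_stride)
    {
        return 0 * m + k_stride * k;
    }

    int main()
    {
        const std::int64_t KStride = 1;
        // Both prints give 5: rows 0 and 7 read the same bias element for k = 5.
        std::cout << bias_offset(0, 5, KStride) << " " << bias_offset(7, 5, KStride) << std::endl;
    }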
include/ck/utility/tuple_helper.hpp

@@ -166,4 +166,16 @@ __host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
     return (is_detected<is_tuple, Ts>::value || ...);
 }

+template <index_t depth = 0, typename T>
+__host__ __device__ constexpr auto TupleDepth(const T&)
+{
+    return depth;
+}
+
+template <index_t depth = 0, typename... Ts>
+__host__ __device__ constexpr auto TupleDepth(const Tuple<Ts...>&)
+{
+    return math::max(TupleDepth<depth + 1>(Ts{})...);
+}
+
 } // namespace ck
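For intuition, TupleDepth recurses into Tuple elements and reports the deepest nesting level. A small sketch of the expected results (assumptions: it is compiled as part of a CK translation unit, and ck::make_tuple / ck::Number are used as elsewhere in this commit; the exact constexpr usability depends on the element types):

    #include "ck/utility/tuple_helper.hpp"

    // Sketch: depth 1 for a flat Tuple, depth 2 when a Tuple is nested inside another.
    void tuple_depth_examples()
    {
        using ck::make_tuple;

        constexpr auto flat   = make_tuple(ck::Number<1>{}, ck::Number<2>{});
        constexpr auto nested = make_tuple(ck::Number<1>{},
                                           make_tuple(ck::Number<2>{}, ck::Number<3>{}));

        static_assert(ck::TupleDepth(flat) == 1, "one level of Tuple");
        static_assert(ck::TupleDepth(nested) == 2, "a Tuple inside a Tuple");
    }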
example/64_tensor_transforms/tensor_transform_wrapper
.hpp
→
include/ck/wrapper/layout
.hpp
View file @
55a89c74
...
...
@@ -3,27 +3,13 @@
#pragma once
#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/tuple_helper.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/sequence_helper.hpp"
#include "ck/utility/is_detected.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"
#include "ck/wrapper/utils/layout_utils.hpp"
namespace
ck
{
namespace
tensor_transform_
wrapper
{
namespace
wrapper
{
/**
* \brief Layout wrapper
*
* \details
* Layout wrapper that performs the tensor descriptor logic.
* \brief Layout wrapper that performs the tensor descriptor logic.
*
* \tparam Shape Tuple of Number<> (for compile-time layout) or index_t
* (dynamic layout). It is possible to pass nested shapes
...
...
@@ -32,21 +18,39 @@ namespace tensor_transform_wrapper {
* (dynamic layout). Stride tuple should be nested if shape tuple is
* nested.
*/
template
<
typename
Shape
,
typename
Strides
=
Tuple
<
>
>
template
<
typename
Shape
,
typename
Strides
>
struct
Layout
{
private:
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I1
=
Number
<
1
>
{};
template
<
typename
T
>
using
is_tuple
=
decltype
(
std
::
declval
<
T
&>
().
IsTuple
());
// Generate default idxs tuple (idx with all merged nested shapes)
template
<
typename
...
Ts
>
__host__
__device__
constexpr
static
auto
GenerateDefaultIdxsTuple
(
const
Tuple
<
Ts
...
>&
)
{
return
generate_tuple
(
[
&
](
auto
)
{
if
constexpr
(
!
FlattenDescriptorType
::
IsKnownAtCompileTime
())
{
// runtime layout
return
index_t
(
0
);
}
else
{
// compiletime layout
return
I0
;
}
},
Number
<
Tuple
<
Ts
...
>::
Size
()
>
{});
}
// Generate packed (column-major) strides if not passed
template
<
typename
...
Ts
>
__host__
__device__
constexpr
static
auto
GenerateColumnMajorPackedStrides
(
const
Tuple
<
Ts
...
>&
tupl
e
)
GenerateColumnMajorPackedStrides
(
const
Tuple
<
Ts
...
>&
shap
e
)
{
const
auto
unrolled_shape
=
UnrollNestedTuple
(
shape
);
return
generate_tuple
(
[
&
](
auto
i
)
{
if
constexpr
(
i
.
value
==
0
)
...
...
@@ -56,10 +60,10 @@ struct Layout
else
{
return
TupleReduce
<
I0
.
value
,
i
.
value
>
([](
auto
x
,
auto
y
)
{
return
x
*
y
;
},
tupl
e
);
unrolled_shap
e
);
}
},
Number
<
Tuple
<
Ts
...
>
::
Size
()
>
{});
Number
<
decltype
(
unrolled_shape
)
::
Size
()
>
{});
}
// Generate LowerDims in Compile-time for MergeTrasform using passed Type
...
...
@@ -112,8 +116,8 @@ struct Layout
// Example shape: (2, (2, 2)), 2, (2, 2)
// Unrolled shape: 2, (2, 2), 2, (2, 2)
template
<
typename
...
ShapeDims
,
typename
...
IdxDims
>
__host__
__device__
constexpr
static
auto
Unroll
Shape
Via
Idx
(
const
Tuple
<
ShapeDims
...
>&
shape
,
const
Tuple
<
IdxDims
...
>&
idx
)
__host__
__device__
constexpr
static
auto
Align
Shape
To
Idx
(
const
Tuple
<
ShapeDims
...
>&
shape
,
const
Tuple
<
IdxDims
...
>&
idx
)
{
if
constexpr
(
!
IsNestedTuple
(
Tuple
<
IdxDims
...
>
{}))
{
...
...
@@ -125,7 +129,7 @@ struct Layout
// Iterate over shape tuple elements:
// 1. If corresponding idx element is tuple then return (will be unrolled)
// 2. If no, pack in tuple. It will be restored during unroll.
auto
unroll
ed_shape
_via_idx
=
generate_tuple
(
auto
align
ed_shape
=
generate_tuple
(
[
&
](
auto
i
)
{
if
constexpr
(
is_detected
<
is_tuple
,
tuple_element_t
<
i
,
Tuple
<
IdxDims
...
>>>::
value
)
...
...
@@ -140,37 +144,34 @@ struct Layout
Number
<
Tuple
<
IdxDims
...
>::
Size
()
>
{});
// Unroll and process next step
return
Unroll
Shape
Via
Idx
(
UnrollNestedTuple
<
0
,
1
>
(
unroll
ed_shape
_via_idx
),
UnrollNestedTuple
<
0
,
1
>
(
idx
));
return
Align
Shape
To
Idx
(
UnrollNestedTuple
<
0
,
1
>
(
align
ed_shape
),
UnrollNestedTuple
<
0
,
1
>
(
idx
));
}
}
template
<
typename
...
ShapeDims
,
typename
DescriptorToMerge
>
__host__
__device__
constexpr
static
auto
MakeMerge1d
(
const
Tuple
<
ShapeDims
...
>&
shape
,
DescriptorToMerge
&
desc
)
const
DescriptorToMerge
&
desc
)
{
// Reverse each element in tuple
using
ReversedUnrolledShape
=
decltype
(
TupleReverse
(
UnrollNestedTuple
(
shape
)));
const
auto
merge_elems
=
ReversedUnrolledShape
{};
const
auto
merge_elems
=
TupleReverse
(
UnrollNestedTuple
(
shape
));
// Generate reverted indexes (column major traverse)
using
MergeElemsSequence
=
typename
arithmetic_sequence_gen
<
0
,
ReversedUnrolledShape
::
Size
(),
1
>::
type
;
const
auto
lower_dims
=
make_tuple
(
MergeElemsSequence
::
Reverse
());
const
auto
upper_dims
=
make_tuple
(
Sequence
<
0
>
{});
using
MergeElemsSequence
=
typename
arithmetic_sequence_gen
<
0
,
merge_elems
.
Size
(),
1
>::
type
;
const
auto
lower_dims
=
make_tuple
(
MergeElemsSequence
::
Reverse
());
const
auto
upper_dims
=
make_tuple
(
Sequence
<
0
>
{});
// Merge to 1d
return
transform_tensor_descriptor
(
desc
,
make_tuple
(
make_merge_transform
(
merge_elems
)),
lower_dims
,
upper_dims
);
}
// Merge nested shape dims
// Merge nested shape dims
when corresponding index is also nested.
// Input desc shape: 2, 2, 2, 2, 2, 2
// Example idx: 1, 1, 1, 1
// Example shape: 2, (2, 2), 2, (2, 2)
// Merged shape: 2, 4, 2, 4
template
<
typename
...
ShapeDims
,
typename
...
IdxDims
,
typename
DescriptorToMerge
>
__host__
__device__
constexpr
static
auto
MakeMerges
(
const
Tuple
<
ShapeDims
...
>&
shape
,
const
Tuple
<
IdxDims
...
>&
,
DescriptorToMerge
&
desc
)
__host__
__device__
constexpr
static
auto
CreateMergedDescriptor
(
const
Tuple
<
ShapeDims
...
>&
shape
,
const
Tuple
<
IdxDims
...
>&
,
DescriptorToMerge
&
desc
)
{
const
auto
transforms
=
generate_tuple
(
[
&
](
auto
i
)
{
...
...
@@ -206,14 +207,38 @@ struct Layout
return
transform_tensor_descriptor
(
desc
,
transforms
,
lower_dims
,
upper_dims
);
}
template
<
typename
LayoutShape
,
typename
LayoutStrides
>
__host__
__device__
static
auto
MakeFlattenDescriptor
(
const
LayoutShape
&
shape
,
const
LayoutStrides
&
strides
)
{
const
auto
unrolled_shape
=
UnrollNestedTuple
(
shape
);
const
auto
unrolled_strides
=
UnrollNestedTuple
(
strides
);
static_assert
(
unrolled_shape
.
Size
()
==
unrolled_strides
.
Size
(),
"Size of strides and shape are not consistent."
);
return
make_naive_tensor_descriptor
(
unrolled_shape
,
unrolled_strides
);
}
// If the stride is not passed, you can infer it from `GenerateColumnMajorPackedStrides`.
using
DeducedStrides
=
std
::
conditional_t
<
is_same_v
<
Strides
,
Tuple
<>>
,
remove_cvref_t
<
decltype
(
GenerateColumnMajorPackedStrides
(
Shape
{}))
>
,
Strides
>
;
using
FlattenDescriptorType
=
remove_cvref_t
<
decltype
(
MakeFlattenDescriptor
(
Shape
{},
DeducedStrides
{}))
>
;
using
Descriptor1dType
=
remove_cvref_t
<
decltype
(
MakeMerge1d
(
Shape
{},
FlattenDescriptorType
{}))
>
;
using
DefaultIdxsTupleType
=
remove_cvref_t
<
decltype
(
GenerateDefaultIdxsTuple
(
Shape
{}))
>
;
template
<
typename
...
ShapeDims
,
typename
...
IdxDims
>
__host__
__device__
constexpr
auto
TransformDesc
(
const
Tuple
<
ShapeDims
...
>&
shape
,
const
Tuple
<
IdxDims
...
>&
idx
)
const
__host__
__device__
constexpr
static
auto
TransformDesc
(
const
Tuple
<
ShapeDims
...
>&
shape
,
const
Tuple
<
IdxDims
...
>&
idx
,
const
FlattenDescriptorType
&
naive_descriptor
)
{
if
constexpr
(
Tuple
<
IdxDims
...
>::
Size
()
==
I1
)
{
// 1d idx path
return
MakeMerge1d
(
shape
,
descriptor
_
);
return
MakeMerge1d
(
shape
,
naive_
descriptor
);
}
else
{
...
...
@@ -224,62 +249,55 @@ struct Layout
static_assert
(
Tuple
<
ShapeDims
...
>::
Size
()
==
Tuple
<
IdxDims
...
>::
Size
(),
"Idx rank and Shape rank must be the same (except 1d)."
);
// Unroll while IdxDims is nested
const
auto
unroll
ed_shape
_via_idx
=
Unroll
Shape
Via
Idx
(
shape
,
idx
);
const
auto
align
ed_shape
=
Align
Shape
To
Idx
(
shape
,
idx
);
// Transform correct form of shape
return
Mak
eMerge
s
(
unrolled_shape_via_idx
,
UnrollNestedTuple
(
idx
),
descriptor
_
);
return
Creat
eMerge
dDescriptor
(
aligned_shape
,
UnrollNestedTuple
(
idx
),
naive_
descriptor
);
}
}
template
<
typename
LayoutShape
,
typename
LayoutStrides
>
__host__
__device__
static
auto
MakeNaiveDescriptor
(
const
LayoutShape
&
shape
,
const
LayoutStrides
&
strides
)
{
const
auto
unrolled_shape
=
UnrollNestedTuple
(
shape
);
if
constexpr
(
ck
::
is_same_v
<
LayoutStrides
,
Tuple
<>>
)
{
// If shape is packed
const
auto
column_major_packed_strides
=
GenerateColumnMajorPackedStrides
(
unrolled_shape
);
return
make_naive_tensor_descriptor
(
unrolled_shape
,
column_major_packed_strides
);
}
else
{
const
auto
unrolled_strides
=
UnrollNestedTuple
(
strides
);
static_assert
(
unrolled_shape
.
Size
()
==
unrolled_strides
.
Size
(),
"Size of strides and shape are not consistent."
);
return
make_naive_tensor_descriptor
(
unrolled_shape
,
unrolled_strides
);
}
}
using
MergedNestsDescriptorType
=
remove_cvref_t
<
decltype
(
TransformDesc
(
Shape
{},
DefaultIdxsTupleType
{},
FlattenDescriptorType
{}))
>
;
public:
using
NaiveDescriptorType
=
remove_cvref_t
<
decltype
(
MakeNaiveDescriptor
(
Shape
{},
Strides
{}))
>
;
__host__
__device__
constexpr
auto
GetElementSpaceSize
()
const
{
return
flatten_descriptor_
.
GetElementSpaceSize
();
}
__host__
__device__
Layout
()
=
delete
;
/**
* \brief Layout constructor.
*
* \param shape Shape for layout.
* \param strides Strides for layout (optional if tensor is packed).
* \return Layout object.
*/
__host__
__device__
Layout
()
=
delete
;
__host__
__device__
Layout
(
const
S
hape
&
shape
,
const
S
trides
&
strides
)
:
descriptor_
{}
__host__
__device__
constexpr
Layout
(
const
Shape
&
shape
,
const
Strides
&
strides
)
:
flatten_descriptor_
{},
s
hape
_
(
shape
)
,
s
trides
_
(
strides
)
{
// Construct if runtime mode
if
constexpr
(
!
Naive
DescriptorType
::
IsKnownAtCompileTime
())
if
constexpr
(
!
Flatten
DescriptorType
::
IsKnownAtCompileTime
())
{
// Keep only shape, strides are not need for transforms
shape_
=
shape
;
descriptor_
=
MakeNaiveDescriptor
(
shape
,
strides
);
flatten_descriptor_
=
MakeFlattenDescriptor
(
shape_
,
strides_
);
descriptor_1d_
=
MakeMerge1d
(
shape_
,
flatten_descriptor_
);
merged_nests_descriptor_
=
TransformDesc
(
shape_
,
DefaultIdxsTupleType
{},
flatten_descriptor_
);
}
}
__host__
__device__
Layout
(
const
Shape
&
shape
)
:
descriptor_
{}
/**
* \brief Layout constructor (with default packed column-major strides).
*
* \param shape Shape for layout.
*/
__host__
__device__
constexpr
Layout
(
const
Shape
&
shape
)
:
flatten_descriptor_
{},
shape_
(
shape
),
strides_
(
GenerateColumnMajorPackedStrides
(
shape_
))
{
if
constexpr
(
!
Naive
DescriptorType
::
IsKnownAtCompileTime
())
if
constexpr
(
!
Flatten
DescriptorType
::
IsKnownAtCompileTime
())
{
shape_
=
shape
;
descriptor_
=
MakeNaiveDescriptor
(
shape
,
Strides
{});
flatten_descriptor_
=
MakeFlattenDescriptor
(
shape_
,
strides_
);
descriptor_1d_
=
MakeMerge1d
(
shape_
,
flatten_descriptor_
);
merged_nests_descriptor_
=
TransformDesc
(
shape_
,
DefaultIdxsTupleType
{},
flatten_descriptor_
);
}
}
...
...
@@ -292,7 +310,9 @@ struct Layout
template
<
typename
Idxs
>
__host__
__device__
constexpr
index_t
operator
()()
const
{
using
TransformedDesc
=
decltype
(
TransformDesc
(
Shape
{},
Idxs
{}));
static_assert
(
FlattenDescriptorType
::
IsKnownAtCompileTime
(),
"Compiletime operator used on runtime layout."
);
using
TransformedDesc
=
decltype
(
TransformDesc
(
Shape
{},
Idxs
{},
FlattenDescriptorType
{}));
using
UnrolledIdx
=
decltype
(
UnrollNestedTuple
(
Idxs
{}));
return
TransformedDesc
{}.
CalculateOffset
(
UnrolledIdx
{});
}
...
...
@@ -306,9 +326,22 @@ struct Layout
template <typename... Ts>
__host__ __device__ index_t operator()(const Tuple<Ts...>& Idx) const
{
    // Static to construct transformed_desc only once
    static const auto transformed_desc = TransformDesc(shape_, Idx);
    return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
    if constexpr(!IsNestedTuple(Tuple<Ts...>{}) && Tuple<Ts...>::Size() == 1)
    {
        // if 1d access
        return descriptor_1d_.CalculateOffset(Idx);
    }
    else if constexpr(!IsNestedTuple(Tuple<Ts...>{}) && Tuple<Ts...>::Size() == Shape::Size())
    {
        // if Shape::Size() access (merged nested shapes)
        return merged_nests_descriptor_.CalculateOffset(UnrollNestedTuple(Idx));
    }
    else
    {
        // Custom index, need to transform descriptor
        const auto transformed_desc = TransformDesc(shape_, Idx, flatten_descriptor_);
        return transformed_desc.CalculateOffset(UnrollNestedTuple(Idx));
    }
}
/**
...
...
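The runtime operator() above dispatches on the form of the index tuple. The following is a minimal sketch (not part of the commit) of the three cases, assuming a Layout built through the wrapper's make_layout helper; the shape and index values are illustrative only.

#include "ck/wrapper/layout.hpp"

void layout_indexing_sketch()
{
    // Nested shape ((2, 3), 4) with packed column-major strides deduced by the wrapper.
    const auto nested_layout =
        ck::wrapper::make_layout(ck::make_tuple(ck::make_tuple(2, 3), 4));
    // 1) Single-element tuple: 1-d access, handled by descriptor_1d_.
    const ck::index_t off_1d = nested_layout(ck::make_tuple(5));
    // 2) Tuple with Shape::Size() elements: nested dims are treated as merged,
    //    handled by merged_nests_descriptor_.
    const ck::index_t off_merged = nested_layout(ck::make_tuple(4, 1));
    // 3) Any other (nested) index: the descriptor is transformed on the fly.
    const ck::index_t off_custom = nested_layout(ck::make_tuple(ck::make_tuple(1, 0), 2));
    (void)off_1d;
    (void)off_merged;
    (void)off_custom;
}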
@@ -338,7 +371,7 @@ struct Layout
 *
 * \return Calculated size.
 */
__host__ __device__ constexpr index_t GetLength() const
__host__ __device__ constexpr index_t GetLengths() const
{
    const auto unrolled_shape = UnrollNestedTuple(shape_);
    return TupleReduce<I0.value, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
...
...
@@ -346,80 +379,56 @@ struct Layout
}
/**
 * \brief Dimension getter.
 * \brief Shape getter.
 *
 * \tparam IDim Dimension idx.
 * \return Calculated size.
 * \return Shape.
 */
template <index_t IDim>
__host__ __device__ constexpr auto Get() const
{
    const auto elem = shape_.At(Number<IDim>{});
    return elem;
}
__host__ __device__ constexpr const Shape& GetShape() const { return shape_; }

private:
NaiveDescriptorType descriptor_;
Shape shape_;
};

// Layout helpers
// Length getter (product if tuple)
template <index_t idx, typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
    return layout.template GetLength<idx>();
}

// Get shape size (product of dims if tuple)
template <typename... ShapeDims>
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
{
    using UnrolledShape = decltype(UnrollNestedTuple(shape));
    return TupleReduce<0, UnrolledShape::Size()>([](auto x, auto y) { return x * y; },
                                                 UnrolledShape{});
}

// Get dim size (could be returned from get function)
template <typename T>
__host__ __device__ T constexpr size(const T& dim)
{
    return dim;
}

// Get layout size (product of shapes)
template <typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
    return layout.GetLength();
}

/**
 * \brief Strides getter.
 *
 * \return Strides.
 */
__host__ __device__ constexpr const DeducedStrides& GetStrides() const { return strides_; }

// Get shape element size
template <index_t idx, typename... ShapeDims>
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
{
    return size(shape.At(Number<idx>{}));
}

/**
 * \brief Get default lengths (tuple filled with Shape length elements).
 *
 * \return Default lengths.
 */
__host__ __device__ constexpr auto GetDefaultLengthsTuple() const
{
    return generate_tuple([&](auto i) { return GetLength<i>(); }, Number<Shape::Size()>{});
}

// Dim getter (tuple if tuple)
template <index_t idx, typename Shape, typename Strides>
__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
{
    return layout.template Get<idx>();
}

/**
 * \brief Get default start idx (tuple filled with 0s of the same size as Shape).
 *
 * \return Default start idx.
 */
__host__ __device__ constexpr auto GetDefaultStartIdxs() const
{
    return GenerateDefaultIdxsTuple(shape_);
}

template <typename Shape, typename Strides>
__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape,
                                                                 const Strides& strides)
{
    return Layout<Shape, Strides>(shape, strides);
}

/**
 * \brief Get default descriptor (with the same size as Shape)
 *
 * \return Default descriptor.
 */
__host__ __device__ constexpr MergedNestsDescriptorType GetDefaultDescriptor()
{
    return merged_nests_descriptor_;
}

template <typename Shape>
__host__ __device__ constexpr Layout<Shape> make_layout(const Shape& shape)
{
    return Layout<Shape>(shape);
}

private:
FlattenDescriptorType flatten_descriptor_;
Descriptor1dType descriptor_1d_;
MergedNestsDescriptorType merged_nests_descriptor_;
const Shape shape_;
const DeducedStrides strides_;
};

} // namespace tensor_transform_wrapper
} // namespace wrapper
} // namespace ck
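To make the diff above easier to follow, here is a small usage sketch (not part of the commit) of the reworked Layout: a packed layout is created through make_layout and an offset is computed with the runtime operator(). The shape is illustrative; under the wrapper's packed column-major convention, strides for shape (2, 4) come out as (1, 2).

#include "ck/wrapper/layout.hpp"

void layout_offset_sketch()
{
    // Packed layout: shape (2, 4), deduced column-major strides (1, 2).
    const auto layout = ck::wrapper::make_layout(ck::make_tuple(2, 4));
    // Offset of element (row 1, col 2): 1 * 1 + 2 * 2 = 5.
    const ck::index_t offset = layout(ck::make_tuple(1, 2));
    // Total number of elements: 2 * 4 = 8.
    const ck::index_t num_elems = ck::wrapper::size(layout);
    (void)offset;
    (void)num_elems;
}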
include/ck/wrapper/tensor.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "utils/tensor_utils.hpp"
#include "utils/layout_utils.hpp"

namespace ck {
namespace wrapper {

/**
 * \brief Tensor wrapper that performs static and dynamic buffer logic.
 *
 * \tparam BufferAddressSpace Memory type (Generic, Global, LDS, VGPR, SGPR).
 * \tparam ElementType Element data type.
 * \tparam Shape Tensor shape (layout component).
 * \tparam Strides Tensor strides (layout component).
 * \tparam NumVectors Number of vectors (only for VGPR, SGPR).
 * \tparam ScalarPerVector Scalars per vector (only for VGPR, SGPR).
 */
template <MemoryTypeEnum BufferAddressSpace,
          typename ElementType,
          typename Shape,
          typename Strides,
          index_t NumVectors,     // param for Register memory
          index_t ScalarPerVector // param for Register memory
          >
struct Tensor
{
    private:
    // Check if Tuple contains Slice object
    template <typename T>
    constexpr static bool IsSlicing(T&&)
    {
        return is_detected<is_slice, T>::value;
    }
    template <typename... Ts>
    constexpr static bool IsSlicing(Tuple<Ts...>&&)
    {
        return (IsSlicing(Ts{}) || ...);
    }

    // Calculate first index of new tensor after slice
    // It is needed to calculate offset for new tensor
    template <typename... Ts>
    constexpr auto GetStartIdxForSlicedTensor(const Tuple<Ts...>& idx) const
    {
        const auto start_idx_for_sliced_tensor = generate_tuple(
            [&](auto i) {
                constexpr auto num_i = Number<i>{};
                if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
                {
                    // if tuple then recurrence
                    return GetStartIdxForSlicedTensor(idx.At(num_i));
                }
                else if constexpr(is_detected<is_slice,
                                              tuple_element_t<i.value, Tuple<Ts...>>>::value)
                {
                    // if slice, return the beginning of the interval
                    return idx.At(num_i).from_;
                }
                else
                {
                    // if one dim selected
                    return idx.At(num_i);
                }
            },
            Number<Tuple<Ts...>::Size()>{});

        return start_idx_for_sliced_tensor;
    }

    // Calculate new tensor shape after slice
    template <typename... Ts, typename ShapeTmpType>
    constexpr auto GetShapeFromSlicedTensor(const Tuple<Ts...>& idx,
                                            const ShapeTmpType& shape) const
    {
        // Pack each value in tuple to remove empty tuples after generation
        auto new_shape = generate_tuple(
            [&](auto i) {
                constexpr auto num_i = Number<i>{};
                if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
                {
                    if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
                    {
                        // if tuple does not have any slice then we can remove dimension
                        return Tuple<>{};
                    }
                    else
                    {
                        // if tuple then recurrence
                        return make_tuple(GetShapeFromSlicedTensor(idx.At(num_i), shape.At(num_i)));
                    }
                }
                else if constexpr(is_detected<is_slice,
                                              tuple_element_t<i.value, Tuple<Ts...>>>::value)
                {
                    // calculate new dimension
                    const auto& dim = size(shape.At(num_i));
                    const auto val  = idx.At(num_i).range(dim);
                    return make_tuple(val);
                }
                else
                {
                    // remove dimension for just value
                    return Tuple<>{};
                }
            },
            Number<Tuple<Ts...>::Size()>{});
        // Remove empty tuples (deleted elements) and return
        return UnrollNestedTuple<0, 1>(new_shape);
    }

    template <typename... Ts, typename StridesTmpType>
    constexpr auto GetStridesFromSlicedTensor(const Tuple<Ts...>& idx,
                                              const StridesTmpType& strides) const
    {
        // Pack each value in tuple to remove empty tuples after generation
        auto new_strides = generate_tuple(
            [&](auto i) {
                constexpr auto num_i = Number<i>{};
                if constexpr(is_detected<is_tuple, tuple_element_t<i.value, Tuple<Ts...>>>::value)
                {
                    if constexpr(!IsSlicing(tuple_element_t<i.value, Tuple<Ts...>>{}))
                    {
                        // if tuple does not have any slice then we can remove dimension
                        return Tuple<>{};
                    }
                    else
                    {
                        // if tuple then recurrence
                        return make_tuple(
                            GetStridesFromSlicedTensor(idx.At(num_i), strides.At(num_i)));
                    }
                }
                else if constexpr(is_detected<is_slice,
                                              tuple_element_t<i.value, Tuple<Ts...>>>::value)
                {
                    // Stride will be the same
                    return make_tuple(strides.At(num_i));
                }
                else
                {
                    // remove dimension for just value
                    return Tuple<>{};
                }
            },
            Number<Tuple<Ts...>::Size()>{});
        // Remove empty tuples (deleted elements) and return
        return UnrollNestedTuple<0, 1>(new_strides);
    }

    public:
    using ElementSpaceSize = decltype(
        Layout<Shape, Strides>{Shape{}, Strides{}}.GetElementSpaceSize()); // SpaceSize type for buffer
    using TensorElementType = ElementType;                                 // DataType

    static constexpr MemoryTypeEnum TensorBufferAddressSpace = BufferAddressSpace;
    static constexpr bool IsDynamicBuffer = !(BufferAddressSpace == MemoryTypeEnum::Sgpr ||
                                              BufferAddressSpace == MemoryTypeEnum::Vgpr);

    __host__ __device__ Tensor() = delete;
    __host__ __device__ Tensor(ElementType* pointer, const Layout<Shape, Strides>& layout)
        : layout_(layout),
          buffer_(make_dynamic_buffer<BufferAddressSpace>(pointer, layout.GetElementSpaceSize()))
    {
    }

    __host__ __device__ Tensor(const Layout<Shape, Strides>& layout) : layout_(layout)
    {
        static_assert(!IsDynamicBuffer, "Wrong BufferAddressSpace for register.");
    }

    __host__ __device__ constexpr const Layout<Shape, Strides>& GetLayout() const
    {
        return layout_;
    }

    // Getter for new sliced tensor
    template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ auto operator[](const Tuple<Ts...>& idx) const
    {
        static_assert(IsDynamicBuffer, "Register slice is not supported");
        // Calculate offset based on first idx for new tensor
        const index_t offset = layout_(GetStartIdxForSlicedTensor(idx));

        auto new_shape = GetShapeFromSlicedTensor(idx, layout_.GetShape());
        if constexpr(is_same_v<Strides, Tuple<>>)
        {
            auto new_layout = make_layout(new_shape);
            return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
        }
        else
        {
            auto new_strides = GetStridesFromSlicedTensor(idx, layout_.GetStrides());
            auto new_layout  = make_layout(new_shape, new_strides);
            return make_tensor<BufferAddressSpace>(buffer_.p_data_ + offset, new_layout);
        }
    }

    template <typename... Ts, enable_if_t<IsSlicing(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ auto operator()(const Tuple<Ts...>& idx) const
    {
        return this->operator[](idx);
    }

    template <typename... Idxs, enable_if_t<IsSlicing(Tuple<Idxs...>{}), bool> = false>
    __host__ __device__ auto operator()(Idxs... idxs) const
    {
        return this->operator[](make_tuple(idxs...));
    }

    // Getter for the const value
    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ const ElementType& operator[](const Tuple<Ts...>& idx) const
    {
        if constexpr(IsDynamicBuffer)
        {
            const index_t offset = layout_(idx);
            return buffer_[offset];
        }
        else
        {
            if constexpr(is_same_v<Strides, Tuple<>>)
            {
                constexpr index_t offset =
                    Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
                return buffer_[Number<offset>{}];
            }
            else
            {
                constexpr index_t offset =
                    Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
                return buffer_[Number<offset>{}];
            }
        }
    }

    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ const ElementType& operator()(const Tuple<Ts...>& idx) const
    {
        return this->operator[](idx);
    }

    template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
    __host__ __device__ const ElementType& operator()(Idxs... idxs) const
    {
        return this->operator[](make_tuple(idxs...));
    }

    // Getter for the value reference
    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ ElementType& operator[](const Tuple<Ts...>& idx)
    {
        if constexpr(IsDynamicBuffer)
        {
            const index_t offset = layout_(idx);
            return buffer_(offset);
        }
        else
        {
            if constexpr(is_same_v<Strides, Tuple<>>)
            {
                constexpr index_t offset =
                    Layout<Shape, Strides>{Shape{}}.template operator()<Tuple<Ts...>>();
                return buffer_(Number<offset>{});
            }
            else
            {
                constexpr index_t offset =
                    Layout<Shape, Strides>{Shape{}, Strides{}}.template operator()<Tuple<Ts...>>();
                return buffer_(Number<offset>{});
            }
        }
    }

    template <typename... Ts, enable_if_t<!IsSlicing(Tuple<Ts...>{}), bool> = false>
    __host__ __device__ ElementType& operator()(const Tuple<Ts...>& idx)
    {
        return this->operator[](idx);
    }

    template <typename... Idxs, enable_if_t<!IsSlicing(Tuple<Idxs...>{}), bool> = false>
    __host__ __device__ ElementType& operator()(Idxs... idxs)
    {
        return this->operator[](make_tuple(idxs...));
    }

    __host__ __device__ constexpr auto GetDefaultDescriptor()
    {
        return layout_.GetDefaultDescriptor();
    }

    private:
    using DynamicBufferType = DynamicBuffer<BufferAddressSpace,
                                            ElementType,
                                            ElementSpaceSize,
                                            true /*InvalidElementUseNumericalZeroValue*/>;
    using StaticBufferType  = StaticBufferTupleOfVector<BufferAddressSpace,
                                                        ElementType,
                                                        NumVectors,
                                                        ScalarPerVector,
                                                        true /*InvalidElementUseNumericalZeroValue*/>;
    // If register use static buffer, else use dynamic buffer
    using Buffer = std::conditional_t<IsDynamicBuffer, DynamicBufferType, StaticBufferType>;

    const Layout<Shape, Strides> layout_;
    Buffer buffer_;
};

} // namespace wrapper
} // namespace ck
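A brief usage sketch (not part of the commit) for the new Tensor wrapper: it pairs an existing global-memory pointer with a Layout and routes element access through the layout's offset calculation. The pointer, shape values, and the slice(...) helper used below are assumptions for illustration; a slice helper is expected to live in utils/tensor_utils.hpp, while tensor.hpp only detects such objects through is_slice.

#include "ck/wrapper/tensor.hpp"

__device__ void tensor_usage_sketch(float* p_global)
{
    using namespace ck;
    using namespace ck::wrapper;
    // Packed 4x8 layout over the raw global pointer.
    const auto layout = make_layout(make_tuple(4, 8));
    auto tensor       = make_tensor<MemoryTypeEnum::Global>(p_global, layout);
    // Scalar access goes through the layout's offset calculation.
    tensor(make_tuple(1, 2)) = 1.0f;
    // Sliced view (slice(from, to) is an assumed helper): rows 0..4, columns 2..6,
    // which the operators above turn into a 4x4 sub-tensor at the right offset.
    auto sub_tensor = tensor(slice(0, 4), slice(2, 6));
    sub_tensor(make_tuple(0, 0)) = 2.0f;
}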
include/ck/wrapper/utils/layout_utils.hpp
0 → 100644
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/ck.hpp"
#include "ck/utility/number.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/tuple_helper.hpp"
#include "ck/utility/sequence.hpp"
#include "ck/utility/sequence_helper.hpp"
#include "ck/utility/is_detected.hpp"

#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/multi_index_transform_helper.hpp"

namespace ck {
namespace wrapper {

// Disable from doxygen docs generation
/// @cond
// forward declaration
template <typename Shape, typename Strides>
struct Layout;

template <typename T>
using is_tuple = decltype(std::declval<T&>().IsTuple());
/// @endcond

// make_*
/**
 * \brief Make layout function.
 *
 * \tparam Shape Shape for layout.
 * \tparam Strides Strides for layout.
 * \return Constructed layout.
 */
template <typename Shape, typename Strides>
__host__ __device__ constexpr Layout<Shape, Strides> make_layout(const Shape& shape,
                                                                 const Strides& strides)
{
    return Layout<Shape, Strides>(shape, strides);
}

/**
 * \brief Make layout function with packed strides
 * (column-major).
 *
 * \tparam Shape Shape for layout.
 * \return Constructed layout.
 */
template <typename Shape>
__host__ __device__ constexpr Layout<Shape, Tuple<>> make_layout(const Shape& shape)
{
    return Layout<Shape, Tuple<>>(shape);
}

// Layout helpers
// get
// Get dim (could be returned from get with empty Idxs)
/**
 * \private
 */
template <typename T>
__host__ __device__ T constexpr get(const T& dim)
{
    return dim;
}

/**
 * \brief Get element from tuple (Shape/Strides/Idxs).
 *
 * \tparam idx Index to lookup.
 * \param tuple Tuple to lookup.
 * \return Requested element.
 */
template <index_t idx, typename... Dims>
__host__ __device__ constexpr auto get(const Tuple<Dims...>& tuple)
{
    return tuple.At(Number<idx>{});
}

/**
 * \brief Get sub layout.
 *
 * \tparam idx Index to lookup.
 * \param layout Layout to create sub layout.
 * \return Requested sub layout.
 */
template <index_t idx, typename Shape, typename Strides>
__host__ __device__ constexpr auto get(const Layout<Shape, Strides>& layout)
{
    const auto& shape     = layout.GetShape();
    const auto& new_shape = get<idx>(shape);
    static_assert(is_detected<is_tuple, decltype(new_shape)>::value,
                  "Shape of sub layout must be tuple");
    if constexpr(is_same_v<Strides, Tuple<>>)
    {
        // If stride not passed, create without strides
        return make_layout(new_shape);
    }
    else
    {
        const auto& strides     = layout.GetStrides();
        const auto& new_strides = get<idx>(strides);
        static_assert(is_detected<is_tuple, decltype(new_strides)>::value,
                      "Strides of sub layout must be tuple");
        return make_layout(new_shape, new_strides);
    }
}

/**
 * \brief Hierarchical get.
 *
 * \tparam Idxs Indexes to lookup.
 * \param elem Element to lookup.
 * \return Requested element.
 */
template <index_t Idx, index_t... Idxs, typename T>
__host__ __device__ constexpr auto get(const T& elem)
{
    return get<Idxs...>(get<Idx>(elem));
}

// size
// Get dim size (could be returned from get function)
/**
 * \private
 */
template <typename T>
__host__ __device__ T constexpr size(const T& dim)
{
    return dim;
}

/**
 * \brief Length get (product if tuple).
 *
 * \tparam idx Index to lookup.
 * \param layout Layout to get Shape of.
 * \return Requested length.
 */
template <index_t idx, typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
    return layout.template GetLength<idx>();
}

/**
 * \brief Shape size (product of dims).
 *
 * \param shape Shape to lookup.
 * \return Requested size.
 */
template <typename... ShapeDims>
__host__ __device__ constexpr index_t size(const Tuple<ShapeDims...>& shape)
{
    const auto unrolled_shape = UnrollNestedTuple(shape);
    return TupleReduce<0, unrolled_shape.Size()>([](auto x, auto y) { return x * y; },
                                                 unrolled_shape);
}

/**
 * \brief Layout size (product of dims).
 *
 * \param layout Layout to calculate shape size.
 * \return Requested size.
 */
template <typename Shape, typename Strides>
__host__ __device__ constexpr index_t size(const Layout<Shape, Strides>& layout)
{
    return layout.GetLengths();
}

/**
 * \brief Length get from tuple (product if tuple).
 *
 * \tparam idx Index to lookup.
 * \param tuple Tuple to lookup.
 * \return Requested length.
 */
template <index_t idx, typename... Ts>
__host__ __device__ constexpr index_t size(const Tuple<Ts...>& tuple)
{
    return size(tuple.At(Number<idx>{}));
}

/**
 * \brief Hierarchical size.
 *
 * \tparam Idx First index to lookup (to avoid empty Idxs).
 * \tparam Idxs Next indexes to lookup.
 * \param elem Element to lookup.
 * \return Requested element.
 */
template <index_t Idx, index_t... Idxs, typename T>
__host__ __device__ constexpr auto size(const T& elem)
{
    return size(get<Idx, Idxs...>(elem));
}

// rank
/**
 * \brief Get layout rank (num elements in shape).
 *
 * \param layout Layout to calculate rank.
 * \return Requested rank.
 */
template <typename Shape, typename Strides>
__host__ __device__ constexpr auto rank([[maybe_unused]] const Layout<Shape, Strides>& layout)
{
    return Shape::Size();
}

/**
 * \brief Get tuple rank (num elements in tuple).
 * Return 1 if scalar passed.
 *
 * \param tuple Tuple to calculate rank.
 * \return Requested rank.
 */
template <typename... Dims>
__host__ __device__ constexpr auto rank([[maybe_unused]] const Tuple<Dims...>& tuple)
{
    return Tuple<Dims...>::Size();
}

/**
 * \private
 */
template <index_t IDim>
__host__ __device__ constexpr index_t rank(const Number<IDim>&)
{
    return 1;
}

/**
 * \private
 */
__host__ __device__ constexpr index_t rank(const index_t&)
{
    return 1;
}

/**
 * \brief Hierarchical rank.
 *
 * \tparam Idxs Indexes to lookup.
 * \param elem Element to lookup.
 * \return Requested rank.
 */
template <index_t... Idxs, typename T>
__host__ __device__ constexpr auto rank(const T& elem)
{
    return rank(get<Idxs...>(elem));
}

// depth
/**
 * \brief Get depth of the layout shape (return 0 if scalar).
 *
 * \param layout Layout to calculate depth.
 * \return Requested depth.
 */
template <typename Shape, typename Strides>
__host__ __device__ constexpr auto depth(const Layout<Shape, Strides>& layout)
{
    const auto& shape = layout.GetShape();
    return TupleDepth(shape);
}

/**
 * \brief Get depth of the tuple (return 0 if scalar).
 *
 * \param tuple Tuple to calculate depth.
 * \return Requested depth.
 */
template <typename... Dims>
__host__ __device__ constexpr auto depth(const Tuple<Dims...>& tuple)
{
    return TupleDepth(tuple);
}

/**
 * \private
 */
template <index_t IDim>
__host__ __device__ constexpr index_t depth(const Number<IDim>&)
{
    return 0;
}

/**
 * \private
 */
__host__ __device__ constexpr index_t depth(const index_t&)
{
    return 0;
}

/**
 * \brief Hierarchical depth.
 *
 * \tparam Idxs Indexes to lookup.
 * \param elem Element to lookup.
 * \return Requested depth.
 */
template <index_t... Idxs, typename T>
__host__ __device__ constexpr auto depth(const T& elem)
{
    return depth(get<Idxs...>(elem));
}

/**
 * \brief Get Layout strides.
 *
 * \param layout Layout to get strides from.
 * \return Requested strides.
 */
template <typename Shape, typename Strides>
__host__ __device__ constexpr const auto& stride(const Layout<Shape, Strides>& layout)
{
    return layout.GetStrides();
}

/**
 * \brief Get Layout shape.
 *
 * \param layout Layout to get shape from.
 * \return Requested shape.
 */
template <typename Shape, typename Strides>
__host__ __device__ constexpr const auto& shape(const Layout<Shape, Strides>& layout)
{
    return layout.GetShape();
}

} // namespace wrapper
} // namespace ck
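A short sketch (not part of the commit) of what the free-function helpers above report for a nested layout; the shape is illustrative and the commented values follow from the definitions of size, rank, and depth shown in this file.

#include "ck/wrapper/layout.hpp"

void layout_utils_sketch()
{
    using namespace ck;
    using namespace ck::wrapper;
    // Nested shape: dim 0 = 2, dim 1 = (3, 4); packed strides are deduced.
    const auto layout = make_layout(make_tuple(2, make_tuple(3, 4)));
    const index_t total = size(layout);    // 2 * 3 * 4 = 24 (product of all lengths)
    const index_t len_1 = size<1>(layout); // 3 * 4 = 12 (nested dim reduced by product)
    const auto r        = rank(layout);    // 2: two top-level dimensions
    const auto d        = depth(layout);   // 2: a tuple nested inside the shape tuple
    const auto& s       = shape(layout);   // the underlying shape tuple
    (void)total;
    (void)len_1;
    (void)r;
    (void)d;
    (void)s;
}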