yangql / composable_kernel-1 / Commits / b12bbcee

Commit b12bbcee, authored Sep 26, 2019 by Chao Liu

clean up

parent 51a9fa1d
Showing 16 changed files with 2372 additions and 2299 deletions (+2372 -2299)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp  +32 -19
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp  +35 -22
composable_kernel/include/tensor_description/tensor_coordinate.hpp  +116 -246
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp  +346 -0
composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp  +1 -1
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp  +0 -215
composable_kernel/include/tensor_description/tensor_view.hpp  +2 -2
composable_kernel/include/tensor_description/tensor_visit.hpp  +2 -2
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  +1 -670
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp  +692 -0
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  +7 -1113
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp  +1129 -0
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  +1 -1
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  +2 -2
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp  +1 -1
driver/src/driver.cpp  +5 -5
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -5,9 +5,9 @@
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_generic_tensor_slice_copy_deprecated.hpp"
 #include "blockwise_gemm.hpp"
-#include "threadwise_generic_tensor_slice_copy.hpp"
+#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"

 namespace ck {
...
@@ -265,8 +265,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.Run(p_in_global, p_in_block_double);
-            blockwise_wei_copy.Run(p_wei_global, p_wei_block_double);
+            blockwise_in_copy.template Run<Float, address_space_t::global>(p_in_global,
+                                                                           p_in_block_double);
+            blockwise_wei_copy.template Run<Float, address_space_t::global>(p_wei_global,
+                                                                            p_wei_block_double);
         }

         // LDS double buffer: main body
...
@@ -288,8 +290,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float* p_wei_block_next =
                 even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

-            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
             blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
...
@@ -297,23 +299,25 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             __syncthreads();

             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);
+            blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                p_in_global, p_in_thread_buffer);
+            blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                p_wei_global, p_wei_thread_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
-            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
+            blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
+            blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
         }
     }

     // LDS double buffer: tail
     {
         // even iteration
-        Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-        Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+        Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+        Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

         blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
         blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
...
@@ -321,17 +325,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         __syncthreads();

         // LDS doubel buffer: load next data from device mem
-        blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-        blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_global, p_wei_register_buffer);
+        blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+            p_in_global, p_in_thread_buffer);
+        blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+            p_wei_global, p_wei_thread_buffer);

         // LDS double buffer: GEMM on current data
         blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

         // LDS double buffer: store next data to LDS
-        blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
-                                                 p_in_block_double + in_block_space);
-        blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
-                                                  p_wei_block_double + wei_block_space);
+        blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer,
+                                               p_in_block_double + in_block_space);
+        blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer,
+                                                p_wei_block_double + wei_block_space);

         // odd iteration
         __syncthreads();
...
@@ -390,7 +396,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                      0,
                      0,
                      b_thread_data_on_global,
                      0})
-                .template Run_amd_experiment<Float, 0, 2>(p_out_thread, p_out_global);
+#if 0
+                .Run
+#else
+                // tweaking
+                .template Run_optimized_address_calculation<Float, address_space_t::generic,
+                                                            address_space_t::global>
+#endif
+                (p_out_thread, p_out_global);
         }
     }
 };
...
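The preload / main-body / tail structure visible in the hunks above is a standard LDS double-buffering pipeline. The following stand-alone HIP/CUDA sketch (hypothetical names and interfaces, not the CK classes in this diff; it assumes a copy type exposing a constexpr GetThreadBufferSize()) shows the pattern the kernel follows: while the GEMM consumes one LDS buffer, the next tile is staged from global memory through a per-thread buffer into the other LDS buffer.

// Hypothetical, simplified skeleton of LDS double buffering (sketch only).
template <typename Float, typename Copy, typename Gemm>
__device__ void run_lds_double_buffer(const Float* p_a_global,
                                      Float* p_a_lds_double, // 2 * block_space elements
                                      Float* p_c_thread,
                                      Copy& copy,
                                      Gemm& gemm,
                                      int num_loops,
                                      int block_space)
{
    // preload iteration 0 into the first LDS buffer
    copy.Run(p_a_global, p_a_lds_double);

    for(int i = 0; i < num_loops - 1; ++i)
    {
        Float* p_now  = p_a_lds_double + (i % 2) * block_space;       // buffer being consumed
        Float* p_next = p_a_lds_double + ((i + 1) % 2) * block_space; // buffer being filled

        Float p_a_thread_buffer[Copy::GetThreadBufferSize()];

        copy.MoveSrcSliceWindow(); // advance the global read window to iteration i+1

        __syncthreads();

        copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer); // global -> registers
        gemm.Run(p_now, p_c_thread);                             // GEMM on current LDS data
        copy.RunStoreThreadBuffer(p_a_thread_buffer, p_next);    // registers -> next LDS buffer
    }

    // tail: consume the last buffer that was filled
    __syncthreads();
    gemm.Run(p_a_lds_double + ((num_loops - 1) % 2) * block_space, p_c_thread);
}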
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -5,9 +5,9 @@
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
 #include "ConstantMatrixDescriptor.hpp"
-#include "blockwise_generic_tensor_slice_copy.hpp"
+#include "blockwise_generic_tensor_slice_copy_deprecated.hpp"
 #include "blockwise_gemm.hpp"
-#include "threadwise_generic_tensor_slice_copy.hpp"
+#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"

 namespace ck {
...
@@ -251,8 +251,10 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
         // LDS double buffer: preload data into LDS
         {
-            blockwise_in_copy.Run(p_in_global, p_in_block_double);
-            blockwise_wei_copy.Run(p_wei_global, p_wei_block_double);
+            blockwise_in_copy.template Run<Float, address_space_t::global>(p_in_global,
+                                                                           p_in_block_double);
+            blockwise_wei_copy.template Run<Float, address_space_t::global>(p_wei_global,
+                                                                            p_wei_block_double);
         }

         // LDS double buffer: main body
...
@@ -274,51 +276,54 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
             Float* p_wei_block_next =
                 even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

-            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

             blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
-            p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];
+            blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

             __syncthreads();

             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global,
-                                                     p_wei_register_buffer);
+            blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                p_in_global, p_in_thread_buffer);
+            blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+                p_wei_global, p_wei_thread_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
-            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
+            blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer, p_in_block_next);
+            blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer, p_wei_block_next);
         }
     }

     // LDS double buffer: tail
     {
-        Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
-        Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
+        Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
+        Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

         // even iteration
         blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
-        p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStrides()[0];
+        blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);

         __syncthreads();

         // LDS doubel buffer: load next data from device mem
-        blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
-        blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);
+        blockwise_in_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+            p_in_global, p_in_thread_buffer);
+        blockwise_wei_copy.template RunLoadThreadBuffer<Float, address_space_t::global>(
+            p_wei_global, p_wei_thread_buffer);

         // LDS double buffer: GEMM on current data
         blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

         // LDS double buffer: store next data to LDS
-        blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
-                                                 p_in_block_double + in_block_space);
-        blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
-                                                  p_wei_block_double + wei_block_space);
+        blockwise_in_copy.RunStoreThreadBuffer(p_in_thread_buffer,
+                                               p_in_block_double + in_block_space);
+        blockwise_wei_copy.RunStoreThreadBuffer(p_wei_thread_buffer,
+                                                p_wei_block_double + wei_block_space);

         // odd iteration
         __syncthreads();
...
@@ -385,7 +390,15 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
         for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
         {
-            threadwise_out_copy.Run(p_out_thread, p_out_global);
+            threadwise_out_copy
+#if 1
+                .Run
+#else // tweaking
+                .template Run_optimized_address_calculation<Float,
+                                                            address_space_t::generic,
+                                                            address_space_t::global>
+#endif
+                (p_out_thread, p_out_global);

             threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
             threadwise_out_copy.MoveDstSliceWindow(Sequence<0, 0, B1>{}, True);
...
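In the v4r4 kernel the weight copy switches from raw pointer arithmetic on p_wei_block_on_global to MoveSrcSliceWindow on the copier itself. A minimal sketch of what such a call amounts to for a plain (non-merged) 2-D source, with hypothetical names:

// Hypothetical illustration only: the copier tracks an internal offset instead of the
// caller bumping a raw pointer; for a non-merged dimension the move is just step * stride.
#include <array>
#include <cstddef>

struct SliceWindow2D
{
    std::array<std::size_t, 2> strides; // {stride_E, stride_K}
    std::size_t offset = 0;             // runtime offset into the source tensor

    // step = {dE, dK}; a positive move adds dE*stride_E + dK*stride_K to the offset
    void MoveSrcSliceWindow(std::array<std::size_t, 2> step)
    {
        offset += step[0] * strides[0] + step[1] * strides[1];
    }
};

// usage: window.MoveSrcSliceWindow({EPerBlock, 0}) has the same effect as the old
// "p_wei_block_on_global += EPerBlock * stride_E" when E is a normal dimension; the
// member-function form also covers merged dimensions, which need carry/borrow handling.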
composable_kernel/include/tensor_description/tensor_coordinate.hpp

-#ifndef CK_TENSOR_COORDINATE_HPP
-#define CK_TENSOR_COORDINATE_HPP
+#ifndef CK_TENSOR_COORDINATE_V2_HPP
+#define CK_TENSOR_COORDINATE_V2_HPP

 #include "common_header.hpp"
-#include "ConstantTensorDescriptor.hpp"
-#include "ConstantMergedTensorDescriptor.hpp"
+#include "dimension.hpp"
+#include "multi_index_transform.hpp"
+#include "tensor_descriptor.hpp"

 namespace ck {

-// TensorDesc is ConstantTensorDescriptor
-template <class TensorDesc>
-struct NormalTensorCoordinate
+template <typename TensorDesc>
+struct TensorCoordinate;
+
+template <typename NativeTensorDesc>
+struct NativeTensorCoordinate
 {
-    using type             = NormalTensorCoordinate;
-    using tensor_desc_type = TensorDesc;
+    using type             = NativeTensorCoordinate;
+    using tensor_desc_type = NativeTensorDesc;

     static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

+    using Index = MultiIndex<nDim>;
+
-    __host__ __device__ constexpr NormalTensorCoordinate(Array<index_t, nDim> tensor_index)
-        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
+    __host__ __device__ constexpr NativeTensorCoordinate(Index idx)
+        : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
     {
     }

-    template <class... Xs>
-    __host__ __device__ constexpr NormalTensorCoordinate(Xs... xs)
-        : NormalTensorCoordinate(Array<index_t, nDim>{xs...})
+    template <typename... Xs>
+    __host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
+        : NativeTensorCoordinate(Index{xs...})
     {
     }

     template <index_t... Xs>
-    __host__ __device__ constexpr NormalTensorCoordinate(Sequence<Xs...>)
-        : NormalTensorCoordinate(Array<index_t, nDim>{Xs...})
+    __host__ __device__ constexpr NativeTensorCoordinate(Sequence<Xs...>)
+        : NativeTensorCoordinate(Index{Xs...})
     {
     }

-    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
+    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }
+
+    __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }
+
+    __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }

-    // T is Array or Sequence
-    template <class T>
-    __host__ __device__ type operator+=(T step_sizes)
+    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
     {
-        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim,
-                      "wrong!");
+        // mIndex is updated here, but some (or all) of its entries may never be used
+        // compiler should remove those entries as dead code
+        mIndex += idx_diff;

-        mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
+        mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);

         return *this;
     }

-    template <class T>
-    __host__ __device__ type operator-=(T step_sizes)
+    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
     {
-        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim,
-                      "wrong!");
+        // mIndex is updated here, but some (or all) of its entries may never be used
+        // compiler should remove those entries as dead code
+        mIndex -= idx_diff;

-        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);
+        mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);

         return *this;
     }

-    template <class T>
-    __host__ __device__ constexpr type operator+(T step_sizes) const
+    __host__ __device__ constexpr type operator+(const Index& idx_diff) const
     {
         type coord = *this;
-        coord += step_sizes;
+        coord += idx_diff;
         return coord;
     }

-    template <class T>
-    __host__ __device__ constexpr type operator-(T step_sizes) const
+    __host__ __device__ constexpr type operator-(const Index& idx_diff) const
     {
         type coord = *this;
-        coord -= step_sizes;
+        coord -= idx_diff;
         return coord;
     }

-    // reposition point of origin, and return compensated offset.
-    // This is a hack to reduce index calculation during looping over
-    // a tensor whose origin is this TensorCoordinate. It does so, by spitting
-    // out the run-time offset to the pointer (to the tensor data) held by this
-    // TensorCoordiante, so the caller can add the offset into the run-time pointer of
-    // the data, so only 1 run-time variable (update pointer) is needed, instead
-    // of 2 run-time variables (old pointer and this offset)
-    // TODO: after introducing the concept of "run-time tensor view", which contains the
-    // run-time pointer to the data, always keep track of the pointer, instead of both
-    // offset and the pointer. This also bring additional benefit that we don't need to
-    // worry the offset might underflow (because offset is unsigned integer) when updating it.
-    __host__ __device__ constexpr index_t RepositionOrigin()
-    {
-        index_t offset_diff = mOffset;
-        mOffset             = 0;
-        return offset_diff;
-    }
+    __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

     private:
+    // mIndex may be saved and updated, however, the value of some (or all) of its entries may
+    // never be used. Compiler should be able to remove these entries as well as its calculation
+    // as dead code.
+    // TODO: make sure compiler indeed remove these dead code
+    Index mIndex;
     index_t mOffset;
 };

-// TensorDesc is ConstantMergedTensorDescriptor
-template <class TensorDesc>
-struct MergedTensorCoordinate
+template <typename TransformedTensorDesc>
+struct TransformedTensorCoordinate
 {
-    using type             = MergedTensorCoordinate;
-    using tensor_desc_type = TensorDesc;
+    using tensor_desc_type = TransformedTensorDesc;
+
+    using LowerCoord =
+        typename TensorCoordinate<decltype(tensor_desc_type::GetLowerTensorDescriptor())>::type;
+    using UpperCoord = TransformedTensorCoordinate;

     static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
-    static constexpr index_t nOriginalDim =
-        tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();
+    using UpperIndex = MultiIndex<nDim>;

-    __host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
-        : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
+    __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
+        : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
     {
-        // partial offset on each dimension
-        static_for<0, nDim, 1>{}([&](auto idim) {
-            constexpr auto partial_original_dims =
-                tensor_desc_type::GetContainedOriginalDimensions(idim);
-
-            constexpr auto partial_original_desc =
-                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);
-
-            mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
-                extract_array(mOriginalIndex, partial_original_dims));
-        });
-
-        // complete offset
-        mOffset =
-            accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
     }

-    template <class... Xs>
-    __host__ __device__ constexpr MergedTensorCoordinate(Xs... xs)
-        : MergedTensorCoordinate(Array<index_t, nDim>{xs...})
+    template <typename... Xs>
+    __host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
+        : TransformedTensorCoordinate(UpperIndex{xs...})
+    {
+    }
+
+    template <index_t... Xs>
+    __host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
+        : TransformedTensorCoordinate(UpperIndex{Xs...})
     {
     }

-    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }
+    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

-    template <class IDim, class T, bool PositiveDirection>
-    __host__ __device__ void
-    MoveOnDimension(IDim idim_, T step_size, integral_constant<bool, PositiveDirection>)
+    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const
     {
-        constexpr auto idim = idim_;
+        return mCoordLow;
+    }

-        // if step_size is known at compile time
-        static_if<is_static<T>::value>{}(
-            [&](auto) { static_if<T{} == 0>{}([&](auto) { return; }); });
+    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }

-        // update original index
-        static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
-            constexpr auto partial_original_dims =
-                tensor_desc_type::GetContainedOriginalDimensions(idim);
-
-            constexpr index_t ndim_partial_original = partial_original_dims.GetSize();
-
-            constexpr auto partial_original_desc =
-                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);
-
-            const auto partial_original_step_sizes =
-                partial_original_desc.GetMultiIndexFrom1dIndex(step_size);
-
-            // update partial original multi-id
-            auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims);
-
-            static_if<PositiveDirection>{}([&](auto) {
-                partial_original_id += partial_original_step_sizes;
-
-                bool carry = false;
-
-                // do carry check in reversed order, starting from lowest dimension
-                // don't check the highest dimension
-                static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
-                    constexpr index_t i = ndim_partial_original - 1 - IReverse;
-
-                    if(carry)
-                    {
-                        ++partial_original_id(i);
-                    }
-
-                    carry = false;
-
-                    if(partial_original_id[i] >= partial_original_desc.GetLength(i))
-                    {
-                        partial_original_id(i) -= partial_original_desc.GetLength(i);
-                        carry = true;
-                    }
-                });
-
-                // highest dimension
-                if(carry)
-                {
-                    ++partial_original_id(0);
-                }
-            }).Else([&](auto) {
-                // shift up multi-id to avoid unsigned integer underflow during intermediate
-                // calculations. After the shift, should have new_multi_id[...] >= 1
-                partial_original_id +=
-                    partial_original_desc.GetLengths() - partial_original_step_sizes;
-
-                bool borrow = false;
-
-                // do borrow check in reversed order, starting from lowest dimension
-                // don't check the highest dimension
-                static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
-                    constexpr index_t i = ndim_partial_original - 1 - IReverse;
-
-                    if(borrow)
-                    {
-                        --partial_original_id(i);
-                    }
-
-                    borrow = false;
-
-                    if(partial_original_id[i] < partial_original_desc.GetLength(i))
-                    {
-                        partial_original_id(i) += partial_original_desc.GetLength(i);
-                        borrow = true;
-                    }
-                });
-
-                // highest dimension
-                if(borrow)
-                {
-                    --partial_original_id(0);
-                }
-
-                // shift back down multi-id
-                // here, should have new_multi_id[...] >= GetLengths()
-                partial_original_id = partial_original_id - partial_original_desc.GetLengths();
-            });
-
-            // update "mOriginalIndex"
-            static_for<0, ndim_partial_original, 1>{}([&](auto I) {
-                constexpr auto idim_original = partial_original_dims[I];
-
-                mOriginalIndex(idim_original) = partial_original_id[I];
-            });
-
-            // calculate new partial offset on this merged dimension
-            const index_t old_partial_offset = mPartialOffsets[idim];
-
-            mPartialOffsets(idim) =
-                partial_original_desc.GetOffsetFromMultiIndex(partial_original_id);
-
-            // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
-            mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
-        }).Else([&](auto fwd) {
-            static_if<PositiveDirection>{}([&](auto) {
-                mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
-            }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
-        });
-    }
+    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }
+
+    __host__ __device__ constexpr const index_t& GetOffset() const
+    {
+        return GetLowerCoordinate().GetOffset();
+    }

-    // T is Array or Sequence
-    template <class T>
-    __host__ __device__ type operator+=(T step_sizes)
+    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
     {
-        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim,
-                      "wrong!");
+        // For transformation of multi-index difference, not all transformation functions need to
+        // know the old lower-index or the old upper-index. We pass both of them to the
+        // transformation function. The transformation function itself decides to use them or not.
+        mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
+            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

-        static_for<0, nDim, 1>{}([&](auto idim) {
-            // compiler should remove dead code path, because step_sizes is known at
-            // compile time
-            if(step_sizes[idim] != 0)
-            {
-                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
-            }
-        });
+        // mIndexUp is updated here, but some (or all) of its entries may never be used
+        // compiler should remove those entries as dead code
+        mIndexUp += idx_up_diff;

         return *this;
     }

-    template <class T>
-    __host__ __device__ type operator-=(T step_sizes)
+    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
     {
-        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim,
-                      "wrong!");
+        mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
+            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

-        static_for<0, nDim, 1>{}([&](auto idim) {
-            // compiler should remove dead code path, because step_sizes is known at
-            // compile time
-            if(step_sizes[idim] != 0)
-            {
-                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
-            }
-        });
+        // mIndex is updated here, but some (or all) of its entries may never be used
+        // compiler should remove those entries as dead code
+        mIndexUp -= idx_up_diff;

         return *this;
     }

-    template <class T>
-    __host__ __device__ constexpr type operator+(T step_sizes) const
+    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
     {
-        type coord = *this;
-        coord += step_sizes;
-        return coord;
+        UpperCoord coord_up = *this;
+        coord_up += idx_up_diff;
+        return coord_up;
     }

-    template <class T>
-    __host__ __device__ constexpr type operator-(T step_sizes) const
+    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
     {
-        type coord = *this;
-        coord -= step_sizes;
-        return coord;
+        UpperCoord coord_up = *this;
+        coord_up -= idx_up_diff;
+        return coord_up;
     }

-    __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }
+    // this function should be inexpensive, because there is no upper-to-lower index transformation
+    __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
+    {
+        return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
+               mCoordLow.IsUpperIndexMappedToValidOffset();
+    }

     private:
-    // Allocate register memory for all merged dimensions and normal dimensions.
-    // However, only those merged dimensions, whose index will be involved in arithmetic
-    // after the construction of this TensorCoordinate (e.g. when user move a slicing
-    // window on the merged dimension), will use these register memory.
-    // Let's hope compiler will optimize away those register memory allocated for normal
-    // dimensions, and those merged dimensions, that would never be involved in index
-    // arithmetic after construction of TensorCoordinate.
-    // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
-    // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to
-    // count on compiler to optimize away those register memory for us
-    Array<index_t, nOriginalDim> mOriginalIndex;
-    Array<index_t, nDim> mPartialOffsets;
-
-    // complete offset
-    index_t mOffset;
+    // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries
+    // may never be used. Compiler should be able to remove these entries as well as its
+    // calculation as dead code.
+    // TODO: make sure compiler indeed remove these dead code
+    UpperIndex mIndexUp;
+    LowerCoord mCoordLow;
 };

-template <class TensorDesc>
+template <typename TensorDesc>
 struct TensorCoordinate
 {
     private:
-    template <class... Ts>
+    template <typename... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
-        return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
+        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
+            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
     }

-    template <class... Ts>
+    template <typename... Ts>
     __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
-        return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
+        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
+            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
     }

     public:
...
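The new coordinate types above split responsibility across layers: NativeTensorCoordinate owns the linear offset, and TransformedTensorCoordinate only translates an upper-index difference into a lower-index difference and forwards it. A much-simplified sketch of that layering for a single merged dimension (hypothetical types, non-negative steps only, not the CK implementation):

// Hypothetical, simplified analogue of the Native/Transformed coordinate pair.
#include <array>
using index_t = int;

struct NativeCoord2D
{
    std::array<index_t, 2> idx;
    std::array<index_t, 2> strides;
    index_t offset;

    void Move(std::array<index_t, 2> diff)
    {
        idx[0] += diff[0];
        idx[1] += diff[1];
        // analogue of CalculateOffsetDiff: fold in the difference, don't recompute from scratch
        offset += diff[0] * strides[0] + diff[1] * strides[1];
    }
};

// upper space: a 1-D index over a merged (D0 x D1) lower space
struct MergedCoord1D
{
    NativeCoord2D low;
    index_t len1; // length of the faster-varying lower dimension

    void Move(index_t diff_up) // non-negative steps only
    {
        // analogue of CalculateLowerIndexDiff: split the upper step into lower steps
        index_t d0 = diff_up / len1;
        index_t d1 = diff_up % len1;
        if(low.idx[1] + d1 >= len1) // carry into the slower dimension
        {
            d1 -= len1;
            d0 += 1;
        }
        low.Move({d0, d1}); // the lower coordinate keeps the offset in sync
    }

    index_t GetOffset() const { return low.offset; }
};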
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp
new file mode 100644

#ifndef CK_TENSOR_COORDINATE_DEPRECATED_HPP
#define CK_TENSOR_COORDINATE_DEPRECATED_HPP

#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"

namespace ck {

// TensorDesc is ConstantTensorDescriptor
template <class TensorDesc>
struct NormalTensorCoordinate_deprecated
{
    using type             = NormalTensorCoordinate_deprecated;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(
        Array<index_t, nDim> tensor_index)
        : mOffset{tensor_desc_type::GetOffsetFromMultiIndex(tensor_index)}
    {
    }

    template <class... Xs>
    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Xs... xs)
        : NormalTensorCoordinate_deprecated(Array<index_t, nDim>{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr NormalTensorCoordinate_deprecated(Sequence<Xs...>)
        : NormalTensorCoordinate_deprecated(Array<index_t, nDim>{Xs...})
    {
    }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        mOffset += tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

        return *this;
    }

    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        mOffset -= tensor_desc_type::GetOffsetFromMultiIndex(step_sizes);

        return *this;
    }

    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }

    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }

    // reposition point of origin, and return compensated offset.
    // This is a hack to reduce index calculation during looping over
    // a tensor whose origin is this TensorCoordinate. It does so, by spitting
    // out the run-time offset to the pointer (to the tensor data) held by this
    // TensorCoordiante, so the caller can add the offset into the run-time pointer of
    // the data, so only 1 run-time variable (update pointer) is needed, instead
    // of 2 run-time variables (old pointer and this offset)
    // TODO: after introducing the concept of "run-time tensor view", which contains the
    // run-time pointer to the data, always keep track of the pointer, instead of both
    // offset and the pointer. This also bring additional benefit that we don't need to
    // worry the offset might underflow (because offset is unsigned integer) when updating it.
    __host__ __device__ constexpr index_t RepositionOrigin()
    {
        index_t offset_diff = mOffset;
        mOffset             = 0;
        return offset_diff;
    }

    private:
    index_t mOffset;
};

// TensorDesc is ConstantMergedTensorDescriptor
template <class TensorDesc>
struct MergedTensorCoordinate
{
    using type             = MergedTensorCoordinate;
    using tensor_desc_type = TensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    static constexpr index_t nOriginalDim =
        tensor_desc_type::GetOriginalTensorDescriptor().GetNumOfDimension();

    __host__ __device__ constexpr MergedTensorCoordinate(Array<index_t, nDim> tensor_index)
        : mOriginalIndex{tensor_desc_type::GetOriginalMultiIndexFromMultiIndex(tensor_index)}
    {
        // partial offset on each dimension
        static_for<0, nDim, 1>{}([&](auto idim) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);

            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

            mPartialOffsets(idim) = partial_original_desc.GetOffsetFromMultiIndex(
                extract_array(mOriginalIndex, partial_original_dims));
        });

        // complete offset
        mOffset =
            accumulate_on_array(mPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }

    template <class... Xs>
    __host__ __device__ constexpr MergedTensorCoordinate(Xs... xs)
        : MergedTensorCoordinate(Array<index_t, nDim>{xs...})
    {
    }

    __host__ __device__ constexpr index_t GetOffset() const { return mOffset; }

    template <class IDim, class T, bool PositiveDirection>
    __host__ __device__ void
    MoveOnDimension(IDim idim_, T step_size, integral_constant<bool, PositiveDirection>)
    {
        constexpr auto idim = idim_;

        // if step_size is known at compile time
        static_if<is_static<T>::value>{}(
            [&](auto) { static_if<T{} == 0>{}([&](auto) { return; }); });

        // update original index
        static_if<tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
            constexpr auto partial_original_dims =
                tensor_desc_type::GetContainedOriginalDimensions(idim);

            constexpr index_t ndim_partial_original = partial_original_dims.GetSize();

            constexpr auto partial_original_desc =
                tensor_desc_type::GetOriginalTensorDescriptor().Extract(partial_original_dims);

            const auto partial_original_step_sizes =
                partial_original_desc.GetMultiIndexFrom1dIndex(step_size);

            // update partial original multi-id
            auto partial_original_id = extract_array(mOriginalIndex, partial_original_dims);

            static_if<PositiveDirection>{}([&](auto) {
                partial_original_id += partial_original_step_sizes;

                bool carry = false;

                // do carry check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;

                    if(carry)
                    {
                        ++partial_original_id(i);
                    }

                    carry = false;

                    if(partial_original_id[i] >= partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) -= partial_original_desc.GetLength(i);
                        carry = true;
                    }
                });

                // highest dimension
                if(carry)
                {
                    ++partial_original_id(0);
                }
            }).Else([&](auto) {
                // shift up multi-id to avoid unsigned integer underflow during intermediate
                // calculations. After the shift, should have new_multi_id[...] >= 1
                partial_original_id +=
                    partial_original_desc.GetLengths() - partial_original_step_sizes;

                bool borrow = false;

                // do borrow check in reversed order, starting from lowest dimension
                // don't check the highest dimension
                static_for<0, ndim_partial_original - 1, 1>{}([&](auto IReverse) {
                    constexpr index_t i = ndim_partial_original - 1 - IReverse;

                    if(borrow)
                    {
                        --partial_original_id(i);
                    }

                    borrow = false;

                    if(partial_original_id[i] < partial_original_desc.GetLength(i))
                    {
                        partial_original_id(i) += partial_original_desc.GetLength(i);
                        borrow = true;
                    }
                });

                // highest dimension
                if(borrow)
                {
                    --partial_original_id(0);
                }

                // shift back down multi-id
                // here, should have new_multi_id[...] >= GetLengths()
                partial_original_id = partial_original_id - partial_original_desc.GetLengths();
            });

            // update "mOriginalIndex"
            static_for<0, ndim_partial_original, 1>{}([&](auto I) {
                constexpr auto idim_original = partial_original_dims[I];

                mOriginalIndex(idim_original) = partial_original_id[I];
            });

            // calculate new partial offset on this merged dimension
            const index_t old_partial_offset = mPartialOffsets[idim];

            mPartialOffsets(idim) =
                partial_original_desc.GetOffsetFromMultiIndex(partial_original_id);

            // update "mThreadSrcOffset", do "+" before "-" to avoid underflow
            mOffset = (mOffset + mPartialOffsets[idim]) - old_partial_offset;
        }).Else([&](auto fwd) {
            static_if<PositiveDirection>{}([&](auto) {
                mOffset += step_size * fwd(tensor_desc_type{}).GetStride(idim);
            }).Else([&](auto) { mOffset -= step_size * fwd(tensor_desc_type{}).GetStride(idim); });
        });
    }

    // T is Array or Sequence
    template <class T>
    __host__ __device__ type operator+=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            // compiler should remove dead code path, because step_sizes is known at
            // compile time
            if(step_sizes[idim] != 0)
            {
                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, true>{});
            }
        });

        return *this;
    }

    template <class T>
    __host__ __device__ type operator-=(T step_sizes)
    {
        static_assert(is_same<typename T::data_type, index_t>{} && T::GetSize() == nDim, "wrong!");

        static_for<0, nDim, 1>{}([&](auto idim) {
            // compiler should remove dead code path, because step_sizes is known at
            // compile time
            if(step_sizes[idim] != 0)
            {
                this->MoveOnDimension(idim, step_sizes[idim], integral_constant<bool, false>{});
            }
        });

        return *this;
    }

    template <class T>
    __host__ __device__ constexpr type operator+(T step_sizes) const
    {
        type coord = *this;
        coord += step_sizes;
        return coord;
    }

    template <class T>
    __host__ __device__ constexpr type operator-(T step_sizes) const
    {
        type coord = *this;
        coord -= step_sizes;
        return coord;
    }

    __host__ __device__ static constexpr index_t RepositionOrigin() { return 0; }

    private:
    // Allocate register memory for all merged dimensions and normal dimensions.
    // However, only those merged dimensions, whose index will be involved in arithmetic
    // after the construction of this TensorCoordinate (e.g. when user move a slicing
    // window on the merged dimension), will use these register memory.
    // Let's hope compiler will optimize away those register memory allocated for normal
    // dimensions, and those merged dimensions, that would never be involved in index
    // arithmetic after construction of TensorCoordinate.
    // TODO: refactor TensorCoordinate, after introducing the concept of "dimensions"
    // and simplify implementation of ConstantMergedTensorDescriptor, so we don't need to
    // count on compiler to optimize away those register memory for us
    Array<index_t, nOriginalDim> mOriginalIndex;
    Array<index_t, nDim> mPartialOffsets;

    // complete offset
    index_t mOffset;
};

template <class TensorDesc>
struct TensorCoordinate_deprecated
{
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
    {
        return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
    {
        return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
    }

    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

} // namespace ck
#endif
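A stand-alone sketch of the carry propagation that MergedTensorCoordinate::MoveOnDimension performs on the original multi-index when stepping a merged dimension in the positive direction (the negative direction mirrors it with borrows). Names and the plain runtime-loop form are illustrative only; the real code uses static_for over compile-time lengths.

// Hypothetical, simplified carry propagation over a multi-index (positive step only).
#include <array>
#include <cstddef>
using index_t = int;

template <std::size_t N>
void move_multi_index(std::array<index_t, N>& id,
                      const std::array<index_t, N>& step,
                      const std::array<index_t, N>& lengths)
{
    for(std::size_t i = 0; i < N; ++i)
        id[i] += step[i];

    // propagate carries from the fastest-varying (last) dimension upward;
    // the slowest dimension is left unchecked, matching the code above
    bool carry = false;
    for(std::size_t r = 0; r + 1 < N; ++r)
    {
        const std::size_t i = N - 1 - r;
        if(carry)
            ++id[i];
        carry = false;
        if(id[i] >= lengths[i])
        {
            id[i] -= lengths[i];
            carry = true;
        }
    }
    if(carry)
        ++id[0];
}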
composable_kernel/include/tensor_description/tensor_coordinate_helper.hpp

@@ -9,7 +9,7 @@ template <typename TensorDesc>
 __host__ __device__ constexpr auto
 make_tensor_coordinate_v2(TensorDesc, MultiIndex<TensorDesc::GetNumOfDimension()> idx)
 {
-    return typename TensorCoordinate_v2<TensorDesc>::type(idx);
+    return typename TensorCoordinate<TensorDesc>::type(idx);
 }

 } // namespace ck
...
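A hedged usage sketch of the factory above. It assumes a descriptor built elsewhere (for example by a native-descriptor helper) and that MultiIndex<nDim>{} value-initializes to all zeros; the factory deduces the concrete coordinate type (native vs. transformed) from the descriptor type.

// Hypothetical usage only; `desc` and the zero-initialization behaviour are assumptions.
template <typename TensorDesc>
__host__ __device__ void example_make_coordinate(TensorDesc desc)
{
    constexpr ck::index_t nDim = TensorDesc::GetNumOfDimension();

    auto coord = ck::make_tensor_coordinate_v2(desc, ck::MultiIndex<nDim>{}); // start at origin

    coord += ck::MultiIndex<nDim>{}; // move by an (here zero) index difference

    const ck::index_t offset = coord.GetOffset(); // linear offset kept in sync by the coordinate
    (void)offset;
}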
composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
deleted file mode 100644

#ifndef CK_TENSOR_COORDINATE_V2_HPP
#define CK_TENSOR_COORDINATE_V2_HPP

#include "common_header.hpp"
#include "dimension.hpp"
#include "multi_index_transform.hpp"
#include "tensor_descriptor.hpp"

namespace ck {

template <typename TensorDesc>
struct TensorCoordinate_v2;

template <typename NativeTensorDesc>
struct NativeTensorCoordinate
{
    using type             = NativeTensorCoordinate;
    using tensor_desc_type = NativeTensorDesc;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();

    using Index = MultiIndex<nDim>;

    __host__ __device__ constexpr NativeTensorCoordinate(Index idx)
        : mIndex(idx), mOffset(tensor_desc_type::CalculateOffset(idx))
    {
    }

    template <typename... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Xs... xs)
        : NativeTensorCoordinate(Index{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr NativeTensorCoordinate(Sequence<Xs...>)
        : NativeTensorCoordinate(Index{Xs...})
    {
    }

    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

    __host__ __device__ constexpr const Index& GetIndex() const { return mIndex; }

    __host__ __device__ constexpr const index_t& GetOffset() const { return mOffset; }

    __host__ __device__ constexpr type operator+=(const Index& idx_diff)
    {
        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndex += idx_diff;

        mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator-=(const Index& idx_diff)
    {
        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndex -= idx_diff;

        mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);

        return *this;
    }

    __host__ __device__ constexpr type operator+(const Index& idx_diff) const
    {
        type coord = *this;
        coord += idx_diff;
        return coord;
    }

    __host__ __device__ constexpr type operator-(const Index& idx_diff) const
    {
        type coord = *this;
        coord -= idx_diff;
        return coord;
    }

    __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

    private:
    // mIndex may be saved and updated, however, the value of some (or all) of its entries may
    // never be used. Compiler should be able to remove these entries as well as its calculation
    // as dead code.
    // TODO: make sure compiler indeed remove these dead code
    Index mIndex;
    index_t mOffset;
};

template <typename TransformedTensorDesc>
struct TransformedTensorCoordinate
{
    using tensor_desc_type = TransformedTensorDesc;

    using LowerCoord =
        typename TensorCoordinate_v2<decltype(tensor_desc_type::GetLowerTensorDescriptor())>::type;
    using UpperCoord = TransformedTensorCoordinate;

    static constexpr index_t nDim = tensor_desc_type::GetNumOfDimension();
    using UpperIndex              = MultiIndex<nDim>;

    __host__ __device__ constexpr TransformedTensorCoordinate(UpperIndex idx)
        : mIndexUp{idx}, mCoordLow{tensor_desc_type::CalculateLowerIndex(idx)}
    {
    }

    template <typename... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Xs... xs)
        : TransformedTensorCoordinate(UpperIndex{xs...})
    {
    }

    template <index_t... Xs>
    __host__ __device__ constexpr TransformedTensorCoordinate(Sequence<Xs...>)
        : TransformedTensorCoordinate(UpperIndex{Xs...})
    {
    }

    __host__ __device__ static constexpr auto GetTensorDescriptor() { return tensor_desc_type{}; }

    __host__ __device__ constexpr const LowerCoord& GetLowerCoordinate() const
    {
        return mCoordLow;
    }

    __host__ __device__ constexpr const UpperIndex& GetUpperIndex() const { return mIndexUp; }

    __host__ __device__ constexpr const UpperIndex& GetIndex() const { return GetUpperIndex(); }

    __host__ __device__ constexpr const index_t& GetOffset() const
    {
        return GetLowerCoordinate().GetOffset();
    }

    __host__ __device__ constexpr UpperCoord operator+=(const UpperIndex& idx_up_diff)
    {
        // For transformation of multi-index difference, not all transformation functions need to
        // know the old lower-index or the old upper-index. We pass both of them to the
        // transformation function. The transformation function itself decides to use them or not.
        mCoordLow += tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // mIndexUp is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndexUp += idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator-=(const UpperIndex& idx_up_diff)
    {
        mCoordLow -= tensor_desc_type::CalculateLowerIndexDiff(
            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());

        // mIndex is updated here, but some (or all) of its entries may never be used
        // compiler should remove those entries as dead code
        mIndexUp -= idx_up_diff;

        return *this;
    }

    __host__ __device__ constexpr UpperCoord operator+(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up += idx_up_diff;
        return coord_up;
    }

    __host__ __device__ constexpr UpperCoord operator-(const UpperIndex& idx_up_diff) const
    {
        UpperCoord coord_up = *this;
        coord_up -= idx_up_diff;
        return coord_up;
    }

    // this function should be inexpensive, because there is no upper-to-lower index transformation
    __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
    {
        return tensor_desc_type::IsUpperIndexMappedToValidLowerIndex(GetIndex()) &&
               mCoordLow.IsUpperIndexMappedToValidOffset();
    }

    private:
    // mIndexUp may be calculated and updated, however, the value of some (or all) of its entries
    // may never be used. Compiler should be able to remove these entries as well as its
    // calculation as dead code.
    // TODO: make sure compiler indeed remove these dead code
    UpperIndex mIndexUp;
    LowerCoord mCoordLow;
};

template <typename TensorDesc>
struct TensorCoordinate_v2
{
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }

    template <typename... Ts>
    __host__ __device__ static constexpr auto
    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
    }

    public:
    using type = decltype(MakeDummyTensorCoordinate(TensorDesc{}));
};

} // namespace ck
#endif
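The invariant behind NativeTensorCoordinate's operator+= is that the offset always equals the dot product of the index with the strides, so only the index difference has to be folded in rather than recomputing the offset from scratch. A small host-side check of that arithmetic with illustrative values:

// Illustrative host code only; strides/indices are made-up example values.
#include <array>
#include <cassert>
using index_t = int;

template <std::size_t N>
index_t dot(const std::array<index_t, N>& a, const std::array<index_t, N>& b)
{
    index_t s = 0;
    for(std::size_t i = 0; i < N; ++i)
        s += a[i] * b[i];
    return s;
}

int main()
{
    const std::array<index_t, 3> strides{12, 4, 1}; // a packed 2x3x4 tensor
    const std::array<index_t, 3> idx{1, 1, 1};
    const std::array<index_t, 3> diff{0, 1, 2};

    index_t offset = dot(idx, strides); // CalculateOffset(idx) analogue
    offset += dot(diff, strides);       // CalculateOffsetDiff(diff) analogue

    const std::array<index_t, 3> moved{1, 2, 3};
    assert(offset == dot(moved, strides)); // same result as recomputing from scratch
    return 0;
}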
composable_kernel/include/tensor_description/tensor_view.hpp

@@ -4,7 +4,7 @@
 #include "common_header.hpp"
 #include "ConstantTensorDescriptor.hpp"
 #include "ConstantMergedTensorDescriptor.hpp"
-#include "tensor_coordinate.hpp"
+#include "tensor_coordinate_deprecated.hpp"

 namespace ck {
...
@@ -14,7 +14,7 @@ struct NormalTensorView
 {
     using type             = NormalTensorView;
     using tensor_desc_type = TensorDesc;
-    using coordinate_type  = typename NormalTensorCoordinate<TensorDesc>::type;
+    using coordinate_type  = typename NormalTensorCoordinate_deprecated<TensorDesc>::type;
     using data_type        = TData;

     static constexpr auto nDim = TensorDesc::GetNumOfDimension();
...
composable_kernel/include/tensor_description/tensor_visit.hpp

@@ -5,7 +5,7 @@
 #include "dimension.hpp"
 #include "dimension_transform.hpp"
 #include "tensor_descriptor.hpp"
-#include "tensor_coordinate_v2.hpp"
+#include "tensor_coordinate.hpp"

 namespace ck {
...
@@ -13,7 +13,7 @@ template <class TensorDescriptor>
 struct TensorVisit
 {
     using Index = typename TensorDescriptor::Index;

-    using Coordinate = typename TensorCoordinate_v2<TensorDescriptor>::type;
+    using Coordinate = typename TensorCoordinate<TensorDescriptor>::type;

     __host__ __device__ static void Run_v1(Index idx_begin)
     {
...
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -4,680 +4,11 @@
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
-#include "tensor_coordinate_v2.hpp"
+#include "tensor_coordinate.hpp"
 #include "threadwise_generic_tensor_slice_copy.hpp"

#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif

 namespace ck {
#if 0
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
// This function assumes each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user needs to make sure
// that, on a merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-lengths. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are performed
// in constructor of BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct BlockwiseGenericTensorSliceCopy_v1
{
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static constexpr index_t nOriginalDimSrc =
SrcDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
static constexpr index_t nOriginalDimDst =
DstDesc::GetOriginalTensorDescriptor().GetNumOfDimension();
// per-thread offset
index_t mThreadSrcOffset;
index_t mThreadDstOffset;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array<index_t, nDim> mThreadSrcPartialOffsets;
Array<index_t, nDim> mThreadDstPartialOffsets;
// multi-id of original tensor
Array<index_t, nOriginalDimSrc> mThreadSrcOriginalMultiId;
Array<index_t, nOriginalDimDst> mThreadDstOriginalMultiId;
__device__ BlockwiseGenericTensorSliceCopy_v1(Array<index_t, nDim> src_block_data_id_begin,
Array<index_t, nDim> dst_block_data_id_begin)
{
// check NDim consistency
static_assert(
nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
nDim == ThreadClusterLengths::GetSize() &&
nDim == ThreadClusterArrangeOrder::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() && nDim == DstDimAccessOrder::GetSize(),
"wrong");
// check thread arrange order and read/write access order are valid
static_assert(is_valid_sequence_map<ThreadClusterArrangeOrder>::value &&
is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong!");
// thread cluster
constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
// BlockSize
static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");
// divide work
constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};
static_for<0, nDim, 1>{}([&](auto IDim) {
static_assert(SliceLengths::Get(IDim) % data_per_cluster_per_dims.Get(IDim) == 0,
"wrong! cannot evenly divide sliced tensor into cluster");
});
constexpr auto repeat_lengths = SliceLengths{} / data_per_cluster_per_dims;
// additional check for merged dimension
static_for<0, nDim, 1>{}([&](auto IDim_) {
// src
static_if<SrcDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that constains multiple original dimensions,
// the length of the last original dimension need to evenly dividable by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_src =
SrcDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
SrcDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_src) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
// dst
static_if<DstDesc::ContainMultipleOriginalDimensions(IDim_)>{}([&](auto) {
constexpr auto IDim = decltype(IDim_){};
// on a merged dimension that constains multiple original dimensions,
// the length of the last original dimension need to evenly dividable by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr auto idim_last_original_dst =
DstDesc::GetContainedOriginalDimensions(IDim).Back();
static_assert(
DstDesc::GetOriginalTensorDescriptor().GetLength(idim_last_original_dst) %
SubLengths::Get(IDim) ==
0,
"wrong!");
// merged dimension should have repeat_lengths = 1
static_assert(repeat_lengths[IDim] == 1,
"wrong! repeat_lengths shoud be 1 on merged dimension");
});
});
// calculate mThreadSrcOffset, mThreadDstOffset
const auto thread_cluster_id =
thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());
const auto data_cluster_id =
reorder_array_given_old2new(thread_cluster_id, ThreadClusterArrangeOrder{});
const auto thread_data_id_begin = data_cluster_id * SubLengths{};
// original multi-id
mThreadSrcOriginalMultiId = SrcDesc::GetOriginalMultiIndexFromMultiIndex(
src_block_data_id_begin + thread_data_id_begin);
mThreadDstOriginalMultiId = DstDesc::GetOriginalMultiIndexFromMultiIndex(
dst_block_data_id_begin + thread_data_id_begin);
// partial offset on each dimension
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto src_partial_original_dims =
SrcDesc::GetContainedOriginalDimensions(IDim);
constexpr auto src_partial_original_desc =
SrcDesc::GetOriginalTensorDescriptor().Extract(src_partial_original_dims);
mThreadSrcPartialOffsets(IDim) = src_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadSrcOriginalMultiId, src_partial_original_dims));
});
static_for<0, nDim, 1>{}([&](auto IDim) {
constexpr auto dst_partial_original_dims =
DstDesc::GetContainedOriginalDimensions(IDim);
constexpr auto dst_partial_original_desc =
DstDesc::GetOriginalTensorDescriptor().Extract(dst_partial_original_dims);
mThreadDstPartialOffsets(IDim) = dst_partial_original_desc.GetOffsetFromMultiIndex(
extract_array(mThreadDstOriginalMultiId, dst_partial_original_dims));
});
// complete offset
mThreadSrcOffset = accumulate_on_array(
mThreadSrcPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
mThreadDstOffset = accumulate_on_array(
mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
}
__device__ static constexpr auto GetRegisterBufferDescriptor()
{
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
return make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);
}
__device__ static constexpr index_t GetRegisterBufferSize()
{
return GetRegisterBufferDescriptor().GetElementSpace();
}
template <typename TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const
{
constexpr auto thread_sub_tensor_lengths = SubLengths{};
constexpr auto data_per_cluster_per_dims =
thread_sub_tensor_lengths * ThreadClusterLengths{};
constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});
constexpr auto thread_buffer_desc = GetRegisterBufferDescriptor();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
constexpr auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
constexpr auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
constexpr index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
constexpr index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#else
ford<decltype(repeat_lengths)>{}([&](auto repeat_id) {
const auto src_thread_data_id_begin = repeat_id * data_per_cluster_per_dims;
const auto buffer_data_id_begin = repeat_id * thread_sub_tensor_lengths;
const index_t src_offset = SrcDesc::GetOffsetFromMultiIndex(src_thread_data_id_begin);
const index_t buffer_offset =
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2
<
SrcDesc
,
decltype
(
thread_buffer_desc
),
SubLengths
,
SrcDimAccessOrder
,
SrcVectorAccessDim
,
SrcDataPerAccess
,
1
>
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
.
Run
(
p_src
+
src_offset
+
mThreadSrcOffset
,
p_buffer
+
buffer_offset
);
});
}
template
<
typename
TData
>
__device__
void
RunStoreRegisterBuffer
(
const
TData
*
__restrict__
p_buffer
,
TData
*
__restrict__
p_dst
)
const
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ThreadClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
constexpr
auto
thread_buffer_desc
=
GetRegisterBufferDescriptor
();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
constexpr
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
constexpr
auto
dst_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
constexpr
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
constexpr
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_data_id_begin
);
#else
ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
const
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
const
auto
dst_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
const
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_data_id_begin
);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2
<
decltype
(
thread_buffer_desc
),
DstDesc
,
SubLengths
,
DstDimAccessOrder
,
DstVectorAccessDim
,
1
,
DstDataPerAccess
>
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
.
Run
(
p_buffer
+
buffer_offset
,
p_dst
+
dst_offset
+
mThreadDstOffset
);
});
}
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
__restrict__
p_src
,
TData
*
__restrict__
p_dst
)
const
{
TData
p_buffer
[
GetRegisterBufferSize
()];
RunLoadRegisterBuffer
(
p_src
,
p_buffer
);
RunStoreRegisterBuffer
(
p_buffer
,
p_dst
);
}
// When moving the slicing windows along a merged dimension, if the strides of the
// contained (by the merged dimension) original dimensions are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in positive direction (vice versue for movement in negative direction).
// As a result, there is the possiblity that the offset calculation may result in
// unsigned integer underflow (due to "-" operation). However, this hazard should not
// happen, as long as the users make sure the slicing window would not be moved out of
// the boundary of the tensor being sliced. This functions doesn't do runtime sanity
// check on out-of-bound slicing window, for performance reason
template
<
index_t
IDim_
,
index_t
StepSize
,
bool
PositiveDirection
>
__device__
void
MoveSlicingWindowOnSourceTensor
(
Number
<
IDim_
>
,
Number
<
StepSize
>
,
integral_constant
<
bool
,
PositiveDirection
>
direction
)
{
constexpr
auto
IDim
=
Number
<
IDim_
>
{};
static_if
<
SrcDesc
::
ContainMultipleOriginalDimensions
(
IDim
)
>
{}([
&
](
auto
)
{
// logic for a merged dimension, also works for non-merged dimension, but its logic may
// be unncessarily complicated for compiler to remove calculations that are useless for
// a non-merged dimension
// extract partial original dimensions
constexpr
auto
src_partial_original_dims
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
);
constexpr
auto
src_partial_original_desc
=
SrcDesc
::
GetOriginalTensorDescriptor
().
Extract
(
src_partial_original_dims
);
// calculate new partial original multi-id
auto
old_src_partial_original_id
=
extract_array
(
mThreadSrcOriginalMultiId
,
src_partial_original_dims
);
auto
new_src_partial_original_id
=
src_partial_original_desc
.
UpdateMultiIndexGivenStepSizeOf1dIndex
(
old_src_partial_original_id
,
StepSize
,
direction
);
// update "mThreadSrcOriginalMultiId"
static_for
<
0
,
decltype
(
src_partial_original_dims
)
::
GetSize
(),
1
>
{}([
&
](
auto
I
)
{
constexpr
auto
IDimOriginal
=
src_partial_original_dims
[
I
];
mThreadSrcOriginalMultiId
(
IDimOriginal
)
=
new_src_partial_original_id
[
I
];
});
// calculate new partial offset on this merged dimension
const
index_t
old_src_partial_offset
=
mThreadSrcPartialOffsets
[
IDim
];
const
index_t
new_src_partial_offset
=
src_partial_original_desc
.
GetOffsetFromMultiIndex
(
new_src_partial_original_id
);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets
(
IDim
)
=
new_src_partial_offset
;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset
=
(
mThreadSrcOffset
+
new_src_partial_offset
)
-
old_src_partial_offset
;
}).
Else
([
&
](
auto
)
{
// Logic for non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, compiler
// should be able to remove these calculations.
// TODO: make sure compiler would actually remove them in this case.
// It is the user's responsiblity to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be hazard like
// unsigned integer underflow. That is NO runtime sanity check to prevent the hazard
constexpr
auto
IDimOriginal
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
).
Front
();
static_if
<
PositiveDirection
>
{}([
&
](
auto
fwd
)
{
mThreadSrcOffset
+=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
mThreadSrcOriginalMultiId
(
IDimOriginal
)
+=
StepSize
;
mThreadSrcPartialOffsets
(
IDim
)
+=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
}).
Else
([
&
](
auto
fwd
)
{
mThreadSrcOffset
-=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
mThreadSrcOriginalMultiId
(
IDimOriginal
)
-=
StepSize
;
mThreadSrcPartialOffsets
(
IDim
)
-=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
});
});
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
idim
)
{
if
(
step_sizes
[
idim
]
!=
0
)
{
MoveSlicingWindowOnSourceTensor
(
idim
,
step_sizes
[
idim
],
positive_direction
);
}
});
}
};
// This version use TensorCoordiante
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
template
<
index_t
BlockSize
,
typename
SrcDesc
,
typename
DstDesc
,
typename
SliceLengths
,
typename
SubLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
BlockwiseGenericTensorSliceCopy_v2
{
static
constexpr
index_t
nDim
=
SrcDesc
::
GetNumOfDimension
();
using
Index
=
MultiIndex
<
nDim
>
;
__device__
constexpr
BlockwiseGenericTensorSliceCopy_v2
(
const
Index
&
src_block_slice_origin
,
const
Index
&
dst_block_slice_origin
)
{
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SubLengths
::
GetSize
()
&&
nDim
==
ThreadClusterLengths
::
GetSize
()
&&
nDim
==
ThreadClusterArrangeOrder
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! nDim not consistent"
);
static_assert
(
is_same
<
SliceLengths
,
decltype
(
SubLengths
{}
*
ThreadClusterLengths
{})
>
{},
"wrong! threads should be mapped to cover entire slicing window"
);
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_packed
(
ThreadClusterLengths
::
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize not consistent with ThreadClusterLengths"
);
const
auto
thread_cluster_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
const
auto
data_cluster_id
=
reorder_array_given_old2new
(
thread_cluster_id
,
ThreadClusterArrangeOrder
{});
const
auto
thread_data_id_begin
=
data_cluster_id
*
SubLengths
{};
mThreadwiseLoad
.
SetSrcSliceOrigin
(
src_block_slice_origin
+
thread_data_id_begin
);
mThreadwiseLoad
.
SetDstSliceOrigin
(
make_zero_array
<
index_t
,
nDim
>
());
mThreadwiseStore
.
SetSrcSliceOrigin
(
make_zero_array
<
index_t
,
nDim
>
());
mThreadwiseStore
.
SetDstSliceOrigin
(
dst_block_slice_origin
+
thread_data_id_begin
);
}
__device__
static
constexpr
index_t
GetRegisterBufferSize
()
{
return
RegisterBufferDesc
::
GetElementSpace
();
}
template
<
typename
TData
>
__device__
void
RunLoadRegisterBuffer
(
const
TData
*
p_src
,
TData
*
p_buffer
)
const
{
#if 0
mThreadwiseLoad.Run(p_src, p_buffer);
#else
// hardcoded: global to register
mThreadwiseLoad
.
template
Run_amd_experiment
<
TData
,
2
,
0
>(
p_src
,
p_buffer
);
#endif
}
template
<
typename
TData
>
__device__
void
RunStoreRegisterBuffer
(
const
TData
*
p_buffer
,
TData
*
p_dst
)
const
{
#if 0
mThreadwiseStore.Run(p_buffer, p_dst);
#else
// hardcoded: register to LDS
mThreadwiseStore
.
template
Run_amd_experiment
<
TData
,
0
,
1
>(
p_buffer
,
p_dst
);
#endif
}
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
p_src
,
TData
*
p_dst
)
const
{
TData
p_buffer
[
GetRegisterBufferSize
()];
RunLoadRegisterBuffer
(
p_src
,
p_buffer
);
RunStoreRegisterBuffer
(
p_buffer
,
p_dst
);
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
mThreadwiseLoad
.
MoveSrcSliceWindow
(
step_sizes
,
positive_direction
);
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveDstSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
mThreadwiseStore
.
MoveDstSliceWindow
(
step_sizes
,
positive_direction
);
}
private:
using
RegisterBufferDesc
=
decltype
(
make_ConstantTensorDescriptor_packed
(
SubLengths
{}));
using
ThreadwiseLoad
=
ThreadwiseGenericTensorSliceCopy_v2r1
<
SrcDesc
,
RegisterBufferDesc
,
SubLengths
,
SrcDimAccessOrder
,
SrcDimAccessOrder
,
SrcVectorAccessDim
,
SrcVectorAccessDim
,
SrcDataPerAccess
,
1
>
;
using
ThreadwiseStore
=
ThreadwiseGenericTensorSliceCopy_v2r1
<
RegisterBufferDesc
,
DstDesc
,
SubLengths
,
DstDimAccessOrder
,
DstDimAccessOrder
,
DstVectorAccessDim
,
DstVectorAccessDim
,
1
,
DstDataPerAccess
>
;
ThreadwiseLoad
mThreadwiseLoad
;
ThreadwiseStore
mThreadwiseStore
;
};
// this version use TensorView and TensorCoordinate
template
<
index_t
BlockSize
,
typename
SrcTensor
,
typename
DstTensor
,
typename
SliceLengths
,
typename
SubLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
BlockwiseGenericTensorSliceCopy_v3
{
static
constexpr
index_t
nDim
=
SrcTensor
::
GetNumOfDimension
();
using
data_type
=
remove_cv_t
<
typename
SrcTensor
::
data_type
>
;
using
SrcCoordinate
=
typename
SrcTensor
::
coordinate_type
;
using
DstCoordinate
=
typename
DstTensor
::
coordinate_type
;
__device__
constexpr
BlockwiseGenericTensorSliceCopy_v3
(
SrcTensor
src_block
,
SrcCoordinate
src_block_slice_origin
,
DstTensor
dst_block
,
DstCoordinate
dst_block_slice_origin
)
:
mThreadBuffer
{
make_TensorView
(
ThreadBufferDesc
{},
mpBuffer
)}
{
static_assert
(
nDim
==
SrcTensor
::
GetNumOfDimension
()
&&
nDim
==
DstTensor
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SubLengths
::
GetSize
()
&&
nDim
==
ThreadClusterLengths
::
GetSize
()
&&
nDim
==
ThreadClusterArrangeOrder
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! nDim not consistent"
);
static_assert
(
is_same
<
SliceLengths
,
decltype
(
SubLengths
{}
*
ThreadClusterLengths
{})
>
{},
"wrong! threads should be mapped to cover entire slicing window"
);
static_assert
(
is_same
<
remove_cv_t
<
typename
SrcTensor
::
data_type
>
,
remove_cv_t
<
typename
DstTensor
::
data_type
>>
{},
"wrong! type conversion not supported yet"
);
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_packed
(
ThreadClusterLengths
::
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize not consistent with ThreadClusterLengths"
);
const
auto
thread_cluster_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
const
auto
data_cluster_id
=
reorder_array_given_old2new
(
thread_cluster_id
,
ThreadClusterArrangeOrder
{});
const
auto
thread_data_id_begin
=
data_cluster_id
*
SubLengths
{};
mThreadwiseLoad
=
ThreadwiseLoad
(
src_block
,
src_block_slice_origin
+
thread_data_id_begin
,
mThreadBuffer
,
make_zero_array
<
index_t
,
nDim
>
());
mThreadwiseStore
=
ThreadwiseStore
(
mThreadBuffer
,
make_zero_array
<
index_t
,
nDim
>
(),
dst_block
,
dst_block_slice_origin
+
thread_data_id_begin
);
}
__device__
void
RunLoadRegisterBuffer
()
{
mThreadwiseLoad
.
Run
();
}
__device__
void
RunStoreRegisterBuffer
()
const
{
mThreadwiseStore
.
Run
();
}
__device__
void
Run
()
{
mThreadwiseLoad
.
Run
();
mThreadwiseStore
.
Run
();
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
mThreadwiseLoad
.
MoveSrcSliceWindow
(
step_sizes
,
positive_direction
);
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveDstSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
mThreadwiseStore
.
MoveDstSliceWindow
(
step_sizes
,
positive_direction
);
}
private:
using
ThreadBufferDesc
=
decltype
(
make_ConstantTensorDescriptor_packed
(
SubLengths
{}));
using
ThreadBufferTensor
=
NormalTensorView
<
ThreadBufferDesc
,
data_type
>
;
using
ThreadwiseLoad
=
ThreadwiseGenericTensorSliceCopy_v3r1
<
SrcTensor
,
ThreadBufferTensor
,
SubLengths
,
SrcDimAccessOrder
,
SrcDimAccessOrder
,
SrcVectorAccessDim
,
SrcVectorAccessDim
,
SrcDataPerAccess
,
1
>
;
using
ThreadwiseStore
=
ThreadwiseGenericTensorSliceCopy_v3r1
<
ThreadBufferTensor
,
DstTensor
,
SubLengths
,
DstDimAccessOrder
,
DstDimAccessOrder
,
DstVectorAccessDim
,
DstVectorAccessDim
,
1
,
DstDataPerAccess
>
;
data_type
mpBuffer
[
ThreadBufferDesc
::
GetElementSpace
()];
ThreadBufferTensor
mThreadBuffer
;
ThreadwiseLoad
mThreadwiseLoad
;
ThreadwiseStore
mThreadwiseStore
;
};
#endif
template
<
index_t
BlockSize
,
template
<
index_t
BlockSize
,
typename
BlockSrcDesc
,
typename
BlockSrcDesc
,
typename
BlockDstDesc
,
typename
BlockDstDesc
,
...
...
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp
0 → 100644
View file @
b12bbcee
#ifndef CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#define CK_BLOCKWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_view.hpp"
#include "tensor_coordinate_deprecated.hpp"
#include "threadwise_generic_tensor_slice_copy_deprecated.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
#endif
namespace
ck
{
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
// This functions assume each thread is reading and writing a normal (not merged) tensor,
// to simplify index calculations. To satisfy this assumption, the user need to make sure
// that, on a merged dimension that constains multiple original dimensions, the length of
// the last original dimension need to be evenly dividable by its sub-lengths. Also, the
// repeat-length on the merged dimension need to be 1. These sanity checks are performed
// in constructor of BlockwiseGenericTensorSliceCopy_v1
template
<
index_t
BlockSize
,
typename
SrcDesc
,
typename
DstDesc
,
typename
SliceLengths
,
typename
SubLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
BlockwiseGenericTensorSliceCopy_v1
{
static
constexpr
index_t
nDim
=
SrcDesc
::
GetNumOfDimension
();
static
constexpr
index_t
nOriginalDimSrc
=
SrcDesc
::
GetOriginalTensorDescriptor
().
GetNumOfDimension
();
static
constexpr
index_t
nOriginalDimDst
=
DstDesc
::
GetOriginalTensorDescriptor
().
GetNumOfDimension
();
// per-thread offset
index_t
mThreadSrcOffset
;
index_t
mThreadDstOffset
;
// "mThreadSrcOriginalMultiId", "mThreadSrcPartialOffsets, "mThreadDstOriginalMultiId",
// "mThreadDstPartialOffsets" are always calculated inside constructor, and would be
// updated if slicing-window is moved. However, they will not be used if you always move
// the slicing-window along a non-merged dimension. In that case, compiler should be
// able to remove these calculation.
// TODO: make sure compiler would actually remove them in that case
// partial offset in each (merged) dimension
Array
<
index_t
,
nDim
>
mThreadSrcPartialOffsets
;
Array
<
index_t
,
nDim
>
mThreadDstPartialOffsets
;
// multi-id of original tensor
Array
<
index_t
,
nOriginalDimSrc
>
mThreadSrcOriginalMultiId
;
Array
<
index_t
,
nOriginalDimDst
>
mThreadDstOriginalMultiId
;
__device__
BlockwiseGenericTensorSliceCopy_v1
(
Array
<
index_t
,
nDim
>
src_block_data_id_begin
,
Array
<
index_t
,
nDim
>
dst_block_data_id_begin
)
{
// check NDim consistency
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SubLengths
::
GetSize
()
&&
nDim
==
ThreadClusterLengths
::
GetSize
()
&&
nDim
==
ThreadClusterArrangeOrder
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong"
);
// check thread arrange order and read/write access order are valid
static_assert
(
is_valid_sequence_map
<
ThreadClusterArrangeOrder
>::
value
&&
is_valid_sequence_map
<
SrcDimAccessOrder
>::
value
&&
is_valid_sequence_map
<
DstDimAccessOrder
>::
value
,
"wrong!"
);
// thread cluster
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_packed
(
ThreadClusterLengths
::
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
// BlockSize
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize"
);
// divide work
constexpr
auto
data_per_cluster_per_dims
=
SubLengths
{}
*
ThreadClusterLengths
{};
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
static_assert
(
SliceLengths
::
Get
(
IDim
)
%
data_per_cluster_per_dims
.
Get
(
IDim
)
==
0
,
"wrong! cannot evenly divide sliced tensor into cluster"
);
});
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
data_per_cluster_per_dims
;
// additional check for merged dimension
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim_
)
{
// src
static_if
<
SrcDesc
::
ContainMultipleOriginalDimensions
(
IDim_
)
>
{}([
&
](
auto
)
{
constexpr
auto
IDim
=
decltype
(
IDim_
){};
// on a merged dimension that constains multiple original dimensions,
// the length of the last original dimension need to evenly dividable by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr
auto
idim_last_original_src
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
).
Back
();
static_assert
(
SrcDesc
::
GetOriginalTensorDescriptor
().
GetLength
(
idim_last_original_src
)
%
SubLengths
::
Get
(
IDim
)
==
0
,
"wrong!"
);
// merged dimension should have repeat_lengths = 1
static_assert
(
repeat_lengths
[
IDim
]
==
1
,
"wrong! repeat_lengths shoud be 1 on merged dimension"
);
});
// dst
static_if
<
DstDesc
::
ContainMultipleOriginalDimensions
(
IDim_
)
>
{}([
&
](
auto
)
{
constexpr
auto
IDim
=
decltype
(
IDim_
){};
// on a merged dimension that constains multiple original dimensions,
// the length of the last original dimension need to evenly dividable by its
// sub-length,
// so each thread is effectively reading a normal (not merged) tensor
constexpr
auto
idim_last_original_dst
=
DstDesc
::
GetContainedOriginalDimensions
(
IDim
).
Back
();
static_assert
(
DstDesc
::
GetOriginalTensorDescriptor
().
GetLength
(
idim_last_original_dst
)
%
SubLengths
::
Get
(
IDim
)
==
0
,
"wrong!"
);
// merged dimension should have repeat_lengths = 1
static_assert
(
repeat_lengths
[
IDim
]
==
1
,
"wrong! repeat_lengths shoud be 1 on merged dimension"
);
});
});
// calculate mThreadSrcOffset, mThreadDstOffset
const
auto
thread_cluster_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
const
auto
data_cluster_id
=
reorder_array_given_old2new
(
thread_cluster_id
,
ThreadClusterArrangeOrder
{});
const
auto
thread_data_id_begin
=
data_cluster_id
*
SubLengths
{};
// original multi-id
mThreadSrcOriginalMultiId
=
SrcDesc
::
GetOriginalMultiIndexFromMultiIndex
(
src_block_data_id_begin
+
thread_data_id_begin
);
mThreadDstOriginalMultiId
=
DstDesc
::
GetOriginalMultiIndexFromMultiIndex
(
dst_block_data_id_begin
+
thread_data_id_begin
);
// partial offset on each dimension
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
auto
src_partial_original_dims
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
);
constexpr
auto
src_partial_original_desc
=
SrcDesc
::
GetOriginalTensorDescriptor
().
Extract
(
src_partial_original_dims
);
mThreadSrcPartialOffsets
(
IDim
)
=
src_partial_original_desc
.
GetOffsetFromMultiIndex
(
extract_array
(
mThreadSrcOriginalMultiId
,
src_partial_original_dims
));
});
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
IDim
)
{
constexpr
auto
dst_partial_original_dims
=
DstDesc
::
GetContainedOriginalDimensions
(
IDim
);
constexpr
auto
dst_partial_original_desc
=
DstDesc
::
GetOriginalTensorDescriptor
().
Extract
(
dst_partial_original_dims
);
mThreadDstPartialOffsets
(
IDim
)
=
dst_partial_original_desc
.
GetOffsetFromMultiIndex
(
extract_array
(
mThreadDstOriginalMultiId
,
dst_partial_original_dims
));
});
// complete offset
mThreadSrcOffset
=
accumulate_on_array
(
mThreadSrcPartialOffsets
,
math
::
plus
<
index_t
>
{},
static_cast
<
index_t
>
(
0
));
mThreadDstOffset
=
accumulate_on_array
(
mThreadDstPartialOffsets
,
math
::
plus
<
index_t
>
{},
static_cast
<
index_t
>
(
0
));
}
__device__
static
constexpr
auto
GetRegisterBufferDescriptor
()
{
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
return
make_ConstantTensorDescriptor_packed
(
SubLengths
{}
*
repeat_lengths
);
}
__device__
static
constexpr
index_t
GetRegisterBufferSize
()
{
return
GetRegisterBufferDescriptor
().
GetElementSpace
();
}
template
<
typename
TData
>
__device__
void
RunLoadRegisterBuffer
(
const
TData
*
__restrict__
p_src
,
TData
*
__restrict__
p_buffer
)
const
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ThreadClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
constexpr
auto
thread_buffer_desc
=
GetRegisterBufferDescriptor
();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
constexpr
auto
src_thread_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
constexpr
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
constexpr
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_thread_data_id_begin
);
constexpr
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
#else
ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
const
auto
src_thread_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
const
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_thread_data_id_begin
);
const
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2
<
SrcDesc
,
decltype
(
thread_buffer_desc
),
SubLengths
,
SrcDimAccessOrder
,
SrcVectorAccessDim
,
SrcDataPerAccess
,
1
>
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
.
Run
(
p_src
+
src_offset
+
mThreadSrcOffset
,
p_buffer
+
buffer_offset
);
});
}
template
<
typename
TData
>
__device__
void
RunStoreRegisterBuffer
(
const
TData
*
__restrict__
p_buffer
,
TData
*
__restrict__
p_dst
)
const
{
constexpr
auto
thread_sub_tensor_lengths
=
SubLengths
{};
constexpr
auto
data_per_cluster_per_dims
=
thread_sub_tensor_lengths
*
ThreadClusterLengths
{};
constexpr
auto
repeat_lengths
=
SliceLengths
{}
/
(
SubLengths
{}
*
ThreadClusterLengths
{});
constexpr
auto
thread_buffer_desc
=
GetRegisterBufferDescriptor
();
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
constexpr
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
constexpr
auto
dst_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
constexpr
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
constexpr
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_data_id_begin
);
#else
ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_id
)
{
const
auto
buffer_data_id_begin
=
repeat_id
*
thread_sub_tensor_lengths
;
const
auto
dst_data_id_begin
=
repeat_id
*
data_per_cluster_per_dims
;
const
index_t
buffer_offset
=
thread_buffer_desc
.
GetOffsetFromMultiIndex
(
buffer_data_id_begin
);
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_data_id_begin
);
#endif
// By position the origin of the per-thread window at the point, where multi-index
// of the SrcDesc (might be a merged tensor) is all-zero. This threadwise slice copy
// is assuming each thread is copy a noraml (not merged) tensor.
// To satisfy this assumption, the user need to make sure that, on a merged dimension
// that constains multiple original dimensions, the length of the last original
// dimension need to be evenly dividable by its sub-lengths. Also, the repeat-length on
// the merged dimension need to be 1. These sanity checks are performed in constructor
// of BlockwiseGenericTensorSliceCopy_v1
ThreadwiseGenericTensorSliceCopy_v1r2
<
decltype
(
thread_buffer_desc
),
DstDesc
,
SubLengths
,
DstDimAccessOrder
,
DstVectorAccessDim
,
1
,
DstDataPerAccess
>
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
.
Run
(
p_buffer
+
buffer_offset
,
p_dst
+
dst_offset
+
mThreadDstOffset
);
});
}
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
__restrict__
p_src
,
TData
*
__restrict__
p_dst
)
const
{
TData
p_buffer
[
GetRegisterBufferSize
()];
RunLoadRegisterBuffer
(
p_src
,
p_buffer
);
RunStoreRegisterBuffer
(
p_buffer
,
p_dst
);
}
// When moving the slicing windows along a merged dimension, if the strides of the
// contained (by the merged dimension) original dimensions are not in descending order,
// then there is no guarantee that the new offset will be larger than the old offset
// for movement in positive direction (vice versue for movement in negative direction).
// As a result, there is the possiblity that the offset calculation may result in
// unsigned integer underflow (due to "-" operation). However, this hazard should not
// happen, as long as the users make sure the slicing window would not be moved out of
// the boundary of the tensor being sliced. This functions doesn't do runtime sanity
// check on out-of-bound slicing window, for performance reason
template
<
index_t
IDim_
,
index_t
StepSize
,
bool
PositiveDirection
>
__device__
void
MoveSlicingWindowOnSourceTensor
(
Number
<
IDim_
>
,
Number
<
StepSize
>
,
integral_constant
<
bool
,
PositiveDirection
>
direction
)
{
constexpr
auto
IDim
=
Number
<
IDim_
>
{};
static_if
<
SrcDesc
::
ContainMultipleOriginalDimensions
(
IDim
)
>
{}([
&
](
auto
)
{
// logic for a merged dimension, also works for non-merged dimension, but its logic may
// be unncessarily complicated for compiler to remove calculations that are useless for
// a non-merged dimension
// extract partial original dimensions
constexpr
auto
src_partial_original_dims
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
);
constexpr
auto
src_partial_original_desc
=
SrcDesc
::
GetOriginalTensorDescriptor
().
Extract
(
src_partial_original_dims
);
// calculate new partial original multi-id
auto
old_src_partial_original_id
=
extract_array
(
mThreadSrcOriginalMultiId
,
src_partial_original_dims
);
auto
new_src_partial_original_id
=
src_partial_original_desc
.
UpdateMultiIndexGivenStepSizeOf1dIndex
(
old_src_partial_original_id
,
StepSize
,
direction
);
// update "mThreadSrcOriginalMultiId"
static_for
<
0
,
decltype
(
src_partial_original_dims
)
::
GetSize
(),
1
>
{}([
&
](
auto
I
)
{
constexpr
auto
IDimOriginal
=
src_partial_original_dims
[
I
];
mThreadSrcOriginalMultiId
(
IDimOriginal
)
=
new_src_partial_original_id
[
I
];
});
// calculate new partial offset on this merged dimension
const
index_t
old_src_partial_offset
=
mThreadSrcPartialOffsets
[
IDim
];
const
index_t
new_src_partial_offset
=
src_partial_original_desc
.
GetOffsetFromMultiIndex
(
new_src_partial_original_id
);
// update "mThreadSrcPartialOffsets"
mThreadSrcPartialOffsets
(
IDim
)
=
new_src_partial_offset
;
// update "mThreadSrcOffset", do "+" before "-" to avoid underflow
mThreadSrcOffset
=
(
mThreadSrcOffset
+
new_src_partial_offset
)
-
old_src_partial_offset
;
}).
Else
([
&
](
auto
)
{
// Logic for non-merged dimension. If you are never going to move the slicing window on
// a merged dimension, then "mThreadSrcOriginalMultiId" and "mThreadSrcPartialOffsets",
// which are being calculated here, will never be used later. In this case, compiler
// should be able to remove these calculations.
// TODO: make sure compiler would actually remove them in this case.
// It is the user's responsiblity to make sure the slicing window will not be moved out
// of the boundary of the tensor being sliced. Otherwise, there might be hazard like
// unsigned integer underflow. That is NO runtime sanity check to prevent the hazard
constexpr
auto
IDimOriginal
=
SrcDesc
::
GetContainedOriginalDimensions
(
IDim
).
Front
();
static_if
<
PositiveDirection
>
{}([
&
](
auto
fwd
)
{
mThreadSrcOffset
+=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
mThreadSrcOriginalMultiId
(
IDimOriginal
)
+=
StepSize
;
mThreadSrcPartialOffsets
(
IDim
)
+=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
}).
Else
([
&
](
auto
fwd
)
{
mThreadSrcOffset
-=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
mThreadSrcOriginalMultiId
(
IDimOriginal
)
-=
StepSize
;
mThreadSrcPartialOffsets
(
IDim
)
-=
StepSize
*
fwd
(
SrcDesc
{}).
GetStride
(
IDim
);
});
});
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
static_for
<
0
,
nDim
,
1
>
{}([
&
](
auto
idim
)
{
if
(
step_sizes
[
idim
]
!=
0
)
{
MoveSlicingWindowOnSourceTensor
(
idim
,
step_sizes
[
idim
],
positive_direction
);
}
});
}
};
// This version use TensorCoordiante
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
template
<
index_t
BlockSize
,
typename
SrcDesc
,
typename
DstDesc
,
typename
SliceLengths
,
typename
SubLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
BlockwiseGenericTensorSliceCopy_v2
{
static
constexpr
index_t
nDim
=
SrcDesc
::
GetNumOfDimension
();
using
Index
=
MultiIndex
<
nDim
>
;
__device__
constexpr
BlockwiseGenericTensorSliceCopy_v2
(
const
Index
&
src_block_slice_origin
,
const
Index
&
dst_block_slice_origin
)
{
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SubLengths
::
GetSize
()
&&
nDim
==
ThreadClusterLengths
::
GetSize
()
&&
nDim
==
ThreadClusterArrangeOrder
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! nDim not consistent"
);
static_assert
(
is_same
<
SliceLengths
,
decltype
(
SubLengths
{}
*
ThreadClusterLengths
{})
>
{},
"wrong! threads should be mapped to cover entire slicing window"
);
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_packed
(
ThreadClusterLengths
::
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize not consistent with ThreadClusterLengths"
);
const
auto
thread_cluster_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
const
auto
data_cluster_id
=
reorder_array_given_old2new
(
thread_cluster_id
,
ThreadClusterArrangeOrder
{});
const
auto
thread_data_id_begin
=
data_cluster_id
*
SubLengths
{};
mThreadwiseLoad
.
SetSrcSliceOrigin
(
src_block_slice_origin
+
thread_data_id_begin
);
mThreadwiseLoad
.
SetDstSliceOrigin
(
make_zero_array
<
index_t
,
nDim
>
());
mThreadwiseStore
.
SetSrcSliceOrigin
(
make_zero_array
<
index_t
,
nDim
>
());
mThreadwiseStore
.
SetDstSliceOrigin
(
dst_block_slice_origin
+
thread_data_id_begin
);
}
__device__
static
constexpr
index_t
GetThreadBufferSize
()
{
return
ThreadBufferDesc
::
GetElementSpace
();
}
template
<
typename
TData
,
address_space_t
BlockSrcAddressSpace
=
address_space_t
::
generic
,
address_space_t
ThreadBufferAddressSpace
=
address_space_t
::
generic
>
__device__
void
RunLoadThreadBuffer
(
const
TData
*
p_block_src
,
TData
*
p_thread_buffer
)
const
{
#if 0
mThreadwiseLoad.Run(p_block_src, p_thread_buffer);
#else
// tweaking
mThreadwiseLoad
.
template
Run_optimized_address_calculation
<
TData
,
BlockSrcAddressSpace
,
ThreadBufferAddressSpace
>(
p_block_src
,
p_thread_buffer
);
#endif
}
template
<
typename
TData
,
address_space_t
ThreadBufferAddressSpace
=
address_space_t
::
generic
,
address_space_t
BlockDstAddressSpace
=
address_space_t
::
generic
>
__device__
void
RunStoreThreadBuffer
(
const
TData
*
p_thread_buffer
,
TData
*
p_block_dst
)
const
{
#if 0
mThreadwiseStore.Run(p_thread_buffer, p_block_dst);
#else
// tweaking
mThreadwiseStore
.
template
Run_optimized_address_calculation
<
TData
,
ThreadBufferAddressSpace
,
BlockDstAddressSpace
>(
p_thread_buffer
,
p_block_dst
);
#endif
}
template
<
typename
TData
,
address_space_t
BlockSrcAddressSpace
=
address_space_t
::
generic
,
address_space_t
BlockDstAddressSpace
=
address_space_t
::
generic
>
__device__
void
Run
(
const
TData
*
p_block_src
,
TData
*
p_block_dst
)
const
{
TData
p_thread_buffer
[
GetThreadBufferSize
()];
RunLoadThreadBuffer
<
TData
,
BlockSrcAddressSpace
,
address_space_t
::
generic
>
(
p_block_src
,
p_thread_buffer
);
RunStoreThreadBuffer
<
TData
,
address_space_t
::
generic
,
BlockDstAddressSpace
>
(
p_thread_buffer
,
p_block_dst
);
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
mThreadwiseLoad
.
MoveSrcSliceWindow
(
step_sizes
,
positive_direction
);
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveDstSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
mThreadwiseStore
.
MoveDstSliceWindow
(
step_sizes
,
positive_direction
);
}
private:
using
ThreadBufferDesc
=
decltype
(
make_ConstantTensorDescriptor_packed
(
SubLengths
{}));
using
ThreadwiseLoad
=
ThreadwiseGenericTensorSliceCopy_v2r1
<
SrcDesc
,
ThreadBufferDesc
,
SubLengths
,
SrcDimAccessOrder
,
SrcDimAccessOrder
,
SrcVectorAccessDim
,
SrcVectorAccessDim
,
SrcDataPerAccess
,
1
>
;
using
ThreadwiseStore
=
ThreadwiseGenericTensorSliceCopy_v2r1
<
ThreadBufferDesc
,
DstDesc
,
SubLengths
,
DstDimAccessOrder
,
DstDimAccessOrder
,
DstVectorAccessDim
,
DstVectorAccessDim
,
1
,
DstDataPerAccess
>
;
ThreadwiseLoad
mThreadwiseLoad
;
ThreadwiseStore
mThreadwiseStore
;
};
// this version use TensorView and TensorCoordinate_deprecated
template
<
index_t
BlockSize
,
typename
SrcTensor
,
typename
DstTensor
,
typename
SliceLengths
,
typename
SubLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
BlockwiseGenericTensorSliceCopy_v3
{
static
constexpr
index_t
nDim
=
SrcTensor
::
GetNumOfDimension
();
using
data_type
=
remove_cv_t
<
typename
SrcTensor
::
data_type
>
;
using
SrcCoordinate
=
typename
SrcTensor
::
coordinate_type
;
using
DstCoordinate
=
typename
DstTensor
::
coordinate_type
;
__device__
constexpr
BlockwiseGenericTensorSliceCopy_v3
(
SrcTensor
src_block
,
SrcCoordinate
src_block_slice_origin
,
DstTensor
dst_block
,
DstCoordinate
dst_block_slice_origin
)
:
mThreadBuffer
{
make_TensorView
(
ThreadBufferDesc
{},
mpBuffer
)}
{
static_assert
(
nDim
==
SrcTensor
::
GetNumOfDimension
()
&&
nDim
==
DstTensor
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SubLengths
::
GetSize
()
&&
nDim
==
ThreadClusterLengths
::
GetSize
()
&&
nDim
==
ThreadClusterArrangeOrder
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! nDim not consistent"
);
static_assert
(
is_same
<
SliceLengths
,
decltype
(
SubLengths
{}
*
ThreadClusterLengths
{})
>
{},
"wrong! threads should be mapped to cover entire slicing window"
);
static_assert
(
is_same
<
remove_cv_t
<
typename
SrcTensor
::
data_type
>
,
remove_cv_t
<
typename
DstTensor
::
data_type
>>
{},
"wrong! type conversion not supported yet"
);
constexpr
auto
thread_cluster_desc
=
make_ConstantTensorDescriptor_packed
(
ThreadClusterLengths
::
ReorderGivenNew2Old
(
ThreadClusterArrangeOrder
{}));
static_assert
(
BlockSize
==
thread_cluster_desc
.
GetElementSize
(),
"wrong! BlockSize not consistent with ThreadClusterLengths"
);
const
auto
thread_cluster_id
=
thread_cluster_desc
.
GetMultiIndexFrom1dIndex
(
get_thread_local_1d_id
());
const
auto
data_cluster_id
=
reorder_array_given_old2new
(
thread_cluster_id
,
ThreadClusterArrangeOrder
{});
const
auto
thread_data_id_begin
=
data_cluster_id
*
SubLengths
{};
mThreadwiseLoad
=
ThreadwiseLoad
(
src_block
,
src_block_slice_origin
+
thread_data_id_begin
,
mThreadBuffer
,
make_zero_array
<
index_t
,
nDim
>
());
mThreadwiseStore
=
ThreadwiseStore
(
mThreadBuffer
,
make_zero_array
<
index_t
,
nDim
>
(),
dst_block
,
dst_block_slice_origin
+
thread_data_id_begin
);
}
__device__
void
RunLoadRegisterBuffer
()
{
mThreadwiseLoad
.
Run
();
}
__device__
void
RunStoreRegisterBuffer
()
const
{
mThreadwiseStore
.
Run
();
}
__device__
void
Run
()
{
mThreadwiseLoad
.
Run
();
mThreadwiseStore
.
Run
();
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
mThreadwiseLoad
.
MoveSrcSliceWindow
(
step_sizes
,
positive_direction
);
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveDstSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
positive_direction
)
{
mThreadwiseStore
.
MoveDstSliceWindow
(
step_sizes
,
positive_direction
);
}
private:
using
ThreadBufferDesc
=
decltype
(
make_ConstantTensorDescriptor_packed
(
SubLengths
{}));
using
ThreadBufferTensor
=
NormalTensorView
<
ThreadBufferDesc
,
data_type
>
;
using
ThreadwiseLoad
=
ThreadwiseGenericTensorSliceCopy_v3r1
<
SrcTensor
,
ThreadBufferTensor
,
SubLengths
,
SrcDimAccessOrder
,
SrcDimAccessOrder
,
SrcVectorAccessDim
,
SrcVectorAccessDim
,
SrcDataPerAccess
,
1
>
;
using
ThreadwiseStore
=
ThreadwiseGenericTensorSliceCopy_v3r1
<
ThreadBufferTensor
,
DstTensor
,
SubLengths
,
DstDimAccessOrder
,
DstDimAccessOrder
,
DstVectorAccessDim
,
DstVectorAccessDim
,
1
,
DstDataPerAccess
>
;
data_type
mpBuffer
[
ThreadBufferDesc
::
GetElementSpace
()];
ThreadBufferTensor
mThreadBuffer
;
ThreadwiseLoad
mThreadwiseLoad
;
ThreadwiseStore
mThreadwiseStore
;
};
}
// namespace ck
#endif
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
View file @
b12bbcee
...
@@ -4,1124 +4,18 @@
...
@@ -4,1124 +4,18 @@
#include "common_header.hpp"
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "tensor_coordinate
_v2
.hpp"
#include "tensor_coordinate.hpp"
#ifndef CK_
EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#ifndef CK_
USE_AMD_INTRINSIC
#define CK_
EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#define CK_
USE_AMD_INTRINSIC 1
#endif
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
#endif
namespace
ck
{
namespace
ck
{
#if 0
// This threadwise copy allow vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows order of access to be different on src and dst.
// It use register as buffer to hold all data moving from src to dst.
// It is designed for copying small amount of data, and src and dst are
// device memory or LDS.
// When copying large amout of data, let's hope compiler will reduce register
// used for the buffer.
template <typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v1r1
{
static constexpr index_t nDim = SliceLengths::GetSize();
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1(
Array<index_t, nDim> src_slice_origin, Array<index_t, nDim> dst_slice_origin)
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
{
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
nDim == SrcDimAccessOrder::GetSize() &&
nDim == DstDimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
is_valid_sequence_map<DstDimAccessOrder>::value,
"wrong! map is not valid");
static_assert(SliceLengths{}[SrcVectorAccessDim] % SrcDataPerAccess == 0 &&
SliceLengths{}[DstVectorAccessDim] % DstDataPerAccess == 0,
"wrong! cannot evenly divide");
// check vectorized memory access
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
static_if<!SrcDesc::ContainMultipleOriginalDimensions(src_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetStride(src_vector_access_dim) == 1 || SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(SrcDesc{}).GetLastOriginalDimensionStride(src_vector_access_dim) == 1 ||
SrcDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
static_if<!DstDesc::ContainMultipleOriginalDimensions(dst_vector_access_dim)>{}(
[&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetStride(dst_vector_access_dim) == 1 || DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
})
.Else([&](auto fwd) {
static_assert(
(fwd(DstDesc{}).GetLastOriginalDimensionStride(dst_vector_access_dim) == 1 ||
DstDataPerAccess == 1),
"wrong! vectorized access is allowed only if stride == 1");
});
}
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v1r1()
: ThreadwiseGenericTensorSliceCopy_v1r1(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
{
}
__device__ void SetSrcSliceOrigin(Array<index_t, nDim> src_slice_origin)
{
mSrcSliceOrigin = src_slice_origin;
}
__device__ void SetDstSliceOrigin(Array<index_t, nDim> dst_slice_origin)
{
mDstSliceOrigin = dst_slice_origin;
}
template <typename TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
TData p_buffer_[buffer_desc.GetElementSpace()];
TData* p_buffer = p_buffer_;
// copy data from src into buffer
{
using vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
constexpr auto src_access_lengths = SliceLengths::Modify(
src_vector_access_dim,
SliceLengths::Get(src_vector_access_dim) / src_data_per_access);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
constexpr auto src_data_begin_id = src_access_id.Modify(
src_vector_access_dim,
src_access_id[src_vector_access_dim] * src_data_per_access);
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
// load vector from src
const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
// unpack vector into buffer
static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
constexpr auto scalar_id =
typename uniform_sequence_gen<nDim, 0>::type{}.Modify(src_vector_access_dim,
i);
constexpr index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
});
});
#else
ford<decltype(src_access_lengths), SrcDimAccessOrder>{}([&](auto src_access_id) {
auto src_data_begin_id = src_access_id;
src_data_begin_id(src_vector_access_dim) =
src_access_id[src_vector_access_dim] * src_data_per_access;
const index_t src_offset =
SrcDesc::GetOffsetFromMultiIndex(mSrcSliceOrigin + src_data_begin_id);
// load vector from src
const vector_t vector_data = *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
// unpack vector into buffer
for(index_t i = 0; i < SrcDataPerAccess; ++i)
{
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(src_vector_access_dim) = i;
const index_t buffer_offset =
buffer_desc.GetOffsetFromMultiIndex(src_data_begin_id + scalar_id);
p_buffer[buffer_offset] = reinterpret_cast<const TData*>(&vector_data)[i];
}
});
#endif
}
// copy data from buffer to dst
{
using
vector_t
=
typename
vector_type
<
TData
,
DstDataPerAccess
>::
MemoryType
;
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
constexpr
auto
dst_data_per_access
=
Number
<
DstDataPerAccess
>
{};
constexpr
auto
dst_access_lengths
=
SliceLengths
::
Modify
(
dst_vector_access_dim
,
SliceLengths
::
Get
(
dst_vector_access_dim
)
/
dst_data_per_access
);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford
<
decltype
(
dst_access_lengths
),
DstDimAccessOrder
>
{}([
&
](
auto
dst_access_id
)
{
constexpr
auto
dst_data_begin_id
=
dst_access_id
.
Modify
(
dst_vector_access_dim
,
dst_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
);
vector_t
vector_data
{};
// pack vector from buffer
static_for
<
0
,
DstDataPerAccess
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
dst_vector_access_dim
,
i
);
constexpr
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
dst_data_begin_id
+
scalar_id
);
reinterpret_cast
<
TData
*>
(
&
vector_data
)[
i
]
=
p_buffer
[
buffer_offset
];
});
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
dst_data_begin_id
);
// store vector into dst
*
reinterpret_cast
<
vector_t
*>
(
&
p_dst
[
dst_offset
])
=
vector_data
;
});
#else
ford
<
decltype
(
dst_access_lengths
),
DstDimAccessOrder
>
{}([
&
](
auto
dst_access_id
)
{
auto
dst_data_begin_id
=
dst_access_id
;
dst_data_begin_id
(
dst_vector_access_dim
)
=
dst_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
;
vector_t
vector_data
{};
// pack vector from buffer
for
(
index_t
i
=
0
;
i
<
DstDataPerAccess
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
dst_vector_access_dim
)
=
i
;
const
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
dst_data_begin_id
+
scalar_id
);
reinterpret_cast
<
TData
*>
(
&
vector_data
)[
i
]
=
p_buffer
[
buffer_offset
];
}
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
dst_data_begin_id
);
// store vector into dst
*
reinterpret_cast
<
vector_t
*>
(
&
p_dst
[
dst_offset
])
=
vector_data
;
});
#endif
}
}
private
:
Array
<
index_t
,
nDim
>
mSrcSliceOrigin
;
Array
<
index_t
,
nDim
>
mDstSliceOrigin
;
}
;
// This threadwise copy allow vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases, where one of src and dst is register, and
// the other is device memory or LDS
template
<
typename
SrcDesc
,
typename
DstDesc
,
typename
SliceLengths
,
typename
DimAccessOrder
,
index_t
VectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
ThreadwiseGenericTensorSliceCopy_v1r2
{
static
constexpr
index_t
nDim
=
SliceLengths
::
GetSize
();
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1r2
(
Array
<
index_t
,
nDim
>
src_slice_origin
,
Array
<
index_t
,
nDim
>
dst_slice_origin
)
:
mSrcSliceOrigin
(
src_slice_origin
),
mDstSliceOrigin
(
dst_slice_origin
)
{
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
DimAccessOrder
::
GetSize
(),
"wrong! # of dimensions not the same"
);
static_assert
(
is_valid_sequence_map
<
DimAccessOrder
>::
value
,
"wrong! map is not valid"
);
static_assert
(
SliceLengths
{}[
VectorAccessDim
]
%
math
::
lcm
(
SrcDataPerAccess
,
DstDataPerAccess
)
==
0
,
"wrong! cannot evenly divide"
);
// check vectorized memory access
constexpr
auto
vector_access_dim
=
Number
<
VectorAccessDim
>
{};
static_if
<!
SrcDesc
::
ContainMultipleOriginalDimensions
(
vector_access_dim
)
>
{}([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetStride
(
vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
}).
Else
([
&
](
auto
fwd
)
{
static_assert
((
fwd
(
SrcDesc
{}).
GetLastOriginalDimensionStride
(
vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
static_if
<!
DstDesc
::
ContainMultipleOriginalDimensions
(
vector_access_dim
)
>
{}([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetStride
(
vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
}).
Else
([
&
](
auto
fwd
)
{
static_assert
((
fwd
(
DstDesc
{}).
GetLastOriginalDimensionStride
(
vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
}
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1r2
()
:
ThreadwiseGenericTensorSliceCopy_v1r2
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
{
}
__device__
void
SetSrcSliceOrigin
(
Array
<
index_t
,
nDim
>
src_slice_origin
)
{
mSrcSliceOrigin
=
src_slice_origin
;
}
__device__
void
SetDstSliceOrigin
(
Array
<
index_t
,
nDim
>
dst_slice_origin
)
{
mDstSliceOrigin
=
dst_slice_origin
;
}
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
p_src
,
TData
*
p_dst
)
const
{
using
src_vector_t
=
typename
vector_type
<
TData
,
SrcDataPerAccess
>::
MemoryType
;
using
dst_vector_t
=
typename
vector_type
<
TData
,
DstDataPerAccess
>::
MemoryType
;
constexpr
auto
vector_access_dim
=
Number
<
VectorAccessDim
>
{};
constexpr
auto
src_data_per_access
=
Number
<
SrcDataPerAccess
>
{};
constexpr
auto
dst_data_per_access
=
Number
<
DstDataPerAccess
>
{};
constexpr
auto
long_vector_size
=
Number
<
math
::
lcm
(
SrcDataPerAccess
,
DstDataPerAccess
)
>
{};
constexpr
auto
long_vector_access_lengths
=
SliceLengths
::
Modify
(
vector_access_dim
,
SliceLengths
::
Get
(
vector_access_dim
)
/
long_vector_size
);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
static_ford
<
decltype
(
long_vector_access_lengths
),
DimAccessOrder
>
{}([
&
](
auto
long_vector_access_id
)
{
// data id w.r.t slicing-window
constexpr
auto
long_vector_data_begin_id
=
long_vector_access_id
.
Modify
(
vector_access_dim
,
long_vector_access_id
[
vector_access_dim
]
*
long_vector_size
);
// buffer to hold a long-vector
TData
p_long_vector
[
long_vector_size
];
// load data from src to the long-vector buffer
static_for
<
0
,
long_vector_size
/
src_data_per_access
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
vector_access_dim
,
i
*
src_data_per_access
);
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
constexpr
index_t
buffer_offset
=
i
*
src_data_per_access
;
*
reinterpret_cast
<
src_vector_t
*>
(
&
p_long_vector
[
buffer_offset
])
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src
[
src_offset
]);
});
// store data from the long-vector buffer to dst
static_for
<
0
,
long_vector_size
/
dst_data_per_access
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
vector_access_dim
,
i
*
dst_data_per_access
);
constexpr
index_t
buffer_offset
=
i
*
dst_data_per_access
;
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_offset
])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_long_vector
[
buffer_offset
]);
});
});
#else
ford
<
decltype
(
long_vector_access_lengths
),
DimAccessOrder
>
{}(
[
&
](
auto
long_vector_access_id
)
{
// data id w.r.t slicing-window
auto
long_vector_data_begin_id
=
long_vector_access_id
;
long_vector_data_begin_id
(
vector_access_dim
)
=
long_vector_size
*
long_vector_access_id
[
vector_access_dim
];
// buffer to hold a long-vector
TData
p_long_vector
[
long_vector_size
];
// load data from src to the long-vector buffer
for
(
index_t
i
=
0
;
i
<
long_vector_size
/
src_data_per_access
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
vector_access_dim
)
=
i
*
src_data_per_access
;
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
const
index_t
buffer_offset
=
i
*
src_data_per_access
;
*
reinterpret_cast
<
src_vector_t
*>
(
&
p_long_vector
[
buffer_offset
])
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src
[
src_offset
]);
}
// store data from the long-vector buffer to dst
for
(
index_t
i
=
0
;
i
<
long_vector_size
/
dst_data_per_access
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
vector_access_dim
)
=
i
*
dst_data_per_access
;
const
index_t
buffer_offset
=
i
*
dst_data_per_access
;
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_offset
])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_long_vector
[
buffer_offset
]);
}
});
#endif
}
private:
Array
<
index_t
,
nDim
>
mSrcSliceOrigin
;
Array
<
index_t
,
nDim
>
mDstSliceOrigin
;
};
// This version use TensorCoordinate
// This threadwise copy allow vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows order of access to be different on src and dst.
// It use register as buffer to hold all data moving from src to dst.
// It is designed for copying small amount of data, and src and dst are
// device memory or LDS.
// When copying large amout of data, let's hope compiler will reduce register
// used for the buffer.
template
<
typename
SrcDesc
,
typename
DstDesc
,
typename
SliceLengths
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
ThreadwiseGenericTensorSliceCopy_v2r1
{
static
constexpr
index_t
nDim
=
SliceLengths
::
GetSize
();
using
Index
=
MultiIndex
<
nDim
>
;
using
SrcCoordinate
=
typename
TensorCoordinate
<
SrcDesc
>::
type
;
using
DstCoordinate
=
typename
TensorCoordinate
<
DstDesc
>::
type
;
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v2r1
(
const
Index
&
src_slice_origin
,
const
Index
&
dst_slice_origin
)
:
mSrcSliceOrigin
(
src_slice_origin
),
mDstSliceOrigin
(
dst_slice_origin
)
{
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! # of dimensions not the same"
);
static_assert
(
is_valid_sequence_map
<
SrcDimAccessOrder
>::
value
&&
is_valid_sequence_map
<
DstDimAccessOrder
>::
value
,
"wrong! map is not valid"
);
static_assert
(
SliceLengths
{}[
SrcVectorAccessDim
]
%
SrcDataPerAccess
==
0
&&
SliceLengths
{}[
DstVectorAccessDim
]
%
DstDataPerAccess
==
0
,
"wrong! cannot evenly divide"
);
// check vectorized memory access
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
static_if
<!
SrcDesc
::
ContainMultipleOriginalDimensions
(
src_vector_access_dim
)
>
{}(
[
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetStride
(
src_vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
})
.
Else
([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetLastOriginalDimensionStride
(
src_vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
static_if
<!
DstDesc
::
ContainMultipleOriginalDimensions
(
dst_vector_access_dim
)
>
{}(
[
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetStride
(
dst_vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
})
.
Else
([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetLastOriginalDimensionStride
(
dst_vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
}
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v2r1
()
:
ThreadwiseGenericTensorSliceCopy_v2r1
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
{
}
__device__
void
SetSrcSliceOrigin
(
SrcCoordinate
src_slice_origin
)
{
mSrcSliceOrigin
=
src_slice_origin
;
}
__device__
void
SetDstSliceOrigin
(
DstCoordinate
dst_slice_origin
)
{
mDstSliceOrigin
=
dst_slice_origin
;
}
template
<
typename
TDesc
,
class
Lengths
>
struct
IsolateMergedDimLengths
{
template
<
typename
IDim
>
__device__
constexpr
index_t
operator
()(
IDim
idim
)
const
{
return
TDesc
::
ContainMultipleOriginalDimensions
(
idim
)
?
Lengths
{}[
idim
]
:
1
;
}
};
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
p_src
,
TData
*
p_dst
)
const
{
constexpr
auto
buffer_desc
=
make_ConstantTensorDescriptor_packed
(
SliceLengths
{});
TData
p_buffer_
[
buffer_desc
.
GetElementSpace
()];
TData
*
p_buffer
=
p_buffer_
;
// copy data from src into buffer
{
using
src_vector_t
=
typename
vector_type
<
TData
,
SrcDataPerAccess
>::
MemoryType
;
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr
auto
src_data_per_access
=
Number
<
SrcDataPerAccess
>
{};
constexpr
auto
src_access_lengths
=
SliceLengths
::
Modify
(
src_vector_access_dim
,
SliceLengths
::
Get
(
src_vector_access_dim
)
/
src_data_per_access
);
// Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t
// normal dimensions is known at compile time.
// Below is a hack to isolate merged dimension id from normal dimension id, so the
// corresponding offset can be calculated seperately at run-time and compile-time.
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// merged dimensions, and has value = 1 on normal dimensions;
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// normal dimensions, and has value = 1 on merged dimensions;
constexpr
auto
src_merged_dim_access_lengths
=
typename
sequence_gen
<
nDim
,
IsolateMergedDimLengths
<
SrcDesc
,
decltype
(
src_access_lengths
)
>>::
type
{};
constexpr
auto
src_normal_dim_access_lengths
=
src_access_lengths
+
Number
<
1
>
{}
-
src_merged_dim_access_lengths
;
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offset w.r.t. merged dimension need to be computed at run-time
static_ford
<
decltype
(
src_merged_dim_access_lengths
),
SrcDimAccessOrder
>
{}([
&
](
auto
src_merged_dim_access_id_
)
{
constexpr
auto
src_merged_dim_access_id
=
decltype
(
src_merged_dim_access_id_
){};
constexpr
auto
src_merged_dim_data_id
=
src_merged_dim_access_id
.
Modify
(
src_vector_access_dim
,
src_merged_dim_access_id
[
src_vector_access_dim
]
*
src_data_per_access
);
const
TData
*
p_src_tmp
=
p_src
+
(
mSrcSliceOrigin
+
src_merged_dim_data_id
).
GetOffset
();
// offset w.r.t. normal dimension can be computed at compile-time
static_ford
<
decltype
(
src_normal_dim_access_lengths
),
SrcDimAccessOrder
>
{}([
&
](
auto
src_normal_dim_access_id_
)
{
constexpr
auto
src_normal_dim_access_id
=
decltype
(
src_normal_dim_access_id_
){};
constexpr
auto
src_normal_dim_data_id
=
src_normal_dim_access_id
.
Modify
(
src_vector_access_dim
,
src_normal_dim_access_id
[
src_vector_access_dim
]
*
src_data_per_access
);
constexpr
index_t
src_normal_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_normal_dim_data_id
);
// load vector from src
const
src_vector_t
vector_data
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src_tmp
[
src_normal_offset
]);
// unpack vector into buffer
static_for
<
0
,
SrcDataPerAccess
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
src_vector_access_dim
,
i
);
constexpr
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
src_merged_dim_data_id
+
src_normal_dim_data_id
+
scalar_id
);
p_buffer
[
buffer_offset
]
=
reinterpret_cast
<
const
TData
*>
(
&
vector_data
)[
i
];
});
});
});
#else
ford
<
decltype
(
src_merged_dim_access_lengths
),
SrcDimAccessOrder
>
{}([
&
](
auto
src_merged_dim_access_id
)
{
auto
src_merged_dim_data_id
=
src_merged_dim_access_id
;
src_merged_dim_data_id
(
src_vector_access_dim
)
=
src_merged_dim_access_id
[
src_vector_access_dim
]
*
src_data_per_access
;
const
TData
*
p_src_tmp
=
p_src
+
(
mSrcSliceOrigin
+
src_merged_dim_data_id
).
GetOffset
();
// these should be compile-time known
ford
<
decltype
(
src_normal_dim_access_lengths
),
SrcDimAccessOrder
>
{}([
&
](
auto
src_normal_dim_access_id
)
{
auto
src_normal_dim_data_id
=
src_normal_dim_access_id
;
src_normal_dim_data_id
(
src_vector_access_dim
)
=
src_normal_dim_access_id
[
src_vector_access_dim
]
*
src_data_per_access
;
const
index_t
src_normal_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_normal_dim_data_id
);
// load vector from src
const
src_vector_t
vector_data
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src_tmp
[
src_normal_offset
]);
// unpack vector into buffer
for
(
index_t
i
=
0
;
i
<
SrcDataPerAccess
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
src_vector_access_dim
)
=
i
;
const
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
src_merged_dim_data_id
+
src_normal_dim_data_id
+
scalar_id
);
p_buffer
[
buffer_offset
]
=
reinterpret_cast
<
const
TData
*>
(
&
vector_data
)[
i
];
}
});
});
#endif
}
// copy data from buffer into dst
{
using
dst_vector_t
=
typename
vector_type
<
TData
,
DstDataPerAccess
>::
MemoryType
;
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
constexpr
auto
dst_data_per_access
=
Number
<
DstDataPerAccess
>
{};
constexpr
auto
dst_access_lengths
=
SliceLengths
::
Modify
(
dst_vector_access_dim
,
SliceLengths
::
Get
(
dst_vector_access_dim
)
/
dst_data_per_access
);
constexpr
auto
dst_merged_dim_access_lengths
=
typename
sequence_gen
<
nDim
,
IsolateMergedDimLengths
<
DstDesc
,
decltype
(
dst_access_lengths
)
>>::
type
{};
constexpr
auto
dst_normal_dim_access_lengths
=
dst_access_lengths
+
Number
<
1
>
{}
-
dst_merged_dim_access_lengths
;
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
// offset w.r.t. merged dimension need to be computed at run-time
static_ford
<
decltype
(
dst_merged_dim_access_lengths
),
DstDimAccessOrder
>
{}([
&
](
auto
dst_merged_dim_access_id_
)
{
constexpr
auto
dst_merged_dim_access_id
=
decltype
(
dst_merged_dim_access_id_
){};
constexpr
auto
dst_merged_dim_data_id
=
dst_merged_dim_access_id
.
Modify
(
dst_vector_access_dim
,
dst_merged_dim_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
);
TData
*
p_dst_tmp
=
p_dst
+
(
mDstSliceOrigin
+
dst_merged_dim_data_id
).
GetOffset
();
// offset w.r.t. normal dimension can be computed at compile-time
static_ford
<
decltype
(
dst_normal_dim_access_lengths
),
DstDimAccessOrder
>
{}([
&
](
auto
dst_normal_dim_access_id_
)
{
constexpr
auto
dst_normal_dim_access_id
=
decltype
(
dst_normal_dim_access_id_
){};
constexpr
auto
dst_normal_dim_data_id
=
dst_normal_dim_access_id
.
Modify
(
dst_vector_access_dim
,
dst_normal_dim_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
);
dst_vector_t
vector_data
;
// pack vector from buffer
static_for
<
0
,
DstDataPerAccess
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
dst_vector_access_dim
,
i
);
constexpr
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
dst_merged_dim_data_id
+
dst_normal_dim_data_id
+
scalar_id
);
reinterpret_cast
<
TData
*>
(
&
vector_data
)[
i
]
=
p_buffer
[
buffer_offset
];
});
constexpr
index_t
dst_normal_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_normal_dim_data_id
);
// write vector into dst
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst_tmp
[
dst_normal_offset
])
=
vector_data
;
});
});
#else
// offset w.r.t. merged dimension need to be computed at run-time
ford
<
decltype
(
dst_merged_dim_access_lengths
),
DstDimAccessOrder
>
{}([
&
](
auto
dst_merged_dim_access_id
)
{
auto
dst_merged_dim_data_id
=
dst_merged_dim_access_id
;
dst_merged_dim_data_id
(
dst_vector_access_dim
)
=
dst_merged_dim_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
;
TData
*
p_dst_tmp
=
p_dst
+
(
mDstSliceOrigin
+
dst_merged_dim_data_id
).
GetOffset
();
// offset w.r.t. normal dimension can be computed at compile-time
ford
<
decltype
(
dst_normal_dim_access_lengths
),
DstDimAccessOrder
>
{}([
&
](
auto
dst_normal_dim_access_id
)
{
auto
dst_normal_dim_data_id
=
dst_normal_dim_access_id
;
dst_normal_dim_data_id
(
dst_vector_access_dim
)
=
dst_normal_dim_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
;
dst_vector_t
vector_data
;
// pack vector from buffer
for
(
index_t
i
=
0
;
i
<
DstDataPerAccess
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
dst_vector_access_dim
)
=
i
;
const
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
dst_merged_dim_data_id
+
dst_normal_dim_data_id
+
scalar_id
);
reinterpret_cast
<
TData
*>
(
&
vector_data
)[
i
]
=
p_buffer
[
buffer_offset
];
}
const
index_t
dst_normal_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_normal_dim_data_id
);
// write vector into dst
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst_tmp
[
dst_normal_offset
])
=
vector_data
;
});
});
#endif
}
}
// memory-space
// 0: VGPR
// 1: LDS
// 2: global-memory
template
<
typename
TData
,
index_t
SrcMemorySpace
,
index_t
DstMemorySpace
>
__device__
void
Run_amd_experiment
(
const
TData
*
p_src
,
TData
*
p_dst
)
const
{
constexpr
auto
buffer_desc
=
make_ConstantTensorDescriptor_packed
(
SliceLengths
{});
TData
p_buffer_
[
buffer_desc
.
GetElementSpace
()];
TData
*
p_buffer
=
p_buffer_
;
// copy data from src into buffer
{
using
src_vector_t
=
typename
vector_type
<
TData
,
SrcDataPerAccess
>::
MemoryType
;
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr
auto
src_data_per_access
=
Number
<
SrcDataPerAccess
>
{};
constexpr
auto
src_access_lengths
=
SliceLengths
::
Modify
(
src_vector_access_dim
,
SliceLengths
::
Get
(
src_vector_access_dim
)
/
src_data_per_access
);
// Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t
// normal dimensions is known at compile time.
// Below is a hack to isolate merged dimension id from normal dimension id, so the
// corresponding offset can be calculated seperately at run-time and compile-time.
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// merged dimensions, and has value = 1 on normal dimensions;
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// normal dimensions, and has value = 1 on merged dimensions;
constexpr
auto
src_merged_dim_access_lengths
=
typename
sequence_gen
<
nDim
,
IsolateMergedDimLengths
<
SrcDesc
,
decltype
(
src_access_lengths
)
>>::
type
{};
constexpr
auto
src_normal_dim_access_lengths
=
src_access_lengths
+
Number
<
1
>
{}
-
src_merged_dim_access_lengths
;
ford
<
decltype
(
src_merged_dim_access_lengths
),
SrcDimAccessOrder
>
{}([
&
](
auto
src_merged_dim_access_id
)
{
auto
src_merged_dim_data_id
=
src_merged_dim_access_id
;
src_merged_dim_data_id
(
src_vector_access_dim
)
=
src_merged_dim_access_id
[
src_vector_access_dim
]
*
src_data_per_access
;
// offset w.r.t. merged dimension need be computed at run-time,
const
index_t
src_merged_offset
=
(
mSrcSliceOrigin
+
src_merged_dim_data_id
).
GetOffset
();
ford
<
decltype
(
src_normal_dim_access_lengths
),
SrcDimAccessOrder
>
{}([
&
](
auto
src_normal_dim_access_id
)
{
auto
src_normal_dim_data_id
=
src_normal_dim_access_id
;
src_normal_dim_data_id
(
src_vector_access_dim
)
=
src_normal_dim_access_id
[
src_vector_access_dim
]
*
src_data_per_access
;
// offset w.r.t. normal dimension is known at compile-time
const
index_t
src_normal_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
src_normal_dim_data_id
);
src_vector_t
vector_data
;
// Read vector from src.
// 1. Source code version can take src of all kinds of memory-space
// 2. Inline asm versions using global_load or buffer_load can only take
// src from global-memory
//
// Commemt for loading from global-memory:
// When
// 1) using source code, in order for compiler to emit optimal
// load instruction, or
// 2) using inline asm (global_load or buffer_load), in order
// for inline asm to be valid,
// following assumptions need to be satisfied:
// 1. p_src need to be block-invariant (assumption)
// 2. src_normal_offset must be calculatd at compile time (guaranteed)
// 3. src_merged_offset can be runtime value (no assumption imposed)
static_if
<
SrcMemorySpace
==
2
>
{}([
&
](
auto
)
{
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
vector_data
=
__buffer_load
<
TData
,
SrcDataPerAccess
>
(
p_src
,
static_cast
<
uint32_t
>
(
src_merged_offset
),
static_cast
<
uint32_t
>
(
src_normal_offset
));
#else
vector_data
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src
[
src_normal_offset
+
src_merged_offset
]);
#endif
}).
Else
([
&
](
auto
)
{
// src can be all kinds of memory-space.
vector_data
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src
[
src_normal_offset
+
src_merged_offset
]);
});
// unpack vector into buffer
for
(
index_t
i
=
0
;
i
<
SrcDataPerAccess
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
src_vector_access_dim
)
=
i
;
const
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
src_merged_dim_data_id
+
src_normal_dim_data_id
+
scalar_id
);
p_buffer
[
buffer_offset
]
=
reinterpret_cast
<
const
TData
*>
(
&
vector_data
)[
i
];
}
});
});
}
// copy data from buffer into dst
{
using
dst_vector_t
=
typename
vector_type
<
TData
,
DstDataPerAccess
>::
MemoryType
;
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
constexpr
auto
dst_data_per_access
=
Number
<
DstDataPerAccess
>
{};
constexpr
auto
dst_access_lengths
=
SliceLengths
::
Modify
(
dst_vector_access_dim
,
SliceLengths
::
Get
(
dst_vector_access_dim
)
/
dst_data_per_access
);
constexpr
auto
dst_merged_dim_access_lengths
=
typename
sequence_gen
<
nDim
,
IsolateMergedDimLengths
<
DstDesc
,
decltype
(
dst_access_lengths
)
>>::
type
{};
constexpr
auto
dst_normal_dim_access_lengths
=
dst_access_lengths
+
Number
<
1
>
{}
-
dst_merged_dim_access_lengths
;
ford
<
decltype
(
dst_merged_dim_access_lengths
),
DstDimAccessOrder
>
{}(
[
&
](
auto
dst_merged_dim_access_id
)
{
auto
dst_merged_dim_data_id
=
dst_merged_dim_access_id
;
dst_merged_dim_data_id
(
dst_vector_access_dim
)
=
dst_merged_dim_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
;
// offset w.r.t. merged dimension need be computed at run-time,
const
index_t
dst_merged_offset
=
(
mDstSliceOrigin
+
dst_merged_dim_data_id
).
GetOffset
();
ford
<
decltype
(
dst_normal_dim_access_lengths
),
DstDimAccessOrder
>
{}([
&
](
auto
dst_normal_dim_access_id
)
{
auto
dst_normal_dim_data_id
=
dst_normal_dim_access_id
;
dst_normal_dim_data_id
(
dst_vector_access_dim
)
=
dst_normal_dim_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
;
dst_vector_t
vector_data
;
// pack vector from buffer
for
(
index_t
i
=
0
;
i
<
DstDataPerAccess
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
dst_vector_access_dim
)
=
i
;
const
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
dst_merged_dim_data_id
+
dst_normal_dim_data_id
+
scalar_id
);
reinterpret_cast
<
TData
*>
(
&
vector_data
)[
i
]
=
p_buffer
[
buffer_offset
];
}
// offset w.r.t. normal dimension is known at compile-time
const
index_t
dst_normal_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
dst_normal_dim_data_id
);
// Write vector into dst.
// 1. Source code version can take dst of all kinds of memory-space
// 2. Inline asm versions using global_store or buffer_store can only take
// dst from global-memory
//
// Commemt for storing into global-memory:
// When
// 1) using source code, in order for compiler to emit optimal
// store instruction, or
// 2) using inline asm (global_store or buffer_store), in order
// for inline asm to be valid,
// following assumptions need to be satisfied:
// 1. p_dst need to be block-invariant (assumption)
// 2. dst_normal_offset must be calculatd at compile time (guaranteed)
// 3. dst_merged_offset can be runtime value (no assumption imposed)
static_if
<
DstMemorySpace
==
2
>
{}([
&
](
auto
)
{
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
__buffer_store
<
TData
,
DstDataPerAccess
>
(
vector_data
,
p_dst
,
dst_merged_offset
,
dst_normal_offset
);
#else
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_normal_offset
+
dst_merged_offset
])
=
vector_data
;
#endif
}).
Else
([
&
](
auto
)
{
// dst can be all kinds of memory-space
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_normal_offset
+
dst_merged_offset
])
=
vector_data
;
});
});
});
}
}
// T can be Sequence or Array
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
mSrcSliceOrigin
+=
step_sizes
;
}).
Else
([
&
](
auto
)
{
mSrcSliceOrigin
-=
step_sizes
;
});
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveDstSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
{
static_if
<
PositiveDirection
>
{}([
&
](
auto
)
{
mDstSliceOrigin
+=
step_sizes
;
}).
Else
([
&
](
auto
)
{
mDstSliceOrigin
-=
step_sizes
;
});
}
private:
SrcCoordinate
mSrcSliceOrigin
;
DstCoordinate
mDstSliceOrigin
;
};
// this version use TensorView and TensorCoordinate
template
<
typename
SrcTensor
,
typename
DstTensor
,
typename
SliceLengths
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
ThreadwiseGenericTensorSliceCopy_v3r1
{
static
constexpr
index_t
nDim
=
SrcTensor
::
GetNumOfDimension
();
using
data_type
=
remove_cv_t
<
typename
SrcTensor
::
data_type
>
;
using
SrcCoordinate
=
typename
SrcTensor
::
coordinate_type
;
using
DstCoordinate
=
typename
DstTensor
::
coordinate_type
;
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v3r1
(
SrcTensor
src
,
SrcCoordinate
src_slice_origin
,
DstTensor
dst
,
DstCoordinate
dst_slice_origin
)
:
mSrc
{
src
},
mDst
{
dst
},
mSrcSlice
{
src
.
Slice
(
src_slice_origin
,
SliceLengths
{})},
mDstSlice
{
dst
.
Slice
(
dst_slice_origin
,
SliceLengths
{})}
{
static_assert
(
nDim
==
SrcTensor
::
GetNumOfDimension
()
&&
nDim
==
DstTensor
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! # of dimensions not the same"
);
static_assert
(
is_valid_sequence_map
<
SrcDimAccessOrder
>::
value
&&
is_valid_sequence_map
<
DstDimAccessOrder
>::
value
,
"wrong! map is not valid"
);
static_assert
(
is_same
<
remove_cv_t
<
typename
SrcTensor
::
data_type
>
,
remove_cv_t
<
typename
DstTensor
::
data_type
>>
{},
"wrong! type conversion is not supported yet"
);
static_assert
(
decltype
(
mSrcSlice
)
::
IsVectorizationAllowed
(
Number
<
SrcVectorAccessDim
>
{},
Number
<
SrcDataPerAccess
>
{})
&&
decltype
(
mDstSlice
)
::
IsVectorizationAllowed
(
Number
<
DstVectorAccessDim
>
{},
Number
<
DstDataPerAccess
>
{}),
"wrong! vectorized access is not allowed"
);
}
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v3r1
()
:
ThreadwiseGenericTensorSliceCopy_v3r1
(
SrcTensor
{},
SrcCoordinate
{},
DstTensor
{},
DstCoordinate
{})
{
}
__device__
void
Run
()
const
{
// buffer
constexpr
auto
buffer_desc
=
make_ConstantTensorDescriptor_packed
(
SrcTensor
::
GetLengths
());
data_type
p_buffer
[
buffer_desc
.
GetElementSpace
()];
auto
buffer
=
make_TensorView
(
buffer_desc
,
p_buffer
);
// copy data from src into buffer
{
using
src_vector_t
=
typename
vector_type
<
data_type
,
SrcDataPerAccess
>::
MemoryType
;
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr
auto
src_data_per_access
=
Number
<
SrcDataPerAccess
>
{};
auto
src_slice_vectorized
=
mSrcSlice
.
Vectorize
(
src_vector_access_dim
,
src_data_per_access
);
ford
<
decltype
(
src_slice_vectorized
.
GetLengths
()),
SrcDimAccessOrder
>
{}(
[
&
](
auto
src_vector_id
)
{
// load vector from src
const
src_vector_t
vector_data
=
src_slice_vectorized
[
src_vector_id
];
// unpack vector into buffer
auto
src_scalar_id
=
src_vector_id
;
src_scalar_id
(
src_vector_access_dim
)
*=
src_data_per_access
;
for
(
index_t
i
=
0
;
i
<
SrcDataPerAccess
;
++
i
)
{
auto
id
=
make_zero_array
<
index_t
,
nDim
>
();
id
(
src_vector_access_dim
)
=
i
;
buffer
(
src_scalar_id
+
id
)
=
reinterpret_cast
<
const
data_type
*>
(
&
vector_data
)[
i
];
}
});
}
// copy data from buffer into dst
{
using
dst_vector_t
=
typename
vector_type
<
data_type
,
DstDataPerAccess
>::
MemoryType
;
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
constexpr
auto
dst_data_per_access
=
Number
<
DstDataPerAccess
>
{};
auto
dst_slice_vectorized
=
mDstSlice
.
Vectorize
(
dst_vector_access_dim
,
dst_data_per_access
);
ford
<
decltype
(
dst_slice_vectorized
.
GetLengths
()),
DstDimAccessOrder
>
{}(
[
&
](
auto
dst_vector_id
)
{
dst_vector_t
vector_data
{};
// pack vector from buffer
auto
dst_scalar_id
=
dst_vector_id
;
dst_scalar_id
(
dst_vector_access_dim
)
*=
dst_data_per_access
;
for
(
index_t
i
=
0
;
i
<
DstDataPerAccess
;
++
i
)
{
auto
id
=
make_zero_array
<
index_t
,
nDim
>
();
id
(
dst_vector_access_dim
)
=
i
;
reinterpret_cast
<
data_type
*>
(
&
vector_data
)[
i
]
=
buffer
[
dst_scalar_id
+
id
];
}
// write vector into dst
dst_slice_vectorized
(
dst_vector_id
)
=
vector_data
;
});
}
}
// T can be Sequence or Array
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveSrcSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
{
mSrc
.
MoveSliceWindow
(
mSrcSlice
,
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
{});
}
template
<
typename
T
,
bool
PositiveDirection
>
__device__
void
MoveDstSliceWindow
(
T
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
)
{
mDst
.
MoveSliceWindow
(
mDstSlice
,
step_sizes
,
integral_constant
<
bool
,
PositiveDirection
>
{});
}
private:
using
SrcSlice
=
decltype
(
SrcTensor
{}.
Slice
(
make_zero_array
<
index_t
,
nDim
>
(),
SliceLengths
{}));
using
DstSlice
=
decltype
(
DstTensor
{}.
Slice
(
make_zero_array
<
index_t
,
nDim
>
(),
SliceLengths
{}));
SrcTensor
mSrc
;
DstTensor
mDst
;
SrcSlice
mSrcSlice
;
DstSlice
mDstSlice
;
};
#endif
// This version use multi-index transformation
// This version use multi-index transformation
// This threadwise copy allow vector access of src and dst.
// This threadwise copy allow vector access of src and dst.
// It allows the vector size to be different on src and dst.
// It allows the vector size to be different on src and dst.
...
@@ -1141,8 +35,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
...
@@ -1141,8 +35,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
static
constexpr
index_t
nDim
=
SliceLengths
::
Size
();
static
constexpr
index_t
nDim
=
SliceLengths
::
Size
();
using
Index
=
MultiIndex
<
nDim
>
;
using
Index
=
MultiIndex
<
nDim
>
;
using
SrcCoord
=
typename
TensorCoordinate
_v2
<
SrcDesc
>::
type
;
using
SrcCoord
=
typename
TensorCoordinate
<
SrcDesc
>::
type
;
using
DstCoord
=
typename
TensorCoordinate
_v2
<
DstDesc
>::
type
;
using
DstCoord
=
typename
TensorCoordinate
<
DstDesc
>::
type
;
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v4r2
(
const
Index
&
src_slice_origin
,
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v4r2
(
const
Index
&
src_slice_origin
,
const
Index
&
dst_slice_origin
)
const
Index
&
dst_slice_origin
)
...
...
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp
0 → 100644
View file @
b12bbcee
#ifndef CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#define CK_THREADWISE_GENERIC_TENSOR_SLICE_COPY_DEPRECATED_HPP
#include "common_header.hpp"
#include "ConstantTensorDescriptor.hpp"
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_view.hpp"
#include "tensor_coordinate_deprecated.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
#endif
#ifndef CK_USE_AMD_INTRINSIC
#define CK_USE_AMD_INTRINSIC 1
#endif
#ifndef CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#endif
namespace
ck
{
// This threadwise copy allow vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows order of access to be different on src and dst.
// It use register as buffer to hold all data moving from src to dst.
// It is designed for copying small amount of data, and src and dst are
// device memory or LDS.
// When copying large amout of data, let's hope compiler will reduce register
// used for the buffer.
template
<
typename
SrcDesc
,
typename
DstDesc
,
typename
SliceLengths
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
ThreadwiseGenericTensorSliceCopy_v1r1
{
static
constexpr
index_t
nDim
=
SliceLengths
::
GetSize
();
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1r1
(
Array
<
index_t
,
nDim
>
src_slice_origin
,
Array
<
index_t
,
nDim
>
dst_slice_origin
)
:
mSrcSliceOrigin
(
src_slice_origin
),
mDstSliceOrigin
(
dst_slice_origin
)
{
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! # of dimensions not the same"
);
static_assert
(
is_valid_sequence_map
<
SrcDimAccessOrder
>::
value
&&
is_valid_sequence_map
<
DstDimAccessOrder
>::
value
,
"wrong! map is not valid"
);
static_assert
(
SliceLengths
{}[
SrcVectorAccessDim
]
%
SrcDataPerAccess
==
0
&&
SliceLengths
{}[
DstVectorAccessDim
]
%
DstDataPerAccess
==
0
,
"wrong! cannot evenly divide"
);
// check vectorized memory access
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
static_if
<!
SrcDesc
::
ContainMultipleOriginalDimensions
(
src_vector_access_dim
)
>
{}(
[
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetStride
(
src_vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
})
.
Else
([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetLastOriginalDimensionStride
(
src_vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
static_if
<!
DstDesc
::
ContainMultipleOriginalDimensions
(
dst_vector_access_dim
)
>
{}(
[
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetStride
(
dst_vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
})
.
Else
([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetLastOriginalDimensionStride
(
dst_vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
}
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1r1
()
:
ThreadwiseGenericTensorSliceCopy_v1r1
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
{
}
__device__
void
SetSrcSliceOrigin
(
Array
<
index_t
,
nDim
>
src_slice_origin
)
{
mSrcSliceOrigin
=
src_slice_origin
;
}
__device__
void
SetDstSliceOrigin
(
Array
<
index_t
,
nDim
>
dst_slice_origin
)
{
mDstSliceOrigin
=
dst_slice_origin
;
}
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
p_src
,
TData
*
p_dst
)
const
{
constexpr
auto
buffer_desc
=
make_ConstantTensorDescriptor_packed
(
SliceLengths
{});
TData
p_buffer_
[
buffer_desc
.
GetElementSpace
()];
TData
*
p_buffer
=
p_buffer_
;
// copy data from src into buffer
{
using
vector_t
=
typename
vector_type
<
TData
,
SrcDataPerAccess
>::
MemoryType
;
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr
auto
src_data_per_access
=
Number
<
SrcDataPerAccess
>
{};
constexpr
auto
src_access_lengths
=
SliceLengths
::
Modify
(
src_vector_access_dim
,
SliceLengths
::
Get
(
src_vector_access_dim
)
/
src_data_per_access
);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford
<
decltype
(
src_access_lengths
),
SrcDimAccessOrder
>
{}([
&
](
auto
src_access_id
)
{
constexpr
auto
src_data_begin_id
=
src_access_id
.
Modify
(
src_vector_access_dim
,
src_access_id
[
src_vector_access_dim
]
*
src_data_per_access
);
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
mSrcSliceOrigin
+
src_data_begin_id
);
// load vector from src
const
vector_t
vector_data
=
*
reinterpret_cast
<
const
vector_t
*>
(
&
p_src
[
src_offset
]);
// unpack vector into buffer
static_for
<
0
,
SrcDataPerAccess
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
src_vector_access_dim
,
i
);
constexpr
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
src_data_begin_id
+
scalar_id
);
p_buffer
[
buffer_offset
]
=
reinterpret_cast
<
const
TData
*>
(
&
vector_data
)[
i
];
});
});
#else
ford
<
decltype
(
src_access_lengths
),
SrcDimAccessOrder
>
{}([
&
](
auto
src_access_id
)
{
auto
src_data_begin_id
=
src_access_id
;
src_data_begin_id
(
src_vector_access_dim
)
=
src_access_id
[
src_vector_access_dim
]
*
src_data_per_access
;
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
mSrcSliceOrigin
+
src_data_begin_id
);
// load vector from src
const
vector_t
vector_data
=
*
reinterpret_cast
<
const
vector_t
*>
(
&
p_src
[
src_offset
]);
// unpack vector into buffer
for
(
index_t
i
=
0
;
i
<
SrcDataPerAccess
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
src_vector_access_dim
)
=
i
;
const
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
src_data_begin_id
+
scalar_id
);
p_buffer
[
buffer_offset
]
=
reinterpret_cast
<
const
TData
*>
(
&
vector_data
)[
i
];
}
});
#endif
}
// copy data from buffer to dst
{
using
vector_t
=
typename
vector_type
<
TData
,
DstDataPerAccess
>::
MemoryType
;
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
constexpr
auto
dst_data_per_access
=
Number
<
DstDataPerAccess
>
{};
constexpr
auto
dst_access_lengths
=
SliceLengths
::
Modify
(
dst_vector_access_dim
,
SliceLengths
::
Get
(
dst_vector_access_dim
)
/
dst_data_per_access
);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
static_ford
<
decltype
(
dst_access_lengths
),
DstDimAccessOrder
>
{}([
&
](
auto
dst_access_id
)
{
constexpr
auto
dst_data_begin_id
=
dst_access_id
.
Modify
(
dst_vector_access_dim
,
dst_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
);
vector_t
vector_data
{};
// pack vector from buffer
static_for
<
0
,
DstDataPerAccess
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
dst_vector_access_dim
,
i
);
constexpr
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
dst_data_begin_id
+
scalar_id
);
reinterpret_cast
<
TData
*>
(
&
vector_data
)[
i
]
=
p_buffer
[
buffer_offset
];
});
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
dst_data_begin_id
);
// store vector into dst
*
reinterpret_cast
<
vector_t
*>
(
&
p_dst
[
dst_offset
])
=
vector_data
;
});
#else
ford
<
decltype
(
dst_access_lengths
),
DstDimAccessOrder
>
{}([
&
](
auto
dst_access_id
)
{
auto
dst_data_begin_id
=
dst_access_id
;
dst_data_begin_id
(
dst_vector_access_dim
)
=
dst_access_id
[
dst_vector_access_dim
]
*
dst_data_per_access
;
vector_t
vector_data
{};
// pack vector from buffer
for
(
index_t
i
=
0
;
i
<
DstDataPerAccess
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
dst_vector_access_dim
)
=
i
;
const
index_t
buffer_offset
=
buffer_desc
.
GetOffsetFromMultiIndex
(
dst_data_begin_id
+
scalar_id
);
reinterpret_cast
<
TData
*>
(
&
vector_data
)[
i
]
=
p_buffer
[
buffer_offset
];
}
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
dst_data_begin_id
);
// store vector into dst
*
reinterpret_cast
<
vector_t
*>
(
&
p_dst
[
dst_offset
])
=
vector_data
;
});
#endif
}
}
private:
Array
<
index_t
,
nDim
>
mSrcSliceOrigin
;
Array
<
index_t
,
nDim
>
mDstSliceOrigin
;
};
// This threadwise copy allow vector access of src and dst.
// It allows the vector size to be different on src and dst.
// The dimensions of vector access should be the same on src and dst.
// The dimension access order should be the same on src and dst.
// It is designed for cases, where one of src and dst is register, and
// the other is device memory or LDS
template
<
typename
SrcDesc
,
typename
DstDesc
,
typename
SliceLengths
,
typename
DimAccessOrder
,
index_t
VectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
ThreadwiseGenericTensorSliceCopy_v1r2
{
static
constexpr
index_t
nDim
=
SliceLengths
::
GetSize
();
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1r2
(
Array
<
index_t
,
nDim
>
src_slice_origin
,
Array
<
index_t
,
nDim
>
dst_slice_origin
)
:
mSrcSliceOrigin
(
src_slice_origin
),
mDstSliceOrigin
(
dst_slice_origin
)
{
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
DimAccessOrder
::
GetSize
(),
"wrong! # of dimensions not the same"
);
static_assert
(
is_valid_sequence_map
<
DimAccessOrder
>::
value
,
"wrong! map is not valid"
);
static_assert
(
SliceLengths
{}[
VectorAccessDim
]
%
math
::
lcm
(
SrcDataPerAccess
,
DstDataPerAccess
)
==
0
,
"wrong! cannot evenly divide"
);
// check vectorized memory access
constexpr
auto
vector_access_dim
=
Number
<
VectorAccessDim
>
{};
static_if
<!
SrcDesc
::
ContainMultipleOriginalDimensions
(
vector_access_dim
)
>
{}([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetStride
(
vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
}).
Else
([
&
](
auto
fwd
)
{
static_assert
((
fwd
(
SrcDesc
{}).
GetLastOriginalDimensionStride
(
vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
static_if
<!
DstDesc
::
ContainMultipleOriginalDimensions
(
vector_access_dim
)
>
{}([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetStride
(
vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
}).
Else
([
&
](
auto
fwd
)
{
static_assert
((
fwd
(
DstDesc
{}).
GetLastOriginalDimensionStride
(
vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
}
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v1r2
()
:
ThreadwiseGenericTensorSliceCopy_v1r2
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
{
}
__device__
void
SetSrcSliceOrigin
(
Array
<
index_t
,
nDim
>
src_slice_origin
)
{
mSrcSliceOrigin
=
src_slice_origin
;
}
__device__
void
SetDstSliceOrigin
(
Array
<
index_t
,
nDim
>
dst_slice_origin
)
{
mDstSliceOrigin
=
dst_slice_origin
;
}
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
p_src
,
TData
*
p_dst
)
const
{
using
src_vector_t
=
typename
vector_type
<
TData
,
SrcDataPerAccess
>::
MemoryType
;
using
dst_vector_t
=
typename
vector_type
<
TData
,
DstDataPerAccess
>::
MemoryType
;
constexpr
auto
vector_access_dim
=
Number
<
VectorAccessDim
>
{};
constexpr
auto
src_data_per_access
=
Number
<
SrcDataPerAccess
>
{};
constexpr
auto
dst_data_per_access
=
Number
<
DstDataPerAccess
>
{};
constexpr
auto
long_vector_size
=
Number
<
math
::
lcm
(
SrcDataPerAccess
,
DstDataPerAccess
)
>
{};
constexpr
auto
long_vector_access_lengths
=
SliceLengths
::
Modify
(
vector_access_dim
,
SliceLengths
::
Get
(
vector_access_dim
)
/
long_vector_size
);
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
static_ford
<
decltype
(
long_vector_access_lengths
),
DimAccessOrder
>
{}([
&
](
auto
long_vector_access_id
)
{
// data id w.r.t slicing-window
constexpr
auto
long_vector_data_begin_id
=
long_vector_access_id
.
Modify
(
vector_access_dim
,
long_vector_access_id
[
vector_access_dim
]
*
long_vector_size
);
// buffer to hold a long-vector
TData
p_long_vector
[
long_vector_size
];
// load data from src to the long-vector buffer
static_for
<
0
,
long_vector_size
/
src_data_per_access
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
vector_access_dim
,
i
*
src_data_per_access
);
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
constexpr
index_t
buffer_offset
=
i
*
src_data_per_access
;
*
reinterpret_cast
<
src_vector_t
*>
(
&
p_long_vector
[
buffer_offset
])
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src
[
src_offset
]);
});
// store data from the long-vector buffer to dst
static_for
<
0
,
long_vector_size
/
dst_data_per_access
,
1
>
{}([
&
](
auto
i
)
{
constexpr
auto
scalar_id
=
typename
uniform_sequence_gen
<
nDim
,
0
>::
type
{}.
Modify
(
vector_access_dim
,
i
*
dst_data_per_access
);
constexpr
index_t
buffer_offset
=
i
*
dst_data_per_access
;
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_offset
])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_long_vector
[
buffer_offset
]);
});
});
#else
ford
<
decltype
(
long_vector_access_lengths
),
DimAccessOrder
>
{}(
[
&
](
auto
long_vector_access_id
)
{
// data id w.r.t slicing-window
auto
long_vector_data_begin_id
=
long_vector_access_id
;
long_vector_data_begin_id
(
vector_access_dim
)
=
long_vector_size
*
long_vector_access_id
[
vector_access_dim
];
// buffer to hold a long-vector
TData
p_long_vector
[
long_vector_size
];
// load data from src to the long-vector buffer
for
(
index_t
i
=
0
;
i
<
long_vector_size
/
src_data_per_access
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
vector_access_dim
)
=
i
*
src_data_per_access
;
const
index_t
src_offset
=
SrcDesc
::
GetOffsetFromMultiIndex
(
mSrcSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
const
index_t
buffer_offset
=
i
*
src_data_per_access
;
*
reinterpret_cast
<
src_vector_t
*>
(
&
p_long_vector
[
buffer_offset
])
=
*
reinterpret_cast
<
const
src_vector_t
*>
(
&
p_src
[
src_offset
]);
}
// store data from the long-vector buffer to dst
for
(
index_t
i
=
0
;
i
<
long_vector_size
/
dst_data_per_access
;
++
i
)
{
auto
scalar_id
=
make_zero_array
<
index_t
,
nDim
>
();
scalar_id
(
vector_access_dim
)
=
i
*
dst_data_per_access
;
const
index_t
buffer_offset
=
i
*
dst_data_per_access
;
const
index_t
dst_offset
=
DstDesc
::
GetOffsetFromMultiIndex
(
mDstSliceOrigin
+
(
long_vector_data_begin_id
+
scalar_id
));
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_dst
[
dst_offset
])
=
*
reinterpret_cast
<
dst_vector_t
*>
(
&
p_long_vector
[
buffer_offset
]);
}
});
#endif
}
private:
Array
<
index_t
,
nDim
>
mSrcSliceOrigin
;
Array
<
index_t
,
nDim
>
mDstSliceOrigin
;
};
// This version use TensorCoordinate_deprecated
// This threadwise copy allow vector access of src and dst.
// It allows the dimensions of vector access to be different on src and dst.
// It also allows the vector size to be different on src and dst.
// It also allows order of access to be different on src and dst.
// It use register as buffer to hold all data moving from src to dst.
// It is designed for copying small amount of data, and src and dst are
// device memory or LDS.
// When copying large amout of data, let's hope compiler will reduce register
// used for the buffer.
template
<
typename
SrcDesc
,
typename
DstDesc
,
typename
SliceLengths
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorAccessDim
,
index_t
DstVectorAccessDim
,
index_t
SrcDataPerAccess
,
index_t
DstDataPerAccess
>
struct
ThreadwiseGenericTensorSliceCopy_v2r1
{
static
constexpr
index_t
nDim
=
SliceLengths
::
GetSize
();
using
Index
=
MultiIndex
<
nDim
>
;
using
SrcCoordinate
=
typename
TensorCoordinate_deprecated
<
SrcDesc
>::
type
;
using
DstCoordinate
=
typename
TensorCoordinate_deprecated
<
DstDesc
>::
type
;
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v2r1
(
const
Index
&
src_slice_origin
,
const
Index
&
dst_slice_origin
)
:
mSrcSliceOrigin
(
src_slice_origin
),
mDstSliceOrigin
(
dst_slice_origin
)
{
static_assert
(
nDim
==
SrcDesc
::
GetNumOfDimension
()
&&
nDim
==
DstDesc
::
GetNumOfDimension
()
&&
nDim
==
SliceLengths
::
GetSize
()
&&
nDim
==
SrcDimAccessOrder
::
GetSize
()
&&
nDim
==
DstDimAccessOrder
::
GetSize
(),
"wrong! # of dimensions not the same"
);
static_assert
(
is_valid_sequence_map
<
SrcDimAccessOrder
>::
value
&&
is_valid_sequence_map
<
DstDimAccessOrder
>::
value
,
"wrong! map is not valid"
);
static_assert
(
SliceLengths
{}[
SrcVectorAccessDim
]
%
SrcDataPerAccess
==
0
&&
SliceLengths
{}[
DstVectorAccessDim
]
%
DstDataPerAccess
==
0
,
"wrong! cannot evenly divide"
);
// check vectorized memory access
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr
auto
dst_vector_access_dim
=
Number
<
DstVectorAccessDim
>
{};
static_if
<!
SrcDesc
::
ContainMultipleOriginalDimensions
(
src_vector_access_dim
)
>
{}(
[
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetStride
(
src_vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
})
.
Else
([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
SrcDesc
{}).
GetLastOriginalDimensionStride
(
src_vector_access_dim
)
==
1
||
SrcDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
static_if
<!
DstDesc
::
ContainMultipleOriginalDimensions
(
dst_vector_access_dim
)
>
{}(
[
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetStride
(
dst_vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
})
.
Else
([
&
](
auto
fwd
)
{
static_assert
(
(
fwd
(
DstDesc
{}).
GetLastOriginalDimensionStride
(
dst_vector_access_dim
)
==
1
||
DstDataPerAccess
==
1
),
"wrong! vectorized access is allowed only if stride == 1"
);
});
}
__device__
constexpr
ThreadwiseGenericTensorSliceCopy_v2r1
()
:
ThreadwiseGenericTensorSliceCopy_v2r1
(
make_zero_array
<
index_t
,
nDim
>
(),
make_zero_array
<
index_t
,
nDim
>
())
{
}
__device__
void
SetSrcSliceOrigin
(
SrcCoordinate
src_slice_origin
)
{
mSrcSliceOrigin
=
src_slice_origin
;
}
__device__
void
SetDstSliceOrigin
(
DstCoordinate
dst_slice_origin
)
{
mDstSliceOrigin
=
dst_slice_origin
;
}
template
<
typename
TDesc
,
class
Lengths
>
struct
IsolateMergedDimLengths
{
template
<
typename
IDim
>
__device__
constexpr
index_t
operator
()(
IDim
idim
)
const
{
return
TDesc
::
ContainMultipleOriginalDimensions
(
idim
)
?
Lengths
{}[
idim
]
:
1
;
}
};
template
<
typename
TData
>
__device__
void
Run
(
const
TData
*
p_src
,
TData
*
p_dst
)
const
{
constexpr
auto
buffer_desc
=
make_ConstantTensorDescriptor_packed
(
SliceLengths
{});
TData
p_buffer_
[
buffer_desc
.
GetElementSpace
()];
TData
*
p_buffer
=
p_buffer_
;
// copy data from src into buffer
{
using
src_vector_t
=
typename
vector_type
<
TData
,
SrcDataPerAccess
>::
MemoryType
;
constexpr
auto
src_vector_access_dim
=
Number
<
SrcVectorAccessDim
>
{};
constexpr
auto
src_data_per_access
=
Number
<
SrcDataPerAccess
>
{};
constexpr
auto
src_access_lengths
=
SliceLengths
::
Modify
(
src_vector_access_dim
,
SliceLengths
::
Get
(
src_vector_access_dim
)
/
src_data_per_access
);
// Offset w.r.t merged dimensions need to be calculated at run-time. Offset w.r.t
// normal dimensions is known at compile time.
// Below is a hack to isolate merged dimension id from normal dimension id, so the
// corresponding offset can be calculated seperately at run-time and compile-time.
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// merged dimensions, and has value = 1 on normal dimensions;
// src_merged_dim_access_lengths has the same value as src_access_lengths on src's
// normal dimensions, and has value = 1 on merged dimensions;
constexpr
auto
src_merged_dim_access_lengths
=
typename
sequence_gen
<
nDim
,
IsolateMergedDimLengths
<
SrcDesc
,
decltype
(
src_access_lengths
)
>>::
type
{};
constexpr
auto
src_normal_dim_access_lengths
=
src_access_lengths
+
Number
<
1
>
{}
-
src_merged_dim_access_lengths
;
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
            // offsets w.r.t. merged dimensions need to be computed at run-time
            static_ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
                [&](auto src_merged_dim_access_id_) {
                    constexpr auto src_merged_dim_access_id = decltype(src_merged_dim_access_id_){};

                    constexpr auto src_merged_dim_data_id = src_merged_dim_access_id.Modify(
                        src_vector_access_dim,
                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access);

                    const TData* p_src_tmp =
                        p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();

                    // offsets w.r.t. normal dimensions can be computed at compile-time
                    static_ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
                        [&](auto src_normal_dim_access_id_) {
                            constexpr auto src_normal_dim_access_id =
                                decltype(src_normal_dim_access_id_){};

                            constexpr auto src_normal_dim_data_id = src_normal_dim_access_id.Modify(
                                src_vector_access_dim,
                                src_normal_dim_access_id[src_vector_access_dim] *
                                    src_data_per_access);

                            constexpr index_t src_normal_offset =
                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);

                            // load vector from src
                            const src_vector_t vector_data = *reinterpret_cast<const src_vector_t*>(
                                &p_src_tmp[src_normal_offset]);

                            // unpack vector into buffer
                            static_for<0, SrcDataPerAccess, 1>{}([&](auto i) {
                                constexpr auto scalar_id =
                                    typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                                        src_vector_access_dim, i);

                                constexpr index_t buffer_offset =
                                    buffer_desc.GetOffsetFromMultiIndex(
                                        src_merged_dim_data_id + src_normal_dim_data_id +
                                        scalar_id);

                                p_buffer[buffer_offset] =
                                    reinterpret_cast<const TData*>(&vector_data)[i];
                            });
                        });
                });
#else
            ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
                [&](auto src_merged_dim_access_id) {
                    auto src_merged_dim_data_id = src_merged_dim_access_id;
                    src_merged_dim_data_id(src_vector_access_dim) =
                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;

                    const TData* p_src_tmp =
                        p_src + (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();

                    // these should be compile-time known
                    ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
                        [&](auto src_normal_dim_access_id) {
                            auto src_normal_dim_data_id = src_normal_dim_access_id;
                            src_normal_dim_data_id(src_vector_access_dim) =
                                src_normal_dim_access_id[src_vector_access_dim] *
                                src_data_per_access;

                            const index_t src_normal_offset =
                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);

                            // load vector from src
                            const src_vector_t vector_data = *reinterpret_cast<const src_vector_t*>(
                                &p_src_tmp[src_normal_offset]);

                            // unpack vector into buffer
                            for(index_t i = 0; i < SrcDataPerAccess; ++i)
                            {
                                auto scalar_id                   = make_zero_array<index_t, nDim>();
                                scalar_id(src_vector_access_dim) = i;

                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                                    src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);

                                p_buffer[buffer_offset] =
                                    reinterpret_cast<const TData*>(&vector_data)[i];
                            }
                        });
                });
#endif
        }
        // copy data from buffer into dst
        {
            using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};

            constexpr auto dst_access_lengths = SliceLengths::Modify(
                dst_vector_access_dim,
                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);

            constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
                nDim, IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};

            constexpr auto dst_normal_dim_access_lengths =
                dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;

#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1
            // offsets w.r.t. merged dimensions need to be computed at run-time
            static_ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
                [&](auto dst_merged_dim_access_id_) {
                    constexpr auto dst_merged_dim_access_id = decltype(dst_merged_dim_access_id_){};

                    constexpr auto dst_merged_dim_data_id = dst_merged_dim_access_id.Modify(
                        dst_vector_access_dim,
                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access);

                    TData* p_dst_tmp =
                        p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();

                    // offsets w.r.t. normal dimensions can be computed at compile-time
                    static_ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
                        [&](auto dst_normal_dim_access_id_) {
                            constexpr auto dst_normal_dim_access_id =
                                decltype(dst_normal_dim_access_id_){};

                            constexpr auto dst_normal_dim_data_id = dst_normal_dim_access_id.Modify(
                                dst_vector_access_dim,
                                dst_normal_dim_access_id[dst_vector_access_dim] *
                                    dst_data_per_access);

                            dst_vector_t vector_data;

                            // pack vector from buffer
                            static_for<0, DstDataPerAccess, 1>{}([&](auto i) {
                                constexpr auto scalar_id =
                                    typename uniform_sequence_gen<nDim, 0>::type{}.Modify(
                                        dst_vector_access_dim, i);

                                constexpr index_t buffer_offset =
                                    buffer_desc.GetOffsetFromMultiIndex(
                                        dst_merged_dim_data_id + dst_normal_dim_data_id +
                                        scalar_id);

                                reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
                            });

                            constexpr index_t dst_normal_offset =
                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);

                            // write vector into dst
                            *reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) =
                                vector_data;
                        });
                });
#else
            // offsets w.r.t. merged dimensions need to be computed at run-time
            ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
                [&](auto dst_merged_dim_access_id) {
                    auto dst_merged_dim_data_id = dst_merged_dim_access_id;
                    dst_merged_dim_data_id(dst_vector_access_dim) =
                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;

                    TData* p_dst_tmp =
                        p_dst + (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();

                    // offsets w.r.t. normal dimensions can be computed at compile-time
                    ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
                        [&](auto dst_normal_dim_access_id) {
                            auto dst_normal_dim_data_id = dst_normal_dim_access_id;
                            dst_normal_dim_data_id(dst_vector_access_dim) =
                                dst_normal_dim_access_id[dst_vector_access_dim] *
                                dst_data_per_access;

                            dst_vector_t vector_data;

                            // pack vector from buffer
                            for(index_t i = 0; i < DstDataPerAccess; ++i)
                            {
                                auto scalar_id                   = make_zero_array<index_t, nDim>();
                                scalar_id(dst_vector_access_dim) = i;

                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                                    dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);

                                reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
                            }

                            const index_t dst_normal_offset =
                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);

                            // write vector into dst
                            *reinterpret_cast<dst_vector_t*>(&p_dst_tmp[dst_normal_offset]) =
                                vector_data;
                        });
                });
#endif
        }
    }
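
    // The variant below has the same staging structure as Run(), but it keeps the run-time
    // (merged) and compile-time (normal) parts of the src/dst offsets separate all the way down
    // to the memory access instead of folding them into a single pointer. For global memory this
    // allows the compile-time-known part to be passed as a separate offset argument to
    // __buffer_load / __buffer_store when the AMD intrinsics are enabled.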
    template <typename TData,
              address_space_t SrcAddressSpace = address_space_t::generic,
              address_space_t DstAddressSpace = address_space_t::generic>
    __device__ void Run_optimized_address_calculation(const TData* p_src, TData* p_dst) const
    {
        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});

        TData p_buffer_[buffer_desc.GetElementSpace()];
        TData* p_buffer = p_buffer_;

        // copy data from src into buffer
        {
            using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;

            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};

            constexpr auto src_access_lengths = SliceLengths::Modify(
                src_vector_access_dim,
                SliceLengths::Get(src_vector_access_dim) / src_data_per_access);

            // Offsets w.r.t. merged dimensions need to be calculated at run-time; offsets w.r.t.
            // normal dimensions are known at compile-time.
            // Below is a hack to isolate merged dimension ids from normal dimension ids, so the
            // corresponding offsets can be calculated separately at run-time and compile-time:
            // src_merged_dim_access_lengths has the same value as src_access_lengths on src's
            // merged dimensions, and has value = 1 on normal dimensions;
            // src_normal_dim_access_lengths has the same value as src_access_lengths on src's
            // normal dimensions, and has value = 1 on merged dimensions.
            constexpr auto src_merged_dim_access_lengths = typename sequence_gen<
                nDim, IsolateMergedDimLengths<SrcDesc, decltype(src_access_lengths)>>::type{};

            constexpr auto src_normal_dim_access_lengths =
                src_access_lengths + Number<1>{} - src_merged_dim_access_lengths;

            ford<decltype(src_merged_dim_access_lengths), SrcDimAccessOrder>{}(
                [&](auto src_merged_dim_access_id) {
                    auto src_merged_dim_data_id = src_merged_dim_access_id;
                    src_merged_dim_data_id(src_vector_access_dim) =
                        src_merged_dim_access_id[src_vector_access_dim] * src_data_per_access;

                    // offset w.r.t. merged dimensions needs to be computed at run-time
                    const index_t src_merged_offset =
                        (mSrcSliceOrigin + src_merged_dim_data_id).GetOffset();

                    ford<decltype(src_normal_dim_access_lengths), SrcDimAccessOrder>{}(
                        [&](auto src_normal_dim_access_id) {
                            auto src_normal_dim_data_id = src_normal_dim_access_id;
                            src_normal_dim_data_id(src_vector_access_dim) =
                                src_normal_dim_access_id[src_vector_access_dim] *
                                src_data_per_access;

                            // offset w.r.t. normal dimensions is known at compile-time
                            const index_t src_normal_offset =
                                SrcDesc::GetOffsetFromMultiIndex(src_normal_dim_data_id);

                            src_vector_t vector_data;

                            // Read vector from src.
                            // 1. The source-code version can take src in any memory space.
                            // 2. The inline-asm versions using global_load or buffer_load can only
                            //    take src from global memory.
                            //
                            // Comment on loading from global memory:
                            // When
                            //   1) using source code, in order for the compiler to emit the
                            //      optimal load instruction, or
                            //   2) using inline asm (global_load or buffer_load), in order for
                            //      the inline asm to be valid,
                            // the following assumptions need to be satisfied:
                            //   1. p_src needs to be block-invariant (assumption)
                            //   2. src_normal_offset must be calculated at compile time (guaranteed)
                            //   3. src_merged_offset can be a run-time value (no assumption imposed)
                            static_if<SrcAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                                vector_data = __buffer_load<TData, SrcDataPerAccess>(
                                    p_src,
                                    static_cast<uint32_t>(src_merged_offset),
                                    static_cast<uint32_t>(src_normal_offset));
#else
                                vector_data = *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_normal_offset + src_merged_offset]);
#endif
                            }).Else([&](auto) {
                                // src can be in any memory space
                                vector_data = *reinterpret_cast<const src_vector_t*>(
                                    &p_src[src_normal_offset + src_merged_offset]);
                            });

                            // unpack vector into buffer
                            for(index_t i = 0; i < SrcDataPerAccess; ++i)
                            {
                                auto scalar_id                   = make_zero_array<index_t, nDim>();
                                scalar_id(src_vector_access_dim) = i;

                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                                    src_merged_dim_data_id + src_normal_dim_data_id + scalar_id);

                                p_buffer[buffer_offset] =
                                    reinterpret_cast<const TData*>(&vector_data)[i];
                            }
                        });
                });
        }
        // copy data from buffer into dst
        {
            using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;

            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};

            constexpr auto dst_access_lengths = SliceLengths::Modify(
                dst_vector_access_dim,
                SliceLengths::Get(dst_vector_access_dim) / dst_data_per_access);

            constexpr auto dst_merged_dim_access_lengths = typename sequence_gen<
                nDim, IsolateMergedDimLengths<DstDesc, decltype(dst_access_lengths)>>::type{};

            constexpr auto dst_normal_dim_access_lengths =
                dst_access_lengths + Number<1>{} - dst_merged_dim_access_lengths;

            ford<decltype(dst_merged_dim_access_lengths), DstDimAccessOrder>{}(
                [&](auto dst_merged_dim_access_id) {
                    auto dst_merged_dim_data_id = dst_merged_dim_access_id;
                    dst_merged_dim_data_id(dst_vector_access_dim) =
                        dst_merged_dim_access_id[dst_vector_access_dim] * dst_data_per_access;

                    // offset w.r.t. merged dimensions needs to be computed at run-time
                    const index_t dst_merged_offset =
                        (mDstSliceOrigin + dst_merged_dim_data_id).GetOffset();

                    ford<decltype(dst_normal_dim_access_lengths), DstDimAccessOrder>{}(
                        [&](auto dst_normal_dim_access_id) {
                            auto dst_normal_dim_data_id = dst_normal_dim_access_id;
                            dst_normal_dim_data_id(dst_vector_access_dim) =
                                dst_normal_dim_access_id[dst_vector_access_dim] *
                                dst_data_per_access;

                            dst_vector_t vector_data;

                            // pack vector from buffer
                            for(index_t i = 0; i < DstDataPerAccess; ++i)
                            {
                                auto scalar_id                   = make_zero_array<index_t, nDim>();
                                scalar_id(dst_vector_access_dim) = i;

                                const index_t buffer_offset = buffer_desc.GetOffsetFromMultiIndex(
                                    dst_merged_dim_data_id + dst_normal_dim_data_id + scalar_id);

                                reinterpret_cast<TData*>(&vector_data)[i] = p_buffer[buffer_offset];
                            }

                            // offset w.r.t. normal dimensions is known at compile-time
                            const index_t dst_normal_offset =
                                DstDesc::GetOffsetFromMultiIndex(dst_normal_dim_data_id);

                            // Write vector into dst.
                            // 1. The source-code version can take dst in any memory space.
                            // 2. The inline-asm versions using global_store or buffer_store can
                            //    only take dst from global memory.
                            //
                            // Comment on storing into global memory:
                            // When
                            //   1) using source code, in order for the compiler to emit the
                            //      optimal store instruction, or
                            //   2) using inline asm (global_store or buffer_store), in order for
                            //      the inline asm to be valid,
                            // the following assumptions need to be satisfied:
                            //   1. p_dst needs to be block-invariant (assumption)
                            //   2. dst_normal_offset must be calculated at compile time (guaranteed)
                            //   3. dst_merged_offset can be a run-time value (no assumption imposed)
                            static_if<DstAddressSpace == address_space_t::global>{}([&](auto) {
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                                __buffer_store<TData, DstDataPerAccess>(
                                    vector_data, p_dst, dst_merged_offset, dst_normal_offset);
#else
                                *reinterpret_cast<dst_vector_t*>(
                                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
#endif
                            }).Else([&](auto) {
                                // dst can be in any memory space
                                *reinterpret_cast<dst_vector_t*>(
                                    &p_dst[dst_normal_offset + dst_merged_offset]) = vector_data;
                            });
                        });
                });
        }
    }
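
    // As the comments inside Run_optimized_address_calculation() state, the global-memory fast
    // path assumes p_src / p_dst are block-invariant and that the "normal" offset is a
    // compile-time constant; only the "merged" offset may vary at run time. A caller that cannot
    // guarantee a block-invariant base pointer can keep the address space as
    // address_space_t::generic, so the plain pointer-arithmetic path is taken instead.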
    // T can be Sequence or Array
    template <typename T, bool PositiveDirection>
    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        static_if<PositiveDirection>{}([&](auto) { mSrcSliceOrigin += step_sizes; })
            .Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
    }

    template <typename T, bool PositiveDirection>
    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        static_if<PositiveDirection>{}([&](auto) { mDstSliceOrigin += step_sizes; })
            .Else([&](auto) { mDstSliceOrigin -= step_sizes; });
    }

    private:
    SrcCoordinate mSrcSliceOrigin;
    DstCoordinate mDstSliceOrigin;
};
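
// Rough usage sketch for ThreadwiseGenericTensorSliceCopy_v2r1 (illustrative only; the exact
// template-argument order follows the class definition above and mirrors the v3r1 variant below,
// and the concrete values are hypothetical):
//
//   // copy a 1x4 slice, vectorized along the last dimension with 4 elements per access
//   auto copy = ThreadwiseGenericTensorSliceCopy_v2r1<SrcDesc, DstDesc, Sequence<1, 4>,
//                                                     Sequence<0, 1>, Sequence<0, 1>, 1, 1, 4, 4>{
//       src_slice_origin, dst_slice_origin};
//   copy.Run(p_src, p_dst);                 // stage the src slice in registers, then write to dst
//   copy.MoveSrcSliceWindow(Sequence<0, 4>{}, integral_constant<bool, true>{});
//   copy.Run(p_src, p_dst);                 // copy the next window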
// this version uses TensorView and TensorCoordinate_deprecated
template <typename SrcTensor,
          typename DstTensor,
          typename SliceLengths,
          typename SrcDimAccessOrder,
          typename DstDimAccessOrder,
          index_t SrcVectorAccessDim,
          index_t DstVectorAccessDim,
          index_t SrcDataPerAccess,
          index_t DstDataPerAccess>
struct ThreadwiseGenericTensorSliceCopy_v3r1
{
    static constexpr index_t nDim = SrcTensor::GetNumOfDimension();

    using data_type     = remove_cv_t<typename SrcTensor::data_type>;
    using SrcCoordinate = typename SrcTensor::coordinate_type;
    using DstCoordinate = typename DstTensor::coordinate_type;

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1(SrcTensor src,
                                                               SrcCoordinate src_slice_origin,
                                                               DstTensor dst,
                                                               DstCoordinate dst_slice_origin)
        : mSrc{src},
          mDst{dst},
          mSrcSlice{src.Slice(src_slice_origin, SliceLengths{})},
          mDstSlice{dst.Slice(dst_slice_origin, SliceLengths{})}
    {
        static_assert(nDim == SrcTensor::GetNumOfDimension() &&
                          nDim == DstTensor::GetNumOfDimension() &&
                          nDim == SliceLengths::GetSize() &&
                          nDim == SrcDimAccessOrder::GetSize() &&
                          nDim == DstDimAccessOrder::GetSize(),
                      "wrong! # of dimensions not the same");

        static_assert(is_valid_sequence_map<SrcDimAccessOrder>::value &&
                          is_valid_sequence_map<DstDimAccessOrder>::value,
                      "wrong! map is not valid");

        static_assert(is_same<remove_cv_t<typename SrcTensor::data_type>,
                              remove_cv_t<typename DstTensor::data_type>>{},
                      "wrong! type conversion is not supported yet");

        static_assert(decltype(mSrcSlice)::IsVectorizationAllowed(Number<SrcVectorAccessDim>{},
                                                                  Number<SrcDataPerAccess>{}) &&
                          decltype(mDstSlice)::IsVectorizationAllowed(Number<DstVectorAccessDim>{},
                                                                      Number<DstDataPerAccess>{}),
                      "wrong! vectorized access is not allowed");
    }

    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v3r1()
        : ThreadwiseGenericTensorSliceCopy_v3r1(
              SrcTensor{}, SrcCoordinate{}, DstTensor{}, DstCoordinate{})
    {
    }
    __device__ void Run() const
    {
        // buffer
        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SrcTensor::GetLengths());

        data_type p_buffer[buffer_desc.GetElementSpace()];

        auto buffer = make_TensorView(buffer_desc, p_buffer);

        // copy data from src into buffer
        {
            using src_vector_t = typename vector_type<data_type, SrcDataPerAccess>::MemoryType;

            constexpr auto src_vector_access_dim = Number<SrcVectorAccessDim>{};
            constexpr auto src_data_per_access   = Number<SrcDataPerAccess>{};

            auto src_slice_vectorized =
                mSrcSlice.Vectorize(src_vector_access_dim, src_data_per_access);

            ford<decltype(src_slice_vectorized.GetLengths()), SrcDimAccessOrder>{}(
                [&](auto src_vector_id) {
                    // load vector from src
                    const src_vector_t vector_data = src_slice_vectorized[src_vector_id];

                    // unpack vector into buffer
                    auto src_scalar_id = src_vector_id;
                    src_scalar_id(src_vector_access_dim) *= src_data_per_access;

                    for(index_t i = 0; i < SrcDataPerAccess; ++i)
                    {
                        auto id                   = make_zero_array<index_t, nDim>();
                        id(src_vector_access_dim) = i;

                        buffer(src_scalar_id + id) =
                            reinterpret_cast<const data_type*>(&vector_data)[i];
                    }
                });
        }

        // copy data from buffer into dst
        {
            using dst_vector_t = typename vector_type<data_type, DstDataPerAccess>::MemoryType;

            constexpr auto dst_vector_access_dim = Number<DstVectorAccessDim>{};
            constexpr auto dst_data_per_access   = Number<DstDataPerAccess>{};

            auto dst_slice_vectorized =
                mDstSlice.Vectorize(dst_vector_access_dim, dst_data_per_access);

            ford<decltype(dst_slice_vectorized.GetLengths()), DstDimAccessOrder>{}(
                [&](auto dst_vector_id) {
                    dst_vector_t vector_data{};

                    // pack vector from buffer
                    auto dst_scalar_id = dst_vector_id;
                    dst_scalar_id(dst_vector_access_dim) *= dst_data_per_access;

                    for(index_t i = 0; i < DstDataPerAccess; ++i)
                    {
                        auto id                   = make_zero_array<index_t, nDim>();
                        id(dst_vector_access_dim) = i;

                        reinterpret_cast<data_type*>(&vector_data)[i] = buffer[dst_scalar_id + id];
                    }

                    // write vector into dst
                    dst_slice_vectorized(dst_vector_id) = vector_data;
                });
        }
    }
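
    // Compared with the v2r1 Run() above, this Run() delegates the merged/normal-dimension
    // bookkeeping to the TensorView: Vectorize() reshapes the slice so that one element of the
    // vectorized view corresponds to one src_vector_t / dst_vector_t access, and operator[] /
    // operator() on the views perform the coordinate-to-offset mapping internally.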
    // T can be Sequence or Array
    template <typename T, bool PositiveDirection>
    __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        mSrc.MoveSliceWindow(mSrcSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
    }

    template <typename T, bool PositiveDirection>
    __device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
    {
        mDst.MoveSliceWindow(mDstSlice, step_sizes, integral_constant<bool, PositiveDirection>{});
    }

    private:
    using SrcSlice = decltype(SrcTensor{}.Slice(make_zero_array<index_t, nDim>(), SliceLengths{}));
    using DstSlice = decltype(DstTensor{}.Slice(make_zero_array<index_t, nDim>(), SliceLengths{}));

    SrcTensor mSrc;
    DstTensor mDst;
    SrcSlice mSrcSlice;
    DstSlice mDstSlice;
};
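
// Rough usage sketch for ThreadwiseGenericTensorSliceCopy_v3r1 (illustrative; the tensor-view
// construction is abbreviated and the shapes are hypothetical):
//
//   auto src_view = make_TensorView(src_desc, p_src);   // e.g. a global- or LDS-backed view
//   auto dst_view = make_TensorView(dst_desc, p_dst);
//   auto copy = ThreadwiseGenericTensorSliceCopy_v3r1<decltype(src_view), decltype(dst_view),
//                                                     Sequence<1, 4>, Sequence<0, 1>,
//                                                     Sequence<0, 1>, 1, 1, 4, 4>{
//       src_view, src_origin, dst_view, dst_origin};
//   copy.Run();                                          // pointers are carried by the views
//   copy.MoveSrcSliceWindow(Sequence<0, 4>{}, integral_constant<bool, true>{});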
} // namespace ck
#endif
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
View file @ b12bbcee
@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 template <class T,
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp
View file @ b12bbcee
@@ -3,7 +3,7 @@
 #include "device.hpp"
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
-#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"
 using namespace ck;
@@ -164,7 +164,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
     printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
     constexpr auto gridwise_conv =
-#if 1
+#if 0
         GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
 #else
         GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp
View file @ b12bbcee
@@ -51,7 +51,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(InDesc,
     wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
     out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
-#if 0
+#if 1
     constexpr index_t BlockSize = 256;
     constexpr index_t BPerBlock = 128;
driver/src/driver.cpp
View file @ b12bbcee
@@ -14,11 +14,11 @@
 //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp"
 //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp"
 //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp"
 //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp"
-//#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+#include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp"
 struct GeneratorTensor_1
@@ -103,7 +103,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     // 1x1 filter, 8x8 image
     // cudnn@V100 68%, ck@V100 72%, ck@P100 52%, ck@VII 42%
     constexpr index_t N = 64;
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 1
+#elif 0
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N = 128;
     constexpr index_t C = 128;