Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
dfe6dedb
Commit
dfe6dedb
authored
Feb 21, 2023
by
Chao Liu
Browse files
adding im2col
parent
fe2ea9b6
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
372 additions
and
91 deletions
+372
-91
example/91_tile_program/hello_world.cpp
example/91_tile_program/hello_world.cpp
+10
-4
example/91_tile_program/im2col.cpp
example/91_tile_program/im2col.cpp
+255
-30
example/91_tile_program/tile_program.hpp
example/91_tile_program/tile_program.hpp
+58
-32
include/ck/tensor/static_tensor.hpp
include/ck/tensor/static_tensor.hpp
+6
-2
include/ck/tensor/tensor.hpp
include/ck/tensor/tensor.hpp
+12
-6
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
...ion/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+4
-0
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
...tion/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+10
-2
include/ck/utility/dynamic_buffer.hpp
include/ck/utility/dynamic_buffer.hpp
+17
-15
No files found.
example/91_tile_program/hello_world.cpp
View file @
dfe6dedb
...
@@ -14,10 +14,10 @@
...
@@ -14,10 +14,10 @@
// program
// program
struct
HelloWorld
struct
HelloWorld
{
{
__host__
__device__
void
operator
()(
Tile
Program
&
tp
,
int
x
,
int
y
,
int
*
res
)
__host__
__device__
void
operator
()(
Program
Server
&
ps
,
int
x
,
int
y
,
int
*
res
)
{
{
auto
desc0
=
t
p
(
make_naive_tensor_descriptor_packed
(
ck
::
make_tuple
(
x
)));
auto
desc0
=
p
s
(
make_naive_tensor_descriptor_packed
(
ck
::
make_tuple
(
x
)));
auto
desc1
=
t
p
(
make_naive_tensor_descriptor_packed
(
ck
::
make_tuple
(
y
)));
auto
desc1
=
p
s
(
make_naive_tensor_descriptor_packed
(
ck
::
make_tuple
(
y
)));
// only for testing purpose
// only for testing purpose
// cpu should not do work here
// cpu should not do work here
...
@@ -33,7 +33,13 @@ int main()
...
@@ -33,7 +33,13 @@ int main()
DeviceMem
res_dev_buf
(
2
*
sizeof
(
int
));
DeviceMem
res_dev_buf
(
2
*
sizeof
(
int
));
launch
(
HelloWorld
{},
1
,
1
,
x
,
y
,
static_cast
<
int
*>
(
res_dev_buf
.
GetDeviceBuffer
()));
launch
(
ProgramServer
{},
HelloWorld
{},
1
,
1
,
x
,
y
,
static_cast
<
int
*>
(
res_dev_buf
.
GetDeviceBuffer
()));
int
res_host
[
2
];
int
res_host
[
2
];
...
...
example/91_tile_program/im2col.cpp
View file @
dfe6dedb
#include "tile_program.hpp"
#include "tile_program.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/thread_group.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor_description/cluster_descriptor.hpp"
#include "ck/tensor/tensor.hpp"
#include "ck/tensor/tensor.hpp"
#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
...
@@ -12,6 +18,182 @@
...
@@ -12,6 +18,182 @@
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/device_memory.hpp"
namespace
ck
{
template
<
typename
ThreadGroup
,
typename
SrcElementwiseOperation
,
typename
DstElementwiseOperation
,
InMemoryDataOperationEnum
DstInMemOp
,
typename
BlockSliceLengths
,
typename
ThreadClusterLengths
,
typename
ThreadClusterArrangeOrder
,
typename
SrcTensor
,
typename
DstTensor
,
typename
SrcDimAccessOrder
,
typename
DstDimAccessOrder
,
index_t
SrcVectorDim
,
index_t
DstVectorDim
,
index_t
SrcScalarPerVector
,
index_t
DstScalarPerVector
,
index_t
SrcScalarStrideInVector
,
index_t
DstScalarStrideInVector
,
bool
ThreadTransferSrcResetCoordinateAfterRun
,
bool
ThreadTransferDstResetCoordinateAfterRun
>
struct
Copier
{
using
SrcDesc
=
typename
SrcTensor
::
TensorDescriptor
;
using
DstDesc
=
typename
DstTensor
::
TensorDescriptor
;
static
constexpr
ck
::
index_t
nDim
=
remove_reference_t
<
SrcDesc
>::
GetNumOfDimension
();
using
Index
=
MultiIndex
<
nDim
>
;
__host__
__device__
constexpr
Copier
()
:
block_copy_
{},
src_tensor_
{},
dst_tensor_
{}
{}
__device__
constexpr
Copier
(
const
SrcTensor
&
src_tensor
,
const
Index
&
src_block_slice_origin
,
const
SrcElementwiseOperation
&
src_element_op
,
DstTensor
&
dst_tensor
,
const
Index
&
dst_block_slice_origin
,
const
DstElementwiseOperation
&
dst_element_op
)
:
block_copy_
{
src_tensor
.
desc_
,
src_block_slice_origin
,
src_element_op
,
dst_tensor
.
desc_
,
dst_block_slice_origin
,
dst_element_op
},
src_tensor_
{
src_tensor
.
buf_
.
p_data_
,
src_tensor
.
desc_
},
dst_tensor_
{
dst_tensor
.
buf_
.
p_data_
,
dst_tensor
.
desc_
}
{
}
__host__
void
operator
()()
{}
__device__
void
operator
()()
{
block_copy_
.
Run
(
src_tensor_
.
desc_
,
src_tensor_
.
buf_
,
dst_tensor_
.
desc_
,
dst_tensor_
.
buf_
,
Number
<
0
>
{});
}
__host__
void
move_src_window
(
const
Index
&
)
{}
__device__
void
move_src_window
(
const
Index
&
step
)
{
block_copy_
.
MoveSrcSliceWindow
(
src_tensor_
.
desc_
,
step
);
}
__host__
void
move_dst_window
(
const
Index
&
)
{}
__device__
void
move_dst_window
(
const
Index
&
step
)
{
block_copy_
.
MoveDstSliceWindow
(
dst_tensor_
.
desc_
,
step
);
}
// member
ThreadGroupTensorSliceTransfer_v4r1
<
ThreadGroup
,
SrcElementwiseOperation
,
DstElementwiseOperation
,
DstInMemOp
,
BlockSliceLengths
,
ThreadClusterLengths
,
ThreadClusterArrangeOrder
,
typename
SrcTensor
::
DataType
,
typename
SrcTensor
::
DataType
,
SrcDesc
,
DstDesc
,
SrcDimAccessOrder
,
DstDimAccessOrder
,
SrcVectorDim
,
DstVectorDim
,
SrcScalarPerVector
,
DstScalarPerVector
,
SrcScalarStrideInVector
,
DstScalarStrideInVector
,
ThreadTransferSrcResetCoordinateAfterRun
,
ThreadTransferDstResetCoordinateAfterRun
>
block_copy_
;
SrcTensor
src_tensor_
;
DstTensor
dst_tensor_
;
};
}
// namespace ck
struct
CopierStrategy
{
};
template
<
ck
::
index_t
BlockSize
>
struct
MyProgramServer
:
public
ProgramServer
{
template
<
typename
SrcTensor
,
typename
DstTensor
,
typename
Index
,
typename
Strategy
>
__host__
auto
make_copier
(
const
SrcTensor
&
src_tensor
,
const
Index
&
src_window_origin
,
DstTensor
&
dst_tensor
,
const
Index
&
dst_window_origin
,
const
Index
&
window_lengths
,
const
Strategy
&
strategy
)
{
using
namespace
ck
;
return
Copier
<
ThisThreadBlock
<
BlockSize
>
,
tensor_operation
::
element_wise
::
PassThrough
,
tensor_operation
::
element_wise
::
PassThrough
,
InMemoryDataOperationEnum
::
Set
,
Sequence
<
128
,
16
>
,
// BlockSliceLengths,
Sequence
<
16
,
16
>
,
Sequence
<
0
,
1
>
,
SrcTensor
,
DstTensor
,
Sequence
<
0
,
1
>
,
Sequence
<
0
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
true
,
true
>
{};
}
template
<
typename
SrcTensor
,
typename
DstTensor
,
typename
Index
,
typename
Strategy
>
__device__
auto
make_copier
(
const
SrcTensor
&
src_tensor
,
const
Index
&
src_window_origin
,
DstTensor
&
dst_tensor
,
const
Index
&
dst_window_origin
,
const
Index
&
window_lengths
,
const
Strategy
&
strategy
)
{
using
namespace
ck
;
return
Copier
<
ThisThreadBlock
<
BlockSize
>
,
tensor_operation
::
element_wise
::
PassThrough
,
tensor_operation
::
element_wise
::
PassThrough
,
InMemoryDataOperationEnum
::
Set
,
Sequence
<
128
,
16
>
,
// BlockSliceLengths,
Sequence
<
16
,
16
>
,
Sequence
<
0
,
1
>
,
SrcTensor
,
DstTensor
,
Sequence
<
0
,
1
>
,
Sequence
<
0
,
1
>
,
1
,
1
,
1
,
1
,
1
,
1
,
true
,
true
>
{
src_tensor
,
src_window_origin
,
tensor_operation
::
element_wise
::
PassThrough
{},
dst_tensor
,
dst_window_origin
,
tensor_operation
::
element_wise
::
PassThrough
{}};
}
};
// program
// program
template
<
ck
::
index_t
NDimSpatial
,
template
<
ck
::
index_t
NDimSpatial
,
typename
ALayout
,
typename
ALayout
,
...
@@ -21,8 +203,9 @@ template <ck::index_t NDimSpatial,
...
@@ -21,8 +203,9 @@ template <ck::index_t NDimSpatial,
ck
::
index_t
kKPerTile
>
ck
::
index_t
kKPerTile
>
struct
Im2Col
struct
Im2Col
{
{
template
<
typename
Server
,
typename
CopierStrategy
>
__host__
__device__
void
__host__
__device__
void
operator
()(
TileProgram
&
tp
,
operator
()(
Server
&
ps
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
...
@@ -34,17 +217,19 @@ struct Im2Col
...
@@ -34,17 +217,19 @@ struct Im2Col
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_left_pads
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_right_pads
,
const
std
::
array
<
ck
::
index_t
,
NDimSpatial
>&
input_right_pads
,
//
//
const
std
::
array
<
ck
::
index_t
,
3
>
a_
gemmg_
gemmm_gemmk_lengths
,
const
std
::
array
<
ck
::
index_t
,
2
>
a_gemmm_gemmk_lengths
,
const
std
::
array
<
ck
::
index_t
,
3
>
a_
gemmg_
gemmm_gemmk_strides
,
const
std
::
array
<
ck
::
index_t
,
2
>
a_gemmm_gemmk_strides
,
//
//
const
T
*
p_a_img
,
const
T
*
p_a_img
,
T
*
p_a_mtx
)
T
*
p_a_mtx
,
// strategy
const
CopierStrategy
&
copier_strategy
)
{
{
using
namespace
ck
;
using
namespace
ck
;
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
#if 0 // debug
const auto a_src_desc = tensor_operation::TransformConvFwdToGemm<
const auto a_src_desc = tensor_operation::TransformConvFwdToGemm<
NDimSpatial,
NDimSpatial,
tensor_operation::device::ConvolutionForwardSpecialization::Default>::
tensor_operation::device::ConvolutionForwardSpecialization::Default>::
...
@@ -58,36 +243,55 @@ struct Im2Col
...
@@ -58,36 +243,55 @@ struct Im2Col
conv_filter_dilations,
conv_filter_dilations,
input_left_pads,
input_left_pads,
input_right_pads);
input_right_pads);
#else
const
auto
a_src_desc
=
ps
(
tensor_operation
::
TransformConvFwdToGemm
<
NDimSpatial
,
tensor_operation
::
device
::
ConvolutionForwardSpecialization
::
Default
>::
template
MakeADescriptor_M_K
<
ALayout
>(
a_g_n_c_wis_lengths
,
a_g_n_c_wis_strides
,
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
,
c_g_n_k_wos_lengths
,
c_g_n_k_wos_strides
,
conv_filter_strides
,
conv_filter_dilations
,
input_left_pads
,
input_right_pads
));
#endif
const
auto
a_dst_desc
=
#if 1 // debug
make_naive_tensor_descriptor
(
make_tuple
(
a_gemmg_gemmm_gemmk_lengths
[
0
],
const
auto
a_dst_desc
=
ps
(
make_naive_tensor_descriptor
(
a_gemmg_gemmm_gemmk_lengths
[
1
],
make_tuple
(
a_gemmm_gemmk_lengths
[
0
],
a_gemmm_gemmk_lengths
[
1
]),
a_gemmg_gemmm_gemmk_lengths
[
2
]),
make_tuple
(
a_gemmm_gemmk_strides
[
0
],
a_gemmm_gemmk_strides
[
1
])));
make_tuple
(
a_gemmg_gemmm_gemmk_strides
[
0
],
#else
a_gemmg_gemmm_gemmk_strides
[
1
],
const
auto
a_dst_desc
=
make_naive_tensor_descriptor
(
a_gemmg_gemmm_gemmk_strides
[
2
]));
make_tuple
(
a_gemmm_gemmk_lengths
[
0
],
a_gemmm_gemmk_lengths
[
1
]),
make_tuple
(
a_gemmm_gemmk_strides
[
0
],
a_gemmm_gemmk_strides
[
1
]));
#endif
const
auto
a_src
=
tp
(
make_tensor
<
AddressSpaceEnum
::
Global
,
true
>
(
a_src_desc
,
p_a_img
)
)
;
const
auto
a_src
=
make_tensor
<
AddressSpaceEnum
::
Global
,
true
>
(
a_src_desc
,
p_a_img
);
auto
a_dst
=
tp
(
make_tensor
<
AddressSpaceEnum
::
Global
,
true
>
(
a_dst_desc
,
p_a_mtx
)
)
;
auto
a_dst
=
make_tensor
<
AddressSpaceEnum
::
Global
,
true
>
(
a_dst_desc
,
p_a_mtx
);
const
auto
num_gemmg
=
a_gemmg_gemmm_gemmk_lengths
[
0
];
const
auto
num_gemmm
=
a_gemmm_gemmk_lengths
[
0
];
const
auto
num_gemmm
=
a_gemmg_gemmm_gemmk_lengths
[
1
];
const
auto
num_gemmk
=
a_gemmm_gemmk_lengths
[
1
];
const
auto
num_gemmk
=
a_gemmg_gemmm_gemmk_lengths
[
2
];
const
auto
id_block
=
t
p
.
get_block_1d_id
();
const
auto
id_block
=
p
s
.
get_block_1d_id
();
const
auto
num_tile_m
=
num_gemmm
/
kMPerTile
;
const
auto
num_tile_m
=
ps
.
read_first_lane
(
num_gemmm
/
kMPerTile
);
const
auto
num_tile_k
=
num_gemmk
/
kKPerTile
;
const
auto
block2tile
=
tp
(
make_cluster_descriptor
(
make_tuple
(
num_tile_m
,
num_tile_k
)));
#if 1 // debug
const
auto
block2tile
=
ps
(
make_cluster_descriptor
(
make_tuple
(
num_tile_m
)));
#else
const
auto
block2tile
=
make_cluster_descriptor
(
make_tuple
(
num_tile_m
));
#endif
const
auto
id_tile
=
block2tile
.
CalculateBottomIndex
(
make_tuple
(
id_block
));
const
auto
id_tile
=
block2tile
.
CalculateBottomIndex
(
make_tuple
(
id_block
));
const
auto
id_tile_m
=
id_tile
[
I0
];
const
auto
id_tile_m
=
ps
.
read_first_lane
(
id_tile
[
I0
]);
const
auto
id_tile_k
=
id_tile
[
I1
];
#if
1
#if
0
// data-based syntax: per-data solution strategy
// data-based syntax: per-data solution strategy
auto window_a_src = make_window(a_src,
auto window_a_src = make_window(a_src,
make_tuple(1, MPerTile, KPerTile),
make_tuple(1, MPerTile, KPerTile),
...
@@ -107,6 +311,25 @@ struct Im2Col
...
@@ -107,6 +311,25 @@ struct Im2Col
window_a_dst += make_tuple(1, 0, 0);
window_a_dst += make_tuple(1, 0, 0);
}
}
#else
#else
// operator-based syntax
auto
copier
=
ps
.
make_copier
(
a_src
,
make_tuple
(
id_tile_m
*
kMPerTile
,
0
),
a_dst
,
make_tuple
(
id_tile_m
*
kMPerTile
,
0
),
make_tuple
(
kMPerTile
,
kKPerTile
),
copier_strategy
);
ck
::
index_t
id_gemmk
=
0
;
do
{
copier
();
copier
.
move_src_window
(
make_tuple
(
0
,
kKPerTile
));
copier
.
move_dst_window
(
make_tuple
(
0
,
kKPerTile
));
id_gemmk
+=
kKPerTile
;
}
while
(
id_gemmk
<
num_gemmk
-
kKPerTile
);
#endif
#endif
}
}
};
};
...
@@ -117,7 +340,7 @@ int main()
...
@@ -117,7 +340,7 @@ int main()
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
constexpr
ck
::
index_t
NumDimSpatial
=
2
;
ck
::
index_t
G
=
32
;
ck
::
index_t
G
=
1
;
ck
::
index_t
N
=
256
;
ck
::
index_t
N
=
256
;
ck
::
index_t
K
=
192
;
ck
::
index_t
K
=
192
;
ck
::
index_t
C
=
192
;
ck
::
index_t
C
=
192
;
...
@@ -170,8 +393,8 @@ int main()
...
@@ -170,8 +393,8 @@ int main()
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
input_right_pads
{
1
,
1
};
std
::
array
<
ck
::
index_t
,
NumDimSpatial
>
input_right_pads
{
1
,
1
};
// matrix
// matrix
std
::
array
<
ck
::
index_t
,
3
>
in_mtx_lengths
{
G
,
G
*
Ho
*
Wo
,
C
*
Y
*
X
};
std
::
array
<
ck
::
index_t
,
2
>
in_mtx_lengths
{
N
*
Ho
*
Wo
,
C
*
Y
*
X
};
std
::
array
<
ck
::
index_t
,
3
>
in_mtx_strides
{
0
,
0
,
1
};
std
::
array
<
ck
::
index_t
,
2
>
in_mtx_strides
{
0
,
1
};
std
::
partial_sum
(
rbegin
(
in_mtx_lengths
),
std
::
partial_sum
(
rbegin
(
in_mtx_lengths
),
std
::
prev
(
rend
(
in_mtx_lengths
)),
std
::
prev
(
rend
(
in_mtx_lengths
)),
...
@@ -181,9 +404,10 @@ int main()
...
@@ -181,9 +404,10 @@ int main()
DeviceMem
in
(
sizeof
(
DataType
)
*
G
*
N
*
Hi
*
Wi
*
C
);
DeviceMem
in
(
sizeof
(
DataType
)
*
G
*
N
*
Hi
*
Wi
*
C
);
DeviceMem
in_mtx
(
sizeof
(
DataType
)
*
G
*
N
*
Ho
*
Wo
*
C
*
Y
*
X
);
DeviceMem
in_mtx
(
sizeof
(
DataType
)
*
G
*
N
*
Ho
*
Wo
*
C
*
Y
*
X
);
launch
(
Im2Col
<
2
,
ck
::
tensor_layout
::
convolution
::
GNHWC
,
float
,
128
,
128
>
{},
launch
(
MyProgramServer
<
256
>
{},
Im2Col
<
2
,
ck
::
tensor_layout
::
convolution
::
GNHWC
,
float
,
128
,
16
>
{},
1
,
1
,
1
,
256
,
in_lengths
,
in_lengths
,
in_strides
,
in_strides
,
wei_lengths
,
wei_lengths
,
...
@@ -199,7 +423,8 @@ int main()
...
@@ -199,7 +423,8 @@ int main()
in_mtx_strides
,
in_mtx_strides
,
//
//
static_cast
<
DataType
*>
(
in
.
GetDeviceBuffer
()),
static_cast
<
DataType
*>
(
in
.
GetDeviceBuffer
()),
static_cast
<
DataType
*>
(
in_mtx
.
GetDeviceBuffer
()));
static_cast
<
DataType
*>
(
in_mtx
.
GetDeviceBuffer
()),
CopierStrategy
{});
return
0
;
return
0
;
}
}
example/91_tile_program/tile_program.hpp
View file @
dfe6dedb
...
@@ -6,10 +6,13 @@
...
@@ -6,10 +6,13 @@
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
// hidden intermediate argument
// Meta data for GPU
struct
Arg
// TODO: do we need to take care of data alignment in code or it's done by compiler?
template
<
ck
::
index_t
kSize
>
struct
MetaData
{
{
char
data_
[
128
];
char
p_data_
[
kSize
];
ck
::
index_t
size_
=
0
;
ck
::
index_t
size_
=
0
;
ck
::
index_t
pos_
=
0
;
ck
::
index_t
pos_
=
0
;
...
@@ -21,49 +24,59 @@ struct Arg
...
@@ -21,49 +24,59 @@ struct Arg
__device__
void
reset_pos
()
{
pos_
=
0
;
}
__device__
void
reset_pos
()
{
pos_
=
0
;
}
// push arg on host
// push meta data on host
// TODO: correct forwarding?
template
<
typename
T
>
template
<
typename
T
>
__host__
T
push
(
const
T
&
a
)
__host__
auto
push
(
T
&
&
a
)
{
{
*
reinterpret_cast
<
T
*>
(
data
_
+
size
_
)
=
a
;
assert
(
size
_
+
size
of
(
Type
)
<
=
kSize
)
;
si
ze_
+=
sizeof
(
T
)
;
u
si
ng
Type
=
ck
::
remove_cvref_t
<
T
>
;
return
a
;
*
reinterpret_cast
<
Type
*>
(
p_data_
+
size_
)
=
a
;
size_
+=
sizeof
(
Type
);
return
ck
::
forwarder
{}(
a
);
}
}
// pull arg on device
// pull meta data on device
// TODO: correct forwarding?
template
<
typename
T
>
template
<
typename
T
>
__device__
T
pull
()
__device__
auto
pull
()
{
{
T
a
=
*
reinterpret_cast
<
T
*>
(
data_
+
pos_
)
;
using
Type
=
ck
::
remove_cvref_t
<
T
>
;
pos_
+=
sizeof
(
T
);
Type
a
=
*
reinterpret_cast
<
Type
*>
(
p_data_
+
pos_
);
pos_
+=
sizeof
(
Type
);
return
a
;
return
a
;
}
}
};
};
// namespace tp (for tile programming)
// namespace tp (for tile programming)
struct
Tile
Program
struct
Program
Server
{
{
// arg on device
// meta data on device
Arg
arg_
;
MetaData
<
1024
>
meta_data_
;
__host__
void
cpu_init
()
{
meta_data_
.
reset
();
}
__device__
void
gpu_init
()
{
arg
_
.
reset_pos
();
}
__device__
void
gpu_init
()
{
meta_data
_
.
reset_pos
();
}
// push
arg
on host
// push
meta data
on host
template
<
typename
T
>
template
<
typename
T
>
__host__
T
operator
()(
const
T
&
a
)
__host__
auto
operator
()(
T
&
&
a
)
{
{
return
arg
_
.
push
(
a
);
return
ck
::
forwarder
{}(
meta_data
_
.
push
(
a
)
)
;
}
}
// push
arg
on host
// push
meta data
on host
template
<
typename
T
>
template
<
typename
T
>
__device__
T
operator
()(
const
T
&
)
__device__
auto
operator
()(
T
&
&
)
{
{
return
arg
_
.
pull
<
T
>
();
return
ck
::
forwarder
{}(
meta_data
_
.
pull
<
T
>
()
)
;
}
}
__host__
static
ck
::
index_t
get_block_1d_id
()
{
return
-
1
;
}
__host__
static
ck
::
index_t
get_block_1d_id
()
{
return
-
1
;
}
...
@@ -73,23 +86,36 @@ struct TileProgram
...
@@ -73,23 +86,36 @@ struct TileProgram
__device__
static
ck
::
index_t
get_block_1d_id
()
{
return
ck
::
get_block_1d_id
();
}
__device__
static
ck
::
index_t
get_block_1d_id
()
{
return
ck
::
get_block_1d_id
();
}
__device__
static
ck
::
index_t
get_grid_size
()
{
return
ck
::
get_grid_size
();
}
__device__
static
ck
::
index_t
get_grid_size
()
{
return
ck
::
get_grid_size
();
}
// TODO: correct forwarding?
template
<
typename
T
>
__host__
static
constexpr
auto
read_first_lane
(
T
&&
a
)
{
return
ck
::
forwarder
{}(
a
);
}
template
<
typename
T
>
__device__
static
constexpr
auto
read_first_lane
(
T
&&
a
)
{
return
__builtin_amdgcn_readfirstlane
(
a
);
}
};
};
template
<
typename
Program
,
typename
...
Xs
>
template
<
typename
Server
,
typename
Program
,
typename
...
Xs
>
__global__
void
gpu_program_wrapper
(
Program
f
,
Tile
Program
tp
,
Xs
...
xs
)
__global__
void
gpu_program_wrapper
(
Server
server
,
Program
f
,
Xs
...
xs
)
{
{
tp
.
gpu_init
();
server
.
gpu_init
();
f
(
tp
,
xs
...);
f
(
server
,
xs
...);
}
}
template
<
typename
Program
,
typename
...
Xs
>
template
<
typename
Server
,
typename
Program
,
typename
...
Xs
>
void
launch
(
Program
f
,
dim3
grid_dim
,
dim3
block_dim
,
Xs
...
xs
)
void
launch
(
Server
server
,
Program
f
,
dim3
grid_dim
,
dim3
block_dim
,
Xs
...
xs
)
{
{
TileProgram
tp
;
server
.
cpu_init
()
;
f
(
tp
,
xs
...);
f
(
server
,
xs
...);
printf
(
"
cpu arg
size %d
\n
"
,
tp
.
arg
_
.
size_
);
printf
(
"
meta data
size %d
\n
"
,
server
.
meta_data
_
.
size_
);
gpu_program_wrapper
<
Program
><<<
grid_dim
,
block_dim
,
0
,
nullptr
>>>
(
f
,
tp
,
xs
...);
gpu_program_wrapper
<
Server
,
Program
><<<
grid_dim
,
block_dim
,
0
,
nullptr
>>>
(
server
,
f
,
xs
...);
}
}
include/ck/tensor/static_tensor.hpp
View file @
dfe6dedb
...
@@ -78,7 +78,9 @@ struct StaticTensor
...
@@ -78,7 +78,9 @@ struct StaticTensor
StaticBuffer
<
AddressSpace
,
T
,
element_space_size_
,
true
>
data_
;
StaticBuffer
<
AddressSpace
,
T
,
element_space_size_
,
true
>
data_
;
static
constexpr
T
zero_scalar_value_
=
T
{
0
};
static
constexpr
T
zero_scalar_value_
=
T
{
0
};
// for read access of invalid element
const
T
invalid_element_scalar_value_
;
const
T
invalid_element_scalar_value_
;
// for write access of invalid element
T
ignored_element_scalar_
;
T
ignored_element_scalar_
;
};
};
...
@@ -101,12 +103,12 @@ struct StaticTensorTupleOfVectorBuffer
...
@@ -101,12 +103,12 @@ struct StaticTensorTupleOfVectorBuffer
using
V
=
vector_type
<
S
,
ScalarPerVector
>
;
using
V
=
vector_type
<
S
,
ScalarPerVector
>
;
__host__
__device__
constexpr
StaticTensorTupleOfVectorBuffer
()
__host__
__device__
constexpr
StaticTensorTupleOfVectorBuffer
()
:
invalid_element_scalar_value_
{
0
}
:
invalid_element_scalar_value_
{
0
}
,
ignored_element_scalar_
{
0
}
{
{
}
}
__host__
__device__
constexpr
StaticTensorTupleOfVectorBuffer
(
S
invalid_element_value
)
__host__
__device__
constexpr
StaticTensorTupleOfVectorBuffer
(
S
invalid_element_value
)
:
invalid_element_scalar_value_
{
invalid_element_value
}
:
invalid_element_scalar_value_
{
invalid_element_value
}
,
ignored_element_scalar_
{
0
}
{
{
}
}
...
@@ -244,7 +246,9 @@ struct StaticTensorTupleOfVectorBuffer
...
@@ -244,7 +246,9 @@ struct StaticTensorTupleOfVectorBuffer
StaticBufferTupleOfVector
<
AddressSpace
,
S
,
num_of_vector_
,
ScalarPerVector
,
true
>
data_
;
StaticBufferTupleOfVector
<
AddressSpace
,
S
,
num_of_vector_
,
ScalarPerVector
,
true
>
data_
;
static
constexpr
S
zero_scalar_value_
=
S
{
0
};
static
constexpr
S
zero_scalar_value_
=
S
{
0
};
// for read access of invalid element
const
S
invalid_element_scalar_value_
=
S
{
0
};
const
S
invalid_element_scalar_value_
=
S
{
0
};
// for write access of invalid element
S
ignored_element_scalar_
;
S
ignored_element_scalar_
;
};
};
...
...
include/ck/tensor/tensor.hpp
View file @
dfe6dedb
...
@@ -8,32 +8,38 @@ namespace ck {
...
@@ -8,32 +8,38 @@ namespace ck {
template
<
AddressSpaceEnum
AddressSpace
,
template
<
AddressSpaceEnum
AddressSpace
,
bool
InvalidElementUseNumericalZeroValue
,
bool
InvalidElementUseNumericalZeroValue
,
typename
T
,
typename
T
,
typename
TensorDesc
>
typename
TensorDesc
Tmp
>
struct
Tensor
struct
Tensor
{
{
using
TensorDescriptor
=
remove_cvref_t
<
TensorDescTmp
>
;
using
DataType
=
remove_reference_t
<
T
>
;
static
constexpr
AddressSpaceEnum
kAdressSpace_
=
AddressSpace
;
static
constexpr
AddressSpaceEnum
kAdressSpace_
=
AddressSpace
;
static
constexpr
bool
kInvalidElementUseNumericalZeroValue_
=
static
constexpr
bool
kInvalidElementUseNumericalZeroValue_
=
InvalidElementUseNumericalZeroValue
;
InvalidElementUseNumericalZeroValue
;
__host__
__device__
constexpr
Tensor
()
:
buf_
{
nullptr
,
0
},
desc_
{}
{}
__host__
__device__
constexpr
Tensor
()
:
buf_
{
nullptr
,
0
},
desc_
{}
{}
__host__
__device__
constexpr
Tensor
(
T
*
p_data
,
TensorDesc
desc
)
__host__
__device__
constexpr
Tensor
(
DataType
*
p_data
,
TensorDesc
riptor
desc
)
:
buf_
{
p_data
,
desc
.
GetElementSpaceSize
()},
desc_
{
desc
}
:
buf_
{
p_data
,
desc
.
GetElementSpaceSize
()},
desc_
{
desc
}
{
{
}
}
__host__
__device__
constexpr
Tensor
(
T
*
p_data
,
TensorDesc
desc
,
T
invalid_element_value
)
__host__
__device__
constexpr
Tensor
(
DataType
*
p_data
,
TensorDescriptor
desc
,
DataType
invalid_element_value
)
:
buf_
{
p_data
,
desc
.
GetElementSpaceSize
(),
invalid_element_value
},
desc_
{
desc
}
:
buf_
{
p_data
,
desc
.
GetElementSpaceSize
(),
invalid_element_value
},
desc_
{
desc
}
{
{
}
}
// member
DynamicBuffer
<
AddressSpace
,
DynamicBuffer
<
AddressSpace
,
T
,
DataType
,
typename
TensorDesc
::
ElementSpaceSizeType
,
typename
TensorDesc
riptor
::
ElementSpaceSizeType
,
InvalidElementUseNumericalZeroValue
>
InvalidElementUseNumericalZeroValue
>
buf_
;
buf_
;
TensorDesc
desc_
;
TensorDesc
riptor
desc_
;
};
};
template
<
AddressSpaceEnum
AddressSpace
,
template
<
AddressSpaceEnum
AddressSpace
,
...
...
include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
View file @
dfe6dedb
...
@@ -45,6 +45,10 @@ struct ThreadGroupTensorSliceTransfer_v4r1
...
@@ -45,6 +45,10 @@ struct ThreadGroupTensorSliceTransfer_v4r1
using
Index
=
MultiIndex
<
nDim
>
;
using
Index
=
MultiIndex
<
nDim
>
;
#if 1 // debug
__host__
__device__
constexpr
ThreadGroupTensorSliceTransfer_v4r1
()
:
threadwise_transfer_
{}
{}
#endif
__device__
constexpr
ThreadGroupTensorSliceTransfer_v4r1
(
__device__
constexpr
ThreadGroupTensorSliceTransfer_v4r1
(
const
SrcDesc
&
src_desc
,
const
SrcDesc
&
src_desc
,
const
Index
&
src_block_slice_origin
,
const
Index
&
src_block_slice_origin
,
...
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
View file @
dfe6dedb
...
@@ -52,8 +52,8 @@ template <typename SliceLengths,
...
@@ -52,8 +52,8 @@ template <typename SliceLengths,
typename
SrcElementwiseOperation
,
typename
SrcElementwiseOperation
,
typename
DstElementwiseOperation
,
typename
DstElementwiseOperation
,
InMemoryDataOperationEnum
DstInMemOp
,
InMemoryDataOperationEnum
DstInMemOp
,
typename
SrcData
,
typename
SrcData
Tmp
,
typename
DstData
,
typename
DstData
Tmp
,
typename
SrcDesc
,
typename
SrcDesc
,
typename
DstDesc
,
typename
DstDesc
,
typename
SrcDimAccessOrder
,
typename
SrcDimAccessOrder
,
...
@@ -76,6 +76,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
...
@@ -76,6 +76,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1
static
constexpr
index_t
nDim
=
SliceLengths
::
Size
();
static
constexpr
index_t
nDim
=
SliceLengths
::
Size
();
using
Index
=
MultiIndex
<
nDim
>
;
using
Index
=
MultiIndex
<
nDim
>
;
using
SrcData
=
remove_cvref_t
<
SrcDataTmp
>
;
using
DstData
=
remove_cvref_t
<
DstDataTmp
>
;
using
SrcCoord
=
decltype
(
make_tensor_coordinate
(
SrcDesc
{},
Index
{}));
using
SrcCoord
=
decltype
(
make_tensor_coordinate
(
SrcDesc
{},
Index
{}));
using
DstCoord
=
decltype
(
make_tensor_coordinate
(
DstDesc
{},
Index
{}));
using
DstCoord
=
decltype
(
make_tensor_coordinate
(
DstDesc
{},
Index
{}));
...
@@ -84,6 +87,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
...
@@ -84,6 +87,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1
static
constexpr
auto
I0
=
Number
<
0
>
{};
static
constexpr
auto
I0
=
Number
<
0
>
{};
__host__
__device__
constexpr
ThreadwiseTensorSliceTransfer_v3r1
()
:
src_coord_
{},
dst_coord_
{},
src_element_op_
{},
dst_element_op_
{}
{
}
__device__
constexpr
ThreadwiseTensorSliceTransfer_v3r1
(
__device__
constexpr
ThreadwiseTensorSliceTransfer_v3r1
(
const
SrcDesc
&
src_desc
,
const
SrcDesc
&
src_desc
,
const
Index
&
src_slice_origin
,
const
Index
&
src_slice_origin
,
...
...
include/ck/utility/dynamic_buffer.hpp
View file @
dfe6dedb
...
@@ -24,12 +24,17 @@ struct DynamicBuffer
...
@@ -24,12 +24,17 @@ struct DynamicBuffer
{
{
using
type
=
T
;
using
type
=
T
;
T
*
p_data_
;
T
*
p_data_
=
nullptr
;
ElementSpaceSize
element_space_size_
;
ElementSpaceSize
element_space_size_
;
remove_cvref_t
<
T
>
invalid_element_value_
=
T
{
0
};
remove_cvref_t
<
T
>
invalid_element_value_
=
T
{
0
};
__host__
__device__
constexpr
DynamicBuffer
()
:
p_data_
{},
element_space_size_
{},
invalid_element_value_
{}
{
}
__host__
__device__
constexpr
DynamicBuffer
(
T
*
p_data
,
ElementSpaceSize
element_space_size
)
__host__
__device__
constexpr
DynamicBuffer
(
T
*
p_data
,
ElementSpaceSize
element_space_size
)
:
p_data_
{
p_data
},
element_space_size_
{
element_space_size
}
:
p_data_
{
p_data
},
element_space_size_
{
element_space_size
}
,
invalid_element_value_
{
0
}
{
{
}
}
...
@@ -42,20 +47,17 @@ struct DynamicBuffer
...
@@ -42,20 +47,17 @@ struct DynamicBuffer
{
{
}
}
__host__
__device__
static
constexpr
AddressSpaceEnum
GetAddressSpace
()
__device__
static
constexpr
AddressSpaceEnum
GetAddressSpace
()
{
return
BufferAddressSpace
;
}
{
return
BufferAddressSpace
;
}
__host__
__device__
constexpr
const
T
&
operator
[](
index_t
i
)
const
{
return
p_data_
[
i
];
}
__device__
constexpr
const
T
&
operator
[](
index_t
i
)
const
{
return
p_data_
[
i
];
}
__host__
__device__
constexpr
T
&
operator
()(
index_t
i
)
{
return
p_data_
[
i
];
}
__device__
constexpr
T
&
operator
()(
index_t
i
)
{
return
p_data_
[
i
];
}
template
<
typename
X
,
template
<
typename
X
,
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
bool
>::
type
=
false
>
bool
>::
type
=
false
>
__host__
__device__
constexpr
auto
Get
(
index_t
i
,
bool
is_valid_element
)
const
__device__
constexpr
auto
Get
(
index_t
i
,
bool
is_valid_element
)
const
{
{
// X contains multiple T
// X contains multiple T
constexpr
index_t
scalar_per_t_vector
=
scalar_type
<
remove_cvref_t
<
T
>>::
vector_size
;
constexpr
index_t
scalar_per_t_vector
=
scalar_type
<
remove_cvref_t
<
T
>>::
vector_size
;
...
@@ -120,7 +122,7 @@ struct DynamicBuffer
...
@@ -120,7 +122,7 @@ struct DynamicBuffer
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
bool
>::
type
=
false
>
bool
>::
type
=
false
>
__host__
__device__
void
Update
(
index_t
i
,
bool
is_valid_element
,
const
X
&
x
)
__device__
void
Update
(
index_t
i
,
bool
is_valid_element
,
const
X
&
x
)
{
{
if
constexpr
(
Op
==
InMemoryDataOperationEnum
::
Set
)
if
constexpr
(
Op
==
InMemoryDataOperationEnum
::
Set
)
{
{
...
@@ -147,7 +149,7 @@ struct DynamicBuffer
...
@@ -147,7 +149,7 @@ struct DynamicBuffer
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
bool
>::
type
=
false
>
bool
>::
type
=
false
>
__host__
__device__
void
Set
(
index_t
i
,
bool
is_valid_element
,
const
X
&
x
)
__device__
void
Set
(
index_t
i
,
bool
is_valid_element
,
const
X
&
x
)
{
{
// X contains multiple T
// X contains multiple T
constexpr
index_t
scalar_per_t_vector
=
scalar_type
<
remove_cvref_t
<
T
>>::
vector_size
;
constexpr
index_t
scalar_per_t_vector
=
scalar_type
<
remove_cvref_t
<
T
>>::
vector_size
;
...
@@ -290,7 +292,7 @@ struct DynamicBuffer
...
@@ -290,7 +292,7 @@ struct DynamicBuffer
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
bool
>::
type
=
false
>
bool
>::
type
=
false
>
__host__
__device__
void
AtomicAdd
(
index_t
i
,
bool
is_valid_element
,
const
X
&
x
)
__device__
void
AtomicAdd
(
index_t
i
,
bool
is_valid_element
,
const
X
&
x
)
{
{
using
scalar_t
=
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
;
using
scalar_t
=
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
;
...
@@ -339,7 +341,7 @@ struct DynamicBuffer
...
@@ -339,7 +341,7 @@ struct DynamicBuffer
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
enable_if
<
is_same
<
typename
scalar_type
<
remove_cvref_t
<
X
>
>::
type
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
typename
scalar_type
<
remove_cvref_t
<
T
>>::
type
>::
value
,
bool
>::
type
=
false
>
bool
>::
type
=
false
>
__host__
__device__
void
AtomicMax
(
index_t
i
,
bool
is_valid_element
,
const
X
&
x
)
__device__
void
AtomicMax
(
index_t
i
,
bool
is_valid_element
,
const
X
&
x
)
{
{
// X contains multiple T
// X contains multiple T
constexpr
index_t
scalar_per_t_vector
=
scalar_type
<
remove_cvref_t
<
T
>>::
vector_size
;
constexpr
index_t
scalar_per_t_vector
=
scalar_type
<
remove_cvref_t
<
T
>>::
vector_size
;
...
@@ -371,9 +373,9 @@ struct DynamicBuffer
...
@@ -371,9 +373,9 @@ struct DynamicBuffer
}
}
}
}
__host__
__device__
static
constexpr
bool
IsStaticBuffer
()
{
return
false
;
}
__device__
static
constexpr
bool
IsStaticBuffer
()
{
return
false
;
}
__host__
__device__
static
constexpr
bool
IsDynamicBuffer
()
{
return
true
;
}
__device__
static
constexpr
bool
IsDynamicBuffer
()
{
return
true
;
}
};
};
template
<
AddressSpaceEnum
BufferAddressSpace
,
typename
T
,
typename
ElementSpaceSize
>
template
<
AddressSpaceEnum
BufferAddressSpace
,
typename
T
,
typename
ElementSpaceSize
>
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment