gaoqiong / composable_kernel · Commits

Commit 7da654a4, authored Oct 25, 2020 by Chao Liu

    added dynamic col2im

parent 9d000309
Showing 9 changed files with 206 additions and 407 deletions.
composable_kernel/include/kernel_algorithm/dynamic_gridwise_col2im_gemmkgemmn_nchw.hpp   +26 -20
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp           +5 -6
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp   +123 -4
composable_kernel/include/tensor_operation/gridwise_tensor_contraction.hpp               +0 -330
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp  +17 -3
driver/include/device_dynamic_col2im_gemmkgemmn_nchw.hpp                                 +3 -3
driver/include/host_col2im.hpp                                                           +19 -13
driver/src/col2im_driver.cpp                                                             +2 -17
driver/src/conv_bwd_data_driver.cpp                                                      +11 -11
composable_kernel/include/kernel_algorithm/dynamic_gridwise_col2im_gemmkgemmn_nchw.hpp
@@ -119,6 +119,7 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
         // blockwise atomic accumulation
         auto blockwise_copy =
+#if 0
             BlockwiseDynamicTensorSliceTransfer_v1<BlockSize,
                                                    float,
                                                    float,
@@ -129,8 +130,6 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
                                                    BlockCopyClusterLengths_GemmK_GemmN,
                                                    BlockCopyThreadClusterArrangeOrder,
                                                    BlockCopySrcAccessOrder,
-                                                   BlockCopyDstAccessOrder,
-                                                   1,
-                                                   1,
+                                                   BlockCopyDstAccessOrder,
                                                    BlockCopyDataPerAccess_GemmN,
                                                    BlockCopyDataPerAccess_GemmN,
@@ -139,32 +138,39 @@ struct DynamicGridwiseCol2Im_gemmkgemmn_nchw
                                                    InMemoryDataOperation::AtomicAdd,
                                                    1,
                                                    1>(
+#else
+            BlockwiseDynamicTensorSliceTransfer_v2<BlockSize,
+                                                   float,
+                                                   float,
+                                                   decltype(col_gemmk_gemmn_global_desc),
+                                                   decltype(img_gemmk_gemmn_global_desc),
+                                                   Sequence<GemmKPerBlock, GemmNPerBlock>,
+                                                   BlockCopySubLengths_GemmK_GemmN,
+                                                   BlockCopyClusterLengths_GemmK_GemmN,
+                                                   BlockCopyThreadClusterArrangeOrder,
+                                                   BlockCopySrcAccessOrder,
+                                                   BlockCopyDstAccessOrder,
+                                                   1,
+                                                   1,
+                                                   BlockCopyDataPerAccess_GemmN,
+                                                   BlockCopyDataPerAccess_GemmN,
+                                                   AddressSpace::Global,
+                                                   AddressSpace::Global,
+                                                   InMemoryDataOperation::AtomicAdd,
+                                                   1,
+                                                   1>(
+#endif
                 col_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global),
                 img_gemmk_gemmn_global_desc,
                 make_multi_index(0, gemmn_block_data_on_global));

-        auto col_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(
-            col_gemmk_gemmn_global_desc, make_multi_index(0, 0));
-
-        auto img_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(
-            img_gemmk_gemmn_global_desc, make_multi_index(0, 0));
-
-        const auto col_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step(
-            col_gemmk_gemmn_global_desc, make_multi_index(GemmKPerBlock, 0));
-
-        const auto img_gemmk_gemmn_coord_step = make_dynamic_tensor_coordinate_step(
-            img_gemmk_gemmn_global_desc, make_multi_index(GemmKPerBlock, 0));
-
-        for(index_t gemmk = 0; gemmk < GemmK - GemmKPerBlock; gemmk += GemmKPerBlock)
+        for(index_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
         {
             blockwise_copy.Run(p_col_global, p_img_global);

-            move_dynamic_tensor_coordinate(
-                col_gemmk_gemmn_global_desc, col_gemmk_gemmn_coord, col_gemmk_gemmn_coord_step);
-
-            move_dynamic_tensor_coordinate(
-                img_gemmk_gemmn_global_desc, img_gemmk_gemmn_coord, img_gemmk_gemmn_coord_step);
+            blockwise_copy.MoveSrcSliceWindow(make_multi_index(GemmKPerBlock, 0));
+            blockwise_copy.MoveDstSliceWindow(make_multi_index(GemmKPerBlock, 0));
         }
     }
 };
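The net effect of this hunk is that the kernel no longer carries raw tensor coordinates; the copy object owns both slice windows and advances them itself, so the caller's loop shrinks to Run / MoveSrcSliceWindow / MoveDstSliceWindow. A minimal host-side sketch of that loop shape, assuming a made-up WindowCopy type (the real blockwise_copy is a device-side template) and an add-accumulating destination to mirror InMemoryDataOperation::AtomicAdd:

    // Host-side sketch only: a copy object that owns its window origins,
    // transfers one GemmKPerBlock x GemmN tile per Run(), and moves both
    // windows along GemmK between runs. All names and sizes are illustrative.
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct WindowCopy
    {
        std::size_t src_row = 0, dst_row = 0; // window origins along GemmK

        void Run(const std::vector<float>& src, std::vector<float>& dst,
                 std::size_t rows, std::size_t cols, std::size_t ld) const
        {
            for(std::size_t r = 0; r < rows; ++r)
                for(std::size_t c = 0; c < cols; ++c)
                    dst[(dst_row + r) * ld + c] += src[(src_row + r) * ld + c]; // AtomicAdd analogue
        }

        void MoveSrcSliceWindow(std::size_t step) { src_row += step; }
        void MoveDstSliceWindow(std::size_t step) { dst_row += step; }
    };

    int main()
    {
        constexpr std::size_t GemmK = 32, GemmKPerBlock = 8, GemmN = 16;
        std::vector<float> col(GemmK * GemmN, 1.0f), img(GemmK * GemmN, 0.0f);

        WindowCopy blockwise_copy;
        for(std::size_t gemmk = 0; gemmk < GemmK; gemmk += GemmKPerBlock)
        {
            blockwise_copy.Run(col, img, GemmKPerBlock, GemmN, GemmN);
            blockwise_copy.MoveSrcSliceWindow(GemmKPerBlock);
            blockwise_copy.MoveDstSliceWindow(GemmKPerBlock);
        }
        std::printf("img[0] = %f\n", img[0]); // 1.0: each element accumulated once
    }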
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
@@ -459,7 +459,7 @@ struct DynamicMerge
             // this should be saved in SGPR as well
             index_t idx_low_length_minus_idx_diff_low_const =
                 low_lengths_[i] - idx_diff_low_const[i];

-#if 0
+#if 1
             index_t idx_low_length_plus_idx_diff_low_const =
                 low_lengths_[i] + idx_diff_low_const[i];
 #endif
@@ -467,21 +467,20 @@ struct DynamicMerge
             index_t idx_low_tmp = idx_low_old[i] + carry;

             bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const;
-#if 0
+#if 1
             bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
 #endif

             idx_diff_low(i) =
                 do_carry ? -idx_low_length_minus_idx_diff_low_const : idx_diff_low_const[i];
-#if 0
-            idx_diff_low(i) =
-                do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
+#if 1
+            idx_diff_low(i) = do_borrow ? idx_low_length_plus_idx_diff_low_const : idx_diff_low[i];
 #endif

             idx_diff_low(i) += carry;

             carry = do_carry ? 1 : 0;
-#if 0
+#if 1
             carry = do_borrow ? -1 : carry;
 #endif
         });
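The re-enabled #if blocks add borrow handling next to the existing carry handling, so an index diff can also step backwards through a merged dimension. A host-side sketch of the underlying mixed-radix idea, assuming a plain multi-index rather than the library's index-diff machinery:

    // Sketch: add a signed per-dimension step to a multi-index with
    // mixed-radix lengths, propagating +1 on overflow (carry) and -1 on
    // underflow (borrow), least-significant dimension last.
    #include <array>
    #include <cstdio>

    template <std::size_t N>
    void update_index(std::array<int, N>& idx, const std::array<int, N>& step,
                      const std::array<int, N>& lengths)
    {
        int carry = 0; // +1 means carry, -1 means borrow
        for(int i = static_cast<int>(N) - 1; i >= 0; --i)
        {
            int tmp = idx[i] + step[i] + carry;
            bool do_carry  = tmp >= lengths[i];
            bool do_borrow = tmp < 0;
            idx[i] = do_carry ? tmp - lengths[i] : (do_borrow ? tmp + lengths[i] : tmp);
            carry  = do_carry ? 1 : (do_borrow ? -1 : 0);
        }
    }

    int main()
    {
        std::array<int, 3> idx{0, 2, 3}, lengths{4, 3, 4};
        update_index(idx, {0, 0, 1}, lengths);  // {0,2,3} -> {1,0,0}: carry ripples up
        update_index(idx, {0, 0, -1}, lengths); // {1,0,0} -> {0,2,3}: borrow ripples up
        std::printf("%d %d %d\n", idx[0], idx[1], idx[2]);
    }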
composable_kernel/include/tensor_operation/blockwise_dynamic_tensor_slice_transfer.hpp
@@ -9,6 +9,116 @@
 namespace ck {

+template <index_t BlockSize,
+          typename BlockSrcData,
+          typename BlockDstData,
+          typename BlockSrcDesc,
+          typename BlockDstDesc,
+          typename BlockSliceLengths,
+          typename ThreadSliceLengths,
+          typename ThreadClusterLengths,
+          typename ThreadClusterArrangeOrder,
+          typename SrcDstDimAccessOrder,
+          index_t SrcDstVectoReadDim,
+          index_t SrcDataPerRead,
+          index_t DstDataPerWrite,
+          AddressSpace SrcAddressSpace,
+          AddressSpace DstAddressSpace,
+          InMemoryDataOperation DstInMemOp,
+          index_t SrcDataStride,
+          index_t DstDataStride>
+struct BlockwiseDynamicTensorSliceTransfer_v1
+{
+    static constexpr index_t nDim =
+        remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();
+
+    using Index = MultiIndex<nDim>;
+
+    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(
+        const BlockSrcDesc& block_src_desc,
+        const Index& src_block_slice_origin,
+        const BlockDstDesc& block_dst_desc,
+        const Index& dst_block_slice_origin)
+        : threadwise_transfer_(block_src_desc,
+                               make_zero_multi_index<nDim>(),
+                               block_dst_desc,
+                               make_zero_multi_index<nDim>())
+    {
+        static_assert(nDim == remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension() &&
+                          nDim == remove_reference_t<remove_cv_t<BlockDstDesc>>::GetNumOfDimension() &&
+                          nDim == BlockSliceLengths::Size() &&
+                          nDim == ThreadSliceLengths::Size() &&
+                          nDim == ThreadClusterLengths::Size() &&
+                          nDim == ThreadClusterArrangeOrder::Size() &&
+                          nDim == SrcDstDimAccessOrder::Size(),
+                      "wrong! nDim not consistent");
+
+        static_assert(is_same<BlockSliceLengths,
+                              decltype(ThreadSliceLengths{} * ThreadClusterLengths{})>{},
+                      "wrong! threads should be mapped to cover entire slicing window");
+
+        static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(),
+                      "wrong! BlockSize too small");
+
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            const auto thread_cluster_id =
+                thread_cluster_desc_.CalculateClusterIndex(get_thread_local_1d_id());
+
+            const auto thread_data_id_begin = thread_cluster_id * ThreadSliceLengths{};
+
+            threadwise_transfer_.SetSrcSliceOrigin(src_block_slice_origin + thread_data_id_begin);
+            threadwise_transfer_.SetDstSliceOrigin(dst_block_slice_origin + thread_data_id_begin);
+        }
+    }
+
+    __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.Run(p_block_src, p_block_dst);
+        }
+    }
+
+    __device__ void MoveSrcSliceWindow(const Index& step)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveSrcSliceWindow(step);
+        }
+    }
+
+    __device__ void MoveDstSliceWindow(const Index& step)
+    {
+        if(BlockSize == thread_cluster_desc_.GetElementSize() or
+           get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize())
+        {
+            threadwise_transfer_.MoveDstSliceWindow(step);
+        }
+    }
+
+    private:
+    static constexpr auto thread_cluster_desc_ =
+        make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{});
+
+    using ThreadwiseTransfer =
+        ThreadwiseDynamicTensorSliceTransfer_v1<BlockSrcDesc,
+                                                BlockDstDesc,
+                                                ThreadSliceLengths,
+                                                SrcDstDimAccessOrder,
+                                                SrcDstVectoReadDim,
+                                                SrcDataPerRead,
+                                                DstDataPerWrite,
+                                                SrcAddressSpace,
+                                                DstAddressSpace,
+                                                DstInMemOp,
+                                                SrcDataStride,
+                                                DstDataStride>;
+
+    ThreadwiseTransfer threadwise_transfer_;
+};
+
 template <index_t BlockSize,
           typename BlockSrcData,
           typename BlockDstData,
@@ -20,7 +130,7 @@ template <index_t BlockSize,
           typename ThreadClusterArrangeOrder,
           typename SrcDimAccessOrder,
           typename DstDimAccessOrder,
-          index_t SrcVectoReadDim,
+          index_t SrcVectorReadDim,
           index_t DstVectorWriteDim,
           index_t SrcDataPerRead,
           index_t DstDataPerWrite,
@@ -29,17 +139,26 @@ template <index_t BlockSize,
           InMemoryDataOperation DstInMemOp,
           index_t SrcDataStride,
           index_t DstDataStride>
-struct BlockwiseDynamicTensorSliceTransfer_v1
+struct BlockwiseDynamicTensorSliceTransfer_v2
 {
     static constexpr index_t nDim =
         remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension();

     using Index = MultiIndex<nDim>;

-    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v1(const BlockSrcDesc& block_src_desc,
+    __device__ constexpr BlockwiseDynamicTensorSliceTransfer_v2(const BlockSrcDesc& block_src_desc,
                                                                 const Index& src_block_slice_origin,
                                                                 const BlockDstDesc& block_dst_desc,
                                                                 const Index& dst_block_slice_origin)
+        : threadwise_read_(block_src_desc,
+                           make_zero_multi_index<nDim>(),
+                           thread_buffer_desc_,
+                           make_zero_multi_index<nDim>()),
+          threadwise_write_(thread_buffer_desc_,
+                            make_zero_multi_index<nDim>(),
+                            block_dst_desc,
+                            make_zero_multi_index<nDim>())
     {
         static_assert(
             nDim == remove_reference_t<remove_cv_t<BlockSrcDesc>>::GetNumOfDimension() &&
@@ -131,7 +250,7 @@ struct BlockwiseDynamicTensorSliceTransfer_v1
                                                 decltype(thread_buffer_desc_),
                                                 ThreadSliceLengths,
                                                 SrcDimAccessOrder,
-                                                SrcVectoReadDim,
+                                                SrcVectorReadDim,
                                                 SrcDataPerRead,
                                                 1,
                                                 SrcAddressSpace,
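The new v1 transfer assigns each thread its piece of the block tile by decomposing the flat thread id into a cluster index over ThreadClusterLengths and scaling by ThreadSliceLengths. A host-side sketch of that mapping, assuming row-major cluster order and example sizes:

    // Sketch: 1d thread id -> 2d cluster index -> per-thread slice origin,
    // mirroring CalculateClusterIndex and thread_data_id_begin in the diff.
    #include <array>
    #include <cstdio>

    int main()
    {
        constexpr std::array<int, 2> cluster_lengths{8, 16}; // threads per dim
        constexpr std::array<int, 2> slice_lengths{1, 8};    // elements per thread per dim

        for(int tid = 0; tid < cluster_lengths[0] * cluster_lengths[1]; ++tid)
        {
            // row-major decomposition of the flat thread id
            std::array<int, 2> cluster_id{tid / cluster_lengths[1], tid % cluster_lengths[1]};

            // this thread's slice origin inside the block tile
            std::array<int, 2> data_begin{cluster_id[0] * slice_lengths[0],
                                          cluster_id[1] * slice_lengths[1]};
            if(tid < 3)
                std::printf("tid %d -> origin (%d, %d)\n", tid, data_begin[0], data_begin[1]);
        }
    }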
composable_kernel/include/tensor_operation/gridwise_tensor_contraction.hpp
deleted 100644 → 0
#ifndef CK_GRIDWISE_TENSOR_CONTRACTION_HPP
#define CK_GRIDWISE_TENSOR_CONTRACTION_HPP

#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
#include "ConstantMatrixDescriptor.hpp"
#include "blockwise_generic_tensor_slice_copy.hpp"
#include "threadwise_generic_tensor_slice_copy.hpp"
#include "blockwise_gemm.hpp"

namespace ck {

template <index_t GridSize,
          index_t BlockSize,
          typename Float,
          typename AccFloat,
          typename AGlobalDesc,
          typename BGlobalDesc,
          typename CGlobalDesc,
          typename CBlockLengths,
          index_t KPerBlock,
          InMemoryDataOperation CGlobalMemoryDataOperation>
struct GridwiseTensorContraction_v1
{
    __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() {}

    __device__ void Run(const Float* __restrict__ p_a_global,
                        const Float* __restrict__ p_b_global,
                        Float* __restrict__ p_c_global,
                        Float* __restrict__ p_shared_block) const
    {
        /// \todo sanity-check on AGlobalDesc, BGlboalDesc, CGlobalDesc length consisitency
        /// \todo santiy-check on CBlockLengtsh

        constexpr auto True = integral_constant<bool, true>{};

        constexpr auto a_global_desc = AGlobalDesc{};
        constexpr auto b_global_desc = BGlobalDesc{};
        constexpr auto c_global_desc = CGlobalDesc{};

        constexpr auto K = a_global_desc.GetLengths()[0];

        // don't do anything if K == 0
        if(K == 0)
        {
            return;
        }

        // lds max alignment
        constexpr index_t max_lds_align = math::lcm(ABlockCopyDstDataPerWrite_M,
                                                    BBlockCopyDstDataPerWrite_N,
                                                    ThreadGemmAThreadCopySrcDataPerRead_M,
                                                    ThreadGemmBThreadCopySrcDataPerRead_N);

        // divide block work by [M, N]
        static_assert(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0,
                      "wrong! cannot divide work evenly among block");

        constexpr index_t MBlockWork = M / MPerBlock;
        constexpr index_t NBlockWork = N / NPerBlock;

        constexpr auto block_work_desc =
            make_cluster_descriptor(Sequence<MBlockWork, NBlockWork>{});

        const auto block_work_id = block_work_desc.CalculateClusterIndex(get_block_1d_id());

        const index_t m_block_data_on_global = block_work_id[0] * MPerBlock;
        const index_t n_block_data_on_global = block_work_id[1] * NPerBlock;

        // A matrix in LDS memory, dst of blockwise copy
        //   be careful of LDS alignment
        constexpr auto a_k_m_block_desc = make_native_tensor_descriptor_aligned(
            Sequence<KPerBlock, MPerBlock>{}, Number<max_lds_align>{});

        // A matrix blockwise copy
        auto a_blockwise_copy =
            BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                               AGlobalDesc,
                                               decltype(a_block_desc),
                                               decltype(a_k_m_block_desc.GetLengths()),
                                               ABlockCopyThreadSliceLengths_K_M,
                                               ABlockCopyThreadClusterLengths_K_M,
                                               ABlockCopyThreadClusterArrangeOrder,
                                               ABlockCopySrcAccessOrder,
                                               Sequence<0, 1>,
                                               ABlockCopySrcVectorReadDim,
                                               1,
                                               ABlockCopySrcDataPerRead,
                                               ABlockCopyDstDataPerWrite_M,
                                               AddressSpace::Global,
                                               AddressSpace::Vgpr,
                                               AddressSpace::Lds,
                                               InMemoryDataOperation::Set>(
                {0, m_block_data_on_global}, {0, 0});

        // B matrix in LDS memory, dst of blockwise copy
        //   be careful of LDS alignment
        constexpr auto b_k_n_block_desc = make_native_tensor_descriptor_aligned(
            Sequence<KPerBlock, NPerBlock>{}, Number<max_lds_align>{});

        // B matrix blockwise copy
        auto b_blockwise_copy =
            BlockwiseGenericTensorSliceCopy_v4<BlockSize,
                                               decltype(b_k_n_global_desc),
                                               decltype(b_k_n_block_desc),
                                               decltype(b_k_n_block_desc.GetLengths()),
                                               BBlockCopyThreadSliceLengths_K_N,
                                               BBlockCopyThreadClusterLengths_K_N,
                                               BBlockCopyThreadClusterArrangeOrder,
                                               BBlockCopySrcAccessOrder,
                                               Sequence<0, 1>,
                                               BBlockCopySrcVectorReadDim,
                                               1,
                                               BBlockCopySrcDataPerRead,
                                               BBlockCopyDstDataPerWrite_N,
                                               AddressSpace::Global,
                                               AddressSpace::Vgpr,
                                               AddressSpace::Lds,
                                               InMemoryDataOperation::Set>(
                {0, n_block_data_on_global}, {0, 0});

        // GEMM definition
        //   c_mtx += transpose(a_mtx) * b_mtx
        //     a_mtx[KPerBlock, MPerBlock] is in LDS
        //     b_mtx[KPerBlocl, NPerBlock] is in LDS
        //     c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
        //       register
        constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor(a_k_m_block_desc);
        constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor(b_k_n_block_desc);

        // sanity check
        static_assert(MPerBlock % (MPerThread * MLevel0Cluster * MLevel1Cluster) == 0 &&
                          NPerBlock % (NPerThread * NLevel0Cluster * NLevel1Cluster) == 0,
                      "wrong!");

        constexpr index_t GemmMRepeat = MPerBlock / (MPerThread * MLevel0Cluster * MLevel1Cluster);
        constexpr index_t GemmNRepeat = NPerBlock / (NPerThread * NLevel0Cluster * NLevel1Cluster);

        // c_thread_mtx definition: this is a mess
        // TODO:: more elegent way of defining c_thread_mtx
        constexpr auto c_m0m1_n0n1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
            Number<GemmMRepeat * MPerThread>{}, Number<GemmNRepeat * NPerThread>{});

        const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
            BlockSize,
            decltype(a_k_m_block_mtx_desc),
            decltype(b_k_n_block_mtx_desc),
            decltype(c_m0m1_n0n1_thread_mtx_desc),
            MPerThread,
            NPerThread,
            MLevel0Cluster,
            NLevel0Cluster,
            MLevel1Cluster,
            NLevel1Cluster,
            KPerThread,
            ThreadGemmAThreadCopySrcDataPerRead_M,
            ThreadGemmBThreadCopySrcDataPerRead_N>{};

        // LDS allocation for A and B: be careful of alignment
        constexpr index_t a_block_space =
            math::integer_least_multiple(a_k_m_block_desc.GetElementSpace(), max_lds_align);
        constexpr index_t b_block_space =
            math::integer_least_multiple(b_k_n_block_desc.GetElementSpace(), max_lds_align);

        Float* p_a_block_double = p_shared_block;
        Float* p_b_block_double = p_shared_block + 2 * a_block_space;

        // register allocation for output
        AccFloat p_c_thread[c_m0m1_n0n1_thread_mtx_desc.GetElementSpace()];

        // zero out threadwise output
        threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);

        // LDS double buffer: preload data into LDS
        {
            a_blockwise_copy.Run(p_a_global, p_a_block_double);
            b_blockwise_copy.Run(p_b_global, p_b_block_double);
        }

        constexpr auto a_block_slice_copy_steps = Sequence<KPerBlock, 0>{};
        constexpr auto b_block_slice_copy_steps = Sequence<KPerBlock, 0>{};

        // LDS double buffer: main body
        for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K;
            k_block_data_begin += 2 * KPerBlock)
        {
#pragma unroll
            for(index_t iloop = 0; iloop < 2; ++iloop)
            {
                const bool even_loop = (iloop % 2 == 0);

                Float* p_a_block_now =
                    even_loop ? p_a_block_double : p_a_block_double + a_block_space;
                Float* p_b_block_now =
                    even_loop ? p_b_block_double : p_b_block_double + b_block_space;

                Float* p_a_block_next =
                    even_loop ? p_a_block_double + a_block_space : p_a_block_double;
                Float* p_b_block_next =
                    even_loop ? p_b_block_double + b_block_space : p_b_block_double;

                Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
                Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];

                a_blockwise_copy.MoveSrcSliceWindow(a_block_slice_copy_steps, True);
                b_blockwise_copy.MoveSrcSliceWindow(b_block_slice_copy_steps, True);

                __syncthreads();

                // LDS doubel buffer: load next data from device mem
                a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer);
                b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);

                // LDS double buffer: GEMM on current data
                blockwise_gemm.Run(p_a_block_now, p_b_block_now, p_c_thread);

                // LDS double buffer: store next data to LDS
                a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer, p_a_block_next);
                b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer, p_b_block_next);
            }
        }

        // LDS double buffer: tail
        {
            constexpr bool has_two_iteration_left = (K % (2 * KPerBlock) == 0);

            if(has_two_iteration_left) // if has 2 iteration left
            {
                Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
                Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];

                a_blockwise_copy.MoveSrcSliceWindow(a_block_slice_copy_steps, True);
                b_blockwise_copy.MoveSrcSliceWindow(b_block_slice_copy_steps, True);

                __syncthreads();

                // LDS double buffer: load last data from device mem
                a_blockwise_copy.RunLoadThreadBuffer(p_a_global, p_a_thread_buffer);
                b_blockwise_copy.RunLoadThreadBuffer(p_b_global, p_b_thread_buffer);

                // LDS double buffer: GEMM on 2nd-last data
                blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);

                // LDS double buffer: store last data to LDS
                a_blockwise_copy.RunStoreThreadBuffer(p_a_thread_buffer,
                                                      p_a_block_double + a_block_space);
                b_blockwise_copy.RunStoreThreadBuffer(p_b_thread_buffer,
                                                      p_b_block_double + b_block_space);

                __syncthreads();

                // LDS double buffer: GEMM on last data
                blockwise_gemm.Run(
                    p_a_block_double + a_block_space, p_b_block_double + b_block_space, p_c_thread);
            }
            else // if has 1 iteration left
            {
                __syncthreads();

                // LDS double buffer: GEMM on last data
                blockwise_gemm.Run(p_a_block_double, p_b_block_double, p_c_thread);
            }
        }

        // input: register to global memory
        {
            constexpr index_t M1 = MPerThread * MLevel0Cluster * MLevel1Cluster;
            constexpr index_t M0 = M / M1;

            constexpr index_t N1 = NPerThread * NLevel0Cluster * NLevel1Cluster;
            constexpr index_t N0 = N / N1;

            // define input tensor descriptor for threadwise copy
            //     thread input tensor, src of threadwise copy
            constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed(
                Sequence<GemmMRepeat, MPerThread, GemmNRepeat, NPerThread>{});

            constexpr auto c_m0_m1_n0_n1_global_desc = transform_tensor_descriptor(
                c_m_n_global_desc,
                make_tuple(UnMerge<Sequence<M0, M1>>{}, UnMerge<Sequence<N0, N1>>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}),
                make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}));

            // calculate origin of thread input tensor on global memory
            //     blockwise GEMM c matrix starting index
            const auto c_thread_mtx_on_block =
                blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id());

            const index_t m_thread_data_on_global =
                m_block_data_on_global + c_thread_mtx_on_block.row;
            const index_t n_thread_data_on_global =
                n_block_data_on_global + c_thread_mtx_on_block.col;

            ThreadwiseGenericTensorSliceCopy_v4r2<
                decltype(c_m0_m1_n0_n1_thread_desc),
                decltype(c_m0_m1_n0_n1_global_desc),
                decltype(c_m0_m1_n0_n1_thread_desc.GetLengths()),
                CThreadCopySrcDstAccessOrder,
                CThreadCopySrcDstVectorReadWriteDim,
                1,
                CThreadCopyDstDataPerWrite,
                AddressSpace::Vgpr,
                AddressSpace::Global,
                CGlobalMemoryDataOperation>({0, 0, 0, 0},
                                            {m_thread_data_on_global / M1,
                                             m_thread_data_on_global % M1,
                                             n_thread_data_on_global / N1,
                                             n_thread_data_on_global % N1})
                .Run(p_c_thread, p_c_global);
        }
    }

    __device__ void Run(const Float* __restrict__ p_a_global,
                        const Float* __restrict__ p_b_global,
                        Float* __restrict__ p_c_global) const
    {
        constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(Float);

        __shared__ Float p_shared_block[shared_block_size];

        Run(p_a_global, p_b_global, p_c_global, p_shared_block);
    }
};

} // namespace ck
#endif
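The deleted Run body is a classic LDS double-buffer pipeline: preload slab 0, then in each iteration prefetch slab i+1 into the idle buffer while the blockwise GEMM consumes slab i, with a tail for the last one or two slabs. A host-side sketch of that schedule, with a simple reduction standing in for the GEMM; names and sizes are illustrative only:

    // Sketch: two buffers swap roles every iteration; on a GPU the prefetch
    // overlaps with compute, here it just shows the ordering.
    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main()
    {
        constexpr int K = 8, KPerBlock = 2;
        std::vector<float> a_global(K);
        std::iota(a_global.begin(), a_global.end(), 0.0f);

        std::vector<float> buf[2] = {std::vector<float>(KPerBlock),
                                     std::vector<float>(KPerBlock)};
        float acc = 0.0f;

        // preload slab 0 into buffer 0
        std::copy_n(a_global.begin(), KPerBlock, buf[0].begin());

        for(int k = 0, i = 0; k < K; k += KPerBlock, i ^= 1)
        {
            // prefetch the next slab into the other buffer
            if(k + KPerBlock < K)
                std::copy_n(a_global.begin() + k + KPerBlock, KPerBlock, buf[i ^ 1].begin());

            for(float v : buf[i]) acc += v; // "GEMM" on the current buffer
        }
        std::printf("acc = %f\n", acc); // 0+1+...+7 = 28
    }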
composable_kernel/include/tensor_operation/threadwise_dynamic_tensor_slice_transfer.hpp
@@ -58,14 +58,28 @@ __host__ __device__ constexpr void threadwise_dynamic_tensor_slice_transfer_v1(
     bool forward_dim1 = true;

     // hardcoded for 2d loop for now
-#pragma unroll 1
+#pragma unroll
     for(int j0 = 0; j0 < J0; ++j0)
     {
-#pragma unroll 1
+#pragma unroll
         for(int j1 = 0; j1 < J1; ++j1)
         {
             // do work
-            p_dst[dst_coord.GetOffset()] = p_src[src_coord.GetOffset()];
+            transfer_data<SrcData,
+                          1,
+                          SrcAddressSpace,
+                          DstAddressSpace,
+                          DstInMemOp,
+                          SrcScalarStrideInVector,
+                          DstScalarStrideInVector>(
+                p_src,
+                src_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord),
+                src_desc.GetElementSpaceSize(),
+                p_dst,
+                dst_coord.GetOffset(),
+                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord),
+                dst_desc.GetElementSpaceSize());

             // move dim1 iterator
             if(j1 < J1 - 1)
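Replacing the raw p_dst[...] = p_src[...] assignment with transfer_data threads validity predicates and the destination memory operation through the copy, so padded or out-of-range coordinates neither read nor write. A sketch of that guarded-copy idea with a hypothetical transfer_one helper (not the library's actual signature):

    #include <cstddef>
    #include <cstdio>

    enum class MemOp { Set, AtomicAdd };

    // Hypothetical stand-in for transfer_data: copy one element only when both
    // coordinates are valid, with the destination operation made explicit.
    template <typename T>
    void transfer_one(const T* p_src, std::size_t src_off, bool src_valid,
                      T* p_dst, std::size_t dst_off, bool dst_valid, MemOp op)
    {
        if(!(src_valid && dst_valid))
            return; // invalid coordinates contribute nothing

        if(op == MemOp::Set)
            p_dst[dst_off] = p_src[src_off];
        else
            p_dst[dst_off] += p_src[src_off]; // models InMemoryDataOperation::AtomicAdd
    }

    int main()
    {
        float src[2] = {3.0f, 4.0f}, dst[2] = {0.0f, 0.0f};
        transfer_one(src, 0, true, dst, 0, true, MemOp::AtomicAdd);  // accumulates
        transfer_one(src, 1, false, dst, 1, true, MemOp::AtomicAdd); // skipped: invalid src
        std::printf("%f %f\n", dst[0], dst[1]); // 3.000000 0.000000
    }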
driver/include/device_dynamic_col2im_gemmkgemmn_nchw.hpp
@@ -66,11 +66,11 @@ void device_dynamic_col2im_gemmkgemmn_nchw(ColDesc,
 #if 1
     constexpr index_t BlockSize = 256;

-    constexpr index_t GemmKPerBlock = 128;
+    constexpr index_t GemmKPerBlock = 8;
     constexpr index_t GemmNPerBlock = 128;

-    using BlockCopySubLengths_GemmK_GemmN     = Sequence<8, 8>;
-    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<16, 16>;
+    using BlockCopySubLengths_GemmK_GemmN     = Sequence<1, 8>;
+    using BlockCopyClusterLengths_GemmK_GemmN = Sequence<8, 16>;

     using BlockCopyThreadClusterArrangeOrder = Sequence<0, 1>; // [GemmK, GemmN]
     using BlockCopySrcAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]
     using BlockCopyDstAccessOrder            = Sequence<0, 1>; // [GemmK, GemmN]
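The retuned parameters keep the tiling invariant intact: per-thread sub-lengths times thread-cluster lengths must still cover the GemmKPerBlock x GemmNPerBlock tile, and the cluster must fit in the block. A compile-time sketch of that check with the new values:

    // Standalone consistency check of the new tile parameters.
    constexpr int GemmKPerBlock = 8, GemmNPerBlock = 128, BlockSize = 256;
    constexpr int SubK = 1, SubN = 8;          // BlockCopySubLengths_GemmK_GemmN
    constexpr int ClusterK = 8, ClusterN = 16; // BlockCopyClusterLengths_GemmK_GemmN

    static_assert(SubK * ClusterK == GemmKPerBlock, "GemmK tile not covered");
    static_assert(SubN * ClusterN == GemmNPerBlock, "GemmN tile not covered");
    static_assert(ClusterK * ClusterN <= BlockSize, "more cluster threads than block threads");

    int main() {}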
driver/include/host_col2im.hpp
@@ -21,14 +21,14 @@ void host_col2im(const Tensor<T>& in_eb,
     int N = in_nchw.mDesc.GetLengths()[0];
     int C = in_nchw.mDesc.GetLengths()[1];
-    int HI = in_nchw.mDesc.GetLengths()[2];
-    int WI = in_nchw.mDesc.GetLengths()[3];
+    int Hi = in_nchw.mDesc.GetLengths()[2];
+    int Wi = in_nchw.mDesc.GetLengths()[3];

     int Y = FilterSizes{}[0];
     int X = FilterSizes{}[1];

-    int HO = OutputSizes{}[0];
-    int WO = OutputSizes{}[1];
+    int Ho = OutputSizes{}[0];
+    int Wo = OutputSizes{}[1];

     auto f = [&](auto n, auto c, auto hi, auto wi) {
         double v = 0;
@@ -37,22 +37,28 @@ void host_col2im(const Tensor<T>& in_eb,
         {
             int h_tmp = hi + LeftPads{}[0] - y * ConvDilations{}[0];

-            if(h_tmp >= 0 && h_tmp < HI && h_tmp % ConvStrides{}[0] == 0)
+            if(h_tmp % ConvStrides{}[0] == 0)
             {
                 int ho = h_tmp / ConvStrides{}[0];

-                for(int x = 0; x < X; ++x)
-                {
-                    int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
-
-                    if(w_tmp >= 0 && w_tmp < WI && w_tmp % ConvStrides{}[1] == 0)
-                    {
-                        int wo = w_tmp / ConvStrides{}[1];
-
-                        int e = c * (Y * X) + y * X + x;
-                        int b = n * (HO * WO) + ho * WO + wo;
+                if(ho >= 0 && ho < Ho)
+                {
+                    for(int x = 0; x < X; ++x)
+                    {
+                        int w_tmp = wi + LeftPads{}[1] - x * ConvDilations{}[1];
+
+                        if(w_tmp % ConvStrides{}[1] == 0)
+                        {
+                            int wo = w_tmp / ConvStrides{}[1];
+
+                            if(wo >= 0 && wo < Wo && w_tmp % ConvStrides{}[1] == 0)
+                            {
+                                int e = c * (Y * X) + y * X + x;
+                                int b = n * (Ho * Wo) + ho * Wo + wo;

                                 v += in_eb(e, b);
+                            }
                         }
                     }
                 }
             }
         }
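After this change the reference gathers strictly per image pixel: for each (n, c, hi, wi) it visits every filter tap, recovers the (ho, wo) position that would have read this pixel, and range-checks ho and wo rather than the input extents. A self-contained sketch of the same logic with fixed sizes, using plain vectors in place of the driver's Tensor<T>:

    #include <cstdio>
    #include <vector>

    int main()
    {
        // sizes chosen so Ho == Hi and Wo == Wi (3x3 filter, pad 1, stride 1)
        const int N = 1, C = 1, Hi = 4, Wi = 4, Y = 3, X = 3;
        const int Ho = 4, Wo = 4, stride_h = 1, stride_w = 1;
        const int dil_h = 1, dil_w = 1, pad_h = 1, pad_w = 1;

        std::vector<double> col(C * Y * X * N * Ho * Wo, 1.0); // in_eb(e, b), all ones
        std::vector<double> img(N * C * Hi * Wi, 0.0);         // in_nchw output

        for(int n = 0; n < N; ++n)
            for(int c = 0; c < C; ++c)
                for(int hi = 0; hi < Hi; ++hi)
                    for(int wi = 0; wi < Wi; ++wi)
                    {
                        double v = 0;
                        for(int y = 0; y < Y; ++y)
                        {
                            int h_tmp = hi + pad_h - y * dil_h;
                            if(h_tmp % stride_h != 0) continue;
                            int ho = h_tmp / stride_h;
                            if(ho < 0 || ho >= Ho) continue;
                            for(int x = 0; x < X; ++x)
                            {
                                int w_tmp = wi + pad_w - x * dil_w;
                                if(w_tmp % stride_w != 0) continue;
                                int wo = w_tmp / stride_w;
                                if(wo < 0 || wo >= Wo) continue;
                                int e = c * (Y * X) + y * X + x;
                                int b = n * (Ho * Wo) + ho * Wo + wo;
                                v += col[e * (N * Ho * Wo) + b];
                            }
                        }
                        img[((n * C + c) * Hi + hi) * Wi + wi] = v;
                    }

        // an interior pixel is covered by all 9 filter taps, a corner by only 4
        std::printf("corner %.0f, center %.0f\n", img[0], img[1 * Wi + 1]);
    }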
driver/src/col2im_driver.cpp
@@ -19,21 +19,6 @@ int main(int argc, char* argv[])
     using namespace ck;

 #if 1
-    // 1x1, 8x8
-    constexpr index_t N  = 2;
-    constexpr index_t C  = 128;
-    constexpr index_t HI = 8;
-    constexpr index_t WI = 8;
-    constexpr index_t K  = 128;
-    constexpr index_t Y  = 1;
-    constexpr index_t X  = 1;
-
-    using ConvStrides   = Sequence<1, 1>;
-    using ConvDilations = Sequence<1, 1>;
-
-    using LeftPads  = Sequence<0, 0>;
-    using RightPads = Sequence<0, 0>;
-#elif 0
     // 3x3, 71x71
     constexpr index_t N = 128;
     constexpr index_t C = 192;
@@ -541,7 +526,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-#if 1
+#if 0
         col_eb.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #else
         col_eb.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
@@ -587,7 +572,7 @@ int main(int argc, char* argv[])
         check_error(img_nchw_host, img_nchw_device);

-#if 1
+#if 0
         LogRange(std::cout << "col_eb : ", col_eb.mData, ",") << std::endl;
         LogRange(std::cout << "img_nchw_host : ", img_nchw_host.mData, ",") << std::endl;
         LogRange(std::cout << "img_nchw_device : ", img_nchw_device.mData, ",") << std::endl;
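For the removed 1x1, 8x8 configuration the implied GEMM shape follows from host_col2im's index math (e ranges over C*Y*X, b over N*Ho*Wo); reading those extents as GemmK and GemmN is an assumption based on the kernel's naming, not stated in the diff. A sketch using the standard convolution output-size formula:

    #include <cstdio>

    int main()
    {
        const int N = 2, C = 128, Hi = 8, Wi = 8, Y = 1, X = 1;
        const int stride = 1, dilation = 1, left_pad = 0, right_pad = 0;

        // standard convolution output sizes
        const int Ho = (Hi + left_pad + right_pad - dilation * (Y - 1) - 1) / stride + 1;
        const int Wo = (Wi + left_pad + right_pad - dilation * (X - 1) - 1) / stride + 1;

        const int E = C * Y * X;   // rows of the column matrix ("GemmK", assumed)
        const int B = N * Ho * Wo; // columns of the column matrix ("GemmN", assumed)

        std::printf("Ho=%d Wo=%d E=%d B=%d\n", Ho, Wo, E, B); // Ho=8 Wo=8 E=128 B=128
    }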
driver/src/conv_bwd_data_driver.cpp
@@ -250,17 +250,17 @@ int main(int argc, char* argv[])
 #elif 1
     device_convolution_backward_data_implicit_gemm_v5r1_nhwc_kyxc_nhwk
 #endif
     (in_nchw_desc,
      in_nchw_device,
      wei_kcyx_desc,
      wei_kcyx,
      out_nkhw_desc,
      out_nkhw,
      ConvStrides{},
      ConvDilations{},
      LeftPads{},
      RightPads{},
      nrepeat);

     if(do_verification)
     {