yangql / composable_kernel-1 · Commits

Commit cd8de112 · authored Jul 30, 2019 by Chao Liu

experimenting new merged tensor copy

parent 284e7bb3

Showing 7 changed files with 136 additions and 24 deletions
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp  +37 -2
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  +4 -3
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  +45 -2
composable_kernel/include/tensor_description/tensor_coordinate.hpp  +34 -13
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  +12 -0
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp  +2 -2
driver/src/driver.cpp  +2 -2
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hpp

@@ -6,6 +6,7 @@
 #include "ConstantMatrixDescriptor.hpp"
 #include "blockwise_2d_tensor_op.hpp"
 #include "blockwise_4d_tensor_op.hpp"
+#include "blockwise_generic_tensor_slice_copy.hpp"
 #include "threadwise_tensor_slice_copy.hpp"
 #include "threadwise_4d_tensor_op.hpp"
 #include "blockwise_batched_gemm.hpp"
@@ -122,8 +123,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
         constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed(
             Sequence<KPerThread, HoPerThread, WoPerThread, NPerThread>{});

         // blockwise copy
         // input: format is [C, Hi, Wi, N]
+#if 0
         const auto blockwise_in_copy =
             Blockwise4dTensorCopy3<BlockSize,
                                    Float,
@@ -132,6 +134,39 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
                                    decltype(in_c_h_w_n_block_desc.GetLengths()),
                                    InBlockCopyClusterLengths_CHWN,
                                    InBlockCopyDataPerRead_N>{};
+#elif 0
+        using InBlockCopySubLengths_CHWN =
+            decltype(in_c_h_w_n_block_desc.GetLengths() / InBlockCopyClusterLengths_CHWN{});
+
+        auto blockwise_in_copy =
+            BlockwiseGenericTensorSliceCopy_v1<BlockSize,
+                                               Float,
+                                               decltype(in_c_h_w_n_global_desc),
+                                               decltype(in_c_h_w_n_block_desc),
+                                               decltype(in_c_h_w_n_block_desc.GetLengths()),
+                                               InBlockCopySubLengths_CHWN,
+                                               InBlockCopyClusterLengths_CHWN,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               Sequence<0, 1, 2, 3>,
+                                               1,
+                                               1>({0, 0, 0, 0}, {0, 0, 0, 0});
+#elif 1
+        using InBlockCopySubLengths_CHWN =
+            decltype(in_c_h_w_n_block_desc.GetLengths() / InBlockCopyClusterLengths_CHWN{});
+
+        auto blockwise_in_copy =
+            BlockwiseGenericTensorSliceCopy_v2<BlockSize,
+                                               Float,
+                                               decltype(in_c_h_w_n_global_desc),
+                                               decltype(in_c_h_w_n_block_desc),
+                                               NormalTensorCoordinate<decltype(in_c_h_w_n_global_desc)>,
+                                               NormalTensorCoordinate<decltype(in_c_h_w_n_block_desc)>,
+                                               decltype(in_c_h_w_n_block_desc.GetLengths()),
+                                               InBlockCopySubLengths_CHWN,
+                                               InBlockCopyClusterLengths_CHWN,
+                                               Sequence<0, 1, 2, 3>>({0, 0, 0, 0}, {0, 0, 0, 0});
+#endif

         // blockwise wei copy
         // format is [CPerBlock, X * KPerBlock]
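Both generic blockwise copies derive each thread's tile by elementwise-dividing the block slice lengths by the thread-cluster lengths (the `GetLengths() / InBlockCopyClusterLengths_CHWN{}` expression above). A minimal standalone sketch of that decomposition, using plain std::array in place of the library's compile-time Sequence; the example lengths are invented for illustration:

#include <array>
#include <cstdio>

// Mirrors GetLengths() / ClusterLengths{}: elementwise division gives the
// per-thread sub-tile lengths, so SubLengths[d] * ClusterLengths[d] covers
// the whole block slice in dimension d.
std::array<int, 4> sub_lengths(std::array<int, 4> slice, std::array<int, 4> cluster)
{
    std::array<int, 4> sub{};
    for(int d = 0; d < 4; ++d)
        sub[d] = slice[d] / cluster[d];
    return sub;
}

int main()
{
    // e.g. an [8, 2, 2, 32] CHWN block slice copied by an [8, 1, 1, 16] thread cluster:
    // 8*1*1*16 = 128 threads each copy a [1, 2, 2, 2] sub-tile.
    auto sub = sub_lengths({8, 2, 2, 32}, {8, 1, 1, 16});
    std::printf("per-thread sub-lengths: %d %d %d %d\n", sub[0], sub[1], sub[2], sub[3]);
}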
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

@@ -224,7 +224,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
         // c_thread_mtx definition: this is a mess
         // TODO: more elegant way of defining c_thread_mtx
-        constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor(
+        constexpr auto c_k0k2_n1n2_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
             Number<GemmMRepeat * GemmMPerThreadSubC>{}, Number<N1 * N2>{});

         const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
@@ -258,9 +258,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             GemmDataPerReadB);

         constexpr index_t in_block_space =
-            in_e_n1_b_n2_block_desc.GetElementSpace(Number<max_align>{});
+            math::integer_least_multiple(in_e_n1_b_n2_block_desc.GetElementSpace(), max_align);

-        constexpr index_t wei_block_space = wei_e_k_block_desc.GetElementSpace(Number<max_align>{});
+        constexpr index_t wei_block_space =
+            math::integer_least_multiple(wei_e_k_block_desc.GetElementSpace(), max_align);

         __shared__ Float p_in_block[in_block_space];
         __shared__ Float p_wei_block[wei_block_space];
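The sizing change above pads each LDS buffer up to a multiple of max_align, so the buffer that follows it starts on an aligned boundary. A sketch of the round-up arithmetic; the helper below is a stand-in for illustration, not ck's own math::integer_least_multiple definition:

#include <cstdio>

// Stand-in for ck's math::integer_least_multiple: the smallest multiple of
// `align` that is >= `size` (assumes size >= 0 and align > 0).
constexpr int integer_least_multiple(int size, int align)
{
    return ((size + align - 1) / align) * align;
}

int main()
{
    // e.g. an element space of 1000 floats padded to a 4-float alignment
    std::printf("%d\n", integer_least_multiple(1000, 4)); // 1000 (already aligned)
    std::printf("%d\n", integer_least_multiple(1001, 4)); // 1004
}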
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -161,7 +161,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
             Float,
             decltype(wei_e_k_global_desc),
             decltype(wei_e_k_block_desc),
-            MergedTensorCoordinate<decltype(wei_e_k_global_desc)>,
+            NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
             NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
             decltype(wei_e_k_block_desc.GetLengths()),
             WeiBlockCopySubLengths_E_K,
@@ -301,7 +301,41 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
                 b_thread_data_on_global % B1});

         threadwise_out_copy.Run(p_out_thread, p_out_thread_on_global);
-#else
+#elif 0
+        // This is a hack, because slicing a merged dimension is not supported yet.
+        // This should be replaced with the logic above, once support for slicing a
+        // merged dimension becomes available.
+
+        // dst descriptor
+        constexpr auto out_k0_k1_b_global_desc =
+            make_ConstantMergedTensorDescriptor(out_n_k_h_w_global_desc.Fold(I1, Number<K1>{}),
+                                                Sequence<1>{},
+                                                Sequence<2>{},
+                                                Sequence<0, 3, 4>{});
+
+        // src descriptor
+        constexpr auto out_k0_k1_b_thread_desc = make_ConstantTensorDescriptor_packed(
+            Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNRepeat * GemmNPerThreadSubC>{});
+
+        auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
+            Float,
+            decltype(out_k0_k1_b_thread_desc),
+            decltype(out_k0_k1_b_global_desc),
+            NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
+            MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
+            Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>>(
+            {0, 0, 0},
+            {k_thread_data_on_global / K1,
+             k_thread_data_on_global % K1,
+             b_thread_data_on_global});
+
+        for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
+        {
+            threadwise_out_copy.Run(p_out_thread, p_out_global);
+
+            threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
+            threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
+        }
+#elif 1
         // This is a hack, because slicing a merged dimension is not supported yet.
         // This should be replaced with the logic above, once support for slicing a
         // merged dimension becomes available.
@@ -318,10 +352,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
         auto threadwise_out_copy = ThreadwiseGenericTensorSliceCopy_v2<
             Float,
+#if 1 // debug
             decltype(out_k0_k1_b_thread_desc),
             decltype(out_k0_k1_b_global_desc),
             NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
             MergedTensorCoordinate<decltype(out_k0_k1_b_global_desc)>,
+#else
+            decltype(out_k0_k1_b_thread_desc),
+            decltype(make_ConstantTensorDescriptor_packed(out_k0_k1_b_global_desc.GetLengths())),
+            NormalTensorCoordinate<decltype(out_k0_k1_b_thread_desc)>,
+            NormalTensorCoordinate<decltype(
+                make_ConstantTensorDescriptor_packed(out_k0_k1_b_global_desc.GetLengths()))>,
+#endif
             Sequence<GemmMRepeat, GemmMPerThreadSubC, GemmNPerThreadSubC>>(
             {0, 0, 0},
             {k_thread_data_on_global / K1,
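The hack above folds the output's K dimension into [K0 = K / K1, K1], so a flat k index splits into k / K1 and k % K1 — exactly the split used for the copy origin. A small standalone check of that round-trip arithmetic (K1 = 4 is an invented example value):

#include <cassert>

int main()
{
    // Fold K into [K0, K1]: flat index k maps to (k0, k1) = (k / K1, k % K1),
    // and back via k = k0 * K1 + k1.
    constexpr int K1 = 4;

    for(int k = 0; k < 32; ++k)
    {
        const int k0 = k / K1;
        const int k1 = k % K1;
        assert(k0 * K1 + k1 == k); // the split is lossless
    }
    return 0;
}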
composable_kernel/include/tensor_description/tensor_coordinate.hpp

@@ -93,6 +93,17 @@ struct NormalTensorCoordinate
         return coord;
     }

+    // reposition point of origin, and return compensated offset
+    __host__ __device__ constexpr index_t RepositionOrigin()
+    {
+        index_t offset_diff = mOffset;
+
+        mIndex  = make_zero_array<index_t, nDim>();
+        mOffset = 0;
+
+        return offset_diff;
+    }
+
     // private:
     Array<index_t, nDim> mIndex;
     index_t mOffset;
@@ -305,6 +316,29 @@ struct MergedTensorCoordinate
         return coord;
     }

+    // reposition point of origin, and return compensated offset
+    __host__ __device__ constexpr index_t RepositionOrigin()
+    {
+        index_t offset_diff = 0;
+
+        static_for<0, nDim, 1>{}([&](auto idim_) {
+            constexpr auto idim = decltype(idim_){};
+
+            static_if<!tensor_desc_type::ContainMultipleOriginalDimensions(idim)>{}([&](auto) {
+                constexpr auto idim_original =
+                    tensor_desc_type::GetContainedOriginalDimensions(idim).Front();
+
+                mIndex(idim)                  = 0;
+                mOriginalIndex(idim_original) = 0;
+
+                mOffset -= mPartialOffsets[idim];
+                offset_diff += mPartialOffsets[idim];
+                mPartialOffsets(idim) = 0;
+            });
+        });
+
+        return offset_diff;
+    }
+
     // private:
     Array<index_t, nDim> mIndex;
     Array<index_t, nOriginalDim> mOriginalIndex;
@@ -312,18 +346,5 @@ struct MergedTensorCoordinate
     index_t mOffset;
 };

-#if 0
-// implementation of MergedTensorCoordinate, when index_t is signed integer
-// mPartialOffsets is not needed, if index_t is signed integer type
-template <>
-struct TensorCoordinate<signed_t>
-{
-    private:
-    Array<index_t, nDim> mIndex;
-    Array<index_t, nOriginalDim> mOriginalIndex;
-    index_t mOffset;
-};
-#endif
-
 } // namespace ck

 #endif
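RepositionOrigin folds a coordinate's accumulated offset into the caller's raw pointer: the returned offset_diff is exactly the offset that was cleared, so `p + coord.GetOffset()` before the call addresses the same element as `(p + offset_diff) + coord.GetOffset()` after it. A toy model of that invariant, assuming a plain struct in place of the library's coordinate types:

#include <cassert>

// Toy stand-in for NormalTensorCoordinate: just an offset.
struct ToyCoord
{
    int offset = 0;

    // Zero the coordinate and hand the cleared offset back to the caller.
    int RepositionOrigin()
    {
        const int offset_diff = offset;
        offset                = 0;
        return offset_diff;
    }
};

int main()
{
    float buf[64] = {};
    float* p      = buf;

    ToyCoord coord{17};
    float* addr_before = p + coord.offset;

    p += coord.RepositionOrigin(); // fold the offset into the pointer...
    float* addr_after = p + coord.offset;

    assert(addr_before == addr_after); // ...addresses are unchanged
    return 0;
}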
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp

@@ -140,11 +140,23 @@ struct ThreadwiseGenericTensorSliceCopy_v2
     __device__ void Run(const TData* p_src, TData* p_dst) const
     {
+#if 0
         static_ford<SliceLengths>{}([&](auto data_id) {
             p_dst[(mDstSliceOrigin + data_id).GetOffset()] =
                 p_src[(mSrcSliceOrigin + data_id).GetOffset()];
         });
+#elif 1
+        auto src_slice_origin = mSrcSliceOrigin;
+        auto dst_slice_origin = mDstSliceOrigin;
+
+        p_src += src_slice_origin.RepositionOrigin();
+        p_dst += dst_slice_origin.RepositionOrigin();
+
+        static_ford<SliceLengths>{}([&](auto data_id) {
+            p_dst[(dst_slice_origin + data_id).GetOffset()] =
+                p_src[(src_slice_origin + data_id).GetOffset()];
+        });
+#endif
     }

     __device__ void MoveSrcSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
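With the origins repositioned, every per-element offset inside the unrolled static_ford loop is computed from a zeroed base coordinate, so the slice origin's offset is paid once per Run rather than once per element. A rough flat-pointer model of the same shape (names and the 1-D slice are invented for illustration):

#include <cstddef>

// Hypothetical flat copy over a 1-D slice, mimicking the structure of Run() above.
void run_copy(const float* p_src, float* p_dst, int src_origin, int dst_origin, int len)
{
    // Fold the slice origins into the raw pointers once, up front
    // (this is the role RepositionOrigin() plays for real coordinates)...
    p_src += src_origin;
    p_dst += dst_origin;

    // ...so the inner loop indexes from a zero base.
    for(int i = 0; i < len; ++i)
        p_dst[i] = p_src[i];
}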
driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp

@@ -478,9 +478,9 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc,
         GridwiseConvolutionImplicitGemm_v1r1_chwn_cyxk_khwn
 #elif 0
         GridwiseConvolutionImplicitGemm_v1r2_chwn_cyxk_khwn
-#elif 0
+#elif 1
         GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
-#elif 1
+#elif 0
         GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer
 #endif
             <GridSize,
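These 0/1 constants act as a crude compile-time switch: exactly one branch of the #if/#elif chain survives preprocessing, selecting which gridwise kernel the driver instantiates. A trimmed illustration of the pattern (the string values are placeholders):

#include <cstdio>

// Same toggle pattern as the driver: flip the 0/1 constants to choose which
// implementation gets compiled in; all other branches are discarded.
#if 0
static const char* kernel = "v1r2";
#elif 1
static const char* kernel = "v1r3"; // the branch this commit enables
#elif 0
static const char* kernel = "v1r3_lds_double_buffer";
#endif

int main() { std::printf("selected kernel: %s\n", kernel); }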
driver/src/driver.cpp

@@ -85,7 +85,7 @@ int main(int argc, char* argv[])
     constexpr index_t HPad = 0;
     constexpr index_t WPad = 0;
-#elif 1
+#elif 0
     // 3x3, 34x34
     constexpr index_t N = 128;
     constexpr index_t C = 256;
@@ -520,7 +520,7 @@ int main(int argc, char* argv[])
 #if 0
     device_convolution_direct_v2_nchw_kcyx_nkhw
     (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 1
+#elif 0
     device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(
         in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
 #elif 0