yangql / composable_kernel-1 · Commits · c01af899

Commit c01af899, authored Aug 03, 2019 by Chao Liu
Parent commit: e9733a9f

    added new tensor copy operator

Showing 9 changed files with 214 additions and 195 deletions.
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  (+1, -27)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp  (+66, -19)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+1, -27)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  (+93, -63)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  (+17, -39)
composable_kernel/include/utility/Sequence.hpp  (+33, -18)
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  (+1, -1)
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+1, -0)
driver/src/driver.cpp  (+1, -1)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  (+1, -27)

@@ -295,27 +295,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
         // do work
         for(index_t e = 0; e < E; e += EPerBlock)
         {
-#if 0 // debug
             blockwise_in_copy.Run(p_in_global, p_in_block);
             blockwise_wei_copy.Run(p_wei_global, p_wei_block);
-#else
-            using InSrcMergedDimSubLengthsHack = Sequence<InBlockCopySubLengths_E_N1_B_N2{}[0],
-                                                          1,
-                                                          InBlockCopySubLengths_E_N1_B_N2{}[2],
-                                                          1>;
-            using InDstMergedDimSubLengthsHack = Sequence<1, 1, 1, 1>;
-
-            blockwise_in_copy.Run_hack(
-                p_in_global, p_in_block, InSrcMergedDimSubLengthsHack{}, InDstMergedDimSubLengthsHack{});
-
-            using WeiSrcMergedDimSubLengthsHack = Sequence<1, 1>;
-            using WeiDstMergedDimSubLengthsHack = Sequence<1, 1>;
-
-            blockwise_wei_copy.Run_hack(
-                p_wei_global, p_wei_block, WeiSrcMergedDimSubLengthsHack{}, WeiDstMergedDimSubLengthsHack{});
-#endif

             __syncthreads();

@@ -391,10 +372,6 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             arithmetic_sequence_gen<0, 8, 1>::type{},
             Number<1>{});
 #else
-        using OutSrcMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>;
-        using OutDstMergedDimSliceLengthsHack = Sequence<1, 1, 1, 1, 1, 1, 1, 1>;
-
         ThreadwiseGenericTensorSliceCopy_v2<Float,
                                             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),

@@ -403,10 +380,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
             MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
             decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
             {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
-            .Run_hack(p_out_thread,
-                      p_out_thread_on_global,
-                      OutSrcMergedDimSliceLengthsHack{},
-                      OutDstMergedDimSliceLengthsHack{});
+            .Run(p_out_thread, p_out_thread_on_global);
 #endif
         }
     }
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp  (+66, -19)

@@ -155,6 +155,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
         static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
                       "GemmDataPerReadB alignment requirement is not satisfied");
+#if 1 // debug
         // input blockwise copy
         // slice a merged tensor, reorder and copy to a normal tensor
         // this copy operator already has blockwise offset built-in

@@ -172,6 +173,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                                               InBlockCopySrcDataPerRead_B,
                                               InBlockCopyDstDataPerWrite_N2>(
                 {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#else
+        auto blockwise_in_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(in_e_n1_b_n2_global_merged_desc),
+            decltype(in_e_n1_b_n2_block_desc),
+            MergedTensorCoordinate<decltype(in_e_n1_b_n2_global_merged_desc)>,
+            NormalTensorCoordinate<decltype(in_e_n1_b_n2_block_desc)>,
+            decltype(in_e_n1_b_n2_block_desc.GetLengths()),
+            InBlockCopySubLengths_E_N1_B_N2,
+            InBlockCopyClusterLengths_E_N1_B_N2,
+            InBlockCopyThreadClusterArrangeOrder>({0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#endif
         // weight tensor
         // tensor descriptor in device memory, src of blockwise copy

@@ -184,6 +198,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Sequence<EPerBlock, KPerBlock>{},
             Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
+#if 1 // debug
         // operator for blockwise copy of weight into LDS
         // slice a tensor, and copy it into another tensor
         // this copy operator already have blockwise offset built-in

@@ -201,6 +216,19 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                                               WeiBlockCopySrcDataPerRead_E,
                                               WeiBlockCopyDstDataPerWrite_K>(
                 {0, k_block_data_on_global}, {0, 0});
+#else
+        auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v2<
+            BlockSize,
+            Float,
+            decltype(wei_e_k_global_desc),
+            decltype(wei_e_k_block_desc),
+            NormalTensorCoordinate<decltype(wei_e_k_global_desc)>,
+            NormalTensorCoordinate<decltype(wei_e_k_block_desc)>,
+            decltype(wei_e_k_block_desc.GetLengths()),
+            WeiBlockCopySubLengths_E_K,
+            WeiBlockCopyClusterLengths_E_K,
+            WeiBlockCopyThreadClusterArrangeOrder>({0, k_block_data_on_global}, {0, 0});
+#endif
         // GEMM definition
         // c_mtx += transpose(a_mtx) * b_mtx

@@ -291,53 +319,60 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             Float* p_wei_block_next =
                 even_loop ? p_wei_block_double + wei_block_space : p_wei_block_double;

-            Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
-            Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
+            Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
+            Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

+#if 1
             blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
             p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+#else
+            blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
+            blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+#endif

             __syncthreads();

             // LDS doubel buffer: load next data from device mem
-            blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
-            blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global,
-                                                        p_wei_register_clipboard);
+            blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
+            blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global,
+                                                     p_wei_register_buffer);

             // LDS double buffer: GEMM on current data
             blockwise_gemm.Run(p_wei_block_now, p_in_block_now, p_out_thread);

             // LDS double buffer: store next data to LDS
-            blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
-            blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard, p_wei_block_next);
+            blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_next);
+            blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer, p_wei_block_next);
         }
     }

     // LDS double buffer: tail
     {
-        Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
-        Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
+        Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
+        Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];

         // even iteration
+#if 1
         blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
         p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
+#else
+        blockwise_in_copy.MoveSrcSlicingWindow({EPerBlock, 0, 0, 0}, true);
+        blockwise_wei_copy.MoveSrcSlicingWindow({EPerBlock, 0}, true);
+#endif

         __syncthreads();

         // LDS doubel buffer: load next data from device mem
-        blockwise_in_copy.RunLoadRegisterClipboard(p_in_global, p_in_register_clipboard);
-        blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_block_on_global, p_wei_register_clipboard);
+        blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);
+        blockwise_wei_copy.RunLoadRegisterBuffer(p_wei_block_on_global, p_wei_register_buffer);

         // LDS double buffer: GEMM on current data
         blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);

         // LDS double buffer: store next data to LDS
-        blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
-                                                    p_in_block_double + in_block_space);
-        blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
-                                                     p_wei_block_double + wei_block_space);
+        blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer,
+                                                 p_in_block_double + in_block_space);
+        blockwise_wei_copy.RunStoreRegisterBuffer(p_wei_register_buffer,
+                                                  p_wei_block_double + wei_block_space);

         // odd iteration

@@ -396,6 +431,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
                 k_thread_data_on_global, 0, b_thread_data_on_global, 0);
+#if 1 // debug
         threadwise_generic_tensor_slice_copy_v1(
             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc,
             p_out_thread,

@@ -406,6 +442,17 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
             out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths(),
             arithmetic_sequence_gen<0, 8, 1>::type{},
             Number<1>{});
+#else
+        ThreadwiseGenericTensorSliceCopy_v2<
+            Float,
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
+            NormalTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc)>,
+            MergedTensorCoordinate<decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc)>,
+            decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths())>(
+            {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0})
+            .Run(p_out_thread, p_out_thread_on_global);
+#endif
     }
 }
 };
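The hunks above rewire the LDS double-buffer main loop from the old "register clipboard" API onto the new RunLoadRegisterBuffer / RunStoreRegisterBuffer pair. The following stand-alone sketch (ordinary host C++ with stubbed-out copy and GEMM objects; the helper bodies, buffer sizes, and loop bounds are invented for illustration and are not the kernel's real code) shows the schedule those calls implement: advance the source window, prefetch the next E-slice into registers, run the GEMM on the tile already resident in one half of LDS, then store the prefetched data into the other half.

    #include <cstdio>

    // Stand-ins for the real blockwise copy and GEMM objects; bodies are placeholders.
    struct BlockwiseCopyStub
    {
        void MoveSrcSlicingWindow() {}                        // advance the window by EPerBlock along E
        void RunLoadRegisterBuffer(const float*, float*) {}   // global memory -> registers
        void RunStoreRegisterBuffer(const float*, float*) {}  // registers -> LDS
    };
    struct BlockwiseGemmStub
    {
        void Run(const float*, const float*, float*) {}
    };

    int main()
    {
        BlockwiseCopyStub blockwise_in_copy, blockwise_wei_copy;
        BlockwiseGemmStub blockwise_gemm;

        float lds_in[2][64] = {}, lds_wei[2][64] = {};        // two LDS halves: "now" and "next"
        float reg_in[8] = {}, reg_wei[8] = {};                // per-thread register buffers
        float in_global[64] = {}, wei_global[64] = {}, out_thread[8] = {};

        bool even_loop = true;
        for(int e = 0; e < 8; e += 2, even_loop = !even_loop)
        {
            float* in_now   = lds_in[even_loop ? 0 : 1];
            float* in_next  = lds_in[even_loop ? 1 : 0];
            float* wei_now  = lds_wei[even_loop ? 0 : 1];
            float* wei_next = lds_wei[even_loop ? 1 : 0];

            blockwise_in_copy.MoveSrcSlicingWindow();
            blockwise_wei_copy.MoveSrcSlicingWindow();

            // load next data from device memory while the current tile is still in use
            blockwise_in_copy.RunLoadRegisterBuffer(in_global, reg_in);
            blockwise_wei_copy.RunLoadRegisterBuffer(wei_global, reg_wei);

            // GEMM on current data
            blockwise_gemm.Run(wei_now, in_now, out_thread);

            // store next data to the other half of LDS
            blockwise_in_copy.RunStoreRegisterBuffer(reg_in, in_next);
            blockwise_wei_copy.RunStoreRegisterBuffer(reg_wei, wei_next);
        }
        std::puts("double-buffer schedule sketched");
        return 0;
    }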
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+1, -27)

@@ -11,7 +11,7 @@
 namespace ck {

-// B = merge(N, H, W)
+// B = merge(N, Ho, Wo)
 template <index_t GridSize,
           index_t BlockSize,
           class Float,

@@ -237,24 +237,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
         for(index_t e_block_data_begin = 0; e_block_data_begin < E;
             e_block_data_begin += EPerBlock)
         {
-#if 0
             blockwise_in_copy.Run(p_in_global, p_in_block);
             blockwise_wei_copy.Run(p_wei_global, p_wei_block);
-#else
-            using InSrcMergedDimSubLengthsHack = InBlockCopySubLengths_E_B;
-            using InDstMergedDimSubLengthsHack = Sequence<1, 1>;
-
-            blockwise_in_copy.Run_hack(
-                p_in_global, p_in_block, InSrcMergedDimSubLengthsHack{}, InDstMergedDimSubLengthsHack{});
-
-            using WeiSrcMergedDimSubLengthsHack = Sequence<1, 1>;
-            using WeiDstMergedDimSubLengthsHack = Sequence<1, 1>;
-
-            blockwise_wei_copy.Run_hack(
-                p_wei_global, p_wei_block, WeiSrcMergedDimSubLengthsHack{}, WeiDstMergedDimSubLengthsHack{});
-#endif

             __syncthreads();

@@ -318,17 +302,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
             for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
             {
-#if 0
                 threadwise_out_copy.Run(p_out_thread, p_out_global);
-#else
-                using OutSrcMergedDimSubLengthsHack = Sequence<1, 1, 1>;
-                using OutDstMergedDimSubLengthsHack =
-                    Sequence<1, 1, OutThreadCopySliceLengths{}[2]>;
-
-                threadwise_out_copy.Run_hack(
-                    p_out_thread, p_out_global, OutSrcMergedDimSubLengthsHack{}, OutDstMergedDimSubLengthsHack{});
-#endif

                 threadwise_out_copy.MoveSrcSlicingWindow({0, 0, GemmNPerThreadSubC}, true);
                 threadwise_out_copy.MoveDstSlicingWindow({0, 0, B1}, true);
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp  (+93, -63)

@@ -24,7 +24,7 @@ template <index_t BlockSize,
           class DstDesc,
           class SliceLengths,
           class SubLengths,
-          class DataClusterLengths,
+          class ThreadClusterLengths,
           class ThreadClusterArrangeOrder,
           class SrcAccessOrder,
           class DstAccessOrder,

@@ -65,7 +65,8 @@ struct BlockwiseGenericTensorSliceCopy_v1
        // check NDim consistency
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::GetSize() &&
-                          nDim == SubLengths::GetSize() && nDim == DataClusterLengths::GetSize() &&
+                          nDim == SubLengths::GetSize() &&
+                          nDim == ThreadClusterLengths::GetSize() &&
                          nDim == ThreadClusterArrangeOrder::GetSize() &&
                          nDim == SrcAccessOrder::GetSize() && nDim == DstAccessOrder::GetSize(),
                      "wrong");

@@ -78,13 +79,13 @@ struct BlockwiseGenericTensorSliceCopy_v1
        // thread cluster
        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-            DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));

        // BlockSize
        static_assert(BlockSize == thread_cluster_desc.GetElementSize(), "wrong! BlockSize");

        // divide work
-        constexpr auto data_per_cluster_per_dims = SubLengths{} * DataClusterLengths{};
+        constexpr auto data_per_cluster_per_dims = SubLengths{} * ThreadClusterLengths{};

        static_for<0, nDim, 1>{}([&](auto IDim) {
            static_assert(SliceLengths::Get(IDim) % SubLengths::Get(IDim) == 0,

@@ -160,9 +161,9 @@ struct BlockwiseGenericTensorSliceCopy_v1
            mThreadDstPartialOffsets, math::plus<index_t>{}, static_cast<index_t>(0));
    }

-    __device__ static constexpr index_t GetRegisterClipboardSize()
+    __device__ static constexpr index_t GetRegisterBufferSize()
    {
-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_tensor_desc =
            make_ConstantTensorDescriptor_packed(SubLengths{} * repeat_lengths);

@@ -170,14 +171,15 @@ struct BlockwiseGenericTensorSliceCopy_v1
        return thread_tensor_desc.GetElementSpace();
    }

-    __device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
-                                             Float* __restrict__ p_clipboard) const
+    __device__ void RunLoadRegisterBuffer(const Float* __restrict__ p_src,
+                                          Float* __restrict__ p_Buffer) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

-        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
+        constexpr auto data_per_cluster_per_dims =
+            thread_sub_tensor_lengths * ThreadClusterLengths{};

-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_tensor_desc =
            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);

@@ -187,25 +189,24 @@ struct BlockwiseGenericTensorSliceCopy_v1
            constexpr auto src_thread_data_multi_id_begin =
                repeat_multi_id * data_per_cluster_per_dims;

-            constexpr auto clipboard_data_multi_id_begin =
+            constexpr auto Buffer_data_multi_id_begin =
                repeat_multi_id * thread_sub_tensor_lengths;

            constexpr index_t src_offset =
                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);

-            constexpr index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+            constexpr index_t Buffer_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
 #else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
            const auto src_thread_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+            const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

            const index_t src_offset =
                SrcDesc::GetOffsetFromMultiIndex(src_thread_data_multi_id_begin);

-            const index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+            const index_t Buffer_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);
 #endif

            // By position the origin of the per-thread window at the point, where multi-index

@@ -219,7 +220,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                p_src + src_offset + mThreadSrcOffset,
                make_zero_array<index_t, nDim>(),
                thread_tensor_desc,
-                p_clipboard + clipboard_offset,
+                p_Buffer + Buffer_offset,
                make_zero_array<index_t, nDim>(),
                thread_sub_tensor_lengths,
                SrcAccessOrder{},

@@ -227,38 +228,38 @@ struct BlockwiseGenericTensorSliceCopy_v1
        });
    }

-    __device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
-                                              Float* __restrict__ p_dst) const
+    __device__ void RunStoreRegisterBuffer(const Float* __restrict__ p_Buffer,
+                                           Float* __restrict__ p_dst) const
    {
        constexpr auto thread_sub_tensor_lengths = SubLengths{};

-        constexpr auto data_per_cluster_per_dims = thread_sub_tensor_lengths * DataClusterLengths{};
+        constexpr auto data_per_cluster_per_dims =
+            thread_sub_tensor_lengths * ThreadClusterLengths{};

-        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * DataClusterLengths{});
+        constexpr auto repeat_lengths = SliceLengths{} / (SubLengths{} * ThreadClusterLengths{});

        constexpr auto thread_tensor_desc =
            make_ConstantTensorDescriptor_packed(thread_sub_tensor_lengths * repeat_lengths);

 #if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1
        static_ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-            constexpr auto clipboard_data_multi_id_begin =
+            constexpr auto Buffer_data_multi_id_begin =
                repeat_multi_id * thread_sub_tensor_lengths;

            constexpr auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-            constexpr index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+            constexpr index_t Buffer_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);

            constexpr index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 #else
        ford<decltype(repeat_lengths)>{}([&](auto repeat_multi_id) {
-            const auto clipboard_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;
+            const auto Buffer_data_multi_id_begin = repeat_multi_id * thread_sub_tensor_lengths;

            const auto dst_data_multi_id_begin = repeat_multi_id * data_per_cluster_per_dims;

-            const index_t clipboard_offset =
-                thread_tensor_desc.GetOffsetFromMultiIndex(clipboard_data_multi_id_begin);
+            const index_t Buffer_offset =
+                thread_tensor_desc.GetOffsetFromMultiIndex(Buffer_data_multi_id_begin);

            const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_multi_id_begin);
 #endif

@@ -271,7 +272,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
            // If in the future, you want to enable SubLengths > 1 at the merged dimension,
            // special care in implementation is needed
            threadwise_generic_tensor_slice_copy_v1(thread_tensor_desc,
-                                                    p_clipboard + clipboard_offset,
+                                                    p_Buffer + Buffer_offset,
                                                    make_zero_array<index_t, nDim>(),
                                                    DstDesc{},
                                                    p_dst + dst_offset + mThreadDstOffset,

@@ -284,10 +285,10 @@ struct BlockwiseGenericTensorSliceCopy_v1
    __device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
    {
-        Float p_clipboard[GetRegisterClipboardSize()];
+        Float p_Buffer[GetRegisterBufferSize()];

-        RunLoadRegisterClipboard(p_src, p_clipboard);
-        RunStoreRegisterClipboard(p_clipboard, p_dst);
+        RunLoadRegisterBuffer(p_src, p_Buffer);
+        RunStoreRegisterBuffer(p_Buffer, p_dst);
    }

    // When moving the slicing windows along a merged dimension, if the strides of the

@@ -382,24 +383,30 @@ template <index_t BlockSize,
          class DstCoordinate,
          class SliceLengths,
          class SubLengths,
-          class DataClusterLengths,
+          class ThreadClusterLengths,
          class ThreadClusterArrangeOrder>
 struct BlockwiseGenericTensorSliceCopy_v2
 {
-    using ThreadwiseCopy = ThreadwiseGenericTensorSliceCopy_v2<TData,
-                                                               SrcDesc,
-                                                               DstDesc,
-                                                               SrcCoordinate,
-                                                               DstCoordinate,
-                                                               SubLengths>;
-
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();

    __device__ constexpr BlockwiseGenericTensorSliceCopy_v2(SrcCoordinate src_block_slice_origin,
                                                            DstCoordinate dst_block_slice_origin)
    {
+        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
+                          nDim == DstDesc::GetNumOfDimension() &&
+                          nDim == SliceLengths::GetSize() && nDim == SubLengths::GetSize() &&
+                          nDim == ThreadClusterLengths::GetSize() &&
+                          nDim == ThreadClusterArrangeOrder::GetSize(),
+                      "wrong! nDim not consistent");
+
+        static_assert(is_same<SliceLengths, decltype(SubLengths{} * ThreadClusterLengths{})>{},
+                      "wrong! threads should be mapped to cover entire slicing window");
+
        constexpr auto thread_cluster_desc = make_ConstantTensorDescriptor_packed(
-            DataClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+            ThreadClusterLengths::ReorderGivenNew2Old(ThreadClusterArrangeOrder{}));
+
+        static_assert(BlockSize == thread_cluster_desc.GetElementSize(),
+                      "wrong! BlockSize not consistent with ThreadClusterLengths");

        const auto thread_cluster_multi_id =
            thread_cluster_desc.GetMultiIndexFrom1dIndex(get_thread_local_1d_id());

@@ -409,43 +416,66 @@ struct BlockwiseGenericTensorSliceCopy_v2
        const auto thread_data_multi_id_begin = data_cluster_multi_id * SubLengths{};

-        mThreadwiseCopy.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
-        mThreadwiseCopy.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
+        mThreadwiseLoad.SetSrcSliceOrigin(src_block_slice_origin + thread_data_multi_id_begin);
+        mThreadwiseLoad.SetDstSliceOrigin(make_zero_array<index_t, nDim>());
+
+        mThreadwiseStore.SetSrcSliceOrigin(make_zero_array<index_t, nDim>());
+        mThreadwiseStore.SetDstSliceOrigin(dst_block_slice_origin + thread_data_multi_id_begin);
    }

-    __device__ void Run(const TData* p_src, TData* p_dst) const
+    __device__ static constexpr index_t GetRegisterBufferSize()
    {
-        mThreadwiseCopy.Run(p_src, p_dst);
+        return RegisterBufferDesc::GetElementSpace();
    }

-    template <class SrcMergedDimSubLengthsHack, class DstMergedDimSubLengthsHack>
-    __device__ void Run_hack(const TData* p_src,
-                             TData* p_dst,
-                             SrcMergedDimSubLengthsHack,
-                             DstMergedDimSubLengthsHack) const
+    __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
    {
-        // hacks to isolate merged dimension from normal dimensions, and calculate their offset
-        // seperately
-        // SrcMergedDimSliceLengthsHack has entry same as SliceLengths on src merged dimensions,
-        // but 1 on normal dimensions;
-        // SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions,
-        // but 1 on merged dimensions;
-        mThreadwiseCopy.Run_hack(
-            p_src, p_dst, SrcMergedDimSubLengthsHack{}, DstMergedDimSubLengthsHack{});
+        mThreadwiseLoad.Run(p_src, p_buffer);
+    }
+
+    __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
+    {
+        mThreadwiseStore.Run(p_buffer, p_dst);
+    }
+
+    __device__ void Run(const TData* p_src, TData* p_dst) const
+    {
+        TData p_buffer[GetRegisterBufferSize()];
+
+        mThreadwiseLoad.Run(p_src, p_buffer);
+        mThreadwiseStore.Run(p_buffer, p_dst);
    }

    __device__ void MoveSrcSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
    {
-        mThreadwiseCopy.MoveSrcSlicingWindow(step_sizes, positive_direction);
+        mThreadwiseLoad.MoveSrcSlicingWindow(step_sizes, positive_direction);
    }

    __device__ void MoveDstSlicingWindow(Array<index_t, nDim> step_sizes, bool positive_direction)
    {
-        mThreadwiseCopy.MoveDstSlicingWindow(step_sizes, positive_direction);
+        mThreadwiseStore.MoveDstSlicingWindow(step_sizes, positive_direction);
    }

-    // private:
-    ThreadwiseCopy mThreadwiseCopy;
+    private:
+    using RegisterBufferDesc = decltype(make_ConstantTensorDescriptor_packed(SubLengths{}));
+
+    using ThreadwiseLoad = ThreadwiseGenericTensorSliceCopy_v2<TData,
+                                                               SrcDesc,
+                                                               RegisterBufferDesc,
+                                                               SrcCoordinate,
+                                                               NormalTensorCoordinate<RegisterBufferDesc>,
+                                                               SubLengths>;
+
+    using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v2<TData,
+                                                                RegisterBufferDesc,
+                                                                DstDesc,
+                                                                NormalTensorCoordinate<RegisterBufferDesc>,
+                                                                DstCoordinate,
+                                                                SubLengths>;
+
+    ThreadwiseLoad mThreadwiseLoad;
+    ThreadwiseStore mThreadwiseStore;
 };

 } // namespace ck
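The new BlockwiseGenericTensorSliceCopy_v2 above replaces the single mThreadwiseCopy member with a ThreadwiseLoad (source tensor to a per-thread register buffer) and a ThreadwiseStore (register buffer to destination), so Run() becomes load-then-store and the two stages can also be driven separately. A minimal stand-alone analogue of that structure (ordinary host C++, with a fixed element count in place of RegisterBufferDesc::GetElementSpace(); names and sizes here are illustrative only) might look like this:

    #include <cstddef>
    #include <cstdio>

    // Toy "threadwise" copies standing in for ThreadwiseGenericTensorSliceCopy_v2.
    struct ThreadwiseLoadStub
    {
        void Run(const float* p_src, float* p_buffer) const
        {
            for(std::size_t i = 0; i < 4; ++i) p_buffer[i] = p_src[i]; // src -> register buffer
        }
    };
    struct ThreadwiseStoreStub
    {
        void Run(const float* p_buffer, float* p_dst) const
        {
            for(std::size_t i = 0; i < 4; ++i) p_dst[i] = p_buffer[i]; // register buffer -> dst
        }
    };

    struct BlockwiseCopySketch
    {
        static constexpr std::size_t GetRegisterBufferSize() { return 4; }

        // The two stages can be called separately (as the double-buffer kernel does) ...
        void RunLoadRegisterBuffer(const float* p_src, float* p_buffer) const { mLoad.Run(p_src, p_buffer); }
        void RunStoreRegisterBuffer(const float* p_buffer, float* p_dst) const { mStore.Run(p_buffer, p_dst); }

        // ... or fused, which is what Run() does in the new operator.
        void Run(const float* p_src, float* p_dst) const
        {
            float p_buffer[GetRegisterBufferSize()];
            RunLoadRegisterBuffer(p_src, p_buffer);
            RunStoreRegisterBuffer(p_buffer, p_dst);
        }

        ThreadwiseLoadStub mLoad;
        ThreadwiseStoreStub mStore;
    };

    int main()
    {
        float src[4] = {1, 2, 3, 4}, dst[4] = {};
        BlockwiseCopySketch{}.Run(src, dst);
        std::printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]); // prints: 1 2 3 4
        return 0;
    }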
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp  (+17, -39)

@@ -138,47 +138,17 @@ struct ThreadwiseGenericTensorSliceCopy_v2
        mDstSliceOrigin = dst_slice_origin;
    }

-    __device__ void Run(const TData* p_src, TData* p_dst) const
+    template <class TDesc, class Seq>
+    struct IsolateMergedDimSliceLengthsHack
    {
-        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});
-
-        TData p_buffer_[buffer_desc.GetElementSpace()];
-        TData* p_buffer = p_buffer_;
-
-#if 0
-        static_ford<SliceLengths>{}([&](auto data_id) {
-            p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)] =
-                p_src[(mSrcSliceOrigin + data_id).GetOffset()];
-        });
-
-        static_ford<SliceLengths>{}([&](auto data_id) {
-            p_dst[(mDstSliceOrigin + data_id).GetOffset()] =
-                p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)];
-        });
-#elif 1
-        auto src_slice_origin = mSrcSliceOrigin;
-        auto dst_slice_origin = mDstSliceOrigin;
-
-        const TData* p_src_tmp = p_src + src_slice_origin.RepositionOrigin();
-        TData* p_dst_tmp       = p_dst + dst_slice_origin.RepositionOrigin();
-
-        static_ford<SliceLengths>{}([&](auto data_id) {
-            p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)] =
-                p_src_tmp[(src_slice_origin + data_id).GetOffset()];
-        });
-
-        static_ford<SliceLengths>{}([&](auto data_id) {
-            p_dst_tmp[(dst_slice_origin + data_id).GetOffset()] =
-                p_buffer[buffer_desc.GetOffsetFromMultiIndex(data_id)];
-        });
-#endif
-    }
-
-    template <class SrcMergedDimSliceLengthsHack, class DstMergedDimSliceLengthsHack>
-    __device__ void Run_hack(const TData* p_src,
-                             TData* p_dst,
-                             SrcMergedDimSliceLengthsHack,
-                             DstMergedDimSliceLengthsHack) const
+        template <class IDim>
+        __device__ constexpr index_t operator()(IDim idim) const
+        {
+            return TDesc::ContainMultipleOriginalDimensions(idim) ? Seq{}[idim] : 1;
+        }
+    };
+
+    __device__ void Run(const TData* p_src, TData* p_dst) const
    {
        constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{});

@@ -191,6 +161,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2
        // but 1 on normal dimensions;
        // SrcNormalDimSliceLengthsHack has entry same as SliceLengths on src normal dimensions,
        // but 1 on merged dimensions;
+        using SrcMergedDimSliceLengthsHack =
+            typename sequence_gen<SliceLengths::GetSize(),
+                                  IsolateMergedDimSliceLengthsHack<SrcDesc, SliceLengths>>::type;
+
        using SrcNormalDimSliceLengthsHack =
            decltype((SliceLengths{} + Number<1>{}) - SrcMergedDimSliceLengthsHack{});

@@ -216,6 +190,10 @@ struct ThreadwiseGenericTensorSliceCopy_v2
        // but 1 on normal dimensions;
        // DstNormalDimSliceLengthsHack has entry same as SliceLengths on dst normal dimensions,
        // but 1 on merged dimensions;
+        using DstMergedDimSliceLengthsHack =
+            typename sequence_gen<SliceLengths::GetSize(),
+                                  IsolateMergedDimSliceLengthsHack<DstDesc, SliceLengths>>::type;
+
        using DstNormalDimSliceLengthsHack =
            decltype((SliceLengths{} + Number<1>{}) - DstMergedDimSliceLengthsHack{});
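IsolateMergedDimSliceLengthsHack lets Run() derive internally the two helper sequences that the removed Run_hack() used to take as arguments: per dimension, the merged-dim sequence keeps the slice length where the descriptor dimension is merged and 1 elsewhere, and the normal-dim sequence is (SliceLengths + 1) minus that. A small host-side illustration with invented values (a 4-d slice in which only dimension 2 is assumed to be a merged dimension) shows the arithmetic:

    #include <array>
    #include <cstdio>

    int main()
    {
        // Assumed example values, not taken from any kernel configuration.
        const std::array<int, 4> slice_lengths = {2, 1, 4, 1};
        const std::array<bool, 4> is_merged    = {false, false, true, false};

        std::array<int, 4> merged_hack{}, normal_hack{};
        for(int i = 0; i < 4; ++i)
        {
            // SliceLengths on merged dimensions, 1 on normal dimensions
            merged_hack[i] = is_merged[i] ? slice_lengths[i] : 1;
            // (SliceLengths + 1) - merged hack: SliceLengths on normal dimensions, 1 on merged ones
            normal_hack[i] = slice_lengths[i] + 1 - merged_hack[i];
        }

        std::printf("merged-dim hack: %d %d %d %d\n",
                    merged_hack[0], merged_hack[1], merged_hack[2], merged_hack[3]); // 1 1 4 1
        std::printf("normal-dim hack: %d %d %d %d\n",
                    normal_hack[0], normal_hack[1], normal_hack[2], normal_hack[3]); // 2 1 1 1
        return 0;
    }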
composable_kernel/include/utility/Sequence.hpp  (+33, -18)

@@ -128,48 +128,63 @@ struct sequence_merge<Sequence<Xs...>, Sequence<Ys...>>
    using type = Sequence<Xs..., Ys...>;
 };

-// arithmetic sequence
-template <index_t IBegin, index_t NSize, index_t Increment>
-struct arithmetic_sequence_gen_impl
+// generate sequence
+template <index_t IBegin, index_t NRemain, class F>
+struct sequence_gen_impl
 {
-    static constexpr index_t NSizeLeft = NSize / 2;
+    static constexpr index_t NRemainLeft  = NRemain / 2;
+    static constexpr index_t NRemainRight = NRemain - NRemainLeft;
+    static constexpr index_t IMiddle      = IBegin + NRemainLeft;

    using type = typename sequence_merge<
-        typename arithmetic_sequence_gen_impl<IBegin, NSizeLeft, Increment>::type,
-        typename arithmetic_sequence_gen_impl<IBegin + NSizeLeft * Increment,
-                                              NSize - NSizeLeft,
-                                              Increment>::type>::type;
+        typename sequence_gen_impl<IBegin, NRemainLeft, F>::type,
+        typename sequence_gen_impl<IMiddle, NRemainRight, F>::type>::type;
 };

-template <index_t IBegin, index_t Increment>
-struct arithmetic_sequence_gen_impl<IBegin, 1, Increment>
+template <index_t I, class F>
+struct sequence_gen_impl<I, 1, F>
 {
-    using type = Sequence<IBegin>;
+    static constexpr index_t Is = F{}(Number<I>{});
+
+    using type = Sequence<Is>;
 };

-template <index_t IBegin, index_t Increment>
-struct arithmetic_sequence_gen_impl<IBegin, 0, Increment>
+template <index_t I, class F>
+struct sequence_gen_impl<I, 0, F>
 {
    using type = Sequence<>;
 };

+template <index_t NSize, class F>
+struct sequence_gen
+{
+    using type = typename sequence_gen_impl<0, NSize, F>::type;
+};
+
+// arithmetic sequence
 template <index_t IBegin, index_t IEnd, index_t Increment>
 struct arithmetic_sequence_gen
 {
-    using type = typename arithmetic_sequence_gen_impl<IBegin, IEnd - IBegin, Increment>::type;
+    struct F
+    {
+        __host__ __device__ constexpr index_t operator()(index_t i) const
+        {
+            return i * Increment + IBegin;
+        }
+    };
+
+    using type = typename sequence_gen<(IEnd - IBegin) / Increment, F>::type;
 };

 // uniform sequence
 template <index_t NSize, index_t I>
 struct uniform_sequence_gen
 {
-    struct return_constant
+    struct F
    {
        __host__ __device__ constexpr index_t operator()(index_t) const { return I; }
    };

-    using type = decltype(
-        typename arithmetic_sequence_gen<0, NSize, 1>::type{}.Transform(return_constant{}));
+    using type = typename sequence_gen<NSize, F>::type;
 };

 // reverse inclusive scan (with init) sequence
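The Sequence.hpp change generalises the old arithmetic-only generator into sequence_gen<NSize, F>, which applies a functor F to each index and merges the two halves recursively; arithmetic_sequence_gen and uniform_sequence_gen become thin wrappers around it. A self-contained miniature of the same divide-and-conquer pattern (independent of the library's Sequence/Number types; names here are illustrative) is sketched below:

    #include <type_traits>

    // Minimal stand-ins for the library's Sequence and sequence_merge.
    template <int... Is> struct Seq {};

    template <class, class> struct seq_merge;
    template <int... Xs, int... Ys>
    struct seq_merge<Seq<Xs...>, Seq<Ys...>> { using type = Seq<Xs..., Ys...>; };

    // Divide-and-conquer generator: element i is F{}(i), mirroring sequence_gen_impl.
    template <int IBegin, int NRemain, class F>
    struct seq_gen_impl
    {
        static constexpr int NLeft = NRemain / 2;
        using type = typename seq_merge<
            typename seq_gen_impl<IBegin, NLeft, F>::type,
            typename seq_gen_impl<IBegin + NLeft, NRemain - NLeft, F>::type>::type;
    };
    template <int I, class F> struct seq_gen_impl<I, 1, F> { using type = Seq<F{}(I)>; };
    template <int I, class F> struct seq_gen_impl<I, 0, F> { using type = Seq<>; };

    template <int NSize, class F>
    struct seq_gen { using type = typename seq_gen_impl<0, NSize, F>::type; };

    // Arithmetic sequence 0, 3, 6, 9 expressed through a functor, like the new arithmetic_sequence_gen.
    struct TimesThree
    {
        constexpr int operator()(int i) const { return 3 * i; }
    };
    static_assert(std::is_same<seq_gen<4, TimesThree>::type, Seq<0, 3, 6, 9>>::value,
                  "functor-based generation");

    int main() { return 0; }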
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp  (+1, -1)

@@ -139,7 +139,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
    for(index_t i = 0; i < nrepeat; ++i)
    {
        constexpr auto gridwise_conv =
-#if 1
+#if 0
            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw
 #else
            GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
driver/include/device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp  (+1, -0)

@@ -4,6 +4,7 @@
 #include "tensor.hpp"
 #include "gridwise_convolution_kernel_wrapper.hpp"
 #include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
+//#include "gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp"

 using namespace ck;
driver/src/driver.cpp  (+1, -1)

@@ -532,7 +532,7 @@ int main(int argc, char* argv[])
 #elif 0
    device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw
        (in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
-#elif 0
+#elif 1
    device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
                                                         in_nchw,
                                                         wei_kcyx_desc,