gaoqiong / composable_kernel / commit 1a66e35b (unverified)
Authored Feb 17, 2020 by Chao Liu; committed by GitHub on Feb 17, 2020

MIopen integration (#13)

* update for miopen integration: cosmetic refactor

Parent: 3406a114

Showing 20 changed files with 561 additions and 478 deletions (+561, -478)
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp (+4, -4)
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp (+28, -29)
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp (+13, -13)
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp (+57, -57)
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp (+83, -88)
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp (+203, -131)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp (+23, -20)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp (+44, -31)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp (+11, -11)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp (+7, -7)
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp (+1, -1)
composable_kernel/include/tensor_description/tensor_coordinate.hpp (+2, -2)
composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp (+2, -2)
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp (+8, -8)
composable_kernel/include/tensor_operation/blockwise_gemm.hpp (+6, -6)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp (+7, -7)
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp (+4, -4)
composable_kernel/include/tensor_operation/gridwise_gemm.hpp (+39, -38)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp (+16, -16)
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp (+3, -3)
composable_kernel/include/kernel_algorithm/gridwise_col2im_eb_nchw.hpp

@@ -114,10 +114,10 @@ struct GridwiseCol2Im_eb_nchw
                 1,
                 BlockCopyDataPerAccess_B,
                 BlockCopyDataPerAccess_B,
-                AddressSpace::vgpr,
-                AddressSpace::vgpr,
-                AddressSpace::global,
-                InMemoryDataOperation::atomic_add>(
+                AddressSpace::Vgpr,
+                AddressSpace::Vgpr,
+                AddressSpace::Global,
+                InMemoryDataOperation::AtomicAdd>(
             {e_block_data_on_global, b_block_data_on_global},
             {e_block_data_on_global, b_block_data_on_global});
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r1_nchw_kcyx_nkhw.hpp

@@ -25,15 +25,15 @@ template <index_t GridSize,
           index_t GemmMPerBlock,
           index_t GemmNPerBlock,
           index_t GemmKPerBlock,
-          index_t GemmMPerThreadSubC,
-          index_t GemmNPerThreadSubC,
+          index_t GemmMPerThread,
+          index_t GemmNPerThread,
+          index_t GemmKPerThread,
           index_t GemmMLevel0Cluster,
           index_t GemmNLevel0Cluster,
           index_t GemmMLevel1Cluster,
           index_t GemmNLevel1Cluster,
-          index_t GemmKPerThreadLoop,
-          index_t GemmThreadGemmDataPerReadM,
-          index_t GemmThreadGemmDataPerReadN,
+          index_t ThreadGemmAThreadCopySrcDataPerRead_GemmM,
+          index_t ThreadGemmAThreadCopySrcDataPerRead_GemmN,
           typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
           typename GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
           index_t GemmABlockCopySrcDataPerRead_GemmN,

@@ -75,25 +75,20 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
        constexpr index_t ConvDilationH = ConvDilations{}[0];
        constexpr index_t ConvDilationW = ConvDilations{}[1];

-       // sanity-check for vectorized memory load
-       // TODO: this logic may not be correct for bwd-data
-       static_assert((Wo == 1 || (ConvStrideW == 1 || GemmCThreadCopyDstDataPerWrite_GemmN1 == 1)) &&
-                         (X == 1 || ConvDilationW % GemmCThreadCopyDstDataPerWrite_GemmN1 == 0),
-                     "wrong! aligment requirement for vectorized global load of input tensor will "
-                     "be violated");
+       //\todo static_assert for global vector load/store
+       // statc_assert();
+
+       // weight tensor
+       constexpr auto wei_gemmk_gemmm_global_desc =
+           unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);

        // output tensor
-       constexpr auto out_k_b_global_desc =
+       constexpr auto out_gemmk_gemmn_global_desc =
            transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
                                        make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
                                        make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));

-       // weight tensor
-       constexpr auto wei_k_e_global_desc =
-           unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3);
-
        // input tensor
        constexpr auto in_n_c_hip_wip_global_desc =
            transform_tensor_descriptor(in_n_c_hi_wi_global_desc,

@@ -116,38 +111,42 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r1_nchw_kcyx_nkhw
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

-       constexpr auto in_e_b_global_desc = transform_tensor_descriptor(
+       constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor(
            in_n_c_y_ho_x_wo_global_desc,
            make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

        // GEMM
-       constexpr auto in_memory_op = (Y <= ConvStrideH && X <= ConvStrideW)
-                                         ? InMemoryDataOperation::none
-                                         : InMemoryDataOperation::atomic_add;
+       // \todo there are more combinations of Y, ConvDilationH and ConvStrideH that don't need
+       // atomic, find out all of them
+       constexpr bool not_need_atomic = (ConvStrideH >= ConvDilationH * (Y - 1) + 1) and
+                                        (ConvStrideW >= ConvDilationW * (X - 1) + 1);
+
+       constexpr auto in_memory_op =
+           not_need_atomic ? InMemoryDataOperation::Set : InMemoryDataOperation::AtomicAdd;

        constexpr auto gridwise_gemm = GridwiseGemmTransposedANormalBNormalC_v1<
            GridSize,
            BlockSize,
            Float,
            AccFloat,
-           decltype(wei_k_e_global_desc),
-           decltype(out_k_b_global_desc),
-           decltype(in_e_b_global_desc),
+           decltype(wei_gemmk_gemmm_global_desc),
+           decltype(out_gemmk_gemmn_global_desc),
+           decltype(in_gemmm_gemmn_global_desc),
            in_memory_op,
            GemmMPerBlock,
            GemmNPerBlock,
            GemmKPerBlock,
-           GemmMPerThreadSubC,
-           GemmNPerThreadSubC,
+           GemmMPerThread,
+           GemmNPerThread,
+           GemmKPerThread,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
-           GemmKPerThreadLoop,
-           GemmThreadGemmDataPerReadM,
-           GemmThreadGemmDataPerReadN,
+           ThreadGemmAThreadCopySrcDataPerRead_GemmM,
+           ThreadGemmAThreadCopySrcDataPerRead_GemmN,
            GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
            GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
            Sequence<0, 1>,
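The new not_need_atomic condition generalizes the old "Y <= ConvStrideH && X <= ConvStrideW" test: when the stride covers the dilated filter footprint, no two output positions write the same input element, so the backward-data GEMM can store with a plain Set instead of AtomicAdd. A minimal host-side sketch of that decision follows; the numeric configuration values are illustrative, not taken from the commit.

#include <cstdio>

// Sketch only: mirrors the constexpr condition above for a sample config.
// Stride/dilation/filter sizes below are made-up example values.
int main()
{
    const int Y = 3, X = 3;                     // filter size
    const int ConvStrideH = 2, ConvStrideW = 2; // strides
    const int ConvDilationH = 1, ConvDilationW = 1;

    // Stride at least as large as the dilated filter extent -> each input
    // element is written by at most one output position, so no atomics needed.
    const bool not_need_atomic = (ConvStrideH >= ConvDilationH * (Y - 1) + 1) &&
                                 (ConvStrideW >= ConvDilationW * (X - 1) + 1);

    std::printf("use %s\n", not_need_atomic ? "InMemoryDataOperation::Set"
                                            : "InMemoryDataOperation::AtomicAdd");
    return 0;
}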
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v1r2_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -147,10 +147,10 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_doubl
                 2,
                 OutBlockCopySrcDataPerRead_B,
                 OutBlockCopyDstDataPerWrite_N0,
-                AddressSpace::global,
-                AddressSpace::vgpr,
-                AddressSpace::lds,
-                InMemoryDataOperation::none>(
+                AddressSpace::Global,
+                AddressSpace::Vgpr,
+                AddressSpace::Lds,
+                InMemoryDataOperation::Set>(
            {0, b_block_data_on_global, 0}, {0, 0, 0});

        // weight tensor

@@ -187,10 +187,10 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_doubl
                 2,
                 WeiBlockCopySrcDataPerRead_E,
                 WeiBlockCopyDstDataPerWrite_C0,
-                AddressSpace::global,
-                AddressSpace::vgpr,
-                AddressSpace::lds,
-                InMemoryDataOperation::none>(
+                AddressSpace::Global,
+                AddressSpace::Vgpr,
+                AddressSpace::Lds,
+                InMemoryDataOperation::Set>(
            {0, e_block_data_on_global, 0}, {0, 0, 0});

        // GEMM definition

@@ -356,10 +356,10 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_doubl
 #if 1 // debug
        // input: register to global memory, atomic add
        constexpr auto in_memory_op = (Y <= ConvStrideH && X <= ConvStrideW)
-                                         ? InMemoryDataOperation::none
-                                         : InMemoryDataOperation::atomic_add;
+                                         ? InMemoryDataOperation::Set
+                                         : InMemoryDataOperation::AtomicAdd;
 #else
-       constexpr auto in_memory_op = InMemoryDataOperation::atomic_add;
+       constexpr auto in_memory_op = InMemoryDataOperation::AtomicAdd;
 #endif

        constexpr index_t E1 = GemmMLevel0Cluster * GemmMLevel1Cluster;

@@ -432,8 +432,8 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v1r2_nchw_kcyx_nkhw_lds_doubl
            4,
            1,
            InThreadCopyDstDataPerWrite_B,
-           AddressSpace::vgpr,
-           AddressSpace::global,
+           AddressSpace::Vgpr,
+           AddressSpace::Global,
            in_memory_op>({0, 0, 0, 0, 0, 0},
                          {e_thread_data_on_global / E1,
                           e_thread_data_on_global % E1,
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v2r1_nchw_kcyx_nkhw.hpp

@@ -8,9 +8,9 @@
 namespace ck {

-// GemmM = C * Ytilda * Xtilda;
-// GemmN = N * HtildaNonZero * WtildaNonZero;
-// GemmK = K * Ydot * Xdot;
+// GemmM = C * YTilda * XTilda;
+// GemmN = N * HTildaSlice * WTildaSlice;
+// GemmK = K * YDot * XDot;
 template <index_t GridSize,
           index_t BlockSize,
           typename Float,

@@ -25,13 +25,13 @@ template <index_t GridSize,
           index_t GemmMPerBlock,
           index_t GemmNPerBlock,
           index_t GemmKPerBlock,
-          index_t GemmMPerThreadSubC,
-          index_t GemmNPerThreadSubC,
+          index_t GemmMPerThread,
+          index_t GemmNPerThread,
+          index_t GemmKPerThread,
           index_t GemmMLevel0Cluster,
           index_t GemmNLevel0Cluster,
           index_t GemmMLevel1Cluster,
           index_t GemmNLevel1Cluster,
-          index_t GemmKPerThreadLoop,
           index_t GemmThreadGemmDataPerReadM,
           index_t GemmThreadGemmDataPerReadN,
           typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM,

@@ -81,32 +81,32 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw
                      "be violated");
 #endif

-       constexpr index_t gcd_stride_dilation_h = math::gcd(ConvStrideH, ConvDilationH);
-       constexpr index_t gcd_stride_dilation_w = math::gcd(ConvStrideW, ConvDilationW);
+       constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+       constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);

-       constexpr index_t Ytilda = ConvStrideH / gcd_stride_dilation_h;
-       constexpr index_t Xtilda = ConvStrideW / gcd_stride_dilation_w;
+       constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH;
+       constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;

-       constexpr index_t Ydot = math::integer_divide_ceil(Y, Ytilda);
-       constexpr index_t Xdot = math::integer_divide_ceil(X, Xtilda);
+       constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda);
+       constexpr index_t XDot = math::integer_divide_ceil(X, XTilda);

-       constexpr index_t Htilda =
+       constexpr index_t HTilda =
            Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
-       constexpr index_t Wtilda =
+       constexpr index_t WTilda =
            Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);

-       constexpr index_t HtildaLeft = math::integer_divide_floor(
-           math::max(0, InLeftPads{}[0] - ConvDilationH * (Ytilda - 1)), ConvStrides{}[0]);
-       constexpr index_t WtildaLeft = math::integer_divide_floor(
-           math::max(0, InLeftPads{}[1] - ConvDilationW * (Xtilda - 1)), ConvStrides{}[1]);
+       constexpr index_t HTildaLeft = math::integer_divide_floor(
+           math::max(0, InLeftPads{}[0] - ConvDilationH * (YTilda - 1)), ConvStrides{}[0]);
+       constexpr index_t WTildaLeft = math::integer_divide_floor(
+           math::max(0, InLeftPads{}[1] - ConvDilationW * (XTilda - 1)), ConvStrides{}[1]);

-       constexpr index_t HtildaRight = math::min(
-           Htilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
-       constexpr index_t WtildaRight = math::min(
-           Wtilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);
+       constexpr index_t HTildaRight = math::min(
+           HTilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
+       constexpr index_t WTildaRight = math::min(
+           WTilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);

-       constexpr index_t HtildaTrim = HtildaRight - HtildaLeft;
-       constexpr index_t WtildaTrim = WtildaRight - WtildaLeft;
+       constexpr index_t HTildaSlice = HTildaRight - HTildaLeft;
+       constexpr index_t WTildaSlice = WTildaRight - WTildaLeft;

        // weight tensor
        constexpr auto wei_k_c_ydot_ytilda_xdot_xtilda_global_desc = transform_tensor_descriptor(

@@ -114,17 +114,17 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw
            make_tuple(PassThrough<K>{},
                       PassThrough<C>{},
-                      Embed<Y, Sequence<Ydot, Ytilda>, Sequence<ConvStrideH / gcd_stride_dilation_h, 1, 0>>{},
-                      Embed<X, Sequence<Xdot, Xtilda>, Sequence<ConvStrideW / gcd_stride_dilation_w, 1, 0>>{}),
+                      Embed<Y, Sequence<YDot, YTilda>, Sequence<ConvStrideH / GcdStrideDilationH, 1, 0>>{},
+                      Embed<X, Sequence<XDot, XTilda>, Sequence<ConvStrideW / GcdStrideDilationW, 1, 0>>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

        constexpr auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor(
            wei_k_c_ydot_ytilda_xdot_xtilda_global_desc,
-           make_tuple(Merge<Sequence<K, Ydot, Xdot>>{}, Merge<Sequence<C, Ytilda, Xtilda>>{}),
+           make_tuple(Merge<Sequence<K, YDot, XDot>>{}, Merge<Sequence<C, YTilda, XTilda>>{}),
            make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

@@ -134,33 +134,33 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw
            make_tuple(PassThrough<N>{},
                       PassThrough<K>{},
-                      Embed<Ho, Sequence<Ydot, Htilda>, Sequence<-ConvDilationH / gcd_stride_dilation_h, 1, 0>>{},
-                      Embed<Wo, Sequence<Xdot, Wtilda>, Sequence<-ConvDilationW / gcd_stride_dilation_w, 1, 0>>{}),
+                      Embed<Ho, Sequence<YDot, HTilda>, Sequence<-ConvDilationH / GcdStrideDilationH, 1, 0>>{},
+                      Embed<Wo, Sequence<XDot, WTilda>, Sequence<-ConvDilationW / GcdStrideDilationW, 1, 0>>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

-       constexpr auto out_n_k_ydot_htildatrim_xdot_wtildatrim_global_desc =
+       constexpr auto out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc =
            transform_tensor_descriptor(
                out_n_k_ydot_htilda_xdot_wtilda_global_desc,
                make_tuple(PassThrough<N>{},
                           PassThrough<K>{},
-                          PassThrough<Ytilda>{},
-                          PassThrough<Xtilda>{},
-                          Slice<Sequence<Htilda, Wtilda>,
-                                Sequence<HtildaLeft, WtildaLeft>,
-                                Sequence<HtildaRight, WtildaRight>>{}),
+                          PassThrough<YTilda>{},
+                          PassThrough<XTilda>{},
+                          Slice<Sequence<HTilda, WTilda>,
+                                Sequence<HTildaLeft, WTildaLeft>,
+                                Sequence<HTildaRight, WTildaRight>>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}));

        constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor(
-           out_n_k_ydot_htildatrim_xdot_wtildatrim_global_desc,
-           make_tuple(Merge<Sequence<K, Ydot, Xdot>>{}, Merge<Sequence<N, HtildaTrim, WtildaTrim>>{}),
+           out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc,
+           make_tuple(Merge<Sequence<K, YDot, XDot>>{}, Merge<Sequence<N, HTildaSlice, WTildaSlice>>{}),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

@@ -188,35 +188,35 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw
            make_tuple(PassThrough<N>{},
                       PassThrough<C>{},
-                      Embed<Hip, Sequence<Ytilda, Htilda>, Sequence<ConvDilationH, ConvStrideH, 0>, in_skip_all_out_of_bound_check>{},
-                      Embed<Wip, Sequence<Xtilda, Wtilda>, Sequence<ConvDilationW, ConvStrideW, 0>, in_skip_all_out_of_bound_check>{}),
+                      Embed<Hip, Sequence<YTilda, HTilda>, Sequence<ConvDilationH, ConvStrideH, 0>, in_skip_all_out_of_bound_check>{},
+                      Embed<Wip, Sequence<XTilda, WTilda>, Sequence<ConvDilationW, ConvStrideW, 0>, in_skip_all_out_of_bound_check>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

-       constexpr auto in_n_c_ytilda_htildatrim_xtilda_wtildatrim_global_desc =
+       constexpr auto in_n_c_ytilda_htildaslice_xtilda_wtildaslice_global_desc =
            transform_tensor_descriptor(
                in_n_c_ytilda_htilda_xtilda_wtilda_global_desc,
                make_tuple(PassThrough<N>{},
                           PassThrough<C>{},
-                          PassThrough<Ytilda>{},
-                          PassThrough<Xtilda>{},
-                          Slice<Sequence<Htilda, Wtilda>,
-                                Sequence<HtildaLeft, WtildaLeft>,
-                                Sequence<HtildaRight, WtildaRight>>{}),
+                          PassThrough<YTilda>{},
+                          PassThrough<XTilda>{},
+                          Slice<Sequence<HTilda, WTilda>,
+                                Sequence<HTildaLeft, WTildaLeft>,
+                                Sequence<HTildaRight, WTildaRight>>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}));

        constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor(
-           in_n_c_ytilda_htildatrim_xtilda_wtildatrim_global_desc,
-           make_tuple(Merge<Sequence<C, Ytilda, Xtilda>>{}, Merge<Sequence<N, HtildaTrim, WtildaTrim>>{}),
+           in_n_c_ytilda_htildaslice_xtilda_wtildaslice_global_desc,
+           make_tuple(Merge<Sequence<C, YTilda, XTilda>>{}, Merge<Sequence<N, HTildaSlice, WTildaSlice>>{}),
            make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}));

@@ -229,17 +229,17 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v2r1_nchw_kcyx_nkhw
            decltype(wei_gemmk_gemmm_global_desc),
            decltype(out_gemmk_gemmn_global_desc),
            decltype(in_gemmm_gemmn_global_desc),
-           InMemoryDataOperation::none,
+           InMemoryDataOperation::Set,
            GemmMPerBlock,
            GemmNPerBlock,
            GemmKPerBlock,
-           GemmMPerThreadSubC,
-           GemmNPerThreadSubC,
+           GemmMPerThread,
+           GemmNPerThread,
+           GemmKPerThread,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
-           GemmKPerThreadLoop,
            GemmThreadGemmDataPerReadM,
            GemmThreadGemmDataPerReadN,
            GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
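The renamed quantities above (YTilda, XTilda, YDot, XDot, HTilda, HTildaLeft/Right, HTildaSlice) keep the same arithmetic as the old lowercase names; only the spelling changed. The standalone sketch below evaluates that arithmetic for an assumed 3x3, stride-2, dilation-1 problem so the relationships are easy to follow; all numeric values are illustrative and not taken from the commit.

#include <algorithm>
#include <cstdio>

// Integer helpers matching the assumed semantics of ck::math.
constexpr int integer_divide_ceil(int a, int b) { return (a + b - 1) / b; }
constexpr int integer_divide_floor(int a, int b) { return a / b; }
constexpr int gcd(int a, int b) { return b == 0 ? a : gcd(b, a % b); }

int main()
{
    // Illustrative convolution problem (not from the commit).
    const int Y = 3, X = 3, Ho = 14, Wo = 14, Hi = 28, Wi = 28;
    const int ConvStrideH = 2, ConvStrideW = 2;
    const int ConvDilationH = 1, ConvDilationW = 1;
    const int LeftPadH = 1, LeftPadW = 1;
    (void)Wi; (void)LeftPadW;

    const int GcdStrideDilationH = gcd(ConvStrideH, ConvDilationH);
    const int GcdStrideDilationW = gcd(ConvStrideW, ConvDilationW);

    const int YTilda = ConvStrideH / GcdStrideDilationH; // 2
    const int XTilda = ConvStrideW / GcdStrideDilationW; // 2
    const int YDot   = integer_divide_ceil(Y, YTilda);   // 2
    const int XDot   = integer_divide_ceil(X, XTilda);   // 2

    const int HTilda = Ho + integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH); // 15
    const int WTilda = Wo + integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW); // 15

    const int HTildaLeft = integer_divide_floor(
        std::max(0, LeftPadH - ConvDilationH * (YTilda - 1)), ConvStrideH);            // 0
    const int HTildaRight = std::min(
        HTilda, integer_divide_ceil(LeftPadH + Hi - 1, ConvStrideH) + 1);              // 15
    const int HTildaSlice = HTildaRight - HTildaLeft;                                  // 15

    std::printf("YTilda=%d XTilda=%d YDot=%d XDot=%d HTilda=%d WTilda=%d HTildaSlice=%d\n",
                YTilda, XTilda, YDot, XDot, HTilda, WTilda, HTildaSlice);
    return 0;
}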
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v3r1_nchw_kcyx_nkhw.hpp

@@ -8,10 +8,10 @@
 namespace ck {

-// Ytilda*Xtilda number of GEMMs
-// GemmM = C;
-// GemmN = N * HtildaNonZero * WtildaNonZero;
-// GemmK = K * YdotNonZero * XdotNonZero;
+// Number of GEMMs: YTilda * XTilda
+// GemmM = C
+// GemmN = N * HTildaSlice * WTildaSlice
+// GemmK = K * YDotSlice * XDotSlice
 template <index_t GridSize,
           index_t BlockSize,
           typename Float,

@@ -26,13 +26,13 @@ template <index_t GridSize,
           index_t GemmMPerBlock,
           index_t GemmNPerBlock,
           index_t GemmKPerBlock,
-          index_t GemmMPerThreadSubC,
-          index_t GemmNPerThreadSubC,
+          index_t GemmMPerThread,
+          index_t GemmNPerThread,
+          index_t GemmKPerThread,
           index_t GemmMLevel0Cluster,
           index_t GemmNLevel0Cluster,
           index_t GemmMLevel1Cluster,
           index_t GemmNLevel1Cluster,
-          index_t GemmKPerThreadLoop,
           index_t GemmThreadGemmDataPerReadM,
           index_t GemmThreadGemmDataPerReadN,
           typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM,

@@ -110,32 +110,32 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw
                      "be violated");
 #endif

-       constexpr index_t gcd_stride_dilation_h = math::gcd(ConvStrideH, ConvDilationH);
-       constexpr index_t gcd_stride_dilation_w = math::gcd(ConvStrideW, ConvDilationW);
+       constexpr index_t GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH);
+       constexpr index_t GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW);

-       constexpr index_t Ytilda = ConvStrideH / gcd_stride_dilation_h;
-       constexpr index_t Xtilda = ConvStrideW / gcd_stride_dilation_w;
+       constexpr index_t YTilda = ConvStrideH / GcdStrideDilationH;
+       constexpr index_t XTilda = ConvStrideW / GcdStrideDilationW;

-       constexpr index_t Ydot = math::integer_divide_ceil(Y, Ytilda);
-       constexpr index_t Xdot = math::integer_divide_ceil(X, Xtilda);
+       constexpr index_t YDot = math::integer_divide_ceil(Y, YTilda);
+       constexpr index_t XDot = math::integer_divide_ceil(X, XTilda);

-       constexpr index_t Htilda =
+       constexpr index_t HTilda =
            Ho + math::integer_divide_ceil(ConvDilationH * (Y - 1), ConvStrideH);
-       constexpr index_t Wtilda =
+       constexpr index_t WTilda =
            Wo + math::integer_divide_ceil(ConvDilationW * (X - 1), ConvStrideW);

-       constexpr index_t HtildaLeft = math::integer_divide_floor(
-           math::max(0, InLeftPads{}[0] - ConvDilationH * (Ytilda - 1)), ConvStrides{}[0]);
-       constexpr index_t WtildaLeft = math::integer_divide_floor(
-           math::max(0, InLeftPads{}[1] - ConvDilationW * (Xtilda - 1)), ConvStrides{}[1]);
+       constexpr index_t HTildaLeft = math::integer_divide_floor(
+           math::max(0, InLeftPads{}[0] - ConvDilationH * (YTilda - 1)), ConvStrides{}[0]);
+       constexpr index_t WTildaLeft = math::integer_divide_floor(
+           math::max(0, InLeftPads{}[1] - ConvDilationW * (XTilda - 1)), ConvStrides{}[1]);

-       constexpr index_t HtildaRight = math::min(
-           Htilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
-       constexpr index_t WtildaRight = math::min(
-           Wtilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);
+       constexpr index_t HTildaRight = math::min(
+           HTilda, math::integer_divide_ceil(InLeftPads{}[0] + Hi - 1, ConvStrides{}[0]) + 1);
+       constexpr index_t WTildaRight = math::min(
+           WTilda, math::integer_divide_ceil(InLeftPads{}[1] + Wi - 1, ConvStrides{}[1]) + 1);

-       constexpr index_t HtildaTrim = HtildaRight - HtildaLeft;
-       constexpr index_t WtildaTrim = WtildaRight - WtildaLeft;
+       constexpr index_t HTildaSlice = HTildaRight - HTildaLeft;
+       constexpr index_t WTildaSlice = WTildaRight - WTildaLeft;

        constexpr bool wei_skip_all_out_of_bound_check = true;

@@ -145,12 +145,12 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw
            make_tuple(PassThrough<K>{},
                       PassThrough<C>{},
-                      Embed<Y, Sequence<Ydot, Ytilda>, Sequence<ConvStrideH / gcd_stride_dilation_h, 1, 0>, wei_skip_all_out_of_bound_check>{},
-                      Embed<X, Sequence<Xdot, Xtilda>, Sequence<ConvStrideW / gcd_stride_dilation_w, 1, 0>, wei_skip_all_out_of_bound_check>{}),
+                      Embed<Y, Sequence<YDot, YTilda>, Sequence<ConvStrideH / GcdStrideDilationH, 1, 0>, wei_skip_all_out_of_bound_check>{},
+                      Embed<X, Sequence<XDot, XTilda>, Sequence<ConvStrideW / GcdStrideDilationW, 1, 0>, wei_skip_all_out_of_bound_check>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

@@ -167,26 +167,26 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw
            make_tuple(PassThrough<N>{},
                       PassThrough<K>{},
-                      Embed<Ho, Sequence<Ydot, Htilda>, Sequence<-ConvDilationH / gcd_stride_dilation_h, 1, 0>, out_skip_all_out_of_bound_check>{},
-                      Embed<Wo, Sequence<Xdot, Wtilda>, Sequence<-ConvDilationW / gcd_stride_dilation_w, 1, 0>, out_skip_all_out_of_bound_check>{}),
+                      Embed<Ho, Sequence<YDot, HTilda>, Sequence<-ConvDilationH / GcdStrideDilationH, 1, 0>, out_skip_all_out_of_bound_check>{},
+                      Embed<Wo, Sequence<XDot, WTilda>, Sequence<-ConvDilationW / GcdStrideDilationW, 1, 0>, out_skip_all_out_of_bound_check>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

-       constexpr auto out_n_k_ydot_htildatrim_xdot_wtildatrim_global_desc =
+       constexpr auto out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc =
            transform_tensor_descriptor(
                out_n_k_ydot_htilda_xdot_wtilda_global_desc,
                make_tuple(PassThrough<N>{},
                           PassThrough<K>{},
-                          PassThrough<Ytilda>{},
-                          PassThrough<Xtilda>{},
-                          Slice<Sequence<Htilda, Wtilda>,
-                                Sequence<HtildaLeft, WtildaLeft>,
-                                Sequence<HtildaRight, WtildaRight>>{}),
+                          PassThrough<YTilda>{},
+                          PassThrough<XTilda>{},
+                          Slice<Sequence<HTilda, WTilda>,
+                                Sequence<HTildaLeft, WTildaLeft>,
+                                Sequence<HTildaRight, WTildaRight>>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
                make_tuple(

@@ -216,26 +216,26 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw
            make_tuple(PassThrough<N>{},
                       PassThrough<C>{},
-                      Embed<Hip, Sequence<Ytilda, Htilda>, Sequence<ConvDilationH, ConvStrideH, 0>, in_skip_all_out_of_bound_check>{},
-                      Embed<Wip, Sequence<Xtilda, Wtilda>, Sequence<ConvDilationW, ConvStrideW, 0>, in_skip_all_out_of_bound_check>{}),
+                      Embed<Hip, Sequence<YTilda, HTilda>, Sequence<ConvDilationH, ConvStrideH, 0>, in_skip_all_out_of_bound_check>{},
+                      Embed<Wip, Sequence<XTilda, WTilda>, Sequence<ConvDilationW, ConvStrideW, 0>, in_skip_all_out_of_bound_check>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));

-       constexpr auto in_n_c_ytilda_htildatrim_xtilda_wtildatrim_global_desc =
+       constexpr auto in_n_c_ytilda_htildaslice_xtilda_wtildaslice_global_desc =
            transform_tensor_descriptor(
                in_n_c_ytilda_htilda_xtilda_wtilda_global_desc,
                make_tuple(PassThrough<N>{},
                           PassThrough<C>{},
-                          PassThrough<Ytilda>{},
-                          PassThrough<Xtilda>{},
-                          Slice<Sequence<Htilda, Wtilda>,
-                                Sequence<HtildaLeft, WtildaLeft>,
-                                Sequence<HtildaRight, WtildaRight>>{}),
+                          PassThrough<YTilda>{},
+                          PassThrough<XTilda>{},
+                          Slice<Sequence<HTilda, WTilda>,
+                                Sequence<HTildaLeft, WTildaLeft>,
+                                Sequence<HTildaRight, WTildaRight>>{}),
                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
                make_tuple(

@@ -246,54 +246,49 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw
        __shared__ Float p_shared_block[shared_block_size];

-#if 1 // debug
-       static_for<0, Ytilda, 1>{}([&](auto ytilda_) {
-           static_for<0, Xtilda, 1>{}([&](auto xtilda_) {
-#else
-       static_for<0, 1, 1>{}([&](auto ytilda_) {
-           static_for<0, 1, 1>{}([&](auto xtilda_) {
-#endif
-               constexpr index_t ytilda = decltype(ytilda_){};
-               constexpr index_t xtilda = decltype(xtilda_){};
+       static_for<0, YTilda, 1>{}([&](auto iYTilda_) {
+           static_for<0, XTilda, 1>{}([&](auto iXTilda_) {
+               constexpr index_t iYTilda = decltype(iYTilda_){};
+               constexpr index_t iXTilda = decltype(iXTilda_){};

-               constexpr index_t YdotNonZero = (ytilda + 1) * Ydot <= Y ? Ydot : Y % Ydot;
-               constexpr index_t XdotNonZero = (xtilda + 1) * Xdot <= X ? Xdot : X % Xdot;
+               constexpr index_t YDotSlice = (iYTilda + 1) * YDot <= Y ? YDot : Y % YDot;
+               constexpr index_t XDotSlice = (iXTilda + 1) * XDot <= X ? XDot : X % XDot;

                // A matrix
-               constexpr auto wei_k_c_YdotNonZero_1_XdotNonZero_1_global_desc =
+               constexpr auto wei_k_c_ydotslice_ytidaslice_xdotslice_xtildaslice_global_desc =
                    transform_tensor_descriptor(
                        wei_k_c_ydot_ytilda_xdot_xtilda_global_desc,
                        make_tuple(PassThrough<K>{},
                                   PassThrough<C>{},
-                                  Slice<Sequence<Ydot, Xdot>,
-                                        Sequence<0, 0>,
-                                        Sequence<YdotNonZero, XdotNonZero>>{},
-                                  Slice<Sequence<Ytilda, Xtilda>,
-                                        Sequence<ytilda, xtilda>,
-                                        Sequence<ytilda + 1, xtilda + 1>>{}),
+                                  Slice<Sequence<YDot, XDot>,
+                                        Sequence<0, 0>,
+                                        Sequence<YDotSlice, XDotSlice>>{},
+                                  Slice<Sequence<YTilda, XTilda>,
+                                        Sequence<iYTilda, iXTilda>,
+                                        Sequence<iYTilda + 1, iXTilda + 1>>{}),
                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3, 5>{}),
                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 4>{}, Sequence<3, 5>{}));

                constexpr auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor(
-                   wei_k_c_YdotNonZero_1_XdotNonZero_1_global_desc,
-                   make_tuple(Merge<Sequence<K, YdotNonZero, XdotNonZero>>{},
+                   wei_k_c_ydotslice_ytidaslice_xdotslice_xtildaslice_global_desc,
+                   make_tuple(Merge<Sequence<K, YDotSlice, XDotSlice>>{},
                               Merge<Sequence<C, 1, 1>>{}),
                    make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));

                // B matrix
-               constexpr auto out_n_k_YdotNonZero_htildatrim_XdotNonZero_wtildatrim_global_desc =
+               constexpr auto out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc =
                    transform_tensor_descriptor(
-                       out_n_k_ydot_htildatrim_xdot_wtildatrim_global_desc,
+                       out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc,
                        make_tuple(PassThrough<N>{},
                                   PassThrough<K>{},
-                                  PassThrough<HtildaTrim>{},
-                                  PassThrough<WtildaTrim>{},
-                                  Slice<Sequence<Ydot, Xdot>,
-                                        Sequence<0, 0>,
-                                        Sequence<YdotNonZero, XdotNonZero>>{}),
+                                  PassThrough<HTildaSlice>{},
+                                  PassThrough<WTildaSlice>{},
+                                  Slice<Sequence<YDot, XDot>,
+                                        Sequence<0, 0>,
+                                        Sequence<YDotSlice, XDotSlice>>{}),
                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<3>{},

@@ -306,23 +301,23 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw
                                   Sequence<2, 4>{}));

                constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor(
-                   out_n_k_YdotNonZero_htildatrim_XdotNonZero_wtildatrim_global_desc,
-                   make_tuple(Merge<Sequence<K, YdotNonZero, XdotNonZero>>{},
-                              Merge<Sequence<N, HtildaTrim, WtildaTrim>>{}),
+                   out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc,
+                   make_tuple(Merge<Sequence<K, YDotSlice, XDotSlice>>{},
+                              Merge<Sequence<N, HTildaSlice, WTildaSlice>>{}),
                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));

                // C matrix
-               constexpr auto in_n_c_1_htildatrim_1_wtildatrim_global_desc =
+               constexpr auto in_n_c_ytildaslice_htildaslice_xtildaslice_wtildaslice_global_desc =
                    transform_tensor_descriptor(
-                       in_n_c_ytilda_htildatrim_xtilda_wtildatrim_global_desc,
+                       in_n_c_ytilda_htildaslice_xtilda_wtildaslice_global_desc,
                        make_tuple(PassThrough<N>{},
                                   PassThrough<C>{},
-                                  PassThrough<HtildaTrim>{},
-                                  PassThrough<WtildaTrim>{},
-                                  Slice<Sequence<Ytilda, Xtilda>,
-                                        Sequence<ytilda, xtilda>,
-                                        Sequence<ytilda + 1, xtilda + 1>>{}),
+                                  PassThrough<HTildaSlice>{},
+                                  PassThrough<WTildaSlice>{},
+                                  Slice<Sequence<YTilda, XTilda>,
+                                        Sequence<iYTilda, iXTilda>,
+                                        Sequence<iYTilda + 1, iXTilda + 1>>{}),
                        make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<3>{},

@@ -335,9 +330,9 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw
                                   Sequence<2, 4>{}));

                constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor(
-                   in_n_c_1_htildatrim_1_wtildatrim_global_desc,
+                   in_n_c_ytildaslice_htildaslice_xtildaslice_wtildaslice_global_desc,
                    make_tuple(Merge<Sequence<C, 1, 1>>{},
-                              Merge<Sequence<N, HtildaTrim, WtildaTrim>>{}),
+                              Merge<Sequence<N, HTildaSlice, WTildaSlice>>{}),
                    make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
                    make_tuple(Sequence<0>{}, Sequence<1>{}));

@@ -349,17 +344,17 @@ struct GridwiseConvolutionBackwardDataImplicitGemm_v3r1_nchw_kcyx_nkhw
                    decltype(wei_gemmk_gemmm_global_desc),
                    decltype(out_gemmk_gemmn_global_desc),
                    decltype(in_gemmm_gemmn_global_desc),
-                   InMemoryDataOperation::none,
+                   InMemoryDataOperation::Set,
                    GemmMPerBlock,
                    GemmNPerBlock,
                    GemmKPerBlock,
-                   GemmMPerThreadSubC,
-                   GemmNPerThreadSubC,
+                   GemmMPerThread,
+                   GemmNPerThread,
+                   GemmKPerThread,
                    GemmMLevel0Cluster,
                    GemmNLevel0Cluster,
                    GemmMLevel1Cluster,
                    GemmNLevel1Cluster,
-                   GemmKPerThreadLoop,
                    GemmThreadGemmDataPerReadM,
                    GemmThreadGemmDataPerReadN,
                    GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
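The v3r1 path launches one GEMM per (iYTilda, iXTilda) pair, and the renamed YDotSlice/XDotSlice express how many filter taps that particular GEMM actually covers. A small sketch of that per-GEMM slicing, under the same assumed 3x3, stride-2, dilation-1 configuration used above (values are illustrative only):

#include <cstdio>

int main()
{
    // Assumed example: 3x3 filter, stride 2, dilation 1 -> YTilda = XTilda = 2, YDot = XDot = 2.
    const int Y = 3, X = 3, YTilda = 2, XTilda = 2, YDot = 2, XDot = 2;

    // One GEMM per (iYTilda, iXTilda); each covers YDotSlice * XDotSlice filter taps.
    for(int iYTilda = 0; iYTilda < YTilda; ++iYTilda)
    {
        for(int iXTilda = 0; iXTilda < XTilda; ++iXTilda)
        {
            const int YDotSlice = ((iYTilda + 1) * YDot <= Y) ? YDot : Y % YDot;
            const int XDotSlice = ((iXTilda + 1) * XDot <= X) ? XDot : X % XDot;
            std::printf("GEMM(%d,%d): YDotSlice=%d XDotSlice=%d\n",
                        iYTilda, iXTilda, YDotSlice, XDotSlice);
        }
    }
    return 0;
}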
composable_kernel/include/kernel_algorithm/gridwise_convolution_backward_data_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp

(This diff is collapsed and is not shown.)
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer.hpp

@@ -229,10 +229,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 3,
                 InBlockCopySrcDataPerRead_B,
                 InBlockCopyDstDataPerWrite_N2,
-                AddressSpace::global,
-                AddressSpace::vgpr,
-                AddressSpace::lds,
-                InMemoryDataOperation::none>(
+                AddressSpace::Global,
+                AddressSpace::Vgpr,
+                AddressSpace::Lds,
+                InMemoryDataOperation::Set>(
            {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});

        // weight tensor

@@ -269,10 +269,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 1,
                 WeiBlockCopySrcDataPerRead_E,
                 WeiBlockCopyDstDataPerWrite_K,
-                AddressSpace::global,
-                AddressSpace::vgpr,
-                AddressSpace::lds,
-                InMemoryDataOperation::none>(
+                AddressSpace::Global,
+                AddressSpace::Vgpr,
+                AddressSpace::Lds,
+                InMemoryDataOperation::Set>(
            {0, k_block_data_on_global}, {0, 0});

        // GEMM definition

@@ -344,6 +344,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            blockwise_wei_copy.Run(p_wei_global, p_wei_block_double);
        }

+       constexpr auto in_block_slice_copy_steps  = Sequence<EPerBlock, 0, 0, 0>{};
+       constexpr auto wei_block_slice_copy_steps = Sequence<EPerBlock, 0>{};
+
        // LDS double buffer: main body
        for(index_t e_block_data_begin = 0; e_block_data_begin + 2 * EPerBlock < E;
            e_block_data_begin += 2 * EPerBlock)

@@ -366,8 +369,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

-           blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
-           blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
+           blockwise_in_copy.MoveSrcSliceWindow(in_block_slice_copy_steps, True);
+           blockwise_wei_copy.MoveSrcSliceWindow(wei_block_slice_copy_steps, True);

            __syncthreads();

@@ -393,8 +396,8 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
            Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];
            Float p_wei_thread_buffer[blockwise_wei_copy.GetThreadBufferSize()];

-           blockwise_in_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0, 0, 0>{}, True);
-           blockwise_wei_copy.MoveSrcSliceWindow(Sequence<EPerBlock, 0>{}, True);
+           blockwise_in_copy.MoveSrcSliceWindow(in_block_slice_copy_steps, True);
+           blockwise_wei_copy.MoveSrcSliceWindow(wei_block_slice_copy_steps, True);

            __syncthreads();

@@ -482,9 +485,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                3,
                1,
                1,
-               AddressSpace::vgpr,
-               AddressSpace::global,
-               InMemoryDataOperation::none>({0, 0, 0, 0, 0},
+               AddressSpace::Vgpr,
+               AddressSpace::Global,
+               InMemoryDataOperation::Set>({0, 0, 0, 0, 0},
                                             {k_thread_data_on_global / K1,
                                              k_thread_data_on_global % K1,
                                              0,
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp

@@ -94,9 +94,9 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep
        constexpr auto True = integral_constant<bool, true>{};

        constexpr auto generic_address_space =
-           integral_constant<AddressSpace, AddressSpace::generic>{};
+           integral_constant<AddressSpace, AddressSpace::Generic>{};
        constexpr auto global_address_space =
-           integral_constant<AddressSpace, AddressSpace::global>{};
+           integral_constant<AddressSpace, AddressSpace::Global>{};

        static_assert(ConvDirection == ConvolutionDirection::Forward ||
                          ConvDirection == ConvolutionDirection::BackwardWeight,

@@ -141,13 +141,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep
        constexpr index_t E = C * Y * X;

        // sanity-check for vectorized memory load
-       static_assert((Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) &&
+       static_assert(
+           (Wo == 1 || (ConvStrideW == 1 || InBlockCopySrcDataPerRead_B == 1)) &&
                (X == 1 || ConvDilationW % InBlockCopySrcDataPerRead_B == 0),
-           "wrong! aligment requirement for vectorized global load of input tensor will "
+           "wrong! alignment requirement for vectorized global load of input tensor will "
            "be violated");

        // divide block work by [K, B]
-       static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % (2 * EPerBlock) == 0,
+       static_assert(K % KPerBlock == 0 && B % BPerBlock == 0 && E % EPerBlock == 0,
                      "wrong! cannot divide work evenly among block");

        constexpr index_t KBlockWork = K / KPerBlock;

@@ -356,6 +357,10 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep
        }

        // LDS double buffer: tail
        {
+           constexpr bool has_two_iteration_left = (E % (2 * EPerBlock) == 0);
+
+           if(has_two_iteration_left) // if has 2 iteration left
+           {
                // even iteration
                Float p_in_thread_buffer[blockwise_in_copy.GetThreadBufferSize()];

@@ -389,6 +394,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer_dep
                                   p_in_block_double + in_block_space,
                                   p_out_thread);
            }
+           else // if has 1 iteration left
+           {
+               __syncthreads();
+
+               // LDS double buffer: GEMM on last data
+               blockwise_gemm.Run(p_wei_block_double, p_in_block_double, p_out_thread);
+           }
        }

        // copy output: register to global memory
        {
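This change relaxes the deprecated v4r1 kernel's requirement that E be a multiple of 2*EPerBlock down to a multiple of EPerBlock, handling an odd number of remaining chunks in the tail. The following schematic shows only the control-flow shape of such an LDS double-buffer loop with a one- or two-iteration tail; the buffer and GEMM calls are stand-ins, not the actual kernel code.

// Schematic of the control flow only; load/gemm details are placeholders.
void double_buffer_loop(int E, int EPerBlock)
{
    int e = 0;
    // main body: consume two EPerBlock chunks per trip (ping-pong buffers)
    for(; e + 2 * EPerBlock < E; e += 2 * EPerBlock)
    {
        // load_next(buffer1); gemm(buffer0);
        // load_next(buffer0); gemm(buffer1);
    }

    // tail: either two chunks or one chunk remain
    const bool has_two_iteration_left = (E % (2 * EPerBlock) == 0);
    if(has_two_iteration_left)
    {
        // load_next(buffer1); gemm(buffer0); gemm(buffer1);
    }
    else
    {
        // gemm(buffer0);  // last chunk is already staged in LDS
    }
}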
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp

@@ -25,15 +25,15 @@ template <index_t GridSize,
           index_t GemmMPerBlock,
           index_t GemmNPerBlock,
           index_t GemmKPerBlock,
-          index_t GemmMPerThreadSubC,
-          index_t GemmNPerThreadSubC,
+          index_t GemmMPerThread,
+          index_t GemmNPerThread,
+          index_t GemmKPerThread,
           index_t GemmMLevel0Cluster,
           index_t GemmNLevel0Cluster,
           index_t GemmMLevel1Cluster,
           index_t GemmNLevel1Cluster,
-          index_t GemmKPerThreadLoop,
-          index_t GemmThreadGemmDataPerReadM,
-          index_t GemmThreadGemmDataPerReadN,
+          index_t ThreadGemmAThreadCopySrcDataPerRead_GemmM,
+          index_t ThreadGemmAThreadCopySrcDataPerRead_GemmN,
           typename GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
           typename GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
           index_t GemmABlockCopySrcDataPerRead_GemmK,

@@ -130,19 +130,19 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw
            decltype(wei_e_k_global_desc),
            decltype(in_e_b_global_desc),
            decltype(out_k_b_global_desc),
-           InMemoryDataOperation::none,
+           InMemoryDataOperation::Set,
            GemmMPerBlock,
            GemmNPerBlock,
            GemmKPerBlock,
-           GemmMPerThreadSubC,
-           GemmNPerThreadSubC,
+           GemmMPerThread,
+           GemmNPerThread,
+           GemmKPerThread,
            GemmMLevel0Cluster,
            GemmNLevel0Cluster,
            GemmMLevel1Cluster,
            GemmNLevel1Cluster,
-           GemmKPerThreadLoop,
-           GemmThreadGemmDataPerReadM,
-           GemmThreadGemmDataPerReadN,
+           ThreadGemmAThreadCopySrcDataPerRead_GemmM,
+           ThreadGemmAThreadCopySrcDataPerRead_GemmN,
            GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
            GemmABlockCopyThreadClusterLengths_GemmK_GemmM,
            Sequence<1, 0>,
composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_deprecated.hpp

@@ -251,9 +251,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_dep
        // LDS double buffer: preload data into LDS
        {
-           blockwise_in_copy.template Run<Float, AddressSpace::global>(p_in_global,
+           blockwise_in_copy.template Run<Float, AddressSpace::Global>(p_in_global,
                                                                        p_in_block_double);
-           blockwise_wei_copy.template Run<Float, AddressSpace::global>(p_wei_global,
+           blockwise_wei_copy.template Run<Float, AddressSpace::Global>(p_wei_global,
                                                                         p_wei_block_double);
        }

@@ -285,9 +285,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_dep
            __syncthreads();

            // LDS doubel buffer: load next data from device mem
-           blockwise_in_copy.template RunLoadThreadBuffer<Float, AddressSpace::global>(
+           blockwise_in_copy.template RunLoadThreadBuffer<Float, AddressSpace::Global>(
                p_in_global, p_in_thread_buffer);
-           blockwise_wei_copy.template RunLoadThreadBuffer<Float, AddressSpace::global>(
+           blockwise_wei_copy.template RunLoadThreadBuffer<Float, AddressSpace::Global>(
                p_wei_global, p_wei_thread_buffer);

            // LDS double buffer: GEMM on current data

@@ -311,9 +311,9 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_dep
            __syncthreads();

            // LDS doubel buffer: load next data from device mem
-           blockwise_in_copy.template RunLoadThreadBuffer<Float, AddressSpace::global>(
+           blockwise_in_copy.template RunLoadThreadBuffer<Float, AddressSpace::Global>(
                p_in_global, p_in_thread_buffer);
-           blockwise_wei_copy.template RunLoadThreadBuffer<Float, AddressSpace::global>(
+           blockwise_wei_copy.template RunLoadThreadBuffer<Float, AddressSpace::Global>(
                p_wei_global, p_wei_thread_buffer);

            // LDS double buffer: GEMM on current data

@@ -390,7 +390,7 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer_dep
            for(index_t nrepeat = 0; nrepeat < GemmNRepeat; ++nrepeat)
            {
                threadwise_out_copy
-                   .template Run<Float, AddressSpace::generic, AddressSpace::global>(p_out_thread,
+                   .template Run<Float, AddressSpace::Generic, AddressSpace::Global>(p_out_thread,
                                                                                      p_out_global);

                threadwise_out_copy.MoveSrcSliceWindow(Sequence<0, 0, GemmNPerThreadSubC>{}, True);
composable_kernel/include/tensor_description/ConstantMatrixDescriptor.hpp

@@ -60,7 +60,7 @@ __host__ __device__ constexpr auto
 template <typename... Ts>
 __host__ __device__ constexpr auto
 make_ConstantMatrixDescriptor(ConstantTensorDescriptor_deprecated<Ts...>)
 {
     using TDesc = ConstantTensorDescriptor_deprecated<Ts...>;
     static_assert(TDesc::GetNumOfDimension() == 2, "wrong");
composable_kernel/include/tensor_description/tensor_coordinate.hpp

composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp
composable_kernel/include/tensor_description/tensor_descriptor_helper.hpp

@@ -64,7 +64,7 @@ template <typename LowerTensorDescriptor,
           index_t... LowerDimensionIds,
           index_t... UpperDimensionIds>
 __host__ __device__ constexpr auto
 reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
                                            Sequence<LowerLengths...>,
                                            Sequence<LowerDimensionIds...>,
                                            Sequence<UpperDimensionIds...>)

@@ -78,7 +78,7 @@ reorder_transformed_tensor_descriptor_impl(LowerTensorDescriptor,
 // reorder a NativeTensorDescriptor
 template <typename... Ts, typename MapLower2Upper>
 __host__ __device__ constexpr auto
 reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLower2Upper)
 {
     static_assert(is_valid_sequence_map<MapLower2Upper>{}, "wrong! MapLower2Upper is not a valid map");

@@ -96,7 +96,7 @@ reorder_tensor_descriptor_given_lower2upper(NativeTensorDescriptor<Ts...>, MapLo
 // reorder a TransformedTensorDescriptor
 template <typename... Ts, typename MapLower2Upper>
 __host__ __device__ constexpr auto
 reorder_tensor_descriptor_given_lower2upper(TransformedTensorDescriptor<Ts...>, MapLower2Upper)
 {
     static_assert(is_valid_sequence_map<MapLower2Upper>{}, "wrong! MapLower2Upper is not a valid map");

@@ -152,9 +152,9 @@ __host__ __device__ constexpr auto unfold_tensor_descriptor(NativeTensorDescript
         typename arithmetic_sequence_gen<FirstUnfoldDim, LastUnfoldDim + 1, 1>::type{};
     constexpr auto right = typename arithmetic_sequence_gen<LastUnfoldDim + 1, nDim, 1>::type{};

-    // sanity-check unfoldable
+    // sanity-check if unfold-able
     static_assert(are_dimensions_unfoldable(desc.GetLengths(middle), desc.GetStrides(middle)),
-                  "wrong! not unfoldable");
+                  "wrong! not unfold-able");

     // unfolded length, stride
     constexpr index_t unfold_length =
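unfold_tensor_descriptor merges the dimensions [FirstUnfoldDim, LastUnfoldDim] into a single dimension, which only makes sense when those dimensions are laid out contiguously relative to each other. The sketch below shows the kind of condition a check like are_dimensions_unfoldable presumably enforces; it is an assumption about the library's semantics, and the real implementation operates on Sequence types rather than raw arrays.

#include <cstddef>

// Sketch: dims i and i+1 can be folded together when
// stride[i] == length[i+1] * stride[i+1], i.e. the slab of dim i is exactly
// the concatenation of dim i+1 slabs (no holes, no overlap).
constexpr bool are_dims_unfoldable(const int* lengths, const int* strides, std::size_t n)
{
    for(std::size_t i = 0; i + 1 < n; ++i)
    {
        if(strides[i] != lengths[i + 1] * strides[i + 1])
            return false;
    }
    return true;
}

// Example: a packed 4x3 layout (strides {3, 1}) folds into a single length-12 dim.
constexpr int example_lengths[] = {4, 3};
constexpr int example_strides[] = {3, 1};
static_assert(are_dims_unfoldable(example_lengths, example_strides, 2),
              "expected to be unfold-able");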
composable_kernel/include/tensor_operation/blockwise_gemm.hpp

@@ -23,8 +23,8 @@ template <index_t BlockSize,
           index_t MLevel1ThreadCluster,
           index_t NLevel1ThreadCluster,
           index_t KPerThreadLoop,
-          index_t DataPerReadA,
-          index_t DataPerReadB>
+          index_t ThreadGemmADataPerRead_M,
+          index_t ThreadGemmBDataPerRead_N>
 struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
 {
     struct MatrixIndex

@@ -150,13 +150,13 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                                               decltype(a_thread_mtx),
                                               KPerThreadLoop,
                                               MPerThreadSubC,
-                                              DataPerReadA>{};
+                                              ThreadGemmADataPerRead_M>{};

        constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy<BlockMatrixB,
                                               decltype(b_thread_mtx),
                                               KPerThreadLoop,
                                               NPerThreadSubC,
-                                              DataPerReadB>{};
+                                              ThreadGemmBDataPerRead_N>{};

        constexpr auto threadwise_gemm = ThreadwiseGemmTransANormalBNormalC<decltype(a_thread_mtx),

@@ -238,13 +238,13 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                                               decltype(a_thread_mtx),
                                               KPerThreadLoop,
                                               MPerThreadSubC,
-                                              DataPerReadA>{};
+                                              ThreadGemmADataPerRead_M>{};

        constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy<BlockMatrixB,
                                               decltype(b_thread_mtx),
                                               KPerThreadLoop,
                                               NPerThreadSubC,
-                                              DataPerReadB>{};
+                                              ThreadGemmBDataPerRead_N>{};

        constexpr auto threadwise_gemm = ThreadwiseGemmTransANormalBNormalC<decltype(a_thread_sub_mtx),
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp

@@ -9,7 +9,7 @@
 namespace ck {

-// This threadwise copy allow vector access of src and dst.
+// This blockwise copy allow vector access of src and dst.
 // It allows the vector size to be different on src and dst.
 // The dimension of vector access can be different for src and dst.
 // The dimension access order can be different for src and dst.

@@ -28,10 +28,10 @@ template <index_t BlockSize,
           index_t DstVectorWriteDim,
           index_t SrcDataPerRead,
           index_t DstDataPerWrite,
-          AddressSpace SrcAddressSpace          = AddressSpace::generic,
-          AddressSpace ThreadBufferAddressSpace = AddressSpace::generic,
-          AddressSpace DstAddressSpace          = AddressSpace::generic,
-          InMemoryDataOperation DstInMemOp      = InMemoryDataOperation::none>
+          AddressSpace SrcAddressSpace          = AddressSpace::Generic,
+          AddressSpace ThreadBufferAddressSpace = AddressSpace::Generic,
+          AddressSpace DstAddressSpace          = AddressSpace::Generic,
+          InMemoryDataOperation DstInMemOp      = InMemoryDataOperation::Set>
 struct BlockwiseGenericTensorSliceCopy_v4
 {
     static constexpr index_t nDim = BlockSrcDesc::GetNumOfDimension();

@@ -115,7 +115,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
     template <typename BlockSrcData, typename BlockDstData>
     __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
     {
-        static_assert(ThreadBufferAddressSpace == AddressSpace::vgpr,
+        static_assert(ThreadBufferAddressSpace == AddressSpace::Vgpr,
                       "wrong! This function use vgpr as its thread "
                       "buffer. However, you have set RunLoadThreadBuffer and RunStoreThreadBuffer "
                       "to use ThreadBufferAddressSpace as their thread buffer, which is not vgpr. "

@@ -157,7 +157,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
                                                  1,
                                                  SrcAddressSpace,
                                                  ThreadBufferAddressSpace,
-                                                 InMemoryDataOperation::none>;
+                                                 InMemoryDataOperation::Set>;

        using ThreadwiseStore = ThreadwiseGenericTensorSliceCopy_v4r2<ThreadBufferDesc,
                                                                      BlockDstDesc,
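The static_assert and the ThreadwiseLoad/ThreadwiseStore defaults above encode the contract that Run() stages data through a register (vgpr) thread buffer: load from SrcAddressSpace into registers, then commit into DstAddressSpace with DstInMemOp. Below is a simplified, library-agnostic sketch of that two-phase pattern; the names and types are placeholders, not the actual ck API.

#include <cstddef>

enum class MemOp { Set, AtomicAdd };

// Placeholder for the blockwise copy's two-phase Run(): stage through a
// per-thread buffer, then commit to the destination with the chosen MemOp.
template <typename T, std::size_t N>
void run_copy(const T* src, T* dst, MemOp op)
{
    T thread_buffer[N]; // stands in for the vgpr thread buffer

    for(std::size_t i = 0; i < N; ++i) // analogous to RunLoadThreadBuffer
        thread_buffer[i] = src[i];

    for(std::size_t i = 0; i < N; ++i) // analogous to RunStoreThreadBuffer
    {
        if(op == MemOp::Set)
            dst[i] = thread_buffer[i];
        else
            dst[i] += thread_buffer[i]; // AtomicAdd stand-in (not actually atomic here)
    }
}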
composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy_deprecated.hpp

@@ -499,7 +499,7 @@ struct BlockwiseGenericTensorSliceCopy_v2_deprecated
                                      ThreadBufferData* p_thread_buffer) const
     {
         constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
+            integral_constant<AddressSpace, AddressSpace::Generic>{};

         RunLoadThreadBuffer(p_block_src, p_thread_buffer, generic_address_space, generic_address_space);

@@ -529,7 +529,7 @@ struct BlockwiseGenericTensorSliceCopy_v2_deprecated
                                       BlockDstData* p_block_dst) const
     {
         constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
+            integral_constant<AddressSpace, AddressSpace::Generic>{};

         RunStoreThreadBuffer(p_thread_buffer, p_block_dst, generic_address_space, generic_address_space);

@@ -548,7 +548,7 @@ struct BlockwiseGenericTensorSliceCopy_v2_deprecated
         BlockSrcData p_thread_buffer[GetThreadBufferSize()];

         constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
+            integral_constant<AddressSpace, AddressSpace::Generic>{};

         RunLoadThreadBuffer(p_block_src, p_thread_buffer, block_src_address_space, generic_address_space);

@@ -562,7 +562,7 @@ struct BlockwiseGenericTensorSliceCopy_v2_deprecated
     __device__ void Run(const BlockSrcData* p_block_src, BlockDstData* p_block_dst) const
     {
         constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
+            integral_constant<AddressSpace, AddressSpace::Generic>{};

         Run(p_block_src, p_block_dst, generic_address_space, generic_address_space);
     }
composable_kernel/include/tensor_operation/gridwise_gemm.hpp
View file @
1a66e35b
...
...
@@ -22,15 +22,15 @@ template <index_t GridSize,
index_t
MPerBlock
,
index_t
NPerBlock
,
index_t
KPerBlock
,
index_t
MPerThreadSubC
,
index_t
NPerThreadSubC
,
index_t
MPerThread
,
index_t
NPerThread
,
index_t
KPerThread
,
index_t
MLevel0Cluster
,
index_t
NLevel0Cluster
,
index_t
MLevel1Cluster
,
index_t
NLevel1Cluster
,
index_t
KPerThreadLoop
,
index_t
ThreadGemmDataPerReadM
,
index_t
ThreadGemmDataPerReadN
,
index_t
ThreadGemmAThreadCopySrcDataPerRead_M
,
index_t
ThreadGemmBThreadCopySrcDataPerRead_N
,
typename
ABlockCopyThreadSliceLengths_K_M
,
typename
ABlockCopyThreadClusterLengths_K_M
,
typename
ABlockCopyThreadClusterArrangeOrder
,
...
...
@@ -54,8 +54,8 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
{
constexpr
index_t
max_lds_align
=
math
::
lcm
(
ABlockCopyDstDataPerWrite_M
,
BBlockCopyDstDataPerWrite_N
,
ThreadGemmDataPerReadM
,
ThreadGemmDataPerReadN
);
ThreadGemm
AThreadCopySrc
DataPerRead
_
M
,
ThreadGemm
BThreadCopySrc
DataPerRead
_
N
);
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
...
...
@@ -101,8 +101,8 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
// lds max alignment
constexpr
index_t
max_lds_align
=
math
::
lcm
(
ABlockCopyDstDataPerWrite_M
,
BBlockCopyDstDataPerWrite_N
,
ThreadGemmDataPerReadM
,
ThreadGemmDataPerReadN
);
ThreadGemm
AThreadCopySrc
DataPerRead
_
M
,
ThreadGemm
BThreadCopySrc
DataPerRead
_
N
);
// divide block work by [M, N]
static_assert
(
M
%
MPerBlock
==
0
&&
N
%
NPerBlock
==
0
&&
K
%
KPerBlock
==
0
,
...
...
@@ -139,10 +139,10 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
1
,
ABlockCopySrcDataPerRead
,
ABlockCopyDstDataPerWrite_M
,
AddressSpace
::
g
lobal
,
AddressSpace
::
v
gpr
,
AddressSpace
::
l
ds
,
InMemoryDataOperation
::
none
>
(
AddressSpace
::
G
lobal
,
AddressSpace
::
V
gpr
,
AddressSpace
::
L
ds
,
InMemoryDataOperation
::
Set
>
(
{
0
,
m_block_data_on_global
},
{
0
,
0
});
// B matrix in LDS memory, dst of blockwise copy
...
...
@@ -165,10 +165,10 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
1
,
BBlockCopySrcDataPerRead
,
BBlockCopyDstDataPerWrite_N
,
AddressSpace
::
g
lobal
,
AddressSpace
::
v
gpr
,
AddressSpace
::
l
ds
,
InMemoryDataOperation
::
none
>
(
AddressSpace
::
G
lobal
,
AddressSpace
::
V
gpr
,
AddressSpace
::
L
ds
,
InMemoryDataOperation
::
Set
>
(
{
0
,
n_block_data_on_global
},
{
0
,
0
});
// GEMM definition
...
...
@@ -181,35 +181,33 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
         constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor(b_k_n_block_desc);

         // sanity check
-        static_assert(MPerBlock % (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster) == 0 &&
-                          NPerBlock % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
+        static_assert(MPerBlock % (MPerThread * MLevel0Cluster * MLevel1Cluster) == 0 &&
+                          NPerBlock % (NPerThread * NLevel0Cluster * NLevel1Cluster) == 0,
                       "wrong!");

-        constexpr index_t GemmMRepeat = MPerBlock / (MPerThreadSubC * MLevel0Cluster * MLevel1Cluster);
-        constexpr index_t GemmNRepeat = NPerBlock / (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster);
+        constexpr index_t GemmMRepeat = MPerBlock / (MPerThread * MLevel0Cluster * MLevel1Cluster);
+        constexpr index_t GemmNRepeat = NPerBlock / (NPerThread * NLevel0Cluster * NLevel1Cluster);

         // c_thread_mtx definition: this is a mess
         // TODO:: more elegent way of defining c_thread_mtx
         constexpr auto c_m0m1_n0n1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
-            Number<GemmMRepeat * MPerThreadSubC>{}, Number<GemmNRepeat * NPerThreadSubC>{});
+            Number<GemmMRepeat * MPerThread>{}, Number<GemmNRepeat * NPerThread>{});

         const auto blockwise_gemm =
             BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<BlockSize,
                                                                     decltype(a_k_m_block_mtx_desc),
                                                                     decltype(b_k_n_block_mtx_desc),
                                                                     decltype(c_m0m1_n0n1_thread_mtx_desc),
-                                                                    MPerThreadSubC,
-                                                                    NPerThreadSubC,
+                                                                    MPerThread,
+                                                                    NPerThread,
                                                                     MLevel0Cluster,
                                                                     NLevel0Cluster,
                                                                     MLevel1Cluster,
                                                                     NLevel1Cluster,
-                                                                    KPerThreadLoop,
-                                                                    ThreadGemmDataPerReadM,
-                                                                    ThreadGemmDataPerReadN>{};
+                                                                    KPerThread,
+                                                                    ThreadGemmAThreadCopySrcDataPerRead_M,
+                                                                    ThreadGemmBThreadCopySrcDataPerRead_N>{};

         // LDS allocation for A and B: be careful of alignment
         constexpr index_t a_block_space =
...
...
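GemmMRepeat/GemmNRepeat count how many times each thread's sub-tile is replayed to cover the block tile. A worked example with assumed numbers (not taken from the commit):

#include <cstdio>

int main()
{
    // assumed tiling parameters
    constexpr int MPerBlock = 128, MPerThread = 4, MLevel0Cluster = 4, MLevel1Cluster = 4;
    constexpr int NPerBlock = 128, NPerThread = 4, NLevel0Cluster = 4, NLevel1Cluster = 4;

    // one pass of the thread cluster covers 4 * 4 * 4 = 64 rows/columns,
    // so each thread repeats its sub-tile twice along M and twice along N
    constexpr int GemmMRepeat = MPerBlock / (MPerThread * MLevel0Cluster * MLevel1Cluster);
    constexpr int GemmNRepeat = NPerBlock / (NPerThread * NLevel0Cluster * NLevel1Cluster);

    // the per-thread C tile is (GemmMRepeat * MPerThread) x (GemmNRepeat * NPerThread)
    std::printf("GemmMRepeat = %d, GemmNRepeat = %d, C thread tile = %d x %d\n",
                GemmMRepeat, GemmNRepeat, GemmMRepeat * MPerThread, GemmNRepeat * NPerThread);
}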
@@ -233,6 +231,9 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
             b_blockwise_copy.Run(p_b_global, p_b_block_double);
         }

+        constexpr auto a_block_slice_copy_steps = Sequence<KPerBlock, 0>{};
+        constexpr auto b_block_slice_copy_steps = Sequence<KPerBlock, 0>{};
+
         // LDS double buffer: main body
         for(index_t k_block_data_begin = 0; k_block_data_begin + 2 * KPerBlock < K;
             k_block_data_begin += 2 * KPerBlock)
...
...
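The named slice-copy steps feed the double-buffered main loop, which advances two K-tiles per iteration so that one LDS buffer can be filled while the other is consumed. A host-side control-flow sketch with stand-in load/compute functions (the real kernel overlaps these stages on the GPU; this only illustrates the iteration pattern, under assumed K and KPerBlock):

#include <cstdio>

// stand-ins for "global -> LDS copy" and "blockwise GEMM on an LDS buffer"
void load_tile(int k_begin, int buf) { std::printf("load k=%2d -> buffer %d\n", k_begin, buf); }
void gemm_tile(int k_begin, int buf) { std::printf("gemm k=%2d on  buffer %d\n", k_begin, buf); }

int main()
{
    constexpr int K = 64, KPerBlock = 8; // assumed sizes

    load_tile(0, 0); // prologue: fill buffer 0

    int k = 0;
    for(; k + 2 * KPerBlock < K; k += 2 * KPerBlock)
    {
        load_tile(k + KPerBlock, 1);     // prefetch the next tile into the idle buffer
        gemm_tile(k, 0);                 // compute on the tile already resident
        load_tile(k + 2 * KPerBlock, 0); // same again with the buffer roles swapped
        gemm_tile(k + KPerBlock, 1);
    }

    gemm_tile(k, 0); // tail: the last one or two tiles are handled outside the main loop
    if(k + KPerBlock < K)
    {
        load_tile(k + KPerBlock, 1);
        gemm_tile(k + KPerBlock, 1);
    }
}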
@@ -255,8 +256,8 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
                 Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
                 Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];

-                a_blockwise_copy.MoveSrcSliceWindow(Sequence<KPerBlock, 0>{}, True);
-                b_blockwise_copy.MoveSrcSliceWindow(Sequence<KPerBlock, 0>{}, True);
+                a_blockwise_copy.MoveSrcSliceWindow(a_block_slice_copy_steps, True);
+                b_blockwise_copy.MoveSrcSliceWindow(b_block_slice_copy_steps, True);

                 __syncthreads();
...
...
@@ -282,8 +283,8 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
                 Float p_a_thread_buffer[a_blockwise_copy.GetThreadBufferSize()];
                 Float p_b_thread_buffer[b_blockwise_copy.GetThreadBufferSize()];

-                a_blockwise_copy.MoveSrcSliceWindow(Sequence<KPerBlock, 0>{}, True);
-                b_blockwise_copy.MoveSrcSliceWindow(Sequence<KPerBlock, 0>{}, True);
+                a_blockwise_copy.MoveSrcSliceWindow(a_block_slice_copy_steps, True);
+                b_blockwise_copy.MoveSrcSliceWindow(b_block_slice_copy_steps, True);

                 __syncthreads();
...
...
@@ -317,16 +318,16 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
         // input: register to global memory
         {
-            constexpr index_t M1 = MPerThreadSubC * MLevel0Cluster * MLevel1Cluster;
+            constexpr index_t M1 = MPerThread * MLevel0Cluster * MLevel1Cluster;
             constexpr index_t M0 = M / M1;

-            constexpr index_t N1 = NPerThreadSubC * NLevel0Cluster * NLevel1Cluster;
+            constexpr index_t N1 = NPerThread * NLevel0Cluster * NLevel1Cluster;
             constexpr index_t N0 = N / N1;

             // define input tensor descriptor for threadwise copy
             //     thread input tensor, src of threadwise copy
             constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed(
-                Sequence<GemmMRepeat, MPerThreadSubC, GemmNRepeat, NPerThreadSubC>{});
+                Sequence<GemmMRepeat, MPerThread, GemmNRepeat, NPerThread>{});

             constexpr auto c_m0_m1_n0_n1_global_desc = transform_tensor_descriptor(
                 c_m_n_global_desc,
...
...
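M1/N1 are the output rows/columns one pass of the thread cluster covers, and M0/N0 count those passes across the whole matrix, which is why C is written out through an (M0, M1, N0, N1) view. A small worked example with assumed sizes:

#include <cstdio>

int main()
{
    // assumed problem and tiling sizes
    constexpr int M = 1024, N = 1024;
    constexpr int MPerThread = 4, MLevel0Cluster = 4, MLevel1Cluster = 4;
    constexpr int NPerThread = 4, NLevel0Cluster = 4, NLevel1Cluster = 4;

    constexpr int M1 = MPerThread * MLevel0Cluster * MLevel1Cluster; // 64 rows per cluster pass
    constexpr int M0 = M / M1;                                       // 16 passes along M
    constexpr int N1 = NPerThread * NLevel0Cluster * NLevel1Cluster; // 64 columns per cluster pass
    constexpr int N0 = N / N1;                                       // 16 passes along N

    std::printf("C viewed as [M0=%d][M1=%d][N0=%d][N1=%d]\n", M0, M1, N0, N1);
}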
@@ -352,8 +353,8 @@ struct GridwiseGemmTransposedANormalBNormalC_v1
                 CThreadCopySrcDstVectorReadWriteDim,
                 1,
                 CThreadCopyDstDataPerWrite,
-                AddressSpace::vgpr,
-                AddressSpace::global,
+                AddressSpace::Vgpr,
+                AddressSpace::Global,
                 CGlobalMemoryDataOperation>({0, 0, 0, 0},
                                             {m_thread_data_on_global / M1,
...
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
...
...
@@ -21,9 +21,9 @@ template <typename SrcDesc,
           index_t SrcDstVectorReadWriteDim,
           index_t SrcDataPerRead,
           index_t DstDataPerWrite,
-          AddressSpace SrcAddressSpace     = AddressSpace::generic,
-          AddressSpace DstAddressSpace     = AddressSpace::generic,
-          InMemoryDataOperation DstInMemOp = InMemoryDataOperation::none>
+          AddressSpace SrcAddressSpace     = AddressSpace::Generic,
+          AddressSpace DstAddressSpace     = AddressSpace::Generic,
+          InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set>
 struct ThreadwiseGenericTensorSliceCopy_v4r2
 {
     static constexpr index_t nDim = SliceLengths::Size();
...
...
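Defaulting the address-space and memory-operation template parameters keeps existing call sites source-compatible while switching to the capitalized enumerators. A simplified sketch of the idiom (SliceCopySketch and its parameters are stand-ins, not the real ThreadwiseGenericTensorSliceCopy_v4r2 signature):

enum class AddressSpace { Generic, Global, Lds, Vgpr };
enum class InMemoryDataOperation { Set };

template <int DataPerAccess,
          AddressSpace SrcAddressSpace     = AddressSpace::Generic,
          AddressSpace DstAddressSpace     = AddressSpace::Generic,
          InMemoryDataOperation DstInMemOp = InMemoryDataOperation::Set>
struct SliceCopySketch
{
    static constexpr AddressSpace src_space = SrcAddressSpace;
};

// both instantiations compile; the first relies entirely on the defaults
static_assert(SliceCopySketch<4>::src_space == AddressSpace::Generic, "default applied");
static_assert(SliceCopySketch<4, AddressSpace::Global>::src_space == AddressSpace::Global,
              "explicit argument wins");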
@@ -115,8 +115,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 transfer_data<SrcData,
                               SrcDataPerRead,
                               SrcAddressSpace,
-                              AddressSpace::vgpr,
-                              InMemoryDataOperation::none>(
+                              AddressSpace::Vgpr,
+                              InMemoryDataOperation::Set>(
                     p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
             }
         }
...
...
@@ -146,7 +146,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             {
                 transfer_data<DstData,
                               DstDataPerWrite,
-                              AddressSpace::vgpr,
+                              AddressSpace::Vgpr,
                               DstAddressSpace,
                               DstInMemOp>(
                     p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
...
...
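Reading the surrounding code, the v4r2 copy appears to stage data through a thread-private long-vector buffer: a read from the source address space into registers (Vgpr, with a plain Set), then a write from that buffer into the destination address space. A stand-in sketch of that two-stage pattern (transfer_sketch is hypothetical, not the repo's transfer_data):

#include <cstddef>
#include <cstdio>

// element-wise stand-in for a vectorized transfer of DataPerAccess elements
template <typename T, int DataPerAccess>
void transfer_sketch(const T* src, std::size_t src_off, T* dst, std::size_t dst_off)
{
    for(int i = 0; i < DataPerAccess; ++i)
        dst[dst_off + i] = src[src_off + i];
}

int main()
{
    float global_src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float global_dst[8] = {};
    float long_vector[4]; // thread-private staging buffer ("vgpr" in the kernel)

    transfer_sketch<float, 4>(global_src, 4, long_vector, 0); // source -> register buffer
    transfer_sketch<float, 4>(long_vector, 0, global_dst, 0); // register buffer -> destination

    std::printf("%g %g %g %g\n", global_dst[0], global_dst[1], global_dst[2], global_dst[3]);
}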
@@ -265,8 +265,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 transfer_data<SrcData,
                               SrcDataPerRead,
                               SrcAddressSpace,
-                              AddressSpace::vgpr,
-                              InMemoryDataOperation::none>(
+                              AddressSpace::Vgpr,
+                              InMemoryDataOperation::Set>(
                     p_src,
                     src_nonlinear_coord.GetOffset() + src_linear_offset,
                     p_src_long_vector,
...
...
@@ -303,7 +303,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             {
                 transfer_data<DstData,
                               DstDataPerWrite,
-                              AddressSpace::vgpr,
+                              AddressSpace::Vgpr,
                               DstAddressSpace,
                               DstInMemOp>(
                     p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset());
...
...
@@ -404,8 +404,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 transfer_data<SrcData,
                               SrcDataPerRead,
                               SrcAddressSpace,
-                              AddressSpace::vgpr,
-                              InMemoryDataOperation::none>(
+                              AddressSpace::Vgpr,
+                              InMemoryDataOperation::Set>(
                     p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
             }
         }
...
...
@@ -448,7 +448,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
             {
                 transfer_data<DstData,
                               DstDataPerWrite,
-                              AddressSpace::vgpr,
+                              AddressSpace::Vgpr,
                               DstAddressSpace,
                               DstInMemOp>(p_dst_long_vector,
                                           buffer_offset,
...
...
composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy_deprecated.hpp
...
...
@@ -333,7 +333,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
         //   2. src_normal_offset must be calculatd at compile time (guaranteed by
         //      algorithm)
         //   3. src_merged_offset can be runtime value (no assumption imposed)
-        static_if<SrcAddressSpace == AddressSpace::global>{}([&](auto fwd) {
+        static_if<SrcAddressSpace == AddressSpace::Global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
             vector_data = amd_intrinsic_buffer_load<SrcData, SrcDataPerAccess>(
                 fwd(p_src), src_merged_offset, src_normal_offset);
...
...
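static_if<> selects the buffer-addressing path at compile time when the source lives in global memory. As an analogy only (not the repo's helper), the same compile-time branch in plain C++17 looks like this, with stand-in names:

enum class AddressSpace { Generic, Global };

template <AddressSpace SrcAddressSpace, typename T>
T load_element(const T* p, int offset)
{
    if constexpr(SrcAddressSpace == AddressSpace::Global)
    {
        // on AMD GPUs this branch would use a buffer-load intrinsic
        return p[offset];
    }
    else
    {
        // generic fallback: ordinary pointer dereference
        return p[offset];
    }
}

int main()
{
    float data[4] = {1.f, 2.f, 3.f, 4.f};
    return load_element<AddressSpace::Global>(data, 2) == 3.f ? 0 : 1;
}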
@@ -442,7 +442,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
         //   2. dst_normal_offset must be calculatd at compile time (guaranteed by
         //      algorithm)
         //   3. dst_merged_offset can be runtime value (no assumption imposed)
-        static_if<DstAddressSpace == AddressSpace::global>{}([&](auto fwd) {
+        static_if<DstAddressSpace == AddressSpace::Global>{}([&](auto fwd) {
 #if CK_USE_AMD_BUFFER_ADDRESSING
             amd_intrinsic_buffer_store<DstData, DstDataPerAccess>(
                 vector_data, fwd(p_dst), dst_merged_offset, dst_normal_offset);
...
...
@@ -464,7 +464,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1_deprecated
     __device__ void Run(const SrcData* p_src, DstData* p_dst) const
     {
         constexpr auto generic_address_space =
-            integral_constant<AddressSpace, AddressSpace::generic>{};
+            integral_constant<AddressSpace, AddressSpace::Generic>{};

         Run(p_src, p_dst, generic_address_space, generic_address_space);
     }
...
...
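Wrapping the enum value in an integral_constant turns the address space into a distinct type, so the enclosing Run overload can forward to an implementation chosen at compile time. A simplified, self-contained sketch of that tag-dispatch idiom (run_impl and address_space_t are stand-ins for illustration):

#include <type_traits>

enum class AddressSpace { Generic, Global };

template <AddressSpace AS>
using address_space_t = std::integral_constant<AddressSpace, AS>;

// each overload takes a different tag type, so the choice is made by overload resolution
void run_impl(address_space_t<AddressSpace::Generic>) { /* generic path */ }
void run_impl(address_space_t<AddressSpace::Global>)  { /* global-memory path */ }

int main()
{
    constexpr auto generic_address_space = address_space_t<AddressSpace::Generic>{};
    run_impl(generic_address_space); // resolved at compile time
}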