Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
6e28a8ac
Commit
6e28a8ac
authored
Mar 06, 2023
by
aska-0096
Browse files
format
parent
708fd81f
Changes
9
Show whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
77 additions
and
100 deletions
+77
-100
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
...ple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
+2
-2
include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
...l/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+6
-8
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
.../gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
+7
-10
include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
.../ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
+9
-12
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
...impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+27
-28
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
...ation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
+13
-18
include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
+10
-14
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
...operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
+2
-7
script/clang-format-overwrite.sh
script/clang-format-overwrite.sh
+1
-1
No files found.
example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc
View file @
6e28a8ac
include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
View file @
6e28a8ac
...
...
@@ -167,7 +167,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
// lengths for K0, K1, ...
const
auto
kLengths
=
get_container_subset
(
a_ms_ks_lengths
,
kDimIds
);
const
auto
a_grid_desc_m_k
=
[
&
](){
const
auto
a_grid_desc_m_k
=
[
&
]()
{
if
constexpr
(
ASpec
==
TensorSpecialization
::
Packed
)
{
auto
M
=
container_reduce
(
mLengths
,
math
::
multiplies
{},
Number
<
1
>
{});
...
...
@@ -256,7 +256,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
// lengths for N0, N1, ...
const
auto
nLengths
=
get_container_subset
(
b_ns_ks_lengths
,
nDimIds
);
const
auto
b_grid_desc_n_k
=
[
&
](){
const
auto
b_grid_desc_n_k
=
[
&
]()
{
if
constexpr
(
BSpec
==
TensorSpecialization
::
Packed
)
{
auto
N
=
container_reduce
(
nLengths
,
math
::
multiplies
{},
Number
<
1
>
{});
...
...
@@ -522,8 +522,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
EGridDesc_G_M_N
e_grid_desc_g_m_n_
;
};
using
AGridDesc
=
decltype
(
DeviceOp
::
MakeAGridDescriptor
({},{}));
using
BGridDesc
=
decltype
(
DeviceOp
::
MakeBGridDescriptor
({},{}));
using
AGridDesc
=
decltype
(
DeviceOp
::
MakeAGridDescriptor
({},
{}));
using
BGridDesc
=
decltype
(
DeviceOp
::
MakeBGridDescriptor
({},
{}));
// GridwiseOp
using
GridwiseOp
=
GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
<
...
...
@@ -648,7 +648,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
e_grid_desc_m_n_
=
DeviceOp
::
MakeEGridDescriptor_M_N
(
e_gs_ms_ns_lengths
,
e_gs_ms_ns_strides
);
block_2_ctile_map_
=
GridwiseOp
::
MakeDefaultBlock2CTileMap
(
e_grid_desc_m_n_
,
M01
,
N01
);
ds_grid_desc_mblock_mperblock_nblock_nperblock
=
...
...
@@ -686,7 +685,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
DsGridDesc_G_M_N
ds_grid_desc_g_m_n_
;
EGridDesc_G_M_N
e_grid_desc_g_m_n_
;
typename
GridwiseOp
::
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
ds_grid_desc_mblock_mperblock_nblock_nperblock
;
typename
GridwiseOp
::
EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp
View file @
6e28a8ac
...
...
@@ -18,7 +18,6 @@
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
namespace
ck
{
namespace
tensor_operation
{
namespace
device
{
...
...
@@ -163,16 +162,14 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD<ALayout,
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
RowMajor
,
BLayout
>::
value
)
{
const
auto
b_grid_desc_nraw_kraw
=
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
I1
,
StrideB
));
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
I1
,
StrideB
));
return
matrix_padder
.
PadBDescriptor_N_K
(
b_grid_desc_nraw_kraw
);
}
else
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
ColumnMajor
,
BLayout
>::
value
)
{
const
auto
b_grid_desc_nraw_kraw
=
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
StrideB
,
I1
));
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
StrideB
,
I1
));
return
matrix_padder
.
PadBDescriptor_N_K
(
b_grid_desc_nraw_kraw
);
}
...
...
include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp
View file @
6e28a8ac
...
...
@@ -153,16 +153,14 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
RowMajor
,
BLayout
>::
value
)
{
const
auto
b_grid_desc_nraw_kraw
=
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
I1
,
StrideB
));
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
I1
,
StrideB
));
return
matrix_padder
.
PadBDescriptor_N_K
(
b_grid_desc_nraw_kraw
);
}
else
if
constexpr
(
is_same
<
tensor_layout
::
gemm
::
ColumnMajor
,
BLayout
>::
value
)
{
const
auto
b_grid_desc_nraw_kraw
=
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
StrideB
,
I1
));
make_naive_tensor_descriptor
(
make_tuple
(
NRaw
,
KRaw
),
make_tuple
(
StrideB
,
I1
));
return
matrix_padder
.
PadBDescriptor_N_K
(
b_grid_desc_nraw_kraw
);
}
...
...
@@ -304,8 +302,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm<ALayout,
c_element_op_
{
c_element_op
}
{
a_grid_desc_
=
DeviceGemmWmma_CShuffle
::
MakeAGridDescriptor
(
M
,
K
,
StrideA
);
b_grid_desc_k0_n_k1_
=
DeviceGemmWmma_CShuffle
::
MakeBGridDescriptor
(
K
,
N
,
StrideB
);
b_grid_desc_k0_n_k1_
=
DeviceGemmWmma_CShuffle
::
MakeBGridDescriptor
(
K
,
N
,
StrideB
);
c_grid_desc_m_n_
=
DeviceGemmWmma_CShuffle
::
MakeCGridDescriptor_M_N
(
M
,
N
,
StrideC
);
block_2_ctile_map_
=
...
...
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
View file @
6e28a8ac
...
...
@@ -184,8 +184,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
MatrixPadder
<
GemmSpec
,
index_t
,
index_t
,
index_t
>
{
MPerBlock
,
NPerBlock
,
KPerBlock
};
template
<
typename
ALay
>
static
auto
MakeAGridDescriptor
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
static
auto
MakeAGridDescriptor
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
a_g_n_c_wis_strides
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
,
...
...
@@ -244,8 +243,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
}
template
<
typename
BLay
>
static
auto
MakeBGridDescriptor
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
static
auto
MakeBGridDescriptor
(
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_lengths
,
const
std
::
array
<
index_t
,
NDimSpatial
+
3
>&
b_g_k_c_xs_strides
)
{
const
auto
wei_gemmnraw_gemmkraw_desc
=
...
...
@@ -320,7 +318,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
using
DsGridDesc_M_N
=
remove_cvref_t
<
decltype
(
MakeDsGridDescriptor_M_N
({},
{}))
>
;
using
EGridDesc_M_N
=
remove_cvref_t
<
decltype
(
MakeEGridDescriptor_M_N
<
ELayout
>
({},
{}))
>
;
using
AGridDesc
=
decltype
(
DeviceOp
::
MakeAGridDescriptor
<
ALayout
>
({},
{},
{},
{},
{},
{},
{},
{},
{},
{}));
using
AGridDesc
=
decltype
(
DeviceOp
::
MakeAGridDescriptor
<
ALayout
>
({},
{},
{},
{},
{},
{},
{},
{},
{},
{}));
using
BGridDesc
=
decltype
(
DeviceOp
::
MakeBGridDescriptor
<
BLayout
>
({},
{}));
// GridwiseOp
...
...
@@ -423,8 +422,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
conv_filter_dilations
,
input_left_pads
,
input_right_pads
)},
b_grid_desc_
{
DeviceOp
::
MakeBGridDescriptor
<
BLayout
>
(
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
)},
b_grid_desc_
{
DeviceOp
::
MakeBGridDescriptor
<
BLayout
>
(
b_g_k_c_xs_lengths
,
b_g_k_c_xs_strides
)},
ds_grid_desc_mblock_mperblock_nblock_nperblock_
{},
e_grid_desc_mblock_mperblock_nblock_nperblock_
{},
block_2_etile_map_
{
GridwiseOp
::
MakeDefaultBlock2CTileMap
(
e_grid_desc_m_n_
,
M01
,
N01
)},
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp
View file @
6e28a8ac
...
...
@@ -513,8 +513,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
}
template
<
typename
BBlockDesc_
>
__host__
__device__
static
constexpr
auto
MakeBWaveDescriptor
(
const
BBlockDesc_
&
)
__host__
__device__
static
constexpr
auto
MakeBWaveDescriptor
(
const
BBlockDesc_
&
)
{
constexpr
auto
b_wave_desc
=
[
&
]()
{
if
constexpr
(
BEnableLds
)
...
...
@@ -595,8 +594,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
// block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
template
<
typename
Block2CTileMap
>
__host__
__device__
static
constexpr
bool
CheckValidity
(
const
AGridDesc
&
a_grid_desc
,
__host__
__device__
static
constexpr
bool
CheckValidity
(
const
AGridDesc
&
a_grid_desc
,
const
BGridDesc
&
b_grid_desc
,
const
DsGridDesc_M_N
&
ds_grid_desc_m_n
,
const
EGridDesc_M_N
&
e_grid_desc_m_n
,
...
...
@@ -628,13 +626,11 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
if
constexpr
(
BEnableLds
)
{
return
make_tuple
(
b_grid_desc
.
GetLength
(
I1
),
b_grid_desc
.
GetLength
(
I0
)
*
b_grid_desc
.
GetLength
(
I2
));
b_grid_desc
.
GetLength
(
I0
)
*
b_grid_desc
.
GetLength
(
I2
));
}
else
{
return
make_tuple
(
b_grid_desc
.
GetLength
(
I1
)
*
b_grid_desc
.
GetLength
(
I2
)
*
return
make_tuple
(
b_grid_desc
.
GetLength
(
I1
)
*
b_grid_desc
.
GetLength
(
I2
)
*
b_grid_desc
.
GetLength
(
I4
),
b_grid_desc
.
GetLength
(
I0
)
*
b_grid_desc
.
GetLength
(
I3
)
*
b_grid_desc
.
GetLength
(
I5
));
...
...
@@ -747,8 +743,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle
max_lds_align
)
:
0
;
static
constexpr
auto
b_block_space_size_aligned
=
BEnableLds
?
math
::
integer_least_multiple
(
MakeBBlockDescriptor
().
GetElementSpaceSize
(),
BEnableLds
?
math
::
integer_least_multiple
(
MakeBBlockDescriptor
().
GetElementSpaceSize
(),
max_lds_align
)
:
0
;
...
...
include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp
View file @
6e28a8ac
...
...
@@ -327,8 +327,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
}
template
<
typename
BBlockDesc_
>
__host__
__device__
static
constexpr
auto
MakeBWaveDescriptor
(
const
BBlockDesc_
&
)
__host__
__device__
static
constexpr
auto
MakeBWaveDescriptor
(
const
BBlockDesc_
&
)
{
constexpr
auto
b_wave_desc
=
[
&
]()
{
if
constexpr
(
BEnableLds
)
...
...
@@ -394,8 +393,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
// block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01}
template
<
typename
Block2CTileMap
>
__host__
__device__
static
constexpr
bool
CheckValidity
(
const
AGridDesc
&
a_grid_desc
,
__host__
__device__
static
constexpr
bool
CheckValidity
(
const
AGridDesc
&
a_grid_desc
,
const
BGridDesc
&
b_grid_desc
,
const
CGridDesc_M_N
&
c_grid_desc_m_n
,
const
Block2CTileMap
&
block_2_ctile_map
)
...
...
@@ -426,13 +424,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma
if
constexpr
(
BEnableLds
)
{
return
make_tuple
(
b_grid_desc
.
GetLength
(
I1
),
b_grid_desc
.
GetLength
(
I0
)
*
b_grid_desc
.
GetLength
(
I2
));
b_grid_desc
.
GetLength
(
I0
)
*
b_grid_desc
.
GetLength
(
I2
));
}
else
{
return
make_tuple
(
b_grid_desc
.
GetLength
(
I1
)
*
b_grid_desc
.
GetLength
(
I2
)
*
return
make_tuple
(
b_grid_desc
.
GetLength
(
I1
)
*
b_grid_desc
.
GetLength
(
I2
)
*
b_grid_desc
.
GetLength
(
I4
),
b_grid_desc
.
GetLength
(
I0
)
*
b_grid_desc
.
GetLength
(
I3
)
*
b_grid_desc
.
GetLength
(
I5
));
...
...
include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp
View file @
6e28a8ac
...
...
@@ -1399,12 +1399,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow
if
constexpr
(
IntraRowSwizzlePerm
)
{
temp
=
__builtin_amdgcn_permlane16
(
temp
,
type_convert
<
int
>
(
v_this_row
),
0xb3a29180
,
0xf7e6d5c4
,
1
,
0
);
temp
,
type_convert
<
int
>
(
v_this_row
),
0xb3a29180
,
0xf7e6d5c4
,
1
,
0
);
v_this_row
=
type_convert
<
SrcData
>
(
temp
);
}
...
...
script/clang-format-overwrite.sh
View file @
6e28a8ac
#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
#
find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}'
git status
--porcelain
|
awk
'$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}'
| xargs
-n
1
-P
16
-I
{}
-t
sh
-c
'clang-format-10 -i -style=file {}'
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment