gaoqiong / composable_kernel
Commit 3e8cc094 (unverified), authored Nov 15, 2022 by rocking5566, committed via GitHub on Nov 15, 2022

Merge branch 'develop' into gemm_layernorm_welford

Parents: 24af0144, 7038723a
Changes: 29 in the full commit; this page shows 9 changed files with 78 additions and 76 deletions (+78 −76).
example/44_conv2d_fwd_quant/conv2d_fwd_xdl_perlayer_quantization_int8.cpp  +2 −1
example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp  +2 −1
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp  +4 −0
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp  +7 −8
include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp  +30 −44
library/include/ck/library/utility/convolution_parameter.hpp  +6 −8
library/include/ck/library/utility/numeric.hpp  +16 −0
library/src/utility/convolution_parameter.cpp  +4 −8
profiler/include/profile_reduce_impl.hpp  +7 −6
example/44_conv2d_fwd_quant/conv2d_fwd_xdl_perlayer_quantization_int8.cpp

@@ -6,6 +6,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"

@@ -131,7 +132,7 @@ bool run_grouped_conv_fwd(bool do_verification,
     std::array<ck::index_t, NDimSpatial> input_left_pads{};
     std::array<ck::index_t, NDimSpatial> input_right_pads{};

-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
+    auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };

     copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
     copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
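The copy lambda now forwards whole containers to ck::ranges::copy (pulled in by the new algorithm.hpp include) instead of spelling out begin()/end() iterator pairs. The header itself is not shown on this page of the diff, so the snippet below is only a sketch of what such a range-to-iterator helper plausibly looks like; the ck_sketch namespace and the main() are illustrative, not library code.

// Sketch of a range-to-output-iterator copy helper in the spirit of ck::ranges::copy;
// ck_sketch is a made-up namespace so this does not collide with the real library.
#include <algorithm>
#include <array>
#include <iterator>
#include <vector>

namespace ck_sketch {
namespace ranges {

// Copy every element of `range` to the destination iterator, like the call sites above.
template <typename Range, typename OutputIterator>
OutputIterator copy(const Range& range, OutputIterator iter)
{
    return std::copy(std::begin(range), std::end(range), iter);
}

} // namespace ranges
} // namespace ck_sketch

int main()
{
    const std::vector<int> lengths{1, 2, 3, 4};
    std::array<int, 4> a_lengths{};
    ck_sketch::ranges::copy(lengths, a_lengths.begin()); // a_lengths is now {1, 2, 3, 4}
}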
example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp

@@ -5,6 +5,7 @@
 #include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp"

+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"

@@ -69,7 +70,7 @@ int main()
                                  static_cast<int>(nhwc[2] * nhwc[3]),
                                  static_cast<int>(nhwc[3])};

-    std::copy(nchw.begin(), nchw.end(), ab_lengths.begin());
+    ck::ranges::copy(nchw, ab_lengths.begin());

     auto broadcastPermute = DeviceElementwisePermuteInstance{};
     auto argument         = broadcastPermute.MakeArgumentPointer(
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp

@@ -116,6 +116,10 @@ __global__ void
     ignore = batch_count;
     ignore = block_2_ctile_map;
     ignore = compute_ptr_offset_of_batch;
+
+    compute_ptr_offset_of_batch.GetAPtrOffset(0);
+    compute_ptr_offset_of_batch.GetBPtrOffset(0);
+    compute_ptr_offset_of_batch.GetCPtrOffset(0);
 #endif // end of if (defined(__gfx908__) || defined(__gfx90a__))
 }
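The kernel body above is compiled only for gfx908/gfx90a; on other targets the parameters are routed through `ignore = ...;` so they do not trigger unused-parameter warnings, and the three added GetAPtrOffset/GetBPtrOffset/GetCPtrOffset(0) calls appear to serve the same purpose, touching the batch-offset helper so nothing in it is flagged as unused. A minimal sketch of that pattern follows; every name in it is invented, not a CK API.

// Illustrative only: keeping a compiled-out kernel body warning-free.
// FakeBatchOffset and fake_kernel_stub are invented names, not CK APIs.
struct FakeBatchOffset
{
    long GetAPtrOffset(int) const { return 0; }
};

template <typename BatchCount>
void fake_kernel_stub(BatchCount batch_count, const FakeBatchOffset& offset_helper)
{
#if 0 // stands in for: defined(__gfx908__) || defined(__gfx90a__)
    // the real device code path would go here
#else
    (void)batch_count;              // same intent as `ignore = batch_count;`
    offset_helper.GetAPtrOffset(0); // touch the helper so nothing is reported as unused
#endif
}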
include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp

@@ -22,6 +22,7 @@
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/host_utility/io.hpp"
+#include "ck/library/utility/numeric.hpp"

 namespace ck {
 namespace tensor_operation {

@@ -410,10 +411,9 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
     {
         const index_t N = r_g_n_wos_lengths[1];

-        const index_t NHoWo = N * std::accumulate(r_g_n_wos_lengths.begin() + 2,
-                                                  r_g_n_wos_lengths.begin() + 2 + NDimSpatial,
-                                                  index_t{1},
-                                                  std::multiplies<index_t>());
+        const index_t NHoWo = N * ck::accumulate_n<index_t>(
+            r_g_n_wos_lengths.begin() + 2, NDimSpatial, 1, std::multiplies<>());

         const auto r_grid_desc_mraw = make_naive_tensor_descriptor_packed(make_tuple(NHoWo));

@@ -435,10 +435,9 @@ struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle
         const index_t WoStride = r_g_n_wos_strides[NDimSpatial + 2];

-        const index_t NHoWo = N * std::accumulate(r_g_n_wos_lengths.begin() + 2,
-                                                  r_g_n_wos_lengths.begin() + 2 + NDimSpatial,
-                                                  index_t{1},
-                                                  std::multiplies<index_t>());
+        const index_t NHoWo = N * ck::accumulate_n<index_t>(
+            r_g_n_wos_lengths.begin() + 2, NDimSpatial, 1, std::multiplies<>());

         const auto r_grid_desc_mraw =
             make_naive_tensor_descriptor(make_tuple(NHoWo), make_tuple(WoStride));
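Both hunks swap an explicit begin/end std::accumulate over the NDimSpatial output lengths for the new ck::accumulate_n, which takes a start iterator and a count. The two forms reduce exactly the same sub-range; the standalone check below uses an invented shape (NDimSpatial = 2, r_g_n_wos_lengths = {G, N, Ho, Wo} = {1, 4, 28, 28}) purely to show they agree.

// Standalone equivalence check; the tensor shape is made up for illustration.
#include <array>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <numeric>

using index_t = std::int32_t;

int main()
{
    constexpr int NDimSpatial = 2;
    const std::array<index_t, 4> r_g_n_wos_lengths{1, 4, 28, 28}; // {G, N, Ho, Wo}
    const index_t N = r_g_n_wos_lengths[1];

    // Old form: iterator pair [begin + 2, begin + 2 + NDimSpatial).
    const index_t nhowo_old = N * std::accumulate(r_g_n_wos_lengths.begin() + 2,
                                                  r_g_n_wos_lengths.begin() + 2 + NDimSpatial,
                                                  index_t{1},
                                                  std::multiplies<index_t>());

    // New form: start iterator plus a count, which is what ck::accumulate_n wraps.
    const index_t nhowo_new =
        N * std::accumulate(r_g_n_wos_lengths.begin() + 2,
                            std::next(r_g_n_wos_lengths.begin() + 2, NDimSpatial),
                            index_t{1},
                            std::multiplies<>());

    assert(nhowo_old == nhowo_new && nhowo_old == 4 * 28 * 28); // both give 3136
    return 0;
}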
include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp

@@ -4,6 +4,7 @@
 #pragma once

+#include "ck/library/utility/numeric.hpp"
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"

@@ -47,10 +48,9 @@ struct TransformConvFwdToGemm
         if constexpr(ConvForwardSpecialization ==
                      device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
-            const index_t NWo = N * std::accumulate(c_g_n_k_wos_lengths.begin() + 3,
-                                                    c_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                    index_t{1},
-                                                    std::multiplies<index_t>());
+            const index_t NWo = N * ck::accumulate_n<index_t>(
+                c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

             const auto in_gemmm_gemmk_desc =
                 make_naive_tensor_descriptor_packed(make_tuple(NWo, C));

@@ -146,10 +146,9 @@ struct TransformConvFwdToGemm
         if constexpr(ConvForwardSpecialization ==
                      device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
-            const index_t NHoWo = N * std::accumulate(c_g_n_k_wos_lengths.begin() + 3,
-                                                      c_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                      index_t{1},
-                                                      std::multiplies<index_t>());
+            const index_t NHoWo = N * ck::accumulate_n<index_t>(
+                c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

             const auto in_gemmm_gemmk_desc =
                 make_naive_tensor_descriptor_packed(make_tuple(NHoWo, C));

@@ -262,10 +261,8 @@ struct TransformConvFwdToGemm
                      device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
             const index_t NDoHoWo =
-                N * std::accumulate(c_g_n_k_wos_lengths.begin() + 3,
-                                    c_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                    index_t{1},
-                                    std::multiplies<index_t>());
+                N * ck::accumulate_n<index_t>(
+                    c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

             const auto in_gemmm_gemmk_desc =
                 make_naive_tensor_descriptor_packed(make_tuple(NDoHoWo, C));

@@ -390,10 +387,9 @@ struct TransformConvFwdToGemm
         if constexpr(ConvForwardSpecialization ==
                      device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
-            const index_t NHoWo = N * std::accumulate(c_g_n_k_wos_lengths.begin() + 3,
-                                                      c_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                      index_t{1},
-                                                      std::multiplies<index_t>());
+            const index_t NHoWo = N * ck::accumulate_n<index_t>(
+                c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

             // This is different
             const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial];

@@ -506,10 +502,9 @@ struct TransformConvFwdToGemm
         if constexpr(ConvForwardSpecialization ==
                      device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
-            const index_t NHoWo = N * std::accumulate(c_g_n_k_wos_lengths.begin() + 3,
-                                                      c_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                      index_t{1},
-                                                      std::multiplies<index_t>());
+            const index_t NHoWo = N * ck::accumulate_n<index_t>(
+                c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

             // This is different
             const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial];

@@ -639,10 +634,8 @@ struct TransformConvFwdToGemm
                      device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
         {
             const index_t NDoHoWo =
-                N * std::accumulate(c_g_n_k_wos_lengths.begin() + 3,
-                                    c_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                    index_t{1},
-                                    std::multiplies<index_t>());
+                N * ck::accumulate_n<index_t>(
+                    c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

             // This is different
             const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial];

@@ -768,10 +761,8 @@ struct TransformConvFwdToGemm
         const index_t K = b_g_k_c_xs_lengths[1];
         const index_t C = b_g_k_c_xs_lengths[2];

-        const index_t YX = std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
-                                           b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
-                                           index_t{1},
-                                           std::multiplies<index_t>());
+        const index_t YX = ck::accumulate_n<index_t>(
+            b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

         const auto wei_gemmn_gemmk_desc =
             make_naive_tensor_descriptor_packed(make_tuple(K, YX * C));

@@ -794,10 +785,8 @@ struct TransformConvFwdToGemm
         const index_t K = b_g_k_c_xs_lengths[1];
         const index_t C = b_g_k_c_xs_lengths[2];

-        const index_t YX = std::accumulate(b_g_k_c_xs_lengths.begin() + 3,
-                                           b_g_k_c_xs_lengths.begin() + 3 + NDimSpatial,
-                                           index_t{1},
-                                           std::multiplies<index_t>());
+        const index_t YX = ck::accumulate_n<index_t>(
+            b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

         const index_t KStride = b_g_k_c_xs_strides[1];
         const index_t XStride = b_g_k_c_xs_strides[2 + NDimSpatial];

@@ -827,10 +816,9 @@ struct TransformConvFwdToGemm
         const index_t N = c_g_n_k_wos_lengths[1];
         const index_t K = c_g_n_k_wos_lengths[2];

-        const index_t NHoWo = N * std::accumulate(c_g_n_k_wos_lengths.begin() + 3,
-                                                  c_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                  index_t{1},
-                                                  std::multiplies<index_t>());
+        const index_t NHoWo = N * ck::accumulate_n<index_t>(
+            c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

         const auto out_gemmm_gemmn_desc =
             make_naive_tensor_descriptor_packed(make_tuple(NHoWo, K));

@@ -855,10 +843,9 @@ struct TransformConvFwdToGemm
         const auto KStride     = I1;
         const index_t WoStride = c_g_n_k_wos_strides[NDimSpatial + 2];

-        const index_t NHoWo = N * std::accumulate(c_g_n_k_wos_lengths.begin() + 3,
-                                                  c_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                  index_t{1},
-                                                  std::multiplies<index_t>());
+        const index_t NHoWo = N * ck::accumulate_n<index_t>(
+            c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

         const auto out_gemmm_gemmn_desc =
             make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(WoStride, KStride));

@@ -878,10 +865,9 @@ struct TransformConvFwdToGemm
         const index_t N = c_g_n_k_wos_lengths[1];
         const index_t K = c_g_n_k_wos_lengths[2];

-        const index_t NHoWo = N * std::accumulate(c_g_n_k_wos_lengths.begin() + 3,
-                                                  c_g_n_k_wos_lengths.begin() + 3 + NDimSpatial,
-                                                  index_t{1},
-                                                  std::multiplies<index_t>());
+        const index_t NHoWo = N * ck::accumulate_n<index_t>(
+            c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>());

         const auto out_gemmm_gemmn_desc =
             make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, I1));
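All of these hunks sit on the Filter1x1Stride1Pad0 fast path, where the forward convolution is viewed directly as a GEMM: the descriptors above flatten the input to (N*Ho*Wo, C), the weight to (K, Y*X*C), and the output to (N*Ho*Wo, K). A rough sketch of those dimensions follows for an invented 2D shape; the numbers are not taken from the diff and the descriptor machinery is omitted entirely.

// Back-of-the-envelope GEMM dimensions for a 1x1, stride-1, pad-0 forward convolution.
// The shape is invented for illustration.
#include <cstdint>
#include <cstdio>

int main()
{
    const std::int64_t N = 4, C = 64, K = 128, Hi = 28, Wi = 28;
    const std::int64_t Ho = Hi, Wo = Wi; // 1x1 filter, stride 1, no padding

    const std::int64_t GemmM = N * Ho * Wo; // rows of the input/output matrices
    const std::int64_t GemmN = K;           // output channels
    const std::int64_t GemmK = C;           // Y * X * C reduces to C when Y = X = 1

    std::printf("input %lld x %lld, weight %lld x %lld, output %lld x %lld\n",
                static_cast<long long>(GemmM), static_cast<long long>(GemmK),
                static_cast<long long>(GemmN), static_cast<long long>(GemmK),
                static_cast<long long>(GemmM), static_cast<long long>(GemmN));
}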
library/include/ck/library/utility/convolution_parameter.hpp

@@ -10,6 +10,8 @@
 #include "ck/ck.hpp"
+#include "ck/library/utility/numeric.hpp"
+
 namespace ck {
 namespace utils {
 namespace conv {

@@ -55,10 +57,8 @@ struct ConvParam
         // sizeof(InDataType) * (G * N * C * <input spatial lengths product>) +
         return sizeof(InDataType) *
                (G_ * N_ * C_ *
-                std::accumulate(std::begin(input_spatial_lengths_),
-                                std::begin(input_spatial_lengths_) + num_dim_spatial_,
-                                static_cast<std::size_t>(1),
-                                std::multiplies<std::size_t>()));
+                ck::accumulate_n<std::size_t>(
+                    std::begin(input_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()));
     }

     template <typename WeiDataType>

@@ -67,10 +67,8 @@ struct ConvParam
         // sizeof(WeiDataType) * (G * K * C * <filter spatial lengths product>) +
         return sizeof(WeiDataType) *
                (G_ * K_ * C_ *
-                std::accumulate(std::begin(filter_spatial_lengths_),
-                                std::begin(filter_spatial_lengths_) + num_dim_spatial_,
-                                static_cast<std::size_t>(1),
-                                std::multiplies<std::size_t>()));
+                ck::accumulate_n<std::size_t>(
+                    std::begin(filter_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()));
     }

     template <typename OutDataType>
library/include/ck/library/utility/numeric.hpp (new file, 0 → 100644)

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <iterator>
#include <numeric>

namespace ck {

template <typename T, typename ForwardIterator, typename Size, typename BinaryOperation>
auto accumulate_n(ForwardIterator first, Size count, T init, BinaryOperation op)
    -> decltype(std::accumulate(first, std::next(first, count), init, op))
{
    return std::accumulate(first, std::next(first, count), init, op);
}

} // namespace ck
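The new header defines ck::accumulate_n, a thin wrapper over std::accumulate that takes a count instead of an end iterator and deduces its return type from the equivalent std::accumulate call. A small usage sketch follows; the values and the standalone main() are illustrative, and compiling it requires the library's include path for the header above.

// Example use of the ck::accumulate_n helper defined above; values are arbitrary.
#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

#include "ck/library/utility/numeric.hpp" // the new header from this commit

int main()
{
    const std::vector<std::size_t> filter_spatial_lengths{3, 3, 7}; // e.g. Z, Y, X
    const int num_dim_spatial = 3;

    // Multiply the first num_dim_spatial entries, starting from an init of 1.
    const std::size_t product = ck::accumulate_n<std::size_t>(
        filter_spatial_lengths.begin(), num_dim_spatial, 1, std::multiplies<>());

    std::cout << product << '\n'; // prints 63
}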
library/src/utility/convolution_parameter.cpp

@@ -72,14 +72,10 @@ std::size_t ConvParam::GetFlops() const
 {
     // 2 * G * N * K * C * <output spatial lengths product> * <filter spatial lengths product>
     return static_cast<std::size_t>(2) * G_ * N_ * K_ * C_ *
-           std::accumulate(std::begin(output_spatial_lengths_),
-                           std::begin(output_spatial_lengths_) + num_dim_spatial_,
-                           static_cast<std::size_t>(1),
-                           std::multiplies<std::size_t>()) *
-           std::accumulate(std::begin(filter_spatial_lengths_),
-                           std::begin(filter_spatial_lengths_) + num_dim_spatial_,
-                           static_cast<std::size_t>(1),
-                           std::multiplies<std::size_t>());
+           ck::accumulate_n<std::size_t>(
+               std::begin(output_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()) *
+           ck::accumulate_n<std::size_t>(
+               std::begin(filter_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>());
 }

 std::string get_conv_param_parser_helper_msg()
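The FLOP formula itself is unchanged: 2 * G * N * K * C * (product of output spatial lengths) * (product of filter spatial lengths). As a quick numeric check with invented sizes (G = 1, N = 4, K = 128, C = 64, a 28x28 output, a 3x3 filter), that is 2 * 1 * 4 * 128 * 64 * 784 * 9 = 462,422,016 FLOPs:

// Numeric check of the FLOP formula above; every size here is invented for illustration.
#include <cstddef>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>

int main()
{
    const std::size_t G = 1, N = 4, K = 128, C = 64;
    const std::vector<std::size_t> output_spatial_lengths{28, 28};
    const std::vector<std::size_t> filter_spatial_lengths{3, 3};
    const int num_dim_spatial = 2;

    // Same reduction GetFlops performs, written with plain std::accumulate.
    const auto product = [&](const std::vector<std::size_t>& v) {
        return std::accumulate(v.begin(), std::next(v.begin(), num_dim_spatial),
                               std::size_t{1}, std::multiplies<>());
    };

    const std::size_t flops = 2 * G * N * K * C * product(output_spatial_lengths) *
                              product(filter_spatial_lengths);

    std::cout << flops << '\n'; // prints 462422016
}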
profiler/include/profile_reduce_impl.hpp

@@ -6,8 +6,9 @@
 #include "ck/utility/reduction_enums.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
-#include "ck/library/utility/check_err.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_reduction.hpp"
 #include "ck/library/utility/host_common_util.hpp"

@@ -359,10 +360,10 @@ bool profile_reduce_impl_impl(bool do_verification,
     std::array<index_t, NumOutDim> arrOutLengths;
     std::array<index_t, NumOutDim> arrOutStrides;

-    std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin());
-    std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin());
-    std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin());
-    std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin());
+    ck::ranges::copy(inLengths, arrInLengths.begin());
+    ck::ranges::copy(inStrides, arrInStrides.begin());
+    ck::ranges::copy(outLengths, arrOutLengths.begin());
+    ck::ranges::copy(outStrides, arrOutStrides.begin());

     for(auto& reduce_ptr : reduce_ptrs)
     {

@@ -491,7 +492,7 @@ bool profile_reduce_impl(bool do_verification,
     std::array<ck::index_t, descType::NumReduceDim_> arrReduceDims;

-    std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
+    ck::ranges::copy(reduceDims, arrReduceDims.begin());

     pass = pass && profile_reduce_impl_impl<InDataType,
                                             AccDataType,