Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
1098ced2
"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "304f17b3d838e049d0d6c103c6f91e210ffdf9c5"
Commit
1098ced2
authored
Apr 05, 2021
by
Jing Zhang
Browse files
inital implement of add fusion
parent
316fcc3f
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
97 additions
and
14 deletions
+97
-14
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
...tion_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
+9
-0
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
...nel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
+51
-2
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
...convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
+15
-3
driver/include/host_conv.hpp
driver/include/host_conv.hpp
+1
-1
driver/src/conv_driver.cpp
driver/src/conv_driver.cpp
+21
-8
No files found.
composable_kernel/include/driver/driver_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw_outpad.hpp
View file @
1098ced2
...
...
@@ -45,6 +45,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
const
InRightPads
&
in_right_pads
,
const
FloatAB
*
__restrict__
p_wei_global
,
const
FloatAB
*
__restrict__
p_in_global
,
const
FloatC
*
__restrict__
p_d_global
,
FloatC
*
__restrict__
p_out_global
)
const
{
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -250,6 +251,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
out_k_n_hop_wop_global_desc
),
const
FloatC
*
,
FloatC
*
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
true
>>
;
...
...
@@ -264,6 +266,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
in_e_n_ho_wo_global_desc
,
p_in_global
,
out_k_n_hop_wop_global_desc
,
p_d_global
,
p_out_global
,
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
true
>
{});
...
...
@@ -277,6 +280,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
out_k_n_hop_wop_global_desc
),
const
FloatC
*
,
FloatC
*
,
integral_constant
<
bool
,
true
>
,
integral_constant
<
bool
,
false
>>
;
...
...
@@ -291,6 +295,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
in_e_n_ho_wo_global_desc
,
p_in_global
,
out_k_n_hop_wop_global_desc
,
p_d_global
,
p_out_global
,
integral_constant
<
bool
,
true
>
{},
integral_constant
<
bool
,
false
>
{});
...
...
@@ -304,6 +309,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
out_k_n_hop_wop_global_desc
),
const
FloatC
*
,
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
true
>>
;
...
...
@@ -318,6 +324,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
in_e_n_ho_wo_global_desc
,
p_in_global
,
out_k_n_hop_wop_global_desc
,
p_d_global
,
p_out_global
,
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
true
>
{});
...
...
@@ -331,6 +338,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
decltype
(
in_e_n_ho_wo_global_desc
),
const
FloatAB
*
,
decltype
(
out_k_n_hop_wop_global_desc
),
const
FloatC
*
,
FloatC
*
,
integral_constant
<
bool
,
false
>
,
integral_constant
<
bool
,
false
>>
;
...
...
@@ -345,6 +353,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v5r1_nchw_kcyx_nkhw_outpad
in_e_n_ho_wo_global_desc
,
p_in_global
,
out_k_n_hop_wop_global_desc
,
p_d_global
,
p_out_global
,
integral_constant
<
bool
,
false
>
{},
integral_constant
<
bool
,
false
>
{});
...
...
composable_kernel/include/tensor_operation/gridwise_dynamic_gemm_v2.hpp
View file @
1098ced2
...
...
@@ -74,6 +74,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const
BGlobalDesc
&
b_e_n_ho_wo_global_desc
,
const
FloatAB
*
__restrict__
p_b_global
,
const
CGlobalDesc
&
c_k_n_ho_wo_global_desc
,
const
FloatC
*
__restrict__
p_d_global
,
FloatC
*
__restrict__
p_c_global
,
FloatAB
*
__restrict__
p_shared_block
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
...
...
@@ -352,6 +353,48 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
}
#endif
FloatC
p_d_thread
[
c_k_n_ho_wo_thread_desc
.
GetElementSpaceSize
()];
threadwise_matrix_set_zero_v3
(
c_k_n_ho_wo_thread_desc
,
p_d_thread
);
#if 1
{
constexpr
auto
c_k_n_ho_wo_global_tensor_iterator_hacks
=
CGlobalIteratorHacks
{};
const
index_t
k_thread_data_on_global
=
k_block_data_on_global
+
k_thread_id
*
KPerThread
;
ThreadwiseDynamicTensorSliceTransfer_v2
<
FloatC
,
FloatC
,
decltype
(
c_k_n_ho_wo_global_desc
),
decltype
(
c_k_n_ho_wo_thread_desc
),
Sequence
<
KPerThread
,
1
,
HoPerThread
,
WoPerThread
>
,
CThreadTransferSrcDstAccessOrder
,
CThreadTransferSrcDstVectorDim
,
CThreadTransferDstScalarPerVector
,
AddressSpace
::
Global
,
AddressSpace
::
Vgpr
,
InMemoryDataOperation
::
Set
,
1
,
true
>
(
c_k_n_ho_wo_global_desc
,
make_multi_index
(
k_thread_data_on_global
,
0
,
ho_thread_data_on_global
,
wo_thread_data_on_global
))
.
Run
(
c_k_n_ho_wo_global_desc
,
p_d_global
,
c_k_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
p_d_thread
,
c_k_n_ho_wo_global_tensor_iterator_hacks
);
}
#endif
for
(
index_t
i
=
0
;
i
<
c_k_n_ho_wo_thread_desc
.
GetElementSpaceSize
();
i
++
)
{
p_d_thread
[
i
]
+=
p_c_thread
[
i
];
}
#if 1
// output: register to global memory
{
...
...
@@ -362,7 +405,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
k_block_data_on_global
+
k_thread_id
*
KPerThread
;
ThreadwiseDynamicTensorSliceTransfer_v1r3
<
Float
Acc
,
Float
C
,
FloatC
,
decltype
(
c_k_n_ho_wo_thread_desc
),
decltype
(
c_k_n_ho_wo_global_desc
),
...
...
@@ -380,7 +423,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
k_thread_data_on_global
,
0
,
ho_thread_data_on_global
,
wo_thread_data_on_global
))
.
Run
(
c_k_n_ho_wo_thread_desc
,
make_tuple
(
I0
,
I0
,
I0
,
I0
),
p_
c
_thread
,
p_
d
_thread
,
c_k_n_ho_wo_global_desc
,
p_c_global
,
c_k_n_ho_wo_global_tensor_iterator_hacks
);
...
...
@@ -395,6 +438,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const
BGlobalDesc
&
b_e_n_ho_wo_global_desc
,
const
FloatAB
*
__restrict__
p_b_global
,
const
CGlobalDesc
&
c_k_n_ho_wo_global_desc
,
const
FloatC
*
__restrict__
p_d_global
,
FloatC
*
__restrict__
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
)
const
...
...
@@ -408,6 +452,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
b_e_n_ho_wo_global_desc
,
p_b_global
,
c_k_n_ho_wo_global_desc
,
p_d_global
,
p_c_global
,
p_shared_block
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
...
...
@@ -421,6 +466,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const
BGlobalDesc
*
p_b_e_n_ho_wo_global_desc
,
const
FloatAB
*
__restrict__
p_b_global
,
const
CGlobalDesc
*
p_c_k_n_ho_wo_global_desc
,
const
FloatC
*
__restrict__
p_d_global
,
FloatC
*
__restrict__
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
)
const
...
...
@@ -434,6 +480,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
b_e_n_ho_wo_global_desc
,
p_b_global
,
c_k_n_ho_wo_global_desc
,
p_d_global
,
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
{});
...
...
@@ -446,6 +493,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
const
void
*
p_b_e_n_ho_wo_global_desc
,
const
FloatAB
*
__restrict__
p_b_global
,
const
void
*
p_c_k_n_ho_wo_global_desc
,
const
FloatC
*
__restrict__
p_d_global
,
FloatC
*
__restrict__
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
,
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
)
const
...
...
@@ -461,6 +509,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v3
b_e_n_ho_wo_global_desc
,
p_b_global
,
c_k_n_ho_wo_global_desc
,
p_d_global
,
p_c_global
,
integral_constant
<
bool
,
HasMainKBlockLoop
>
{},
integral_constant
<
bool
,
HasDoubleTailKBlockLoop
>
{});
...
...
driver/include/device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw.hpp
View file @
1098ced2
...
...
@@ -21,6 +21,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
WeiDesc
,
const
Tensor
<
TInWei
>&
wei_k_c_y_x
,
OutDesc
,
Tensor
<
TOut
>&
add_n_k_ho_wo
,
Tensor
<
TOut
>&
out_n_k_ho_wo
,
ConvStrides
,
ConvDilations
,
...
...
@@ -35,6 +36,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
DeviceMem
in_n_c_hi_wi_device_buf
(
sizeof
(
TInWei
)
*
in_n_c_hi_wi
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_k_c_y_x_device_buf
(
sizeof
(
TInWei
)
*
wei_k_c_y_x
.
mDesc
.
GetElementSpace
());
DeviceMem
add_n_k_ho_wo_device_buf
(
sizeof
(
TOut
)
*
add_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
DeviceMem
out_n_k_ho_wo_device_buf
(
sizeof
(
TOut
)
*
out_n_k_ho_wo
.
mDesc
.
GetElementSpace
());
constexpr
auto
I0
=
Number
<
0
>
{};
...
...
@@ -93,6 +95,8 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
C0
,
Hi
,
Wi
,
C1
>
{})));
Tensor
<
TInWei
>
wei_k_c0_y_x_c1
(
make_HostTensorDescriptor
(
make_native_tensor_descriptor_packed
(
Sequence
<
K
,
C0
,
Y
,
X
,
C1
>
{})));
Tensor
<
TOut
>
add_n_k0_ho_wo_k1
(
make_HostTensorDescriptor
(
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
K0
,
Ho
,
Wo
,
K1
>
{})));
Tensor
<
TOut
>
out_n_k0_ho_wo_k1
(
make_HostTensorDescriptor
(
make_native_tensor_descriptor_packed
(
Sequence
<
N
,
K0
,
Ho
,
Wo
,
K1
>
{})));
...
...
@@ -106,11 +110,18 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
wei_k_c_y_x
(
k
,
c
,
y
,
x
);
};
auto
f_nkhw_to_nk0hwk1
=
[
&
](
auto
n
,
auto
k
,
auto
ho
,
auto
wo
)
{
add_n_k0_ho_wo_k1
(
n
,
k
/
InWeiVectorSize
,
ho
,
wo
,
k
%
InWeiVectorSize
)
=
add_n_k_ho_wo
(
n
,
k
,
ho
,
wo
);
};
make_ParallelTensorFunctor
(
f_nchw2nc0hwc1
,
N
,
Hi
,
Wi
,
C
)();
make_ParallelTensorFunctor
(
f_kcyx2kc0yxc1
,
K
,
Y
,
X
,
C
)();
make_ParallelTensorFunctor
(
f_nkhw_to_nk0hwk1
,
N
,
K
,
Ho
,
Wo
)();
in_n_c_hi_wi_device_buf
.
ToDevice
(
in_n_c0_hi_wi_c1
.
mData
.
data
());
wei_k_c_y_x_device_buf
.
ToDevice
(
wei_k_c0_y_x_c1
.
mData
.
data
());
add_n_k_ho_wo_device_buf
.
ToDevice
(
add_n_k0_ho_wo_k1
.
mData
.
data
());
#if 1
// cdata = 64, BlockSize = 64, 16x8x32x4
...
...
@@ -126,15 +137,15 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
EPerThread
=
EPerBlock
;
using
ABlockTransferThreadSliceLengths_E_K
=
Sequence
<
3
,
1
>
;
using
ABlockTransferThreadClusterLengths_E_K
=
Sequence
<
3
*
EPerBlock
,
KPerBlock
>
;
using
ABlockTransferThreadSliceLengths_E_K
=
Sequence
<
9
,
1
>
;
using
ABlockTransferThreadClusterLengths_E_K
=
Sequence
<
EPerBlock
,
KPerBlock
>
;
constexpr
index_t
ABlockTransferSrcScalarPerVector_E
=
1
;
constexpr
index_t
ABlockTransferDstScalarPerVector_K
=
1
;
constexpr
index_t
BThreadTransferSrcScalarPerVector_W
=
1
;
constexpr
index_t
CThreadTransferDstScalarPerVector_W
=
K
1
;
constexpr
index_t
CThreadTransferDstScalarPerVector_W
=
1
;
static_assert
(
KPerThread
%
CThreadTransferDstScalarPerVector_W
==
0
,
""
);
#else
...
...
@@ -196,6 +207,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(
wei_k_c_y_x_device_buf
.
GetDeviceBuffer
()),
static_cast
<
typename
vector_type
<
TInWei
,
InWeiVectorSize
>::
type
*>
(
in_n_c_hi_wi_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
add_n_k_ho_wo_device_buf
.
GetDeviceBuffer
()),
static_cast
<
TOut
*>
(
out_n_k_ho_wo_device_buf
.
GetDeviceBuffer
()));
out_n_k_ho_wo_device_buf
.
FromDevice
(
out_n_k0_ho_wo_k1
.
mData
.
data
());
...
...
driver/include/host_conv.hpp
View file @
1098ced2
...
...
@@ -40,7 +40,7 @@ void host_direct_convolution(const Tensor<TIn>& in_nchw,
}
}
}
out_nkhw
(
n
,
k
,
ho
,
wo
)
=
v
;
out_nkhw
(
n
,
k
,
ho
,
wo
)
+
=
v
;
};
auto
f_par
=
make_ParallelTensorFunctor
(
f
,
...
...
driver/src/conv_driver.cpp
View file @
1098ced2
...
...
@@ -64,7 +64,7 @@ int main(int argc, char* argv[])
using
LeftPads
=
Sequence
<
0
,
0
>
;
using
RightPads
=
Sequence
<
0
,
0
>
;
#elif
1
#elif
0
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
16
;
constexpr
index_t
HI
=
1080
;
...
...
@@ -76,13 +76,13 @@ int main(int argc, char* argv[])
using
ConvStrides
=
Sequence
<
1
,
1
>
;
using
ConvDilations
=
Sequence
<
1
,
1
>
;
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif
0
using
LeftPads
=
Sequence
<
1
,
1
>
;
using
RightPads
=
Sequence
<
1
,
1
>
;
#elif
1
constexpr
index_t
N
=
1
;
constexpr
index_t
C
=
1
;
constexpr
index_t
HI
=
102
4
;
constexpr
index_t
WI
=
2048
;
constexpr
index_t
C
=
4
;
constexpr
index_t
HI
=
6
4
;
constexpr
index_t
WI
=
64
;
constexpr
index_t
K
=
4
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
...
...
@@ -630,11 +630,16 @@ int main(int argc, char* argv[])
print_array
(
"ConvStrides"
,
to_multi_index
(
ConvStrides
{}));
print_array
(
"ConvDilations"
,
to_multi_index
(
ConvDilations
{}));
#if
0
#if
1
using
in_data_t
=
float
;
constexpr
index_t
in_vector_size
=
1
;
using
acc_data_t
=
float
;
using
out_data_t
=
float
;
#elif 0
using
in_data_t
=
half_t
;
constexpr
index_t
in_vector_size
=
16
;
using
acc_data_t
=
float
;
using
out_data_t
=
half_t
;
#elif 0
using
in_data_t
=
float
;
constexpr
index_t
in_vector_size
=
1
;
...
...
@@ -650,6 +655,8 @@ int main(int argc, char* argv[])
Tensor
<
in_data_t
>
in_nchw
(
make_HostTensorDescriptor
(
in_nchw_desc
));
Tensor
<
in_data_t
>
wei_kcyx
(
make_HostTensorDescriptor
(
wei_kcyx_desc
));
Tensor
<
out_data_t
>
out_nkhw_host
(
make_HostTensorDescriptor
(
out_nkhw_desc
));
Tensor
<
out_data_t
>
add_nkhw_device
(
make_HostTensorDescriptor
(
out_nkhw_desc
));
Tensor
<
out_data_t
>
out_nkhw_device
(
make_HostTensorDescriptor
(
out_nkhw_desc
));
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
...
...
@@ -686,6 +693,9 @@ int main(int argc, char* argv[])
};
wei_kcyx
.
GenerateTensorValue
(
gen_wei
,
num_thread
);
#endif
out_nkhw_host
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
add_nkhw_device
.
GenerateTensorValue
(
GeneratorTensor_1
{},
num_thread
);
}
#if 0
...
...
@@ -768,6 +778,7 @@ int main(int argc, char* argv[])
wei_kcyx_desc
,
wei_kcyx
,
out_nkhw_desc
,
add_nkhw_device
,
out_nkhw_device
,
ConvStrides
{},
ConvDilations
{},
...
...
@@ -788,6 +799,7 @@ int main(int argc, char* argv[])
check_error
(
out_nkhw_host
,
out_nkhw_device
);
#if 1
if
(
do_log
)
{
LogRange
(
std
::
cout
<<
"in_nchw : "
,
in_nchw
.
mData
,
","
)
<<
std
::
endl
;
...
...
@@ -795,5 +807,6 @@ int main(int argc, char* argv[])
LogRange
(
std
::
cout
<<
"out_nkhw_host : "
,
out_nkhw_host
.
mData
,
","
)
<<
std
::
endl
;
LogRange
(
std
::
cout
<<
"out_nkhw_device: "
,
out_nkhw_device
.
mData
,
","
)
<<
std
::
endl
;
}
#endif
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment