Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
2a48812e
Commit
2a48812e
authored
May 21, 2019
by
Chao Liu
Browse files
behavior has changed (better and worse), figuring out why
parent
acd7082f
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
16 additions
and
13 deletions
+16
-13
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
...er/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+2
-2
src/include/ConstantTensorDescriptor.hip.hpp
src/include/ConstantTensorDescriptor.hip.hpp
+3
-4
src/include/blockwise_tensor_slice_op.hip.hpp
src/include/blockwise_tensor_slice_op.hip.hpp
+8
-4
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
...plicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
+2
-2
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
...plicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+1
-1
No files found.
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
View file @
2a48812e
...
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -57,7 +57,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
wei_cyxk_device_buf
.
ToDevice
(
wei_cyxk
.
mData
.
data
());
wei_cyxk_device_buf
.
ToDevice
(
wei_cyxk
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
out_nkhw_device_buf
.
ToDevice
(
out_nkhw
.
mData
.
data
());
#if
1
#if
0
// for 3x3, 34x34, v1r3, Pascal
// for 3x3, 34x34, v1r3, Pascal
constexpr index_t BlockSize = 128;
constexpr index_t BlockSize = 128;
...
@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -162,7 +162,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite_W
=
2
;
constexpr
index_t
OutThreadCopyDataPerWrite_W
=
2
;
#elif
0
#elif
1
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
// for 3x3, 34x34, v1r3, Vega 20, WoPerBlock = 8
constexpr
index_t
BlockSize
=
256
;
constexpr
index_t
BlockSize
=
256
;
...
...
src/include/ConstantTensorDescriptor.hip.hpp
View file @
2a48812e
...
@@ -286,10 +286,9 @@ struct ConstantTensorDescriptor
...
@@ -286,10 +286,9 @@ struct ConstantTensorDescriptor
"wrong! dimensions to be unfolded need to be packed"
);
"wrong! dimensions to be unfolded need to be packed"
);
// checkt ranks
// checkt ranks
static_assert
(
GetMemoryRank
(
IDim_p1
)
=
GetMemoryRank
(
IDim
)
+
1
,
static_assert
(
GetMemoryRank
(
IDim_p1
)
==
GetMemoryRank
(
IDim
)
+
1
,
"wrong! ranks of dimensions to be "
"wrong! ranks of dimensions to be unfolded need to be in increasing and "
"unfolded need to be in increasing "
"continuous ranks"
);
"and continuous ranks"
);
});
});
// left and right
// left and right
...
...
src/include/blockwise_tensor_slice_op.hip.hpp
View file @
2a48812e
...
@@ -39,7 +39,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -39,7 +39,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_cluster_lengths
=
constexpr
auto
thread_cluster_lengths
=
src_cluster_lengths
.
ReorderGivenNew2Old
(
map_thread_cluster_2_src_cluster
);
src_cluster_lengths
.
ReorderGivenNew2Old
(
map_thread_cluster_2_src_cluster
);
constexpr
auto
thread_cluster_desc
=
make_packed_ConstantTensorDescriptor
(
thread_cluster_lengths
);
constexpr
auto
thread_cluster_desc
=
make_packed_ConstantTensorDescriptor
(
thread_cluster_lengths
);
// sanity check: data type
// sanity check: data type
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float for now!
\n
"
);
static_assert
(
is_same
<
Float
,
float
>::
value
,
"wrong! only support float for now!
\n
"
);
...
@@ -147,7 +148,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -147,7 +148,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_desc
=
make_packed_ConstantTensorDescriptor
(
thread_tensor_lengths
);
constexpr
auto
thread_tensor_desc
=
make_packed_ConstantTensorDescriptor
(
thread_tensor_lengths
);
return
thread_tensor_desc
.
GetElementSpace
();
return
thread_tensor_desc
.
GetElementSpace
();
}
}
...
@@ -167,7 +169,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -167,7 +169,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_desc
=
make_packed_ConstantTensorDescriptor
(
thread_tensor_lengths
);
constexpr
auto
thread_tensor_desc
=
make_packed_ConstantTensorDescriptor
(
thread_tensor_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
...
@@ -204,7 +207,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
...
@@ -204,7 +207,8 @@ struct BlockwiseTensorSliceReorderCopy_v3
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_lengths
=
thread_sub_tensor_lengths
*
repeat_lengths
;
constexpr
auto
thread_tensor_desc
=
make_packed_ConstantTensorDescriptor
(
thread_tensor_lengths
);
constexpr
auto
thread_tensor_desc
=
make_packed_ConstantTensorDescriptor
(
thread_tensor_lengths
);
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
static_ford
<
decltype
(
repeat_lengths
)
>
{}([
&
](
auto
repeat_multi_id_
)
{
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
constexpr
auto
repeat_multi_id
=
decltype
(
repeat_multi_id_
){};
...
...
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
View file @
2a48812e
...
@@ -362,8 +362,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
...
@@ -362,8 +362,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
const
index_t
n_thread_data_begin
=
c_thread_mtx_begin
.
col
%
NPerBlock
;
const
index_t
n_thread_data_begin
=
c_thread_mtx_begin
.
col
%
NPerBlock
;
static_if
<
GemmNPerThreadSubC
<=
NPerBlock
>
{}([
&
](
auto
fwd
)
{
// fwd do nothing but
static_if
<
GemmNPerThreadSubC
<=
NPerBlock
>
{}([
&
](
auto
fwd
)
{
// fwd do nothing but
// perfect forwarding.
// perfect forwarding.
// Using this trick to
// Using this trick to
// make this lambda a generic lambda, so it won't be compiled until
// make this lambda a generic lambda, so it won't be compiled until
// instantiated
// instantiated
static_assert
(
static_assert
(
...
...
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
View file @
2a48812e
...
@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
...
@@ -196,7 +196,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
// choose GEMM implementation here
// choose GEMM implementation here
const
auto
run_blockwise_batch_gemm
=
[
&
](
auto
...
Xs
)
{
const
auto
run_blockwise_batch_gemm
=
[
&
](
auto
...
Xs
)
{
#if
1
#if
0
return blockwise_batch_gemm.Run(Xs...);
return blockwise_batch_gemm.Run(Xs...);
#elif
0
#elif
0
return
blockwise_batch_gemm
.
Run_asm
(
Xs
...);
return
blockwise_batch_gemm
.
Run_asm
(
Xs
...);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment