Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
63cdc6d2
Commit
63cdc6d2
authored
Apr 27, 2019
by
Chao Liu
Browse files
fix v1r3 output reorder bug
parent
c138e212
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
24 additions
and
82 deletions
+24
-82
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
...er/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
+13
-13
driver/driver.hip.cpp
driver/driver.hip.cpp
+8
-6
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
...ise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
+3
-63
No files found.
driver/device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp
View file @
63cdc6d2
...
@@ -87,7 +87,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -87,7 +87,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerRead_W = 1; // v1r3 cannot do vector load input for NCHW
constexpr index_t InBlockReorderDataPerWrite_N = 1;
constexpr index_t InBlockReorderDataPerWrite_N = 1;
using WeiBlockCopyClusterLengths =
Sequence<0, 0>; // not used
using WeiBlockCopyClusterLengths =
void;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t WeiBlockCopyDataPerRead_K = 4;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
constexpr index_t OutThreadCopyDataPerWrite_W = 2;
...
@@ -122,7 +122,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -122,7 +122,7 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr
index_t
InBlockReorderDataPerRead_W
=
1
;
// v1r3 cannot do vector load NCHW
constexpr
index_t
InBlockReorderDataPerRead_W
=
1
;
// v1r3 cannot do vector load NCHW
constexpr
index_t
InBlockReorderDataPerWrite_N
=
2
;
constexpr
index_t
InBlockReorderDataPerWrite_N
=
2
;
using
WeiBlockCopyClusterLengths
=
Sequence
<
0
,
0
>
;
// not used
using
WeiBlockCopyClusterLengths
=
void
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite_W
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite_W
=
4
;
...
@@ -136,10 +136,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -136,10 +136,10 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr
index_t
HoPerBlock
=
4
;
constexpr
index_t
HoPerBlock
=
4
;
constexpr
index_t
WoPerBlock
=
8
;
constexpr
index_t
WoPerBlock
=
8
;
constexpr
index_t
NPerThread
=
2
;
constexpr
index_t
NPerThread
=
4
;
constexpr
index_t
KPerThread
=
8
;
constexpr
index_t
KPerThread
=
8
;
constexpr
index_t
HoPerThread
=
1
;
constexpr
index_t
HoPerThread
=
1
;
constexpr
index_t
WoPerThread
=
4
;
constexpr
index_t
WoPerThread
=
2
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmMPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
constexpr
index_t
GemmNPerThreadSubC
=
4
;
...
@@ -155,14 +155,14 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -155,14 +155,14 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
using
InBlockReorderSrcClusterLengths_NCHW
=
Sequence
<
1
,
8
,
4
,
8
>
;
using
InBlockReorderSrcClusterLengths_NCHW
=
Sequence
<
1
,
8
,
4
,
8
>
;
using
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW
=
Sequence
<
1
,
2
,
0
,
3
>
;
using
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW
=
Sequence
<
1
,
2
,
0
,
3
>
;
constexpr
index_t
InBlockReorderDataPerRead_W
=
1
;
// v1r3 cannot do vector load NCHW
constexpr
index_t
InBlockReorderDataPerRead_W
=
1
;
// v1r3 cannot do vector load NCHW
constexpr
index_t
InBlockReorderDataPerWrite_N
=
1
;
constexpr
index_t
InBlockReorderDataPerWrite_N
=
4
;
using
WeiBlockCopyClusterLengths
=
Sequence
<
0
,
0
>
;
// not used
using
WeiBlockCopyClusterLengths
=
void
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite_W
=
1
;
constexpr
index_t
OutThreadCopyDataPerWrite_W
=
2
;
#elif
0
#elif
1
// for 3x3, 28x28, v1r
2
, Pascal
// for 3x3, 28x28, v1r
3
, Pascal
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
BlockSize
=
128
;
constexpr
index_t
NPerBlock
=
16
;
constexpr
index_t
NPerBlock
=
16
;
...
@@ -186,13 +186,13 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
...
@@ -186,13 +186,13 @@ void device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw(InDesc,
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadA
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
constexpr
index_t
GemmDataPerReadB
=
4
;
using
InBlockReorderSrcSubLengths_NCHW
=
Sequence
<
4
,
1
,
1
,
2
>
;
using
InBlockReorderSrcSubLengths_NCHW
=
Sequence
<
4
,
1
,
1
,
1
>
;
using
InBlockReorderSrcClusterLengths_NCHW
=
Sequence
<
4
,
8
,
2
,
2
>
;
using
InBlockReorderSrcClusterLengths_NCHW
=
Sequence
<
4
,
8
,
2
,
2
>
;
using
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW
=
Sequence
<
1
,
2
,
0
,
3
>
;
using
InBlockReorderMapThreadCluster2SrcCluster_CHNW2NCHW
=
Sequence
<
1
,
2
,
0
,
3
>
;
constexpr
index_t
InBlockReorderDataPerRead_W
=
2
;
constexpr
index_t
InBlockReorderDataPerRead_W
=
1
;
// v1r3 cannot do vector load NCHW
constexpr
index_t
InBlockReorderDataPerWrite_N
=
4
;
constexpr
index_t
InBlockReorderDataPerWrite_N
=
4
;
using
WeiBlockCopyClusterLengths
=
Sequence
<
4
,
1
,
32
>
;
using
WeiBlockCopyClusterLengths
=
void
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
WeiBlockCopyDataPerRead_K
=
4
;
constexpr
index_t
OutThreadCopyDataPerWrite_W
=
2
;
constexpr
index_t
OutThreadCopyDataPerWrite_W
=
2
;
...
...
driver/driver.hip.cpp
View file @
63cdc6d2
...
@@ -371,7 +371,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
...
@@ -371,7 +371,7 @@ void host_winograd_3x3_convolution(const Tensor<TIn>& in_nchw,
std
::
size_t
ho
=
HoPerTile
*
htile
+
j
;
std
::
size_t
ho
=
HoPerTile
*
htile
+
j
;
for
(
int
i
=
0
;
i
<
WoPerTile
;
++
i
)
for
(
int
i
=
0
;
i
<
WoPerTile
;
++
i
)
{
{
std
::
size_t
wo
=
WoPerTile
*
wtile
+
i
;
std
::
size_t
wo
=
WoPerTile
*
wtile
+
i
;
out_nkhw
(
n
,
k
,
ho
,
wo
)
=
out_hold
(
n
,
k
,
htile
,
wtile
,
j
,
i
);
out_nkhw
(
n
,
k
,
ho
,
wo
)
=
out_hold
(
n
,
k
,
htile
,
wtile
,
j
,
i
);
}
}
}
}
...
@@ -413,13 +413,13 @@ int main(int argc, char* argv[])
...
@@ -413,13 +413,13 @@ int main(int argc, char* argv[])
{
{
#if 1
#if 1
// 3x3, 34x34
// 3x3, 34x34
constexpr
index_t
N
=
64
;
constexpr
index_t
N
=
64
;
constexpr
index_t
C
=
256
;
constexpr
index_t
C
=
256
;
constexpr
index_t
HI
=
34
;
constexpr
index_t
HI
=
34
;
constexpr
index_t
WI
=
34
;
constexpr
index_t
WI
=
34
;
constexpr
index_t
K
=
128
;
constexpr
index_t
K
=
128
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
Y
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
X
=
3
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
HPad
=
0
;
constexpr
index_t
WPad
=
0
;
constexpr
index_t
WPad
=
0
;
...
@@ -597,6 +597,8 @@ int main(int argc, char* argv[])
...
@@ -597,6 +597,8 @@ int main(int argc, char* argv[])
};
};
wei_kcyx
.
GenerateTensorValue
(
gen_wei
,
num_thread
);
wei_kcyx
.
GenerateTensorValue
(
gen_wei
,
num_thread
);
#endif
#endif
// out_nkhw_device.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
}
}
#if 1
#if 1
...
...
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
View file @
63cdc6d2
...
@@ -359,19 +359,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -359,19 +359,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
constexpr
index_t
K2
=
GemmMPerThreadSubC
;
constexpr
index_t
K2
=
GemmMPerThreadSubC
;
constexpr
index_t
K1
=
KPerBlock
/
KPerThread
;
constexpr
index_t
K1
=
KPerBlock
/
KPerThread
;
#if 0
constexpr auto out_10d_global_desc =
make_ConstantTensorDescriptor(Sequence<K / (K1 * K2),
K1,
K2,
Ho,
Wo / (W1 * W2),
W1,
W2,
N / f_dummy(N1 * N2),
N1,
N2>{});
#else
constexpr
auto
out_10d_global_desc
=
constexpr
auto
out_10d_global_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
/
f_dummy
(
N1
*
N2
),
make_ConstantTensorDescriptor
(
Sequence
<
N
/
f_dummy
(
N1
*
N2
),
N1
,
N1
,
...
@@ -383,7 +370,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -383,7 +370,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
Wo
/
(
W1
*
W2
),
Wo
/
(
W1
*
W2
),
W1
,
W1
,
W2
>
{});
W2
>
{});
#endif
constexpr
auto
out_10d_thread_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
out_10d_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
/
K2
,
1
,
K2
,
HoPerThread
,
1
,
W1
,
1
,
1
,
1
,
N2
>
{});
Sequence
<
KPerThread
/
K2
,
1
,
K2
,
HoPerThread
,
1
,
W1
,
1
,
1
,
1
,
N2
>
{});
...
@@ -401,20 +387,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -401,20 +387,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
}
}
#endif
#endif
#if 0
constexpr
auto
map_out_global2thread
=
Sequence
<
7
,
8
,
9
,
0
,
1
,
2
,
3
,
4
,
5
,
6
>
{};
threadwise_nd_tensor_copy(out_10d_thread_desc,
p_out_thread,
out_10d_global_desc,
p_out_global +
out_k_h_w_n_global_desc.Get1dIndex(
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_10d_thread_desc.GetLengths(),
Number<OutThreadCopyDataPerWrite_N>{});
#else
constexpr
auto
map_out_global2thread
=
Sequence
<
7
,
8
,
9
,
0
,
1
,
2
,
6
,
3
,
4
,
5
>
{};
threadwise_nd_tensor_copy_reorder_given_dst2src_v2
(
threadwise_nd_tensor_copy_reorder_given_dst2src_v2
(
out_10d_thread_desc
,
out_10d_thread_desc
,
...
@@ -428,8 +401,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -428,8 +401,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
wo_block_data_begin
+
wo_thread_data_begin
),
wo_block_data_begin
+
wo_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
out_10d_thread_desc
.
GetLengths
(),
map_out_global2thread
);
map_out_global2thread
);
// Number<OutThreadCopyDataPerWrite_W>{});
// Number<OutThreadCopyDataPerWrite_W>{});
#endif
})
})
.
else_
([
&
](
auto
f_dummy
)
{
.
else_
([
&
](
auto
f_dummy
)
{
static_assert
(
f_dummy
(
GemmNPerThreadSubC
)
>=
NPerBlock
&&
NPerThread
==
NPerBlock
&&
static_assert
(
f_dummy
(
GemmNPerThreadSubC
)
>=
NPerBlock
&&
NPerThread
==
NPerBlock
&&
...
@@ -446,19 +418,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -446,19 +418,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
constexpr
index_t
K2
=
GemmMPerThreadSubC
;
constexpr
index_t
K2
=
GemmMPerThreadSubC
;
constexpr
index_t
K1
=
KPerBlock
/
KPerThread
;
constexpr
index_t
K1
=
KPerBlock
/
KPerThread
;
#if 0
constexpr auto out_10d_global_desc =
make_ConstantTensorDescriptor(Sequence<K / (K1 * K2),
K1,
K2,
Ho,
Wo / (W1 * W2 * W3),
W1,
W2,
W3,
N / N1,
N1>{});
#else
constexpr
auto
out_10d_global_desc
=
constexpr
auto
out_10d_global_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
N
/
N1
,
make_ConstantTensorDescriptor
(
Sequence
<
N
/
N1
,
N1
,
N1
,
...
@@ -470,7 +429,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -470,7 +429,6 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
W1
,
W1
,
W2
,
W2
,
W3
>
{});
W3
>
{});
#endif
constexpr
auto
out_10d_thread_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
out_10d_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
/
K2
,
1
,
K2
,
HoPerThread
,
1
,
W1
,
1
,
W3
,
1
,
N1
>
{});
Sequence
<
KPerThread
/
K2
,
1
,
K2
,
HoPerThread
,
1
,
W1
,
1
,
W3
,
1
,
N1
>
{});
...
@@ -486,26 +444,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -486,26 +444,9 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
"out_k_h_w_n_global_desc");
"out_k_h_w_n_global_desc");
print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
print_ConstantTensorDescriptor(out_10d_global_desc, "out_10d_global_desc");
for(index_t i = 0; i < 64; ++i)
{
printf("out %f, ", p_out_thread[i]);
}
}
}
#endif
#endif
#if 0
threadwise_nd_tensor_copy(out_10d_thread_desc,
p_out_thread,
out_10d_global_desc,
p_out_global +
out_k_h_w_n_global_desc.Get1dIndex(
k_block_data_begin + k_thread_data_begin,
ho_block_data_begin + ho_thread_data_begin,
wo_block_data_begin + wo_thread_data_begin,
n_block_data_begin + n_thread_data_begin),
out_10d_thread_desc.GetLengths(),
Number<OutThreadCopyDataPerWrite_N>{});
#else
constexpr
auto
map_out_global2thread
=
Sequence
<
8
,
9
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
>
{};
constexpr
auto
map_out_global2thread
=
Sequence
<
8
,
9
,
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
>
{};
threadwise_nd_tensor_copy_reorder_given_dst2src_v2
(
threadwise_nd_tensor_copy_reorder_given_dst2src_v2
(
...
@@ -520,8 +461,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
...
@@ -520,8 +461,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
wo_block_data_begin
+
wo_thread_data_begin
),
wo_block_data_begin
+
wo_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
out_10d_thread_desc
.
GetLengths
(),
map_out_global2thread
);
map_out_global2thread
);
// Number<OutThreadCopyDataPerWrite_W>{});
// Number<OutThreadCopyDataPerWrite_W>{});
#endif
});
});
}
}
}
;
}
;
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment