Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
6ffd41ae
Commit
6ffd41ae
authored
Jun 21, 2022
by
carlushuang
Browse files
fix a bug when upsampling value
parent
d8fef836
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
330 additions
and
290 deletions
+330
-290
include/ck/tensor_operation/cpu/grid/gridwise_direct_conv_avx2.hpp
...k/tensor_operation/cpu/grid/gridwise_direct_conv_avx2.hpp
+330
-290
No files found.
include/ck/tensor_operation/cpu/grid/gridwise_direct_conv_avx2.hpp
View file @
6ffd41ae
...
...
@@ -421,19 +421,25 @@ struct GridwiseDirectConvNHWCAvx2
}
return
t_
;
};
const
ck
::
index_t
num_works_nho
=
N
*
Ho
;
const
ck
::
index_t
num_works_n
=
N
;
const
ck
::
index_t
num_works_ho
=
Ho
;
// const ck::index_t num_works_nho = N * Ho;
const
ck
::
index_t
num_works_wo
=
math
::
integer_divide_ceil
(
Wo
,
m_per_thread
);
const
ck
::
index_t
num_works_k
=
math
::
integer_divide_ceil
(
K
,
n_per_thread
);
auto
distribute_num_threads_nho_wo_k
=
[
&
](
ck
::
index_t
&
num_threads_nho_
,
auto
distribute_num_threads_n_ho_wo_k
=
[
&
](
ck
::
index_t
&
num_threads_n_
,
ck
::
index_t
&
num_threads_ho_
,
ck
::
index_t
&
num_threads_wo_
,
ck
::
index_t
&
num_threads_k_
)
{
// TODO: only consider multiply of 2 to divide threads
ck
::
index_t
num_threads
=
total_threads
;
num_threads_nho_
=
devide_thread
(
num_threads
,
num_works_nho
,
2
);
num_threads
=
num_threads
/
num_threads_nho_
;
num_threads_n_
=
devide_thread
(
num_threads
,
num_works_n
,
2
);
num_threads
=
num_threads
/
num_threads_n_
;
num_threads_ho_
=
devide_thread
(
num_threads
,
num_works_ho
,
2
);
num_threads
=
num_threads
/
num_threads_ho_
;
num_threads_wo_
=
devide_thread
(
num_threads
,
num_works_wo
,
2
);
num_threads
=
num_threads
/
num_threads_wo_
;
...
...
@@ -442,14 +448,18 @@ struct GridwiseDirectConvNHWCAvx2
// num_threads = num_threads / num_threads_k_;
};
ck
::
index_t
num_threads_nho
;
ck
::
index_t
num_threads_n
;
ck
::
index_t
num_threads_ho
;
ck
::
index_t
num_threads_wo
;
ck
::
index_t
num_threads_k
;
distribute_num_threads_nho_wo_k
(
num_threads_nho
,
num_threads_wo
,
num_threads_k
);
distribute_num_threads_n_ho_wo_k
(
num_threads_n
,
num_threads_ho
,
num_threads_wo
,
num_threads_k
);
const
ck
::
index_t
num_works_nho_per_thread
=
math
::
integer_divide_ceil
(
num_works_nho
,
num_threads_nho
);
const
ck
::
index_t
num_works_n_per_thread
=
math
::
integer_divide_ceil
(
num_works_n
,
num_threads_n
);
const
ck
::
index_t
num_works_ho_per_thread
=
math
::
integer_divide_ceil
(
num_works_ho
,
num_threads_ho
);
const
ck
::
index_t
num_works_wo_per_thread
=
math
::
integer_divide_ceil
(
num_works_wo
,
num_threads_wo
);
const
ck
::
index_t
num_works_k_per_thread
=
...
...
@@ -460,6 +470,14 @@ struct GridwiseDirectConvNHWCAvx2
// num_threads_nho, num_threads_wo, num_threads_k, num_works_nho_per_thread,
// num_works_wo_per_thread, num_works_k_per_thread); fflush(stdout);
if
((
X
-
1
)
*
Dx
+
1
<=
Px
||
(
Y
-
1
)
*
Dy
+
1
<=
Py
)
{
// padding zero case, outpout will have zero due to upsampling
// TODO: This is ugly and slow
ck
::
cpu
::
avx2_util
::
memset32_avx2
(
&
c_grid_buf
.
p_data_
[
0
],
0
,
N
*
Ho
*
Wo
*
K
);
// printf("___ clear\n");
}
if
(
dynamic_tunable
.
loop_over_spec
==
ck
::
tensor_operation
::
cpu
::
device
::
ConvolutionForwardBlockLoopOverSpecialization_t
::
LoopOver_MNK
)
...
...
@@ -495,10 +513,14 @@ struct GridwiseDirectConvNHWCAvx2
UseCLocalBuffer
?
c_block_mem
.
mMemSize
/
sizeof
(
FloatC
)
:
c_grid_desc
.
GetElementSpaceSize
());
const
ck
::
index_t
tid
=
omp_get_thread_num
();
const
ck
::
index_t
tid_k
=
tid
%
num_threads_k
;
const
ck
::
index_t
tid_wo
=
(
tid
/
num_threads_k
)
%
num_threads_wo
;
const
ck
::
index_t
tid_nho
=
tid
/
(
num_threads_k
*
num_threads_wo
);
ck
::
index_t
tid
=
omp_get_thread_num
();
const
ck
::
index_t
tid_n
=
tid
%
num_threads_n
;
tid
/=
num_threads_n
;
const
ck
::
index_t
tid_ho
=
tid
%
num_threads_ho
;
tid
/=
num_threads_ho
;
const
ck
::
index_t
tid_wo
=
tid
%
num_threads_wo
;
tid
/=
num_threads_wo
;
const
ck
::
index_t
tid_k
=
tid
;
ck
::
cpu
::
ThreadwiseGemmParam
param
;
// param.Kr = k_per_block;
...
...
@@ -510,35 +532,43 @@ struct GridwiseDirectConvNHWCAvx2
// ihi = iho * s_stride_h + iy * s_dilation_h - s_pad_h
// iwi = iwo * s_stride_w + ix * s_dilation_w - s_pad_w
ck
::
index_t
i_nho
=
tid_nho
*
num_works_nho_per_thread
;
ck
::
index_t
i_ho
=
i_nho
%
Ho
;
ck
::
index_t
i_n
=
i_nho
/
Ho
;
auto
accumulate_n_ho
=
[
&
]()
{
i_ho
++
;
if
(
i_ho
>=
Wo
)
{
i_ho
=
0
;
i_n
++
;
}
};
for
(;
(
i_nho
<
(
tid_nho
+
1
)
*
num_works_nho_per_thread
)
&&
(
i_nho
<
num_works_nho
);
i_nho
+=
1
,
accumulate_n_ho
())
// ck::index_t i_nho = tid_nho * num_works_nho_per_thread;
// ck::index_t i_ho = i_nho % Ho;
// ck::index_t i_n = i_nho / Ho;
// auto accumulate_n_ho = [&]() {
// i_ho++;
// if(i_ho >= Wo)
// {
// i_ho = 0;
// i_n++;
// }
// };
for
(
ck
::
index_t
i_n
=
tid_n
*
num_works_n_per_thread
;
(
i_n
<
(
tid_n
+
1
)
*
num_works_n_per_thread
)
&&
i_n
<
num_works_n
;
i_n
+=
1
)
{
for
(
ck
::
index_t
i_ho
=
tid_ho
*
num_works_ho_per_thread
;
(
i_ho
<
(
tid_ho
+
1
)
*
num_works_ho_per_thread
)
&&
i_ho
<
num_works_ho
;
i_ho
+=
1
)
{
// for input
ck
::
index_t
i_hi_no_y
=
i_ho
*
Sy
-
Py
;
for
(
ck
::
index_t
i_wo
=
tid_wo
*
num_works_wo_per_thread
*
m_per_thread
;
i_wo
<
(
tid_wo
+
1
)
*
num_works_wo_per_thread
*
m_per_thread
;
i_wo
<
(
tid_wo
+
1
)
*
num_works_wo_per_thread
*
m_per_thread
&&
i_wo
<
Wo
;
i_wo
+=
m_per_thread
)
{
ck
::
index_t
current_wo_size_no_dx
=
ck
::
math
::
min
(
Wo
-
i_wo
,
m_per_thread
);
ck
::
index_t
current_wo_size_no_dx
=
ck
::
math
::
min
(
Wo
-
i_wo
,
m_per_thread
);
ck
::
index_t
i_wi_no_x
=
i_wo
*
Sx
-
Px
;
// printf("-- i_nho:%d, i_wo:%d, num_works_nho:%d,
// num_threads_nho:%d(Hi:%d,nWi:%d)\n",
// i_nho, i_wo, num_works_nho, num_threads_nho, Hi, Wi);fflush(stdout);
// i_nho, i_wo, num_works_nho, num_threads_nho, Hi,
// Wi);fflush(stdout);
for
(
ck
::
index_t
i_k
=
tid_k
*
num_works_k_per_thread
*
n_per_thread
;
i_k
<
(
tid_k
+
1
)
*
num_works_k_per_thread
*
n_per_thread
;
...
...
@@ -569,8 +599,8 @@ struct GridwiseDirectConvNHWCAvx2
ck
::
index_t
i_wi
=
i_wi_no_x
+
i_dx
;
ck
::
index_t
current_wo_size
=
current_wo_size_no_dx
;
ck
::
index_t
pad_wo_size
=
0
;
// when left pad, we may never have
a
//
chance to clear zero (like
ck
::
index_t
pad_wo_size
=
0
;
// when left pad, we may never have
// a
chance to clear zero (like
// padding) we need to manually clear that
if
(
i_wi
<
0
)
...
...
@@ -588,8 +618,8 @@ struct GridwiseDirectConvNHWCAvx2
current_i_wo
+=
steps_wo_turn_possitive
;
if
(
!
accmulate_c
)
pad_wo_size
=
steps_wo_turn_possitive
;
// if already accumulating,
no
//
need to manually set
steps_wo_turn_possitive
;
// if already accumulating,
// no
need to manually set
i_wi
+=
steps_wo_turn_possitive
*
Sx
;
// now i_wi will be a possitive number
}
...
...
@@ -602,8 +632,8 @@ struct GridwiseDirectConvNHWCAvx2
{
// printf(" ->[r] i_wi:%d, r:%d(%d), ", i_wi, i_wi +
// ((current_wo_size - 1) * Sx), current_wo_size);
current_wo_size
=
(
Wi
-
1
-
i_wi
)
/
Sx
+
1
;
// NOTE: this be careful why here
current_wo_size
=
(
Wi
-
1
-
i_wi
)
/
Sx
+
1
;
// NOTE: this be careful why here
// should be compute like this.
if
(
current_wo_size
<=
0
)
continue
;
...
...
@@ -621,10 +651,12 @@ struct GridwiseDirectConvNHWCAvx2
i_wo_pad
++
)
{
const
intptr_t
offset_c
=
GetCBlockStartOffset
(
c_grid_desc
,
i_nho
*
Wo
+
i_wo_pad
,
i_k
);
c_grid_desc
,
(
i_n
*
Ho
+
i_ho
)
*
Wo
+
i_wo_pad
,
i_k
);
// printf("pad_wo_size:%d, current_k_block_size:%d,
clear
//
offset_c:%d\n",
// printf("pad_wo_size:%d, current_k_block_size:%d,
// clear
offset_c:%d\n",
// pad_wo_size, current_k_size,
// offset_c);fflush(stdout);
ck
::
cpu
::
avx2_util
::
memset32_avx2
(
...
...
@@ -636,16 +668,16 @@ struct GridwiseDirectConvNHWCAvx2
const
intptr_t
offset_b
=
GetBBlockStartOffset
(
b_grid_desc
,
i_yxc
,
i_k
);
const
intptr_t
offset_c
=
GetCBlockStartOffset
(
c_grid_desc
,
i_nho
*
Wo
+
current_i_wo
,
i_k
);
c_grid_desc
,
(
i_n
*
Ho
+
i_
ho
)
*
Wo
+
current_i_wo
,
i_k
);
// printf("offset_a:%lu, offset_b:%lu, offset_c:%lu, i_n:%d,
// i_hi:%d, i_wi:%d, i_dx:%d, i_dy:%d, i_k:%d, i_ho:%d, i_wo:%d,
// current_wo_size:%d, current_k_size:%d, i_nho:%d, lda:%d,
ldb:%d,
//
ldc:%d, acc:%d\n",
// offset_a, offset_b, offset_c, i_n, i_hi, i_wi, i_dx,
i_dy,
// i_k, i_ho, current_i_wo, current_wo_size,
current_k_size,
// i_nho, param.lda / sizeof(FloatA),
param.ldb /
// sizeof(FloatB), param.ldc / sizeof(FloatC),
// current_wo_size:%d, current_k_size:%d, i_nho:%d, lda:%d,
// ldb:%d,
ldc:%d, acc:%d\n",
// offset_a, offset_b, offset_c, i_n, i_hi, i_wi, i_dx,
//
i_dy,
i_k, i_ho, current_i_wo, current_wo_size,
//
current_k_size,
i_nho, param.lda / sizeof(FloatA),
//
param.ldb /
sizeof(FloatB), param.ldc / sizeof(FloatC),
// param.accmulate_c); fflush(stdout);
param
.
p_a
=
&
a_block_buf
.
p_data_
[
offset_a
];
...
...
@@ -660,6 +692,7 @@ struct GridwiseDirectConvNHWCAvx2
}
}
}
}
else
if
(
dynamic_tunable
.
loop_over_spec
==
ck
::
tensor_operation
::
cpu
::
device
::
ConvolutionForwardBlockLoopOverSpecialization_t
::
LoopOver_MKN
)
...
...
@@ -695,11 +728,6 @@ struct GridwiseDirectConvNHWCAvx2
UseCLocalBuffer
?
c_block_mem
.
mMemSize
/
sizeof
(
FloatC
)
:
c_grid_desc
.
GetElementSpaceSize
());
const
ck
::
index_t
tid
=
omp_get_thread_num
();
const
ck
::
index_t
tid_k
=
tid
%
num_threads_k
;
const
ck
::
index_t
tid_wo
=
(
tid
/
num_threads_k
)
%
num_threads_wo
;
const
ck
::
index_t
tid_nho
=
tid
/
(
num_threads_k
*
num_threads_wo
);
ck
::
cpu
::
ThreadwiseGemmParam
param
;
// param.Kr = k_per_block;
param
.
lda
=
Sx
*
C
*
sizeof
(
FloatA
);
...
...
@@ -710,39 +738,42 @@ struct GridwiseDirectConvNHWCAvx2
// ihi = iho * s_stride_h + iy * s_dilation_h - s_pad_h
// iwi = iwo * s_stride_w + ix * s_dilation_w - s_pad_w
ck
::
index_t
i_nho
=
tid_nho
*
num_works_nho_per_thread
;
ck
::
index_t
i_ho
=
i_nho
%
Ho
;
ck
::
index_t
i_n
=
i_nho
/
Ho
;
auto
accumulate_n_ho
=
[
&
]()
{
i_ho
++
;
if
(
i_ho
>=
Wo
)
{
i_ho
=
0
;
i_n
++
;
}
};
for
(;
(
i_nho
<
(
tid_nho
+
1
)
*
num_works_nho_per_thread
)
&&
(
i_nho
<
num_works_nho
);
i_nho
+=
1
,
accumulate_n_ho
())
ck
::
index_t
tid
=
omp_get_thread_num
();
const
ck
::
index_t
tid_n
=
tid
%
num_threads_n
;
tid
/=
num_threads_n
;
const
ck
::
index_t
tid_ho
=
tid
%
num_threads_ho
;
tid
/=
num_threads_ho
;
const
ck
::
index_t
tid_wo
=
tid
%
num_threads_wo
;
tid
/=
num_threads_wo
;
const
ck
::
index_t
tid_k
=
tid
;
for
(
ck
::
index_t
i_n
=
tid_n
*
num_works_n_per_thread
;
(
i_n
<
(
tid_n
+
1
)
*
num_works_n_per_thread
)
&&
i_n
<
num_works_n
;
i_n
+=
1
)
{
for
(
ck
::
index_t
i_ho
=
tid_ho
*
num_works_ho_per_thread
;
(
i_ho
<
(
tid_ho
+
1
)
*
num_works_ho_per_thread
)
&&
i_ho
<
num_works_ho
;
i_ho
+=
1
)
{
// for input
ck
::
index_t
i_hi_no_y
=
i_ho
*
Sy
-
Py
;
for
(
ck
::
index_t
i_wo
=
tid_wo
*
num_works_wo_per_thread
*
m_per_thread
;
i_wo
<
(
tid_wo
+
1
)
*
num_works_wo_per_thread
*
m_per_thread
;
i_wo
<
(
tid_wo
+
1
)
*
num_works_wo_per_thread
*
m_per_thread
&&
i_wo
<
Wo
;
i_wo
+=
m_per_thread
)
{
ck
::
index_t
current_wo_size_no_dx
=
ck
::
math
::
min
(
Wo
-
i_wo
,
m_per_thread
);
ck
::
index_t
current_wo_size_no_dx
=
ck
::
math
::
min
(
Wo
-
i_wo
,
m_per_thread
);
ck
::
index_t
i_wi_no_x
=
i_wo
*
Sx
-
Px
;
ck
::
index_t
i_dx
=
0
;
ck
::
index_t
i_dy
=
0
;
bool
accmulate_c
=
false
;
// printf("-- [%d] i_nho:%d, i_wo:%d, num_works_n
ho
:%d,
// num_threads_n
ho
:%d(Hi:%d, Wi:%d), current_wo_size_no_dx:%d,
// printf("-- [%d] i_n
:%d, i_
ho:%d, i_wo:%d, num_works_n:%d,
// num_threads_n:%d(Hi:%d, Wi:%d), current_wo_size_no_dx:%d,
// m_per_thread:%d\n",
// tid, i_nho, i_wo, num_works_n
ho
, num_threads_n
ho
, Hi, Wi,
// tid, i_n
, i_
ho, i_wo, num_works_n, num_threads_n, Hi, Wi,
// current_wo_size_no_dx, m_per_thread);fflush(stdout);
auto
accumulate_dy_dx
=
[
&
]()
{
...
...
@@ -759,26 +790,25 @@ struct GridwiseDirectConvNHWCAvx2
{
ck
::
index_t
current_i_wo
=
i_wo
;
ck
::
index_t
i_hi
=
i_hi_no_y
+
i_dy
;
bool
run_pad_only
=
false
;
if
(
i_hi
<
0
||
i_hi
>=
Hi
)
continue
;
ck
::
index_t
i_wi
=
i_wi_no_x
+
i_dx
;
ck
::
index_t
current_wo_size
=
current_wo_size_no_dx
;
ck
::
index_t
pad_wo_size
=
0
;
// when left pad, we may never have a
chance to clear zero (like
ck
::
index_t
pad_wo_size
=
0
;
// when left pad, we may never have a
//
chance to clear zero (like
// padding) we need to manually clear that
/* left corner shift
* when i_wi is negative, need shift i_wo to right to make i_wi
* possitive sx px i_wi steps_wo_turn_possitive 1 0 0,
* 1, 2.... 0 2 0 0, 2, 4... 0 1 1 -1, 0, 1.... 1
* 2 1 -1, 1, 3.... 1
* 2 2 -2, 0, 2... 1
* 2 3 -3, -1, 1... 2
* 3 1 -1, 2, 5... 1
* 3 2 -2, 1, 4.... 1
* 3 3 -3, 0, 3 1
* 3 4 -4, -1, 2... 2
* possitive sx px i_wi steps_wo_turn_possitive
* 1 0
* 0, 1, 2.... 0 2 0 0, 2, 4... 0 1 1 -1,
* 0, 1.... 1 2 1 -1, 1, 3.... 1 2 2 -2, 0, 2... 1 2
* 3 -3, -1, 1... 2 3 1 -1, 2, 5... 1 3 2 -2,
* 1, 4.... 1 3 3 -3, 0, 3 1 3 4 -4,
* -1, 2... 2
*/
if
(
i_wi
<
0
)
{
...
...
@@ -786,23 +816,27 @@ struct GridwiseDirectConvNHWCAvx2
-
i_wi
;
// keep this a possitive number
ck
::
index_t
steps_wo_turn_possitive
=
(
wi_to_zero_length
+
Sx
-
1
)
/
Sx
;
// how many steps need to move wo, to let wi to be possitive
Sx
;
// how many steps need to move wo, to let wi to be
// possitive
current_wo_size
-=
steps_wo_turn_possitive
;
// printf("--- current_wo_size:%d, i_wi:%d\n", current_wo_size,
// i_wi);
if
(
current_wo_size
<=
0
)
continue
;
current_i_wo
+=
steps_wo_turn_possitive
;
if
(
!
accmulate_c
)
pad_wo_size
=
steps_wo_turn_possitive
;
// if already accumulating, no
need
//
to manually set
steps_wo_turn_possitive
;
// if already accumulating, no
// need
to manually set
i_wi
+=
steps_wo_turn_possitive
*
Sx
;
// now i_wi will be a possitive number
}
if
(
i_wi
>=
Wi
)
{
continue
;
}
// shrink right wi/wo
if
((
i_wi
+
((
current_wo_size
-
1
)
*
Sx
))
>=
Wi
)
{
...
...
@@ -823,44 +857,49 @@ struct GridwiseDirectConvNHWCAvx2
if
(
pad_wo_size
!=
0
)
{
// manually clear zero. this may and only may need once along
the
//
gemm_k reduction
// manually clear zero. this may and only may need once along
// the
gemm_k reduction
ck
::
index_t
i_k
=
tid_k
*
num_works_k_per_thread
*
n_per_thread
;
ck
::
index_t
current_k_block_size
=
ck
::
math
::
min
(
K
-
i_k
,
num_works_k_per_thread
*
n_per_thread
);
ck
::
index_t
current_k_block_size
=
ck
::
math
::
min
(
K
-
i_k
,
num_works_k_per_thread
*
n_per_thread
);
const
intptr_t
offset_c
=
GetCBlockStartOffset
(
c_grid_desc
,
i_nho
*
Wo
,
i_k
);
const
intptr_t
offset_c
=
GetCBlockStartOffset
(
c_grid_desc
,
(
i_n
*
Ho
+
i_
ho
)
*
Wo
,
i_k
);
// printf("[%d] pad_wo_size:%d, current_k_block_size:%d,
// offset_c:%d\n",
// tid, pad_wo_size, current_k_block_size,
// offset_c);fflush(stdout);
ck
::
cpu
::
avx2_util
::
memset32_avx2
(
&
c_block_buf
.
p_data_
[
offset_c
],
// offset_c:%d, i_wo:%d\n",
// tid, pad_wo_size, current_k_block_size, offset_c,
// i_wo);fflush(stdout);
ck
::
cpu
::
avx2_util
::
memset32_avx2
(
&
c_block_buf
.
p_data_
[
offset_c
],
0
,
current_k_block_size
*
pad_wo_size
);
current_k_block_size
*
pad_wo_size
);
}
if
(
run_pad_only
)
continue
;
for
(
ck
::
index_t
i_k
=
tid_k
*
num_works_k_per_thread
*
n_per_thread
;
i_k
<
(
tid_k
+
1
)
*
num_works_k_per_thread
*
n_per_thread
;
i_k
+=
n_per_thread
)
{
ck
::
index_t
current_k_size
=
ck
::
math
::
min
(
K
-
i_k
,
n_per_thread
);
ck
::
index_t
current_k_size
=
ck
::
math
::
min
(
K
-
i_k
,
n_per_thread
);
const
intptr_t
offset_a
=
current_input_offset
;
const
intptr_t
offset_b
=
GetBBlockStartOffset
(
b_grid_desc
,
i_yxc
,
i_k
);
const
intptr_t
offset_c
=
GetCBlockStartOffset
(
c_grid_desc
,
i_nho
*
Wo
+
current_i_wo
,
i_k
);
c_grid_desc
,
(
i_n
*
Ho
+
i_
ho
)
*
Wo
+
current_i_wo
,
i_k
);
// printf("[%d] offset_a:%lu, offset_b:%lu, offset_c:%lu, i_n:%d,
// i_hi:%d, i_wi:%d, i_dx:%d, i_dy:%d, i_k:%d, i_ho:%d, i_wo:%d,
// current_wo_size:%d, i_nho:%d, lda:%d, ldb:%d\n",
// printf("[%d] offset_a:%lu, offset_b:%lu, offset_c:%lu,
// i_n:%d, i_hi:%d, i_wi:%d, i_dx:%d, i_dy:%d, i_k:%d, i_ho:%d,
// i_wo:%d, current_wo_size:%d, i_n:%d, i_ho:%d, lda:%d,
// ldb:%d\n",
// tid, offset_a, offset_b, offset_c, i_n, i_hi, i_wi, i_dx,
// i_dy, i_k, i_ho, current_i_wo, current_wo_size, i_n
ho
,
// param.lda / sizeof(FloatA), param.ldb /
sizeof(FloatB));
// fflush(stdout);
// i_dy, i_k, i_ho, current_i_wo, current_wo_size, i_n,
//
i_ho,
param.lda / sizeof(FloatA), param.ldb /
//
sizeof(FloatB));
fflush(stdout);
param
.
p_a
=
&
a_block_buf
.
p_data_
[
offset_a
];
param
.
p_b
=
&
b_block_buf
.
p_data_
[
offset_b
];
...
...
@@ -875,6 +914,7 @@ struct GridwiseDirectConvNHWCAvx2
}
}
}
}
};
}
// namespace cpu
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment