Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
f4d67cf8
Commit
f4d67cf8
authored
Nov 03, 2022
by
Qianfeng Zhang
Browse files
Move common expression 1/N out of the static_for loops
parent
7a190876
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
10 additions
and
6 deletions
+10
-6
include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
...ultiblock_reduce_second_half_batchnorm_backward_final.hpp
+5
-3
include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
...pu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
+5
-3
No files found.
include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp
View file @
f4d67cf8
...
@@ -473,6 +473,9 @@ struct GridwiseReduceSecondHalfBatchNormBackwardFinal
...
@@ -473,6 +473,9 @@ struct GridwiseReduceSecondHalfBatchNormBackwardFinal
constexpr
auto
xy_thread_copy_step_m_k
=
make_multi_index
(
0
,
K_BlockTileSize
);
constexpr
auto
xy_thread_copy_step_m_k
=
make_multi_index
(
0
,
K_BlockTileSize
);
AccDataType
inv_reduce_size
=
type_convert
<
AccDataType
>
(
1.0
)
/
type_convert
<
AccDataType
>
(
reduce_size
);
for
(
index_t
reducedTiles
=
0
;
reducedTiles
<
num_xy_k_block_tile_iteration
;
++
reducedTiles
)
for
(
index_t
reducedTiles
=
0
;
reducedTiles
<
num_xy_k_block_tile_iteration
;
++
reducedTiles
)
{
{
threadwise_x_load
.
Run
(
x_grid_desc_m_k
,
threadwise_x_load
.
Run
(
x_grid_desc_m_k
,
...
@@ -488,9 +491,8 @@ struct GridwiseReduceSecondHalfBatchNormBackwardFinal
...
@@ -488,9 +491,8 @@ struct GridwiseReduceSecondHalfBatchNormBackwardFinal
dy_thread_buf
);
dy_thread_buf
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
AccDataType
multiplier
=
type_convert
<
AccDataType
>
(
1.0
)
/
AccDataType
multiplier
=
type_convert
<
AccDataType
>
(
reduce_size
)
*
inv_reduce_size
*
inv_var_thread_buf
[
iM
]
*
scale_thread_buf
[
iM
];
inv_var_thread_buf
[
iM
]
*
scale_thread_buf
[
iM
];
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
constexpr
auto
offset
=
constexpr
auto
offset
=
...
...
include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp
View file @
f4d67cf8
...
@@ -506,6 +506,9 @@ struct GridwiseBatchNormBackwardWithBlockwiseWelford
...
@@ -506,6 +506,9 @@ struct GridwiseBatchNormBackwardWithBlockwiseWelford
threadwise_dy_load
.
MoveSrcSliceWindow
(
dy_grid_desc_m_k
,
thread_copy_bwd_step_m_k
);
threadwise_dy_load
.
MoveSrcSliceWindow
(
dy_grid_desc_m_k
,
thread_copy_bwd_step_m_k
);
threadwise_dx_store
.
MoveDstSliceWindow
(
dx_grid_desc_m_k
,
thread_copy_tail_m_k
);
threadwise_dx_store
.
MoveDstSliceWindow
(
dx_grid_desc_m_k
,
thread_copy_tail_m_k
);
AccDataType
inv_reduce_size
=
type_convert
<
AccDataType
>
(
1.0
)
/
type_convert
<
AccDataType
>
(
reduce_size
);
for
(
index_t
reducedTiles
=
0
;
reducedTiles
<
num_k_block_tile_iteration
;
++
reducedTiles
)
for
(
index_t
reducedTiles
=
0
;
reducedTiles
<
num_k_block_tile_iteration
;
++
reducedTiles
)
{
{
threadwise_x_load
.
Run
(
x_grid_desc_m_k
,
threadwise_x_load
.
Run
(
x_grid_desc_m_k
,
...
@@ -521,9 +524,8 @@ struct GridwiseBatchNormBackwardWithBlockwiseWelford
...
@@ -521,9 +524,8 @@ struct GridwiseBatchNormBackwardWithBlockwiseWelford
dy_thread_buf
);
dy_thread_buf
);
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
static_for
<
0
,
MThreadSliceSize
,
1
>
{}([
&
](
auto
iM
)
{
AccDataType
multiplier
=
type_convert
<
AccDataType
>
(
1.0
)
/
AccDataType
multiplier
=
type_convert
<
AccDataType
>
(
reduce_size
)
*
inv_reduce_size
*
inv_var_thread_buf
[
iM
]
*
scale_thread_buf
[
iM
];
inv_var_thread_buf
[
iM
]
*
scale_thread_buf
[
iM
];
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
static_for
<
0
,
KThreadSliceSize
,
1
>
{}([
&
](
auto
iK
)
{
constexpr
auto
offset
=
constexpr
auto
offset
=
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment