Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
acd7082f
Commit
acd7082f
authored
May 21, 2019
by
Chao Liu
Browse files
adding ConstantMergedTensorDescriptor, refactering ConstantTensorDescriptor, Sequence
parent
cd29b09a
Changes
38
Hide whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
370 additions
and
291 deletions
+370
-291
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp
...ise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp
+24
-23
src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
...ise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
+38
-33
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
...plicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
+31
-29
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
...plicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
+67
-39
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
...plicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
+19
-15
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
...ise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
+41
-35
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
...ise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
+21
-17
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
...dwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
+7
-5
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
...implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
+18
-15
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
...dwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
+5
-4
src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
...se_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
+33
-30
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
...implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
+10
-9
src/include/tensor.hpp
src/include/tensor.hpp
+3
-3
src/include/threadwise_2d_tensor_op.hip.hpp
src/include/threadwise_2d_tensor_op.hip.hpp
+4
-4
src/include/threadwise_4d_tensor_op.hip.hpp
src/include/threadwise_4d_tensor_op.hip.hpp
+4
-4
src/include/threadwise_direct_convolution.hip.hpp
src/include/threadwise_direct_convolution.hip.hpp
+13
-10
src/include/threadwise_gemm.hip.hpp
src/include/threadwise_gemm.hip.hpp
+6
-6
src/include/threadwise_tensor_slice_op.hip.hpp
src/include/threadwise_tensor_slice_op.hip.hpp
+26
-10
No files found.
src/include/gridwise_convolution_implicit_gemm_v1r2_nchw_cyxk_khwn.hip.hpp
View file @
acd7082f
...
...
@@ -221,11 +221,11 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
#if 0
const Float* p_in_global_block_offset =
p_in_global + in_n_c_h_w_global_desc.Get
1d
Index(
p_in_global + in_n_c_h_w_global_desc.Get
OffsetFromMulti
Index(
n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);
const Float* p_wei_global_block_offset =
p_wei_global + wei_c_y_x_k_global_desc.Get
1d
Index(0, 0, 0, k_block_data_begin);
p_wei_global + wei_c_y_x_k_global_desc.Get
OffsetFromMulti
Index(0, 0, 0, k_block_data_begin);
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
...
...
@@ -234,20 +234,20 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
for(index_t y = 0; y < Y; ++y)
{
blockwise_in_copy_reorder.Run(p_in_global_block_offset +
in_n_c_h_w_global_desc.Get
1d
Index(0, 0, y, 0),
in_n_c_h_w_global_desc.Get
OffsetFromMulti
Index(0, 0, y, 0),
p_in_block);
blockwise_wei_copy.Run(p_wei_global_block_offset +
wei_c_y_x_k_global_desc.Get
1d
Index(0, y, 0, 0),
wei_c_y_x_k_global_desc.Get
OffsetFromMulti
Index(0, y, 0, 0),
p_wei_block);
__syncthreads();
for(index_t x = 0; x < X; ++x)
{
blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get
1d
Index(0, x, 0),
blockwise_batch_gemm.Run(p_wei_block + wei_c_x_k_block_desc.Get
OffsetFromMulti
Index(0, x, 0),
p_in_block +
in_c_h_w_n_block_desc.Get
1d
Index(0, 0, x, 0),
in_c_h_w_n_block_desc.Get
OffsetFromMulti
Index(0, 0, x, 0),
p_out_thread);
}
...
...
@@ -259,11 +259,12 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
{
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_n_c_h_w_global_desc
.
Get
1d
Index
(
in_n_c_h_w_global_desc
.
Get
OffsetFromMulti
Index
(
n_block_data_begin
,
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
0
,
k_block_data_begin
);
for
(
index_t
c_block_data_begin
=
0
;
...
...
@@ -287,10 +288,10 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_c_x_k_block_desc
.
Get1dIndex
(
0
,
x
,
0
),
p_in_block
+
in_c_h_w_n_block_desc
.
Get
1d
Index
(
0
,
0
,
x
,
0
),
p_out_thread
);
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_c_x_k_block_desc
.
GetOffsetFromMultiIndex
(
0
,
x
,
0
),
p_in_block
+
in_c_h_w_n_block_desc
.
Get
OffsetFromMulti
Index
(
0
,
0
,
x
,
0
),
p_out_thread
);
}
__syncthreads
();
...
...
@@ -336,16 +337,16 @@ struct GridwiseConvolutionImplicitGemm_v1r2_nchw_cyxk_khwn
}
#endif
threadwise_10d_tensor_copy
(
out_10d
_thread
_desc
,
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
threadwise_10d_tensor_copy
(
out_10d_thread_desc
,
p_out
_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
}
};
src/include/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn.hip.hpp
View file @
acd7082f
...
...
@@ -82,7 +82,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndex
(
get_block_1d_id
());
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
const
index_t
n_block_data_begin
=
block_work_multi_id
[
0
]
*
NPerBlock
;
const
index_t
k_block_data_begin
=
block_work_multi_id
[
1
]
*
KPerBlock
;
...
...
@@ -225,11 +226,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#if 1
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_c_h_w_n_global_desc
.
Get
1d
Index
(
in_c_h_w_n_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
0
,
0
,
k_block_data_begin
);
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
p_in_global_block_offset
+=
CPerBlock
*
in_c_h_w_n_global_desc
.
GetStride
(
I0
),
...
...
@@ -240,13 +242,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
#pragma unroll
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
blockwise_in_copy
.
Run
(
p_in_global_block_offset
+
in_c_h_w_n_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block
);
blockwise_in_copy
.
Run
(
p_in_global_block_offset
+
in_c_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
0
),
p_in_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_wei_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
0
),
p_wei_block
);
__syncthreads
();
...
...
@@ -263,11 +267,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
{
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_c_h_w_n_global_desc
.
Get
1d
Index
(
in_c_h_w_n_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
+
x
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
k_block_data_begin
);
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
...
...
@@ -347,17 +352,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
}
#endif
threadwise_tensor_slice_copy
(
out_10d
_thread
_desc
,
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
p_out
_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
}).
else_
([
&
](
auto
f_dummy
)
{
static_assert
(
f_dummy
(
GemmNPerThreadSubC
)
>=
NPerBlock
&&
NPerThread
==
NPerBlock
&&
GemmNPerThreadSubC
%
NPerThread
==
0
,
...
...
@@ -397,17 +402,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn
}
#endif
threadwise_tensor_slice_copy
(
out_10d
_thread
_desc
,
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
p_out
_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
});
}
};
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_chwn_cyxk_khwn.hip.hpp
View file @
acd7082f
...
...
@@ -85,10 +85,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
index_t
NBlockWork
=
mod_conv
::
integer_divide_ceil
(
N
,
NPerBlock
);
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
block_work_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
KBlockWork
,
HBlockWork
,
WBlockWork
,
NBlockWork
>
{});
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndex
(
get_block_1d_id
());
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
const
index_t
k_block_data_begin
=
block_work_multi_id
[
0
]
*
KPerBlock
;
const
index_t
ho_block_data_begin
=
block_work_multi_id
[
1
]
*
HoPerBlock
;
...
...
@@ -108,7 +109,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
GemmDataPerReadA
,
GemmDataPerReadB
);
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_align
ed
(
constexpr
auto
in_c_h_w_n_block_desc
=
make_
ranked_
ConstantTensorDescriptor_
with_
align
ment
(
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Number
<
InBlockCopyDataPerRead_N
>
{});
...
...
@@ -117,12 +118,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
"GemmDataPerReadB alignment requirement is not meet"
);
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_align
ed
(
constexpr
auto
wei_c_k_block_desc
=
make_
ranked_
ConstantTensorDescriptor_
with_
align
ment
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
// tensor view of threadwise output in register
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
out_k_h_w_n_thread_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
// blockwise copy
...
...
@@ -243,11 +244,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
{
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_c_h_w_n_global_desc
.
Get
1d
Index
(
in_c_h_w_n_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
+
x
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
k_block_data_begin
);
// LDS double buffer: preload data into LDS
{
...
...
@@ -399,17 +401,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
}
#endif
threadwise_tensor_slice_copy
(
out_10d
_thread
_desc
,
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
p_out
_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
}).
else_
([
&
](
auto
fwd
)
{
static_assert
(
fwd
(
GemmNPerThreadSubC
)
>=
NPerBlock
&&
NPerThread
==
NPerBlock
&&
GemmNPerThreadSubC
%
NPerThread
==
0
,
...
...
@@ -450,17 +452,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_chwn_cyxk_khwn
}
#endif
threadwise_tensor_slice_copy
(
out_10d
_thread
_desc
,
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
p_out
_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
});
}
};
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_khwn.hip.hpp
View file @
acd7082f
...
...
@@ -86,10 +86,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr
index_t
HBlockWork
=
mod_conv
::
integer_divide_ceil
(
Ho
,
HoPerBlock
);
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
block_work_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndex
(
get_block_1d_id
());
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
const
index_t
n_block_data_begin
=
block_work_multi_id
[
0
]
*
NPerBlock
;
const
index_t
k_block_data_begin
=
block_work_multi_id
[
1
]
*
KPerBlock
;
...
...
@@ -101,7 +102,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
// global tensor view
constexpr
auto
wei_c_k_global_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
C
,
K
>
{},
Sequence
<
Y
*
X
*
K
,
1
>
{});
make_
ranked_
ConstantTensorDescriptor
(
Sequence
<
C
,
K
>
{},
Sequence
<
Y
*
X
*
K
,
1
>
{});
// LDS tensor view
// be careful of alignment
...
...
@@ -110,7 +111,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
GemmDataPerReadA
,
GemmDataPerReadB
);
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_align
ed
(
constexpr
auto
in_c_h_w_n_block_desc
=
make_
ranked_
ConstantTensorDescriptor_
with_
align
ment
(
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Number
<
InBlockReorderDataPerWrite_N
>
{});
...
...
@@ -119,12 +120,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
"GemmDataPerReadB alignment requirement is not meet"
);
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_align
ed
(
constexpr
auto
wei_c_k_block_desc
=
make_
ranked_
ConstantTensorDescriptor_
with_
align
ment
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
// tensor view of threadwise output in register
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
out_k_h_w_n_thread_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
// blockwise copy
...
...
@@ -241,11 +242,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
{
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_n_c_h_w_global_desc
.
Get
1d
Index
(
in_n_c_h_w_global_desc
.
Get
OffsetFromMulti
Index
(
n_block_data_begin
,
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
+
x
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
k_block_data_begin
);
// LDS double buffer: preload data into LDS
{
...
...
@@ -359,13 +361,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
const
index_t
wo_thread_data_begin
=
c_thread_mtx_begin
.
col
/
NPerBlock
;
const
index_t
n_thread_data_begin
=
c_thread_mtx_begin
.
col
%
NPerBlock
;
static_if
<
GemmNPerThreadSubC
<=
NPerBlock
>
{}([
&
](
auto
f
_dummy
)
{
// f
_dummy
do nothing but
static_if
<
GemmNPerThreadSubC
<=
NPerBlock
>
{}([
&
](
auto
f
wd
)
{
// f
wd
do nothing but
// perfect forwarding.
// Using this trick to
// make this lambda a generic lambda, so it won't be compiled until
// instantiated
static_assert
(
(
f
_dummy
(
GemmNPerThreadSubC
)
<=
NPerBlock
&&
NPerBlock
%
GemmNPerThreadSubC
==
0
),
(
f
wd
(
GemmNPerThreadSubC
)
<=
NPerBlock
&&
NPerBlock
%
GemmNPerThreadSubC
==
0
),
"wrong!"
);
// output is a 10d tensor
...
...
@@ -373,12 +375,13 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr
index_t
N1
=
NPerBlock
/
N2
;
constexpr
index_t
W2
=
(
GemmNLevel0Cluster
*
GemmNLevel1Cluster
)
/
f
_dummy
(
NPerBlock
/
GemmNPerThreadSubC
);
(
GemmNLevel0Cluster
*
GemmNLevel1Cluster
)
/
f
wd
(
NPerBlock
/
GemmNPerThreadSubC
);
constexpr
index_t
W1
=
WoPerBlock
/
W2
;
constexpr
index_t
K2
=
GemmMPerThreadSubC
;
constexpr
index_t
K1
=
KPerBlock
/
KPerThread
;
#if 0
constexpr auto out_10d_global_desc =
make_ConstantTensorDescriptor(Sequence<K / (K1 * K2),
K1,
...
...
@@ -387,12 +390,23 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
Wo / (W1 * W2),
W1,
W2,
N
/
f
_dummy
(
N1
*
N2
),
N / f
wd
(N1 * N2),
N1,
N2>{});
constexpr auto out_10d_thread_desc = make_ConstantTensorDescriptor(
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, 1, 1, N2>{});
#else
constexpr
auto
out_10d_global_desc
=
fwd
(
out_k_h_w_n_global_desc
)
.
Fold
(
I3
,
Number
<
N1
>
{},
Number
<
N2
>
{})
.
Fold
(
I2
,
Number
<
W1
>
{},
Number
<
W2
>
{})
.
Fold
(
I0
,
Number
<
K1
>
{},
Number
<
K2
>
{});
constexpr
auto
out_10d_thread_desc
=
fwd
(
out_k_h_w_n_thread_desc
)
.
Fold
(
I3
,
Number
<
1
>
{},
Number
<
N2
>
{})
.
Fold
(
I2
,
Number
<
W1
>
{},
Number
<
1
>
{})
.
Fold
(
I0
,
Number
<
1
>
{},
Number
<
K2
>
{});
#endif
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
...
...
@@ -407,19 +421,19 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
}
#endif
threadwise_tensor_slice_copy
(
out_10d
_thread
_desc
,
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
}).
else_
([
&
](
auto
f
_dummy
)
{
static_assert
(
f
_dummy
(
GemmNPerThreadSubC
)
>=
NPerBlock
&&
NPerThread
==
NPerBlock
&&
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
p_out
_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
}).
else_
([
&
](
auto
f
wd
)
{
static_assert
(
f
wd
(
GemmNPerThreadSubC
)
>=
NPerBlock
&&
NPerThread
==
NPerBlock
&&
GemmNPerThreadSubC
%
NPerThread
==
0
,
"wrong!"
);
...
...
@@ -428,16 +442,30 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
constexpr
index_t
W3
=
GemmNPerThreadSubC
/
NPerBlock
;
constexpr
index_t
W2
=
GemmNLevel0Cluster
*
GemmNLevel1Cluster
;
constexpr
index_t
W1
=
WoPerBlock
/
f
_dummy
(
W2
*
W3
);
constexpr
index_t
W1
=
WoPerBlock
/
f
wd
(
W2
*
W3
);
constexpr
index_t
K2
=
GemmMPerThreadSubC
;
constexpr
index_t
K1
=
KPerBlock
/
KPerThread
;
constexpr
auto
out_10d_global_desc
=
make_ConstantTensorDescriptor
(
#if 0
constexpr auto out_10d_global_desc = make_packed_ConstantTensorDescriptor(
Sequence<K / (K1 * K2), K1, K2, Ho, Wo / (W1 * W2 * W3), W1, W2, W3, N / N1, N1>{});
constexpr
auto
out_10d_thread_desc
=
make_ConstantTensorDescriptor
(
constexpr auto out_10d_thread_desc = make_
packed_
ConstantTensorDescriptor(
Sequence<KPerThread / K2, 1, K2, HoPerThread, 1, W1, 1, W3, 1, N1>{});
#else
constexpr
auto
out_10d_global_desc
=
fwd
(
out_k_h_w_n_global_desc
)
.
Fold
(
I3
,
Number
<
N1
>
{})
.
Fold
(
I2
,
Number
<
W1
>
{},
Number
<
W2
>
{},
Number
<
W3
>
{})
.
Fold
(
I0
,
Number
<
K1
>
{},
Number
<
K2
>
{});
constexpr
auto
out_10d_thread_desc
=
fwd
(
out_k_h_w_n_thread_desc
)
.
Fold
(
I3
,
Number
<
N1
>
{})
.
Fold
(
I2
,
Number
<
W1
>
{},
Number
<
1
>
{},
Number
<
W3
>
{})
.
Fold
(
I0
,
Number
<
1
>
{},
Number
<
K2
>
{});
#endif
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
...
...
@@ -457,17 +485,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_khwn
}
#endif
threadwise_tensor_slice_copy
(
out_10d
_thread
_desc
,
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
p_out
_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
});
}
};
src/include/gridwise_convolution_implicit_gemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw.hip.hpp
View file @
acd7082f
...
...
@@ -86,10 +86,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
constexpr
index_t
HBlockWork
=
mod_conv
::
integer_divide_ceil
(
Ho
,
HoPerBlock
);
constexpr
index_t
WBlockWork
=
mod_conv
::
integer_divide_ceil
(
Wo
,
WoPerBlock
);
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
block_work_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndex
(
get_block_1d_id
());
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
const
index_t
n_block_data_begin
=
block_work_multi_id
[
0
]
*
NPerBlock
;
const
index_t
k_block_data_begin
=
block_work_multi_id
[
1
]
*
KPerBlock
;
...
...
@@ -109,7 +110,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
GemmDataPerReadA
,
GemmDataPerReadB
);
constexpr
auto
in_c_h_w_n_block_desc
=
make_ConstantTensorDescriptor_align
ed
(
constexpr
auto
in_c_h_w_n_block_desc
=
make_
ranked_
ConstantTensorDescriptor_
with_
align
ment
(
Sequence
<
CPerBlock
,
HoPerBlock
,
WoPerBlock
,
NPerBlock
>
{},
Number
<
InBlockReorderDataPerWrite_N
>
{});
...
...
@@ -118,12 +119,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
static_assert
(
in_c_h_w_n_block_desc
.
GetStride
(
I1
)
%
GemmDataPerReadB
==
0
,
"GemmDataPerReadB alignment requirement is not meet"
);
constexpr
auto
wei_c_k_block_desc
=
make_ConstantTensorDescriptor_align
ed
(
constexpr
auto
wei_c_k_block_desc
=
make_
ranked_
ConstantTensorDescriptor_
with_
align
ment
(
Sequence
<
CPerBlock
,
KPerBlock
>
{},
Number
<
mod_conv
::
max
(
WeiBlockCopyDataPerRead_K
,
GemmDataPerReadA
)
>
{});
// tensor view of threadwise output in register
constexpr
auto
out_k_h_w_n_thread_desc
=
make_ConstantTensorDescriptor
(
constexpr
auto
out_k_h_w_n_thread_desc
=
make_
packed_
ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
HoPerThread
,
WoPerThread
,
NPerThread
>
{});
// blockwise copy
...
...
@@ -240,11 +241,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
{
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_n_c_h_w_global_desc
.
Get
1d
Index
(
in_n_c_h_w_global_desc
.
Get
OffsetFromMulti
Index
(
n_block_data_begin
,
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
+
x
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
k_block_data_begin
);
// LDS double buffer: preload data into LDS
{
...
...
@@ -407,10 +409,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_n_k_h_w_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_n_k_h_w_global_desc
.
GetOffsetFromMultiIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
map_out_global2thread
);
// Number<OutThreadCopyDataPerWrite_W>{});
...
...
@@ -461,10 +464,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_lds_double_buffer_nchw_cyxk_nkhw
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_n_k_h_w_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_n_k_h_w_global_desc
.
GetOffsetFromMultiIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
map_out_global2thread
);
// Number<OutThreadCopyDataPerWrite_W>{});
...
...
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_khwn.hip.hpp
View file @
acd7082f
...
...
@@ -236,11 +236,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
#if 1
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_n_c_h_w_global_desc
.
Get
1d
Index
(
in_n_c_h_w_global_desc
.
Get
OffsetFromMulti
Index
(
n_block_data_begin
,
0
,
hi_block_data_begin
,
wi_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
0
,
0
,
k_block_data_begin
);
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
p_in_global_block_offset
+=
CPerBlock
*
in_n_c_h_w_global_desc
.
GetStride
(
I1
),
...
...
@@ -251,23 +252,27 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
for
(
index_t
x
=
0
;
x
<
X
;
++
x
)
{
#if 1
blockwise_in_copy_reorder
.
Run
(
p_in_global_block_offset
+
in_n_c_h_w_global_desc
.
Get1dIndex
(
0
,
0
,
y
,
x
),
p_in_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_wei_block
);
blockwise_in_copy_reorder
.
Run
(
p_in_global_block_offset
+
in_n_c_h_w_global_desc
.
GetOffsetFromMultiIndex
(
0
,
0
,
y
,
x
),
p_in_block
);
blockwise_wei_copy
.
Run
(
p_wei_global_block_offset
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
0
),
p_wei_block
);
#else
Float
p_in_clipboard
[
blockwise_in_copy_reorder
.
GetRegisterClipboardSize
()];
Float
p_wei_clipboard
[
blockwise_wei_copy
.
GetRegisterClipboardSize
()];
blockwise_in_copy_reorder
.
RunLoadRegisterClipboard
(
p_in_global_block_offset
+
in_n_c_h_w_global_desc
.
Get1dIndex
(
0
,
0
,
y
,
x
),
p_in_global_block_offset
+
in_n_c_h_w_global_desc
.
GetOffsetFromMultiIndex
(
0
,
0
,
y
,
x
),
p_in_clipboard
);
blockwise_wei_copy
.
RunLoadRegisterClipboard
(
p_wei_global_block_offset
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_wei_global_block_offset
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
0
),
p_wei_clipboard
);
blockwise_wei_copy
.
RunStoreRegisterClipboard
(
p_wei_clipboard
,
p_wei_block
);
...
...
@@ -291,11 +296,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
{
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_n_c_h_w_global_desc
.
Get
1d
Index
(
in_n_c_h_w_global_desc
.
Get
OffsetFromMulti
Index
(
n_block_data_begin
,
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
+
x
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
k_block_data_begin
);
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
...
...
@@ -390,17 +396,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
}
#endif
threadwise_tensor_slice_copy
(
out_10d
_thread
_desc
,
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
p_out
_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
}).
else_
([
&
](
auto
f_dummy
)
{
static_assert
(
f_dummy
(
GemmNPerThreadSubC
)
>=
NPerBlock
&&
NPerThread
==
NPerBlock
&&
GemmNPerThreadSubC
%
NPerThread
==
0
,
...
...
@@ -440,17 +446,17 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_khwn
}
#endif
threadwise_tensor_slice_copy
(
out_10d
_thread
_desc
,
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
threadwise_tensor_slice_copy
(
out_10d_thread_desc
,
p_out
_thread
,
out_10d_global_desc
,
p_out_global
+
out_k_h_w_n_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite_N
>
{});
});
}
};
src/include/gridwise_convolution_implicit_gemm_v1r3_nchw_cyxk_nkhw.hip.hpp
View file @
acd7082f
...
...
@@ -86,7 +86,8 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NBlockWork
,
KBlockWork
,
HBlockWork
,
WBlockWork
>
{});
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndex
(
get_block_1d_id
());
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
const
index_t
n_block_data_begin
=
block_work_multi_id
[
0
]
*
NPerBlock
;
const
index_t
k_block_data_begin
=
block_work_multi_id
[
1
]
*
KPerBlock
;
...
...
@@ -234,11 +235,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
#if 0
const Float* p_in_global_block_offset =
p_in_global +
in_n_c_h_w_global_desc.Get
1d
Index(
in_n_c_h_w_global_desc.Get
OffsetFromMulti
Index(
n_block_data_begin, 0, hi_block_data_begin, wi_block_data_begin);
const Float* p_wei_global_block_offset =
p_wei_global + wei_c_y_x_k_global_desc.Get
1d
Index(0, 0, 0, k_block_data_begin);
p_wei_global + wei_c_y_x_k_global_desc.Get
OffsetFromMulti
Index(0, 0, 0, k_block_data_begin);
for(index_t c_block_data_begin = 0; c_block_data_begin < C; c_block_data_begin += CPerBlock,
p_in_global_block_offset += CPerBlock * in_n_c_h_w_global_desc.GetStride(I1),
...
...
@@ -250,22 +251,22 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
{
#if 1
blockwise_in_copy_reorder.Run(p_in_global_block_offset +
in_n_c_h_w_global_desc.Get
1d
Index(0, 0, y, x),
in_n_c_h_w_global_desc.Get
OffsetFromMulti
Index(0, 0, y, x),
p_in_block);
blockwise_wei_copy.Run(p_wei_global_block_offset +
wei_c_y_x_k_global_desc.Get
1d
Index(0, y, x, 0),
wei_c_y_x_k_global_desc.Get
OffsetFromMulti
Index(0, y, x, 0),
p_wei_block);
#else
Float p_in_clipboard[blockwise_in_copy_reorder.GetRegisterClipboardSize()];
Float p_wei_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
blockwise_in_copy_reorder.RunLoadRegisterClipboard(
p_in_global_block_offset + in_n_c_h_w_global_desc.Get
1d
Index(0, 0, y, x),
p_in_global_block_offset + in_n_c_h_w_global_desc.Get
OffsetFromMulti
Index(0, 0, y, x),
p_in_clipboard);
blockwise_wei_copy.RunLoadRegisterClipboard(
p_wei_global_block_offset + wei_c_y_x_k_global_desc.Get
1d
Index(0, y, x, 0),
p_wei_global_block_offset + wei_c_y_x_k_global_desc.Get
OffsetFromMulti
Index(0, y, x, 0),
p_wei_clipboard);
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_clipboard, p_wei_block);
...
...
@@ -289,11 +290,12 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
{
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_n_c_h_w_global_desc
.
Get
1d
Index
(
in_n_c_h_w_global_desc
.
Get
OffsetFromMulti
Index
(
n_block_data_begin
,
0
,
hi_block_data_begin
+
y
,
wi_block_data_begin
+
x
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get1dIndex
(
0
,
y
,
x
,
k_block_data_begin
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
k_block_data_begin
);
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
...
...
@@ -395,10 +397,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_n_k_h_w_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_n_k_h_w_global_desc
.
GetOffsetFromMultiIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
map_out_global2thread
);
// Number<OutThreadCopyDataPerWrite_W>{});
...
...
@@ -444,10 +447,11 @@ struct GridwiseConvolutionImplicitGemm_v1r3_nchw_cyxk_nkhw
p_out_thread
,
out_10d_global_desc
,
p_out_global
+
out_n_k_h_w_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_n_k_h_w_global_desc
.
GetOffsetFromMultiIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_10d_thread_desc
.
GetLengths
(),
map_out_global2thread
);
// Number<OutThreadCopyDataPerWrite_W>{});
...
...
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hip.hpp
View file @
acd7082f
...
...
@@ -193,10 +193,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
__shared__
Float
p_wei_block
[
wei_block_space
];
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_cb_global_desc
.
Get
1d
Index
(
0
,
b_block_data_begin
);
p_in_global
+
in_cb_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
b_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_cyxk_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_cyxk_global_desc
.
GetOffsetFromMultiIndex
(
0
,
0
,
0
,
k_block_data_begin
);
// register
Float
p_out_thread
[
out_kb_thread_desc
.
GetElementSpace
()];
...
...
@@ -236,7 +237,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
#elif 1
blockwise_gemm
.
Run_asm
#endif
(
p_wei_block
+
wei_cyxk_block_desc
.
Get
1d
Index
(
0
,
y
,
x
,
0
),
(
p_wei_block
+
wei_cyxk_block_desc
.
Get
OffsetFromMulti
Index
(
0
,
y
,
x
,
0
),
p_in_block
+
y
*
Wi
+
x
,
p_out_thread
);
}
...
...
@@ -267,8 +268,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
if
(
n_data
<
N
&&
h_data
<
Ho
&&
w_data
<
Wo
)
{
p_out_global
[
out_khwn_global_desc
.
Get1dIndex
(
k_data
,
h_data
,
w_data
,
n_data
)]
=
p_out_thread
[
out_kb_thread_desc
.
Get1dIndex
(
k
,
b
)];
p_out_global
[
out_khwn_global_desc
.
GetOffsetFromMultiIndex
(
k_data
,
h_data
,
w_data
,
n_data
)]
=
p_out_thread
[
out_kb_thread_desc
.
GetOffsetFromMultiIndex
(
k
,
b
)];
}
}
}
...
...
src/include/gridwise_convolution_implicit_gemm_v2_chwn_cyxk_khwn_lds_double_buffer.hip.hpp
View file @
acd7082f
...
...
@@ -198,10 +198,11 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
__shared__
Float
p_wei_block_double
[
2
*
wei_block_space
];
const
Float
*
p_in_global_block_offset
=
p_in_global
+
in_cb_global_desc
.
Get
1d
Index
(
0
,
b_block_data_begin
);
p_in_global
+
in_cb_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
b_block_data_begin
);
const
Float
*
p_wei_global_block_offset
=
p_wei_global
+
wei_cyxk_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_cyxk_global_desc
.
GetOffsetFromMultiIndex
(
0
,
0
,
0
,
k_block_data_begin
);
// preload data into LDS
{
...
...
@@ -269,7 +270,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
#elif 0
blockwise_gemm
.
Run_asm
#endif
(
p_wei_block_now
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
(
p_wei_block_now
+
wei_cyxk_block_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
0
),
p_in_block_now
+
y
*
Wi
+
x
,
p_out_thread
);
}
...
...
@@ -310,7 +312,8 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
#elif 0
blockwise_gemm
.
Run_asm
#endif
(
p_wei_block_double
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
(
p_wei_block_double
+
wei_cyxk_block_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
0
),
p_in_block_double
+
y
*
Wi
+
x
,
p_out_thread
);
}
...
...
@@ -336,7 +339,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
blockwise_gemm
.
Run_asm
#endif
(
p_wei_block_double
+
wei_block_space
+
wei_cyxk_block_desc
.
Get
1d
Index
(
0
,
y
,
x
,
0
),
wei_cyxk_block_desc
.
Get
OffsetFromMulti
Index
(
0
,
y
,
x
,
0
),
p_in_block_double
+
in_block_space
+
y
*
Wi
+
x
,
p_out_thread
);
}
...
...
@@ -365,14 +368,14 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
constexpr
auto
out_kb_global_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
K
,
B
>
{});
threadwise_6d_tensor_copy
(
out_
6d_
thread
_desc
,
p_out_thread
,
out
_6d
_global
_desc
,
p_
out_global
+
out_kb_global_desc
.
Get1dIndex
(
k_thread_data_begin
,
b_thread_data_begin
),
out_6d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite
>
{});
threadwise_6d_tensor_copy
(
out_6d_thread_desc
,
p_
out_thread
,
out_6d_global_desc
,
p_
out_global
+
out_
kb_
global
_desc
.
GetOffsetFromMultiIndex
(
k_thread_data_begin
,
b_thread_data_begin
),
out_6d_thread_desc
.
GetLengths
(),
Number
<
OutThreadCopyDataPerWrite
>
{});
}
else
{
...
...
@@ -393,9 +396,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
if
(
n_data
<
N
&&
h_data
<
Ho
&&
w_data
<
Wo
)
{
p_out_global
[
out_khwn_global_desc
.
Get
1d
Index
(
p_out_global
[
out_khwn_global_desc
.
Get
OffsetFromMulti
Index
(
k_data
,
h_data
,
w_data
,
n_data
)]
=
p_out_thread
[
out_kb_thread_desc
.
Get
1d
Index
(
k
,
b
)];
p_out_thread
[
out_kb_thread_desc
.
Get
OffsetFromMulti
Index
(
k
,
b
)];
}
}
}
...
...
src/include/gridwise_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hip.hpp
View file @
acd7082f
...
...
@@ -83,7 +83,8 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
constexpr
auto
block_work_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KBlockWork
,
BBlockWork
>
{});
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndex
(
get_block_1d_id
());
const
auto
block_work_multi_id
=
block_work_desc
.
GetMultiIndexFrom1dIndex
(
get_block_1d_id
());
const
index_t
k_block_data_on_global
=
block_work_multi_id
[
0
]
*
KPerBlock
;
const
index_t
b_block_data_on_global
=
block_work_multi_id
[
1
]
*
BPerBlock
;
...
...
@@ -219,10 +220,10 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
{
// calculate origin of block input and weight tensor on global memory
const
Float
*
p_in_block_on_global
=
p_in_global
+
in_n_c_h_w_global_desc
.
Get
1d
Index
(
0
,
0
,
y
,
x
);
p_in_global
+
in_n_c_h_w_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
0
,
y
,
x
);
const
Float
*
p_wei_block_on_global
=
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get
1d
Index
(
0
,
y
,
x
,
0
);
p_wei_global
+
wei_c_y_x_k_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
y
,
x
,
0
);
for
(
index_t
c_block_data_on_global
=
0
;
...
...
@@ -285,7 +286,7 @@ struct GridwiseConvolutionImplicitGemm_v3_nchw_cyxk_nkhw
// origin of thread tensor in global memory
const
index_t
p_out_thread_on_global
=
p_out_global
+
out_k_n1_b_n2_global_merged_desc
.
Get
1d
Index
(
out_k_n1_b_n2_global_merged_desc
.
Get
OffsetFromMulti
Index
(
k_thread_data_on_global
,
0
,
0
,
0
);
// dst origin on merged global tensor
// copy
...
...
src/include/gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw.hip.hpp
View file @
acd7082f
...
...
@@ -190,18 +190,19 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
c_block_data_begin
+=
CPerBlock
,
__syncthreads
())
{
// copy input tensor to LDS
blockwise_in_copy
.
Run
(
p_in_vec_global
+
in_nchw_vec_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
p_in_vec_block
);
blockwise_in_copy
.
Run
(
p_in_vec_global
+
in_nchw_vec_global_desc
.
GetOffsetFromMultiIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
p_in_vec_block
);
// copy weight tensor to LDS
blockwise_wei_copy
.
Run
(
p_wei_vec_global
+
wei_kcyx_vec_global_desc
.
Get1dIndex
(
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
p_wei_vec_block
);
blockwise_wei_copy
.
Run
(
p_wei_vec_global
+
wei_kcyx_vec_global_desc
.
GetOffsetFromMultiIndex
(
k_block_data_begin
,
c_block_data_begin
,
0
,
0
),
p_wei_vec_block
);
__syncthreads
();
...
...
@@ -212,26 +213,28 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
threadwise_direct_convolution_2
(
in_nchw_vec_thread_block_desc
,
p_in_vec_block
+
in_nchw_vec_block_desc
.
Get
1d
Index
(
n_thread_data_begin
,
c_thread_data
,
hi_thread_data_begin
,
wi_thread_data_begin
),
in_nchw_vec_block_desc
.
Get
OffsetFromMulti
Index
(
n_thread_data_begin
,
c_thread_data
,
hi_thread_data_begin
,
wi_thread_data_begin
),
wei_kcyx_vec_thread_block_desc
,
p_wei_vec_block
+
wei_kcyx_vec_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
wei_kcyx_vec_block_desc
.
GetOffsetFromMultiIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
out_nkhw_thread_desc
,
p_out_thread
);
#elif 0
threadwise_direct_convolution_3
(
in_nchw_vec_thread_block_desc
,
p_in_vec_block
+
in_nchw_vec_block_desc
.
Get
1d
Index
(
n_thread_data_begin
,
c_thread_data
,
hi_thread_data_begin
,
wi_thread_data_begin
),
in_nchw_vec_block_desc
.
Get
OffsetFromMulti
Index
(
n_thread_data_begin
,
c_thread_data
,
hi_thread_data_begin
,
wi_thread_data_begin
),
wei_kcyx_vec_thread_block_desc
,
p_wei_vec_block
+
wei_kcyx_vec_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
wei_kcyx_vec_block_desc
.
GetOffsetFromMultiIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
out_nkhw_thread_desc
,
p_out_thread
);
#endif
...
...
@@ -239,14 +242,14 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
}
// copy output tensor from register to global mem
threadwise_4d_tensor_copy
(
out_nkhw
_thread
_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_nkhw_thread_desc
.
GetLengths
());
threadwise_4d_tensor_copy
(
out_nkhw_thread_desc
,
p_out
_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
GetOffsetFromMultiIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
out_nkhw_thread_desc
.
GetLengths
());
}
src/include/gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded.hip.hpp
View file @
acd7082f
...
...
@@ -217,7 +217,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
threadwise_4d_tensor_set_zero
(
out_hkwn_thread_desc
,
p_out_thread
);
const
Float
*
p_wei_global_block_begin
=
p_wei_global
+
wei_ek_global_desc
.
Get
1d
Index
(
0
,
k_block_data_begin
);
p_wei_global
+
wei_ek_global_desc
.
Get
OffsetFromMulti
Index
(
0
,
k_block_data_begin
);
for
(
index_t
c_block_data_begin
=
0
;
c_block_data_begin
<
C
;
c_block_data_begin
+=
CPerBlock
,
p_wei_global_block_begin
+=
CPerBlock
*
wei_ek_global_desc
.
GetStride
(
I0
),
...
...
@@ -251,10 +251,11 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
{
auto
f_accum
=
[](
auto
&
acc
,
const
auto
&&
v
)
{
acc
+=
v
;
};
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_cyxk_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_in_block
+
in_chwn_block_desc
.
Get1dIndex
(
0
,
y
,
x
,
0
),
p_out_thread
,
f_accum
);
blockwise_batch_gemm
.
Run
(
p_wei_block
+
wei_cyxk_block_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
0
),
p_in_block
+
in_chwn_block_desc
.
GetOffsetFromMultiIndex
(
0
,
y
,
x
,
0
),
p_out_thread
,
f_accum
);
}
}
}
...
...
@@ -284,10 +285,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_cyxk_khwn_padded(
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get
1d
Index
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_khwn_global_desc
.
Get
OffsetFromMulti
Index
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
out_hkwn_thread_desc
.
GetLengths
(),
reorder_khwn_from_hkwn
);
}
src/include/tensor.hpp
View file @
acd7082f
...
...
@@ -93,7 +93,7 @@ struct TensorDescriptor
const
std
::
vector
<
std
::
size_t
>&
GetStrides
()
const
;
template
<
class
...
Is
>
std
::
size_t
Get
1d
Index
(
Is
...
is
)
const
std
::
size_t
Get
OffsetFromMulti
Index
(
Is
...
is
)
const
{
assert
(
sizeof
...(
Is
)
==
this
->
GetNumOfDimension
());
std
::
initializer_list
<
std
::
size_t
>
iss
{
static_cast
<
std
::
size_t
>
(
is
)...};
...
...
@@ -246,13 +246,13 @@ struct Tensor
template
<
class
...
Is
>
T
&
operator
()(
Is
...
is
)
{
return
mData
[
mDesc
.
Get
1d
Index
(
is
...)];
return
mData
[
mDesc
.
Get
OffsetFromMulti
Index
(
is
...)];
}
template
<
class
...
Is
>
const
T
&
operator
()(
Is
...
is
)
const
{
return
mData
[
mDesc
.
Get
1d
Index
(
is
...)];
return
mData
[
mDesc
.
Get
OffsetFromMulti
Index
(
is
...)];
}
typename
std
::
vector
<
T
>::
iterator
begin
()
{
return
mData
.
begin
();
}
...
...
src/include/threadwise_2d_tensor_op.hip.hpp
View file @
acd7082f
...
...
@@ -20,7 +20,7 @@ __device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __re
{
for
(
index_t
did1
=
0
;
did1
<
desc
.
GetLength
(
I1
);
++
did1
)
{
const
index_t
dindex
=
desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
f
(
p
[
dindex
]);
}
...
...
@@ -53,11 +53,11 @@ __device__ void threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_d
{
for
(
index_t
did1
=
0
;
did1
<
ref_desc
.
GetLength
(
I1
);
++
did1
)
{
const
index_t
aindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
aindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
did
[
2
]
=
{
did0
,
did1
};
const
index_t
bindex
=
dst_desc
.
Get
1d
Index
(
did
[
IR0
],
did
[
IR1
]);
const
index_t
bindex
=
dst_desc
.
Get
OffsetFromMulti
Index
(
did
[
IR0
],
did
[
IR1
]);
f
(
p_src
[
aindex
],
p_dst
[
bindex
]);
}
...
...
@@ -127,7 +127,7 @@ __device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
{
for
(
index_t
did1
=
0
;
did1
<
did1_end
;
++
did1
)
{
const
index_t
dindex
=
desc
.
Get
1d
Index
(
did0
,
did1
);
const
index_t
dindex
=
desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
);
const
index_t
sindex
=
dindex
+
nshift
*
desc
.
GetStride
(
IDim
{});
...
...
src/include/threadwise_4d_tensor_op.hip.hpp
View file @
acd7082f
...
...
@@ -26,7 +26,7 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re
{
for
(
index_t
did3
=
0
;
did3
<
desc
.
GetLength
(
I3
);
++
did3
)
{
const
index_t
dindex
=
desc
.
Get
1d
Index
(
did0
,
did1
,
did2
,
did3
);
const
index_t
dindex
=
desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
,
did2
,
did3
);
f
(
p
[
dindex
]);
}
...
...
@@ -75,12 +75,12 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_given_ds
{
for
(
index_t
did3
=
0
;
did3
<
ref_desc
.
GetLength
(
I3
);
++
did3
)
{
const
index_t
aindex
=
src_desc
.
Get
1d
Index
(
did0
,
did1
,
did2
,
did3
);
const
index_t
aindex
=
src_desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
,
did2
,
did3
);
const
index_t
did
[
4
]
=
{
did0
,
did1
,
did2
,
did3
};
const
index_t
bindex
=
dst_desc
.
Get
1d
Index
(
did
[
IR0
],
did
[
IR1
],
did
[
IR2
],
did
[
IR3
]);
dst_desc
.
Get
OffsetFromMulti
Index
(
did
[
IR0
],
did
[
IR1
],
did
[
IR2
],
did
[
IR3
]);
f
(
p_src
[
aindex
],
p_dst
[
bindex
]);
...
...
@@ -178,7 +178,7 @@ __device__ void threadwise_4d_tensor_shift_down(Desc, Float* __restrict__ p, IDi
{
for
(
index_t
did3
=
0
;
did3
<
did3_end
;
++
did3
)
{
const
index_t
dindex
=
desc
.
Get
1d
Index
(
did0
,
did1
,
did2
,
did3
);
const
index_t
dindex
=
desc
.
Get
OffsetFromMulti
Index
(
did0
,
did1
,
did2
,
did3
);
const
index_t
sindex
=
dindex
+
nshift
*
desc
.
GetStride
(
IDim
{});
...
...
src/include/threadwise_direct_convolution.hip.hpp
View file @
acd7082f
...
...
@@ -46,11 +46,14 @@ __device__ void threadwise_direct_convolution_1(InDesc,
const
index_t
hi
=
ho
+
y
;
const
index_t
wi
=
wo
+
x
;
const
index_t
in_index
=
in_desc
.
Get1dIndex
(
n
,
c
,
hi
,
wi
);
const
index_t
in_index
=
in_desc
.
GetOffsetFromMultiIndex
(
n
,
c
,
hi
,
wi
);
const
index_t
wei_index
=
wei_desc
.
Get1dIndex
(
k
,
c
,
y
,
x
);
const
index_t
wei_index
=
wei_desc
.
GetOffsetFromMultiIndex
(
k
,
c
,
y
,
x
);
const
index_t
out_index
=
out_desc
.
Get1dIndex
(
n
,
k
,
ho
,
wo
);
const
index_t
out_index
=
out_desc
.
GetOffsetFromMultiIndex
(
n
,
k
,
ho
,
wo
);
fused_multiply_accumulate
(
p_out
[
out_index
],
p_wei
[
wei_index
],
p_in
[
in_index
]);
...
...
@@ -143,14 +146,14 @@ __device__ void threadwise_direct_convolution_3(InDesc,
{
// read first input
threadwise_4d_tensor_copy(in_desc,
p_in + in_desc.Get
1d
Index(0, 0, y, 0),
p_in + in_desc.Get
OffsetFromMulti
Index(0, 0, y, 0),
in_reg_desc,
p_in_reg,
in_reg_desc.GetLengths());
// read first 1x1 weight
threadwise_4d_tensor_copy(wei_desc,
p_wei + wei_desc.Get
1d
Index(0, 0, y, 0),
p_wei + wei_desc.Get
OffsetFromMulti
Index(0, 0, y, 0),
wei_reg_desc,
p_wei_reg,
wei_reg_desc.GetLengths());
...
...
@@ -164,7 +167,7 @@ __device__ void threadwise_direct_convolution_3(InDesc,
{
// read new weight
threadwise_4d_tensor_copy(wei_desc,
p_wei + wei_desc.Get
1d
Index(0, 0, y, x),
p_wei + wei_desc.Get
OffsetFromMulti
Index(0, 0, y, x),
wei_reg_desc,
p_wei_reg,
wei_reg_desc.GetLengths());
...
...
@@ -175,10 +178,10 @@ __device__ void threadwise_direct_convolution_3(InDesc,
// read new input
threadwise_4d_tensor_copy(
in_desc,
p_in + in_desc.Get
1d
Index(0, 0, y, x + in_reg_desc.GetLength(I3) - 1),
p_in + in_desc.Get
OffsetFromMulti
Index(0, 0, y, x + in_reg_desc.GetLength(I3) - 1),
in_reg_desc,
p_in_reg +
in_reg_desc.Get
1d
Index(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
in_reg_desc.Get
OffsetFromMulti
Index(0, 0, 0, in_reg_desc.GetLength(I3) - in_w_new_read),
in_desc_reg_new_read.GetLengths());
// do 1x1 conv
...
...
@@ -196,14 +199,14 @@ __device__ void threadwise_direct_convolution_3(InDesc,
{
// read new weight
threadwise_4d_tensor_copy
(
wei_desc
,
p_wei
+
wei_desc
.
Get
1d
Index
(
0
,
0
,
y
,
x
),
p_wei
+
wei_desc
.
Get
OffsetFromMulti
Index
(
0
,
0
,
y
,
x
),
wei_reg_desc
,
p_wei_reg
,
wei_reg_desc
.
GetLengths
());
// read new input
threadwise_4d_tensor_copy
(
in_desc
,
p_in
+
in_desc
.
Get
1d
Index
(
0
,
0
,
y
,
x
),
p_in
+
in_desc
.
Get
OffsetFromMulti
Index
(
0
,
0
,
y
,
x
),
in_reg_desc
,
p_in_reg
,
in_reg_desc
.
GetLengths
());
...
...
src/include/threadwise_gemm.hip.hpp
View file @
acd7082f
...
...
@@ -9,7 +9,7 @@ __device__ void threadwise_matrix_set_zero(Matrix, Float* __restrict__ p_thread)
{
for
(
index_t
j
=
0
;
j
<
Matrix
::
NCol
();
++
j
)
{
const
index_t
id
=
Matrix
::
Get
1d
Index
(
i
,
j
);
const
index_t
id
=
Matrix
::
Get
OffsetFromMulti
Index
(
i
,
j
);
p_thread
[
id
]
=
0
;
}
}
...
...
@@ -39,8 +39,8 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
{
for
(
index_t
j
=
0
;
j
<
NCol
;
j
+=
DataPerRead
)
{
const
index_t
src_index
=
src_mtx
.
Get
1d
Index
(
i
,
j
);
const
index_t
dst_index
=
dst_mtx
.
Get
1d
Index
(
i
,
j
);
const
index_t
src_index
=
src_mtx
.
Get
OffsetFromMulti
Index
(
i
,
j
);
const
index_t
dst_index
=
dst_mtx
.
Get
OffsetFromMulti
Index
(
i
,
j
);
*
reinterpret_cast
<
vector_t
*>
(
&
p_dst
[
dst_index
])
=
*
reinterpret_cast
<
const
vector_t
*>
(
&
p_src
[
src_index
]);
...
...
@@ -83,9 +83,9 @@ __device__ void threadwise_gemm(MatrixA,
{
for
(
index_t
j
=
0
;
j
<
N
;
++
j
)
{
const
index_t
aindex
=
a_mtx
.
Get
1d
Index
(
k
,
i
);
// A is transposed
const
index_t
bindex
=
b_mtx
.
Get
1d
Index
(
k
,
j
);
const
index_t
cindex
=
c_mtx
.
Get
1d
Index
(
i
,
j
);
const
index_t
aindex
=
a_mtx
.
Get
OffsetFromMulti
Index
(
k
,
i
);
// A is transposed
const
index_t
bindex
=
b_mtx
.
Get
OffsetFromMulti
Index
(
k
,
j
);
const
index_t
cindex
=
c_mtx
.
Get
OffsetFromMulti
Index
(
i
,
j
);
p_c_thread
[
cindex
]
+=
p_a_thread
[
aindex
]
*
p_b_thread
[
bindex
];
}
...
...
src/include/threadwise_tensor_slice_op.hip.hpp
View file @
acd7082f
...
...
@@ -19,7 +19,7 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
ref_desc
=
make_ConstantTensorDescriptor
(
SrcOpLengths
{});
constexpr
auto
ref_desc
=
make_
packed_
ConstantTensorDescriptor
(
SrcOpLengths
{});
#if 0
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
...
...
@@ -53,9 +53,9 @@ __device__ void threadwise_tensor_slice_copy(SrcDesc,
static_for
<
0
,
nRead
,
1
>
{}([
&
](
auto
IRead
)
{
constexpr
auto
multi_id
=
decltype
(
Ids
){}.
PushBack
(
Number
<
IRead
.
Get
()
*
DataPerRead
>
{});
const
index_t
src_index
=
src_desc
.
Get
1d
Index
(
multi_id
);
const
index_t
src_index
=
src_desc
.
Get
OffsetFromMulti
Index
(
multi_id
);
const
index_t
dst_index
=
dst_desc
.
Get
1d
Index
(
multi_id
);
const
index_t
dst_index
=
dst_desc
.
Get
OffsetFromMulti
Index
(
multi_id
);
*
(
reinterpret_cast
<
vector_t
*>
(
&
p_dst
[
dst_index
]))
=
*
(
reinterpret_cast
<
const
vector_t
*>
(
&
p_src
[
src_index
]));
...
...
@@ -84,9 +84,9 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v1(SrcDesc,
ford
<
SrcOpLengths
>
{}([
&
](
auto
src_multi_id
)
{
const
auto
dst_multi_id
=
reorder_array_given_new2old
(
src_multi_id
,
MapDst2Src
{});
const
index_t
dst_index
=
dst_desc
.
Get
1d
Index
(
dst_multi_id
);
const
index_t
dst_index
=
dst_desc
.
Get
OffsetFromMulti
Index
(
dst_multi_id
);
const
index_t
src_index
=
src_desc
.
Get
1d
Index
(
src_multi_id
);
const
index_t
src_index
=
src_desc
.
Get
OffsetFromMulti
Index
(
src_multi_id
);
p_dst
[
dst_index
]
=
p_src
[
src_index
];
});
...
...
@@ -115,9 +115,9 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v2(SrcDesc,
ford
<
decltype
(
dst_op_lengths
)
>
{}([
&
](
auto
dst_multi_id
)
{
const
auto
src_multi_id
=
reorder_array_given_old2new
(
dst_multi_id
,
MapDst2Src
{});
const
index_t
dst_index
=
dst_desc
.
Get
1d
Index
(
dst_multi_id
);
const
index_t
dst_index
=
dst_desc
.
Get
OffsetFromMulti
Index
(
dst_multi_id
);
const
index_t
src_index
=
src_desc
.
Get
1d
Index
(
src_multi_id
);
const
index_t
src_index
=
src_desc
.
Get
OffsetFromMulti
Index
(
src_multi_id
);
p_dst
[
dst_index
]
=
p_src
[
src_index
];
});
...
...
@@ -177,7 +177,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
const
auto
src_multi_id
=
reorder_array_given_old2new
(
dst_multi_id
,
MapDst2Src
{});
const
index_t
src_index
=
src_desc
.
Get
1d
Index
(
src_multi_id
);
const
index_t
src_index
=
src_desc
.
Get
OffsetFromMulti
Index
(
src_multi_id
);
vector_type
<
Float
,
DstDataPerWrite
>::
SetScalar
(
dst_vec_data
,
p_src
[
src_index
],
IDstData
);
...
...
@@ -186,7 +186,7 @@ threadwise_tensor_slice_copy_reorder_given_dst2src_v3(SrcDesc,
// write data
const
auto
dst_multi_id
=
ids
.
PushBack
(
IWrite
.
Get
()
*
DstDataPerWrite
);
const
index_t
dst_index
=
dst_desc
.
Get
1d
Index
(
dst_multi_id
);
const
index_t
dst_index
=
dst_desc
.
Get
OffsetFromMulti
Index
(
dst_multi_id
);
*
(
reinterpret_cast
<
vector_t
*>
(
&
p_dst
[
dst_index
]))
=
dst_vec_data
;
});
...
...
@@ -204,5 +204,21 @@ threadwise_tensor_slice_copy_generic(SrcDesc,
SliceLengths
,
DimAccessOrder
)
{
// not implemented
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
slice_lengths_in_access_order
=
SliceLengths
{}.
ReorderGivenNew2Old
(
DimAccessOrder
{});
ford
<
decltype
(
slice_lengths_in_access_order
)
>
{}([
&
](
auto
data_multi_id_in_access_order
)
{
const
auto
data_multi_id
=
reorder_array_given_old2new
(
data_multi_id_in_access_order
,
DimAccessOrder
{});
const
index_t
dst_index
=
dst_desc
.
GetOffsetFromMultiIndex
(
src_multi_offset
+
data_multi_id
);
const
index_t
src_index
=
src_desc
.
GetOffsetFromMultiIndex
(
dst_multi_offset
+
data_multi_id
);
p_dst
[
dst_index
]
=
p_src
[
src_index
];
});
}
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment