Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b2888adf
Commit
b2888adf
authored
Feb 15, 2019
by
Chao Liu
Browse files
change file extension to hip.hpp and hip.cpp
parent
a414e3fd
Changes
35
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
124 additions
and
135 deletions
+124
-135
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp
...idwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp
+13
-15
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp
...implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp
+11
-12
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp
..._convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp
+11
-12
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp
...idwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp
+18
-21
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp
...idwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp
+17
-20
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp
...idwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp
+9
-9
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp
...mm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp
+9
-9
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp
...idwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp
+7
-7
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp
...it_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp
+10
-11
src/include/gridwise_winograd_convolution.hip.hpp
src/include/gridwise_winograd_convolution.hip.hpp
+14
-15
src/include/tensor.hpp
src/include/tensor.hpp
+2
-1
src/include/threadwise_2d_tensor_op.hip.hpp
src/include/threadwise_2d_tensor_op.hip.hpp
+1
-1
src/include/threadwise_4d_tensor_op.hip.hpp
src/include/threadwise_4d_tensor_op.hip.hpp
+1
-1
src/include/threadwise_direct_convolution.hip.hpp
src/include/threadwise_direct_convolution.hip.hpp
+1
-1
src/include/threadwise_gemm.hip.hpp
src/include/threadwise_gemm.hip.hpp
+0
-0
No files found.
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "common.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
cuh
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
unsigned
BlockSize
,
...
@@ -199,9 +199,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
...
@@ -199,9 +199,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
threadwise_4d_tensor_set_zero
(
out_hkwn_thread_desc
,
p_out_thread
);
threadwise_4d_tensor_set_zero
(
out_hkwn_thread_desc
,
p_out_thread
);
const
Float
*
p_in_global_block_begin
=
const
Float
*
p_in_global_block_begin
=
p_in_global
+
p_in_global
+
in_chwn_global_desc
.
Get1dIndex
(
in_chwn_global_desc
.
Get1dIndex
(
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_begin
=
const
Float
*
p_wei_global_block_begin
=
p_wei_global
+
wei_csrk_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
p_wei_global
+
wei_csrk_global_desc
.
Get1dIndex
(
0
,
0
,
0
,
k_block_data_begin
);
...
@@ -258,11 +257,10 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
...
@@ -258,11 +257,10 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
out_hkwn_thread_desc
,
out_hkwn_thread_desc
,
p_out_thread
,
p_out_thread
,
out_khwn_global_desc
,
out_khwn_global_desc
,
p_out_global
+
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
n_block_data_begin
+
n_thread_data_begin
),
out_hkwn_thread_desc
.
GetLengths
(),
out_hkwn_thread_desc
.
GetLengths
(),
reorder_khwn_from_hkwn
);
reorder_khwn_from_hkwn
);
}
}
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "common.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
cuh
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
unsigned
BlockSize
,
...
@@ -283,11 +283,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
...
@@ -283,11 +283,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
out_hkwn_thread_desc
,
out_hkwn_thread_desc
,
p_out_thread
,
p_out_thread
,
out_khwn_global_desc
,
out_khwn_global_desc
,
p_out_global
+
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
n_block_data_begin
+
n_thread_data_begin
),
out_hkwn_thread_desc
.
GetLengths
(),
out_hkwn_thread_desc
.
GetLengths
(),
reorder_khwn_from_hkwn
);
reorder_khwn_from_hkwn
);
}
}
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "common.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
cuh
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
unsigned
BlockSize
,
...
@@ -339,11 +339,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
...
@@ -339,11 +339,10 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
out_hkwn_thread_desc
,
out_hkwn_thread_desc
,
p_out_thread
,
p_out_thread
,
out_khwn_global_desc
,
out_khwn_global_desc
,
p_out_global
+
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
n_block_data_begin
+
n_thread_data_begin
),
out_hkwn_thread_desc
.
GetLengths
(),
out_hkwn_thread_desc
.
GetLengths
(),
reorder_khwn_from_hkwn
);
reorder_khwn_from_hkwn
);
}
}
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "common.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
cuh
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
unsigned
BlockSize
,
...
@@ -160,11 +160,10 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
...
@@ -160,11 +160,10 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
in_nchw_global_desc
,
in_nchw_global_desc
,
p_in_global
+
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
wi_block_data_begin
),
in_chwn_block_desc
,
in_chwn_block_desc
,
p_in_block
,
p_in_block
,
in_nchw_block_desc
.
GetLengths
(),
in_nchw_block_desc
.
GetLengths
(),
...
@@ -245,11 +244,10 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
...
@@ -245,11 +244,10 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_hkwn_thread_desc
,
out_hkwn_thread_desc
,
p_out_thread
,
p_out_thread
,
out_nkhw_global_desc
,
out_nkhw_global_desc
,
p_out_global
+
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
wo_block_data_begin
+
wo_thread_data_begin
),
out_hkwn_thread_desc
.
GetLengths
(),
out_hkwn_thread_desc
.
GetLengths
(),
reorder_nkhw_from_hkwn
);
reorder_nkhw_from_hkwn
);
#else
#else
...
@@ -263,11 +261,10 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
...
@@ -263,11 +261,10 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_nkhw_thread_desc
,
out_nkhw_thread_desc
,
p_out_thread
,
p_out_thread
,
out_nkhw_global_desc
,
out_nkhw_global_desc
,
p_out_global
+
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
wo_block_data_begin
+
wo_thread_data_begin
),
out_nkhw_thread_desc
.
GetLengths
());
out_nkhw_thread_desc
.
GetLengths
());
#endif
#endif
}
}
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "common.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
cuh
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
unsigned
BlockSize
,
...
@@ -166,11 +166,10 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
...
@@ -166,11 +166,10 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
in_nchw_global_desc
,
in_nchw_global_desc
,
p_in_global
+
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
wi_block_data_begin
),
in_chwn_block_desc
,
in_chwn_block_desc
,
p_in_block
,
p_in_block
,
in_nchw_block_desc
.
GetLengths
(),
in_nchw_block_desc
.
GetLengths
(),
...
@@ -180,10 +179,9 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
...
@@ -180,10 +179,9 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
#if 1
#if 1
// weight: global mem to LDS,
// weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed
// format is [S,R,C,K], no conversion needed
blockwise_wei_copy
.
Run
(
blockwise_wei_copy
.
Run
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
p_wei_global
+
0
,
0
,
c_block_data_begin
,
k_block_data_begin
),
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
c_block_data_begin
,
k_block_data_begin
),
p_wei_block
);
p_wei_block
);
#endif
#endif
__syncthreads
();
__syncthreads
();
...
@@ -219,11 +217,10 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
...
@@ -219,11 +217,10 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
out_hkwn_thread_desc
,
out_hkwn_thread_desc
,
p_out_thread
,
p_out_thread
,
out_nkhw_global_desc
,
out_nkhw_global_desc
,
p_out_global
+
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
wo_block_data_begin
+
wo_thread_data_begin
),
out_hkwn_thread_desc
.
GetLengths
(),
out_hkwn_thread_desc
.
GetLengths
(),
reorder_nkhw_from_hkwn
);
reorder_nkhw_from_hkwn
);
}
}
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "common.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
cuh
"
#include "blockwise_gemm.
hip.hpp
"
// define B = flatten(N, Hi, Wi)
// define B = flatten(N, Hi, Wi)
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
...
@@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
...
@@ -121,7 +121,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
decltype(in_cb_block_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
decltype(in_cb_block_desc.GetLengths())>{};
#elif
0
#elif
0
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
),
...
@@ -129,7 +129,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
...
@@ -129,7 +129,7 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim1
>
{};
InBlockCopyThreadPerDim1
>
{};
#elif 1
#elif 1
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
Float
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
),
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "common.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
cuh
"
#include "blockwise_gemm.
hip.hpp
"
// define B = flatten(N, Hi, Wi)
// define B = flatten(N, Hi, Wi)
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
...
@@ -121,7 +121,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
...
@@ -121,7 +121,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
decltype(in_cb_block_desc),
decltype(in_cb_block_desc),
decltype(in_cb_block_desc.GetLengths())>{};
decltype(in_cb_block_desc.GetLengths())>{};
#elif
0
#elif
0
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy2
<
BlockSize
,
Float
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
),
...
@@ -129,7 +129,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
...
@@ -129,7 +129,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim0
,
InBlockCopyThreadPerDim1
>
{};
InBlockCopyThreadPerDim1
>
{};
#elif 1
#elif 1
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy3
<
BlockSize
,
Float
,
Float
,
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_global_desc
),
decltype
(
in_cb_block_desc
),
decltype
(
in_cb_block_desc
),
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "common.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
cuh
"
#include "blockwise_gemm.
hip.hpp
"
// define B = flatten(N, Hi, Wi)
// define B = flatten(N, Hi, Wi)
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "common.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
cuh
"
#include "blockwise_gemm.
hip.hpp
"
// define B = N*Hi*Wi
// define B = N*Hi*Wi
template
<
unsigned
GridSize
,
template
<
unsigned
GridSize
,
...
@@ -220,10 +220,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
...
@@ -220,10 +220,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
#if 1
#if 1
// preload next data
// preload next data
// input: global mem to LDS,
// input: global mem to LDS,
blockwise_in_copy
.
Run
(
blockwise_in_copy
.
Run
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
p_in_global
+
c_block_data_begin
+
CPerBlock
,
b_block_data_begin
),
in_cb_global_desc
.
Get1dIndex
(
c_block_data_begin
+
CPerBlock
,
b_block_data_begin
),
p_in_block_next
);
p_in_block_next
);
#endif
#endif
#if 1
#if 1
...
...
src/include/gridwise_winograd_convolution.
cuh
→
src/include/gridwise_winograd_convolution.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "blockwise_winograd_transform.
cuh
"
#include "blockwise_winograd_transform.
hip.hpp
"
#include "threadwise_winograd_transform.
cuh
"
#include "threadwise_winograd_transform.
hip.hpp
"
template
<
class
Float
,
template
<
class
Float
,
class
InGlobalDesc
,
class
InGlobalDesc
,
...
@@ -189,18 +189,17 @@ __global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_
...
@@ -189,18 +189,17 @@ __global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_
S
,
S
,
R
,
R
,
OutTileSizeH
,
OutTileSizeH
,
OutTileSizeW
>
(
OutTileSizeW
>
(
in_transform_thread_block_desc
,
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
p_in_transform_block
+
n_thread_data_begin
,
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
c_thread_data
,
c_thread_data
,
y_thread_data_begin
*
InTileSizeH
,
y_thread_data_begin
*
InTileSizeH
,
x_thread_data_begin
*
InTileSizeW
),
x_thread_data_begin
*
InTileSizeW
),
wei_transform_thread_block_desc
,
wei_transform_thread_block_desc
,
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
p_wei_transform_block
+
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
out_transform_thread_desc
,
out_transform_thread_desc
,
p_out_transform_thread
);
p_out_transform_thread
);
}
}
};
};
...
...
src/include/tensor.hpp
View file @
b2888adf
...
@@ -22,7 +22,8 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
...
@@ -22,7 +22,8 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
return
os
;
return
os
;
}
}
typedef
enum
{
typedef
enum
{
Half
=
0
,
Half
=
0
,
Float
=
1
,
Float
=
1
,
}
DataType_t
;
}
DataType_t
;
...
...
src/include/threadwise_2d_tensor_op.
cuh
→
src/include/threadwise_2d_tensor_op.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
template
<
class
Float
,
class
Desc
,
class
F
>
template
<
class
Float
,
class
Desc
,
class
F
>
__device__
void
threadwise_2d_tensor_pointwise_operation_unary
(
Desc
,
Float
*
__restrict__
p
,
F
f
)
__device__
void
threadwise_2d_tensor_pointwise_operation_unary
(
Desc
,
Float
*
__restrict__
p
,
F
f
)
...
...
src/include/threadwise_4d_tensor_op.
cuh
→
src/include/threadwise_4d_tensor_op.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
template
<
class
Float
,
class
Desc
,
class
F
>
template
<
class
Float
,
class
Desc
,
class
F
>
__device__
void
threadwise_4d_tensor_pointwise_operation_unary
(
Desc
,
Float
*
__restrict__
p
,
F
f
)
__device__
void
threadwise_4d_tensor_pointwise_operation_unary
(
Desc
,
Float
*
__restrict__
p
,
F
f
)
...
...
src/include/threadwise_direct_convolution.
cuh
→
src/include/threadwise_direct_convolution.
hip.hpp
View file @
b2888adf
#pragma once
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
// optimized for scenario if p_in, p_wei, p_out are in register
// optimized for scenario if p_in, p_wei, p_out are in register
template
<
class
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
template
<
class
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
...
...
src/include/threadwise_gemm.
cuh
→
src/include/threadwise_gemm.
hip.hpp
View file @
b2888adf
File moved
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment