Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b2888adf
Commit
b2888adf
authored
Feb 15, 2019
by
Chao Liu
Browse files
change file extension to hip.hpp and hip.cpp
parent
a414e3fd
Changes
35
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
124 additions
and
135 deletions
+124
-135
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp
...idwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp
+13
-15
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp
...implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp
+11
-12
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp
..._convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp
+11
-12
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp
...idwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp
+18
-21
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp
...idwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp
+17
-20
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp
...idwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp
+9
-9
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp
...mm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp
+9
-9
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp
...idwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp
+7
-7
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp
...it_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp
+10
-11
src/include/gridwise_winograd_convolution.hip.hpp
src/include/gridwise_winograd_convolution.hip.hpp
+14
-15
src/include/tensor.hpp
src/include/tensor.hpp
+2
-1
src/include/threadwise_2d_tensor_op.hip.hpp
src/include/threadwise_2d_tensor_op.hip.hpp
+1
-1
src/include/threadwise_4d_tensor_op.hip.hpp
src/include/threadwise_4d_tensor_op.hip.hpp
+1
-1
src/include/threadwise_direct_convolution.hip.hpp
src/include/threadwise_direct_convolution.hip.hpp
+1
-1
src/include/threadwise_gemm.hip.hpp
src/include/threadwise_gemm.hip.hpp
+0
-0
No files found.
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -199,8 +199,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
threadwise_4d_tensor_set_zero
(
out_hkwn_thread_desc
,
p_out_thread
);
const
Float
*
p_in_global_block_begin
=
p_in_global
+
in_chwn_global_desc
.
Get1dIndex
(
p_in_global
+
in_chwn_global_desc
.
Get1dIndex
(
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_begin
=
...
...
@@ -258,8 +257,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
out_hkwn_thread_desc
,
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -283,8 +283,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
out_hkwn_thread_desc
,
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -339,8 +339,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
out_hkwn_thread_desc
,
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -160,8 +160,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
in_nchw_global_desc
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
...
...
@@ -245,8 +244,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_hkwn_thread_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
@@ -263,8 +261,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_nkhw_thread_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -166,8 +166,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
in_nchw_global_desc
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
...
...
@@ -180,9 +179,8 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
#if 1
// weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed
blockwise_wei_copy
.
Run
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
c_block_data_begin
,
k_block_data_begin
),
blockwise_wei_copy
.
Run
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
c_block_data_begin
,
k_block_data_begin
),
p_wei_block
);
#endif
...
...
@@ -219,8 +217,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
out_hkwn_thread_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
// define B = flatten(N, Hi, Wi)
template
<
unsigned
GridSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
// define B = flatten(N, Hi, Wi)
template
<
unsigned
GridSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
// define B = flatten(N, Hi, Wi)
template
<
unsigned
GridSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
// define B = N*Hi*Wi
template
<
unsigned
GridSize
,
...
...
@@ -220,9 +220,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
#if 1
// preload next data
// input: global mem to LDS,
blockwise_in_copy
.
Run
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
c_block_data_begin
+
CPerBlock
,
b_block_data_begin
),
blockwise_in_copy
.
Run
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
c_block_data_begin
+
CPerBlock
,
b_block_data_begin
),
p_in_block_next
);
#endif
...
...
src/include/gridwise_winograd_convolution.
cuh
→
src/include/gridwise_winograd_convolution.
hip.hpp
View file @
b2888adf
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "blockwise_winograd_transform.
cuh
"
#include "threadwise_winograd_transform.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "blockwise_winograd_transform.
hip.hpp
"
#include "threadwise_winograd_transform.
hip.hpp
"
template
<
class
Float
,
class
InGlobalDesc
,
...
...
@@ -189,16 +189,15 @@ __global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_
S
,
R
,
OutTileSizeH
,
OutTileSizeW
>
(
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
OutTileSizeW
>
(
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
c_thread_data
,
y_thread_data_begin
*
InTileSizeH
,
x_thread_data_begin
*
InTileSizeW
),
wei_transform_thread_block_desc
,
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
out_transform_thread_desc
,
p_out_transform_thread
);
}
...
...
src/include/tensor.hpp
View file @
b2888adf
...
...
@@ -22,7 +22,8 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
return
os
;
}
typedef
enum
{
typedef
enum
{
Half
=
0
,
Float
=
1
,
}
DataType_t
;
...
...
src/include/threadwise_2d_tensor_op.
cuh
→
src/include/threadwise_2d_tensor_op.
hip.hpp
View file @
b2888adf
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
template
<
class
Float
,
class
Desc
,
class
F
>
__device__
void
threadwise_2d_tensor_pointwise_operation_unary
(
Desc
,
Float
*
__restrict__
p
,
F
f
)
...
...
src/include/threadwise_4d_tensor_op.
cuh
→
src/include/threadwise_4d_tensor_op.
hip.hpp
View file @
b2888adf
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
template
<
class
Float
,
class
Desc
,
class
F
>
__device__
void
threadwise_4d_tensor_pointwise_operation_unary
(
Desc
,
Float
*
__restrict__
p
,
F
f
)
...
...
src/include/threadwise_direct_convolution.
cuh
→
src/include/threadwise_direct_convolution.
hip.hpp
View file @
b2888adf
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
// optimized for scenario if p_in, p_wei, p_out are in register
template
<
class
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
...
...
src/include/threadwise_gemm.
cuh
→
src/include/threadwise_gemm.
hip.hpp
View file @
b2888adf
File moved
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment