Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
b2888adf
"docs/source/ko/in_translation.mdx" did not exist on "2a69c0b7b8c4ab2ced90a1784a3eeef6ddf5ae8c"
Commit
b2888adf
authored
Feb 15, 2019
by
Chao Liu
Browse files
change file extension to hip.hpp and hip.cpp
parent
a414e3fd
Changes
35
Show whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
124 additions
and
135 deletions
+124
-135
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp
...idwise_implicit_gemm_convolution_1_chwn_csrk_khwn.hip.hpp
+13
-15
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp
...implicit_gemm_convolution_1_chwn_csrk_khwn_padded.hip.hpp
+11
-12
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp
..._convolution_1_chwn_csrk_khwn_padded_lds_pipeline.hip.hpp
+11
-12
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp
...idwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.hip.hpp
+18
-21
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp
...idwise_implicit_gemm_convolution_1_nchw_srck_nkhw.hip.hpp
+17
-20
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp
...idwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.hip.hpp
+9
-9
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp
...mm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.hip.hpp
+9
-9
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp
...idwise_implicit_gemm_convolution_2_cnhw_srck_knhw.hip.hpp
+7
-7
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp
...it_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.hip.hpp
+10
-11
src/include/gridwise_winograd_convolution.hip.hpp
src/include/gridwise_winograd_convolution.hip.hpp
+14
-15
src/include/tensor.hpp
src/include/tensor.hpp
+2
-1
src/include/threadwise_2d_tensor_op.hip.hpp
src/include/threadwise_2d_tensor_op.hip.hpp
+1
-1
src/include/threadwise_4d_tensor_op.hip.hpp
src/include/threadwise_4d_tensor_op.hip.hpp
+1
-1
src/include/threadwise_direct_convolution.hip.hpp
src/include/threadwise_direct_convolution.hip.hpp
+1
-1
src/include/threadwise_gemm.hip.hpp
src/include/threadwise_gemm.hip.hpp
+0
-0
No files found.
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -199,8 +199,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
threadwise_4d_tensor_set_zero
(
out_hkwn_thread_desc
,
p_out_thread
);
const
Float
*
p_in_global_block_begin
=
p_in_global
+
in_chwn_global_desc
.
Get1dIndex
(
p_in_global
+
in_chwn_global_desc
.
Get1dIndex
(
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_begin
=
...
...
@@ -258,8 +257,7 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
out_hkwn_thread_desc
,
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -283,8 +283,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
out_hkwn_thread_desc
,
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -339,8 +339,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
out_hkwn_thread_desc
,
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -160,8 +160,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
in_nchw_global_desc
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
...
...
@@ -245,8 +244,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_hkwn_thread_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
@@ -263,8 +261,7 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_nkhw_thread_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "threadwise_4d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "threadwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
template
<
unsigned
GridSize
,
unsigned
BlockSize
,
...
...
@@ -166,8 +166,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
in_nchw_global_desc
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
...
...
@@ -180,9 +179,8 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
#if 1
// weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed
blockwise_wei_copy
.
Run
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
c_block_data_begin
,
k_block_data_begin
),
blockwise_wei_copy
.
Run
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
c_block_data_begin
,
k_block_data_begin
),
p_wei_block
);
#endif
...
...
@@ -219,8 +217,7 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
out_hkwn_thread_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
// define B = flatten(N, Hi, Wi)
template
<
unsigned
GridSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
// define B = flatten(N, Hi, Wi)
template
<
unsigned
GridSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
// define B = flatten(N, Hi, Wi)
template
<
unsigned
GridSize
,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.
cuh
→
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.
hip.hpp
View file @
b2888adf
#pragma once
#include "common.
cuh
"
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantMatrixDescriptor.
cuh
"
#include "blockwise_4d_tensor_op.
cuh
"
#include "blockwise_2d_tensor_op.
cuh
"
#include "threadwise_2d_tensor_op.
cuh
"
#include "blockwise_gemm.
cuh
"
#include "common.
hip.hpp
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "ConstantMatrixDescriptor.
hip.hpp
"
#include "blockwise_4d_tensor_op.
hip.hpp
"
#include "blockwise_2d_tensor_op.
hip.hpp
"
#include "threadwise_2d_tensor_op.
hip.hpp
"
#include "blockwise_gemm.
hip.hpp
"
// define B = N*Hi*Wi
template
<
unsigned
GridSize
,
...
...
@@ -220,9 +220,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
#if 1
// preload next data
// input: global mem to LDS,
blockwise_in_copy
.
Run
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
c_block_data_begin
+
CPerBlock
,
b_block_data_begin
),
blockwise_in_copy
.
Run
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
c_block_data_begin
+
CPerBlock
,
b_block_data_begin
),
p_in_block_next
);
#endif
...
...
src/include/gridwise_winograd_convolution.
cuh
→
src/include/gridwise_winograd_convolution.
hip.hpp
View file @
b2888adf
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "blockwise_winograd_transform.
cuh
"
#include "threadwise_winograd_transform.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
#include "blockwise_winograd_transform.
hip.hpp
"
#include "threadwise_winograd_transform.
hip.hpp
"
template
<
class
Float
,
class
InGlobalDesc
,
...
...
@@ -189,16 +189,15 @@ __global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_
S
,
R
,
OutTileSizeH
,
OutTileSizeW
>
(
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
OutTileSizeW
>
(
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
c_thread_data
,
y_thread_data_begin
*
InTileSizeH
,
x_thread_data_begin
*
InTileSizeW
),
wei_transform_thread_block_desc
,
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
out_transform_thread_desc
,
p_out_transform_thread
);
}
...
...
src/include/tensor.hpp
View file @
b2888adf
...
...
@@ -22,7 +22,8 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
return
os
;
}
typedef
enum
{
typedef
enum
{
Half
=
0
,
Float
=
1
,
}
DataType_t
;
...
...
src/include/threadwise_2d_tensor_op.
cuh
→
src/include/threadwise_2d_tensor_op.
hip.hpp
View file @
b2888adf
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
template
<
class
Float
,
class
Desc
,
class
F
>
__device__
void
threadwise_2d_tensor_pointwise_operation_unary
(
Desc
,
Float
*
__restrict__
p
,
F
f
)
...
...
src/include/threadwise_4d_tensor_op.
cuh
→
src/include/threadwise_4d_tensor_op.
hip.hpp
View file @
b2888adf
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
template
<
class
Float
,
class
Desc
,
class
F
>
__device__
void
threadwise_4d_tensor_pointwise_operation_unary
(
Desc
,
Float
*
__restrict__
p
,
F
f
)
...
...
src/include/threadwise_direct_convolution.
cuh
→
src/include/threadwise_direct_convolution.
hip.hpp
View file @
b2888adf
#pragma once
#include "ConstantTensorDescriptor.
cuh
"
#include "ConstantTensorDescriptor.
hip.hpp
"
// optimized for scenario if p_in, p_wei, p_out are in register
template
<
class
Float
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
...
...
src/include/threadwise_gemm.
cuh
→
src/include/threadwise_gemm.
hip.hpp
View file @
b2888adf
File moved
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment