Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangql
composable_kernel-1
Commits
67c6f73f
Commit
67c6f73f
authored
Feb 15, 2019
by
Chao Liu
Browse files
hip build
parent
121693b3
Changes
34
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
149 additions
and
103 deletions
+149
-103
src/include/device.hpp
src/include/device.hpp
+29
-1
src/include/gridwise_direct_convolution_1.cuh
src/include/gridwise_direct_convolution_1.cuh
+11
-9
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+22
-17
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
...e/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
+10
-8
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
...ise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
+5
-4
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
...gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
+6
-5
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh
...e/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh
+15
-12
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
...e/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
+14
-11
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
...e/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+6
-6
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
...t_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
+7
-7
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
...e/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+2
-2
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
...plicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
+9
-8
src/include/gridwise_winograd_convolution.cuh
src/include/gridwise_winograd_convolution.cuh
+12
-11
src/include/tensor.hpp
src/include/tensor.hpp
+1
-2
No files found.
src/include/device.hpp
View file @
67c6f73f
#pragma once
#include <memory>
#include "config.h"
struct
DeviceMem
{
...
...
@@ -27,4 +28,31 @@ struct KernelTimer
std
::
unique_ptr
<
KernelTimerImpl
>
impl
;
};
void
launch_kernel
(
const
void
*
func
,
dim3
grid_dim
,
dim3
block_dim
,
void
**
args
,
float
&
time
);
/// Launches `kernel` through the active device backend and times the launch.
///
/// The backend is selected at compile time by DEVICE_BACKEND_HIP or
/// DEVICE_BACKEND_CUDA. If neither is enabled, nothing is launched and the
/// timer is queried without having been started.
///
/// @tparam Args      Types of the kernel arguments (deduced from `args`).
/// @tparam F         Kernel function type (deduced from `kernel`).
/// @param kernel     Kernel entry point to launch.
/// @param grid_dim   Grid dimensions forwarded to the launch call.
/// @param block_dim  Block dimensions forwarded to the launch call.
/// @param args       Kernel arguments, taken by value and forwarded to the launch.
/// @return The value of `KernelTimer::GetElapsedTime()` measured around the launch.
template <typename... Args, typename F>
float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, Args... args)
{
    KernelTimer timer;

#if DEVICE_BACKEND_HIP
    timer.Start();

    // No dynamic shared memory (0) and the default stream (0).
    hipLaunchKernelGGL(kernel, grid_dim, block_dim, 0, 0, args...);

    timer.End();

    // NOTE(review): the error string is computed but discarded, so a failed
    // launch is not reported on this backend. Consider checking the result the
    // same way the CUDA branch does.
    hipGetErrorString(hipGetLastError());
#elif DEVICE_BACKEND_CUDA
    const void* f = reinterpret_cast<const void*>(kernel);

    // cudaLaunchKernel takes an array holding one pointer per kernel argument.
    // This must be an array (not a single void*) so that any number of
    // arguments is accepted; the trailing nullptr keeps the initializer
    // non-empty when the argument pack is empty.
    void* p_args[] = {&args..., nullptr};

    timer.Start();

    // No dynamic shared memory (0) and the default stream (0).
    cudaError_t error = cudaLaunchKernel(f, grid_dim, block_dim, p_args, 0, 0);

    timer.End();

    checkCudaErrors(error);
#endif

    return timer.GetElapsedTime();
}
src/include/gridwise_direct_convolution_1.cuh
View file @
67c6f73f
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh"
...
...
@@ -146,7 +147,8 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
c_block_work_begin
+=
CPerBlock
)
{
// copy input tensor to LDS
blockwise_in_copy
.
Run
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
blockwise_in_copy
.
Run
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
c_block_work_begin
,
hi_block_work_begin
,
wi_block_work_begin
),
...
...
@@ -176,9 +178,9 @@ __global__ void gridwise_direct_convolution_1(const Float* const __restrict__ p_
}
// copy output tensor from LDS to device mem
blockwise_out_copy
.
Run
(
p_out_block
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
));
blockwise_out_copy
.
Run
(
p_out_block
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
));
}
src/include/gridwise_direct_convolution_2.cuh
View file @
67c6f73f
#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_direct_convolution.cuh"
...
...
@@ -162,7 +163,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
c_block_data_begin
+=
CPerBlock
,
__syncthreads
())
{
// copy input tensor to LDS
blockwise_in_copy
.
Run
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_begin
,
blockwise_in_copy
.
Run
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
...
...
@@ -177,11 +179,12 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
{
// threadwise convolution
// threadwise convolution
#if 1
threadwise_direct_convolution_2
(
in_thread_block_desc
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
c_thread_data
,
hi_thread_data_begin
,
wi_thread_data_begin
),
...
...
@@ -192,7 +195,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
#elif 0
threadwise_direct_convolution_3
(
in_thread_block_desc
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
c_thread_data
,
hi_thread_data_begin
,
wi_thread_data_begin
),
...
...
@@ -209,7 +213,8 @@ __global__ void gridwise_direct_convolution_2(const Float* const __restrict__ p_
out_thread_desc
,
p_out_thread
,
out_global_desc
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn.cuh
View file @
67c6f73f
...
...
@@ -121,8 +121,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
decltype
(
in_chwn_block_desc
),
decltype
(
in_chwn_block_desc
.
GetLengths
())
>
{};
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
...
...
@@ -199,7 +199,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
threadwise_4d_tensor_set_zero
(
out_hkwn_thread_desc
,
p_out_thread
);
const
Float
*
p_in_global_block_begin
=
p_in_global
+
in_chwn_global_desc
.
Get1dIndex
(
p_in_global
+
in_chwn_global_desc
.
Get1dIndex
(
0
,
hi_block_data_begin
,
wi_block_data_begin
,
n_block_data_begin
);
const
Float
*
p_wei_global_block_begin
=
...
...
@@ -257,7 +258,8 @@ gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn(const Float* const __restric
out_hkwn_thread_desc
,
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded.cuh
View file @
67c6f73f
...
...
@@ -283,7 +283,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded(
out_hkwn_thread_desc
,
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
View file @
67c6f73f
...
...
@@ -256,7 +256,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
Float
*
p_in_block_next
=
even_loop
?
p_in_block_1
:
p_in_block_0
;
Float
*
p_wei_block_next
=
even_loop
?
p_wei_block_1
:
p_wei_block_0
;
// preload next data
// preload next data
#if 1
// input: global mem to LDS,
blockwise_in_copy
.
Run
(
p_in_global
,
...
...
@@ -339,7 +339,8 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
out_hkwn_thread_desc
,
p_out_thread
,
out_khwn_global_desc
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
p_out_global
+
out_khwn_global_desc
.
Get1dIndex
(
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
,
n_block_data_begin
+
n_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh
View file @
67c6f73f
...
...
@@ -160,7 +160,8 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
in_nchw_global_desc
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
...
...
@@ -244,7 +245,8 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_hkwn_thread_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
@@ -261,7 +263,8 @@ gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restric
out_nkhw_thread_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
View file @
67c6f73f
...
...
@@ -166,7 +166,8 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
// convert [N,C,Hi,Wi] to [C,Hi,Wi,N]
blockwise_4d_tensor_copy_reorder_by_get_dst_from_src
<
BlockSize
>
(
in_nchw_global_desc
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
p_in_global
+
in_nchw_global_desc
.
Get1dIndex
(
n_block_data_begin
,
c_block_data_begin
,
hi_block_data_begin
,
wi_block_data_begin
),
...
...
@@ -179,8 +180,9 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
#if 1
// weight: global mem to LDS,
// format is [S,R,C,K], no conversion needed
blockwise_wei_copy
.
Run
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
c_block_data_begin
,
k_block_data_begin
),
blockwise_wei_copy
.
Run
(
p_wei_global
+
wei_srck_global_desc
.
Get1dIndex
(
0
,
0
,
c_block_data_begin
,
k_block_data_begin
),
p_wei_block
);
#endif
...
...
@@ -217,7 +219,8 @@ gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restric
out_hkwn_thread_desc
,
p_out_thread
,
out_nkhw_global_desc
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
p_out_global
+
out_nkhw_global_desc
.
Get1dIndex
(
n_block_data_begin
+
n_thread_data_begin
,
k_block_data_begin
+
k_thread_data_begin
,
ho_block_data_begin
+
ho_thread_data_begin
,
wo_block_data_begin
+
wo_thread_data_begin
),
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
View file @
67c6f73f
...
...
@@ -111,8 +111,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
}
#endif
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
...
...
@@ -137,8 +137,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restric
InBlockCopyDataPerRead
>
{};
#endif
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
View file @
67c6f73f
...
...
@@ -111,8 +111,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
}
#endif
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
...
...
@@ -137,8 +137,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
InBlockCopyDataPerRead
>
{};
#endif
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
// blockwise wei copy
// format is [CPerBlock*S*R,KPerBlock]
#if 0
const auto blockwise_wei_copy =
Blockwise2dTensorCopy1<BlockSize,
...
...
@@ -258,7 +258,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_b
__syncthreads
();
// load next data
// load next data
#if 0
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_next);
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_next);
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
View file @
67c6f73f
...
...
@@ -103,8 +103,8 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(const Float* const __restric
}
#endif
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
// blockwise in copy
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 0
const auto blockwise_in_copy =
Blockwise2dTensorCopy1<BlockSize,
...
...
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
View file @
67c6f73f
...
...
@@ -103,8 +103,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
}
#endif
// in: global mem to LDS
// format is [CPerBlock,BPerBlock + BGhostRead]
// in: global mem to LDS
// format is [CPerBlock,BPerBlock + BGhostRead]
#if 1
const
auto
blockwise_in_copy
=
Blockwise2dTensorCopy1
<
BlockSize
,
...
...
@@ -129,8 +129,8 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
decltype
(
in_cb_block_desc
.
GetLengths
())
>
{};
#endif
// weight: global mem to LDS,
// format is [S,R,CPerBlock,KPerBlock]
// weight: global mem to LDS,
// format is [S,R,CPerBlock,KPerBlock]
#if 1
const
auto
blockwise_wei_copy
=
Blockwise4dTensorCopy1
<
BlockSize
,
...
...
@@ -191,7 +191,7 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
// set threadwise output tensor to 0
threadwise_2d_tensor_set_zero
(
out_kb_thread_desc
,
p_out_thread
);
// prelog: load data
// prelog: load data
#if 1
// input: global mem to LDS,
blockwise_in_copy
.
Run
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
0
,
b_block_data_begin
),
...
...
@@ -220,8 +220,9 @@ __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline
#if 1
// preload next data
// input: global mem to LDS,
blockwise_in_copy
.
Run
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
c_block_data_begin
+
CPerBlock
,
b_block_data_begin
),
blockwise_in_copy
.
Run
(
p_in_global
+
in_cb_global_desc
.
Get1dIndex
(
c_block_data_begin
+
CPerBlock
,
b_block_data_begin
),
p_in_block_next
);
#endif
...
...
src/include/gridwise_winograd_convolution.cuh
View file @
67c6f73f
...
...
@@ -189,15 +189,16 @@ __global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_
S
,
R
,
OutTileSizeH
,
OutTileSizeW
>
(
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
OutTileSizeW
>
(
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_begin
,
c_thread_data
,
y_thread_data_begin
*
InTileSizeH
,
x_thread_data_begin
*
InTileSizeW
),
wei_transform_thread_block_desc
,
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_begin
,
c_thread_data
,
0
,
0
),
out_transform_thread_desc
,
p_out_transform_thread
);
}
...
...
src/include/tensor.hpp
View file @
67c6f73f
...
...
@@ -22,8 +22,7 @@ std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)
return
os
;
}
typedef
enum
{
// Identifies the element type of a tensor. The numeric values are fixed
// explicitly: Half is 0 and Float is 1.
enum DataType_t
{
    Half  = 0,
    Float = 1
};
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment