yangql / composable_kernel-1 · Commits

Commit e80fbbdd, authored Feb 14, 2019 by Chao Liu
Parent: 28354a0f

    refactor build, clean up

Changes: 29
Showing 9 changed files (page 1 of 2) with 26 additions and 76 deletions (+26 −76)
+4  −4   src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh
+3  −6   src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh
+3  −6   src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh
+1  −4   src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh
+3  −6   src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh
+5  −8   src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh
+3  −6   src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh
+4  −7   src/include/gridwise_winograd_convolution.cuh
+0  −29  src/include/tensor.hpp
src/include/gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline.cuh

```diff
@@ -28,9 +28,9 @@ template <unsigned GridSize,
           unsigned WeiBlockCopyThreadPerDim0,
           unsigned WeiBlockCopyThreadPerDim1>
 __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_pipeline(
-    Float* const __restrict__ p_in_global,
-    Float* const __restrict__ p_wei_global,
-    Float* __restrict__ p_out_global)
+    const Float* const __restrict__ p_in_global,
+    const Float* const __restrict__ p_wei_global,
+    Float* const __restrict__ p_out_global)
 {
     // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
     // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"

@@ -220,7 +220,7 @@ __global__ void gridwise_implicit_gemm_convolution_1_chwn_csrk_khwn_padded_lds_p
     // set threadwise output tensor to 0
     threadwise_4d_tensor_set_zero(out_hkwn_thread_desc, p_out_thread);
 
-    Float* p_wei_global_block_begin =
+    const Float* p_wei_global_block_begin =
         p_wei_global + wei_ek_global_desc.Get1dIndex(0, k_block_data_begin);
 
     // prelog: load data
```
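Note: this is the change repeated across every kernel header in this commit. Read-only global pointers gain `const` on the pointee, and the output pointer parameter itself becomes `const`-qualified. A minimal standalone sketch of the resulting signature style (the `axpy` kernel below is illustrative, not code from this repository):

```cpp
// Illustrative CUDA kernel using the parameter style adopted in this commit:
//   const Float*   -> the kernel only reads the pointed-to data
//   * const        -> the pointer parameter itself is never reassigned
//   __restrict__   -> the pointers are promised not to alias, which lets the
//                     compiler keep loaded values in registers more aggressively
template <class Float>
__global__ void axpy(const Float* const __restrict__ p_x,
                     const Float* const __restrict__ p_y,
                     Float* const __restrict__ p_out,
                     Float alpha,
                     int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i < n)
        p_out[i] = alpha * p_x[i] + p_y[i];
}
```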
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh → src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw.cuh

```diff
@@ -22,12 +22,9 @@ template <unsigned GridSize,
           unsigned HoPerThread,
           unsigned WoPerThread>
 __global__ void
-gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
-                                               Float* const __restrict__ p_in_global,
-                                               WeiGlobalDesc,
-                                               Float* const __restrict__ p_wei_global,
-                                               OutGlobalDesc,
-                                               Float* __restrict__ p_out_global)
+gridwise_implicit_gemm_convolution_1_nchw_kcsr_nkhw(const Float* const __restrict__ p_in_global,
+                                                    const Float* const __restrict__ p_wei_global,
+                                                    Float* const __restrict__ p_out_global)
 {
     // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
     // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
```
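Besides the rename to match the output layout (`_nkhw`), this hunk drops the stateless descriptor tag arguments (`InGlobalDesc`, `WeiGlobalDesc`, `OutGlobalDesc`) from the parameter list. Since those types are already template parameters, the descriptors can be materialized inside the kernel instead; a hedged sketch of that idiom (names illustrative, not this repository's exact API):

```cpp
// Sketch: empty compile-time descriptors recovered from template parameters
// inside the kernel body, instead of being passed as unused runtime arguments.
template <class Float, class InGlobalDesc, class WeiGlobalDesc, class OutGlobalDesc>
__global__ void conv_kernel(const Float* const __restrict__ p_in_global,
                            const Float* const __restrict__ p_wei_global,
                            Float* const __restrict__ p_out_global)
{
    // zero-size objects: all layout information lives in the type
    constexpr auto in_global_desc  = InGlobalDesc{};
    constexpr auto wei_global_desc = WeiGlobalDesc{};
    constexpr auto out_global_desc = OutGlobalDesc{};
    // ... index arithmetic via e.g. in_global_desc.Get1dIndex(...) ...
}
```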
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw.cuh

```diff
@@ -23,12 +23,9 @@ template <unsigned GridSize,
           unsigned HoPerThread,
           unsigned WoPerThread>
 __global__ void
-gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(InGlobalDesc,
-                                                    Float* const __restrict__ p_in_global,
-                                                    WeiGlobalDesc,
-                                                    Float* const __restrict__ p_wei_global,
-                                                    OutGlobalDesc,
-                                                    Float* __restrict__ p_out_global)
+gridwise_implicit_gemm_convolution_1_nchw_srck_nkhw(const Float* const __restrict__ p_in_global,
+                                                    const Float* const __restrict__ p_wei_global,
+                                                    Float* const __restrict__ p_out_global)
 {
     // NPerThread == NPerBlock, because the format of input in LDS [C,Hi,Wi,N]
     // for GEMM trans([C,K]) * [C,Wo*N], we need a thread to do all the "N"
```
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw.cuh

```diff
@@ -35,11 +35,8 @@ template <unsigned GridSize,
           unsigned InBlockCopyDataPerRead,
           unsigned WeiBlockCopyDataPerRead>
 __global__ void
-gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(InGlobalDesc,
-                                                    const Float* const __restrict__ p_in_global,
-                                                    WeiGlobalDesc,
-                                                    const Float* const __restrict__ p_wei_global,
-                                                    OutGlobalDesc,
-                                                    Float* const __restrict__ p_out_global)
+gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw(const Float* const __restrict__ p_in_global,
+                                                    const Float* const __restrict__ p_wei_global,
+                                                    Float* const __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
```
src/include/gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer.cuh

```diff
@@ -35,12 +35,9 @@ template <unsigned GridSize,
           unsigned InBlockCopyDataPerRead,
           unsigned WeiBlockCopyDataPerRead>
 __global__ void gridwise_implicit_gemm_convolution_2_cnhw_csrk_knhw_lds_double_buffer(
-    InGlobalDesc,
-    Float* const __restrict__ p_in_global,
-    WeiGlobalDesc,
-    Float* const __restrict__ p_wei_global,
-    OutGlobalDesc,
-    Float* __restrict__ p_out_global)
+    const Float* const __restrict__ p_in_global,
+    const Float* const __restrict__ p_wei_global,
+    Float* const __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
```
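The file name advertises LDS double buffering: two staging buffers in shared memory, so global loads for tile i+1 overlap computation on tile i. The hunk above only touches the signature; the skeleton below is a generic illustration of the technique, not this repository's implementation (it assumes a block of 256 threads and a tile of 256 floats):

```cpp
// Generic shared-memory (LDS) double-buffering skeleton, illustrative only.
__global__ void double_buffered_loop(const float* const __restrict__ p_src,
                                     float* const __restrict__ p_dst,
                                     int num_tiles)
{
    __shared__ float lds[2][256];

    int cur = 0;
    lds[cur][threadIdx.x] = p_src[threadIdx.x]; // prologue: stage first tile
    __syncthreads();

    float acc = 0;
    for(int t = 1; t < num_tiles; ++t)
    {
        const int nxt = cur ^ 1;
        lds[nxt][threadIdx.x] = p_src[t * 256 + threadIdx.x]; // prefetch tile t
        acc += lds[cur][threadIdx.x];                         // compute on tile t-1
        __syncthreads(); // prefetch now visible; reads of lds[cur] finished
        cur = nxt;
    }
    acc += lds[cur][threadIdx.x]; // epilogue: compute on the last tile
    p_dst[threadIdx.x] = acc;
}
```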
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh

```diff
@@ -25,12 +25,9 @@ template <unsigned GridSize,
           unsigned InBlockCopyThreadPerDim0,
           unsigned InBlockCopyThreadPerDim1>
 __global__ void
-gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
-                                                    Float* const __restrict__ p_in_global,
-                                                    WeiGlobalDesc,
-                                                    Float* const __restrict__ p_wei_global,
-                                                    OutGlobalDesc,
-                                                    Float* __restrict__ p_out_global)
+gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(const Float* const __restrict__ p_in_global,
+                                                    const Float* const __restrict__ p_wei_global,
+                                                    Float* const __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};

@@ -174,10 +171,10 @@ gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
     // set threadwise output tensor to 0
     threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);
 
-    Float* p_in_global_block_offset =
+    const Float* p_in_global_block_offset =
         p_in_global + in_cb_global_desc.Get1dIndex(0, b_block_data_begin);
 
-    Float* p_wei_global_block_offset =
+    const Float* p_wei_global_block_offset =
         p_wei_global + wei_srck_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
 
     for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
         c_block_data_begin += CPerBlock,
```
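In the second hunk the per-block offset pointers become `const` as well; the offsets come from `Get1dIndex`, which maps a multi-dimensional index to a flat element offset through the descriptor's strides. A simplified runtime stand-in for what such a call computes (the repository's descriptors do this arithmetic at compile time; this struct is not its actual API):

```cpp
// Simplified illustration: Get1dIndex is a dot product of indices with strides.
struct Desc4d
{
    unsigned strides[4];

    __host__ __device__ unsigned
    Get1dIndex(unsigned i0, unsigned i1, unsigned i2, unsigned i3) const
    {
        return i0 * strides[0] + i1 * strides[1] + i2 * strides[2] + i3 * strides[3];
    }
};
// e.g. wei_srck_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin) then gives
// the flat offset of the first weight element owned by this thread block.
```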
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline.cuh

```diff
@@ -25,12 +25,9 @@ template <unsigned GridSize,
           unsigned InBlockCopyThreadPerDim0,
           unsigned InBlockCopyThreadPerDim1>
 __global__ void gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw_lds_pipeline(
-    InGlobalDesc,
-    Float* const __restrict__ p_in_global,
-    WeiGlobalDesc,
-    Float* const __restrict__ p_wei_global,
-    OutGlobalDesc,
-    Float* __restrict__ p_out_global)
+    const Float* const __restrict__ p_in_global,
+    const Float* const __restrict__ p_wei_global,
+    Float* const __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
```
src/include/gridwise_winograd_convolution.cuh

```diff
@@ -19,12 +19,9 @@ template <class Float,
           unsigned CPerThread,
           unsigned BlockSize,
           unsigned GridSize>
-__global__ void gridwise_winograd_convolution(InGlobalDesc,
-                                              Float* const __restrict__ p_in_global,
-                                              WeiGlobalDesc,
-                                              Float* const __restrict__ p_wei_global,
-                                              OutGlobalDesc,
-                                              Float* __restrict__ p_out_global)
+__global__ void gridwise_winograd_convolution(const Float* const __restrict__ p_in_global,
+                                              const Float* const __restrict__ p_wei_global,
+                                              Float* const __restrict__ p_out_global)
 {
     constexpr auto I0 = Number<0>{};
     constexpr auto I1 = Number<1>{};
```
src/include/tensor.hpp

```diff
@@ -6,8 +6,6 @@
 #include <utility>
 #include <cassert>
 #include <iostream>
-#include "cuda_runtime.h"
-#include "helper_cuda.h"
 
 template <class Range>
 std::ostream& LogRange(std::ostream& os, Range&& r, std::string delim)

@@ -108,33 +106,6 @@ struct TensorDescriptor
     std::vector<std::size_t> mStrides;
 };
 
-struct DeviceMem
-{
-    DeviceMem() = delete;
-
-    DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
-    {
-        cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize);
-    }
-
-    void* GetDeviceBuffer() { return mpDeviceBuf; }
-
-    int ToDevice(const void* p)
-    {
-        return static_cast<int>(
-            cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
-    }
-
-    int FromDevice(void* p)
-    {
-        return static_cast<int>(
-            cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
-    }
-
-    ~DeviceMem() { cudaFree(mpDeviceBuf); }
-
-    void* mpDeviceBuf;
-    std::size_t mMemSize;
-};
 
 struct joinable_thread : std::thread
 {
     template <class... Xs>
```
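This deletion is the "refactor build" part of the commit message: with `cuda_runtime.h`, `helper_cuda.h`, and `DeviceMem` gone, tensor.hpp no longer depends on CUDA and can be compiled host-only. For reference, typical usage of the removed RAII wrapper looked like the following (an illustration assuming the `DeviceMem` definition shown in the hunk above is in scope; not code from this commit):

```cpp
#include <cstddef>
#include <vector>

// Usage sketch for the removed DeviceMem wrapper: the constructor cudaMallocs,
// ToDevice/FromDevice wrap cudaMemcpy, and the destructor cudaFrees.
void run(std::size_t n)
{
    std::vector<float> host(n, 1.0f);

    DeviceMem buf(n * sizeof(float)); // allocate n floats on the device
    buf.ToDevice(host.data());        // copy host -> device
    // ... launch kernels on static_cast<float*>(buf.GetDeviceBuffer()) ...
    buf.FromDevice(host.data());      // copy device -> host
}                                     // device buffer freed here
```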