gaoqiong / composable_kernel

Commit 20968472, authored Jan 20, 2019 by Chao Liu

    add 2nd variation of implicit gemm

parent 3bd51021
Showing 17 changed files with 988 additions and 122 deletions (+988 -122)
driver/conv.cu                                                        +36  -22
driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh               +17  -17
driver/device_implicit_gemm_convolution_1_nchw_srck.cuh               +35  -22
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh          +163 -0
src/include/ConstantMatrixDescriptor.cuh                              +11  -0
src/include/ConstantTensorDescriptor.cuh                              +113 -33
src/include/blockwise_2d_tensor_op.cuh                                +172 -0
src/include/blockwise_4d_tensor_op.cuh                                +0   -0
src/include/blockwise_direct_convolution.cuh                          +1   -1
src/include/gemm.cuh                                                  +15  -8
src/include/gridwise_direct_convolution_1.cuh                         +1   -1
src/include/gridwise_direct_convolution_2.cuh                         +2   -2
src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh        +9   -8
src/include/gridwise_implicit_gemm_convolution_1_nchw_srck.cuh        +9   -8
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh   +264 -0
src/include/threadwise_2d_tensor_op.cuh                               +140 -0
src/include/threadwise_4d_tensor_op.cuh                               +0   -0
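Before the per-file diffs, a note on what "2nd variation" means here: the new kernel flattens N, Hi and Wi into a single GEMM dimension B = N*Hi*Wi, reorders the input to CNHW, the weights to SRCK and the output to KNHW, and then performs one small GEMM per filter tap (s, r), reading the input columns shifted by s*Wi + r. The sketch below is not code from this commit; it is a minimal host-side C++ rehearsal of that mapping, with all sizes and names chosen for the example.

    // Minimal host-side illustration (not from this commit) of the implicit-GEMM
    // view used by variation 2: with B = N*Hi*Wi, each filter tap (s,r) contributes
    // one GEMM  out[K,B] += transpose(wei[s,r,:,:]) * in[C,B]  on shifted input columns.
    #include <cstdio>
    #include <vector>

    int main()
    {
        // toy sizes (chosen for the example)
        const unsigned N = 1, C = 2, Hi = 6, Wi = 6, K = 2, S = 3, R = 3;
        const unsigned B = N * Hi * Wi;

        // in_cb must be over-allocated by (S-1)*Wi + (R-1) "ghost" elements so the
        // shifted reads stay in bounds -- exactly the BGhostRead term in the commit
        const unsigned BGhostRead = (S - 1) * Wi + (R - 1);

        std::vector<float> in_cb(C * B + BGhostRead, 1.0f); // input reordered to [C, B]
        std::vector<float> wei_srck(S * R * C * K, 1.0f);
        std::vector<float> out_kb(K * B, 0.0f);             // output viewed as [K, B]

        // a series of GEMMs: one per filter tap (s, r)
        for(unsigned s = 0; s < S; ++s)
            for(unsigned r = 0; r < R; ++r)
                for(unsigned k = 0; k < K; ++k)
                    for(unsigned b = 0; b < B; ++b)
                        for(unsigned c = 0; c < C; ++c)
                            out_kb[k * B + b] +=
                                wei_srck[((s * R + r) * C + c) * K + k] *
                                in_cb[c * B + b + s * Wi + r]; // shifted input column

        // only columns whose decoded (n, ho, wo) fall inside the output window are
        // valid; the kernel masks the rest at write-back
        printf("out_kb[0] = %f (expect C*S*R = %u)\n", out_kb[0], C * S * R);
        return 0;
    }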
driver/conv.cu

@@ -8,8 +8,9 @@
 #include "conv_common.cuh"
 #include "device_direct_convolution_1.cuh"
 #include "device_direct_convolution_2.cuh"
-#include "device_implicit_gemm_convolution_nchw_kcsr.cuh"
-#include "device_implicit_gemm_convolution_nchw_srck.cuh"
+#include "device_implicit_gemm_convolution_1_nchw_kcsr.cuh"
+#include "device_implicit_gemm_convolution_1_nchw_srck.cuh"
+#include "device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"
 //#include "device_winograd_convolution.cuh"

 struct GeneratorTensor_1

@@ -52,6 +53,21 @@ struct GeneratorTensor_3
     }
 };

+struct GeneratorTensor_Checkboard
+{
+    template <class... Ts>
+    double operator()(Ts... Xs) const
+    {
+        std::array<unsigned long, sizeof...(Ts)> dims = {{Xs...}};
+        return std::accumulate(dims.begin(),
+                               dims.end(),
+                               true,
+                               [](bool init, unsigned long x) -> int { return init != (x % 2); })
+                   ? 1
+                   : -1;
+    }
+};
+
 // this is ugly, only for 4d
 template <class TConstTensorDesc>
 void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::cout)

@@ -338,10 +354,10 @@ int main()
 {
 #if 0
     constexpr unsigned N = 1;
-    constexpr unsigned C = 1;
+    constexpr unsigned C = 2;
     constexpr unsigned HI = 34;
     constexpr unsigned WI = 34;
-    constexpr unsigned K = 1;
+    constexpr unsigned K = 2;
     constexpr unsigned S = 3;
     constexpr unsigned R = 3;
 #elif 1

@@ -389,31 +405,29 @@ int main()
 #if 0
     in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
     wei_kcsr.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-#elif 0
+#elif 1
     in_nchw.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
     wei_kcsr.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
 #endif

-    for(int i = 0; i < 40; ++i)
-    {
+    unsigned nrepeat = 100;
+
 #if 0
-        device_direct_convolution_1(
-            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
+    device_direct_convolution_1
 #elif 0
-        device_direct_convolution_2(
-            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
-#elif 1
-        device_implicit_gemm_convolution_nchw_kcsr(
-            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
-#elif 0
-        device_implicit_gemm_convolution_nchw_srck(
-            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
-#elif 0
-        device_winograd_convolution(
-            in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device);
+    device_direct_convolution_2
+#elif 0
+    device_implicit_gemm_convolution_1_nchw_kcsr
+#elif 0
+    device_implicit_gemm_convolution_1_nchw_srck
+#elif 1
+    device_implicit_gemm_convolution_2_cnhw_srck_knhw
+#elif 0
+    device_winograd_convolution
 #endif
-    }
+    (in_nchw_desc, in_nchw, wei_kcsr_desc, wei_kcsr, out_nkhw_desc, out_nkhw_device, nrepeat);

-#if 0
+#if 1
     host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host);
     check_error(out_nkhw_host, out_nkhw_device);
 #elif 0
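The new GeneratorTensor_Checkboard above folds every index's parity into the bool seed `true`, so the generated value alternates between +1 and -1 in a checkerboard over the sum of the indices. A standalone rehearsal of that accumulate (not part of the commit) makes the pattern visible:

    // Standalone check (not part of the commit) of what GeneratorTensor_Checkboard
    // computes: starting from true, each odd index flips the accumulator, so the
    // result is a +1/-1 checkerboard over the parity of the index sum.
    #include <array>
    #include <cstdio>
    #include <numeric>

    struct GeneratorTensor_Checkboard
    {
        template <class... Ts>
        double operator()(Ts... Xs) const
        {
            std::array<unsigned long, sizeof...(Ts)> dims = {{Xs...}};
            return std::accumulate(dims.begin(),
                                   dims.end(),
                                   true,
                                   [](bool init, unsigned long x) -> int { return init != (x % 2); })
                       ? 1
                       : -1;
        }
    };

    int main()
    {
        GeneratorTensor_Checkboard gen;
        for(unsigned long i = 0; i < 4; ++i)
        {
            for(unsigned long j = 0; j < 4; ++j)
                printf("%+2.0f ", gen(i, j)); // +1 -1 +1 -1 / -1 +1 -1 +1 / ...
            printf("\n");
        }
        return 0;
    }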
driver/device_implicit_gemm_convolution_nchw_kcsr.cuh → driver/device_implicit_gemm_convolution_1_nchw_kcsr.cuh

 #pragma once
-#include "gridwise_implicit_gemm_convolution_nchw_kcsr.cuh"
+#include "gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh"

 template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_implicit_gemm_convolution_nchw_kcsr(InDesc,
-                                                const Tensor<T>& in,
-                                                WeiDesc,
-                                                const Tensor<T>& wei,
-                                                OutDesc,
-                                                Tensor<T>& out)
+void device_implicit_gemm_convolution_1_nchw_kcsr(InDesc,
+                                                  const Tensor<T>& in,
+                                                  WeiDesc,
+                                                  const Tensor<T>& wei,
+                                                  OutDesc,
+                                                  Tensor<T>& out)
 {
     std::size_t data_sz = sizeof(T);

@@ -81,7 +81,7 @@ void device_implicit_gemm_convolution_nchw_kcsr(
     cudaEventCreate(&start);
     cudaEventRecord(start, 0);

-    gridwise_implicit_gemm_convolution_nchw_kcsr<GridSize,
+    gridwise_implicit_gemm_convolution_1_nchw_kcsr<GridSize,
                                                  BlockSize,
                                                  T,
                                                  InDesc,
driver/device_implicit_gemm_convolution_nchw_srck.cuh → driver/device_implicit_gemm_convolution_1_nchw_srck.cuh

 #pragma once
-#include "gridwise_implicit_gemm_convolution_nchw_srck.cuh"
+#include "gridwise_implicit_gemm_convolution_1_nchw_srck.cuh"

 template <class T, class InDesc, class WeiDesc, class OutDesc>
-void device_implicit_gemm_convolution_nchw_srck(InDesc,
-                                                const Tensor<T>& in_nchw,
-                                                WeiDesc,
-                                                const Tensor<T>& wei_kcsr,
+void device_implicit_gemm_convolution_1_nchw_srck(InDesc,
+                                                  const Tensor<T>& in_nchw,
+                                                  WeiDesc,
+                                                  const Tensor<T>& wei_kcsr,

@@ -88,6 +88,19 @@ void device_implicit_gemm_convolution_nchw_srck(InDesc,
     constexpr unsigned HoPerThread = 2;
     constexpr unsigned WoPerThread = 2;

+    constexpr unsigned BlockSize = 128;
+#elif 0
+    constexpr unsigned NPerBlock  = 2;
+    constexpr unsigned KPerBlock  = 64;
+    constexpr unsigned CPerBlock  = 2;
+    constexpr unsigned HoPerBlock = 2;
+    constexpr unsigned WoPerBlock = 32;
+
+    constexpr unsigned KPerThread  = 16;
+    constexpr unsigned CPerThread  = 1;
+    constexpr unsigned HoPerThread = 2;
+    constexpr unsigned WoPerThread = 1;
+
     constexpr unsigned BlockSize = 128;
 #endif

@@ -106,7 +119,7 @@ void device_implicit_gemm_convolution_nchw_srck(InDesc,
     cudaEventCreate(&start);
     cudaEventRecord(start, 0);

-    gridwise_implicit_gemm_convolution_nchw_srck<GridSize,
+    gridwise_implicit_gemm_convolution_1_nchw_srck<GridSize,
                                                  BlockSize,
                                                  T,
                                                  decltype(in_nchw_desc),
driver/device_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh (new file, mode 100644)

#pragma once
#include "gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh"

template <class T, class InDesc, class WeiDesc, class OutDesc>
void device_implicit_gemm_convolution_2_cnhw_srck_knhw(InDesc,
                                                       const Tensor<T>& in_nchw,
                                                       WeiDesc,
                                                       const Tensor<T>& wei_kcsr,
                                                       OutDesc,
                                                       Tensor<T>& out_nkhw,
                                                       unsigned nrepeat)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_nchw_desc  = InDesc{};
    constexpr auto wei_kcsr_desc = WeiDesc{};
    constexpr auto out_nkhw_desc = OutDesc{};

    constexpr unsigned N  = in_nchw_desc.GetLength(I0);
    constexpr unsigned Hi = in_nchw_desc.GetLength(I2);
    constexpr unsigned Wi = in_nchw_desc.GetLength(I3);

    constexpr unsigned Ho = out_nkhw_desc.GetLength(I2);
    constexpr unsigned Wo = out_nkhw_desc.GetLength(I3);

    constexpr unsigned K = wei_kcsr_desc.GetLength(I0);
    constexpr unsigned C = wei_kcsr_desc.GetLength(I1);
    constexpr unsigned S = wei_kcsr_desc.GetLength(I2);
    constexpr unsigned R = wei_kcsr_desc.GetLength(I3);

    constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);

    // convert in_nchw to in_cnhw
    auto in_cnhw_desc = make_ConstantTensorDescriptor(Sequence<C, N, Hi, Wi>{});
    ostream_ConstantTensorDescriptor(in_cnhw_desc, std::cout << "in_cnhw_desc: ");
    Tensor<T> in_cnhw(make_TensorDescriptor(in_cnhw_desc));

    auto f_reorder_nchw2cnhw = [&](auto n, auto c, auto hi, auto wi) {
        in_cnhw(c, n, hi, wi) = in_nchw(n, c, hi, wi);
    };
    make_ParallelTensorFunctor(f_reorder_nchw2cnhw, N, C, Hi, Wi)(
        std::thread::hardware_concurrency());

    // convert wei_kcsr to wei_srck
    auto wei_srck_desc = make_ConstantTensorDescriptor(Sequence<S, R, C, K>{});
    ostream_ConstantTensorDescriptor(wei_srck_desc, std::cout << "wei_srck_desc: ");
    Tensor<T> wei_srck(make_TensorDescriptor(wei_srck_desc));

    auto f_reorder_kcsr2srck = [&](auto k, auto c, auto s, auto r) {
        wei_srck(s, r, c, k) = wei_kcsr(k, c, s, r);
    };
    make_ParallelTensorFunctor(f_reorder_kcsr2srck, K, C, S, R)(
        std::thread::hardware_concurrency());

    // convert out_nkhw to out_knhw
    auto out_knhw_desc = make_ConstantTensorDescriptor(Sequence<K, N, Ho, Wo>{});
    ostream_ConstantTensorDescriptor(out_knhw_desc, std::cout << "out_knhw_desc: ");
    Tensor<T> out_knhw(make_TensorDescriptor(out_knhw_desc));

#if 0
    constexpr unsigned BPerBlock = 128;
    constexpr unsigned KPerBlock = 1;
    constexpr unsigned CPerBlock = 1;

    constexpr unsigned BPerThread = 4;
    constexpr unsigned KPerThread = 1;
    constexpr unsigned CPerThread = 1;

    constexpr unsigned BlockSize = 32;
#elif 0
    constexpr unsigned BPerBlock = 128;
    constexpr unsigned KPerBlock = 2;
    constexpr unsigned CPerBlock = 2;

    constexpr unsigned BPerThread = 4;
    constexpr unsigned KPerThread = 2;
    constexpr unsigned CPerThread = 1;

    constexpr unsigned BlockSize = 32;
#elif 1
    constexpr unsigned BPerBlock = 128;
    constexpr unsigned KPerBlock = 64;
    constexpr unsigned CPerBlock = 2;

    constexpr unsigned BPerThread = 4;
    constexpr unsigned KPerThread = 16;
    constexpr unsigned CPerThread = 1;

    constexpr unsigned BlockSize = 128;
#endif

    constexpr unsigned GridSize =
        ((N * Hi * Wi + BPerBlock - 1) / BPerBlock) * ((K + KPerBlock - 1) / KPerBlock);

    dim3 block_dim(BlockSize);
    dim3 grid_dim(GridSize);

    printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);

    // mem
    std::size_t data_sz = sizeof(T);
    DeviceMem in_cnhw_device_buf(data_sz * (in_cnhw.mDesc.GetElementSpace() + BGhostRead +
                                            BPerBlock)); // reserve extra space for BGhostRead
    DeviceMem wei_srck_device_buf(data_sz * wei_srck.mDesc.GetElementSpace());
    DeviceMem out_knhw_device_buf(data_sz * out_knhw.mDesc.GetElementSpace());

    in_cnhw_device_buf.ToDevice(in_cnhw.mData.data());
    wei_srck_device_buf.ToDevice(wei_srck.mData.data());
    out_knhw_device_buf.ToDevice(out_knhw.mData.data());

    for(unsigned i = 0; i < nrepeat; ++i)
    {
        cudaEvent_t start, stop;
        float elapsedTime;

        cudaEventCreate(&start);
        cudaEventRecord(start, 0);

        gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw<GridSize,
                                                            BlockSize,
                                                            T,
                                                            decltype(in_cnhw_desc),
                                                            decltype(wei_srck_desc),
                                                            decltype(out_knhw_desc),
                                                            BPerBlock,
                                                            KPerBlock,
                                                            CPerBlock,
                                                            BPerThread,
                                                            KPerThread,
                                                            CPerThread>
            <<<grid_dim, block_dim>>>(in_cnhw_desc,
                                      static_cast<T*>(in_cnhw_device_buf.GetDeviceBuffer()),
                                      wei_srck_desc,
                                      static_cast<T*>(wei_srck_device_buf.GetDeviceBuffer()),
                                      out_knhw_desc,
                                      static_cast<T*>(out_knhw_device_buf.GetDeviceBuffer()));

        cudaEventCreate(&stop);
        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsedTime, start, stop);
        printf("Elapsed time : %f ms\n", elapsedTime);
    }

    checkCudaErrors(cudaGetLastError());
    out_knhw_device_buf.FromDevice(out_knhw.mData.data());

    // convert out_knhw to out_nkhw
    auto f_reorder_knhw2nkhw = [&](auto n, auto k, auto ho, auto wo) {
        out_nkhw(n, k, ho, wo) = out_knhw(k, n, ho, wo);
    };
    make_ParallelTensorFunctor(f_reorder_knhw2nkhw, N, K, Ho, Wo)(
        std::thread::hardware_concurrency());
}
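For a concrete feel for the launch configuration chosen by the `#elif 1` branch above, a quick arithmetic check of the GridSize formula (the tensor sizes here are assumptions for illustration, not values from the commit):

    // Quick arithmetic check (sizes assumed for illustration) of the launch-size
    // formula: GridSize = ceil(B / BPerBlock) * ceil(K / KPerBlock), B = N*Hi*Wi.
    #include <cstdio>

    int main()
    {
        const unsigned N = 64, Hi = 34, Wi = 34, K = 64;
        const unsigned BPerBlock = 128, KPerBlock = 64;

        const unsigned B        = N * Hi * Wi;                        // 73984
        const unsigned GridSize = ((B + BPerBlock - 1) / BPerBlock) * // 578 blocks over B
                                  ((K + KPerBlock - 1) / KPerBlock);  // 1 block over K

        printf("B = %u, GridSize = %u\n", B, GridSize); // B = 73984, GridSize = 578
        return 0;
    }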
src/include/ConstantMatrixDescriptor.cuh

@@ -46,3 +46,14 @@ __host__ __device__ constexpr auto
 {
     return ConstantMatrixDescriptor<NRow, NCol, RowStride>{};
 }
+
+template <class TDesc>
+__host__ __device__ void print_ConstantMatrixDescriptor(TDesc, const char* s)
+{
+    const auto desc = TDesc{};
+
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    printf("%s NRow %u NCol %u RowStride %u\n", s, desc.NRow(), desc.NCol(), desc.RowStride());
+}
src/include/ConstantTensorDescriptor.cuh

 #pragma once
 #include "common.cuh"

+// this is ugly, only for 2d
+template <unsigned L0, unsigned L1>
+__host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1>)
+{
+    return Sequence<L1, 1>{};
+}
+
 // this is ugly, only for 4d
 template <unsigned L0, unsigned L1, unsigned L2, unsigned L3>
 __host__ __device__ constexpr auto calculate_default_strides(Sequence<L0, L1, L2, L3>)

@@ -49,8 +56,25 @@ struct ConstantTensorDescriptor
     // this is ugly, only for 4d
     __host__ __device__ constexpr unsigned GetElementSize() const
     {
-        static_assert(nDim == 4, "nDim is not 4");
+        static_assert(nDim >= 2 && nDim <= 4, "nDim");
+
+        if(nDim == 2)
+        {
+            constexpr auto I0 = Number<0>{};
+            constexpr auto I1 = Number<1>{};
+
+            return GetLength(I0) * GetLength(I1);
+        }
+        else if(nDim == 3)
+        {
+            constexpr auto I0 = Number<0>{};
+            constexpr auto I1 = Number<1>{};
+            constexpr auto I2 = Number<2>{};
+
+            return GetLength(I0) * GetLength(I1) * GetLength(I2);
+        }
+        else if(nDim == 4)
+        {
             constexpr auto I0 = Number<0>{};
             constexpr auto I1 = Number<1>{};
             constexpr auto I2 = Number<2>{};

@@ -58,12 +82,30 @@ struct ConstantTensorDescriptor
             return GetLength(I0) * GetLength(I1) * GetLength(I2) * GetLength(I3);
         }
+    }

-    // this is ugly, only for 4d
     __host__ __device__ constexpr unsigned GetElementSpace() const
     {
-        static_assert(nDim == 4, "nDim is not 4");
+        static_assert(nDim >= 2 && nDim <= 4, "nDim");
+
+        if(nDim == 2)
+        {
+            constexpr auto I0 = Number<0>{};
+            constexpr auto I1 = Number<1>{};
+
+            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) + 1;
+        }
+        else if(nDim == 3)
+        {
+            constexpr auto I0 = Number<0>{};
+            constexpr auto I1 = Number<1>{};
+            constexpr auto I2 = Number<2>{};
+
+            return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
+                   (GetLength(I2) - 1) * GetStride(I2) + 1;
+        }
+        else if(nDim == 4)
+        {
             constexpr auto I0 = Number<0>{};
             constexpr auto I1 = Number<1>{};
             constexpr auto I2 = Number<2>{};

@@ -72,6 +114,28 @@ struct ConstantTensorDescriptor
             return (GetLength(I0) - 1) * GetStride(I0) + (GetLength(I1) - 1) * GetStride(I1) +
                    (GetLength(I2) - 1) * GetStride(I2) + (GetLength(I3) - 1) * GetStride(I3) + 1;
         }
+    }
+
+    // this is ugly, only for 2d
+    __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1) const
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+
+        static_assert(nDim == 2, "nDim is not 2");
+
+        return i0 * GetStride(I0) + i1 * GetStride(I1);
+    }
+
+    // this is ugly, only for 3d
+    __host__ __device__ unsigned Get1dIndex(unsigned i0, unsigned i1, unsigned i2) const
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+        constexpr auto I2 = Number<2>{};
+
+        static_assert(nDim == 3, "nDim is not 3");
+
+        return i0 * GetStride(I0) + i1 * GetStride(I1) + i2 * GetStride(I2);
+    }

     // this is ugly, only for 4d
     __host__ __device__ unsigned

@@ -106,19 +170,34 @@ __host__ __device__ constexpr auto make_ConstantTensorDescriptor(Lengths, Stride
     return ConstantTensorDescriptor<Lengths, Strides>{};
 }

-// this is ugly, only for 4d
 template <class TDesc>
 __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
 {
     constexpr auto desc = TDesc{};
+    constexpr unsigned ndim = desc.GetDimension();
+
+    static_assert(ndim >= 2 && ndim <= 4, "wrong!");
+
+    if(ndim == 2)
+    {
+        constexpr auto I0 = Number<0>{};
+        constexpr auto I1 = Number<1>{};
+
+        printf("%s dim %u, lengths {%u %u}, strides {%u %u}\n",
+               s,
+               desc.GetDimension(),
+               desc.GetLength(I0),
+               desc.GetLength(I1),
+               desc.GetStride(I0),
+               desc.GetStride(I1));
+    }
+    else if(ndim == 4)
+    {
         constexpr auto I0 = Number<0>{};
         constexpr auto I1 = Number<1>{};
         constexpr auto I2 = Number<2>{};
         constexpr auto I3 = Number<3>{};

-    static_assert(desc.GetDimension() == 4, "dim is not 4");
-
         printf("%s dim %u, lengths {%u %u %u %u}, strides {%u %u %u %u}\n",
                s,
                desc.GetDimension(),

@@ -130,4 +209,5 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
                desc.GetStride(I1),
                desc.GetStride(I2),
                desc.GetStride(I3));
+    }
 }
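The distinction the new 2d/3d branches carry over from the 4d code: GetElementSize is the product of the lengths, while GetElementSpace is the largest reachable flat offset plus one, and the two diverge as soon as a stride is padded. A self-contained sketch of the same formulas (plain C++, names local to the example, not the repo's templates):

    // Element *size* is the product of lengths; element *space* is the last
    // reachable flat offset + 1, which differs once strides are padded.
    #include <cstdio>

    struct Desc2d
    {
        unsigned len[2], stride[2];
        unsigned GetElementSize() const { return len[0] * len[1]; }
        unsigned GetElementSpace() const
        {
            return (len[0] - 1) * stride[0] + (len[1] - 1) * stride[1] + 1;
        }
    };

    int main()
    {
        Desc2d packed{{4, 8}, {8, 1}};  // default strides: space == size
        Desc2d padded{{4, 8}, {10, 1}}; // row stride 10: 2 padding elements per row

        printf("packed: size %u space %u\n", packed.GetElementSize(), packed.GetElementSpace());
        printf("padded: size %u space %u\n", padded.GetElementSize(), padded.GetElementSpace());
        // packed: size 32 space 32; padded: size 32 space 38
        return 0;
    }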
src/include/blockwise_2d_tensor_op.cuh (new file, mode 100644)

#pragma once
#include "ConstantTensorDescriptor.cuh"

template <unsigned BlockSize, class Float, class DstDesc, class F>
__device__ void
blockwise_2d_tensor_pointwise_operation_unary(DstDesc, Float* __restrict__ p_dst, F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    constexpr auto dst_desc = DstDesc{};

    constexpr auto desc = make_ConstantTensorDescriptor(dst_desc.GetLengths());

#if 0
    if(threadIdx.x == 0)
    {
        print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_unary: dst_desc: ");
        print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_unary: desc: ");
    }
#endif

    constexpr unsigned NLoop = desc.GetElementSize() / BlockSize;

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        const unsigned did0 = is / desc.GetStride(I0);

        is -= did0 * desc.GetStride(I0);

        const unsigned did1 = is / desc.GetStride(I1);

        const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

        f(p_dst[dindex]);
    }

    constexpr bool has_tail = (desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < desc.GetElementSize())
        {
            const unsigned did0 = is / desc.GetStride(I0);

            is -= did0 * desc.GetStride(I0);

            const unsigned did1 = is / desc.GetStride(I1);

            const unsigned dindex = dst_desc.Get1dIndex(did0, did1);

            f(p_dst[dindex]);
        }
    }
}

// Function: p_dst[reorder[i0], reorder[i1]] = p_src[i0, i1]
// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
template <unsigned BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class DstFromSrcReorder,
          class F>
__device__ void blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
    SrcDesc,
    Float* const __restrict__ p_src,
    DstDesc,
    Float* __restrict__ p_dst,
    SrcOpLengths,
    DstFromSrcReorder,
    F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
    constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});

    constexpr unsigned NLoop = ref_desc.GetElementSize() / BlockSize;

    for(unsigned iloop = 0; iloop < NLoop; ++iloop)
    {
        unsigned is = threadIdx.x + iloop * BlockSize;

        unsigned did[2];

        did[0] = is / ref_desc.GetStride(I0);

        is -= did[0] * ref_desc.GetStride(I0);

        did[1] = is / ref_desc.GetStride(I1);

        const unsigned aindex = src_desc.Get1dIndex(did[0], did[1]);

        const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);

        f(p_src[aindex], p_dst[bindex]);
    }

    constexpr bool has_tail = (ref_desc.GetElementSize() > NLoop * BlockSize);

    if(has_tail)
    {
        unsigned is = threadIdx.x + NLoop * BlockSize;

        if(is < ref_desc.GetElementSize())
        {
            unsigned did[2];

            did[0] = is / ref_desc.GetStride(I0);

            is -= did[0] * ref_desc.GetStride(I0);

            did[1] = is / ref_desc.GetStride(I1);

            const unsigned aindex = src_desc.Get1dIndex(did[0], did[1]);

            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);

            f(p_src[aindex], p_dst[bindex]);
        }
    }
}

template <unsigned BlockSize, class Float, class DstDesc>
__device__ void blockwise_2d_tensor_set_zero(DstDesc, Float* __restrict__ p_dst)
{
    auto f_set_zero = [](Float& v) { v = Float(0); };

    blockwise_2d_tensor_pointwise_operation_unary<BlockSize>(DstDesc{}, p_dst, f_set_zero);
}

template <unsigned BlockSize,
          class Float,
          class SrcDesc,
          class DstDesc,
          class SrcOpLengths,
          class DstFromSrcReorder>
__device__ void
blockwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
                                                     Float* const __restrict__ p_src,
                                                     DstDesc,
                                                     Float* __restrict__ p_dst,
                                                     SrcOpLengths,
                                                     DstFromSrcReorder)
{
    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

    blockwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src<BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
}

template <unsigned BlockSize, class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
__device__ void blockwise_2d_tensor_copy(
    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
{
    constexpr auto dst_from_src_reorder = Sequence<0, 1>{};

    blockwise_2d_tensor_copy_reorder_by_get_dst_from_src<BlockSize>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
}
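All the blockwise ops in this new header share one addressing scheme: each thread works on flat ids tid + iloop*BlockSize, peels off did0 and did1 by dividing by the packed reference strides, and a final guarded pass handles the remainder when the element count is not a multiple of BlockSize. A host-side rehearsal of that arithmetic (tile shape and block size are assumptions for the example):

    // Host-side rehearsal (shapes assumed) of the index arithmetic inside the
    // blockwise 2d ops: flat id -> (did0, did1) via packed strides, NLoop full
    // passes of BlockSize work items plus one guarded tail pass.
    #include <cstdio>

    int main()
    {
        const unsigned BlockSize = 32;
        const unsigned L0 = 4, L1 = 40;           // lengths of the 2d tile
        const unsigned stride0 = L1, stride1 = 1; // packed reference strides

        const unsigned size  = L0 * L1;           // 160 elements
        const unsigned NLoop = size / BlockSize;  // 5 full passes

        for(unsigned iloop = 0; iloop <= NLoop; ++iloop) // <= NLoop: includes tail pass
        {
            for(unsigned tid = 0; tid < BlockSize; ++tid)
            {
                unsigned is = tid + iloop * BlockSize;
                if(is >= size) // the "has_tail" guard; skips the whole pass here
                    continue;
                unsigned did0 = is / stride0;
                is -= did0 * stride0;
                unsigned did1 = is / stride1;
                if(tid == 0) // print one sample per pass
                    printf("pass %u: thread 0 -> (%u, %u)\n", iloop, did0, did1);
            }
        }
        return 0;
    }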
src/include/blockwise_tensor_op.cuh → src/include/blockwise_4d_tensor_op.cuh

File moved.
src/include/blockwise_direct_convolution.cuh

 #pragma once
 #include "ConstantTensorDescriptor.cuh"
-#include "threadwise_tensor_op.cuh"
+#include "threadwise_4d_tensor_op.cuh"
 #include "threadwise_direct_convolution.cuh"

 template <unsigned BlockSize,
src/include/gemm.cuh

@@ -114,6 +114,12 @@ struct blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c
             : b_block_mtx.Get1dIndex(c_thread_mtx_index.col_begin, 0));

 #if 0
+    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
+    {
+        print_ConstantMatrixDescriptor(BlockMatrixA{}, "a_block_mtx: ");
+        print_ConstantMatrixDescriptor(BlockMatrixB{}, "b_block_mtx: ");
+        print_ConstantMatrixDescriptor(ThreadMatrixC{}, "c_thread_mtx: ");
+
     printf("%u %u, %u %u %u, %u %u\n",
            get_block_1d_id(),
            get_thread_local_1d_id(),

@@ -122,6 +128,7 @@ struct blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c
            c_thread_mtx_index.col_begin,
            mMyThreadOffsetA,
            mMyThreadOffsetB);
+    }
 #endif
 }
src/include/gridwise_direct_convolution_1.cuh

 #pragma once
 #include "ConstantTensorDescriptor.cuh"
-#include "blockwise_tensor_op.cuh"
+#include "blockwise_4d_tensor_op.cuh"
 #include "blockwise_direct_convolution.cuh"

 template <class Float,
src/include/gridwise_direct_convolution_2.cuh

 #pragma once
 #include "ConstantTensorDescriptor.cuh"
-#include "blockwise_tensor_op.cuh"
+#include "blockwise_4d_tensor_op.cuh"
 #include "blockwise_direct_convolution.cuh"
-#include "threadwise_tensor_op.cuh"
+#include "threadwise_4d_tensor_op.cuh"
 #include "threadwise_direct_convolution.cuh"

 template <class Float,
src/include/gridwise_implicit_gemm_convolution_nchw_kcsr.cuh → src/include/gridwise_implicit_gemm_convolution_1_nchw_kcsr.cuh

@@ -2,8 +2,8 @@
 #include "common.cuh"
 #include "ConstantTensorDescriptor.cuh"
 #include "ConstantMatrixDescriptor.cuh"
-#include "blockwise_tensor_op.cuh"
-#include "threadwise_tensor_op.cuh"
+#include "blockwise_4d_tensor_op.cuh"
+#include "threadwise_4d_tensor_op.cuh"
 #include "gemm.cuh"

 template <unsigned GridSize,

@@ -21,7 +21,8 @@ template <unsigned GridSize,
           unsigned CPerThread,
           unsigned HoPerThread,
           unsigned WoPerThread>
-__global__ void gridwise_implicit_gemm_convolution_nchw_kcsr(InGlobalDesc,
+__global__ void
+gridwise_implicit_gemm_convolution_1_nchw_kcsr(InGlobalDesc,
                Float* const __restrict__ p_in_global,
                WeiGlobalDesc,
                Float* const __restrict__ p_wei_global,
src/include/gridwise_implicit_gemm_convolution_nchw_srck.cuh → src/include/gridwise_implicit_gemm_convolution_1_nchw_srck.cuh

@@ -2,8 +2,8 @@
 #include "common.cuh"
 #include "ConstantTensorDescriptor.cuh"
 #include "ConstantMatrixDescriptor.cuh"
-#include "blockwise_tensor_op.cuh"
-#include "threadwise_tensor_op.cuh"
+#include "blockwise_4d_tensor_op.cuh"
+#include "threadwise_4d_tensor_op.cuh"
 #include "gemm.cuh"

 template <unsigned GridSize,

@@ -21,7 +21,8 @@ template <unsigned GridSize,
           unsigned CPerThread,
           unsigned HoPerThread,
           unsigned WoPerThread>
-__global__ void gridwise_implicit_gemm_convolution_nchw_srck(InGlobalDesc,
+__global__ void
+gridwise_implicit_gemm_convolution_1_nchw_srck(InGlobalDesc,
                Float* const __restrict__ p_in_global,
                WeiGlobalDesc,
                Float* const __restrict__ p_wei_global,
src/include/gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw.cuh (new file, mode 100644)

#pragma once
#include "common.cuh"
#include "ConstantTensorDescriptor.cuh"
#include "ConstantMatrixDescriptor.cuh"
#include "blockwise_4d_tensor_op.cuh"
#include "blockwise_2d_tensor_op.cuh"
#include "threadwise_2d_tensor_op.cuh"
#include "gemm.cuh"

// define B = N*Hi*Wi
template <unsigned GridSize,
          unsigned BlockSize,
          class Float,
          class InGlobalDesc,
          class WeiGlobalDesc,
          class OutGlobalDesc,
          unsigned BPerBlock,
          unsigned KPerBlock,
          unsigned CPerBlock,
          unsigned BPerThread,
          unsigned KPerThread,
          unsigned CPerThread>
__global__ void
gridwise_implicit_gemm_convolution_2_cnhw_srck_knhw(InGlobalDesc,
                                                    Float* const __restrict__ p_in_global,
                                                    WeiGlobalDesc,
                                                    Float* const __restrict__ p_wei_global,
                                                    OutGlobalDesc,
                                                    Float* __restrict__ p_out_global)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto in_cnhw_global_desc  = InGlobalDesc{};
    constexpr auto wei_srck_global_desc = WeiGlobalDesc{};
    constexpr auto out_knhw_global_desc = OutGlobalDesc{};

    constexpr unsigned C  = in_cnhw_global_desc.GetLength(I0);
    constexpr unsigned N  = in_cnhw_global_desc.GetLength(I1);
    constexpr unsigned Hi = in_cnhw_global_desc.GetLength(I2);
    constexpr unsigned Wi = in_cnhw_global_desc.GetLength(I3);

    constexpr unsigned K  = out_knhw_global_desc.GetLength(I0);
    constexpr unsigned Ho = out_knhw_global_desc.GetLength(I2);
    constexpr unsigned Wo = out_knhw_global_desc.GetLength(I3);

    constexpr unsigned S = wei_srck_global_desc.GetLength(I0);
    constexpr unsigned R = wei_srck_global_desc.GetLength(I1);

    constexpr unsigned B = N * Hi * Wi;

    constexpr unsigned BGhostRead = (S - 1) * Wi + (R - 1);

    // divide block work by 2d: [K, B]
    constexpr unsigned KBlockWork = (K + KPerBlock - 1) / KPerBlock;
    constexpr unsigned BBlockWork = (B + BPerBlock - 1) / BPerBlock;

    const unsigned k_block_work_id = get_block_1d_id() / BBlockWork;
    const unsigned b_block_work_id = get_block_1d_id() - k_block_work_id * BBlockWork;

    const unsigned k_block_data_begin = k_block_work_id * KPerBlock;
    const unsigned b_block_data_begin = b_block_work_id * BPerBlock;

#if 0
    if(get_thread_local_1d_id() == 0)
    {
        printf("K %u B %u, BGhostRead %u\n", K, B, BGhostRead);
        printf("%u %u, KBlockWork %u BBlockWork %u, k_block_data_begin %u b_block_data_begin %u\n",
               get_block_1d_id(),
               get_thread_local_1d_id(),
               KBlockWork,
               BBlockWork,
               k_block_data_begin,
               b_block_data_begin);
    }
#endif

    // flattened (2d) tensor view of gridwise input
    constexpr auto in_cb_global_desc = make_ConstantTensorDescriptor(Sequence<C, B>{});

    // tensor view of blockwise input and weight
    constexpr auto in_cb_block_desc =
        make_ConstantTensorDescriptor(Sequence<CPerBlock, BPerBlock + BGhostRead>{});
    constexpr auto wei_srck_block_desc =
        make_ConstantTensorDescriptor(Sequence<S, R, CPerBlock, KPerBlock>{});

    // tensor view of threadwise output in register
    constexpr auto out_kb_thread_desc =
        make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});

#if 0
    if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
    {
        print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
        print_ConstantTensorDescriptor(wei_srck_block_desc, "wei_srck_block_desc");
        print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
        printf("KPerBlock %u\n", KPerBlock);
    }
#endif

    // a series of blockwise GEMM
    // c_mtx += transpose(a_mtx) * b_mtx
    // a_mtx and b_mtx saved in LDS, c_mtx saved in register
    // a_mtx[C,K] is a sub-matrix of wei_block[S,R,C,K]
    // b_mtx[C,B] is a subset of in_block[C,B + BGhostRead]
    // c_mtx[K,B] is out_block[K,B]
    const auto a_cxk_block_mtx_desc = make_ConstantMatrixDescriptor(
        Number<CPerBlock>{}, Number<KPerBlock>{}); // constexpr doesn't compile

    const auto b_cxb_block_mtx_desc =
        make_ConstantMatrixDescriptor(Number<CPerBlock>{},
                                      Number<BPerBlock>{},
                                      Number<in_cb_block_desc.GetStride(I0)>{}); // constexpr doesn't compile

    const auto c_kxb_thread_mtx_desc = make_ConstantMatrixDescriptor(
        Number<KPerThread>{}, Number<BPerThread>{}); // constexpr doesn't compile

    const auto blockwise_gemm =
        blockwise_1d_strided_batched_gemm_block_a_block_b_thread_c<BlockSize,
                                                                   decltype(a_cxk_block_mtx_desc),
                                                                   decltype(b_cxb_block_mtx_desc),
                                                                   decltype(c_kxb_thread_mtx_desc),
                                                                   true,
                                                                   false,
                                                                   false,
                                                                   0,
                                                                   0,
                                                                   0,
                                                                   1,
                                                                   1,
                                                                   CPerThread,
                                                                   true>{};

    // LDS
    constexpr unsigned in_block_size  = in_cb_block_desc.GetElementSpace();
    constexpr unsigned wei_block_size = wei_srck_block_desc.GetElementSpace();

    __shared__ Float p_in_block[in_block_size];
    __shared__ Float p_wei_block[wei_block_size];

    // register
    Float p_out_thread[out_kb_thread_desc.GetElementSpace()];

    // set threadwise output tensor to 0
    threadwise_2d_tensor_set_zero(out_kb_thread_desc, p_out_thread);

    for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
        c_block_data_begin += CPerBlock, __syncthreads())
    {
        // input: global mem to LDS,
        // format is [CPerBlock, BPerBlock + BGhostRead]
        blockwise_2d_tensor_copy<BlockSize>(
            in_cb_global_desc,
            p_in_global + in_cb_global_desc.Get1dIndex(c_block_data_begin, b_block_data_begin),
            in_cb_block_desc,
            p_in_block,
            in_cb_block_desc.GetLengths());

        // weight: global mem to LDS,
        // format is [S, R, CPerBlock, KPerBlock]
        blockwise_4d_tensor_copy<BlockSize>(
            wei_srck_global_desc,
            p_wei_global +
                wei_srck_global_desc.Get1dIndex(0, 0, c_block_data_begin, k_block_data_begin),
            wei_srck_block_desc,
            p_wei_block,
            wei_srck_block_desc.GetLengths());

        __syncthreads();

        // a series of GEMM
        for(unsigned s = 0; s < S; ++s)
        {
            for(unsigned r = 0; r < R; ++r)
            {
                auto f_accum = [](auto& c, const auto&& ab) { c += ab; };

                blockwise_gemm.run(p_wei_block + wei_srck_block_desc.Get1dIndex(s, r, 0, 0),
                                   p_in_block + s * Wi + r,
                                   p_out_thread,
                                   f_accum);
            }
        }
    }

    // output: register to global mem
    const auto matrix_c_index =
        blockwise_gemm.CalculateThreadMatrixCIndex(get_thread_local_1d_id());

    const unsigned k_thread_data_begin = matrix_c_index.row_begin;
    const unsigned b_thread_data_begin = matrix_c_index.col_begin;

    const unsigned k_data_begin = k_block_data_begin + k_thread_data_begin;
    const unsigned b_data_begin = b_block_data_begin + b_thread_data_begin;

#if 0
    //if(get_block_1d_id() == 10)
    {
        printf("%u %u, batch_begin %u row_begin %u col_begin %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
               get_block_1d_id(),
               get_thread_local_1d_id(),
               matrix_c_index.batch_begin,
               matrix_c_index.row_begin,
               matrix_c_index.col_begin,
               k_data_begin,
               b_data_begin,
               p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
    }
#endif

    for(unsigned k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
    {
        for(unsigned b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
        {
            unsigned k_data = k_data_begin + k;
            unsigned b_data = b_data_begin + b;

            unsigned n_data = b_data / (Hi * Wi);
            unsigned itmp   = b_data - n_data * (Hi * Wi);
            unsigned h_data = itmp / Wi;
            unsigned w_data = itmp - h_data * Wi;

#if 0
            if(get_block_1d_id() == 10)
            {
                printf("%u %u, k %u b %u, k_data %u n_data %u h_data %u w_data %u %f\n",
                       get_block_1d_id(),
                       get_thread_local_1d_id(),
                       k,
                       b,
                       k_data,
                       n_data,
                       h_data,
                       w_data,
                       p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]);
            }
#endif

            if(k_data < K && n_data < N && h_data < Ho && w_data < Wo)
            {
                p_out_global[out_knhw_global_desc.Get1dIndex(k_data, n_data, h_data, w_data)] =
                    p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)];
#if 0
                if(get_block_1d_id() == 0)
                {
                    printf("%u %u, k %u b %u, k_data %u n_data %u h_data %u w_data %u %f\n",
                           get_block_1d_id(),
                           get_thread_local_1d_id(),
                           k,
                           b,
                           k_data,
                           n_data,
                           h_data,
                           w_data,
                           p_out_thread[out_kb_thread_desc.Get1dIndex(k, b)]);
                }
#endif
            }
        }
    }
}
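The reason the driver over-allocates the input buffer by BGhostRead: inside the tap loop the kernel reads p_in_block + s*Wi + r, so the largest shift applied to a block's columns is (S-1)*Wi + (R-1). A small check of that bound (sizes assumed for illustration):

    // Small check (sizes assumed) that the largest shifted read in the tap loop,
    // b + s*Wi + r with s = S-1 and r = R-1, is exactly BGhostRead past the last
    // in-range column -- which is why the device buffer reserves that many extras.
    #include <cstdio>

    int main()
    {
        const unsigned Wi = 34, S = 3, R = 3;
        const unsigned BGhostRead = (S - 1) * Wi + (R - 1);

        unsigned max_shift = 0;
        for(unsigned s = 0; s < S; ++s)
            for(unsigned r = 0; r < R; ++r)
                if(s * Wi + r > max_shift)
                    max_shift = s * Wi + r;

        printf("max shift %u == BGhostRead %u\n", max_shift, BGhostRead); // 70 == 70
        return 0;
    }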
src/include/threadwise_2d_tensor_op.cuh (new file, mode 100644)

#pragma once
#include "ConstantTensorDescriptor.cuh"

template <class Float, class Desc, class F>
__device__ void threadwise_2d_tensor_pointwise_operation_unary(Desc, Float* __restrict__ p, F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    constexpr auto desc = Desc{};

#if 0
    if(threadIdx.x == 0)
    {
        print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_op_unary: ");
    }
#endif

    for(unsigned did0 = 0; did0 < desc.GetLength(I0); ++did0)
    {
        for(unsigned did1 = 0; did1 < desc.GetLength(I1); ++did1)
        {
            const unsigned dindex = desc.Get1dIndex(did0, did1);

            f(p[dindex]);
        }
    }
}

// TODO: in order to optimize mem access for different mem type,
// need to write specialized version
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, class DstFromSrcReorder, class F>
__device__ void threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
    SrcDesc,
    Float* const __restrict__ p_src,
    DstDesc,
    Float* __restrict__ p_dst,
    SrcOpLengths,
    DstFromSrcReorder,
    F f)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    constexpr unsigned IR0 = DstFromSrcReorder{}.Get(I0);
    constexpr unsigned IR1 = DstFromSrcReorder{}.Get(I1);

    constexpr auto src_desc = SrcDesc{};
    constexpr auto dst_desc = DstDesc{};
    constexpr auto ref_desc = make_ConstantTensorDescriptor(SrcOpLengths{});

    for(unsigned did0 = 0; did0 < ref_desc.GetLength(I0); ++did0)
    {
        for(unsigned did1 = 0; did1 < ref_desc.GetLength(I1); ++did1)
        {
            const unsigned aindex = src_desc.Get1dIndex(did0, did1);

            const unsigned did[2] = {did0, did1};

            const unsigned bindex = dst_desc.Get1dIndex(did[IR0], did[IR1]);

            f(p_src[aindex], p_dst[bindex]);
        }
    }
}

template <class Float, class Desc>
__device__ void threadwise_2d_tensor_set_zero(Desc, Float* __restrict__ p)
{
    auto f_set_zero = [](Float& v) { v = Float(0); };

    threadwise_2d_tensor_pointwise_operation_unary<Float, Desc, decltype(f_set_zero)>(
        Desc{}, p, f_set_zero);
}

template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, class DstFromSrcReorder>
__device__ void
threadwise_2d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
                                                      Float* const __restrict__ p_src,
                                                      DstDesc,
                                                      Float* __restrict__ p_dst,
                                                      SrcOpLengths,
                                                      DstFromSrcReorder)
{
    auto f_copy = [](const Float& src, Float& dst) { dst = src; };

    threadwise_2d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
}

template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
__device__ void threadwise_2d_tensor_copy(
    SrcDesc, Float* const __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
{
    auto dst_from_src_reorder = Sequence<0, 1>{};

    threadwise_2d_tensor_copy_reorder_by_get_dst_from_src(
        SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, dst_from_src_reorder);
}

template <class Float, class Desc, class IDim, class NShift>
__device__ void threadwise_2d_tensor_shift_down(Desc, Float* __restrict__ p, IDim, NShift)
{
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};

    constexpr auto desc = Desc{};

#if 0
    if(threadIdx.x == 0)
    {
        print_ConstantTensorDescriptor(desc, "threadwise_4d_tensor_shift_down: ");
    }
#endif

    constexpr unsigned nshift = NShift::mValue;

    constexpr unsigned did0_end =
        is_same<decltype(I0), IDim>::value ? desc.GetLength(I0) - nshift : desc.GetLength(I0);

    constexpr unsigned did1_end =
        is_same<decltype(I1), IDim>::value ? desc.GetLength(I1) - nshift : desc.GetLength(I1);

    for(unsigned did0 = 0; did0 < did0_end; ++did0)
    {
        for(unsigned did1 = 0; did1 < did1_end; ++did1)
        {
            const unsigned dindex = desc.Get1dIndex(did0, did1);

            const unsigned sindex = dindex + nshift * desc.GetStride(IDim{});

            p[dindex] = p[sindex];
        }
    }
}
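A usage note on the reorder copies in this header: with DstFromSrcReorder = Sequence<1, 0> the destination index permutes the source coordinates, which makes the copy a transpose. A plain C++ rendering of the same bookkeeping (descriptors reduced to stride pairs for the example, not the repo's templates):

    // dst index uses did[] permuted by the reorder, so Sequence<1, 0> makes
    // p_dst the transpose of p_src.
    #include <cstdio>

    int main()
    {
        const unsigned L0 = 2, L1 = 3;
        const unsigned src_stride[2] = {L1, 1}; // src is [L0, L1], packed
        const unsigned dst_stride[2] = {L0, 1}; // dst is [L1, L0], packed
        const unsigned IR0 = 1, IR1 = 0;        // DstFromSrcReorder = Sequence<1, 0>

        float p_src[L0 * L1] = {0, 1, 2, 3, 4, 5};
        float p_dst[L1 * L0];

        for(unsigned did0 = 0; did0 < L0; ++did0)
            for(unsigned did1 = 0; did1 < L1; ++did1)
            {
                const unsigned did[2] = {did0, did1};
                const unsigned aindex = did0 * src_stride[0] + did1 * src_stride[1];
                const unsigned bindex = did[IR0] * dst_stride[0] + did[IR1] * dst_stride[1];
                p_dst[bindex] = p_src[aindex];
            }

        for(unsigned i = 0; i < L1 * L0; ++i)
            printf("%g ", p_dst[i]); // 0 3 1 4 2 5
        printf("\n");
        return 0;
    }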
src/include/threadwise_tensor_op.cuh → src/include/threadwise_4d_tensor_op.cuh

File moved.