Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
24d2f034
"git@developer.sourcefind.cn:yangql/composable_kernel-1.git" did not exist on "7353ec0c25468d754ad5dd786e979a3bbade0a47"
Commit
24d2f034
authored
Nov 25, 2018
by
Chao Liu
Browse files
refactor direct
parent
8732ea04
Changes
14
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
234 additions
and
1272 deletions
+234
-1272
driver/conv.cu
driver/conv.cu
+13
-14
driver/device_direct_convolution_1.cuh
driver/device_direct_convolution_1.cuh
+4
-4
driver/device_direct_convolution_2.cuh
driver/device_direct_convolution_2.cuh
+4
-13
driver/device_winograd_convolution.cuh
driver/device_winograd_convolution.cuh
+0
-89
src/include/blockwise_direct_convolution.cuh
src/include/blockwise_direct_convolution.cuh
+32
-54
src/include/blockwise_tensor_op.cuh
src/include/blockwise_tensor_op.cuh
+52
-400
src/include/blockwise_winograd_transform.cuh
src/include/blockwise_winograd_transform.cuh
+0
-36
src/include/constant_tensor_descriptor.cuh
src/include/constant_tensor_descriptor.cuh
+28
-28
src/include/gridwise_direct_convolution_1.cuh
src/include/gridwise_direct_convolution_1.cuh
+29
-63
src/include/gridwise_direct_convolution_2.cuh
src/include/gridwise_direct_convolution_2.cuh
+32
-66
src/include/gridwise_winograd_convolution.cuh
src/include/gridwise_winograd_convolution.cuh
+0
-246
src/include/threadwise_direct_convolution.cuh
src/include/threadwise_direct_convolution.cuh
+4
-4
src/include/threadwise_tensor_op.cuh
src/include/threadwise_tensor_op.cuh
+36
-117
src/include/threadwise_winograd_transform.cuh
src/include/threadwise_winograd_transform.cuh
+0
-138
No files found.
driver/conv.cu
View file @
24d2f034
...
@@ -7,7 +7,6 @@
...
@@ -7,7 +7,6 @@
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_1.cuh"
#include "device_direct_convolution_2.cuh"
#include "device_direct_convolution_2.cuh"
#include "device_winograd_convolution.cuh"
struct
GeneratorConstant
struct
GeneratorConstant
{
{
...
@@ -61,10 +60,10 @@ void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::
...
@@ -61,10 +60,10 @@ void ostream_ConstantTensorDescriptor(TConstTensorDesc, std::ostream& os = std::
{
{
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
os
<<
"Lengths: {"
<<
desc
.
GetLength
(
I0
)
<<
", "
<<
desc
.
GetLength
(
I1
)
<<
", "
os
<<
"Lengths: {"
<<
desc
.
GetLength
(
I0
)
<<
", "
<<
desc
.
GetLength
(
I1
)
<<
", "
...
@@ -79,10 +78,10 @@ auto make_TensorDescriptor(TConstTensorDesc)
...
@@ -79,10 +78,10 @@ auto make_TensorDescriptor(TConstTensorDesc)
{
{
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
TConstTensorDesc
::
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
constexpr
auto
desc
=
TConstTensorDesc
{};
std
::
initializer_list
<
unsigned
>
lengths
=
{
std
::
initializer_list
<
unsigned
>
lengths
=
{
...
@@ -396,7 +395,7 @@ int main()
...
@@ -396,7 +395,7 @@ int main()
Tensor
<
float
>
out_host
(
make_TensorDescriptor
(
out_desc
));
Tensor
<
float
>
out_host
(
make_TensorDescriptor
(
out_desc
));
Tensor
<
float
>
out_device
(
make_TensorDescriptor
(
out_desc
));
Tensor
<
float
>
out_device
(
make_TensorDescriptor
(
out_desc
));
#if
0
#if
1
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
std
::
size_t
num_thread
=
std
::
thread
::
hardware_concurrency
();
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
in
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
wei
.
GenerateTensorValue
(
GeneratorTensor_2
{
-
5
,
5
},
num_thread
);
...
@@ -405,17 +404,17 @@ int main()
...
@@ -405,17 +404,17 @@ int main()
for
(
int
i
=
0
;
i
<
20
;
++
i
)
for
(
int
i
=
0
;
i
<
20
;
++
i
)
{
{
#if 1
#if 1
device_direct_convolution_
2
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
device_direct_convolution_
1
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#else
#else
device_winograd_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
device_winograd_convolution
(
in_desc
,
in
,
wei_desc
,
wei
,
out_desc
,
out_device
);
#endif
#endif
}
}
#if
0
#if
1
host_
direct
_convolution(in, wei, out_host);
host_
winograd_3x3
_convolution
(
in
,
wei
,
out_host
);
check_error
(
out_host
,
out_device
);
check_error
(
out_host
,
out_device
);
#elif 0
#elif 0
host_
winograd_3x3
_convolution
(
in
,
wei
,
out_host
);
host_
direct
_convolution
(
in
,
wei
,
out_host
);
check_error
(
out_host
,
out_device
);
check_error
(
out_host
,
out_device
);
#endif
#endif
...
...
driver/device_direct_convolution_1.cuh
View file @
24d2f034
...
@@ -16,10 +16,10 @@ void device_direct_convolution_1(
...
@@ -16,10 +16,10 @@ void device_direct_convolution_1(
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
...
...
driver/device_direct_convolution_2.cuh
View file @
24d2f034
...
@@ -16,10 +16,10 @@ void device_direct_convolution_2(
...
@@ -16,10 +16,10 @@ void device_direct_convolution_2(
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
...
@@ -36,11 +36,6 @@ void device_direct_convolution_2(
...
@@ -36,11 +36,6 @@ void device_direct_convolution_2(
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
KPerThread
=
4
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
NBlockOpLen0
=
1
;
constexpr
unsigned
NBlockOpLen1
=
1
;
constexpr
unsigned
NBlockOpLen2
=
4
;
constexpr
unsigned
NBlockOpLen3
=
32
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
...
@@ -73,10 +68,6 @@ void device_direct_convolution_2(
...
@@ -73,10 +68,6 @@ void device_direct_convolution_2(
NPerThread
,
NPerThread
,
KPerThread
,
KPerThread
,
CPerThread
,
CPerThread
,
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
BlockSize
,
BlockSize
,
GridSize
>
GridSize
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
...
...
driver/device_winograd_convolution.cuh
deleted
100644 → 0
View file @
8732ea04
#pragma once
#include "gridwise_winograd_convolution.cuh"
template
<
class
T
,
class
InDesc
,
class
WeiDesc
,
class
OutDesc
>
void
device_winograd_convolution
(
InDesc
,
const
Tensor
<
T
>&
in
,
WeiDesc
,
const
Tensor
<
T
>&
wei
,
OutDesc
,
Tensor
<
T
>&
out
)
{
std
::
size_t
data_sz
=
sizeof
(
T
);
DeviceMem
in_device_buf
(
data_sz
*
in
.
mDesc
.
GetElementSpace
());
DeviceMem
wei_device_buf
(
data_sz
*
wei
.
mDesc
.
GetElementSpace
());
DeviceMem
out_device_buf
(
data_sz
*
out
.
mDesc
.
GetElementSpace
());
int
num_thread
=
std
::
thread
::
hardware_concurrency
();
in_device_buf
.
ToDevice
(
in
.
mData
.
data
());
wei_device_buf
.
ToDevice
(
wei
.
mData
.
data
());
out_device_buf
.
ToDevice
(
out
.
mData
.
data
());
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
out_desc
=
OutDesc
{};
constexpr
unsigned
OutTileSizeH
=
2
;
constexpr
unsigned
OutTileSizeW
=
2
;
constexpr
unsigned
NPerBlock
=
2
;
constexpr
unsigned
KPerBlock
=
16
;
constexpr
unsigned
CPerBlock
=
4
;
constexpr
unsigned
YPerBlock
=
1
;
constexpr
unsigned
XPerBlock
=
16
;
constexpr
unsigned
NPerThread
=
2
;
constexpr
unsigned
KPerThread
=
2
;
constexpr
unsigned
CPerThread
=
2
;
constexpr
unsigned
BlockSize
=
128
;
constexpr
unsigned
GridSize
=
(
out_desc
.
GetLength
(
I0
)
/
NPerBlock
)
*
(
out_desc
.
GetLength
(
I1
)
/
KPerBlock
)
*
(
out_desc
.
GetLength
(
I2
)
/
(
OutTileSizeH
*
YPerBlock
))
*
(
out_desc
.
GetLength
(
I3
)
/
(
OutTileSizeW
*
XPerBlock
));
dim3
block_dim
(
BlockSize
);
dim3
grid_dim
(
GridSize
);
printf
(
"%s: BlockSize %u, GridSize %u
\n
"
,
__func__
,
BlockSize
,
GridSize
);
cudaEvent_t
start
,
stop
;
float
elapsedTime
;
cudaEventCreate
(
&
start
);
cudaEventRecord
(
start
,
0
);
gridwise_winograd_convolution
<
T
,
InDesc
,
WeiDesc
,
OutDesc
,
OutTileSizeH
,
OutTileSizeW
,
NPerBlock
,
KPerBlock
,
CPerBlock
,
YPerBlock
,
XPerBlock
,
NPerThread
,
KPerThread
,
CPerThread
,
BlockSize
,
GridSize
>
<<<
grid_dim
,
block_dim
>>>
(
InDesc
{},
static_cast
<
T
*>
(
in_device_buf
.
GetDeviceBuffer
()),
WeiDesc
{},
static_cast
<
T
*>
(
wei_device_buf
.
GetDeviceBuffer
()),
OutDesc
{},
static_cast
<
T
*>
(
out_device_buf
.
GetDeviceBuffer
()));
cudaEventCreate
(
&
stop
);
cudaEventRecord
(
stop
,
0
);
cudaEventSynchronize
(
stop
);
cudaEventElapsedTime
(
&
elapsedTime
,
start
,
stop
);
printf
(
"Elapsed time : %f ms
\n
"
,
elapsedTime
);
checkCudaErrors
(
cudaGetLastError
());
out_device_buf
.
FromDevice
(
out
.
mData
.
data
());
}
\ No newline at end of file
src/include/blockwise_direct_convolution.cuh
View file @
24d2f034
...
@@ -17,10 +17,10 @@ __device__ void blockwise_convolution(InBlockDesc,
...
@@ -17,10 +17,10 @@ __device__ void blockwise_convolution(InBlockDesc,
OutBlockDesc
,
OutBlockDesc
,
TFloat
*
__restrict__
p_out_block
)
TFloat
*
__restrict__
p_out_block
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_block_desc
=
InBlockDesc
{};
constexpr
auto
in_block_desc
=
InBlockDesc
{};
constexpr
auto
wei_block_desc
=
WeiBlockDesc
{};
constexpr
auto
wei_block_desc
=
WeiBlockDesc
{};
...
@@ -88,72 +88,50 @@ __device__ void blockwise_convolution(InBlockDesc,
...
@@ -88,72 +88,50 @@ __device__ void blockwise_convolution(InBlockDesc,
TFloat
p_wei_thread
[
wei_thread_src_desc
.
GetElementSpace
()];
TFloat
p_wei_thread
[
wei_thread_src_desc
.
GetElementSpace
()];
TFloat
p_out_thread
[
out_thread_src_desc
.
GetElementSpace
()];
TFloat
p_out_thread
[
out_thread_src_desc
.
GetElementSpace
()];
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
// copy input tensor into register
// copy input tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
decltype
(
in_thread_src_desc
),
decltype
(
in_thread_dst_desc
),
decltype
(
f_copy
)
>
(
in_thread_src_desc
,
in_thread_src_desc
,
p_in_block
+
in_block_desc
.
Get1dIndex
(
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
n_thread_work_begin
,
0
,
hi_thread_work_begin
,
wi_thread_work_begin
),
in_thread_dst_desc
,
in_thread_dst_desc
,
p_in_thread
,
p_in_thread
);
f_copy
);
for
(
unsigned
k_thread_work_begin
=
0
;
k_thread_work_begin
<
KPerBlock
;
for
(
unsigned
k_thread_work_begin
=
0
;
k_thread_work_begin
<
KPerBlock
;
++
k_thread_work_begin
)
++
k_thread_work_begin
)
{
{
// copy weight tensor into register
// copy weight tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
wei_thread_src_desc
,
decltype
(
wei_thread_src_desc
),
p_wei_block
+
decltype
(
wei_thread_dst_desc
),
wei_block_desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
decltype
(
f_copy
)
>
(
wei_thread_dst_desc
,
wei_thread_src_desc
,
p_wei_thread
);
p_wei_block
+
wei_block_desc
.
Get1dIndex
(
k_thread_work_begin
,
0
,
0
,
0
),
wei_thread_dst_desc
,
p_wei_thread
,
f_copy
);
// copy output tensor into register
// copy output tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
out_thread_src_desc
,
decltype
(
out_thread_src_desc
),
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
decltype
(
out_thread_dst_desc
),
k_thread_work_begin
,
decltype
(
f_copy
)
>
(
ho_thread_work_begin
,
out_thread_src_desc
,
wo_thread_work_begin
),
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
out_thread_dst_desc
,
k_thread_work_begin
,
p_out_thread
);
ho_thread_work_begin
,
wo_thread_work_begin
),
out_thread_dst_desc
,
p_out_thread
,
f_copy
);
// threadwise convolution
// threadwise convolution
threadwise_direct_convolution
<
TFloat
,
threadwise_direct_convolution
(
in_thread_dst_desc
,
decltype
(
in_thread_dst_desc
),
p_in_thread
,
decltype
(
wei_thread_dst_desc
),
wei_thread_dst_desc
,
decltype
(
out_thread_dst_desc
)
>
(
in_thread_dst_desc
,
p_wei_thread
,
p_in_thread
,
out_thread_dst_desc
,
wei_thread_dst_desc
,
p_out_thread
);
p_wei_thread
,
out_thread_dst_desc
,
p_out_thread
);
// accumulate output tensor into LDS
// accumulate output tensor into LDS
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
out_thread_dst_desc
,
decltype
(
out_thread_dst_desc
),
p_out_thread
,
decltype
(
out_thread_src_desc
),
out_thread_src_desc
,
decltype
(
f_copy
)
>
(
p_out_block
+
out_thread_dst_desc
,
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
p_out_thread
,
k_thread_work_begin
,
out_thread_src_desc
,
ho_thread_work_begin
,
p_out_block
+
out_block_desc
.
Get1dIndex
(
n_thread_work_begin
,
wo_thread_work_begin
));
k_thread_work_begin
,
ho_thread_work_begin
,
wo_thread_work_begin
),
f_copy
);
}
}
}
}
}
}
src/include/blockwise_tensor_op.cuh
View file @
24d2f034
#pragma once
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
#define BLOCKWISE_TENSOR_OP_METHOD 12
template
<
class
TFloat
,
class
DstDesc
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_pointwise_op_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
#if BLOCKWISE_TENSOR_OP_METHOD == 11
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
src_desc
.
GetLengths
());
#if 0
if(threadIdx.x == 0)
{
print_ConstantTensorDescriptor(src_desc, "blockwise_4d_tensor_op_binary: src_desc: ");
print_ConstantTensorDescriptor(dst_desc, "blockwise_4d_tensor_op_binary: dst_desc: ");
}
#endif
for
(
unsigned
i
=
threadIdx
.
x
;
i
<
desc
.
GetElementSize
();
i
+=
BlockSize
)
{
unsigned
is
=
i
;
const
unsigned
did0
=
is
/
desc
.
GetStride
(
I0
);
is
-=
did0
*
desc
.
GetStride
(
I0
);
const
unsigned
did1
=
is
/
desc
.
GetStride
(
I1
);
is
-=
did1
*
desc
.
GetStride
(
I1
);
const
unsigned
did2
=
is
/
desc
.
GetStride
(
I2
);
is
-=
did2
*
desc
.
GetStride
(
I2
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 12
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
dst_desc
.
GetLengths
());
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
src_desc
.
GetLengths
());
#if 0
#if 0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
print_ConstantTensorDescriptor(
src
_desc, "blockwise_4d_tensor_op_
bi
nary:
src
_desc: ");
print_ConstantTensorDescriptor(
dst
_desc, "blockwise_4d_tensor_op_
u
nary:
dst
_desc: ");
print_ConstantTensorDescriptor(
dst_
desc, "blockwise_4d_tensor_op_
bi
nary:
dst_
desc: ");
print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_
u
nary: desc: ");
}
}
#endif
#endif
...
@@ -116,11 +41,9 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -116,11 +41,9 @@ __device__ void blockwise_4d_tensor_op_binary(
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_dst
[
dindex
]);
}
}
constexpr
bool
has_tail
=
(
desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
constexpr
bool
has_tail
=
(
desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
...
@@ -145,39 +68,34 @@ __device__ void blockwise_4d_tensor_op_binary(
...
@@ -145,39 +68,34 @@ __device__ void blockwise_4d_tensor_op_binary(
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_dst
[
dindex
]);
}
}
}
}
}
}
template
<
class
TFloat
,
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
,
unsigned
BlockSize
>
class
DstDesc
,
__device__
void
blockwise_4d_tensor_pointwise_op_binary
(
unsigned
NBlockOpLen0
,
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_unary
(
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
dst_desc
.
GetLengths
());
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
auto
desc
=
make_ConstantTensorDescriptor
(
src_desc
.
GetLengths
());
#if 0
#if 0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
print_ConstantTensorDescriptor(
dst
_desc, "blockwise_4d_tensor_op_
u
nary:
dst
_desc: ");
print_ConstantTensorDescriptor(
src
_desc, "blockwise_4d_tensor_op_
bi
nary:
src
_desc: ");
print_ConstantTensorDescriptor(desc, "blockwise_4d_tensor_op_
u
nary: desc: ");
print_ConstantTensorDescriptor(
dst_
desc, "blockwise_4d_tensor_op_
bi
nary:
dst_
desc: ");
}
}
#endif
#endif
...
@@ -201,9 +119,11 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
...
@@ -201,9 +119,11 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_dst
[
dindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
}
}
constexpr
bool
has_tail
=
(
desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
constexpr
bool
has_tail
=
(
desc
.
GetElementSize
()
>
NLoop
*
BlockSize
);
...
@@ -228,312 +148,44 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
...
@@ -228,312 +148,44 @@ __device__ void blockwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_dst
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
did3
=
is
/
desc
.
GetStride
(
I3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_dst
[
dindex
]);
}
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 21
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
NBlockOpStride3
=
1
;
constexpr
unsigned
NBlockOpStride2
=
NBlockOpLen3
*
NBlockOpStride3
;
constexpr
unsigned
NBlockOpStride1
=
NBlockOpLen2
*
NBlockOpStride2
;
constexpr
unsigned
NBlockOpStride0
=
NBlockOpLen1
*
NBlockOpStride1
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
NBlockOpStride0
;
itmp
-=
did0_begin
*
NBlockOpStride0
;
const
unsigned
did1_begin
=
itmp
/
NBlockOpStride1
;
itmp
-=
did1_begin
*
NBlockOpStride1
;
const
unsigned
did2_begin
=
itmp
/
NBlockOpStride2
;
itmp
-=
did2_begin
*
NBlockOpStride2
;
const
unsigned
did3_begin
=
itmp
/
NBlockOpStride3
;
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
NBlockOpLen0
)
{
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
NBlockOpLen1
)
{
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
NBlockOpLen2
)
{
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
NBlockOpLen3
)
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
}
}
}
}
}
}
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 22
template
<
class
TFloat
,
class
DstDesc
,
unsigned
BlockSize
>
template
<
class
TFloat
,
__device__
void
blockwise_4d_tensor_set_zero
(
DstDesc
,
TFloat
*
__restrict__
p_dst
)
class
SrcDesc
,
class
DstDesc
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
auto
f_set_zero
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
NBlockOpStride3
=
1
;
constexpr
unsigned
NBlockOpStride2
=
NBlockOpLen3
*
NBlockOpStride3
;
constexpr
unsigned
NBlockOpStride1
=
NBlockOpLen2
*
NBlockOpStride2
;
constexpr
unsigned
NBlockOpStride0
=
NBlockOpLen1
*
NBlockOpStride1
;
unsigned
itmp
=
threadIdx
.
x
;
blockwise_4d_tensor_pointwise_op_unary
<
TFloat
,
DstDesc
,
decltype
(
f_set_zero
),
BlockSize
>
(
DstDesc
{},
p_dst
,
f_set_zero
);
const
unsigned
did0_begin
=
itmp
/
NBlockOpStride0
;
itmp
-=
did0_begin
*
NBlockOpStride0
;
const
unsigned
did1_begin
=
itmp
/
NBlockOpStride1
;
itmp
-=
did1_begin
*
NBlockOpStride1
;
const
unsigned
did2_begin
=
itmp
/
NBlockOpStride2
;
itmp
-=
did2_begin
*
NBlockOpStride2
;
const
unsigned
did3_begin
=
itmp
/
NBlockOpStride3
;
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
NBlockOpLen0
)
{
const
unsigned
sindex_save0
=
sindex
;
const
unsigned
dindex_save0
=
dindex
;
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
NBlockOpLen1
)
{
const
unsigned
sindex_save1
=
sindex
;
const
unsigned
dindex_save1
=
dindex
;
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
NBlockOpLen2
)
{
const
unsigned
sindex_save2
=
sindex
;
const
unsigned
dindex_save2
=
dindex
;
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
NBlockOpLen3
)
{
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
sindex
+=
NBlockOpLen3
*
src_desc
.
GetStride
(
I3
);
dindex
+=
NBlockOpLen3
*
dst_desc
.
GetStride
(
I3
);
}
sindex
=
sindex_save2
+
NBlockOpLen2
*
src_desc
.
GetStride
(
I2
);
dindex
=
dindex_save2
+
NBlockOpLen2
*
dst_desc
.
GetStride
(
I2
);
}
sindex
=
sindex_save1
+
NBlockOpLen1
*
src_desc
.
GetStride
(
I1
);
dindex
=
dindex_save1
+
NBlockOpLen1
*
dst_desc
.
GetStride
(
I1
);
}
sindex
=
sindex_save0
+
NBlockOpLen0
*
src_desc
.
GetStride
(
I0
);
dindex
=
dindex_save0
+
NBlockOpLen0
*
dst_desc
.
GetStride
(
I0
);
}
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 23
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
BlockSize
>
template
<
class
TFloat
,
__device__
void
blockwise_4d_tensor_copy
(
SrcDesc
,
class
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
class
DstDesc
,
DstDesc
,
unsigned
NBlockOpLen0
,
TFloat
*
__restrict__
p_dst
)
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
blockwise_4d_tensor_pointwise_op_binary
<
TFloat
,
SrcDesc
,
DstDesc
,
decltype
(
f_copy
),
BlockSize
>
(
constexpr
auto
dst_desc
=
DstDesc
{};
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
f_copy
);
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
constexpr
unsigned
NBlockOpStride3
=
1
;
constexpr
unsigned
NBlockOpStride2
=
NBlockOpLen3
*
NBlockOpStride3
;
constexpr
unsigned
NBlockOpStride1
=
NBlockOpLen2
*
NBlockOpStride2
;
constexpr
unsigned
NBlockOpStride0
=
NBlockOpLen1
*
NBlockOpStride1
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
NBlockOpStride0
;
itmp
-=
did0_begin
*
NBlockOpStride0
;
const
unsigned
did1_begin
=
itmp
/
NBlockOpStride1
;
itmp
-=
did1_begin
*
NBlockOpStride1
;
const
unsigned
did2_begin
=
itmp
/
NBlockOpStride2
;
itmp
-=
did2_begin
*
NBlockOpStride2
;
const
unsigned
did3_begin
=
itmp
/
NBlockOpStride3
;
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0_begin
,
did1_begin
,
did2_begin
,
did3_begin
);
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
NBlockOpLen0
)
{
unsigned
i1
=
0
;
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
NBlockOpLen1
)
{
unsigned
i2
=
0
;
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
NBlockOpLen2
)
{
unsigned
i3
=
0
;
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
NBlockOpLen3
)
{
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
sindex
+=
NBlockOpLen3
*
src_desc
.
GetStride
(
I3
);
dindex
+=
NBlockOpLen3
*
dst_desc
.
GetStride
(
I3
);
++
i3
;
}
sindex
+=
NBlockOpLen2
*
src_desc
.
GetStride
(
I2
)
-
i3
*
NBlockOpLen3
*
src_desc
.
GetStride
(
I3
);
dindex
+=
NBlockOpLen2
*
dst_desc
.
GetStride
(
I2
)
-
i3
*
NBlockOpLen3
*
dst_desc
.
GetStride
(
I3
);
++
i2
;
}
sindex
+=
NBlockOpLen1
*
src_desc
.
GetStride
(
I1
)
-
i2
*
NBlockOpLen2
*
src_desc
.
GetStride
(
I2
);
dindex
+=
NBlockOpLen1
*
dst_desc
.
GetStride
(
I1
)
-
i2
*
NBlockOpLen2
*
dst_desc
.
GetStride
(
I2
);
++
i1
;
}
sindex
+=
NBlockOpLen0
*
src_desc
.
GetStride
(
I0
)
-
i1
*
NBlockOpLen1
*
src_desc
.
GetStride
(
I1
);
dindex
+=
NBlockOpLen0
*
dst_desc
.
GetStride
(
I0
)
-
i1
*
NBlockOpLen1
*
dst_desc
.
GetStride
(
I1
);
}
}
}
#endif
#if BLOCKWISE_TENSOR_OP_METHOD == 31
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
unsigned
BlockSize
>
template
<
class
TFloat
,
__device__
void
blockwise_4d_tensor_accumulate
(
SrcDesc
,
class
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
class
DstDesc
,
DstDesc
,
unsigned
NBlockOpLen0
,
TFloat
*
__restrict__
p_dst
)
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
class
F
,
unsigned
BlockSize
>
__device__
void
blockwise_4d_tensor_op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
auto
f_accum
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
+=
src
;
};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
blockwise_4d_tensor_pointwise_op_binary
<
TFloat
,
SrcDesc
,
DstDesc
,
decltype
(
f_accum
),
BlockSize
>
(
constexpr
auto
dst_desc
=
DstDesc
{};
SrcDesc
{},
p_src
,
DstDesc
{},
p_dst
,
f_accum
);
}
static_assert
(
is_same
<
decltype
(
src_desc
.
GetLengths
()),
decltype
(
dst_desc
.
GetLengths
())
>::
value
);
\ No newline at end of file
constexpr
unsigned
NBlockOpStride3
=
1
;
constexpr
unsigned
NBlockOpStride2
=
NBlockOpLen3
*
NBlockOpStride3
;
constexpr
unsigned
NBlockOpStride1
=
NBlockOpLen2
*
NBlockOpStride2
;
constexpr
unsigned
NBlockOpStride0
=
NBlockOpLen1
*
NBlockOpStride1
;
unsigned
itmp
=
threadIdx
.
x
;
const
unsigned
did0_begin
=
itmp
/
NBlockOpStride0
;
itmp
-=
did0_begin
*
NBlockOpStride0
;
const
unsigned
did1_begin
=
itmp
/
NBlockOpStride1
;
itmp
-=
did1_begin
*
NBlockOpStride1
;
const
unsigned
did2_begin
=
itmp
/
NBlockOpStride2
;
itmp
-=
did2_begin
*
NBlockOpStride2
;
const
unsigned
did3_begin
=
itmp
/
NBlockOpStride3
;
for
(
unsigned
did0
=
did0_begin
;
did0
<
src_desc
.
GetLength
(
I0
);
did0
+=
NBlockOpLen0
)
{
for
(
unsigned
did1
=
did1_begin
;
did1
<
src_desc
.
GetLength
(
I1
);
did1
+=
NBlockOpLen1
)
{
for
(
unsigned
did2
=
did2_begin
;
did2
<
src_desc
.
GetLength
(
I2
);
did2
+=
NBlockOpLen2
)
{
for
(
unsigned
did3
=
did3_begin
;
did3
<
src_desc
.
GetLength
(
I3
);
did3
+=
NBlockOpLen3
)
{
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
}
}
}
}
}
#endif
src/include/blockwise_winograd_transform.cuh
deleted
100644 → 0
View file @
8732ea04
#pragma once
#include "constant_tensor_descriptor.cuh"
template
<
class
TFloat
,
unsigned
InTileSizeH
,
unsigned
InTileSizeW
,
unsigned
S
,
unsigned
R
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
NPerBlock
,
unsigned
CPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
BlockSize
>
__device__
void
blockwise_winograd_transform_input
(
TFloat
*
const
__restrict__
p_in
,
TFloat
*
__restrict__
p_in_transform
)
{
p_in_transform
[
0
]
=
1
;
}
template
<
class
TFloat
,
unsigned
InTileSizeH
,
unsigned
InTileSizeW
,
unsigned
S
,
unsigned
R
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
KPerBlock
,
unsigned
CPerBlock
,
unsigned
BlockSize
>
__device__
void
blockwise_winograd_transform_weight
(
TFloat
*
const
__restrict__
p_wei
,
TFloat
*
__restrict__
p_wei_transform
)
{
p_wei_transform
[
0
]
=
1
;
}
\ No newline at end of file
src/include/constant_tensor_descriptor.cuh
View file @
24d2f034
...
@@ -7,8 +7,8 @@ struct Constant
...
@@ -7,8 +7,8 @@ struct Constant
const
T
mValue
=
N
;
const
T
mValue
=
N
;
};
};
template
<
unsigned
I
>
template
<
unsigned
N
>
using
Index
=
Constant
<
unsigned
,
I
>
;
using
Number
=
Constant
<
unsigned
,
N
>
;
template
<
unsigned
...
Is
>
template
<
unsigned
...
Is
>
struct
Sequence
struct
Sequence
...
@@ -18,7 +18,7 @@ struct Sequence
...
@@ -18,7 +18,7 @@ struct Sequence
const
unsigned
mData
[
nDim
]
=
{
Is
...};
const
unsigned
mData
[
nDim
]
=
{
Is
...};
template
<
unsigned
I
>
template
<
unsigned
I
>
__host__
__device__
constexpr
unsigned
Get
(
Index
<
I
>
)
const
__host__
__device__
constexpr
unsigned
Get
(
Number
<
I
>
)
const
{
{
return
mData
[
I
];
return
mData
[
I
];
}
}
...
@@ -28,7 +28,7 @@ template <class Lengths, class Strides>
...
@@ -28,7 +28,7 @@ template <class Lengths, class Strides>
struct
ConstantTensorDescriptor
struct
ConstantTensorDescriptor
{
{
static
constexpr
unsigned
nDim
=
Lengths
::
nDim
;
static
constexpr
unsigned
nDim
=
Lengths
::
nDim
;
using
NDimConstant
=
Index
<
nDim
>
;
using
NDimConstant
=
Number
<
nDim
>
;
__host__
__device__
constexpr
ConstantTensorDescriptor
()
__host__
__device__
constexpr
ConstantTensorDescriptor
()
{
{
...
@@ -42,15 +42,15 @@ struct ConstantTensorDescriptor
...
@@ -42,15 +42,15 @@ struct ConstantTensorDescriptor
__host__
__device__
constexpr
Strides
GetStrides
()
const
{
return
Strides
{};
}
__host__
__device__
constexpr
Strides
GetStrides
()
const
{
return
Strides
{};
}
template
<
unsigned
I
>
template
<
unsigned
I
>
__host__
__device__
constexpr
unsigned
GetLength
(
Index
<
I
>
)
const
__host__
__device__
constexpr
unsigned
GetLength
(
Number
<
I
>
)
const
{
{
return
Lengths
{}.
Get
(
Index
<
I
>
{});
return
Lengths
{}.
Get
(
Number
<
I
>
{});
}
}
template
<
unsigned
I
>
template
<
unsigned
I
>
__host__
__device__
constexpr
unsigned
GetStride
(
Index
<
I
>
)
const
__host__
__device__
constexpr
unsigned
GetStride
(
Number
<
I
>
)
const
{
{
return
Strides
{}.
Get
(
Index
<
I
>
{});
return
Strides
{}.
Get
(
Number
<
I
>
{});
}
}
// this is ugly, only for 4d
// this is ugly, only for 4d
...
@@ -58,10 +58,10 @@ struct ConstantTensorDescriptor
...
@@ -58,10 +58,10 @@ struct ConstantTensorDescriptor
{
{
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
return
GetLength
(
I0
)
*
GetLength
(
I1
)
*
GetLength
(
I2
)
*
GetLength
(
I3
);
return
GetLength
(
I0
)
*
GetLength
(
I1
)
*
GetLength
(
I2
)
*
GetLength
(
I3
);
}
}
...
@@ -71,10 +71,10 @@ struct ConstantTensorDescriptor
...
@@ -71,10 +71,10 @@ struct ConstantTensorDescriptor
{
{
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
return
(
GetLength
(
I0
)
-
1
)
*
GetStride
(
I0
)
+
(
GetLength
(
I1
)
-
1
)
*
GetStride
(
I1
)
+
return
(
GetLength
(
I0
)
-
1
)
*
GetStride
(
I0
)
+
(
GetLength
(
I1
)
-
1
)
*
GetStride
(
I1
)
+
(
GetLength
(
I2
)
-
1
)
*
GetStride
(
I2
)
+
(
GetLength
(
I3
)
-
1
)
*
GetStride
(
I3
)
+
1
;
(
GetLength
(
I2
)
-
1
)
*
GetStride
(
I2
)
+
(
GetLength
(
I3
)
-
1
)
*
GetStride
(
I3
)
+
1
;
...
@@ -83,10 +83,10 @@ struct ConstantTensorDescriptor
...
@@ -83,10 +83,10 @@ struct ConstantTensorDescriptor
// this is ugly, only for 4d
// this is ugly, only for 4d
__host__
__device__
unsigned
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
__host__
__device__
unsigned
Get1dIndex
(
unsigned
n
,
unsigned
c
,
unsigned
h
,
unsigned
w
)
const
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
static_assert
(
nDim
==
4
,
"nDim is not 4"
);
return
n
*
GetStride
(
I0
)
+
c
*
GetStride
(
I1
)
+
h
*
GetStride
(
I2
)
+
w
*
GetStride
(
I3
);
return
n
*
GetStride
(
I0
)
+
c
*
GetStride
(
I1
)
+
h
*
GetStride
(
I2
)
+
w
*
GetStride
(
I3
);
...
@@ -120,10 +120,10 @@ __host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDe
...
@@ -120,10 +120,10 @@ __host__ __device__ constexpr auto get_output_4d_tensor_descriptor(InDesc, WeiDe
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
static_assert
(
in_desc
.
GetDimension
()
==
4
,
"input nDim is not 4"
);
static_assert
(
in_desc
.
GetDimension
()
==
4
,
"input nDim is not 4"
);
static_assert
(
wei_desc
.
GetDimension
()
==
4
,
"weight nDim is not 4"
);
static_assert
(
wei_desc
.
GetDimension
()
==
4
,
"weight nDim is not 4"
);
...
@@ -150,10 +150,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
...
@@ -150,10 +150,10 @@ __host__ __device__ void print_ConstantTensorDescriptor(TDesc, const char* s)
{
{
constexpr
auto
desc
=
TDesc
{};
constexpr
auto
desc
=
TDesc
{};
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
static_assert
(
desc
.
GetDimension
()
==
4
,
"dim is not 4"
);
static_assert
(
desc
.
GetDimension
()
==
4
,
"dim is not 4"
);
...
...
src/include/gridwise_direct_convolution_1.cuh
View file @
24d2f034
...
@@ -27,10 +27,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -27,10 +27,10 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
OutGlobalDesc
,
OutGlobalDesc
,
TFloat
*
__restrict__
p_out_global
)
TFloat
*
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
...
@@ -120,62 +120,38 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -120,62 +120,38 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
}
}
#endif
#endif
auto
f_set0
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
auto
f_accu
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
+=
src
;
};
// set output tensor in LDS to 0
// set output tensor in LDS to 0
blockwise_4d_tensor_op_unary
<
TFloat
,
blockwise_4d_tensor_set_zero
<
TFloat
,
decltype
(
out_block_desc
),
BlockSize
>
(
out_block_desc
,
decltype
(
out_block_desc
),
p_out_block
);
NBlockOpLen0
,
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
decltype
(
f_set0
),
BlockSize
>
(
out_block_desc
,
p_out_block
,
f_set0
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_global_desc
.
GetLength
(
I1
);
for
(
unsigned
c_block_work_begin
=
0
;
c_block_work_begin
<
in_global_desc
.
GetLength
(
I1
);
c_block_work_begin
+=
CPerBlock
)
c_block_work_begin
+=
CPerBlock
,
__syncthreads
()
)
{
{
// copy input tensor to LDS
// copy input tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
in_block_src_desc
),
decltype
(
in_block_src_desc
),
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
NBlockOpLen0
,
BlockSize
>
(
in_block_src_desc
,
NBlockOpLen1
,
p_in_global
+
NBlockOpLen2
,
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
NBlockOpLen3
,
c_block_work_begin
,
decltype
(
f_copy
),
hi_block_work_begin
,
BlockSize
>
(
in_block_src_desc
,
wi_block_work_begin
),
p_in_global
+
in_block_desc
,
in_global_desc
.
Get1dIndex
(
n_block_work_begin
,
p_in_block
);
c_block_work_begin
,
hi_block_work_begin
,
wi_block_work_begin
),
in_block_desc
,
p_in_block
,
f_copy
);
// copy weight tensor to LDS
// copy weight tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
wei_block_src_desc
),
decltype
(
wei_block_src_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
NBlockOpLen0
,
BlockSize
>
(
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
decltype
(
f_copy
),
BlockSize
>
(
wei_block_src_desc
,
wei_block_src_desc
,
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_work_begin
,
c_block_work_begin
,
0
,
0
),
wei_block_desc
,
wei_block_desc
,
p_wei_block
,
p_wei_block
);
f_copy
);
#if 1
__syncthreads
();
__syncthreads
();
#endif
// blockwise convolution
// blockwise convolution
blockwise_convolution
<
TFloat
,
blockwise_convolution
<
TFloat
,
...
@@ -186,27 +162,17 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
...
@@ -186,27 +162,17 @@ __global__ void gridwise_direct_convolution_1(InGlobalDesc,
OutTileSizeW
,
OutTileSizeW
,
BlockSize
>
(
BlockSize
>
(
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
in_block_desc
,
p_in_block
,
wei_block_desc
,
p_wei_block
,
out_block_desc
,
p_out_block
);
#if 1
__syncthreads
();
#endif
}
}
// copy output tensor from LDS to device mem
// copy output tensor from LDS to device mem
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
out_block_desc
),
decltype
(
out_block_desc
),
decltype
(
out_block_src_desc
),
decltype
(
out_block_src_desc
),
NBlockOpLen0
,
BlockSize
>
(
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
decltype
(
f_copy
),
BlockSize
>
(
out_block_desc
,
out_block_desc
,
p_out_block
,
p_out_block
,
out_block_src_desc
,
out_block_src_desc
,
p_out_global
+
p_out_global
+
out_global_desc
.
Get1dIndex
(
out_global_desc
.
Get1dIndex
(
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
),
n_block_work_begin
,
k_block_work_begin
,
ho_block_work_begin
,
wo_block_work_begin
));
f_copy
);
}
}
\ No newline at end of file
src/include/gridwise_direct_convolution_2.cuh
View file @
24d2f034
...
@@ -19,10 +19,6 @@ template <class TFloat,
...
@@ -19,10 +19,6 @@ template <class TFloat,
unsigned
NPerThread
,
unsigned
NPerThread
,
unsigned
KPerThread
,
unsigned
KPerThread
,
unsigned
CPerThread
,
unsigned
CPerThread
,
unsigned
NBlockOpLen0
,
unsigned
NBlockOpLen1
,
unsigned
NBlockOpLen2
,
unsigned
NBlockOpLen3
,
unsigned
BlockSize
,
unsigned
BlockSize
,
unsigned
GridSize
>
unsigned
GridSize
>
__global__
void
gridwise_direct_convolution_2
(
InGlobalDesc
,
__global__
void
gridwise_direct_convolution_2
(
InGlobalDesc
,
...
@@ -32,10 +28,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -32,10 +28,10 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
OutGlobalDesc
,
OutGlobalDesc
,
TFloat
*
__restrict__
p_out_global
)
TFloat
*
__restrict__
p_out_global
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
...
@@ -147,10 +143,6 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -147,10 +143,6 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
const
unsigned
hi_thread_data_offset
=
ho_thread_data_offset
;
const
unsigned
hi_thread_data_offset
=
ho_thread_data_offset
;
const
unsigned
wi_thread_data_offset
=
wo_thread_data_offset
;
const
unsigned
wi_thread_data_offset
=
wo_thread_data_offset
;
// op
auto
f_set0
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
#if 0
#if 0
if(threadIdx.x == 0)
if(threadIdx.x == 0)
{
{
...
@@ -170,76 +162,54 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -170,76 +162,54 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
#endif
#endif
// set threadwise output tensor to 0
// set threadwise output tensor to 0
threadwise_4d_tensor_op_unary
<
TFloat
,
decltype
(
out_thread_desc
),
decltype
(
f_set0
)
>
(
threadwise_4d_tensor_set_zero
(
out_thread_desc
,
p_out_thread
);
out_thread_desc
,
p_out_thread
,
f_set0
);
for
(
unsigned
c_block_data_offset
=
0
;
c_block_data_offset
<
in_global_desc
.
GetLength
(
I1
);
for
(
unsigned
c_block_data_offset
=
0
;
c_block_data_offset
<
in_global_desc
.
GetLength
(
I1
);
c_block_data_offset
+=
CPerBlock
,
__syncthreads
())
c_block_data_offset
+=
CPerBlock
,
__syncthreads
())
{
{
// copy input tensor to LDS
// copy input tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
in_block_global_desc
),
decltype
(
in_block_global_desc
),
decltype
(
in_block_desc
),
decltype
(
in_block_desc
),
NBlockOpLen0
,
BlockSize
>
(
in_block_global_desc
,
NBlockOpLen1
,
p_in_global
+
NBlockOpLen2
,
in_global_desc
.
Get1dIndex
(
n_block_data_offset
,
NBlockOpLen3
,
c_block_data_offset
,
decltype
(
f_copy
),
hi_block_data_offset
,
BlockSize
>
(
wi_block_data_offset
),
in_block_global_desc
,
in_block_desc
,
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_offset
,
p_in_block
);
c_block_data_offset
,
hi_block_data_offset
,
wi_block_data_offset
),
in_block_desc
,
p_in_block
,
f_copy
);
// copy weight tensor to LDS
// copy weight tensor to LDS
blockwise_4d_tensor_op_binary
<
TFloat
,
blockwise_4d_tensor_copy
<
TFloat
,
decltype
(
wei_block_global_desc
),
decltype
(
wei_block_global_desc
),
decltype
(
wei_block_desc
),
decltype
(
wei_block_desc
),
NBlockOpLen0
,
BlockSize
>
(
NBlockOpLen1
,
NBlockOpLen2
,
NBlockOpLen3
,
decltype
(
f_copy
),
BlockSize
>
(
wei_block_global_desc
,
wei_block_global_desc
,
p_wei_global
+
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_offset
,
c_block_data_offset
,
0
,
0
),
wei_global_desc
.
Get1dIndex
(
k_block_data_offset
,
c_block_data_offset
,
0
,
0
),
wei_block_desc
,
wei_block_desc
,
p_wei_block
,
p_wei_block
);
f_copy
);
__syncthreads
();
__syncthreads
();
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
{
{
// copy input tensor into register
// copy input tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
in_thread_block_desc
,
decltype
(
in_thread_block_desc
),
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_offset
,
decltype
(
in_thread_desc
),
c_thread_data
,
decltype
(
f_copy
)
>
(
hi_thread_data_offset
,
in_thread_block_desc
,
wi_thread_data_offset
),
p_in_block
+
in_block_desc
.
Get1dIndex
(
n_thread_data_offset
,
in_thread_desc
,
c_thread_data
,
p_in_thread
);
hi_thread_data_offset
,
wi_thread_data_offset
),
in_thread_desc
,
p_in_thread
,
f_copy
);
// copy weight tensor into register
// copy weight tensor into register
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
decltype
(
wei_thread_block_desc
),
decltype
(
wei_thread_desc
),
decltype
(
f_copy
)
>
(
wei_thread_block_desc
,
wei_thread_block_desc
,
p_wei_block
+
wei_block_desc
.
Get1dIndex
(
k_thread_data_offset
,
c_thread_data
,
0
,
0
),
p_wei_block
+
wei_block_desc
.
Get1dIndex
(
k_thread_data_offset
,
c_thread_data
,
0
,
0
),
wei_thread_desc
,
wei_thread_desc
,
p_wei_thread
,
p_wei_thread
);
f_copy
);
// threadwise convolution
// threadwise convolution
threadwise_direct_convolution
<
TFloat
,
threadwise_direct_convolution
<
TFloat
,
...
@@ -255,16 +225,12 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
...
@@ -255,16 +225,12 @@ __global__ void gridwise_direct_convolution_2(InGlobalDesc,
}
}
// copy output tensor from register to global mem
// copy output tensor from register to global mem
threadwise_4d_tensor_op_binary
<
TFloat
,
threadwise_4d_tensor_copy
(
decltype
(
out_thread_desc
),
decltype
(
out_thread_global_desc
),
decltype
(
f_copy
)
>
(
out_thread_desc
,
out_thread_desc
,
p_out_thread
,
p_out_thread
,
out_thread_global_desc
,
out_thread_global_desc
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_offset
+
n_thread_data_offset
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_offset
+
n_thread_data_offset
,
k_block_data_offset
+
k_thread_data_offset
,
k_block_data_offset
+
k_thread_data_offset
,
ho_block_data_offset
+
ho_thread_data_offset
,
ho_block_data_offset
+
ho_thread_data_offset
,
wo_block_data_offset
+
wo_thread_data_offset
),
wo_block_data_offset
+
wo_thread_data_offset
));
f_copy
);
}
}
src/include/gridwise_winograd_convolution.cuh
deleted
100644 → 0
View file @
8732ea04
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "blockwise_winograd_transform.cuh"
#include "threadwise_winograd_transform.cuh"
template
<
class
TFloat
,
class
InGlobalDesc
,
class
WeiGlobalDesc
,
class
OutGlobalDesc
,
unsigned
OutTileSizeH
,
unsigned
OutTileSizeW
,
unsigned
NPerBlock
,
unsigned
KPerBlock
,
unsigned
CPerBlock
,
unsigned
YPerBlock
,
unsigned
XPerBlock
,
unsigned
NPerThread
,
unsigned
KPerThread
,
unsigned
CPerThread
,
unsigned
BlockSize
,
unsigned
GridSize
>
__global__
void
gridwise_winograd_convolution
(
InGlobalDesc
,
TFloat
*
const
__restrict__
p_in_global
,
WeiGlobalDesc
,
TFloat
*
const
__restrict__
p_wei_global
,
OutGlobalDesc
,
TFloat
*
__restrict__
p_out_global
)
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
in_global_desc
=
InGlobalDesc
{};
constexpr
auto
wei_global_desc
=
WeiGlobalDesc
{};
constexpr
auto
out_global_desc
=
OutGlobalDesc
{};
constexpr
unsigned
S
=
wei_global_desc
.
GetLength
(
I2
);
constexpr
unsigned
R
=
wei_global_desc
.
GetLength
(
I3
);
constexpr
unsigned
HoPerBlock
=
OutTileSizeH
*
YPerBlock
;
constexpr
unsigned
WoPerBlock
=
OutTileSizeW
*
XPerBlock
;
constexpr
unsigned
HiPerBlock
=
YPerBlock
*
OutTileSizeH
+
S
-
1
;
constexpr
unsigned
WiPerBlock
=
XPerBlock
*
OutTileSizeW
+
R
-
1
;
constexpr
unsigned
InTileSizeH
=
OutTileSizeH
+
S
-
1
;
constexpr
unsigned
InTileSizeW
=
OutTileSizeW
+
R
-
1
;
// divide block work
constexpr
unsigned
NBlockWork
=
(
out_global_desc
.
GetLength
(
I0
)
+
NPerBlock
-
1
)
/
NPerBlock
;
constexpr
unsigned
KBlockWork
=
(
out_global_desc
.
GetLength
(
I1
)
+
KPerBlock
-
1
)
/
KPerBlock
;
constexpr
unsigned
YBlockWork
=
(
out_global_desc
.
GetLength
(
I2
)
+
HoPerBlock
-
1
)
/
HoPerBlock
;
constexpr
unsigned
XBlockWork
=
(
out_global_desc
.
GetLength
(
I3
)
+
WoPerBlock
-
1
)
/
WoPerBlock
;
const
unsigned
block_id
=
blockIdx
.
x
;
unsigned
itmp
=
block_id
;
const
unsigned
n_block_work_id
=
itmp
/
(
KBlockWork
*
YBlockWork
*
XBlockWork
);
itmp
-=
n_block_work_id
*
(
KBlockWork
*
YBlockWork
*
XBlockWork
);
const
unsigned
k_block_work_id
=
itmp
/
(
YBlockWork
*
XBlockWork
);
itmp
-=
k_block_work_id
*
(
YBlockWork
*
XBlockWork
);
const
unsigned
y_block_work_id
=
itmp
/
XBlockWork
;
const
unsigned
x_block_work_id
=
itmp
-
y_block_work_id
*
XBlockWork
;
const
unsigned
n_block_data_offset
=
n_block_work_id
*
NPerBlock
;
const
unsigned
k_block_data_offset
=
k_block_work_id
*
KPerBlock
;
const
unsigned
y_block_data_offset
=
y_block_work_id
*
YPerBlock
;
const
unsigned
x_block_data_offset
=
x_block_work_id
*
XPerBlock
;
const
unsigned
ho_block_data_offset
=
y_block_data_offset
*
OutTileSizeH
;
const
unsigned
wo_block_data_offset
=
x_block_data_offset
*
OutTileSizeW
;
const
unsigned
hi_block_data_offset
=
ho_block_data_offset
;
// minus padding
const
unsigned
wi_block_data_offset
=
wo_block_data_offset
;
// minus padding
// divide thread work
constexpr
unsigned
NThreadWork
=
(
NPerBlock
+
NPerThread
-
1
)
/
NPerThread
;
constexpr
unsigned
KThreadWork
=
(
KPerBlock
+
KPerThread
-
1
)
/
KPerThread
;
constexpr
unsigned
YThreadWork
=
YPerBlock
;
constexpr
unsigned
XThreadWork
=
XPerBlock
;
const
unsigned
thread_id
=
threadIdx
.
x
;
itmp
=
thread_id
;
const
unsigned
n_thread_work_id
=
itmp
/
(
KThreadWork
*
YThreadWork
*
XThreadWork
);
itmp
-=
n_thread_work_id
*
(
KThreadWork
*
YThreadWork
*
XThreadWork
);
const
unsigned
k_thread_work_id
=
itmp
/
(
YThreadWork
*
XThreadWork
);
itmp
-=
k_thread_work_id
*
(
YThreadWork
*
XThreadWork
);
const
unsigned
y_thread_work_id
=
itmp
/
XThreadWork
;
const
unsigned
x_thread_work_id
=
itmp
-
y_thread_work_id
*
XThreadWork
;
const
unsigned
n_thread_data_offset
=
n_thread_work_id
*
NPerThread
;
const
unsigned
k_thread_data_offset
=
k_thread_work_id
*
KPerThread
;
const
unsigned
y_thread_data_offset
=
y_thread_work_id
;
const
unsigned
x_thread_data_offset
=
x_thread_work_id
;
// op
auto
f_set0
=
[](
TFloat
&
v
)
{
v
=
TFloat
(
0
);
};
auto
f_copy
=
[](
const
TFloat
&
src
,
TFloat
&
dst
)
{
dst
=
src
;
};
// block data
constexpr
auto
in_transform_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerBlock
,
CPerBlock
,
YPerBlock
*
InTileSizeH
,
XPerBlock
*
InTileSizeW
>
{});
constexpr
auto
wei_transform_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerBlock
,
CPerBlock
,
InTileSizeH
,
InTileSizeW
>
{});
constexpr
unsigned
in_transform_block_size
=
in_transform_block_desc
.
GetElementSpace
();
constexpr
unsigned
wei_transform_block_size
=
wei_transform_block_desc
.
GetElementSpace
();
__shared__
TFloat
p_in_transform_block
[
in_transform_block_size
];
__shared__
TFloat
p_wei_transform_block
[
wei_transform_block_size
];
// thread data
constexpr
auto
in_transform_thread_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
CPerThread
,
InTileSizeH
,
InTileSizeW
>
{},
in_transform_block_desc
.
GetStrides
());
constexpr
auto
wei_transform_thread_block_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
KPerThread
,
CPerThread
,
InTileSizeH
,
InTileSizeW
>
{},
wei_transform_block_desc
.
GetStrides
());
constexpr
auto
out_transform_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
KPerThread
,
InTileSizeH
,
InTileSizeW
>
{});
constexpr
auto
out_thread_desc
=
make_ConstantTensorDescriptor
(
Sequence
<
NPerThread
,
KPerThread
,
OutTileSizeH
,
OutTileSizeW
>
{});
constexpr
auto
out_thread_global_desc
=
make_ConstantTensorDescriptor
(
out_thread_desc
.
GetLengths
(),
out_global_desc
.
GetStrides
());
constexpr
unsigned
out_transform_thread_size
=
out_transform_thread_desc
.
GetElementSpace
();
constexpr
unsigned
out_thread_size
=
out_thread_desc
.
GetElementSpace
();
TFloat
p_out_transform_thread
[
out_transform_thread_size
];
TFloat
p_out_thread
[
out_thread_size
];
#if 0
if(blockIdx.x == 0 && threadIdx.x == 0)
{
printf("in_transform_block_size %u, wei_transform_block_size %u, out_transform_thread_size "
"%u, out_thread_size %u \n",
in_transform_block_size,
wei_transform_block_size,
out_transform_thread_size,
out_thread_size);
}
#endif
// set threadwise output transform tensor to 0
threadwise_4d_tensor_op_unary
<
TFloat
,
decltype
(
out_transform_thread_desc
),
decltype
(
f_set0
)
>
(
out_transform_thread_desc
,
p_out_transform_thread
,
f_set0
);
for
(
unsigned
c_block_data_offset
=
0
;
c_block_data_offset
<
in_global_desc
.
GetLength
(
I1
);
c_block_data_offset
+=
CPerBlock
,
__syncthreads
())
{
// blockwise transform input
blockwise_winograd_transform_input
<
TFloat
,
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
,
NPerBlock
,
CPerBlock
,
YPerBlock
,
XPerBlock
,
BlockSize
>
(
p_in_global
+
in_global_desc
.
Get1dIndex
(
n_block_data_offset
,
c_block_data_offset
,
hi_block_data_offset
,
wi_block_data_offset
),
p_in_transform_block
);
// blockwise transform weights
blockwise_winograd_transform_weight
<
TFloat
,
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
,
KPerBlock
,
CPerBlock
,
BlockSize
>
(
p_wei_global
+
wei_global_desc
.
Get1dIndex
(
k_block_data_offset
,
c_block_data_offset
,
0
,
0
),
p_wei_transform_block
);
for
(
unsigned
c_thread_data
=
0
;
c_thread_data
<
CPerBlock
;
c_thread_data
+=
CPerThread
)
{
// threadwise point multiplication
threadwise_winograd_calculate_transformed_output
<
TFloat
,
decltype
(
in_transform_thread_block_desc
),
decltype
(
wei_transform_thread_block_desc
),
decltype
(
out_transform_thread_desc
),
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
>
(
in_transform_thread_block_desc
,
p_in_transform_block
+
in_transform_block_desc
.
Get1dIndex
(
n_thread_data_offset
,
c_thread_data
,
y_thread_data_offset
*
InTileSizeH
,
x_thread_data_offset
*
InTileSizeW
),
wei_transform_thread_block_desc
,
p_wei_transform_block
+
wei_transform_block_desc
.
Get1dIndex
(
k_thread_data_offset
,
c_thread_data
,
0
,
0
),
out_transform_thread_desc
,
p_out_transform_thread
);
}
};
// transform back
threadwise_winograd_reverse_transform_output
<
TFloat
,
decltype
(
out_transform_thread_desc
),
decltype
(
out_thread_desc
),
InTileSizeH
,
InTileSizeW
,
S
,
R
,
OutTileSizeH
,
OutTileSizeW
>
(
out_transform_thread_desc
,
p_out_transform_thread
,
out_thread_desc
,
p_out_thread
);
// copy output tensor from register to global mem
threadwise_4d_tensor_op_binary
<
TFloat
,
decltype
(
out_thread_desc
),
decltype
(
out_thread_global_desc
),
decltype
(
f_copy
)
>
(
out_thread_desc
,
p_out_thread
,
out_thread_global_desc
,
p_out_global
+
out_global_desc
.
Get1dIndex
(
n_block_data_offset
+
n_thread_data_offset
,
k_block_data_offset
+
k_thread_data_offset
,
ho_block_data_offset
+
y_thread_data_offset
*
OutTileSizeH
,
wo_block_data_offset
+
x_thread_data_offset
*
OutTileSizeW
),
f_copy
);
}
\ No newline at end of file
src/include/threadwise_direct_convolution.cuh
View file @
24d2f034
...
@@ -9,10 +9,10 @@ __device__ void threadwise_direct_convolution(InDesc,
...
@@ -9,10 +9,10 @@ __device__ void threadwise_direct_convolution(InDesc,
OutDesc
,
OutDesc
,
TFloat
*
__restrict__
p_out
)
TFloat
*
__restrict__
p_out
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
in_desc
=
InDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
constexpr
auto
wei_desc
=
WeiDesc
{};
...
...
src/include/threadwise_tensor_op.cuh
View file @
24d2f034
#pragma once
#pragma once
#include "constant_tensor_descriptor.cuh"
#include "constant_tensor_descriptor.cuh"
#define THREADWISE_TENSOR_OP_METHOD 0
template <class TFloat, class Desc, class F>
__device__ void threadwise_4d_tensor_pointwise_op_unary(Desc, TFloat* __restrict__ p_dst, F f)
{
    // Visit every element of the 4-d tensor described by Desc and apply the
    // element-wise functor f to it, one dimension per nested loop.
    constexpr auto I0 = Number<0>{};
    constexpr auto I1 = Number<1>{};
    constexpr auto I2 = Number<2>{};
    constexpr auto I3 = Number<3>{};

    constexpr auto desc = Desc{};

    for(unsigned i0 = 0; i0 < desc.GetLength(I0); ++i0)
    {
        for(unsigned i1 = 0; i1 < desc.GetLength(I1); ++i1)
        {
            for(unsigned i2 = 0; i2 < desc.GetLength(I2); ++i2)
            {
                for(unsigned i3 = 0; i3 < desc.GetLength(I3); ++i3)
                {
                    // Map the 4-d coordinate to the element's linear offset,
                    // then let the functor mutate the element in place.
                    const unsigned offset = desc.Get1dIndex(i0, i1, i2, i3);
                    f(p_dst[offset]);
                }
            }
        }
    }
}
...
@@ -52,13 +36,13 @@ __device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_ds
...
@@ -52,13 +36,13 @@ __device__ void threadwise_4d_tensor_op_unary(DstDesc, TFloat* __restrict__ p_ds
}
}
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
>
template
<
class
TFloat
,
class
SrcDesc
,
class
DstDesc
,
class
F
>
__device__
void
threadwise_4d_tensor_op_binary
(
__device__
void
threadwise_4d_tensor_
pointwise_
op_binary
(
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
SrcDesc
,
TFloat
*
const
__restrict__
p_src
,
DstDesc
,
TFloat
*
__restrict__
p_dst
,
F
f
)
{
{
constexpr
auto
I0
=
Index
<
0
>
{};
constexpr
auto
I0
=
Number
<
0
>
{};
constexpr
auto
I1
=
Index
<
1
>
{};
constexpr
auto
I1
=
Number
<
1
>
{};
constexpr
auto
I2
=
Index
<
2
>
{};
constexpr
auto
I2
=
Number
<
2
>
{};
constexpr
auto
I3
=
Index
<
3
>
{};
constexpr
auto
I3
=
Number
<
3
>
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
src_desc
=
SrcDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
constexpr
auto
dst_desc
=
DstDesc
{};
...
@@ -81,99 +65,34 @@ __device__ void threadwise_4d_tensor_op_binary(
...
@@ -81,99 +65,34 @@ __device__ void threadwise_4d_tensor_op_binary(
{
{
for
(
unsigned
did3
=
0
;
did3
<
src_desc
.
GetLength
(
I3
);
++
did3
)
for
(
unsigned
did3
=
0
;
did3
<
src_desc
.
GetLength
(
I3
);
++
did3
)
{
{
const
unsigned
sindex
=
const
unsigned
sindex
=
src_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
src_desc
.
GetStride
(
I0
)
*
did0
+
src_desc
.
GetStride
(
I1
)
*
did1
+
src_desc
.
GetStride
(
I2
)
*
did2
+
src_desc
.
GetStride
(
I3
)
*
did3
;
const
unsigned
dindex
=
const
unsigned
dindex
=
dst_desc
.
Get1dIndex
(
did0
,
did1
,
did2
,
did3
);
dst_desc
.
GetStride
(
I0
)
*
did0
+
dst_desc
.
GetStride
(
I1
)
*
did1
+
dst_desc
.
GetStride
(
I2
)
*
did2
+
dst_desc
.
GetStride
(
I3
)
*
did3
;
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if 0
if(threadIdx.x == 0)
{
printf("threadwise_4d_tensor_op_binary: thread id %u, \t"
"sindex %u, p_src[sindex] %f, \t"
"dindex %u, p_dst[dindex] %f\n",
threadIdx.x,
sindex,
p_src[sindex],
dindex,
p_dst[dindex]);
}
#endif
}
}
}
}
}
}
}
}
}
}
#endif
#if THREADWISE_TENSOR_OP_METHOD == 1
template <class TFloat, class Desc>
__device__ void threadwise_4d_tensor_set_zero(Desc, TFloat* __restrict__ p_dst)
{
    // Zero-fill the 4-d tensor described by Desc by running the generic
    // element-wise visitor with a "write zero" functor.
    auto zero_out = [](TFloat& v) { v = TFloat(0); };

    threadwise_4d_tensor_pointwise_op_unary<TFloat, Desc, decltype(zero_out)>(
        Desc{}, p_dst, zero_out);
}
for
(
unsigned
did0
=
0
;
did0
<
src_desc
.
GetLength
(
I0
);
++
did0
)
{
for
(
unsigned
did1
=
0
;
did1
<
src_desc
.
GetLength
(
I1
);
++
did1
)
{
for
(
unsigned
did2
=
0
;
did2
<
src_desc
.
GetLength
(
I2
);
++
did2
)
{
for
(
unsigned
did3
=
0
;
did3
<
src_desc
.
GetLength
(
I3
);
++
did3
)
{
f
(
p_src
[
sindex
],
p_dst
[
dindex
]);
#if 0
if(threadIdx.x == 0)
{
printf("threadwise_4d_tensor_op: 1: thread id %u, \t"
"sindex %u, p_src[sindex] %f, \t"
"dindex %u, p_dst[dindex] %f\n",
threadIdx.x,
sindex,
p_src[sindex],
dindex,
p_dst[dindex]);
}
#endif
sindex
+=
src_desc
.
GetStride
(
I3
);
dindex
+=
dst_desc
.
GetStride
(
I3
);
}
sindex
+=
src_desc
.
GetStride
(
I2
)
-
src_desc
.
GetLength
(
I3
)
*
src_desc
.
GetStride
(
I3
);
dindex
+=
dst_desc
.
GetStride
(
I2
)
-
dst_desc
.
GetLength
(
I3
)
*
dst_desc
.
GetStride
(
I3
);
}
sindex
+=
src_desc
.
GetStride
(
I1
)
-
src_desc
.
GetLength
(
I2
)
*
src_desc
.
GetStride
(
I2
);
template <class TFloat, class SrcDesc, class DstDesc>
__device__ void threadwise_4d_tensor_copy(SrcDesc,
                                          TFloat* const __restrict__ p_src,
                                          DstDesc,
                                          TFloat* __restrict__ p_dst)
{
    // Element-wise copy from a source 4-d tensor to a destination 4-d tensor
    // (the two descriptors may differ in strides but must cover the same
    // logical extent). Delegates to the binary point-wise visitor with a
    // plain assignment functor.
    auto assign = [](const TFloat& s, TFloat& d) { d = s; };

    threadwise_4d_tensor_pointwise_op_binary<TFloat, SrcDesc, DstDesc, decltype(assign)>(
        SrcDesc{}, p_src, DstDesc{}, p_dst, assign);
}
}
\ No newline at end of file
#endif
src/include/threadwise_winograd_transform.cuh
deleted
100644 → 0
View file @
8732ea04
#pragma once
#include "constant_tensor_descriptor.cuh"
template <class TFloat,
          class InTransThreadDesc,  //{NPerThread, CPerThread, InTileSizeH, InTileSizeW}
          class WeiTransThreadDesc, //{KPerThread, CPerThread, InTileSizeH, InTileSizeW}
          class OutTransThreadDesc, //{NPerThread, KPerThread, InTileSizeH, InTileSizeW}
          unsigned InTileSizeH,
          unsigned InTileSizeW,
          unsigned S,
          unsigned R,
          unsigned OutTileSizeH,
          unsigned OutTileSizeW>
__device__ void threadwise_winograd_calculate_transformed_output(
    InTransThreadDesc,
    TFloat* const __restrict__ p_in_transform_thread,
    WeiTransThreadDesc,
    TFloat* const __restrict__ p_wei_transform_thread,
    OutTransThreadDesc,
    TFloat* __restrict__ p_out_transform_thread)
{
    // Element-wise (Hadamard-style) product-and-accumulate stage of the
    // Winograd convolution: for every output coordinate (n, k, h, w),
    // accumulate over input channels c:
    //   out[n, k, h, w] += sum_c wei[k, c, h, w] * in[n, c, h, w]
    // The output buffer is accumulated into, not overwritten — callers are
    // expected to have initialized it (e.g. via a set-zero pass).
    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto in_transform_thread_desc  = InTransThreadDesc{};
    constexpr auto wei_transform_thread_desc = WeiTransThreadDesc{};
    constexpr auto out_transform_thread_desc = OutTransThreadDesc{};

    for(unsigned n = 0; n < out_transform_thread_desc.GetLength(I0); ++n)
    {
        for(unsigned k = 0; k < out_transform_thread_desc.GetLength(I1); ++k)
        {
            for(unsigned h = 0; h < out_transform_thread_desc.GetLength(I2); ++h)
            {
                for(unsigned w = 0; w < out_transform_thread_desc.GetLength(I3); ++w)
                {
                    // The output index does not depend on c: hoist it out of
                    // the reduction loop and accumulate in a register instead
                    // of re-reading/re-writing memory every iteration. Safe
                    // because the pointers are declared __restrict__.
                    const unsigned out_index =
                        out_transform_thread_desc.Get1dIndex(n, k, h, w);

                    TFloat acc = p_out_transform_thread[out_index];

                    for(unsigned c = 0; c < wei_transform_thread_desc.GetLength(I1); ++c)
                    {
                        const unsigned in_index =
                            in_transform_thread_desc.Get1dIndex(n, c, h, w);

                        const unsigned wei_index =
                            wei_transform_thread_desc.Get1dIndex(k, c, h, w);

                        acc += p_wei_transform_thread[wei_index] *
                               p_in_transform_thread[in_index];
                    }

                    p_out_transform_thread[out_index] = acc;
                }
            }
        }
    }
}
// Inverse (output) transform of Winograd F(2x2, 3x3): maps each 4x4
// transformed-output tile M back to a 2x2 spatial output tile Y.
// The coefficient pattern below is exactly Y = A^T * M * A with
//   A^T = [[1,  1,  1,  0],
//          [0,  1, -1, -1]]
// unrolled by hand for the four output positions.
template <class TFloat,
          class OutTransThreadDesc, //{NPerThread, KPerThread, InTileSizeH, InTileSizeW}
          class OutThreadDesc,      //{NPerThread, CPerThread, OutTileSizeH, OutTileSizeW}
          unsigned InTileSizeH,
          unsigned InTileSizeW,
          unsigned S,
          unsigned R,
          unsigned OutTileSizeH,
          unsigned OutTileSizeW>
__device__ void threadwise_winograd_reverse_transform_output(
    OutTransThreadDesc,
    TFloat* const __restrict__ p_out_transform_thread,
    OutThreadDesc,
    TFloat* __restrict__ p_out_thread)
{
    // Only the F(2x2, 3x3) configuration (4x4 input tile, 3x3 filter,
    // 2x2 output tile) is supported by this hand-unrolled version.
    static_assert(InTileSizeH == 4, "wrong");
    static_assert(InTileSizeW == 4, "wrong");
    static_assert(S == 3, "wrong");
    static_assert(R == 3, "wrong");
    static_assert(OutTileSizeH == 2, "wrong");
    static_assert(OutTileSizeW == 2, "wrong");

    constexpr auto I0 = Index<0>{};
    constexpr auto I1 = Index<1>{};
    constexpr auto I2 = Index<2>{};
    constexpr auto I3 = Index<3>{};

    constexpr auto out_transform_thread_desc = OutTransThreadDesc{};
    constexpr auto out_thread_desc           = OutThreadDesc{};

    // Descriptor extents must agree with the compile-time tile sizes.
    static_assert(InTileSizeH == out_transform_thread_desc.GetLength(I2), "wrong");
    static_assert(InTileSizeW == out_transform_thread_desc.GetLength(I3), "wrong");
    static_assert(OutTileSizeH == out_thread_desc.GetLength(I2), "wrong");
    static_assert(OutTileSizeW == out_thread_desc.GetLength(I3), "wrong");

    // Loop over batch (n) and output channel (k); the 2x2 spatial tile is
    // written explicitly below. In the expressions, m(i,j) denotes
    // p_out_transform_thread[Get1dIndex(n, k, i, j)].
    for(unsigned n = 0; n < out_thread_desc.GetLength(I0); ++n)
    {
        for(unsigned k = 0; k < out_thread_desc.GetLength(I1); ++k)
        {
            // y(0,0) = sum of m(0..2, 0..2)  — row [1,1,1,0] on both sides.
            p_out_thread[out_thread_desc.Get1dIndex(n, k, 0, 0)] =
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 0)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 2)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 0)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 0)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)];

            // y(0,1) = sum over rows 0..2 of (m(r,1) - m(r,2) - m(r,3))
            //          — column side uses [0,1,-1,-1].
            p_out_thread[out_thread_desc.Get1dIndex(n, k, 0, 1)] =
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 0, 3)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 3)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 3)];

            // y(1,0) = sum over cols 0..2 of (m(1,c) - m(2,c) - m(3,c))
            //          — row side uses [0,1,-1,-1].
            p_out_thread[out_thread_desc.Get1dIndex(n, k, 1, 0)] =
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 0)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 0)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 0)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 2)];

            // y(1,1) = (m(1,1)-m(1,2)-m(1,3)) - (m(2,1)-m(2,2)-m(2,3))
            //          - (m(3,1)-m(3,2)-m(3,3))
            //          — [0,1,-1,-1] applied on both row and column sides.
            p_out_thread[out_thread_desc.Get1dIndex(n, k, 1, 1)] =
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 1)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 2)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 1, 3)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 2)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 2, 3)] -
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 1)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 2)] +
                p_out_transform_thread[out_transform_thread_desc.Get1dIndex(n, k, 3, 3)];
        }
    }
}
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment